Repository: rui314/mold Branch: main Commit: fc96c1b9efe6 Files: 2520 Total size: 25.6 MB Directory structure: gitextract_f0ho13on/ ├── .github/ │ ├── FUNDING.yml │ └── workflows/ │ ├── build-all.yml │ ├── build-native.yml │ ├── ci.yml │ ├── install-extras.sh │ ├── release-assets.yml │ ├── run-msan.sh │ └── update-manpage.yml ├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── dist.sh ├── docs/ │ ├── bugs.md │ ├── coding-guidelines.md │ ├── design.md │ ├── execstack.md │ ├── glossary.md │ ├── memory-sanitizer.md │ ├── mold.1 │ └── mold.md ├── install-build-deps.sh ├── install-cross-tools.sh ├── lib/ │ ├── aho-corasick.cc │ ├── atomics.h │ ├── bitvector.h │ ├── compress.cc │ ├── config.h.in │ ├── crc32.cc │ ├── demangle.cc │ ├── filepath.cc │ ├── gentoo-test.sh │ ├── glob.cc │ ├── hyperloglog.cc │ ├── integers.h │ ├── lib.h │ ├── perf.cc │ ├── random.cc │ ├── siphash.h │ ├── tar.cc │ └── update-git-hash.cmake ├── src/ │ ├── arch-arm32.cc │ ├── arch-arm64.cc │ ├── arch-i386.cc │ ├── arch-loongarch.cc │ ├── arch-m68k.cc │ ├── arch-ppc32.cc │ ├── arch-ppc64v1.cc │ ├── arch-ppc64v2.cc │ ├── arch-riscv.cc │ ├── arch-s390x.cc │ ├── arch-sh4.cc │ ├── arch-sparc64.cc │ ├── arch-x86-64.cc │ ├── archive-file.cc │ ├── cmdline.cc │ ├── elf.cc │ ├── elf.h │ ├── entry.cc │ ├── error.cc │ ├── filetype.cc │ ├── gc-sections.cc │ ├── gdb-index.cc │ ├── icf.cc │ ├── input-files.cc │ ├── input-sections.cc │ ├── jobs-unix.cc │ ├── jobs-win32.cc │ ├── linker-script.cc │ ├── lto-unix.cc │ ├── lto-win32.cc │ ├── lto.h │ ├── main.cc │ ├── mapfile.cc │ ├── mapped-file-unix.cc │ ├── mapped-file-win32.cc │ ├── mold-wrapper.c │ ├── mold.h │ ├── output-chunks.cc │ ├── output-file-unix.cc │ ├── output-file-win32.cc │ ├── passes.cc │ ├── relocatable.cc │ ├── shrink-sections.cc │ ├── signal-unix.cc │ ├── signal-win32.cc │ ├── subprocess-unix.cc │ ├── subprocess-win32.cc │ ├── thunks.cc │ └── tls.cc ├── test/ │ ├── CMakeLists.txt │ ├── abs-error.sh │ ├── absolute-symbols.sh │ ├── 
allow-multiple-definition.sh │ ├── ar-alignment.sh │ ├── arch-aarch64-long-thunk.sh │ ├── arch-aarch64-range-extension-thunk-disassembly.sh │ ├── arch-aarch64-variant-pcs.sh │ ├── arch-arm-abs-error.sh │ ├── arch-arm-exidx-sentinel.sh │ ├── arch-arm-range-extension-thunk-disassembly.sh │ ├── arch-arm-range-extension-thunk.sh │ ├── arch-arm-target1.sh │ ├── arch-arm-thm-jump19.sh │ ├── arch-arm-thm-jump8.sh │ ├── arch-arm-thumb-interwork.sh │ ├── arch-arm-tlsdesc.sh │ ├── arch-armeb-be32.sh │ ├── arch-i686-tls-module-base.sh │ ├── arch-i686-tlsdesc.sh │ ├── arch-loongarch64-mcmodel-extreme.sh │ ├── arch-loongarch64-relax-call36.sh │ ├── arch-loongarch64-relax-got-load.sh │ ├── arch-loongarch64-relax-pcala-addi.sh │ ├── arch-loongarch64-relax-tlsdesc.sh │ ├── arch-ppc64le-save-restore-gprs.sh │ ├── arch-riscv64-attributes.sh │ ├── arch-riscv64-attributes2.sh │ ├── arch-riscv64-global-pointer-dso.sh │ ├── arch-riscv64-global-pointer.sh │ ├── arch-riscv64-obj-compatible.sh │ ├── arch-riscv64-relax-align.sh │ ├── arch-riscv64-relax-got.sh │ ├── arch-riscv64-relax-hi20.sh │ ├── arch-riscv64-relax-j.sh │ ├── arch-riscv64-reloc-overflow.sh │ ├── arch-riscv64-symbol-size.sh │ ├── arch-riscv64-variant-cc.sh │ ├── arch-riscv64-weak-undef.sh │ ├── arch-s390x-got.sh │ ├── arch-x86_64-address-equality.sh │ ├── arch-x86_64-apx-gotpcrelx.sh │ ├── arch-x86_64-apx-gottpoff.sh │ ├── arch-x86_64-apx-gottpoff2.sh │ ├── arch-x86_64-apx-tlsdesc.sh │ ├── arch-x86_64-empty-arg.sh │ ├── arch-x86_64-empty-mergeable-section.sh │ ├── arch-x86_64-emulation-deduction.sh │ ├── arch-x86_64-exception-mcmodel-large.sh │ ├── arch-x86_64-execstack-if-needed.sh │ ├── arch-x86_64-function-multiversion.sh │ ├── arch-x86_64-gnu-linkonce.sh │ ├── arch-x86_64-gnu-retain.sh │ ├── arch-x86_64-gotpcrelx.sh │ ├── arch-x86_64-ifunc-alias.sh │ ├── arch-x86_64-ifunc-export.sh │ ├── arch-x86_64-incompatible-libs-linker-script.sh │ ├── arch-x86_64-incompatible-libs-linker-script2.sh │ ├── 
arch-x86_64-incompatible-libs.sh │ ├── arch-x86_64-incompatible-libs2.sh │ ├── arch-x86_64-incompatible-obj.sh │ ├── arch-x86_64-init-array-readonly.sh │ ├── arch-x86_64-init-array.sh │ ├── arch-x86_64-isa-level.sh │ ├── arch-x86_64-large-bss.sh │ ├── arch-x86_64-mergeable-records.sh │ ├── arch-x86_64-mergeable-strings-nonalloc.sh │ ├── arch-x86_64-mergeable-strings.sh │ ├── arch-x86_64-note-property.sh │ ├── arch-x86_64-note-property2.sh │ ├── arch-x86_64-note.sh │ ├── arch-x86_64-note2.sh │ ├── arch-x86_64-plt.sh │ ├── arch-x86_64-preinit-array.sh │ ├── arch-x86_64-relax.sh │ ├── arch-x86_64-reloc-overflow.sh │ ├── arch-x86_64-reloc-zero.sh │ ├── arch-x86_64-reloc.sh │ ├── arch-x86_64-section-alignment.sh │ ├── arch-x86_64-section-name.sh │ ├── arch-x86_64-tbss-only.sh │ ├── arch-x86_64-tls-gd-mcmodel-large.sh │ ├── arch-x86_64-tls-gd-to-ie.sh │ ├── arch-x86_64-tls-large-tbss.sh │ ├── arch-x86_64-tls-ld-mcmodel-large.sh │ ├── arch-x86_64-tls-module-base.sh │ ├── arch-x86_64-tlsdesc.sh │ ├── arch-x86_64-unique.sh │ ├── arch-x86_64-warn-execstack.sh │ ├── arch-x86_64-warn-shared-textrel.sh │ ├── arch-x86_64-warn-textrel.sh │ ├── arch-x86_64-z-dynamic-undefined-weak.sh │ ├── arch-x86_64-z-ibt.sh │ ├── arch-x86_64-z-ibtplt.sh │ ├── arch-x86_64-z-rewrite-endbr.sh │ ├── arch-x86_64-z-rewrite-endbr2.sh │ ├── arch-x86_64-z-rewrite-endbr3.sh │ ├── arch-x86_64-z-shstk.sh │ ├── arch-x86_64-z-text.sh │ ├── as-needed-dso.sh │ ├── as-needed-dso2.sh │ ├── as-needed-weak.sh │ ├── as-needed.sh │ ├── audit.sh │ ├── auxiliary.sh │ ├── bno-symbolic.sh │ ├── bsymbolic-functions.sh │ ├── bsymbolic-non-weak-functions.sh │ ├── bsymbolic-non-weak.sh │ ├── bsymbolic.sh │ ├── build-id.sh │ ├── canonical-plt.sh │ ├── cmdline.sh │ ├── color-diagnostics.sh │ ├── comment.sh │ ├── common-archive.sh │ ├── common-ref.sh │ ├── common-symbols.sh │ ├── common.inc │ ├── compress-debug-sections-zstd.sh │ ├── compress-debug-sections.sh │ ├── compressed-debug-info.sh │ ├── copyrel-alignment.sh │ ├── 
copyrel-norelro.sh │ ├── copyrel-protected.sh │ ├── copyrel-relro.sh │ ├── copyrel-relro2.sh │ ├── copyrel.sh │ ├── crel.sh │ ├── ctors-in-init-array.sh │ ├── dead-debug-sections.sh │ ├── debug-macro-section.sh │ ├── default-symver-version-script.sh │ ├── default-symver.sh │ ├── defsym-lto.sh │ ├── defsym-missing-symbol.sh │ ├── defsym.sh │ ├── defsym2.sh │ ├── demangle-cpp.sh │ ├── demangle-rust.sh │ ├── demangle.sh │ ├── depaudit.sh │ ├── depaudit2.sh │ ├── dependency-file-lto.sh │ ├── dependency-file-response-file.sh │ ├── dependency-file.sh │ ├── disable-new-dtags.sh │ ├── discard-section.sh │ ├── discard.sh │ ├── dso-undef.sh │ ├── dt-init.sh │ ├── dt-needed.sh │ ├── duplicate-error-archive.sh │ ├── duplicate-error-gc-sections.sh │ ├── duplicate-error-lto.sh │ ├── duplicate-error.sh │ ├── dynamic-dt-debug.sh │ ├── dynamic-linker.sh │ ├── dynamic-list-data.sh │ ├── dynamic-list.sh │ ├── dynamic-list2.sh │ ├── dynamic-list3.sh │ ├── dynamic-list4.sh │ ├── dynamic.sh │ ├── emit-relocs-cpp.sh │ ├── emit-relocs-dead-sections.sh │ ├── emit-relocs.sh │ ├── empty-dso.sh │ ├── empty-file.sh │ ├── empty-input.sh │ ├── empty-version.sh │ ├── entry.sh │ ├── exception-multiple-ehframe.sh │ ├── exception.sh │ ├── exclude-libs.sh │ ├── exclude-libs2.sh │ ├── exclude-libs3.sh │ ├── execstack.sh │ ├── execute-only.sh │ ├── export-dynamic.sh │ ├── export-from-exe.sh │ ├── fatal-warnings.sh │ ├── filler.sh │ ├── filter.sh │ ├── func-addr.sh │ ├── gc-sections.sh │ ├── gdb-index-compress-output.sh │ ├── gdb-index-dwarf2.sh │ ├── gdb-index-dwarf3.sh │ ├── gdb-index-dwarf4.sh │ ├── gdb-index-dwarf5.sh │ ├── gdb-index-dwarf64.sh │ ├── gdb-index-empty.sh │ ├── gdb-index-split-dwarf.sh │ ├── glibc-2.22-bug.sh │ ├── global-offset-table.sh │ ├── gnu-hash.sh │ ├── gnu-property.sh │ ├── gnu-retain.sh │ ├── gnu-unique.sh │ ├── gnu-warning.sh │ ├── hash-style-sysv.sh │ ├── hash-style.sh │ ├── hello-dynamic.sh │ ├── hello-static.sh │ ├── help.sh │ ├── hidden-archive.sh │ ├── hidden-undef.sh │ 
├── hidden-weak-undef.sh │ ├── icf-gcc-except-table.sh │ ├── icf-safe.sh │ ├── icf-small.sh │ ├── icf.sh │ ├── ifunc-address-equality-exported.sh │ ├── ifunc-address-equality.sh │ ├── ifunc-alias.sh │ ├── ifunc-dlopen.sh │ ├── ifunc-dso.sh │ ├── ifunc-dynamic.sh │ ├── ifunc-export.sh │ ├── ifunc-funcptr.sh │ ├── ifunc-noplt.sh │ ├── ifunc-static-pie.sh │ ├── ifunc-static.sh │ ├── image-base.sh │ ├── init-array-priorities.sh │ ├── init-in-dso.sh │ ├── init.sh │ ├── initfirst.sh │ ├── interpose.sh │ ├── invalid-version-script.sh │ ├── issue646.sh │ ├── large-alignment-dso.sh │ ├── large-alignment.sh │ ├── large-max-page-size-strip.sh │ ├── large-max-page-size.sh │ ├── large-text.sh │ ├── library.sh │ ├── link-order.sh │ ├── linker-script-defsym.sh │ ├── linker-script-error.sh │ ├── linker-script-relocatable.sh │ ├── linker-script.sh │ ├── linker-script2.sh │ ├── linker-script3.sh │ ├── linker-script4.sh │ ├── linker-script5.sh │ ├── linker-script6.sh │ ├── lto-archive.sh │ ├── lto-archive2.sh │ ├── lto-archive3.sh │ ├── lto-comdat.sh │ ├── lto-dso.sh │ ├── lto-gcc.sh │ ├── lto-llvm.sh │ ├── lto-llvm2.sh │ ├── lto-no-plugin.sh │ ├── lto-nostdlib.sh │ ├── lto-version-script.sh │ ├── main-in-dso.sh │ ├── many-input-sections.sh │ ├── many-input-sections2.sh │ ├── many-output-sections.sh │ ├── mcmodel-large.sh │ ├── mergeable-strings.sh │ ├── missing-but-ok.sh │ ├── missing-error.sh │ ├── mold-wrapper.sh │ ├── mold-wrapper2.sh │ ├── nmagic.sh │ ├── no-allow-shlib-undefined-circular.sh │ ├── no-allow-shlib-undefined.sh │ ├── no-allow-shlib-undefined2.sh │ ├── no-allow-shlib-undefined3.sh │ ├── no-allow-shlib-undefined4.sh │ ├── no-eh-frame-header.sh │ ├── no-object-file.sh │ ├── no-quick-exit.sh │ ├── no-undefined-version.sh │ ├── nocopyreloc.sh │ ├── noinhibit-exec.sh │ ├── non-canonical-plt.sh │ ├── nostdlib.sh │ ├── oformat-binary.sh │ ├── omagic.sh │ ├── package-metadata.sh │ ├── physical-image-base.sh │ ├── pie.sh │ ├── plt-dso.sh │ ├── plt-symbols.sh │ ├── pltgot.sh 
│ ├── preinit-array.sh │ ├── print-dependencies.sh │ ├── protected-dynsym.sh │ ├── protected.sh │ ├── push-pop-state.sh │ ├── range-extension-thunk.sh │ ├── range-extension-thunk2.sh │ ├── range-extension-thunk3.sh │ ├── range-extension-thunk4.sh │ ├── relax-got-load.sh │ ├── reloc-rodata.sh │ ├── relocatable-archive.sh │ ├── relocatable-c++.sh │ ├── relocatable-compressed-debug-info.sh │ ├── relocatable-debug-info.sh │ ├── relocatable-exception.sh │ ├── relocatable-many-sections.sh │ ├── relocatable-merge-sections.sh │ ├── relocatable-mergeable-sections.sh │ ├── relocatable.sh │ ├── relro-alignment.sh │ ├── relro.sh │ ├── repro.sh │ ├── require-defined.sh │ ├── response-file-quoting.sh │ ├── response-file.sh │ ├── response-file2.sh │ ├── retain-symbols-file.sh │ ├── reverse-sections.sh │ ├── rodata-name.sh │ ├── rosegment.sh │ ├── rpath.sh │ ├── run-clang.sh │ ├── run.sh │ ├── section-align.sh │ ├── section-attributes.sh │ ├── section-order.sh │ ├── section-start.sh │ ├── separate-debug-file-sort.sh │ ├── separate-debug-file.sh │ ├── shared-abs-sym.sh │ ├── shared.sh │ ├── shuffle-sections-seed.sh │ ├── shuffle-sections.sh │ ├── soname.sh │ ├── sort-debug-info-compressed.sh │ ├── sort-debug-info-merged.sh │ ├── sort-debug-info.sh │ ├── spare-program-headers.sh │ ├── start-lib.sh │ ├── start-stop-symbol.sh │ ├── start-stop.sh │ ├── static-archive.sh │ ├── static-pie.sh │ ├── stdout.sh │ ├── strip-debug.sh │ ├── strip.sh │ ├── stt-common.sh │ ├── symbol-rank.sh │ ├── symbol-version-as-needed.sh │ ├── symbol-version-lto.sh │ ├── symbol-version-multi.sh │ ├── symbol-version.sh │ ├── symbol-version2.sh │ ├── symbol-version3.sh │ ├── symbol-version4.sh │ ├── symbol-version5.sh │ ├── symtab-dso.sh │ ├── symtab-section-symbols.sh │ ├── symtab.sh │ ├── synthetic-symbols.sh │ ├── sysroot-linker-script.sh │ ├── sysroot.sh │ ├── sysroot2.sh │ ├── tail-call.sh │ ├── tbss-only.sh │ ├── textrel.sh │ ├── textrel2.sh │ ├── thin-archive.sh │ ├── thread-count.sh │ ├── 
tls-alignment-multi.sh │ ├── tls-common.sh │ ├── tls-df-static-tls.sh │ ├── tls-dso.sh │ ├── tls-gd-dlopen.sh │ ├── tls-gd-noplt.sh │ ├── tls-gd-to-ie.sh │ ├── tls-gd.sh │ ├── tls-ie.sh │ ├── tls-irregular-start-addr.sh │ ├── tls-large-alignment.sh │ ├── tls-large-static-image.sh │ ├── tls-ld-noplt.sh │ ├── tls-ld.sh │ ├── tls-le-error.sh │ ├── tls-le.sh │ ├── tls-nopic.sh │ ├── tls-pic.sh │ ├── tls-small-alignment.sh │ ├── tlsdesc-dlopen.sh │ ├── tlsdesc-import.sh │ ├── tlsdesc-initial-exec.sh │ ├── tlsdesc-local-dynamic.sh │ ├── tlsdesc-static.sh │ ├── tlsdesc.sh │ ├── trace-symbol-symver.sh │ ├── trace-symbol.sh │ ├── trace.sh │ ├── undefined-glob-gc-sections.sh │ ├── undefined-glob.sh │ ├── undefined.sh │ ├── undefined2.sh │ ├── unknown-section-type.sh │ ├── unresolved-symbols.sh │ ├── unresolved-symbols2.sh │ ├── verbose.sh │ ├── version-script-search-paths.sh │ ├── version-script.sh │ ├── version-script10.sh │ ├── version-script11.sh │ ├── version-script12.sh │ ├── version-script13.sh │ ├── version-script14.sh │ ├── version-script15.sh │ ├── version-script16.sh │ ├── version-script17.sh │ ├── version-script18.sh │ ├── version-script19.sh │ ├── version-script2.sh │ ├── version-script20.sh │ ├── version-script21.sh │ ├── version-script22.sh │ ├── version-script23.sh │ ├── version-script3.sh │ ├── version-script4.sh │ ├── version-script5.sh │ ├── version-script6.sh │ ├── version-script7.sh │ ├── version-script8.sh │ ├── version-script9.sh │ ├── version.sh │ ├── versioned-undef.sh │ ├── visibility.sh │ ├── warn-common.sh │ ├── warn-once.sh │ ├── warn-symbol-type.sh │ ├── warn-unresolved-symbols.sh │ ├── weak-export-dso.sh │ ├── weak-export-dso2.sh │ ├── weak-export-exe.sh │ ├── weak-undef-dso.sh │ ├── weak-undef.sh │ ├── weak-undef2.sh │ ├── weak-undef4.sh │ ├── weak-undef5.sh │ ├── whole-archive.sh │ ├── wrap-lto.sh │ ├── wrap.sh │ ├── z-cet-report.sh │ ├── z-defs.sh │ ├── z-dynamic-undefined-weak-exe.sh │ ├── z-dynamic-undefined-weak.sh │ ├── 
z-dynamic-undefined-weak2.sh │ ├── z-max-page-size.sh │ ├── z-nodefaultlib.sh │ ├── z-nodump.sh │ ├── z-now.sh │ ├── z-origin.sh │ ├── z-pack-relative-relocs.sh │ ├── z-rodynamic.sh │ ├── z-sectionheader.sh │ ├── z-separate-code.sh │ ├── z-stack-size.sh │ ├── z-start-stop-visibility.sh │ └── zero-to-bss.sh └── third-party/ ├── blake3/ │ ├── .cargo/ │ │ └── config.toml │ ├── .git-blame-ignore-revs │ ├── .github/ │ │ └── workflows/ │ │ ├── build_b3sum.py │ │ ├── ci.yml │ │ ├── tag.yml │ │ └── upload_github_release_asset.py │ ├── .gitignore │ ├── CONTRIBUTING.md │ ├── Cargo.toml │ ├── LICENSE_A2 │ ├── LICENSE_A2LLVM │ ├── LICENSE_CC0 │ ├── README.md │ ├── b3sum/ │ │ ├── .gitignore │ │ ├── Cargo.lock │ │ ├── Cargo.toml │ │ ├── README.md │ │ ├── src/ │ │ │ ├── main.rs │ │ │ └── unit_tests.rs │ │ ├── tests/ │ │ │ └── cli_tests.rs │ │ └── what_does_check_do.md │ ├── benches/ │ │ └── bench.rs │ ├── build.rs │ ├── reference_impl/ │ │ ├── Cargo.toml │ │ ├── README.md │ │ └── reference_impl.rs │ ├── src/ │ │ ├── ffi_avx2.rs │ │ ├── ffi_avx512.rs │ │ ├── ffi_neon.rs │ │ ├── ffi_sse2.rs │ │ ├── ffi_sse41.rs │ │ ├── guts.rs │ │ ├── hazmat.rs │ │ ├── io.rs │ │ ├── join.rs │ │ ├── lib.rs │ │ ├── platform.rs │ │ ├── portable.rs │ │ ├── rust_avx2.rs │ │ ├── rust_sse2.rs │ │ ├── rust_sse41.rs │ │ ├── test.rs │ │ ├── traits.rs │ │ └── wasm32_simd.rs │ ├── test_vectors/ │ │ ├── Cargo.toml │ │ ├── cross_test.sh │ │ ├── src/ │ │ │ ├── bin/ │ │ │ │ └── generate.rs │ │ │ └── lib.rs │ │ └── test_vectors.json │ └── tools/ │ ├── compiler_version/ │ │ ├── Cargo.toml │ │ ├── build.rs │ │ └── src/ │ │ └── main.rs │ ├── instruction_set_support/ │ │ ├── Cargo.toml │ │ └── src/ │ │ └── main.rs │ └── release.md ├── mimalloc/ │ ├── .gitattributes │ ├── .gitignore │ ├── CMakeLists.txt │ ├── LICENSE │ ├── SECURITY.md │ ├── azure-pipelines.yml │ ├── bin/ │ │ ├── mimalloc-redirect-arm64.lib │ │ ├── mimalloc-redirect-arm64ec.lib │ │ ├── mimalloc-redirect.lib │ │ ├── mimalloc-redirect32.lib │ │ └── 
readme.md │ ├── cmake/ │ │ ├── JoinPaths.cmake │ │ ├── mimalloc-config-version.cmake │ │ └── mimalloc-config.cmake │ ├── contrib/ │ │ ├── docker/ │ │ │ ├── alpine/ │ │ │ │ └── Dockerfile │ │ │ ├── alpine-arm32v7/ │ │ │ │ └── Dockerfile │ │ │ ├── manylinux-x64/ │ │ │ │ └── Dockerfile │ │ │ └── readme.md │ │ └── vcpkg/ │ │ ├── portfile.cmake │ │ ├── readme.md │ │ ├── usage │ │ ├── vcpkg-cmake-wrapper.cmake │ │ └── vcpkg.json │ ├── doc/ │ │ ├── doxyfile │ │ ├── mimalloc-doc.h │ │ └── mimalloc-doxygen.css │ ├── ide/ │ │ └── vs2022/ │ │ ├── mimalloc-lib.vcxproj │ │ ├── mimalloc-lib.vcxproj.filters │ │ ├── mimalloc-override-dll.vcxproj │ │ ├── mimalloc-override-dll.vcxproj.filters │ │ ├── mimalloc-override-test-dep.vcxproj │ │ ├── mimalloc-override-test.vcxproj │ │ ├── mimalloc-test-api.vcxproj │ │ ├── mimalloc-test-stress.vcxproj │ │ ├── mimalloc-test.vcxproj │ │ └── mimalloc.sln │ ├── include/ │ │ ├── mimalloc/ │ │ │ ├── atomic.h │ │ │ ├── internal.h │ │ │ ├── prim.h │ │ │ ├── track.h │ │ │ └── types.h │ │ ├── mimalloc-new-delete.h │ │ ├── mimalloc-override.h │ │ ├── mimalloc-stats.h │ │ └── mimalloc.h │ ├── mimalloc.pc.in │ ├── readme.md │ ├── src/ │ │ ├── alloc-aligned.c │ │ ├── alloc-override.c │ │ ├── alloc-posix.c │ │ ├── alloc.c │ │ ├── arena-abandon.c │ │ ├── arena.c │ │ ├── bitmap.c │ │ ├── bitmap.h │ │ ├── free.c │ │ ├── heap.c │ │ ├── init.c │ │ ├── libc.c │ │ ├── options.c │ │ ├── os.c │ │ ├── page-queue.c │ │ ├── page.c │ │ ├── prim/ │ │ │ ├── emscripten/ │ │ │ │ └── prim.c │ │ │ ├── osx/ │ │ │ │ ├── alloc-override-zone.c │ │ │ │ └── prim.c │ │ │ ├── prim.c │ │ │ ├── readme.md │ │ │ ├── unix/ │ │ │ │ └── prim.c │ │ │ ├── wasi/ │ │ │ │ └── prim.c │ │ │ └── windows/ │ │ │ ├── etw-mimalloc.wprp │ │ │ ├── etw.h │ │ │ ├── etw.man │ │ │ ├── prim.c │ │ │ └── readme.md │ │ ├── random.c │ │ ├── segment-map.c │ │ ├── segment.c │ │ ├── static.c │ │ └── stats.c │ └── test/ │ ├── CMakeLists.txt │ ├── main-override-dep.cpp │ ├── main-override-dep.h │ ├── 
main-override-static.c │ ├── main-override.c │ ├── main-override.cpp │ ├── main.c │ ├── readme.md │ ├── test-api-fill.c │ ├── test-api.c │ ├── test-stress.c │ ├── test-wrong.c │ └── testhelper.h ├── rust-demangle/ │ ├── .clang-format │ ├── .gitignore │ ├── Cargo.toml │ ├── LICENSE-APACHE │ ├── LICENSE-MIT │ ├── README.md │ ├── rust-demangle.c │ ├── rust-demangle.h │ └── test-harness/ │ ├── Cargo.toml │ ├── build.rs │ ├── examples/ │ │ └── check-csv-dataset.rs │ ├── src/ │ │ └── lib.rs │ └── tests/ │ ├── legacy.rs │ ├── top_level.rs │ └── v0.rs ├── tbb/ │ ├── .bazelrc │ ├── .bazelversion │ ├── .gitattributes │ ├── .github/ │ │ ├── CODEOWNERS │ │ ├── ISSUE_TEMPLATE/ │ │ │ ├── 1_question.md │ │ │ ├── 2_bug_report.md │ │ │ ├── 3_feature_request.md │ │ │ └── 4_documentation.md │ │ ├── dependabot.yml │ │ ├── issue_labeler.yml │ │ ├── labeler.yml │ │ ├── pull_request_template.md │ │ ├── scripts/ │ │ │ └── codespell.sh │ │ └── workflows/ │ │ ├── ci.yml │ │ ├── codeql.yml │ │ ├── coverity.yml │ │ ├── issue_labeler.yml │ │ ├── labeler.yml │ │ └── ossf-scorecard.yml │ ├── .gitignore │ ├── BUILD.bazel │ ├── Bazel.md │ ├── CMakeLists.txt │ ├── CODEOWNERS │ ├── CODE_OF_CONDUCT.md │ ├── CONTRIBUTING.md │ ├── INSTALL.md │ ├── LICENSE.txt │ ├── MAINTAINERS.md │ ├── MODULE.bazel │ ├── README.md │ ├── RELEASE_NOTES.md │ ├── SECURITY.md │ ├── SUPPORT.md │ ├── SYSTEM_REQUIREMENTS.md │ ├── WASM_Support.md │ ├── WORKSPACE.bazel │ ├── cmake/ │ │ ├── README.md │ │ ├── android/ │ │ │ ├── device_environment_cleanup.cmake │ │ │ ├── environment.cmake │ │ │ └── test_launcher.cmake │ │ ├── compilers/ │ │ │ ├── AppleClang.cmake │ │ │ ├── Clang.cmake │ │ │ ├── GNU.cmake │ │ │ ├── Intel.cmake │ │ │ ├── IntelLLVM.cmake │ │ │ ├── MSVC.cmake │ │ │ └── QCC.cmake │ │ ├── config_generation.cmake │ │ ├── hwloc_detection.cmake │ │ ├── memcheck.cmake │ │ ├── packaging.cmake │ │ ├── post_install/ │ │ │ └── CMakeLists.txt │ │ ├── python/ │ │ │ └── test_launcher.cmake │ │ ├── resumable_tasks.cmake │ │ ├── 
sanitize.cmake │ │ ├── scripts/ │ │ │ └── cmake_gen_github_configs.cmake │ │ ├── suppressions/ │ │ │ ├── lsan.suppressions │ │ │ └── tsan.suppressions │ │ ├── templates/ │ │ │ ├── TBBConfig.cmake.in │ │ │ └── TBBConfigVersion.cmake.in │ │ ├── test_spec.cmake │ │ ├── toolchains/ │ │ │ ├── mips.cmake │ │ │ └── riscv64.cmake │ │ ├── utils.cmake │ │ └── vars_utils.cmake │ ├── doc/ │ │ ├── Doxyfile.in │ │ ├── DoxygenLayout.xml │ │ ├── GSG/ │ │ │ ├── get_started.rst │ │ │ ├── installation.rst │ │ │ ├── integrate.rst │ │ │ ├── intro_gsg.rst │ │ │ ├── next_steps.rst │ │ │ ├── samples.rst │ │ │ └── system_requirements.rst │ │ ├── README.md │ │ ├── _static/ │ │ │ ├── custom.js │ │ │ └── theme_overrides.css │ │ ├── conf.py │ │ ├── index/ │ │ │ ├── index_intro.rst │ │ │ ├── toctree.rst │ │ │ └── useful_topics.rst │ │ ├── index.rst │ │ ├── main/ │ │ │ ├── _templates/ │ │ │ │ └── layout.html │ │ │ ├── examples_testing/ │ │ │ │ └── CMakeLists.txt │ │ │ ├── intro/ │ │ │ │ ├── Benefits.rst │ │ │ │ ├── help_support.rst │ │ │ │ ├── intro_os.rst │ │ │ │ ├── limitations.rst │ │ │ │ ├── notation.rst │ │ │ │ └── testing_approach.rst │ │ │ ├── reference/ │ │ │ │ ├── blocked_nd_range_ctad.rst │ │ │ │ ├── blocking_terminate.rst │ │ │ │ ├── concurrent_lru_cache_cls.rst │ │ │ │ ├── constraints_extensions.rst │ │ │ │ ├── constructors_for_nodes.rst │ │ │ │ ├── custom_mutex_chmap.rst │ │ │ │ ├── examples/ │ │ │ │ │ ├── blocked_nd_range_ctad_example.cpp │ │ │ │ │ ├── custom_mutex_chmap_example.cpp │ │ │ │ │ ├── fixed_pool_example.cpp │ │ │ │ │ ├── helpers_for_expressing_graphs_preview_api_example.cpp │ │ │ │ │ ├── helpers_for_expressing_graphs_regular_api_example.cpp │ │ │ │ │ ├── make_edges_function_example.cpp │ │ │ │ │ ├── malloc_replacement_log_example.cpp │ │ │ │ │ ├── memory_pool_allocator_example.cpp │ │ │ │ │ ├── memory_pool_example.cpp │ │ │ │ │ ├── parallel_phase_example.cpp │ │ │ │ │ ├── parallel_sort_ranges_extension_example.cpp │ │ │ │ │ ├── rvalue_reduce.cpp │ │ │ │ │ └── 
try_put_and_wait_example.cpp │ │ │ │ ├── follows_and_precedes_functions.rst │ │ │ │ ├── helpers_for_expressing_graphs.rst │ │ │ │ ├── heterogeneous_extensions_chmap.rst │ │ │ │ ├── info_namespace.rst │ │ │ │ ├── info_namespace_extensions.rst │ │ │ │ ├── make_edges_function.rst │ │ │ │ ├── make_node_set_function.rst │ │ │ │ ├── mutex_cls.rst │ │ │ │ ├── parallel_for_each_semantics.rst │ │ │ │ ├── parallel_phase_for_task_arena.rst │ │ │ │ ├── parallel_sort_ranges_extension.rst │ │ │ │ ├── reference.rst │ │ │ │ ├── rvalue_reduce.rst │ │ │ │ ├── rw_mutex_cls.rst │ │ │ │ ├── scalable_memory_pools/ │ │ │ │ │ ├── fixed_pool_cls.rst │ │ │ │ │ ├── malloc_replacement_log.rst │ │ │ │ │ ├── memory_pool_allocator_cls.rst │ │ │ │ │ └── memory_pool_cls.rst │ │ │ │ ├── scalable_memory_pools.rst │ │ │ │ ├── task_group_extensions.rst │ │ │ │ ├── try_put_and_wait.rst │ │ │ │ └── type_specified_message_keys.rst │ │ │ └── tbb_userguide/ │ │ │ ├── Advanced_Example.rst │ │ │ ├── Advanced_Topic_Other_Kinds_of_Iteration_Spaces.rst │ │ │ ├── Allocator_Configuration.rst │ │ │ ├── Automatic_Chunking.rst │ │ │ ├── Automically_Replacing_malloc.rst │ │ │ ├── Bandwidth_and_Cache_Affinity_os.rst │ │ │ ├── Basic_Flow_Graph_concepts.rst │ │ │ ├── Cancellation_Without_An_Exception.rst │ │ │ ├── Cancellation_and_Nested_Parallelism.rst │ │ │ ├── Concurrent_Queue_Classes.rst │ │ │ ├── Constraints.rst │ │ │ ├── Containers.rst │ │ │ ├── Controlling_Chunking_os.rst │ │ │ ├── Cook_Until_Done_parallel_do.rst │ │ │ ├── Data_Flow_Graph.rst │ │ │ ├── Debug_Versus_Release_Libraries.rst │ │ │ ├── Dependence_Graph.rst │ │ │ ├── Edges.rst │ │ │ ├── Exceptions_and_Cancellation.rst │ │ │ ├── Floating_Point_Settings.rst │ │ │ ├── Flow-Graph-exception-tips.rst │ │ │ ├── Flow-Graph-waiting-tips.rst │ │ │ ├── Flow_Graph.rst │ │ │ ├── Flow_Graph_Buffering_in_Nodes.rst │ │ │ ├── Flow_Graph_Message_Passing_Protocol.rst │ │ │ ├── Flow_Graph_Reservation.rst │ │ │ ├── Flow_Graph_Single_Vs_Broadcast.rst │ │ │ ├── 
Flow_Graph_Tips.rst │ │ │ ├── Flow_Graph_exception_tips.rst │ │ │ ├── Flow_Graph_making_edges_tips.rst │ │ │ ├── Flow_Graph_nested_parallelism_tips.rst │ │ │ ├── Flow_Graph_resource_tips.rst │ │ │ ├── Flow_Graph_waiting_tips.rst │ │ │ ├── Graph_Main_Categories.rst │ │ │ ├── Graph_Object.rst │ │ │ ├── Guiding_Task_Scheduler_Execution.rst │ │ │ ├── How_Task_Scheduler_Works.rst │ │ │ ├── Initializing_and_Terminating_the_Library.rst │ │ │ ├── Iterating_Over_a_Concurrent_Queue_for_Debugging.rst │ │ │ ├── Lambda_Expressions.rst │ │ │ ├── Linux_C_Dynamic_Memory_Interface_Replacement.rst │ │ │ ├── Linux_OS.rst │ │ │ ├── Lock_Pathologies.rst │ │ │ ├── Mac_OS.rst │ │ │ ├── Mapping_Nodes2Tasks.rst │ │ │ ├── Memory_Allocation.rst │ │ │ ├── Migration_Guide/ │ │ │ │ ├── Mixing_Two_Runtimes.rst │ │ │ │ ├── Task_API.rst │ │ │ │ └── Task_Scheduler_Init.rst │ │ │ ├── Migration_Guide.rst │ │ │ ├── More_on_HashCompare.rst │ │ │ ├── Mutex_Flavors.rst │ │ │ ├── Mutual_Exclusion.rst │ │ │ ├── Nodes.rst │ │ │ ├── Non-Linear_Pipelines.rst │ │ │ ├── Package_Contents_os.rst │ │ │ ├── Parallelizing_Complex_Loops.rst │ │ │ ├── Parallelizing_Flow_Graph.rst │ │ │ ├── Parallelizing_Simple_Loops_os.rst │ │ │ ├── Parallelizing_Simple_Loops_toctree.rst │ │ │ ├── Partitioner_Summary.rst │ │ │ ├── Predefined_Node_Types.rst │ │ │ ├── Reader_Writer_Mutexes.rst │ │ │ ├── References.rst │ │ │ ├── Scalable_Memory_Allocator.rst │ │ │ ├── Summary_of_Containers.rst │ │ │ ├── Summary_of_Loops_and_Pipelines.rst │ │ │ ├── Task-Based_Programming.rst │ │ │ ├── Task_Scheduler_Bypass.rst │ │ │ ├── The_Task_Scheduler.rst │ │ │ ├── Throughput_of_pipeline.rst │ │ │ ├── Timing.rst │ │ │ ├── UpgradeDowngrade.rst │ │ │ ├── Using_Circular_Buffers.rst │ │ │ ├── When_Not_to_Use_Queues.rst │ │ │ ├── When_Task-Based_Programming_Is_Inappropriate.rst │ │ │ ├── Which_Dynamic_Libraries_to_Use.rst │ │ │ ├── Windows_C_Dynamic_Memory_Interface_Replacement.rst │ │ │ ├── Windows_OS_ug.rst │ │ │ ├── 
Working_on_the_Assembly_Line_pipeline.rst │ │ │ ├── always_use_wait_for_all.rst │ │ │ ├── appendix_A.rst │ │ │ ├── appendix_B.rst │ │ │ ├── attach_flow_graph_to_arena.rst │ │ │ ├── automatically-replacing-malloc.rst │ │ │ ├── avoid_dynamic_node_removal.rst │ │ │ ├── avoiding_data_races.rst │ │ │ ├── broadcast_or_send.rst │ │ │ ├── cancel_a_graph.rst │ │ │ ├── cancelling_nested_parallelism.rst │ │ │ ├── catching_exceptions.rst │ │ │ ├── communicate_with_nodes.rst │ │ │ ├── concurrent_hash_map.rst │ │ │ ├── concurrent_vector_ug.rst │ │ │ ├── create_token_based_system.rst │ │ │ ├── design_patterns/ │ │ │ │ ├── Agglomeration.rst │ │ │ │ ├── Design_Patterns.rst │ │ │ │ ├── Divide_and_Conquer.rst │ │ │ │ ├── Elementwise.rst │ │ │ │ ├── Fenced_Data_Transfer.rst │ │ │ │ ├── GUI_Thread.rst │ │ │ │ ├── General_References.rst │ │ │ │ ├── Lazy_Initialization.rst │ │ │ │ ├── Local_Serializer.rst │ │ │ │ ├── Non-Preemptive_Priorities.rst │ │ │ │ ├── Odd-Even_Communication.rst │ │ │ │ ├── Reduction.rst │ │ │ │ ├── Reference_Counting.rst │ │ │ │ └── Wavefront.rst │ │ │ ├── destroy_graphs_outside_main_thread.rst │ │ │ ├── estimate_flow_graph_performance.rst │ │ │ ├── examples/ │ │ │ │ ├── blocked_nd_range_example.cpp │ │ │ │ ├── flow_graph_examples.cpp │ │ │ │ ├── parallel_for_lambda_example_1.cpp │ │ │ │ ├── parallel_for_lambda_example_2.cpp │ │ │ │ └── parallel_for_os_example.cpp │ │ │ ├── parallel_for_os.rst │ │ │ ├── parallel_for_toctree.rst │ │ │ ├── parallel_reduce.rst │ │ │ ├── snippets/ │ │ │ │ ├── blocked_nd_range_example.cpp │ │ │ │ ├── blocked_nd_range_example.h │ │ │ │ └── flow_graph_examples.cpp │ │ │ ├── std_invoke.rst │ │ │ ├── title.rst │ │ │ ├── use_concurrency_limits.rst │ │ │ ├── use_graph_reset.rst │ │ │ ├── use_input_node.rst │ │ │ ├── use_limiter_node.rst │ │ │ ├── use_make_edge.rst │ │ │ ├── use_nested_algorithms.rst │ │ │ ├── use_nested_flow_graphs.rst │ │ │ └── work_isolation.rst │ │ ├── make.bat │ │ └── test_classification.dox │ ├── examples/ │ │ ├── 
.clang-format │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── common/ │ │ │ ├── cmake/ │ │ │ │ ├── common.cmake │ │ │ │ └── modules/ │ │ │ │ └── FindTBB.cmake │ │ │ ├── gui/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── convideo.cpp │ │ │ │ ├── d2dvideo.cpp │ │ │ │ ├── gdivideo.cpp │ │ │ │ ├── macvideo.cpp │ │ │ │ ├── video.hpp │ │ │ │ ├── winvideo.hpp │ │ │ │ ├── xcode/ │ │ │ │ │ └── tbbExample/ │ │ │ │ │ ├── OpenGLView.h │ │ │ │ │ ├── OpenGLView.m │ │ │ │ │ ├── PkgInfo │ │ │ │ │ ├── en.lproj/ │ │ │ │ │ │ ├── InfoPlist.strings │ │ │ │ │ │ ├── MainMenu.nib │ │ │ │ │ │ └── MainMenu.xib │ │ │ │ │ ├── iOS.storyboard │ │ │ │ │ ├── main.m │ │ │ │ │ ├── tbbAppDelegate.h │ │ │ │ │ ├── tbbAppDelegate.m │ │ │ │ │ ├── tbbExample-Info.ios.plist │ │ │ │ │ └── tbbExample-Info.plist │ │ │ │ └── xvideo.cpp │ │ │ └── utility/ │ │ │ ├── fast_random.hpp │ │ │ ├── get_default_num_threads.hpp │ │ │ └── utility.hpp │ │ ├── concurrent_hash_map/ │ │ │ ├── README.md │ │ │ └── count_strings/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ └── count_strings.cpp │ │ ├── concurrent_priority_queue/ │ │ │ ├── README.md │ │ │ └── shortpath/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ └── shortpath.cpp │ │ ├── getting_started/ │ │ │ ├── README.md │ │ │ └── sub_string_finder/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── sub_string_finder.cpp │ │ │ ├── sub_string_finder_extended.cpp │ │ │ └── sub_string_finder_pretty.cpp │ │ ├── graph/ │ │ │ ├── README.md │ │ │ ├── binpack/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ └── binpack.cpp │ │ │ ├── cholesky/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ ├── cholesky.cpp │ │ │ │ └── init.cpp │ │ │ ├── dining_philosophers/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ └── dining_philosophers.cpp │ │ │ ├── fgbzip2/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ ├── blocksort.cpp │ │ │ │ ├── bzlib.cpp │ │ │ │ ├── bzlib.hpp │ │ │ │ ├── bzlib_private.hpp │ │ │ │ ├── compress.cpp │ │ │ │ ├── 
crctable.cpp │ │ │ │ ├── decompress.cpp │ │ │ │ ├── fgbzip2.cpp │ │ │ │ ├── huffman.cpp │ │ │ │ └── randtable.cpp │ │ │ ├── logic_sim/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── D_latch.hpp │ │ │ │ ├── README.md │ │ │ │ ├── basics.hpp │ │ │ │ ├── four_bit_adder.hpp │ │ │ │ ├── one_bit_adder.hpp │ │ │ │ ├── test_all.cpp │ │ │ │ └── two_bit_adder.hpp │ │ │ └── som/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── som.cpp │ │ │ ├── som.hpp │ │ │ └── som_graph.cpp │ │ ├── migration/ │ │ │ ├── README.md │ │ │ └── recursive_fibonacci/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── fibonacci.cpp │ │ │ ├── fibonacci_single_task.h │ │ │ ├── fibonacci_two_tasks.h │ │ │ └── task_emulation_layer.h │ │ ├── parallel_for/ │ │ │ ├── README.md │ │ │ ├── game_of_life/ │ │ │ │ ├── Board.hpp │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── Evolution.cpp │ │ │ │ ├── Evolution.hpp │ │ │ │ ├── Game_of_life.cpp │ │ │ │ ├── README.md │ │ │ │ └── Update_state.cpp │ │ │ ├── polygon_overlay/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ ├── gui/ │ │ │ │ │ ├── polygon_overlay.rc │ │ │ │ │ └── resource.h │ │ │ │ ├── polymain.cpp │ │ │ │ ├── polymain.hpp │ │ │ │ ├── polyover.cpp │ │ │ │ ├── polyover.hpp │ │ │ │ ├── pover_global.hpp │ │ │ │ ├── pover_video.cpp │ │ │ │ ├── pover_video.hpp │ │ │ │ └── rpolygon.hpp │ │ │ ├── seismic/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ ├── gui/ │ │ │ │ │ ├── resource.h │ │ │ │ │ └── seismic.rc │ │ │ │ ├── main.cpp │ │ │ │ ├── resource.hpp │ │ │ │ ├── seismic_video.cpp │ │ │ │ ├── seismic_video.hpp │ │ │ │ ├── universe.cpp │ │ │ │ └── universe.hpp │ │ │ └── tachyon/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── gui/ │ │ │ │ ├── resource.h │ │ │ │ └── tachyon.rc │ │ │ └── src/ │ │ │ ├── api.cpp │ │ │ ├── api.hpp │ │ │ ├── apigeom.cpp │ │ │ ├── apitrigeom.cpp │ │ │ ├── apitrigeom.hpp │ │ │ ├── bndbox.cpp │ │ │ ├── bndbox.hpp │ │ │ ├── box.cpp │ │ │ ├── box.hpp │ │ │ ├── camera.cpp │ │ │ ├── camera.hpp │ │ │ ├── coordsys.cpp │ │ 
│ ├── coordsys.hpp │ │ │ ├── cylinder.cpp │ │ │ ├── cylinder.hpp │ │ │ ├── extvol.cpp │ │ │ ├── extvol.hpp │ │ │ ├── global.cpp │ │ │ ├── global.hpp │ │ │ ├── grid.cpp │ │ │ ├── grid.hpp │ │ │ ├── imageio.cpp │ │ │ ├── imageio.hpp │ │ │ ├── imap.cpp │ │ │ ├── imap.hpp │ │ │ ├── intersect.cpp │ │ │ ├── intersect.hpp │ │ │ ├── jpeg.cpp │ │ │ ├── jpeg.hpp │ │ │ ├── light.cpp │ │ │ ├── light.hpp │ │ │ ├── machine.hpp │ │ │ ├── macros.hpp │ │ │ ├── main.cpp │ │ │ ├── objbound.cpp │ │ │ ├── objbound.hpp │ │ │ ├── parse.cpp │ │ │ ├── parse.hpp │ │ │ ├── plane.cpp │ │ │ ├── plane.hpp │ │ │ ├── ppm.cpp │ │ │ ├── ppm.hpp │ │ │ ├── pthread.cpp │ │ │ ├── pthread_w.hpp │ │ │ ├── quadric.cpp │ │ │ ├── quadric.hpp │ │ │ ├── render.cpp │ │ │ ├── render.hpp │ │ │ ├── ring.cpp │ │ │ ├── ring.hpp │ │ │ ├── shade.cpp │ │ │ ├── shade.hpp │ │ │ ├── sphere.cpp │ │ │ ├── sphere.hpp │ │ │ ├── tachyon_video.cpp │ │ │ ├── tachyon_video.hpp │ │ │ ├── texture.cpp │ │ │ ├── texture.hpp │ │ │ ├── tgafile.cpp │ │ │ ├── tgafile.hpp │ │ │ ├── trace.hpp │ │ │ ├── trace.omp.cpp │ │ │ ├── trace.serial.cpp │ │ │ ├── trace.simple.cpp │ │ │ ├── trace.taskq.cpp │ │ │ ├── trace.tbb.cpp │ │ │ ├── trace.tbb1d.cpp │ │ │ ├── trace.threads.cpp │ │ │ ├── trace.threads2d.cpp │ │ │ ├── trace_rest.cpp │ │ │ ├── triangle.cpp │ │ │ ├── triangle.hpp │ │ │ ├── types.hpp │ │ │ ├── ui.cpp │ │ │ ├── ui.hpp │ │ │ ├── util.cpp │ │ │ ├── util.hpp │ │ │ ├── vector.cpp │ │ │ ├── vector.hpp │ │ │ ├── vol.cpp │ │ │ └── vol.hpp │ │ ├── parallel_for_each/ │ │ │ ├── README.md │ │ │ └── parallel_preorder/ │ │ │ ├── CMakeLists.txt │ │ │ ├── Graph.cpp │ │ │ ├── Graph.hpp │ │ │ ├── Matrix.hpp │ │ │ ├── README.md │ │ │ ├── main.cpp │ │ │ └── parallel_preorder.cpp │ │ ├── parallel_pipeline/ │ │ │ ├── README.md │ │ │ └── square/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── gen_input.cpp │ │ │ └── square.cpp │ │ ├── parallel_reduce/ │ │ │ ├── README.md │ │ │ ├── convex_hull/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ 
│ ├── convex_hull.hpp │ │ │ │ ├── convex_hull_bench.cpp │ │ │ │ └── convex_hull_sample.cpp │ │ │ ├── pi/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ ├── common.h │ │ │ │ ├── main.cpp │ │ │ │ └── pi.cpp │ │ │ └── primes/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── main.cpp │ │ │ ├── primes.cpp │ │ │ └── primes.hpp │ │ ├── task_arena/ │ │ │ ├── README.md │ │ │ └── fractal/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── fractal.cpp │ │ │ ├── fractal.hpp │ │ │ ├── fractal_video.hpp │ │ │ ├── gui/ │ │ │ │ ├── fractal.rc │ │ │ │ └── resource.h │ │ │ └── main.cpp │ │ ├── task_group/ │ │ │ ├── README.md │ │ │ └── sudoku/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── input1 │ │ │ ├── input2 │ │ │ ├── input3 │ │ │ ├── input4 │ │ │ └── sudoku.cpp │ │ └── test_all/ │ │ ├── README.md │ │ └── fibonacci/ │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── fibonacci.cpp │ ├── include/ │ │ ├── oneapi/ │ │ │ ├── tbb/ │ │ │ │ ├── blocked_nd_range.h │ │ │ │ ├── blocked_range.h │ │ │ │ ├── blocked_range2d.h │ │ │ │ ├── blocked_range3d.h │ │ │ │ ├── blocked_rangeNd.h │ │ │ │ ├── cache_aligned_allocator.h │ │ │ │ ├── collaborative_call_once.h │ │ │ │ ├── combinable.h │ │ │ │ ├── concurrent_hash_map.h │ │ │ │ ├── concurrent_lru_cache.h │ │ │ │ ├── concurrent_map.h │ │ │ │ ├── concurrent_priority_queue.h │ │ │ │ ├── concurrent_queue.h │ │ │ │ ├── concurrent_set.h │ │ │ │ ├── concurrent_unordered_map.h │ │ │ │ ├── concurrent_unordered_set.h │ │ │ │ ├── concurrent_vector.h │ │ │ │ ├── detail/ │ │ │ │ │ ├── _aggregator.h │ │ │ │ │ ├── _aligned_space.h │ │ │ │ │ ├── _allocator_traits.h │ │ │ │ │ ├── _assert.h │ │ │ │ │ ├── _attach.h │ │ │ │ │ ├── _concurrent_queue_base.h │ │ │ │ │ ├── _concurrent_skip_list.h │ │ │ │ │ ├── _concurrent_unordered_base.h │ │ │ │ │ ├── _config.h │ │ │ │ │ ├── _containers_helpers.h │ │ │ │ │ ├── _exception.h │ │ │ │ │ ├── _export.h │ │ │ │ │ ├── _flow_graph_body_impl.h │ │ │ │ │ ├── _flow_graph_cache_impl.h │ │ │ │ │ ├── 
_flow_graph_impl.h │ │ │ │ │ ├── _flow_graph_indexer_impl.h │ │ │ │ │ ├── _flow_graph_item_buffer_impl.h │ │ │ │ │ ├── _flow_graph_join_impl.h │ │ │ │ │ ├── _flow_graph_node_impl.h │ │ │ │ │ ├── _flow_graph_node_set_impl.h │ │ │ │ │ ├── _flow_graph_nodes_deduction.h │ │ │ │ │ ├── _flow_graph_tagged_buffer_impl.h │ │ │ │ │ ├── _flow_graph_trace_impl.h │ │ │ │ │ ├── _flow_graph_types_impl.h │ │ │ │ │ ├── _hash_compare.h │ │ │ │ │ ├── _intrusive_list_node.h │ │ │ │ │ ├── _machine.h │ │ │ │ │ ├── _mutex_common.h │ │ │ │ │ ├── _namespace_injection.h │ │ │ │ │ ├── _node_handle.h │ │ │ │ │ ├── _pipeline_filters.h │ │ │ │ │ ├── _pipeline_filters_deduction.h │ │ │ │ │ ├── _range_common.h │ │ │ │ │ ├── _rtm_mutex.h │ │ │ │ │ ├── _rtm_rw_mutex.h │ │ │ │ │ ├── _scoped_lock.h │ │ │ │ │ ├── _segment_table.h │ │ │ │ │ ├── _small_object_pool.h │ │ │ │ │ ├── _string_resource.h │ │ │ │ │ ├── _task.h │ │ │ │ │ ├── _task_handle.h │ │ │ │ │ ├── _template_helpers.h │ │ │ │ │ ├── _utils.h │ │ │ │ │ └── _waitable_atomic.h │ │ │ │ ├── enumerable_thread_specific.h │ │ │ │ ├── flow_graph.h │ │ │ │ ├── flow_graph_abstractions.h │ │ │ │ ├── global_control.h │ │ │ │ ├── info.h │ │ │ │ ├── memory_pool.h │ │ │ │ ├── mutex.h │ │ │ │ ├── null_mutex.h │ │ │ │ ├── null_rw_mutex.h │ │ │ │ ├── parallel_for.h │ │ │ │ ├── parallel_for_each.h │ │ │ │ ├── parallel_invoke.h │ │ │ │ ├── parallel_pipeline.h │ │ │ │ ├── parallel_reduce.h │ │ │ │ ├── parallel_scan.h │ │ │ │ ├── parallel_sort.h │ │ │ │ ├── partitioner.h │ │ │ │ ├── profiling.h │ │ │ │ ├── queuing_mutex.h │ │ │ │ ├── queuing_rw_mutex.h │ │ │ │ ├── rw_mutex.h │ │ │ │ ├── scalable_allocator.h │ │ │ │ ├── spin_mutex.h │ │ │ │ ├── spin_rw_mutex.h │ │ │ │ ├── task.h │ │ │ │ ├── task_arena.h │ │ │ │ ├── task_group.h │ │ │ │ ├── task_scheduler_observer.h │ │ │ │ ├── tbb_allocator.h │ │ │ │ ├── tbbmalloc_proxy.h │ │ │ │ ├── tick_count.h │ │ │ │ └── version.h │ │ │ └── tbb.h │ │ └── tbb/ │ │ ├── blocked_nd_range.h │ │ ├── blocked_range.h │ │ ├── 
blocked_range2d.h │ │ ├── blocked_range3d.h │ │ ├── blocked_rangeNd.h │ │ ├── cache_aligned_allocator.h │ │ ├── collaborative_call_once.h │ │ ├── combinable.h │ │ ├── concurrent_hash_map.h │ │ ├── concurrent_lru_cache.h │ │ ├── concurrent_map.h │ │ ├── concurrent_priority_queue.h │ │ ├── concurrent_queue.h │ │ ├── concurrent_set.h │ │ ├── concurrent_unordered_map.h │ │ ├── concurrent_unordered_set.h │ │ ├── concurrent_vector.h │ │ ├── enumerable_thread_specific.h │ │ ├── flow_graph.h │ │ ├── flow_graph_abstractions.h │ │ ├── global_control.h │ │ ├── info.h │ │ ├── memory_pool.h │ │ ├── mutex.h │ │ ├── null_mutex.h │ │ ├── null_rw_mutex.h │ │ ├── parallel_for.h │ │ ├── parallel_for_each.h │ │ ├── parallel_invoke.h │ │ ├── parallel_pipeline.h │ │ ├── parallel_reduce.h │ │ ├── parallel_scan.h │ │ ├── parallel_sort.h │ │ ├── partitioner.h │ │ ├── profiling.h │ │ ├── queuing_mutex.h │ │ ├── queuing_rw_mutex.h │ │ ├── rw_mutex.h │ │ ├── scalable_allocator.h │ │ ├── spin_mutex.h │ │ ├── spin_rw_mutex.h │ │ ├── task.h │ │ ├── task_arena.h │ │ ├── task_group.h │ │ ├── task_scheduler_observer.h │ │ ├── tbb.h │ │ ├── tbb_allocator.h │ │ ├── tbbmalloc_proxy.h │ │ ├── tick_count.h │ │ └── version.h │ ├── integration/ │ │ ├── cmake/ │ │ │ └── generate_vars.cmake │ │ ├── linux/ │ │ │ ├── env/ │ │ │ │ ├── vars.sh │ │ │ │ └── vars.sh.in │ │ │ ├── modulefiles/ │ │ │ │ ├── tbb │ │ │ │ └── tbb32 │ │ │ ├── oneapi/ │ │ │ │ └── vars.sh │ │ │ └── sys_check/ │ │ │ └── sys_check.sh │ │ ├── mac/ │ │ │ └── env/ │ │ │ ├── vars.sh │ │ │ └── vars.sh.in │ │ ├── pkg-config/ │ │ │ └── tbb.pc.in │ │ └── windows/ │ │ ├── env/ │ │ │ ├── vars.bat │ │ │ └── vars.bat.in │ │ ├── nuget/ │ │ │ ├── inteltbb.devel.win.targets │ │ │ └── inteltbb.redist.win.targets │ │ ├── oneapi/ │ │ │ └── vars.bat │ │ └── sys_check/ │ │ └── sys_check.bat │ ├── python/ │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── TBB.py │ │ ├── rml/ │ │ │ ├── CMakeLists.txt │ │ │ ├── ipc_server.cpp │ │ │ ├── ipc_utils.cpp │ │ │ └── 
ipc_utils.h │ │ ├── setup.py │ │ └── tbb/ │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── api.i │ │ ├── pool.py │ │ └── test.py │ ├── rfcs/ │ │ ├── README.md │ │ ├── archived/ │ │ │ └── README.md │ │ ├── experimental/ │ │ │ ├── README.md │ │ │ ├── blocked_nd_range_ctad/ │ │ │ │ └── README.md │ │ │ └── parallel_phase_for_task_arena/ │ │ │ └── README.md │ │ ├── proposed/ │ │ │ ├── README.md │ │ │ ├── loading-dependencies/ │ │ │ │ └── loading-dependencies-by-module-name.org │ │ │ └── numa_support/ │ │ │ ├── README.md │ │ │ └── tbbbind-link-static-hwloc.org │ │ ├── supported/ │ │ │ └── README.md │ │ └── template.md │ ├── src/ │ │ ├── tbb/ │ │ │ ├── CMakeLists.txt │ │ │ ├── address_waiter.cpp │ │ │ ├── allocator.cpp │ │ │ ├── arena.cpp │ │ │ ├── arena.h │ │ │ ├── arena_slot.cpp │ │ │ ├── arena_slot.h │ │ │ ├── assert_impl.h │ │ │ ├── cancellation_disseminator.h │ │ │ ├── co_context.h │ │ │ ├── concurrent_bounded_queue.cpp │ │ │ ├── concurrent_monitor.h │ │ │ ├── concurrent_monitor_mutex.h │ │ │ ├── def/ │ │ │ │ ├── lin32-tbb.def │ │ │ │ ├── lin64-tbb.def │ │ │ │ ├── mac64-tbb.def │ │ │ │ ├── win32-tbb.def │ │ │ │ └── win64-tbb.def │ │ │ ├── dynamic_link.cpp │ │ │ ├── dynamic_link.h │ │ │ ├── environment.h │ │ │ ├── exception.cpp │ │ │ ├── global_control.cpp │ │ │ ├── governor.cpp │ │ │ ├── governor.h │ │ │ ├── intrusive_list.h │ │ │ ├── itt_notify.cpp │ │ │ ├── itt_notify.h │ │ │ ├── mailbox.h │ │ │ ├── main.cpp │ │ │ ├── main.h │ │ │ ├── market.cpp │ │ │ ├── market.h │ │ │ ├── market_concurrent_monitor.h │ │ │ ├── misc.cpp │ │ │ ├── misc.h │ │ │ ├── misc_ex.cpp │ │ │ ├── observer_proxy.cpp │ │ │ ├── observer_proxy.h │ │ │ ├── parallel_pipeline.cpp │ │ │ ├── permit_manager.h │ │ │ ├── pm_client.h │ │ │ ├── private_server.cpp │ │ │ ├── profiling.cpp │ │ │ ├── queuing_rw_mutex.cpp │ │ │ ├── rml_base.h │ │ │ ├── rml_tbb.cpp │ │ │ ├── rml_tbb.h │ │ │ ├── rml_thread_monitor.h │ │ │ ├── rtm_mutex.cpp │ │ │ ├── rtm_rw_mutex.cpp │ │ │ ├── scheduler_common.h │ │ │ ├── 
semaphore.cpp │ │ │ ├── semaphore.h │ │ │ ├── small_object_pool.cpp │ │ │ ├── small_object_pool_impl.h │ │ │ ├── task.cpp │ │ │ ├── task_dispatcher.cpp │ │ │ ├── task_dispatcher.h │ │ │ ├── task_group_context.cpp │ │ │ ├── task_stream.h │ │ │ ├── tbb.rc │ │ │ ├── tcm.h │ │ │ ├── tcm_adaptor.cpp │ │ │ ├── tcm_adaptor.h │ │ │ ├── thread_control_monitor.h │ │ │ ├── thread_data.h │ │ │ ├── thread_dispatcher.cpp │ │ │ ├── thread_dispatcher.h │ │ │ ├── thread_dispatcher_client.h │ │ │ ├── thread_request_serializer.cpp │ │ │ ├── thread_request_serializer.h │ │ │ ├── threading_control.cpp │ │ │ ├── threading_control.h │ │ │ ├── threading_control_client.h │ │ │ ├── tls.h │ │ │ ├── tools_api/ │ │ │ │ ├── disable_warnings.h │ │ │ │ ├── ittnotify.h │ │ │ │ ├── ittnotify_config.h │ │ │ │ ├── ittnotify_static.c │ │ │ │ ├── ittnotify_static.h │ │ │ │ ├── ittnotify_types.h │ │ │ │ └── legacy/ │ │ │ │ └── ittnotify.h │ │ │ ├── version.cpp │ │ │ └── waiters.h │ │ ├── tbbbind/ │ │ │ ├── CMakeLists.txt │ │ │ ├── def/ │ │ │ │ ├── lin32-tbbbind.def │ │ │ │ ├── lin64-tbbbind.def │ │ │ │ ├── mac64-tbbbind.def │ │ │ │ ├── win32-tbbbind.def │ │ │ │ └── win64-tbbbind.def │ │ │ ├── tbb_bind.cpp │ │ │ └── tbb_bind.rc │ │ ├── tbbmalloc/ │ │ │ ├── CMakeLists.txt │ │ │ ├── Customize.h │ │ │ ├── MapMemory.h │ │ │ ├── Statistics.h │ │ │ ├── Synchronize.h │ │ │ ├── TypeDefinitions.h │ │ │ ├── backend.cpp │ │ │ ├── backend.h │ │ │ ├── backref.cpp │ │ │ ├── def/ │ │ │ │ ├── lin32-tbbmalloc.def │ │ │ │ ├── lin64-tbbmalloc.def │ │ │ │ ├── mac64-tbbmalloc.def │ │ │ │ ├── win32-tbbmalloc.def │ │ │ │ └── win64-tbbmalloc.def │ │ │ ├── frontend.cpp │ │ │ ├── large_objects.cpp │ │ │ ├── large_objects.h │ │ │ ├── shared_utils.h │ │ │ ├── tbbmalloc.cpp │ │ │ ├── tbbmalloc.rc │ │ │ ├── tbbmalloc_internal.h │ │ │ └── tbbmalloc_internal_api.h │ │ └── tbbmalloc_proxy/ │ │ ├── CMakeLists.txt │ │ ├── def/ │ │ │ ├── lin32-proxy.def │ │ │ └── lin64-proxy.def │ │ ├── function_replacement.cpp │ │ ├── 
function_replacement.h │ │ ├── proxy.cpp │ │ ├── proxy.h │ │ ├── proxy_overload_osx.h │ │ └── tbbmalloc_proxy.rc │ ├── test/ │ │ ├── CMakeLists.txt │ │ ├── common/ │ │ │ ├── allocator_overload.h │ │ │ ├── allocator_stl_test_common.h │ │ │ ├── allocator_test_common.h │ │ │ ├── checktype.h │ │ │ ├── common_arena_constraints.h │ │ │ ├── concepts_common.h │ │ │ ├── concurrency_tracker.h │ │ │ ├── concurrent_associative_common.h │ │ │ ├── concurrent_lru_cache_common.h │ │ │ ├── concurrent_ordered_common.h │ │ │ ├── concurrent_priority_queue_common.h │ │ │ ├── concurrent_unordered_common.h │ │ │ ├── config.h │ │ │ ├── container_move_support.h │ │ │ ├── containers_common.h │ │ │ ├── cpu_usertime.h │ │ │ ├── custom_allocators.h │ │ │ ├── doctest.h │ │ │ ├── dummy_body.h │ │ │ ├── exception_handling.h │ │ │ ├── fp_control.h │ │ │ ├── graph_utils.h │ │ │ ├── initializer_list_support.h │ │ │ ├── inject_scheduler.h │ │ │ ├── iterator.h │ │ │ ├── memory_usage.h │ │ │ ├── node_handling_support.h │ │ │ ├── parallel_for_each_common.h │ │ │ ├── parallel_invoke_common.h │ │ │ ├── parallel_reduce_common.h │ │ │ ├── range_based_for_support.h │ │ │ ├── rwm_upgrade_downgrade.h │ │ │ ├── spin_barrier.h │ │ │ ├── state_trackable.h │ │ │ ├── test.h │ │ │ ├── test_comparisons.h │ │ │ ├── test_follows_and_precedes_api.h │ │ │ ├── test_invoke.h │ │ │ ├── test_join_node_multiple_predecessors.h │ │ │ ├── tls_limit.h │ │ │ ├── utils.h │ │ │ ├── utils_assert.h │ │ │ ├── utils_concurrency_limit.h │ │ │ ├── utils_dynamic_libs.h │ │ │ ├── utils_env.h │ │ │ ├── utils_report.h │ │ │ ├── utils_yield.h │ │ │ └── vector_types.h │ │ ├── conformance/ │ │ │ ├── conformance_allocators.cpp │ │ │ ├── conformance_arena_constraints.cpp │ │ │ ├── conformance_async_node.cpp │ │ │ ├── conformance_blocked_nd_range.cpp │ │ │ ├── conformance_blocked_range.cpp │ │ │ ├── conformance_blocked_range2d.cpp │ │ │ ├── conformance_blocked_range3d.cpp │ │ │ ├── conformance_blocked_rangeNd.cpp │ │ │ ├── 
conformance_broadcast_node.cpp │ │ │ ├── conformance_buffer_node.cpp │ │ │ ├── conformance_collaborative_call_once.cpp │ │ │ ├── conformance_combinable.cpp │ │ │ ├── conformance_composite_node.cpp │ │ │ ├── conformance_concurrent_hash_map.cpp │ │ │ ├── conformance_concurrent_lru_cache.cpp │ │ │ ├── conformance_concurrent_map.cpp │ │ │ ├── conformance_concurrent_priority_queue.cpp │ │ │ ├── conformance_concurrent_queue.cpp │ │ │ ├── conformance_concurrent_set.cpp │ │ │ ├── conformance_concurrent_unordered_map.cpp │ │ │ ├── conformance_concurrent_unordered_set.cpp │ │ │ ├── conformance_concurrent_vector.cpp │ │ │ ├── conformance_continue_node.cpp │ │ │ ├── conformance_enumerable_thread_specific.cpp │ │ │ ├── conformance_flowgraph.h │ │ │ ├── conformance_function_node.cpp │ │ │ ├── conformance_global_control.cpp │ │ │ ├── conformance_graph.cpp │ │ │ ├── conformance_indexer_node.cpp │ │ │ ├── conformance_input_node.cpp │ │ │ ├── conformance_join_node.cpp │ │ │ ├── conformance_limiter_node.cpp │ │ │ ├── conformance_multifunction_node.cpp │ │ │ ├── conformance_mutex.cpp │ │ │ ├── conformance_mutex.h │ │ │ ├── conformance_overwrite_node.cpp │ │ │ ├── conformance_parallel_for.cpp │ │ │ ├── conformance_parallel_for_each.cpp │ │ │ ├── conformance_parallel_invoke.cpp │ │ │ ├── conformance_parallel_pipeline.cpp │ │ │ ├── conformance_parallel_reduce.cpp │ │ │ ├── conformance_parallel_scan.cpp │ │ │ ├── conformance_parallel_sort.cpp │ │ │ ├── conformance_priority_queue_node.cpp │ │ │ ├── conformance_queue_node.cpp │ │ │ ├── conformance_resumable_tasks.cpp │ │ │ ├── conformance_sequencer_node.cpp │ │ │ ├── conformance_split_node.cpp │ │ │ ├── conformance_task_arena.cpp │ │ │ ├── conformance_task_group.cpp │ │ │ ├── conformance_task_group_context.cpp │ │ │ ├── conformance_tick_count.cpp │ │ │ ├── conformance_version.cpp │ │ │ └── conformance_write_once_node.cpp │ │ ├── tbb/ │ │ │ ├── test_adaptive_mutex.cpp │ │ │ ├── test_allocators.cpp │ │ │ ├── test_arena_constraints.cpp │ │ │ 
├── test_arena_priorities.cpp │ │ │ ├── test_async_node.cpp │ │ │ ├── test_blocked_range.cpp │ │ │ ├── test_broadcast_node.cpp │ │ │ ├── test_buffer_node.cpp │ │ │ ├── test_buffering_try_put_and_wait.h │ │ │ ├── test_collaborative_call_once.cpp │ │ │ ├── test_composite_node.cpp │ │ │ ├── test_concurrent_hash_map.cpp │ │ │ ├── test_concurrent_lru_cache.cpp │ │ │ ├── test_concurrent_map.cpp │ │ │ ├── test_concurrent_monitor.cpp │ │ │ ├── test_concurrent_priority_queue.cpp │ │ │ ├── test_concurrent_queue.cpp │ │ │ ├── test_concurrent_queue_whitebox.cpp │ │ │ ├── test_concurrent_set.cpp │ │ │ ├── test_concurrent_unordered_map.cpp │ │ │ ├── test_concurrent_unordered_set.cpp │ │ │ ├── test_concurrent_vector.cpp │ │ │ ├── test_continue_node.cpp │ │ │ ├── test_dynamic_link.cpp │ │ │ ├── test_eh_algorithms.cpp │ │ │ ├── test_eh_flow_graph.cpp │ │ │ ├── test_eh_thread.cpp │ │ │ ├── test_enumerable_thread_specific.cpp │ │ │ ├── test_environment_whitebox.cpp │ │ │ ├── test_flow_graph.cpp │ │ │ ├── test_flow_graph_priorities.cpp │ │ │ ├── test_flow_graph_whitebox.cpp │ │ │ ├── test_function_node.cpp │ │ │ ├── test_fuzzing.cpp │ │ │ ├── test_global_control.cpp │ │ │ ├── test_handle_perror.cpp │ │ │ ├── test_hw_concurrency.cpp │ │ │ ├── test_implicit_linkage_on_windows.cpp │ │ │ ├── test_indexer_node.cpp │ │ │ ├── test_input_node.cpp │ │ │ ├── test_intrusive_list.cpp │ │ │ ├── test_join_node.cpp │ │ │ ├── test_join_node.h │ │ │ ├── test_join_node_key_matching.cpp │ │ │ ├── test_join_node_key_matching_n_args.cpp │ │ │ ├── test_join_node_msg_key_matching.cpp │ │ │ ├── test_join_node_msg_key_matching_n_args.cpp │ │ │ ├── test_join_node_preview.cpp │ │ │ ├── test_limiter_node.cpp │ │ │ ├── test_multifunction_node.cpp │ │ │ ├── test_mutex.cpp │ │ │ ├── test_mutex.h │ │ │ ├── test_numa_dist.cpp │ │ │ ├── test_openmp.cpp │ │ │ ├── test_overwrite_node.cpp │ │ │ ├── test_parallel_for.cpp │ │ │ ├── test_parallel_for_each.cpp │ │ │ ├── test_parallel_invoke.cpp │ │ │ ├── 
test_parallel_phase.cpp │ │ │ ├── test_parallel_pipeline.cpp │ │ │ ├── test_parallel_reduce.cpp │ │ │ ├── test_parallel_scan.cpp │ │ │ ├── test_parallel_sort.cpp │ │ │ ├── test_partitioner.cpp │ │ │ ├── test_partitioner.h │ │ │ ├── test_priority_queue_node.cpp │ │ │ ├── test_profiling.cpp │ │ │ ├── test_queue_node.cpp │ │ │ ├── test_resumable_tasks.cpp │ │ │ ├── test_scheduler_mix.cpp │ │ │ ├── test_semaphore.cpp │ │ │ ├── test_sequencer_node.cpp │ │ │ ├── test_split_node.cpp │ │ │ ├── test_tagged_msg.cpp │ │ │ ├── test_task.cpp │ │ │ ├── test_task_arena.cpp │ │ │ ├── test_task_group.cpp │ │ │ ├── test_tbb_fork.cpp │ │ │ ├── test_tbb_header.cpp │ │ │ ├── test_tbb_header_secondary.cpp │ │ │ ├── test_tick_count.cpp │ │ │ └── test_write_once_node.cpp │ │ └── tbbmalloc/ │ │ ├── test_malloc_atexit.cpp │ │ ├── test_malloc_compliance.cpp │ │ ├── test_malloc_init_shutdown.cpp │ │ ├── test_malloc_lib_unload.cpp │ │ ├── test_malloc_new_handler.cpp │ │ ├── test_malloc_overload.cpp │ │ ├── test_malloc_overload_disable.cpp │ │ ├── test_malloc_pools.cpp │ │ ├── test_malloc_pure_c.c │ │ ├── test_malloc_regression.cpp │ │ ├── test_malloc_shutdown_hang.cpp │ │ ├── test_malloc_used_by_lib.cpp │ │ ├── test_malloc_whitebox.cpp │ │ └── test_scalable_allocator.cpp │ └── third-party-programs.txt ├── xxhash/ │ ├── .gitattributes │ ├── .github/ │ │ ├── dependabot.yml │ │ └── workflows/ │ │ ├── ci.yml │ │ └── scorecard.yml │ ├── .gitignore │ ├── CHANGELOG │ ├── Doxyfile │ ├── Doxyfile-internal │ ├── LICENSE │ ├── Makefile │ ├── README.md │ ├── SECURITY.md │ ├── appveyor.yml │ ├── cli/ │ │ ├── .tipi/ │ │ │ ├── deps │ │ │ └── opts │ │ ├── COPYING │ │ ├── README.md │ │ ├── xsum_arch.c │ │ ├── xsum_arch.h │ │ ├── xsum_bench.c │ │ ├── xsum_bench.h │ │ ├── xsum_config.h │ │ ├── xsum_os_specific.c │ │ ├── xsum_os_specific.h │ │ ├── xsum_output.c │ │ ├── xsum_output.h │ │ ├── xsum_sanity_check.c │ │ ├── xsum_sanity_check.h │ │ ├── xxhsum.1 │ │ ├── xxhsum.1.md │ │ └── xxhsum.c │ ├── clib.json │ ├── 
cmake_unofficial/ │ │ ├── .gitignore │ │ ├── CMakeLists.txt │ │ ├── JoinPaths.cmake │ │ ├── README.md │ │ └── xxHashConfig.cmake.in │ ├── doc/ │ │ ├── README.md │ │ ├── xxhash.cry │ │ └── xxhash_spec.md │ ├── fuzz/ │ │ └── fuzzer.c │ ├── libxxhash.pc.in │ ├── tests/ │ │ ├── Makefile │ │ ├── cli-comment-line.sh │ │ ├── cli-ignore-missing.sh │ │ ├── collisions/ │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── Makefile │ │ │ ├── README.md │ │ │ ├── allcodecs/ │ │ │ │ ├── README.md │ │ │ │ ├── dummy.c │ │ │ │ └── dummy.h │ │ │ ├── hashes.h │ │ │ ├── main.c │ │ │ ├── pool.c │ │ │ ├── pool.h │ │ │ ├── sort.cc │ │ │ ├── sort.hh │ │ │ ├── threading.c │ │ │ └── threading.h │ │ ├── filename-escape.sh │ │ ├── generate_unicode_test.c │ │ ├── multiInclude.c │ │ ├── ppc_define.c │ │ ├── sanity_test.c │ │ ├── sanity_test_vectors.h │ │ ├── sanity_test_vectors_generator.c │ │ └── unicode_lint.sh │ ├── xxh3.h │ ├── xxh_x86dispatch.c │ ├── xxh_x86dispatch.h │ ├── xxhash.c │ └── xxhash.h ├── zlib/ │ ├── CMakeFiles/ │ │ ├── zlib.dir/ │ │ │ └── build.make │ │ └── zlibstatic.dir/ │ │ └── build.make │ ├── CMakeLists.txt │ ├── ChangeLog │ ├── FAQ │ ├── INDEX │ ├── LICENSE │ ├── Makefile │ ├── Makefile.in │ ├── README │ ├── adler32.c │ ├── amiga/ │ │ ├── Makefile.pup │ │ └── Makefile.sas │ ├── compress.c │ ├── configure │ ├── contrib/ │ │ ├── README.contrib │ │ ├── ada/ │ │ │ ├── buffer_demo.adb │ │ │ ├── mtest.adb │ │ │ ├── read.adb │ │ │ ├── readme.txt │ │ │ ├── test.adb │ │ │ ├── zlib-streams.adb │ │ │ ├── zlib-streams.ads │ │ │ ├── zlib-thin.adb │ │ │ ├── zlib-thin.ads │ │ │ ├── zlib.adb │ │ │ ├── zlib.ads │ │ │ └── zlib.gpr │ │ ├── blast/ │ │ │ ├── Makefile │ │ │ ├── README │ │ │ ├── blast.c │ │ │ ├── blast.h │ │ │ ├── test.pk │ │ │ └── test.txt │ │ ├── delphi/ │ │ │ ├── ZLib.pas │ │ │ ├── ZLibConst.pas │ │ │ ├── readme.txt │ │ │ └── zlibd32.mak │ │ ├── dotzlib/ │ │ │ ├── DotZLib/ │ │ │ │ ├── AssemblyInfo.cs │ │ │ │ ├── ChecksumImpl.cs │ │ │ │ ├── CircularBuffer.cs │ │ │ │ ├── 
CodecBase.cs │ │ │ │ ├── Deflater.cs │ │ │ │ ├── DotZLib.cs │ │ │ │ ├── DotZLib.csproj │ │ │ │ ├── GZipStream.cs │ │ │ │ ├── Inflater.cs │ │ │ │ └── UnitTests.cs │ │ │ ├── DotZLib.build │ │ │ ├── DotZLib.chm │ │ │ ├── DotZLib.sln │ │ │ ├── LICENSE_1_0.txt │ │ │ └── readme.txt │ │ ├── gcc_gvmat64/ │ │ │ └── gvmat64.S │ │ ├── infback9/ │ │ │ ├── README │ │ │ ├── infback9.c │ │ │ ├── infback9.h │ │ │ ├── inffix9.h │ │ │ ├── inflate9.h │ │ │ ├── inftree9.c │ │ │ └── inftree9.h │ │ ├── iostream/ │ │ │ ├── test.cpp │ │ │ ├── zfstream.cpp │ │ │ └── zfstream.h │ │ ├── iostream2/ │ │ │ ├── zstream.h │ │ │ └── zstream_test.cpp │ │ ├── iostream3/ │ │ │ ├── README │ │ │ ├── TODO │ │ │ ├── test.cc │ │ │ ├── zfstream.cc │ │ │ └── zfstream.h │ │ ├── minizip/ │ │ │ ├── Makefile │ │ │ ├── Makefile.am │ │ │ ├── MiniZip64_Changes.txt │ │ │ ├── MiniZip64_info.txt │ │ │ ├── configure.ac │ │ │ ├── crypt.h │ │ │ ├── ioapi.c │ │ │ ├── ioapi.h │ │ │ ├── iowin32.c │ │ │ ├── iowin32.h │ │ │ ├── make_vms.com │ │ │ ├── miniunz.c │ │ │ ├── miniunzip.1 │ │ │ ├── minizip.1 │ │ │ ├── minizip.c │ │ │ ├── minizip.pc.in │ │ │ ├── mztools.c │ │ │ ├── mztools.h │ │ │ ├── unzip.c │ │ │ ├── unzip.h │ │ │ ├── zip.c │ │ │ └── zip.h │ │ ├── nuget/ │ │ │ ├── nuget.csproj │ │ │ └── nuget.sln │ │ ├── pascal/ │ │ │ ├── example.pas │ │ │ ├── readme.txt │ │ │ ├── zlibd32.mak │ │ │ └── zlibpas.pas │ │ ├── puff/ │ │ │ ├── Makefile │ │ │ ├── README │ │ │ ├── puff.c │ │ │ ├── puff.h │ │ │ ├── pufftest.c │ │ │ └── zeros.raw │ │ ├── testzlib/ │ │ │ ├── testzlib.c │ │ │ └── testzlib.txt │ │ ├── untgz/ │ │ │ ├── Makefile │ │ │ ├── Makefile.msc │ │ │ └── untgz.c │ │ └── vstudio/ │ │ ├── readme.txt │ │ ├── vc10/ │ │ │ ├── miniunz.vcxproj │ │ │ ├── miniunz.vcxproj.filters │ │ │ ├── minizip.vcxproj │ │ │ ├── minizip.vcxproj.filters │ │ │ ├── testzlib.vcxproj │ │ │ ├── testzlib.vcxproj.filters │ │ │ ├── testzlibdll.vcxproj │ │ │ ├── testzlibdll.vcxproj.filters │ │ │ ├── zlib.rc │ │ │ ├── zlibstat.vcxproj │ │ │ ├── 
zlibstat.vcxproj.filters │ │ │ ├── zlibvc.def │ │ │ ├── zlibvc.sln │ │ │ ├── zlibvc.vcxproj │ │ │ └── zlibvc.vcxproj.filters │ │ ├── vc11/ │ │ │ ├── miniunz.vcxproj │ │ │ ├── minizip.vcxproj │ │ │ ├── testzlib.vcxproj │ │ │ ├── testzlibdll.vcxproj │ │ │ ├── zlib.rc │ │ │ ├── zlibstat.vcxproj │ │ │ ├── zlibvc.def │ │ │ ├── zlibvc.sln │ │ │ └── zlibvc.vcxproj │ │ ├── vc12/ │ │ │ ├── miniunz.vcxproj │ │ │ ├── minizip.vcxproj │ │ │ ├── testzlib.vcxproj │ │ │ ├── testzlibdll.vcxproj │ │ │ ├── zlib.rc │ │ │ ├── zlibstat.vcxproj │ │ │ ├── zlibvc.def │ │ │ ├── zlibvc.sln │ │ │ └── zlibvc.vcxproj │ │ ├── vc14/ │ │ │ ├── miniunz.vcxproj │ │ │ ├── minizip.vcxproj │ │ │ ├── testzlib.vcxproj │ │ │ ├── testzlibdll.vcxproj │ │ │ ├── zlib.rc │ │ │ ├── zlibstat.vcxproj │ │ │ ├── zlibvc.def │ │ │ ├── zlibvc.sln │ │ │ └── zlibvc.vcxproj │ │ ├── vc17/ │ │ │ ├── miniunz.vcxproj │ │ │ ├── minizip.vcxproj │ │ │ ├── testzlib.vcxproj │ │ │ ├── testzlibdll.vcxproj │ │ │ ├── zlib.rc │ │ │ ├── zlibstat.vcxproj │ │ │ ├── zlibvc.def │ │ │ ├── zlibvc.sln │ │ │ └── zlibvc.vcxproj │ │ └── vc9/ │ │ ├── miniunz.vcproj │ │ ├── minizip.vcproj │ │ ├── testzlib.vcproj │ │ ├── testzlibdll.vcproj │ │ ├── zlib.rc │ │ ├── zlibstat.vcproj │ │ ├── zlibvc.def │ │ ├── zlibvc.sln │ │ └── zlibvc.vcproj │ ├── crc32.c │ ├── crc32.h │ ├── deflate.c │ ├── deflate.h │ ├── doc/ │ │ ├── algorithm.txt │ │ ├── rfc1950.txt │ │ ├── rfc1951.txt │ │ ├── rfc1952.txt │ │ └── txtvsbin.txt │ ├── examples/ │ │ ├── README.examples │ │ ├── enough.c │ │ ├── fitblk.c │ │ ├── gun.c │ │ ├── gzappend.c │ │ ├── gzjoin.c │ │ ├── gzlog.c │ │ ├── gzlog.h │ │ ├── gznorm.c │ │ ├── zlib_how.html │ │ ├── zpipe.c │ │ ├── zran.c │ │ └── zran.h │ ├── gzclose.c │ ├── gzguts.h │ ├── gzlib.c │ ├── gzread.c │ ├── gzwrite.c │ ├── infback.c │ ├── inffast.c │ ├── inffast.h │ ├── inffixed.h │ ├── inflate.c │ ├── inflate.h │ ├── inftrees.c │ ├── inftrees.h │ ├── make_vms.com │ ├── msdos/ │ │ ├── Makefile.bor │ │ ├── Makefile.dj2 │ │ ├── Makefile.emx │ │ ├── 
Makefile.msc │ │ └── Makefile.tc │ ├── nintendods/ │ │ ├── Makefile │ │ └── README │ ├── old/ │ │ ├── Makefile.emx │ │ ├── Makefile.riscos │ │ ├── README │ │ ├── descrip.mms │ │ ├── os2/ │ │ │ ├── Makefile.os2 │ │ │ └── zlib.def │ │ └── visual-basic.txt │ ├── os400/ │ │ ├── README400 │ │ ├── bndsrc │ │ ├── make.sh │ │ └── zlib.inc │ ├── qnx/ │ │ └── package.qpg │ ├── test/ │ │ ├── example.c │ │ ├── infcover.c │ │ └── minigzip.c │ ├── treebuild.xml │ ├── trees.c │ ├── trees.h │ ├── uncompr.c │ ├── watcom/ │ │ ├── watcom_f.mak │ │ └── watcom_l.mak │ ├── win32/ │ │ ├── DLL_FAQ.txt │ │ ├── Makefile.bor │ │ ├── Makefile.gcc │ │ ├── Makefile.msc │ │ ├── README-WIN32.txt │ │ ├── VisualC.txt │ │ ├── zlib.def │ │ └── zlib1.rc │ ├── zconf.h.cmakein │ ├── zconf.h.in │ ├── zconf.h.included │ ├── zlib.3 │ ├── zlib.h │ ├── zlib.pc.cmakein │ ├── zlib.pc.in │ ├── zutil.c │ └── zutil.h └── zstd/ ├── .buckconfig ├── .buckversion ├── .cirrus.yml ├── .gitattributes ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.md │ │ └── feature_request.md │ ├── dependabot.yml │ └── workflows/ │ ├── android-ndk-build.yml │ ├── commit.yml │ ├── dev-long-tests.yml │ ├── dev-short-tests.yml │ ├── nightly.yml │ ├── publish-release-artifacts.yml │ ├── scorecards.yml │ └── windows-artifacts.yml ├── .gitignore ├── CHANGELOG ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── COPYING ├── LICENSE ├── Makefile ├── Package.swift ├── README.md ├── SECURITY.md ├── TESTING.md ├── contrib/ │ ├── VS2005/ │ │ ├── README.md │ │ ├── fullbench/ │ │ │ └── fullbench.vcproj │ │ ├── fuzzer/ │ │ │ └── fuzzer.vcproj │ │ ├── zstd/ │ │ │ └── zstd.vcproj │ │ ├── zstd.sln │ │ └── zstdlib/ │ │ └── zstdlib.vcproj │ ├── cleanTabs │ ├── diagnose_corruption/ │ │ ├── .gitignore │ │ ├── Makefile │ │ └── check_flipped_bits.c │ ├── docker/ │ │ ├── Dockerfile │ │ └── README.md │ ├── freestanding_lib/ │ │ └── freestanding.py │ ├── linux-kernel/ │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── README.md │ │ ├── btrfs-benchmark.sh │ │ ├── 
btrfs-extract-benchmark.sh │ │ ├── decompress_sources.h │ │ ├── linux.mk │ │ ├── linux_zstd.h │ │ ├── mem.h │ │ ├── squashfs-benchmark.sh │ │ ├── test/ │ │ │ ├── Makefile │ │ │ ├── include/ │ │ │ │ └── linux/ │ │ │ │ ├── compiler.h │ │ │ │ ├── errno.h │ │ │ │ ├── kernel.h │ │ │ │ ├── limits.h │ │ │ │ ├── math64.h │ │ │ │ ├── module.h │ │ │ │ ├── printk.h │ │ │ │ ├── stddef.h │ │ │ │ ├── swab.h │ │ │ │ ├── types.h │ │ │ │ ├── unaligned.h │ │ │ │ └── xxhash.h │ │ │ ├── macro-test.sh │ │ │ ├── static_test.c │ │ │ └── test.c │ │ ├── zstd_common_module.c │ │ ├── zstd_compress_module.c │ │ ├── zstd_decompress_module.c │ │ └── zstd_deps.h │ ├── match_finders/ │ │ ├── README.md │ │ ├── zstd_edist.c │ │ └── zstd_edist.h │ ├── premake/ │ │ ├── premake4.lua │ │ └── zstd.lua │ ├── recovery/ │ │ ├── Makefile │ │ └── recover_directory.c │ ├── seekable_format/ │ │ ├── README.md │ │ ├── examples/ │ │ │ ├── .gitignore │ │ │ ├── Makefile │ │ │ ├── parallel_compression.c │ │ │ ├── parallel_processing.c │ │ │ ├── seekable_compression.c │ │ │ ├── seekable_decompression.c │ │ │ └── seekable_decompression_mem.c │ │ ├── tests/ │ │ │ ├── .gitignore │ │ │ ├── Makefile │ │ │ └── seekable_tests.c │ │ ├── zstd_seekable.h │ │ ├── zstd_seekable_compression_format.md │ │ ├── zstdseek_compress.c │ │ └── zstdseek_decompress.c │ ├── seqBench/ │ │ ├── Makefile │ │ └── seqBench.c │ └── snap/ │ └── snapcraft.yaml ├── doc/ │ ├── README.md │ ├── decompressor_errata.md │ ├── decompressor_permissive.md │ ├── educational_decoder/ │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── README.md │ │ ├── harness.c │ │ ├── zstd_decompress.c │ │ └── zstd_decompress.h │ ├── zstd_compression_format.md │ └── zstd_manual.html ├── examples/ │ ├── .gitignore │ ├── Makefile │ ├── README.md │ ├── common.h │ ├── dictionary_compression.c │ ├── dictionary_decompression.c │ ├── multiple_simple_compression.c │ ├── multiple_streaming_compression.c │ ├── simple_compression.c │ ├── simple_decompression.c │ ├── streaming_compression.c │ 
├── streaming_compression_thread_pool.c │ ├── streaming_decompression.c │ └── streaming_memory_usage.c ├── lib/ │ ├── .gitignore │ ├── Makefile │ ├── README.md │ ├── common/ │ │ ├── allocations.h │ │ ├── bits.h │ │ ├── bitstream.h │ │ ├── compiler.h │ │ ├── cpu.h │ │ ├── debug.c │ │ ├── debug.h │ │ ├── entropy_common.c │ │ ├── error_private.c │ │ ├── error_private.h │ │ ├── fse.h │ │ ├── fse_decompress.c │ │ ├── huf.h │ │ ├── mem.h │ │ ├── pool.c │ │ ├── pool.h │ │ ├── portability_macros.h │ │ ├── threading.c │ │ ├── threading.h │ │ ├── xxhash.c │ │ ├── xxhash.h │ │ ├── zstd_common.c │ │ ├── zstd_deps.h │ │ ├── zstd_internal.h │ │ └── zstd_trace.h │ ├── compress/ │ │ ├── clevels.h │ │ ├── fse_compress.c │ │ ├── hist.c │ │ ├── hist.h │ │ ├── huf_compress.c │ │ ├── zstd_compress.c │ │ ├── zstd_compress_internal.h │ │ ├── zstd_compress_literals.c │ │ ├── zstd_compress_literals.h │ │ ├── zstd_compress_sequences.c │ │ ├── zstd_compress_sequences.h │ │ ├── zstd_compress_superblock.c │ │ ├── zstd_compress_superblock.h │ │ ├── zstd_cwksp.h │ │ ├── zstd_double_fast.c │ │ ├── zstd_double_fast.h │ │ ├── zstd_fast.c │ │ ├── zstd_fast.h │ │ ├── zstd_lazy.c │ │ ├── zstd_lazy.h │ │ ├── zstd_ldm.c │ │ ├── zstd_ldm.h │ │ ├── zstd_ldm_geartab.h │ │ ├── zstd_opt.c │ │ ├── zstd_opt.h │ │ ├── zstd_preSplit.c │ │ ├── zstd_preSplit.h │ │ ├── zstdmt_compress.c │ │ └── zstdmt_compress.h │ ├── decompress/ │ │ ├── huf_decompress.c │ │ ├── huf_decompress_amd64.S │ │ ├── zstd_ddict.c │ │ ├── zstd_ddict.h │ │ ├── zstd_decompress.c │ │ ├── zstd_decompress_block.c │ │ ├── zstd_decompress_block.h │ │ └── zstd_decompress_internal.h │ ├── deprecated/ │ │ ├── zbuff.h │ │ ├── zbuff_common.c │ │ ├── zbuff_compress.c │ │ └── zbuff_decompress.c │ ├── dictBuilder/ │ │ ├── cover.c │ │ ├── cover.h │ │ ├── divsufsort.c │ │ ├── divsufsort.h │ │ ├── fastcover.c │ │ └── zdict.c │ ├── dll/ │ │ └── example/ │ │ ├── Makefile │ │ ├── README.md │ │ ├── build_package.bat │ │ ├── fullbench-dll.sln │ │ └── 
fullbench-dll.vcxproj │ ├── legacy/ │ │ ├── zstd_legacy.h │ │ ├── zstd_v01.c │ │ ├── zstd_v01.h │ │ ├── zstd_v02.c │ │ ├── zstd_v02.h │ │ ├── zstd_v03.c │ │ ├── zstd_v03.h │ │ ├── zstd_v04.c │ │ ├── zstd_v04.h │ │ ├── zstd_v05.c │ │ ├── zstd_v05.h │ │ ├── zstd_v06.c │ │ ├── zstd_v06.h │ │ ├── zstd_v07.c │ │ └── zstd_v07.h │ ├── libzstd.mk │ ├── libzstd.pc.in │ ├── module.modulemap │ ├── zdict.h │ ├── zstd.h │ └── zstd_errors.h ├── tests/ │ ├── .gitignore │ ├── DEPRECATED-test-zstd-speed.py │ ├── Makefile │ ├── README.md │ ├── automated_benchmarking.py │ ├── bigdict.c │ ├── checkTag.c │ ├── check_size.py │ ├── cli-tests/ │ │ ├── .gitignore │ │ ├── README.md │ │ ├── basic/ │ │ │ ├── args.sh │ │ │ ├── args.sh.exit │ │ │ ├── args.sh.stderr.glob │ │ │ ├── help.sh │ │ │ ├── help.sh.stdout.glob │ │ │ ├── memlimit.sh │ │ │ ├── memlimit.sh.stderr.exact │ │ │ ├── memlimit.sh.stdout.exact │ │ │ ├── output_dir.sh │ │ │ ├── output_dir.sh.stderr.exact │ │ │ ├── output_dir.sh.stdout.exact │ │ │ ├── version.sh │ │ │ └── version.sh.stdout.glob │ │ ├── bin/ │ │ │ ├── cmp_size │ │ │ ├── datagen │ │ │ ├── die │ │ │ ├── println │ │ │ ├── zstd │ │ │ ├── zstdgrep │ │ │ └── zstdless │ │ ├── cltools/ │ │ │ ├── setup │ │ │ ├── zstdgrep.sh │ │ │ ├── zstdgrep.sh.exit │ │ │ ├── zstdgrep.sh.stderr.exact │ │ │ ├── zstdgrep.sh.stdout.glob │ │ │ ├── zstdless.sh │ │ │ ├── zstdless.sh.stderr.exact │ │ │ └── zstdless.sh.stdout.glob │ │ ├── common/ │ │ │ ├── format.sh │ │ │ ├── mtime.sh │ │ │ ├── permissions.sh │ │ │ └── platform.sh │ │ ├── compression/ │ │ │ ├── adapt.sh │ │ │ ├── basic.sh │ │ │ ├── compress-literals.sh │ │ │ ├── format.sh │ │ │ ├── golden.sh │ │ │ ├── gzip-compat.sh │ │ │ ├── levels.sh │ │ │ ├── levels.sh.stderr.exact │ │ │ ├── long-distance-matcher.sh │ │ │ ├── multi-threaded.sh │ │ │ ├── multi-threaded.sh.stderr.exact │ │ │ ├── multiple-files.sh │ │ │ ├── multiple-files.sh.stdout.exact │ │ │ ├── row-match-finder.sh │ │ │ ├── setup │ │ │ ├── stream-size.sh │ │ │ ├── verbose-wlog.sh 
│ │ │ ├── verbose-wlog.sh.stderr.glob │ │ │ ├── verbose-wlog.sh.stdout.glob │ │ │ ├── window-resize.sh │ │ │ ├── window-resize.sh.stderr.ignore │ │ │ └── window-resize.sh.stdout.glob │ │ ├── decompression/ │ │ │ ├── detectErrors.sh │ │ │ ├── golden.sh │ │ │ ├── pass-through.sh │ │ │ ├── pass-through.sh.stderr.exact │ │ │ └── pass-through.sh.stdout.exact │ │ ├── dict-builder/ │ │ │ ├── empty-input.sh │ │ │ ├── empty-input.sh.stderr.exact │ │ │ ├── no-inputs.sh │ │ │ ├── no-inputs.sh.exit │ │ │ └── no-inputs.sh.stderr.exact │ │ ├── dictionaries/ │ │ │ ├── dictionary-mismatch.sh │ │ │ ├── dictionary-mismatch.sh.stderr.exact │ │ │ ├── golden.sh │ │ │ ├── setup │ │ │ └── setup_once │ │ ├── file-handling/ │ │ │ ├── directory-mirror.sh │ │ │ ├── directory-mirror.sh.stderr.exact │ │ │ └── directory-mirror.sh.stdout.exact │ │ ├── file-stat/ │ │ │ ├── compress-file-to-dir-without-write-perm.sh │ │ │ ├── compress-file-to-dir-without-write-perm.sh.stderr.exact │ │ │ ├── compress-file-to-file.sh │ │ │ ├── compress-file-to-file.sh.stderr.exact │ │ │ ├── compress-file-to-stdout.sh │ │ │ ├── compress-file-to-stdout.sh.stderr.exact │ │ │ ├── compress-stdin-to-file.sh │ │ │ ├── compress-stdin-to-file.sh.stderr.exact │ │ │ ├── compress-stdin-to-stdout.sh │ │ │ ├── compress-stdin-to-stdout.sh.stderr.exact │ │ │ ├── decompress-file-to-file.sh │ │ │ ├── decompress-file-to-file.sh.stderr.exact │ │ │ ├── decompress-file-to-stdout.sh │ │ │ ├── decompress-file-to-stdout.sh.stderr.exact │ │ │ ├── decompress-stdin-to-file.sh │ │ │ ├── decompress-stdin-to-file.sh.stderr.exact │ │ │ ├── decompress-stdin-to-stdout.sh │ │ │ └── decompress-stdin-to-stdout.sh.stderr.exact │ │ ├── progress/ │ │ │ ├── no-progress.sh │ │ │ ├── no-progress.sh.stderr.glob │ │ │ ├── progress.sh │ │ │ └── progress.sh.stderr.glob │ │ ├── run.py │ │ └── zstd-symlinks/ │ │ ├── setup │ │ ├── zstdcat.sh │ │ └── zstdcat.sh.stdout.exact │ ├── datagencli.c │ ├── decodecorpus.c │ ├── dict-files/ │ │ └── zero-weight-dict │ ├── 
external_matchfinder.c │ ├── external_matchfinder.h │ ├── fullbench.c │ ├── fuzz/ │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── README.md │ │ ├── block_decompress.c │ │ ├── block_round_trip.c │ │ ├── decompress_cross_format.c │ │ ├── decompress_dstSize_tooSmall.c │ │ ├── dictionary_decompress.c │ │ ├── dictionary_loader.c │ │ ├── dictionary_round_trip.c │ │ ├── dictionary_stream_round_trip.c │ │ ├── fse_read_ncount.c │ │ ├── fuzz.h │ │ ├── fuzz.py │ │ ├── fuzz_data_producer.c │ │ ├── fuzz_data_producer.h │ │ ├── fuzz_helpers.c │ │ ├── fuzz_helpers.h │ │ ├── fuzz_third_party_seq_prod.h │ │ ├── generate_sequences.c │ │ ├── huf_decompress.c │ │ ├── huf_round_trip.c │ │ ├── raw_dictionary_round_trip.c │ │ ├── regression_driver.c │ │ ├── seekable_roundtrip.c │ │ ├── seq_prod_fuzz_example/ │ │ │ ├── Makefile │ │ │ ├── README.md │ │ │ └── example_seq_prod.c │ │ ├── sequence_compression_api.c │ │ ├── simple_compress.c │ │ ├── simple_decompress.c │ │ ├── simple_round_trip.c │ │ ├── stream_decompress.c │ │ ├── stream_round_trip.c │ │ ├── zstd_frame_info.c │ │ ├── zstd_helpers.c │ │ └── zstd_helpers.h │ ├── fuzzer.c │ ├── golden-compression/ │ │ ├── PR-3517-block-splitter-corruption-test │ │ ├── http │ │ ├── huffman-compressed-larger │ │ └── large-literal-and-match-lengths │ ├── golden-decompression-errors/ │ │ ├── .gitignore │ │ ├── off0.bin.zst │ │ ├── truncated_huff_state.zst │ │ └── zeroSeq_extraneous.zst │ ├── golden-dictionaries/ │ │ └── http-dict-missing-symbols │ ├── gzip/ │ │ ├── Makefile │ │ ├── gzip-env.sh │ │ ├── helin-segv.sh │ │ ├── help-version.sh │ │ ├── hufts.sh │ │ ├── init.cfg │ │ ├── init.sh │ │ ├── keep.sh │ │ ├── list.sh │ │ ├── memcpy-abuse.sh │ │ ├── mixed.sh │ │ ├── null-suffix-clobber.sh │ │ ├── stdin.sh │ │ ├── test-driver.sh │ │ ├── trailing-nul.sh │ │ ├── unpack-invalid.sh │ │ ├── z-suffix.sh │ │ ├── zdiff.sh │ │ ├── zgrep-context.sh │ │ ├── zgrep-f.sh │ │ ├── zgrep-signal.sh │ │ └── znew-k.sh │ ├── invalidDictionaries.c │ ├── largeDictionary.c │ 
├── legacy.c │ ├── libzstd_builds.sh │ ├── longmatch.c │ ├── loremOut.c │ ├── loremOut.h │ ├── paramgrill.c │ ├── playTests.sh │ ├── poolTests.c │ ├── rateLimiter.py │ ├── regression/ │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── README.md │ │ ├── config.c │ │ ├── config.h │ │ ├── data.c │ │ ├── data.h │ │ ├── levels.h │ │ ├── method.c │ │ ├── method.h │ │ ├── result.c │ │ ├── result.h │ │ ├── results.csv │ │ └── test.c │ ├── roundTripCrash.c │ ├── seqgen.c │ ├── seqgen.h │ ├── test-license.py │ ├── test-variants.sh │ ├── test-zstd-versions.py │ └── zstreamtest.c └── zlibWrapper/ ├── .gitignore ├── Makefile ├── README.md ├── examples/ │ ├── example.c │ ├── example_original.c │ ├── fitblk.c │ ├── fitblk_original.c │ ├── minigzip.c │ └── zwrapbench.c ├── gzclose.c ├── gzcompatibility.h ├── gzguts.h ├── gzlib.c ├── gzread.c ├── gzwrite.c ├── zstd_zlibwrapper.c └── zstd_zlibwrapper.h ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/FUNDING.yml ================================================ # These are supported funding model platforms github: [rui314] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] patreon: # Replace with a single Patreon username open_collective: # Replace with a single Open Collective username ko_fi: # Replace with a single Ko-fi username tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry liberapay: # Replace with a single Liberapay username issuehunt: # Replace with a single IssueHunt username otechie: # Replace with a single Otechie username lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] ================================================ FILE: 
.github/workflows/build-all.yml ================================================ name: Build all tarballs on: schedule: - cron: '0 0 * * *' workflow_dispatch: jobs: build-linux: strategy: matrix: include: - { target: x86_64, os: ubuntu-24.04 } - { target: aarch64, os: ubuntu-24.04-arm } - { target: arm, os: ubuntu-24.04-arm } - { target: riscv64, os: ubuntu-24.04 } - { target: ppc64le, os: ubuntu-24.04 } - { target: s390x, os: ubuntu-24.04 } - { target: loongarch64, os: ubuntu-24.04 } runs-on: ${{ matrix.os }} permissions: contents: read packages: write steps: - name: Checkout Repository uses: actions/checkout@v4 - name: Install Podman run: sudo apt-get update && sudo apt-get install -y podman qemu-user-static - name: Login to GitHub Container Registry uses: redhat-actions/podman-login@v1 with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: Build a tarball run: ./dist.sh ${{ matrix.target }} - name: Upload artifact uses: actions/upload-artifact@v4 with: name: ${{ matrix.target }} path: dist/mold-*.tar.gz compression-level: 0 build-windows: runs-on: windows-latest steps: - name: Checkout Repository uses: actions/checkout@v4 - name: Build and Archive shell: pwsh run: | mkdir build cd build cmake -T clangcl .. cmake --build . --config Release -j $Env:NUMBER_OF_PROCESSORS cmake --install . --config Release --prefix ../mold-install cd .. 
New-Item -ItemType Directory -Force dist | Out-Null $version = $Env:TAG -replace '^v', '' Compress-Archive -Path mold-install\* -DestinationPath dist\mold-$version-windows-x86_64.zip - name: Upload artifact uses: actions/upload-artifact@v4 with: name: win-x86_64 path: dist/mold-*.* ================================================ FILE: .github/workflows/build-native.yml ================================================ name: Build native tarballs on: push: workflow_dispatch: jobs: build-tarballs: strategy: matrix: include: - { target: x86_64, os: ubuntu-24.04 } - { target: aarch64, os: ubuntu-24.04-arm } - { target: arm, os: ubuntu-24.04-arm } runs-on: ${{ matrix.os }} permissions: contents: read packages: write steps: - name: Checkout Repository uses: actions/checkout@v4 - name: Install Podman run: sudo apt-get update && sudo apt-get install -y podman qemu-user-static - name: Login to GitHub Container Registry uses: redhat-actions/podman-login@v1 with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: Build a tarball run: ./dist.sh ${{ matrix.target }} - name: Upload artifact uses: actions/upload-artifact@v4 with: name: ${{ matrix.target }} path: dist/mold-*.tar.gz compression-level: 0 ================================================ FILE: .github/workflows/ci.yml ================================================ name: CI on: push: pull_request: env: UBSAN_OPTIONS: print_stacktrace=1:halt_on_error=1 jobs: build-asan-tsan: strategy: matrix: target: - '' - '-DMOLD_USE_ASAN=On' - '-DMOLD_USE_TSAN=On' runs-on: ubuntu-24.04 timeout-minutes: 60 steps: - uses: actions/checkout@v4 - uses: rui314/setup-mold@staging - run: sudo ./install-build-deps.sh - name: build run: | sudo apt-get install -y clang-18 clang gcc-multilib gdb dwarfdump zstd mkdir build cd build cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_COMPILER=clang-18 -DCMAKE_CXX_COMPILER=clang++-18 ${{ matrix.target }} .. cmake --build . 
-j$(nproc) - run: cd build; ctest --output-on-failure -j$(nproc) - name: archive test results uses: actions/upload-artifact@v4 if: failure() with: name: test-results-clang path: | build !build/CMakeFiles build-msan: runs-on: ubuntu-24.04 timeout-minutes: 60 steps: - uses: actions/checkout@v4 - uses: redhat-actions/podman-login@v1 with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - run: .github/workflows/run-msan.sh build-multi-archs: runs-on: ubuntu-24.04 timeout-minutes: 60 steps: - uses: actions/checkout@v4 - name: install-build-deps run: | sudo ./install-build-deps.sh sudo ./install-cross-tools.sh sudo .github/workflows/install-extras.sh - name: build run: | mkdir build cd build cmake .. cmake --build . -j$(nproc) - run: cd build; ctest --output-on-failure -j$(nproc) - name: archive test results uses: actions/upload-artifact@v4 if: failure() with: name: test-results-multi-archs path: | build !build/CMakeFiles build-distros: strategy: matrix: distro: - alpine - archlinux - debian:11 # GCC 10 and CMake 3.18 - the minimum supported by mold - fedora - gentoo/stage3 - opensuse/tumbleweed - ubuntu:22.04 - ubuntu:25.04 runs-on: ubuntu-latest timeout-minutes: 60 container: ${{ matrix.distro }} steps: - uses: actions/checkout@v4 - run: ./install-build-deps.sh - name: build run: | mkdir build cd build cmake .. cmake --build . -j$(nproc) - run: cd build; ctest --output-on-failure -j$(nproc) - name: archive test results uses: actions/upload-artifact@v4 if: failure() with: name: test-results-${{ matrix.distro }} path: | build !build/CMakeFiles build-macos: runs-on: macos-latest timeout-minutes: 60 steps: - uses: actions/checkout@v4 - name: build run: | mkdir build cd build cmake .. cmake --build . -j$(sysctl -n hw.physicalcpu) build-windows: runs-on: windows-latest timeout-minutes: 60 steps: - uses: actions/checkout@v4 - name: build run: | mkdir build cd build cmake -T clangcl .. cmake --build . 
-j $Env:NUMBER_OF_PROCESSORS build-msys: runs-on: windows-latest timeout-minutes: 60 steps: - uses: actions/checkout@v4 - name: Setup MSYS2 uses: msys2/setup-msys2@v2 with: msystem: UCRT64 update: true pacboy: gcc-libs:p libwinpthread-git:p tbb:p zlib:p zstd:p dlfcn:p cc:p cmake:p ninja:p - name: build shell: msys2 {0} run: | mkdir build cd build cmake -GNinja -DMOLD_USE_MIMALLOC=OFF -DMOLD_USE_SYSTEM_TBB=ON .. cmake --build . -j $(nproc) build-freebsd: runs-on: ubuntu-latest timeout-minutes: 60 steps: - uses: actions/checkout@v4 - name: Build and test uses: vmactions/freebsd-vm@v1 with: usesh: true run: | ./install-build-deps.sh mkdir build cd build cmake .. cmake --build . -j$(nproc) ctest --output-on-failure -j$(nproc) ================================================ FILE: .github/workflows/install-extras.sh ================================================ #!/bin/bash -x apt-get update apt-get install -y wget xz-utils # Install a 32-bit RISC-V toolchain mkdir /rv32 wget -O- --progress=dot:mega https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2025.06.13/riscv32-glibc-ubuntu-24.04-gcc-nightly-2025.06.13-nightly.tar.xz | tar -C /rv32 --strip-components=1 --xz -xf - ln -sf /rv32/sysroot /usr/riscv32-linux-gnu echo '/rv32/bin/riscv32-unknown-linux-gnu-gcc -L/usr/riscv32-linux-gnu "$@"' > /usr/bin/riscv32-linux-gnu-gcc echo '/rv32/bin/riscv32-unknown-linux-gnu-g++ -L/usr/riscv32-linux-gnu "$@"' > /usr/bin/riscv32-linux-gnu-g++ chmod 755 /usr/bin/riscv32-linux-gnu-{gcc,g++} for i in objdump objcopy strip; do ln -sf /rv32/bin/riscv32-unknown-linux-gnu-$i /usr/bin/riscv32-linux-gnu-$i done # Install a LoongArch toolchain mkdir /larch wget -O- --progress=dot:mega https://github.com/loongson/build-tools/releases/download/2025.06.06/x86_64-cross-tools-loongarch64-binutils_2.44-gcc_15.1.0-glibc_2.41.tar.xz | tar -C /larch --strip-components=1 --xz -xf - cp -r /larch/loongarch64-unknown-linux-gnu/lib/* /larch/target/lib64 ln -sf /larch/target 
/usr/loongarch64-linux-gnu
for i in gcc g++ objdump objcopy strip; do
  ln -sf /larch/bin/loongarch64-unknown-linux-gnu-$i /usr/bin/loongarch64-linux-gnu-$i
done

wget -O /usr/local/bin/qemu-loongarch64 --progress=dot:mega https://github.com/loongson/build-tools/releases/download/2025.06.06/qemu-loongarch64
chmod 755 /usr/local/bin/qemu-loongarch64

# Install ARM32 big-endian toolchain
mkdir /armeb
wget -O- --progress=dot:mega https://toolchains.bootlin.com/downloads/releases/toolchains/armebv7-eabihf/tarballs/armebv7-eabihf--glibc--stable-2024.05-1.tar.xz | tar -C /armeb --strip-components=1 --xz -xf -
ln -sf /armeb/armeb-buildroot-linux-gnueabihf/sysroot /usr/armeb-linux-gnueabihf
# The wrappers' -L path names the sysroot symlink created on the previous
# line, the same way the ARM64 big-endian wrappers below do.
echo '/armeb/bin/armeb-linux-gcc -L/usr/armeb-linux-gnueabihf "$@"' > /usr/bin/armeb-linux-gnueabihf-gcc
echo '/armeb/bin/armeb-linux-g++ -L/usr/armeb-linux-gnueabihf "$@"' > /usr/bin/armeb-linux-gnueabihf-g++
chmod 755 /usr/bin/armeb-linux-gnueabihf-{gcc,g++}

for i in objdump objcopy strip; do
  ln -sf /armeb/bin/armeb-linux-$i /usr/bin/armeb-linux-gnueabihf-$i
done

# Install ARM64 big-endian toolchain
mkdir /aarch64be
wget -O- --progress=dot:mega https://toolchains.bootlin.com/downloads/releases/toolchains/aarch64be/tarballs/aarch64be--glibc--stable-2024.05-1.tar.xz | tar -C /aarch64be --strip-components=1 --xz -xf -
ln -sf /aarch64be/aarch64_be-buildroot-linux-gnu/sysroot /usr/aarch64_be-linux-gnu
echo '/aarch64be/bin/aarch64_be-linux-gcc -L/usr/aarch64_be-linux-gnu "$@"' > /usr/bin/aarch64_be-linux-gnu-gcc
echo '/aarch64be/bin/aarch64_be-linux-g++ -L/usr/aarch64_be-linux-gnu "$@"' > /usr/bin/aarch64_be-linux-gnu-g++
chmod 755 /usr/bin/aarch64_be-linux-gnu-{gcc,g++}

for i in objdump objcopy strip; do
  ln -sf /aarch64be/bin/aarch64_be-linux-$i /usr/bin/aarch64_be-linux-gnu-$i
done

# Install SH4 big-endian toolchain
mkdir /sh4aeb
wget -O- --progress=dot:mega https://toolchains.bootlin.com/downloads/releases/toolchains/sh-sh4aeb/tarballs/sh-sh4aeb--glibc--stable-2024.05-1.tar.xz | tar -C /sh4aeb
--strip-components=1 --xz -xf - ln -sf /sh4aeb/sh4aeb-buildroot-linux-gnu/sysroot /usr/sh4aeb-linux-gnu echo '/sh4aeb/bin/sh4aeb-linux-gcc -L/usr/sh4aeb-linux-gnu "$@"' > /usr/bin/sh4aeb-linux-gnu-gcc echo '/sh4aeb/bin/sh4aeb-linux-g++ -L/usr/sh4aeb-linux-gnu "$@"' > /usr/bin/sh4aeb-linux-gnu-g++ chmod 755 /usr/bin/sh4aeb-linux-gnu-{gcc,g++} for i in objdump objcopy strip; do ln -sf /sh4aeb/bin/sh4aeb-linux-$i /usr/bin/sh4aeb-linux-gnu-$i done # Install Intel SDE CPU emulator for CET-related tests mkdir /sde wget -O- --progress=dot:mega https://downloadmirror.intel.com/850782/sde-external-9.53.0-2025-03-16-lin.tar.xz | tar -C /sde --strip-components=1 --xz -xf - ln -s /sde/sde64 /usr/bin ================================================ FILE: .github/workflows/release-assets.yml ================================================ name: Build & attach tarballs on tag push on: push: tags: ['v[0-9]*'] permissions: contents: write packages: write id-token: write env: TAG: ${{ github.ref_name }} jobs: build-linux: strategy: matrix: include: - { target: x86_64, os: ubuntu-24.04 } - { target: aarch64, os: ubuntu-24.04-arm } - { target: arm, os: ubuntu-24.04-arm } - { target: riscv64, os: ubuntu-24.04 } - { target: ppc64le, os: ubuntu-24.04 } - { target: s390x, os: ubuntu-24.04 } - { target: loongarch64, os: ubuntu-24.04 } runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 - name: Install Podman run: sudo apt-get update && sudo apt-get install -y podman qemu-user-static - name: Login to GitHub Container Registry uses: redhat-actions/podman-login@v1 with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - run: ./dist.sh ${{ matrix.target }} - uses: actions/upload-artifact@v4 with: name: ${{ matrix.target }} path: dist/mold-*.tar.gz build-windows: runs-on: windows-latest steps: - uses: actions/checkout@v4 - name: Build and Archive shell: pwsh run: | mkdir build cd build cmake -T clangcl .. cmake --build . 
--config Release -j $Env:NUMBER_OF_PROCESSORS cmake --install . --config Release --prefix ../mold-install cd .. New-Item -ItemType Directory -Force dist | Out-Null $version = $Env:TAG -replace '^v', '' Compress-Archive -Path mold-install\* -DestinationPath dist\mold-$version-x86_64-windows.zip - uses: actions/upload-artifact@v4 with: name: win-x86_64 path: dist/mold-*.* publish: needs: [build-linux, build-windows] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: actions/download-artifact@v4 with: path: dist pattern: '*' merge-multiple: true - name: Ensure release exists (create if missing) env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | set -e if gh release view "$TAG" >& /dev/null; then echo "Existing release found for $TAG" else echo "Creating draft release for $TAG" gh release create "$TAG" --draft \ --title "$TAG" \ --notes "Automated draft – fill the changelog later." fi - name: Upload tarballs env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | gh release upload "$TAG" dist/* --clobber gh release view "$TAG" --json assets --jq '.assets[].name' ================================================ FILE: .github/workflows/run-msan.sh ================================================ #!/bin/bash set -e -x cd "$(dirname $0)"/../.. if [ "$GITHUB_REPOSITORY" = '' ]; then image=mold-msan image_build="podman build -t $image -" else # If this script is running on GitHub Actions, we want to cache # the created container image in GitHub's container repository. image=ghcr.io/$GITHUB_REPOSITORY/mold-msan image_build="podman build -t $image --output=type=registry --layers --cache-to $image --cache-from $image -" fi cat < # # Specifies the C compiler name to use. The default value is `cc`. # # -DCMAKE_CXX_COMPILER=<command-name> # # Specifies the C++ compiler name to use. The default value is `c++`. # # -DCMAKE_INSTALL_PREFIX=<directory> # # Specifies the install target directory. The default value is `/usr/local`.
# # -DCMAKE_BUILD_TYPE=[Debug | Release | RelWithDebInfo | MinSizeRel] # # Specifies the build type. The default is `Release`, which is the right # option unless you are debugging mold. # # An example of a cmake command line is shown below: # # $ cmake -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_CXX_COMPILER=g++-12 .. # # where `..` refers to this directory. # # With cmake, you may run `cmake --install .` instead of `make install` to # install build artifacts to system directories. If you want to install # artifacts to a temporary target directory, run `cmake --install . --prefix # <dir>`. # # You can see the current cmake variables and their values by running # `cmake -N -L .` in a build directory. # # Note that in this file, we provide various dials and knobs to configure # how to build mold. However, as a policy, we do not provide a way to # enable/disable any individual feature of mold. In other words, we do not # provide options like `--enable-foo` or `--disable-foo`. The motivation # behind this is build reproducibility. We want to guarantee that all builds # of the mold linker of the same version will have the exact same set of # features and behave exactly the same.
cmake_minimum_required(VERSION 3.14)
project(mold VERSION 2.40.4)

include(CMakeDependentOption)
include(CheckSymbolExists)
include(GNUInstallDirs)

# The single executable target; everything below configures it.
add_executable(mold)
target_compile_features(mold PRIVATE cxx_std_20)

# Link the dynamic-loading library: `dl` on MinGW, and whatever CMake
# reports in CMAKE_DL_LIBS everywhere else.
if(MINGW)
  target_link_libraries(mold PRIVATE dl)
else()
  target_link_libraries(mold PRIVATE ${CMAKE_DL_LIBS})
endif()

# Build mold itself using mold if -DMOLD_USE_MOLD=ON
option(MOLD_USE_MOLD "Use mold to build mold" OFF)
if(MOLD_USE_MOLD)
  target_link_options(mold PRIVATE -fuse-ld=mold)

  # "Deb" matches both Debug and RelWithDebInfo.
  if(CMAKE_BUILD_TYPE MATCHES "Deb")
    target_link_options(mold PRIVATE -Wl,--gdb-index)
  endif()

  # "^Rel" matches Release and RelWithDebInfo.
  if(CMAKE_BUILD_TYPE MATCHES "^Rel")
    target_link_options(mold PRIVATE -Wl,--gc-sections,--icf=safe)
  endif()
endif()

# GCC/Clang-style flags; skipped when the compiler frontend is MSVC-like.
if(NOT "${CMAKE_CXX_COMPILER_FRONTEND_VARIANT}" STREQUAL "MSVC")
  target_compile_options(mold PRIVATE
    -fno-exceptions
    -fno-unwind-tables
    -fno-asynchronous-unwind-tables
    -ffunction-sections
    -fdata-sections
    -Wall
    -Wextra
    -Wno-sign-compare
    -Wno-unused-function
    -Wno-unused-parameter
    -Wno-missing-field-initializers
    -ggnu-pubnames)
endif()

# Define _GLIBCXX_ASSERTIONS for Debug builds.
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
  target_compile_options(mold PRIVATE -D_GLIBCXX_ASSERTIONS)
endif()

# Define OPENBSD so that later conditions can test for it.
if(CMAKE_SYSTEM_NAME STREQUAL "OpenBSD")
  set(OPENBSD ON)
endif()

# Build mold with -flto if -DMOLD_LTO=ON
option(MOLD_LTO "Build mold with link-time optimization enabled" OFF)
if(MOLD_LTO)
  set_property(TARGET mold PROPERTY INTERPROCEDURAL_OPTIMIZATION ON)
endif()

# Enable AddressSanitizer (together with UndefinedBehaviorSanitizer)
# if -DMOLD_USE_ASAN=ON
option(MOLD_USE_ASAN "Build mold with AddressSanitizer" OFF)
if(MOLD_USE_ASAN)
  target_compile_options(mold PRIVATE -fsanitize=address -fsanitize=undefined)
  target_link_options(mold PRIVATE -fsanitize=address -fsanitize=undefined)
endif()

# Enable ThreadSanitizer if -DMOLD_USE_TSAN=ON
option(MOLD_USE_TSAN "Build mold with ThreadSanitizer" OFF)
if(MOLD_USE_TSAN)
  target_compile_options(mold PRIVATE -fsanitize=thread)
  target_link_options(mold PRIVATE -fsanitize=thread)
endif()

# Enable MemorySanitizer if -DMOLD_USE_MSAN=ON
option(MOLD_USE_MSAN "Build mold with MemorySanitizer" OFF)
if(MOLD_USE_MSAN)
  # The C++ standard library headers and libc++ are taken from
  # MOLD_STDLIB_PREFIX instead of the compiler defaults (-nostdinc++ /
  # -nostdlib++ below).
  set(MOLD_STDLIB_PREFIX /usr CACHE FILEPATH "prefix of instrumented stdlib")

  target_compile_options(mold PRIVATE
    -g
    -Og
    -fsanitize=memory
    -fsanitize-memory-track-origins
    -nostdinc++
    -isystem ${MOLD_STDLIB_PREFIX}/include/c++/v1
    -DENABLE_MSAN_UNPOISON
  )

  target_link_options(mold PRIVATE
    -fsanitize=memory
    -nostdlib++
    -L ${MOLD_STDLIB_PREFIX}/lib
    -lc++
    -Wl,-rpath,${MOLD_STDLIB_PREFIX}/lib
  )
endif()

# Statically-link libstdc++ if -DMOLD_MOSTLY_STATIC=ON.
#
# This option is intended to be used by `./dist.sh` script to create a
# mold binary that works on various Linux distros. You probably don't
# need nor want to set this to ON.
option(MOLD_MOSTLY_STATIC "Statically link libstdc++ and some other libraries" OFF)
if(MOLD_MOSTLY_STATIC)
  target_link_options(mold PRIVATE -static-libstdc++)
endif()

# Find zlib. If libz.so is not found, we compile a bundled one and
# statically-link it to mold.
find_package(ZLIB QUIET)
if(ZLIB_FOUND AND NOT MOLD_MOSTLY_STATIC)
  target_link_libraries(mold PRIVATE ZLIB::ZLIB)
else()
  set(ZLIB_BUILD_EXAMPLES OFF CACHE INTERNAL "")
  add_subdirectory(third-party/zlib EXCLUDE_FROM_ALL)

  # Expose both the zlib source directory and the zlibstatic target's
  # binary directory as include paths to consumers of zlibstatic.
  # NOTE(review): presumably the binary directory holds a generated
  # zconf.h - confirm against third-party/zlib.
  target_include_directories(zlibstatic INTERFACE
    third-party/zlib
    $<TARGET_PROPERTY:zlibstatic,BINARY_DIR>)
  target_link_libraries(mold PRIVATE zlibstatic)
endif()

# Find BLAKE3 cryptographic hash library. Just like zlib, if libblake3.so
# is not found, we compile a bundled one and statically-link it to mold.
find_package(BLAKE3 QUIET)
if(BLAKE3_FOUND AND NOT MOLD_MOSTLY_STATIC)
  target_link_libraries(mold PRIVATE BLAKE3::blake3)
else()
  # Wrapped in a function so that the BUILD_SHARED_LIBS override is set in
  # the function's own variable scope rather than in the caller's.
  function(mold_add_blake3)
    set(BUILD_SHARED_LIBS OFF)
    add_subdirectory(third-party/blake3/c EXCLUDE_FROM_ALL)
    target_link_libraries(mold PRIVATE blake3)
    target_include_directories(mold PUBLIC third-party/blake3/c)
  endfunction()

  mold_add_blake3()
endif()

# Find zstd compression library. If zstd.h is not found, we compile a
# bundled one and statically-link it to mold.
include(CheckIncludeFile) check_include_file(zstd.h HAVE_ZSTD_H) if(HAVE_ZSTD_H AND NOT MOLD_MOSTLY_STATIC) target_link_libraries(mold PRIVATE zstd) else() set(ZSTD_BUILD_PROGRAMS OFF) set(ZSTD_BUILD_CONTRIB OFF) set(ZSTD_BUILD_TESTS OFF) set(ZSTD_MULTITHREAD_SUPPORT OFF) set(ZSTD_BUILD_SHARED OFF) set(ZSTD_BUILD_STATIC ON) add_subdirectory(third-party/zstd/build/cmake EXCLUDE_FROM_ALL) target_include_directories(mold PUBLIC third-party/zstd/lib) target_link_libraries(mold PRIVATE libzstd_static) endif() # Find mimalloc. mimalloc is an alternative malloc implementation # optimized for multi-threaded applications. # # If you want to use the usual libc's malloc, pass -DMOLD_USE_MIMALLOC=OFF. # # We enable mimalloc by default for 64-bit targets. It doesn't seem to # be stable on 32-bit targets. cmake_dependent_option( MOLD_USE_MIMALLOC "Use mimalloc" ON "CMAKE_SIZEOF_VOID_P EQUAL 8; NOT APPLE; NOT ANDROID; NOT OPENBSD; NOT MOLD_USE_ASAN; NOT MOLD_USE_TSAN" OFF) cmake_dependent_option( MOLD_USE_SYSTEM_MIMALLOC "Use system or vendored mimalloc" OFF MOLD_USE_MIMALLOC OFF) # By default, we build a bundled mimalloc and statically-link it to # mold. If you want to dynamically link to the system's # libmimalloc.so, pass -DMOLD_USE_SYSTEM_MIMALLOC=ON. if(MOLD_USE_MIMALLOC) if(MOLD_USE_SYSTEM_MIMALLOC) find_package(mimalloc REQUIRED) target_link_libraries(mold PRIVATE mimalloc) else() function(mold_add_mimalloc) set(MI_BUILD_STATIC ON CACHE INTERNAL "") set(MI_BUILD_TESTS OFF CACHE INTERNAL "") set(MI_NO_OPT_ARCH ON CACHE INTERNAL "") add_subdirectory(third-party/mimalloc EXCLUDE_FROM_ALL) target_compile_definitions(mimalloc-static PRIVATE MI_USE_ENVIRON=0) target_link_libraries(mold PRIVATE mimalloc-static) endfunction() mold_add_mimalloc() endif() endif() # Find TBB. TBB (OneTBB or Intel TBB) is a high-level threading library. # Use of this library is mandatory. # # By default, we build a bundled one and statically-link the library # to mold. 
If you want to link to the system's libtbb2.so, pass # -DMOLD_USE_SYSTEM_TBB=ON. option(MOLD_USE_SYSTEM_TBB "Use system or vendored TBB" OFF) if(MOLD_USE_SYSTEM_TBB OR BLAKE3_USE_TBB) find_package(TBB REQUIRED) target_link_libraries(mold PRIVATE TBB::tbb) else() function(mold_add_tbb) set(BUILD_SHARED_LIBS OFF) set(TBB_TEST OFF CACHE INTERNAL "") set(TBB_STRICT OFF CACHE INTERNAL "") add_subdirectory(third-party/tbb EXCLUDE_FROM_ALL) target_compile_definitions(tbb PRIVATE __TBB_DYNAMIC_LOAD_ENABLED=0) target_link_libraries(mold PRIVATE TBB::tbb) endfunction() mold_add_tbb() endif() # We always use Clang to build mold on Windows. MSVC can't compile mold. if(WIN32) if(MSVC AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "Clang") message(FATAL_ERROR "Your compiler is not supported; install Clang from Visual Studio Installer and re-run cmake with '-T clangcl'") endif() target_compile_definitions(mold PRIVATE NOGDI NOMINMAX) if(MINGW) target_compile_definitions(mold PRIVATE _WIN32_WINNT=0xA00) target_link_libraries(mold PRIVATE bcrypt) endif() else() include(CheckLibraryExists) check_library_exists(m pow "" LIBM_FOUND) if(LIBM_FOUND) target_link_libraries(mold PRIVATE m) endif() endif() # Build mold-wrapper.so if(NOT APPLE AND NOT WIN32) add_library(mold-wrapper SHARED) install(TARGETS mold-wrapper DESTINATION ${CMAKE_INSTALL_LIBDIR}/mold) # Remove the default `lib` prefix set_target_properties(mold-wrapper PROPERTIES PREFIX "") target_link_libraries(mold-wrapper PRIVATE ${CMAKE_DL_LIBS}) target_sources(mold-wrapper PRIVATE src/mold-wrapper.c) endif() # If atomics doesn't work by default, add -latomic. # We need the flag on riscv, armv6 and m68k. 
# Probe whether 8/16/32/64-bit std::atomic increments compile and link
# with the default toolchain settings; if not, link against libatomic.
include(CheckCXXSourceCompiles)
check_cxx_source_compiles("#include <atomic>
int main() {
  std::atomic_uint8_t a;
  std::atomic_uint16_t b;
  std::atomic_uint32_t c;
  std::atomic_uint64_t d;
  return ++a + ++b + ++c + ++d;
}" HAVE_FULL_ATOMIC_SUPPORT)

if(NOT HAVE_FULL_ATOMIC_SUPPORT)
  target_link_libraries(mold PRIVATE atomic)
endif()

# Add -pthread
if(NOT APPLE AND NOT MSVC)
  target_compile_options(mold PRIVATE -pthread)
  target_link_options(mold PRIVATE -pthread)
endif()

# Record whether madvise() and uname() are declared by their headers.
# NOTE(review): presumably consumed through the generated config.h - confirm.
check_symbol_exists(madvise sys/mman.h HAVE_MADVISE)
check_symbol_exists(uname sys/utsname.h HAVE_UNAME)

# Create a header file (mold-git-hash.h) containing the current git hash
# for `mold --version`. The header is written by lib/update-git-hash.cmake,
# which is run in script mode (-P) with the source directory and the output
# path passed as -D variables.
add_custom_target(git_hash
  COMMAND ${CMAKE_COMMAND}
    -DSOURCE_DIR=${CMAKE_SOURCE_DIR}
    -DOUTPUT_FILE=${CMAKE_BINARY_DIR}/mold-git-hash.h
    -P ${CMAKE_SOURCE_DIR}/lib/update-git-hash.cmake
  DEPENDS lib/update-git-hash.cmake
  BYPRODUCTS mold-git-hash.h
  VERBATIM)

add_dependencies(mold git_hash)

# Almost all functions in mold are templates that take a target type
# (e.g. X86_64) as their type parameter. Since we support more than 10
# targets, compiling a single source file for all the targets is very
# slow.
#
# As a workaround, we create a .cc file for each target and spawn many
# compiler instances. This is hacky but greatly reduces compile time
# on a multicore machine.
#
# You can build mold for a specific set of targets by passing, e.g.,
# -DMOLD_TARGETS='X86_64;ARM64LE', though this is strongly discouraged
# for build reproducibility. Use this option only if you build mold
# frequently for your personal use; otherwise, always build mold with all
# targets enabled. We provide this flag in the trust that you will not
# abuse it, so please don’t betray that trust.
set(MOLD_TARGETS
  X86_64 I386 ARM64LE ARM64BE ARM32LE ARM32BE RV32LE RV32BE RV64LE RV64BE
  PPC32 PPC64V1 PPC64V2 S390X SPARC64 M68K SH4LE SH4BE LOONGARCH32
  LOONGARCH64
  CACHE STRING "List of supported targets")

# The first entry of the target list.
list(GET MOLD_TARGETS 0 MOLD_FIRST_TARGET)

# Source files that are compiled once per target.
list(APPEND MOLD_TEMPLATE_FILES
  src/arch-arm32.cc
  src/arch-arm64.cc
  src/arch-i386.cc
  src/arch-loongarch.cc
  src/arch-m68k.cc
  src/arch-ppc32.cc
  src/arch-ppc64v1.cc
  src/arch-ppc64v2.cc
  src/arch-riscv.cc
  src/arch-s390x.cc
  src/arch-sh4.cc
  src/arch-sparc64.cc
  src/arch-x86-64.cc
  src/archive-file.cc
  src/cmdline.cc
  src/error.cc
  src/filetype.cc
  src/gc-sections.cc
  src/gdb-index.cc
  src/icf.cc
  src/input-files.cc
  src/input-sections.cc
  src/linker-script.cc
  src/main.cc
  src/mapfile.cc
  src/output-chunks.cc
  src/passes.cc
  src/relocatable.cc
  src/shrink-sections.cc
  src/thunks.cc
  src/tls.cc
  )

# LTO support: the Win32 implementation is used only for non-MinGW Windows
# builds; everything else uses the Unix one.
if(WIN32 AND NOT MINGW)
  list(APPEND MOLD_TEMPLATE_FILES src/lto-win32.cc)
else()
  list(APPEND MOLD_TEMPLATE_FILES src/lto-unix.cc)
endif()

# Platform-specific output-file and subprocess implementations.
if(WIN32)
  list(APPEND MOLD_TEMPLATE_FILES
    src/output-file-win32.cc
    src/subprocess-win32.cc
    )
else()
  list(APPEND MOLD_TEMPLATE_FILES
    src/output-file-unix.cc
    src/subprocess-unix.cc
    )
endif()

# mold_instantiate_templates(SOURCE TARGET)
#
# Adds one per-target translation unit to the `mold` executable.
#
#   SOURCE  Path of a template source file, relative to CMAKE_SOURCE_DIR.
#   TARGET  Target name (an entry of MOLD_TARGETS, e.g. X86_64).
#
# A stub named <build-dir>/<SOURCE>.<TARGET>.cc is written that defines
# MOLD_<TARGET> and MOLD_TARGET and then #includes the real source file.
# Each preprocessor directive must sit on its own line of the stub. The
# stub is only written when it does not exist yet, so re-running cmake
# leaves existing stubs untouched.
function(mold_instantiate_templates SOURCE TARGET)
  set(PATH ${CMAKE_BINARY_DIR}/${SOURCE}.${TARGET}.cc)
  if(NOT EXISTS ${PATH})
    file(WRITE ${PATH} "#define MOLD_${TARGET} 1
#define MOLD_TARGET ${TARGET}
#include \"${CMAKE_SOURCE_DIR}/${SOURCE}\"
")
  endif()
  target_sources(mold PRIVATE ${PATH})
endfunction()

# Instantiate every template source for every enabled target.
foreach (SOURCE IN LISTS MOLD_TEMPLATE_FILES)
  foreach(TARGET IN LISTS MOLD_TARGETS)
    mold_instantiate_templates(${SOURCE} ${TARGET})
    # NOTE(review): HAVE_TARGET_<TARGET> is presumably consumed by the
    # generated config.h - confirm against lib/config.h.in.
    set(HAVE_TARGET_${TARGET} 1)
  endforeach()
endforeach()

# Add other non-template source files.
target_sources(mold PRIVATE lib/aho-corasick.cc lib/compress.cc lib/crc32.cc lib/demangle.cc lib/filepath.cc lib/glob.cc lib/hyperloglog.cc lib/perf.cc lib/random.cc lib/tar.cc src/elf.cc src/entry.cc third-party/rust-demangle/rust-demangle.c ) if(WIN32) target_sources(mold PRIVATE src/jobs-win32.cc src/mapped-file-win32.cc src/signal-win32.cc ) else() target_sources(mold PRIVATE src/jobs-unix.cc src/mapped-file-unix.cc src/signal-unix.cc ) endif() # Create config.h file configure_file(lib/config.h.in config.h) include_directories(${CMAKE_CURRENT_BINARY_DIR}) # Test configs include(CTest) if(BUILD_TESTING) # Create the ld symlinks required for testing if(NOT WIN32) add_custom_command( TARGET mold POST_BUILD COMMAND ${CMAKE_COMMAND} -E create_symlink mold ld BYPRODUCTS ld WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} VERBATIM) endif() if(${UNIX}) add_subdirectory(test) endif() endif() if(NOT CMAKE_SKIP_INSTALL_RULES) install(TARGETS mold RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) install(FILES docs/mold.1 DESTINATION ${CMAKE_INSTALL_MANDIR}/man1/) install(FILES LICENSE DESTINATION ${CMAKE_INSTALL_DOCDIR}) function(mold_install_relative_symlink OLD NEW) install(CODE " get_filename_component(PREFIX_ABS \${CMAKE_INSTALL_PREFIX}/ ABSOLUTE) get_filename_component(OLD_ABS ${OLD} ABSOLUTE BASE_DIR \${PREFIX_ABS}) get_filename_component(NEW_ABS ${NEW} ABSOLUTE BASE_DIR \${PREFIX_ABS}) get_filename_component(NEW_DIR \${NEW_ABS} DIRECTORY) file(RELATIVE_PATH OLD_REL \${NEW_DIR} \${OLD_ABS}) message(STATUS \"Installing symlink: \$ENV{DESTDIR}\${NEW_ABS} -> \${OLD_REL}\") file(MAKE_DIRECTORY \$ENV{DESTDIR}\${NEW_DIR}) file(CREATE_LINK \${OLD_REL} \$ENV{DESTDIR}\${NEW_ABS} SYMBOLIC)") endfunction() if(NOT WIN32) mold_install_relative_symlink(${CMAKE_INSTALL_BINDIR}/mold${CMAKE_EXECUTABLE_SUFFIX} ${CMAKE_INSTALL_LIBEXECDIR}/mold/ld${CMAKE_EXECUTABLE_SUFFIX}) mold_install_relative_symlink(${CMAKE_INSTALL_BINDIR}/mold${CMAKE_EXECUTABLE_SUFFIX} 
${CMAKE_INSTALL_BINDIR}/ld.mold${CMAKE_EXECUTABLE_SUFFIX}) mold_install_relative_symlink(${CMAKE_INSTALL_MANDIR}/man1/mold.1 ${CMAKE_INSTALL_MANDIR}/man1/ld.mold.1) endif() endif() ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2023 Rui Ueyama Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # mold: A Modern Linker mold is a faster drop-in replacement for existing Unix linkers. It is several times quicker than the LLVM lld linker, the second-fastest open-source linker, which I initially developed a few years ago. mold aims to enhance developer productivity by minimizing build time, particularly in rapid debug-edit-rebuild cycles. 
Here is a performance comparison of GNU ld, GNU gold, LLVM lld, and mold when linking final debuginfo-enabled executables for major large programs on a simulated 16-core, 32-thread machine. ![Link speed comparison](docs/chart.svg) | Program (linker output size) | GNU ld | GNU gold | LLVM lld | mold |-------------------------------|--------|----------|----------|------ | MySQL 8.3 (0.47 GiB) | 10.84s | 7.47s | 1.64s | 0.46s | Clang 19 (1.56 GiB) | 42.07s | 33.13s | 5.20s | 1.35s | Chromium 124 (1.35 GiB) | N/A | 27.40s | 6.10s | 1.52s mold is so fast that it is only 2x _slower_ than the `cp` command on the same machine. If you find that mold is not faster than other linkers, feel free to [file a bug report](https://github.com/rui314/mold/issues). mold supports x86-64, i386, ARM64, ARM32, 64-bit/32-bit little/big-endian RISC-V, 32-bit PowerPC, 64-bit big-endian PowerPC ELFv1, 64-bit little-endian PowerPC ELFv2, s390x, 64-bit/32-bit LoongArch, SPARC64, m68k, and SH-4. ## Why does linking speed matter? If you are using a compiled language such as C, C++, or Rust, a build consists of two phases. In the first phase, a compiler compiles source files into object files (`.o` files). In the second phase, a linker takes all object files and combines them into a single executable or shared library file. The second phase can be time-consuming if your build output is large. mold can speed up this process, saving you time and preventing distractions while waiting for a lengthy build to finish. The difference is most noticeable during rapid debug-edit-rebuild cycles. ## Installation Binary packages for the following systems are currently available: [![Packaging status](https://repology.org/badge/vertical-allrepos/mold.svg)](https://repology.org/project/mold/versions) ## How to Build mold is written in C++20, so if you build mold yourself, you will need a recent version of a C++ compiler and a C++ standard library. 
We recommend GCC 10.2 or Clang 16.0.0 (or later) and libstdc++ 10 or libc++ 7 (or later). ### Install Dependencies To install build dependencies, run `./install-build-deps.sh` in this directory. It will detect your Linux distribution and attempt to install the necessary packages. You may need to run it as root. ### Compile mold ```shell git clone --branch stable https://github.com/rui314/mold.git cd mold ./install-build-deps.sh cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=c++ -B build cmake --build build -j$(nproc) sudo cmake --build build --target install ``` You might need to pass a C++20 compiler command name to `cmake`. In the example above, `c++` is passed. If that doesn't work for you, try a specific version of a compiler, such as `g++-10` or `clang++-12`. By default, `mold` is installed to `/usr/local/bin`. You can change the installation location by passing `-DCMAKE_INSTALL_PREFIX=`. For other cmake options, see the comments in `CMakeLists.txt`. If you are not using a recent enough Linux distribution, or if `cmake` does not work for you for any reason, you can use Podman to build mold in a container. To do so, run `./dist.sh` in this directory instead of using `cmake`. The shell script will pull a container image, build mold and auxiliary files inside it, and package them into a single tar file named `dist/mold-$version-$arch-linux.tar.gz`. You can extract the tar file anywhere and use the mold executable in it. ## How to use
A classic way to use mold On Unix, the linker command (usually `/usr/bin/ld`) is indirectly invoked by the compiler driver (typically `cc`, `gcc`, or `clang`), which is in turn indirectly invoked by `make` or other build system commands. If you can specify an additional command line option for your compiler driver by modifying the build system's config files, add one of the following flags to use mold instead of `/usr/bin/ld`: - For Clang: pass `-fuse-ld=mold` - For GCC 12.1.0 or later: pass `-fuse-ld=mold` - For GCC before 12.1.0: the `-fuse-ld` option does not accept `mold` as a valid argument, so you need to use the `-B` option instead. The `-B` option tells GCC where to look for external commands like `ld`. If you have installed mold with `make install`, there should be a directory named `/usr/libexec/mold` (or `/usr/local/libexec/mold`, depending on your `$PREFIX`), and the `ld` command should be there. The `ld` is actually a symlink to `mold`. So, all you need is to pass `-B/usr/libexec/mold` (or `-B/usr/local/libexec/mold`) to GCC. If you haven't installed `ld.mold` to any `$PATH`, you can still pass `-fuse-ld=/absolute/path/to/mold` to clang to use mold. However, GCC does not accept an absolute path as an argument for `-fuse-ld`.
If you are using Rust Create `.cargo/config.toml` in your project directory with the following: ```toml [target.'cfg(target_os = "linux")'] linker = "clang" rustflags = ["-C", "link-arg=-fuse-ld=/path/to/mold"] ``` where `/path/to/mold` is an absolute path to the mold executable. In the example above, we use `clang` as a linker driver since it always accepts the `-fuse-ld` option. If your GCC is recent enough to recognize the option, you may be able to remove the `linker = "clang"` line. ```toml [target.'cfg(target_os = "linux")'] rustflags = ["-C", "link-arg=-fuse-ld=mold"] ``` If you want to use mold for all projects, add the above snippet to `~/.cargo/config.toml`.
If you are using Nim Create `config.nims` in your project directory with the following: ```nim when findExe("mold").len > 0 and defined(linux): switch("passL", "-fuse-ld=mold") ``` where `mold` must be included in the `PATH` environment variable. In this example, `gcc` is used as the linker driver. Use the `-fuse-ld` option if your GCC is recent enough to recognize this option. If you want to use mold for all projects, add the above snippet to `~/.config/config.nims`.
If you are using Conan package manager You can configure [Conan](https://github.com/conan-io) to download the latest version of `mold` and use it as the linker when building your dependencies and projects from source. Please see the instructions [here](https://conan.io/center/recipes/mold).
mold -run It is sometimes very hard to pass an appropriate command line option to `cc` to specify an alternative linker. To address this situation, mold has a feature to intercept all invocations of `ld`, `ld.bfd`, `ld.lld`, or `ld.gold` and redirect them to itself. To use this feature, run `make` (or another build command) as a subcommand of mold as follows: ```shell mold -run make ``` Internally, mold invokes a given command with the `LD_PRELOAD` environment variable set to its companion shared object file. The shared object file intercepts all function calls to `exec(3)`-family functions to replace `argv[0]` with `mold` if it is `ld`, `ld.bfd`, `ld.gold`, or `ld.lld`.
GitHub Actions You can use our [setup-mold](https://github.com/rui314/setup-mold) GitHub Action to speed up GitHub-hosted continuous builds. Although GitHub Actions run on a 4-core machine, mold is still significantly faster than the default GNU linker, especially when linking large programs.
Verify that you are using mold mold leaves its identification string in the `.comment` section of an output file. You can print it out to verify that you are actually using mold. ```shell $ readelf -p .comment /path/to/executable String dump of section '.comment': [ 0] GCC: (Ubuntu 10.2.0-5ubuntu1~20.04) 10.2.0 [ 2b] mold 9a1679b47d9b22012ec7dfbda97c8983956716f7 ``` If `mold` is present in the `.comment` section, the file was created by mold.
Online manual Since mold is a drop-in replacement, you should be able to use it without reading its manual. However, if you need it, [mold's man page](docs/mold.md) is available online. You can read the same manual by running `man mold`.
## Why is mold so fast? One reason is that it utilizes faster algorithms and more efficient data structures compared to other linkers. Another reason is that mold is highly parallelized. Here is a side-by-side comparison of per-core CPU usage for lld (left) and mold (right), linking the same program, a Chromium executable. ![CPU usage comparison in htop animation](docs/htop.gif) As you can see, mold uses all available cores throughout its execution and finishes quickly. In contrast, lld fails to utilize available cores most of the time. In this demo, the maximum parallelism is artificially capped at 16, so that the bars fit in the GIF. For details, please see the [design notes](docs/design.md). ## Sponsors It is taken for granted nowadays that compiler toolchains can be easily installed and used for free, and people may not think too much about the individuals behind these "free tools". mold supports many projects, but it is essentially a one-person project. This situation is similar to the one depicted in the following xkcd illustration. [![xkcd 2347](https://imgs.xkcd.com/comics/dependency.png)](https://xkcd.com/2347) If you think that the "Nebraska guy" should be rewarded, please consider becoming our [GitHub sponsor](https://github.com/sponsors/rui314)! We thank everyone who sponsors our project. In particular, we'd like to acknowledge the following people and organizations who have sponsored $128/month or more: ### Corporate sponsors Mercury Cybozu Emerge Tools
- [G-Research](https://www.gresearch.co.uk) - [Signal Slot Inc.](https://github.com/signal-slot) - [GlareDB](https://github.com/GlareDB) ### Individual sponsors - [Wei Wu](https://github.com/lazyparser) - [kyle-elliott](https://github.com/kyle-elliott) - [Bryant Biggs](https://github.com/bryantbiggs) - [kraptor23](https://github.com/kraptor23) - [Jinkyu Yi](https://github.com/jincreator) - [Pedro Navarro](https://github.com/pedronavf) ================================================ FILE: dist.sh ================================================ #!/bin/bash # # This script creates a mold binary distribution. The output is written to # the `dist` directory as `mold-$version-$arch-linux.tar.gz` (e.g. # `mold-2.40.0-x86_64-linux.tar.gz`). # # This script aims to produce reproducible outputs. That means each time # it's run on the same git commit, it generates a bit-for-bit identical # binary file regardless of when or where it's executed. This property # serves as a strong safeguard against supply chain attacks. With a # reproducible build, anyone can independently verify that the binary # files published on our GitHub release page were built from the git # commit tagged for release by rebuilding the binaries themselves. # # Debian provides snapshot.debian.org to host all historical binary # packages. We use it to construct a container image pinned to a # particular timestamp. snapshot.debian.org is known to be very slow, # but that shouldn't be a big problem for us because we only need that # site the first time. # # The mold executable created by this script is statically linked to # libstdc++, but dynamically linked to glibc, libm and a few other # libraries, as these libraries are almost always available on any Linux # system. We can't statically link glibc because doing so would disable # dlopen(), which is required to load the LTO linker plugin. 
# # We use a reasonably old Debian version for the build environment because # a binary dynamically linked against a newer version of glibc won't work # on a system with an older version of glibc. # # We prefer to build mold with Clang rather than GCC because mold's # Identical Code Folding works best with the LLVM address significance # table (.llvm_addrsig). Building a release binary with GCC produces a # slightly larger binary than with Clang. # # We need a recent version of Clang to build mold. If it's not available # via apt-get, we'll build it ourselves. # # This script can be used to create non-native binaries (e.g., building # aarch64 binary on x86-64) because Podman automatically runs everything # under QEMU if the container image is not native. To use this script for # non-native builds, you may need to install the qemu-user-static package. set -e -x cd "$(dirname $0)" usage() { echo "Usage: $0 [ x86_64 | aarch64 | arm | riscv64 | ppc64le | s390x | loongarch64 ]" exit 1 } case $# in 0) arch=$(uname -m) if [ $arch = arm64 ]; then arch=aarch64 elif [[ $arch = arm* ]]; then arch=arm fi ;; 1) arch="$1" ;; *) usage esac # Create a Podman image. if [ "$GITHUB_REPOSITORY" = '' ]; then image=mold-builder-$arch image_build="podman build --arch $arch -t $image -" else # If this script is running on GitHub Actions, we want to cache # the created container image in GitHub's container repository. image=ghcr.io/$GITHUB_REPOSITORY/mold-builder-$arch image_build="podman build --arch $arch -t $image --output=type=registry --layers --cache-to $image --cache-from $image -" fi case $arch in x86_64) # Debian 9 (Stretch) released in June 2017. # # We use a Google-provided mirror (gcr.io) instead of the official Docker # Hub (docker.io) because docker.io has a strict rate limit policy. # # The toolchain in Debian 9 is too old to build mold, so we rebuild it # from source.
We download source archives from official sites and build # them locally, rather than downloading pre-built binaries from somewhere # else, to avoid relying on unverifiable third-party binary blobs. Podman # caches the result of each RUN command, so rebuilding is done only once # per host. cat < /etc/apt/apt.conf.d/80-retries && \ apt-get update && \ apt-get install -y --no-install-recommends wget file make gcc g++ zlib1g-dev libssl-dev ca-certificates && \ rm -rf /var/lib/apt/lists # Build CMake 3.27 RUN mkdir /build && \ cd /build && \ wget -O- --progress=dot:mega https://cmake.org/files/v3.27/cmake-3.27.7.tar.gz | tar xzf - --strip-components=1 && \ ./bootstrap --parallel=\$(nproc) && \ make -j\$(nproc) && \ make install && \ rm -rf /build # Build GCC 14 RUN mkdir /build && \ cd /build && \ wget -O- --progress=dot:mega https://ftpmirror.gnu.org/gcc/gcc-14.2.0/gcc-14.2.0.tar.gz | tar xzf - --strip-components=1 && \ mkdir gmp mpc mpfr && \ wget -O- --progress=dot:mega https://ftpmirror.gnu.org/gmp/gmp-6.3.0.tar.gz | tar xzf - --strip-components=1 -C gmp && \ wget -O- --progress=dot:mega https://ftpmirror.gnu.org/mpc/mpc-1.3.1.tar.gz | tar xzf - --strip-components=1 -C mpc && \ wget -O- --progress=dot:mega https://ftpmirror.gnu.org/mpfr/mpfr-4.2.1.tar.gz | tar xzf - --strip-components=1 -C mpfr && \ ./configure --prefix=/usr --enable-languages=c,c++ --disable-bootstrap --disable-multilib && \ make -j\$(nproc) && \ make install && \ ln -sf /usr/lib64/libstdc++.so.6 /usr/lib/x86_64-linux-gnu/libstdc++.so.6 && \ rm -rf /build # Build GNU binutils 2.43 RUN mkdir /build && \ cd /build && \ wget -O- --progress=dot:mega https://ftpmirror.gnu.org/binutils/binutils-2.43.tar.gz | tar xzf - --strip-components=1 && \ ./configure --prefix=/usr && \ make -j\$(nproc) && \ make install && \ rm -fr /build # Build Python 3.12.7 RUN mkdir /build && \ cd /build && \ wget -O- --progress=dot:mega https://www.python.org/ftp/python/3.12.7/Python-3.12.7.tgz | tar xzf - --strip-components=1 
&& \ ./configure && \ make -j\$(nproc) && \ make install && \ rm -rf /build # Build LLVM 20 RUN mkdir /build && \ cd /build && \ wget -O- --progress=dot:mega https://github.com/llvm/llvm-project/archive/refs/tags/llvmorg-20.1.3.tar.gz | tar xzf - --strip-components=1 && \ mkdir b && \ cd b && \ cmake -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS=clang ../llvm && \ cmake --build . -j\$(nproc) && \ cmake --install . --strip && \ rm -rf /build EOF ;; aarch64 | arm | ppc64le | s390x) # Debian 11 (Bullseye) released in August 2021 # # We don't want to build Clang for these targets on QEMU because it # would take an extremely long time. Also, I believe old Linux boxes # are typically x86-64. cat < /etc/apt/apt.conf.d/80-retries && \ apt-get update && \ apt-get install -y --no-install-recommends build-essential gcc-10 g++-10 clang-16 cmake && \ ln -sf /usr/bin/clang-16 /usr/bin/clang && \ ln -sf /usr/bin/clang++-16 /usr/bin/clang++ && \ rm -rf /var/lib/apt/lists EOF ;; riscv64) cat < /etc/apt/apt.conf.d/80-retries && \ apt-get update && \ apt-get install -y --no-install-recommends build-essential gcc-14 g++-14 clang-18 cmake && \ ln -sf /usr/bin/clang-18 /usr/bin/clang && \ ln -sf /usr/bin/clang++-18 /usr/bin/clang++ && \ rm -rf /var/lib/apt/lists EOF ;; loongarch64) cat < /etc/apt/apt.conf.d/80-retries && \ apt-get update && \ apt-get install -y --no-install-recommends build-essential gcc-14 g++-14 clang-19 cmake && \ ln -sf /usr/bin/clang-19 /usr/bin/clang && \ ln -sf /usr/bin/clang++-19 /usr/bin/clang++ && \ rm -rf /var/lib/apt/lists EOF ;; *) usage ;; esac version=$(sed -n 's/^project(mold VERSION \(.*\))/\1/p' CMakeLists.txt) dest=mold-$version-$arch-linux # Source tarballs available on GitHub don't contain .git directory. # Clone the repo if missing. [ -d .git ] || git clone --branch v$version --depth 1 --bare https://github.com/rui314/mold .git # We use the timestamp of the last Git commit as the file timestamp # for build artifacts.
timestamp=$(git log -1 --format=%ct) # `uname -m` in an ARM32 container running on an ARM64 host reports it # not as ARM32 but as ARM64. That confuses BLAKE3's cmake script and # erroneously enables NEON SIMD instructions. `setarch` can be used to # change the output of `uname -m`. setarch= [ $arch = arm ] && setarch='setarch linux32' mkdir -p dist # Build mold in a container. # # SOURCE_DATE_EPOCH is a standardized environment variable that allows # build artifacts to appear as if they were built at a specific time. # We use it to control how the compiler expands the C/C++ __DATE__ and # __TIME__ macros. podman run --arch $arch -it --rm --userns=host --pids-limit=-1 --network=none \ --pull=never -v "$(pwd):/mold:ro" -v "$(pwd)/dist:/dist" $image \ $setarch bash -c " set -e export SOURCE_DATE_EPOCH=$timestamp mkdir /build cd /build cmake -DCMAKE_BUILD_TYPE=Release -DMOLD_MOSTLY_STATIC=1 -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ /mold cmake --build . -j\$(nproc) cmake --install . cmake -DMOLD_USE_MOLD=1 . cmake --build . -j\$(nproc) ctest --output-on-failure -j\$(nproc) cmake --install . --prefix $dest --strip find $dest -print | xargs touch --no-dereference --date=@$timestamp find $dest -print | sort | tar -cf - --no-recursion --files-from=- | gzip -9nc > /dist/$dest.tar.gz cp mold /dist sha256sum /dist/$dest.tar.gz " ================================================ FILE: docs/bugs.md ================================================ This is a note about interesting bugs that I met during the development of the mold linker. ## GNU IFUNC Problem: A statically-linked "hello world" program mysteriously crashed in `__libc_start_main` function which is called just after `_start`. Investigation: I opened up gdb and found that the program reads a bogus value from some array. It looks like `memcpy` failed to copy proper data there. 
After some investigation, I noticed that `memcpy` didn't copy data at all but instead returned the address of `__memcpy_avx_unaligned` function, which is a real `memcpy` function optimized for machines with the AVX registers. This odd issue was caused by the GNU IFUNC mechanism. That is, if a function symbol has type `STT_GNU_IFUNC`, the function does not do what its name suggests but instead returns a pointer to a function that does the actual job. In this case, `memcpy` is an IFUNC function, and it returns an address of `__memcpy_avx_unaligned` which is a real `memcpy` function. IFUNC function addresses are stored to `.got` section in an ELF executable. The dynamic loader executes all IFUNC functions at startup to replace their GOT entries with their return values. This mechanism allows programs to choose the best implementation among variants of the same function at runtime based on the machine info. If a program is statically-linked, there's no dynamic loader that rewrites the GOT entries. Therefore, if a program is statically-linked, a libc's startup routine does that on behalf of the dynamic loader. Concretely, a startup routine interprets all dynamic relocations between `__rela_iplt_start` and `__rela_iplt_stop` symbols. It is the linker's responsibility to emit dynamic relocations for IFUNC symbols even if it is linking a statically-linked program and mark the beginning and the ending of a `.rela.dyn` section with the symbols, so that the startup routine can find the relocations. The bug was that my linker didn't define `__rela_iplt_start` and `__rela_iplt_stop` symbols. Since these symbols are weak, they are initialized to zero. From the point of view of the initializer function, there are no dynamic relocations between `__rela_iplt_start` and `__rela_iplt_stop` symbols. That left GOT entries for IFUNC symbols untouched. The proper fix was to emit dynamic relocations for IFUNC symbols and define the linker-synthesized symbols. I did that, and the bug was fixed.
## stdio buffering Problem: A statically-linked "Hello world" program prints out the message if executed as `./hello`, but it doesn't output anything if executed as `./hello | cat`. Investigation: I knew that the default buffering mode for stdout is line buffering (buffer is flushed on every '\n'), but if it is not connected to the terminal (i.e. `isatty(3)` returns 0 on `STDOUT_FILENO`), it automatically switches to full buffering (buffer is flushed when it becomes full). So, it looks like libc failed to flush the stdout on program exit for some reason. I traced all function calls using gdb and noticed that `__libc_atexit` was not called. That function seemed to be responsible for buffer flushing. I don't know how exactly I found the root cause, but after spending an hour or two, I found that `__start___libc_atexit` and `__stop___libc_atexit` have value 0 in my linker's output while they mark a section containing the address of `__libc_atexit` in GNU ld's output. So, libc doesn't directly call `__libc_atexit` but instead calls all function pointers between `__start___libc_atexit` and `__stop___libc_atexit` symbols. libc puts `__libc_atexit` address in `__libc_atexit` section, expecting that the linker automatically creates the start and the end marker symbols for the section. There's an obscure linker feature: if a section name is valid as a C identifier (e.g. `foo` or `_foo_bar` but not `.foo`), the linker automatically creates marker symbols by prepending `__start_` and `__stop_` to the section name. My linker lacked the feature. I implemented the feature, and the bug was fixed. ## TLS variable initialization Problem: A statically-linked "hello world" program crashes after reading a thread-local variable. Investigation: Thread-local variables are very different from other types of variables because there may be more than one instance of the same variable in memory. Each thread has its own copy of thread-local variables.
The `%fs` segment register points to the end of the variable area for the current thread, and the variables are accessed as an offset from `%fs`. Thread-local variables may be initialized (e.g. `thread_local int x = 5;`). The linker gathers all thread-local variables and puts them into the `PT_TLS` segment. At runtime, the contents of the segment are used as an "initialization image" for new threads. When a new thread is created, the image is memcpy'ed to the new thread's thread-local variable area. The initialization image itself is read-only at runtime. It took more than a day to find out that memcpy copies the initialization image to a different place than where the thread-local variables reside. That means, thread-local variables had garbage as initial values, and the program crashed when using them. The problem is that I set a very large value (4096) to the alignment of `PT_TLS` segment. All `PT_LOAD` segments are naturally aligned to the page boundary, so I used the same value for `PT_TLS`, but that was a mistake. When a thread initialization routine sets a value to `%fs`, it first aligns the end of the thread-local variable area address to `PT_TLS` alignment value. So, if you set a large value to `PT_TLS` alignment, `%fs` is set to a wrong place. I fixed `PT_TLS` alignment, and the bug was gone. ## stdio buffering (another issue) I noticed that a dynamically-linked "hello world" program didn't flush its stdout buffer on exit. The cause of the problem was that the executable had more than one DT_NEEDED entry for `libc.so`. DT_NEEDED entries in `.dynamic` section specify a list of shared object file names which need to be linked at runtime. I added one DT_NEEDED entry for each library specified with the `-l` option. The pitfall is, unlike object files, libraries are allowed to appear more than once in a command line, and the linker has to de-duplicate them before processing. Adding more than one DT_NEEDED entry for the same shared object causes mysterious issues like this.
# Copy relocations and symbol aliases environ, _environ and __environ point to the same location in libc.so, so when we create a copy relocation for one of the symbols, we need to do that for all of them. Otherwise, they'll end up pointing to different places, which causes a very mysterious issue. # DT_DEBUG and gdb If you forget to add an entry with type DT_DEBUG to .dynamic, gdb's `info sharedlibrary` command doesn't print out a list of shared libraries loaded to memory. The value of the entry doesn't matter, so it can be just zero. The existence of it is important. # __EH_FRAME_BEGIN__ in libgcc/crtstuff.c ================================================ FILE: docs/coding-guidelines.md ================================================ # Coding Guidelines mold is written in C++20, but as is the case with every C++ project, it has local coding rules. In this document, I'll explain some of them and try to give justifications for why I chose such rules. ## DOs - Always use `i64` (which is a type alias for `int64_t` in mold) for integers unless you have a reason not to. For example, even if you know that a loop counter won't exceed 100, you should stop thinking about it and just use `i64`. Justification: Local variables are usually in CPU registers, so on 64-bit CPUs, there's no performance penalty for choosing `i64` over `i32`. Even if a compiler has to spill register values to the stack, I don't think there's an observable difference between `i32` and `i64`. Therefore, extra 32 bits are essentially free. On 32-bit CPUs, they are not free, but that's OK because we are writing mold for modern computers. mold will still run on 32-bit computers but a bit slowly. By always using `i64`, we can eliminate the need to think about the "right" size for each variable. It also reduces the risk of integer overflow. Exceptions: If you have to allocate a very large number (e.g. millions) of the same object, its size matters. In that case, use a smaller type.
## DON'Ts - Don't use `auto` unless its actual type is obvious in the very narrow context. Currently, we use `auto` only for lambdas. Justification: I think `auto` makes code writing easier but code reading harder, because readers have to make a guess as to what the actual type of `auto` is. If you are already familiar with the existing codebase, you may be able to guess it easily, but that's not always the case. I want to keep the mold codebase friendly to first-time readers. - Don't over-use inheritance. In mold, most classes don't have parents, and even if they do, their class hierarchy is very shallow. Currently its height is just two (i.e. abstract classes and their implementations). Justification: Designing class hierarchies is fun as it feels like taxonomy, but I don't think that always helps in writing code. It looks like a simpler class hierarchy makes its code simpler. ================================================ FILE: docs/design.md ================================================ [This document was written in 2020, and the contents are outdated. Specifically, we no longer believe that object preloading is a good idea. That being said, most of the points in this document still hold even today. Therefore, I'll keep this document as-is.] ## Design and implementation of mold For the rest of this documentation, I'll explain the design and the implementation of mold. If you are only interested in using mold, you don't need to read the below. ### Motivation Here is why I'm writing a new linker: - Even though lld has significantly improved the situation, linking is still one of the slowest steps in a build. It is especially annoying when I changed one line of code and had to wait for a few seconds or even more for a linker to complete. It should be instantaneous. There's a need for a faster linker. - The number of cores on a PC has increased a lot lately, and this trend is expected to continue.
However, the existing linkers can't take advantage of the trend because they don't scale well for more cores. I have a 64-core/128-thread machine, so my goal is to create a linker that uses the CPU nicely. mold should be much faster than other linkers on 4 or 8-core machines too, though. - It looks to me that the designs of the existing linkers are somewhat too similar, and I believe there are a lot of drastically different designs that haven't been explored yet. Developers generally don't care about linkers as long as they work correctly, and they don't even think about creating a new one. So there may be lots of low-hanging fruit in this area. ### Basic design - In order to achieve a `cp`-like performance, the most important thing is to fix the layout of an output file as quickly as possible, so that we can start copying actual data from input object files to an output file as soon as possible. - Copying data from input files to an output file is I/O-bound, so there should be room for doing computationally-intensive tasks while copying data from one file to another. - We should allow the linker to preload object files from disk and parse them in memory before a complete set of input object files is ready. To do so, we need to split the linker into two in such a way that the latter half of the process finishes as quickly as possible by speculatively parsing and preprocessing input files in the first half of the process. - One of the most computationally-intensive stages among linker stages is symbol resolution. To resolve symbols, we basically have to throw all symbol strings into a hash table to match undefined symbols with defined symbols. But this can be done in the preloading stage using [string interning](https://en.wikipedia.org/wiki/String_interning). - Object files may contain a special section called a mergeable string section.
The section contains lots of null-terminated strings, and the linker is expected to gather all mergeable string sections and merge their contents. So, if two object files contain the same string literal, for example, the resulting output will contain a single merged string. This step is computationally intensive, but string merging can be done in the preloading stage using string interning. - Static archives (.a files) contain object files, but the static archive's string table contains only defined symbols of member object files and lacks other types of symbols. That makes static archives unsuitable for speculative parsing. Therefore, the linker should ignore the symbol table of a static archive and directly read static archive members. - If there's a relocation that uses a GOT of a symbol, then we have to create a GOT entry for that symbol. Otherwise, we shouldn't. That means we need to scan all relocation tables to fix the length and the contents of a .got section. This is computationally intensive, but this step is parallelizable. ### Linker Script Linker script is an embedded language for the linker. It is mainly used to control how input sections are mapped to output sections and the layout of the output, but it can also do a lot of tricky stuff. Its features are useful especially for embedded programming, but it's also an awfully underdocumented and complex language. We have to implement a subset of the linker script language anyway, because on Linux, /usr/lib/x86_64-linux-gnu/libc.so is (despite its name) not a shared object file but actually an ASCII file containing linker script code to load the _actual_ libc.so file. But the feature set for this purpose is very limited, and it is okay to implement it in mold. Besides that, we really don't want to implement the linker script language. But at the same time, we want to satisfy the user needs that are currently satisfied with the linker script language. So, what should we do?
Here is my observation: - Linker script allows doing a lot of tricky stuff, such as specifying the exact layout of a file, inserting arbitrary bytes between sections, etc. But most of them can be done with a post-link binary editing tool (such as `objcopy`). - It looks like there are two things that truly cannot be done by a post-link editing tool: (a) mapping input sections to output sections, and (b) applying relocations. From the above observation, I believe we need to provide only the following features instead of the entire linker script language: - A method to specify how input sections are mapped to output sections, and - a method to set addresses to output sections, so that relocations are applied based on desired addresses. I believe everything else can be done with a post-link binary editing tool. ### Details - As we aim to the 1-second goal for Chromium, every millisecond counts. We can't ignore the latency of process exit. If we mmap a lot of files, \_exit(2) is not instantaneous but takes a few hundred milliseconds because the kernel has to clean up a lot of resources. As a workaround, we should organize the linker command as two processes; the first process forks the second process, and the second process does the actual work. As soon as the second process writes a result file to a filesystem, it notifies the first process, and the first process exits. The second process can take time to exit, because it is not an interactive process. - At least on Linux, it looks like the filesystem's performance to allocate new blocks to a new file is the limiting factor when creating a new large file and filling its contents using mmap. If you already have a large file in the buffer cache, writing to it is much faster than creating a new fresh file and writing to it. Based on this observation, mold overwrites to an existing executable file if exists. My quick benchmark showed that I could save 300 milliseconds when creating a 2 GiB output file. 
Linux doesn't allow to open an executable for writing if it is running (you'll get a "text busy" error if you attempt). mold falls back to the usual way if it fails to open an output file. - The output from the linker should be deterministic for the sake of [build reproducibility](https://en.wikipedia.org/wiki/Reproducible_builds) and ease of debugging. This might add a little bit of overhead to the linker, but that shouldn't be too much. - A .build-id, a unique ID embedded to an output file, is usually computed by applying a cryptographic hash function (e.g. SHA-1) to an output file. This is a slow step, but we can speed it up by splitting a file into small chunks, computing SHA-1 for each chunk, and then computing SHA-1 of the concatenated SHA-1 hashes (i.e. constructing a [Merkle Tree](https://en.wikipedia.org/wiki/Merkle_tree) of height 2). Modern x86 processors have purpose-built instructions for SHA-1 and can compute SHA-1 pretty quickly at about 2 GiB/s. Using 16 cores, a build-id for a 2 GiB executable can be computed in 60 to 70 milliseconds. - BFD, gold, and lld support section garbage collection. That is, a linker runs a mark-sweep garbage collection on an input graph, where sections are vertices and relocations are edges, to discard all sections that are not reachable from the entry point symbol (i.e. `_start`) or a few other root sections. In mold, we are using multiple threads to mark sections concurrently. - Similarly, BFD, gold an lld support Identical Comdat Folding (ICF) as yet another size optimization. ICF merges two or more read-only sections that happen to have the same contents and relocations. To do that, we have to find isomorphic subgraphs from larger graphs. I implemented a new algorithm for mold, which is 5x faster than lld to do ICF for Chromium (from 5 seconds to 1 second). - [Intel Threading Building Blocks](https://github.com/oneapi-src/oneTBB) (TBB) is a good library for parallel execution and has several concurrent containers. 
We are particularly interested in using `parallel_for_each` and `concurrent_hash_map`. - TBB provides `tbbmalloc` which works better for multi-threaded applications than the glib'c malloc, but it looks like [jemalloc](https://github.com/jemalloc/jemalloc) and [mimalloc](https://github.com/microsoft/mimalloc) are a little bit more scalable than `tbbmalloc`. ### Size of the problem When linking Chrome, a linker reads 3,430,966,844 bytes of data in total. The data contains the following items: | Data item | Number | ------------------------ | ------ | Object files | 30,723 | Public undefined symbols | 1,428,149 | Mergeable strings | 1,579,996 | Comdat groups | 9,914,510 | Regular sections¹ | 10,345,314 | Public defined symbols | 10,512,135 | Symbols | 23,953,607 | Sections | 27,543,225 | Relocations against SHF_ALLOC sections | 39,496,375 | Relocations | 62,024,719 ¹ Sections that have to be copied from input object files to an output file. Sections that contain relocations or symbols are for example excluded. ### Internals In this section, I'll explain the internals of mold linker. #### A brief history of Unix and the Unix linker Conceptually, what a linker does is pretty simple. A compiler compiles a fragment of a program (a single source file) into a fragment of machine code and data (an object file, which typically has the .o extension), and a linker stitches them together into a single executable or a shared library image. In reality, modern linkers for Unix-like systems are much more complicated than the naive understanding because they have gradually gained one feature at a time over the 50 years history of Unix, and they are now something like a bag of lots of miscellaneous features in which none of the features is more important than the others. It is very easy to miss the forest for the trees, since for those who don't know the details of the Unix linker, it is not clear which feature is essential and which is not. 
That being said, one thing is clear that at any point of Unix history, a Unix linker has a coherent feature set for the Unix of that age. So, let me entangle the history to see how the operating system, runtime, and linker have gained features that we see today. That should give you an idea of why a particular feature has been added to a linker in the first place. 1. Original Unix didn't support shared libraries, and a program was always loaded to a fixed address. An executable was something like a memory dump that was just loaded to a particular address by the kernel. After loading, the kernel started executing the program by setting the instruction pointer to a particular address. The most essential feature for any linker is relocation processing. The original Unix linker of course supported that. Let me explain what that is. Individual object files are inevitably incomplete as a program, because when a compiler created them, it only see a part of an entire program. For example, if an object file contains a function call that refers to another object file, the `call` instruction in the object cannot be complete, as the compiler has no idea as to what is the called function's address. To deal with this, the compiler emits a placeholder value (typically just zero) instead of a real address and leaves metadata in an object file saying "fix offset X of this file with an address of Y". That metadata is called "relocation". Relocations are typically processed by the linker. It is easy for a linker to apply relocations for the original Unix because a program is always loaded to a fixed address. It exactly knows the addresses of all functions and data when linking a program. Static library support, which is still an important feature of Unix linker, also dates back to this early period of Unix history. To understand what it is, imagine that you are trying to compile a program for the early Unix. 
You don't want to waste time to compile libc functions every time you compile your program (the computers of the era were incredibly slow), so you have already placed each libc function into a separate source file and compiled them individually. That means you have object files for each libc function, e.g., printf.o, scanf.o, atoi.o, write.o, etc. Given this configuration, all you have to do to link your program against libc functions is to pick up the right set of libc object files and give them to the linker along with the object files of your program. But, keeping the linker command line in sync with the libc functions you are using in your program is bothersome. You can be conservative; you can specify all libc object files to the command line, but that leads to program bloat because the linker unconditionally link all object files given to it no matter whether they are used or not. So, a new feature was added to the linker to fix the problem. That is the static library, which is also called the archive file. An archive file is just a bundle of object files, just like zip file but in an uncompressed form. An archive file typically has the .a file extension and named after its contents. For example, the archive file containing all libc objects is named `libc.a`. If you pass an archive file along with other object files to the linker, the linker pulls out an object file from the archive _only when_ it is referenced by other object files. In other words, unlike object files directly given to a linker, object files wrapped in an archive are not linked to the output by default. An archive works as a supplement to complete your program. Even today, you can still find a libc archive file. Run `ar t /usr/lib/x86_64-linux-gnu/libc.a` on Linux should give you a list of object files in the libc archive. 2. In the '80s, Sun Microsystems, a leading commercial Unix vendor at the time, added shared library support to their Unix variant, SunOS. (This section is incomplete.) 
### Concurrency strategy In this section, I'll explain the high-level concurrency strategy of mold. In most places, mold adopts data parallelism. That is, we have a huge number of pieces of data of the same kind, and we process each of them individually using parallel for-loop. For example, after identifying the exact set of input object files, we need to scan all relocation tables to determine the sizes of .got and .plt sections. We do that using a parallel for-loop. The granularity of parallel processing in this case is the relocation table. Data parallelism is very efficient and scalable because there's no need for threads to communicate with each other while working on each element of data. In addition to that, data parallelism is easy to understand, as it is just a for-loop in which multiple iterations may be executed in parallel. We don't use high-level communication or synchronization mechanisms such as channels, futures, promises, latches or something like that in mold. In some cases, we need to share a little bit of data between threads while executing a parallel for-loop. For example, the loop to scan relocations turns on "requires GOT" or "requires PLT" flags in a symbol. Symbol is a shared resource, and writing to them from multiple threads without synchronization is unsafe. To deal with it, we made the flag an atomic variable. The other common pattern you can find in mold which is build on top of the parallel for-loop is the map-reduce pattern. That is, we run a parallel for-loop on a large data set to produce a small data set and process the small data set with a single thread. Let me take a build-id computation as an example. Build-id is typically computed by applying a cryptographic hash function such as SHA-1 on a linker's output file. To compute it, we first consider an output as a sequence of 1 MiB blocks and compute a SHA-1 hash for each block in parallel. 
Then, we concatenate the SHA-1 hashes and compute a SHA-1 hash on the hashes to get a final build-id. Finally, we use concurrent hashmap at a few places in mold. Concurrent hashmap is a hashmap to which multiple threads can safely insert items in parallel. We use it in the symbol resolution stage, for example. To resolve symbols, we basically have to throw in all defined symbols into a hash table, so that we can find a matching defined symbol for an undefined symbol by name. We do the hash table insertion from a parallel for-loop which iterates over a list of input files. Overall, even though mold is highly scalable, it succeeded to avoid complexties you often find in complex parallel programs. From high level, mold just serially executes the linker's internal passes one by one. Each pass is parallelized using parallel for-loops. ### Rejected ideas In this section, I'll explain the alternative designs I currently do not plan to implement and why I turned them down. - Placing variable-length sections at end of an output file and start copying file contents before fixing the output file layout Idea: Fixing the layout of regular sections seems easy, and if we place them at beginning of a file, we can start copying their contents from their input files to an output file. While copying file contents, we can compute the sizes of variable-length sections such as .got or .plt and place them at end of the file. Reason for rejection: I did not choose this design because I doubt if it could actually shorten link time and I think I don't need it anyway. The linker has to de-duplicate comdat sections (i.e. inline functions that are included in multiple object files), so we cannot compute the layout of regular sections until we resolve all symbols and de-duplicate comdats. That takes a few hundred milliseconds. After that, we can compute the sizes of variable-length sections in less than 100 milliseconds. 
It's quite fast, so it doesn't seem to make much sense to proceed without fixing the final file layout. The other reason to reject this idea is because there's good a chance for this idea to have a negative impact on linker's overall performance. If we copy file contents before fixing the layout, we can't apply relocations to them while copying because symbol addresses are not available yet. If we fix the file layout first, we can apply relocations while copying, which is effectively zero-cost due to a very good data locality. On the other hand, if we apply relocations long after we copy file contents, it's pretty expensive because section contents are very likely to have been evicted from CPU cache. - Incremental linking Idea: Incremental linking is a technique to patch a previous linker's output file so that only functions or data that are updated from the previous build are written to it. It is expected to significantly reduce the amount of data copied from input files to an output file and thus speed up linking. GNU BFD and gold linkers support it. Reason for rejection: I turned it down because it (1) is complicated, (2) doesn't seem to speed it up that much and (3) has several practical issues. Let me explain each of them. First, incremental linking for real C/C++ programs is not as easy as one might think. Let me take malloc as an example. malloc is usually defined by libc, but you can implement it in your program, and if that's the case, the symbol `malloc` will be resolved to your function instead of the one in libc. If you include a library that defines malloc (such as libjemalloc or libtbbmallc) before libc, their malloc will override libc's malloc. Assume that you are using a nonstandard malloc. What if you remove your malloc from your code, or remove `-ljemalloc` from your Makefile? The linker has to include a malloc from libc, which may include more object files to satisfy its dependencies. 
Such code change can affect the entire program rather than just replacing one function. The same is true for adding malloc to your program. Making a local change doesn't necessarily result in a local change in the binary level. It can easily have cascading effects. Some ELF fancy features make incremental linking even harder to implement. Take the weak symbol as an example. If you define `atoi` as a weak symbol in your program, and if you are not using `atoi` at all in your program, that symbol will be resolved to address 0. But if you start using some libc function that indirectly calls `atoi`, then `atoi` will be included in your program, and your weak symbol will be resolved to that function. I don't know how to efficiently fix up a binary for this case. This is a hard problem, so existing linkers don't try too hard to solve it. For example, IIRC, gold falls back to full link if any function is removed from a previous build. If you want to not annoy users in the fallback case, you need to make full link fast anyway. Second, incremental linking itself has an overhead. It has to detect updated files, patch an existing output file and write additional data to an output file for future incremental linking. GNU gold, for instance, takes almost 30 seconds on my machine to do a null incremental link (i.e. no object files are updated from a previous build) for chrome. It's just too slow. Third, there are other practical issues in incremental linking. It's not reproducible, so your binary isn't going to be the same as other binaries even if you are compiling the same source tree using the same compiler toolchain. Or, it is complex and there might be a bug in it. If something doesn't work correctly, "remove --incremental from your Makefile and try again" could be a piece of advice, but that isn't ideal. So, all in all, incremental linking is tricky. 
I wanted to make full link as fast as possible, so that we don't have to think about how to work around the slowness of full link. - Defining a completely new file format and use it Idea: Sometimes, the ELF file format itself seems to be a limiting factor in improving the linker's performance. We might be able to make a far better one if we create a new file format. Reason for rejection: I rejected the idea because it apparently has a practical issue (backward compatibility issue) and also doesn't seem to improve the performance of linkers that much. As clearly demonstrated by mold, we can create a fast linker for ELF. I believe ELF isn't that bad, after all. The semantics of the existing Unix linkers, such as the name resolution algorithm or the linker script, have slowed the linkers down, but that's not a problem of the file format itself. - Watching object files using inotify(2) Idea: When mold is running as a daemon for preloading, use inotify(2) to watch file system updates so that it can reload files as soon as they are updated. Reason for rejection: Just like the maximum number of files you can simultaneously open, the maximum number of files you can watch using inotify(2) isn't that large. Maybe just a single instance of mold is fine with inotify(2), but it may fail if you run multiple of it. The other reason for not doing it is because mold is quite fast without it anyway. Invoking stat(2) on each file for file update check takes less than 100 milliseconds for Chrome, and if most of the input files are not updated, parsing updated files takes almost no time. ================================================ FILE: docs/execstack.md ================================================ This page explains the following warning message and how to fix it. mold emits this message when it sees an object file that may not be compatible with mold. ``` mold: warning: foo.o: this file may cause a segmentation fault because it requires an executable stack. 
See https://github.com/rui314/mold/tree/main/docs/execstack.md for more info. ``` # Background On modern computers, the stack area (to which local variables are stored) cannot contain executable code. If the control reaches the stack area, the CPU refuses to execute any code there and the program is usually terminated due to segmentation fault. This is a security measure. The stack area used to be executable (old CPUs generally execute any code as long as it is in a readable memory region), but that provided an easy attack vector to a malicious user. They wrote executable code to the stack area using some buffer overflow bug and jumped there to run arbitrary code in a remote server process. To prevent this type of attack, the stack area is no longer executable since the early 2000s. On Linux, the stack's executable-ness is controlled by a bit in an executable, and the loader respects that bit. The bit is set by the linker. GCC had (and still has) a feature that depends on the executable stack, so they invented a way to tell the linker to mark the stack executable. Specifically, if an object file contains a `.note.GNU-stack` section with the `SHF_EXECSTACK` bit, GNU linker silently makes the stack of an output file executable. But the GNU linker's behavior is dangerous. If you accidentally link an object file that has that marker section, the entire stack area silently becomes executable, disabling the security mechanism. Therefore, mold simply ignores that marker section. If you are using mold, you need to explicitly pass `-z execstack` to the linker to make the stack executable. # What caused this issue? You are likely to use GCC's [Nested Functions](https://gcc.gnu.org/onlinedocs/gcc/Nested-Functions.html) feature which still depends on the executable stack. # How to fix it? If you know what you are doing, pass `-z execstack` to mold. Beware that this will significantly weaken your program's security. 
If you don't want to pass `-z execstack`, rewrite your code so that your code does not depend on the executable stack. ================================================ FILE: docs/glossary.md ================================================ The very concept of linking is simple: a compiler compiles a piece of source code into an object file (a file containing machine code), and a linker combines object files into a single executable or a shared library file. However, the actual implementation of the linker for modern systems is much more complicated because hardware, operating system, compiler and linker all have many more features. In this file, I'll explain random topics in the glossary format that you need to understand to read mold code. ## DSO A .so file. Short for Dynamic Shared Object. Often called as a shared library, a dynamic libray or a shared object as well. An DSO contains common functions and data that are used by multiple executables and/or other DSOs. At runtime, a DSO is loaded to a contiguous region in the virtual address. ## Object file A .o file. An object file contains machine code and data, but it cannot be executed because it's not self-contained. For example, if you compile a C source file containing a call of `printf`, the actual function code of `printf` is not included in the resulting object file. You include `stdio.h`, but that teaches the compiler only about `printf`'s type, and the compiler still don't know what `printf` actually does. Therefore, it cannot emit code for `printf`. You need to link an object file with other object file or a shared library to make it exectuable. ## Virtual address space A pointer has a value like 0x803020 which is an address of the pointee. But it doesn't mean that the pointee resides at the physical memory address 0x803020 on the computer. Modern CPUs contains so-called Mmeory Management Unit (MMU), and all access to the memory are first translated by MMU to the physical address. 
The address before translation is called the "virtual address". Unless you are doing the kernel programming, all addresses you handle are virtual addresses. The OS kernel controls the MMU so that each process owns the entire virtual address space. So, even if two process uses the same virtual address, they don't conflict. They are mapped to different physical addresses. The existence of MMU has several implications to the linker. First, we can link the main executable to a specific address. On process startup, there's no code or data in the virtual address space, so the mapping of the main executable always succeed. However, it's not true to DSOs because they are loaded after the main executable and possibly other DSOs. Therefore, shared libraries must be linked in a way that they can be loaded to any address in the virtual address space. ## Relocation A piece of information for the linker as to how to link object files or a dynamic objects. Object files can refer functions or data in other object files. For example, if you compile a function which calls a non-local function `foo`, the resulting code contains something like this: ``` 26: e8 00 00 00 00 callq 2b 27: R_X86_64_PLT32 foo-0x4 ``` The above `callq` is the instruction to call a function at the machine code level. It's opcode is `0xe8` in x86-64, so the instruction begins with `0xe8`. The following four bytes are displacement; that is, the address of the branch target relative to the end of this `callq` instruction. Notice that the displacement is 0. The compiler couldn't fill the displacement because it has no idea as to where `foo` will be at runtime. So, the compiler write 0 as a placeholder and instead write a relocation `R_X86_64_PLT32` with `foo` as its associated symbol. The linker reads this relocation, computes the offsets between this call instruction and function `foo` and overwrite the placeholder value 0 with an actual displacement. There are many different types of relocations. 
For example, if you want to fix up not with a displacement but with an absolute address of a symbol, you need to use `R_X86_64_64` instead.
If an object is anonymous (such the one for a string literal), a compiler generated a unique symbol, which often starts with `.` to avoid conflict with user-defined symbols. For C++, symbol name is a complex "mangled" name. We need to mangle identifiers because a simple name such as `foo` cannot be uniquely identify a function or a data in C++, because for example `foo` may be in a namespace or defined as a static member in some class. If `foo` is an overloaded function, we need to distinguish different `foo`s by its type. Therefore, C++ compiler mangles an identifier by appending nmaepsace names, type information and such so that different things get different names. For example, a function `int foo(int)` in a namespace `bar` is mangled as `_ZN3bar3fooEi`. A symbol can be either defined or undefined. A defined symbol points to some location in a file which may contain the function's machine code or the variable's initial value. An undefined symbol does not point to anywhere. It needs to be merged with a defined symbol with the same name at link-time. This merging process is called "name resolution". For example, if your program is using `printf`, it usually contains `printf` as an undefined symbol. You need to link it with `libc.a` or `libc.so`, which contain a defined symbol of `printf`, to make a complete program. ================================================ FILE: docs/memory-sanitizer.md ================================================ # Instrumenting mold with MemorySanitizer ## Introduction Per : > If you want MemorySanitizer to work properly and not produce any false > positives, you must ensure that all the code in your program and in > libraries it uses is instrumented (i.e. built with `-fsanitize=memory`). > In particular, you would need to link against an MSan-instrumented C++ > standard library. We recommend to use [libc++](https://libcxx.llvm.org/) > for that purpose. 
## Building instrumented libc++ Build an MSan-instrumented libc++ from source: ```sh cd ~ git clone https://github.com/llvm/llvm-project cd llvm-project cmake -S ./runtimes -B build-libcxx -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" \ -DCMAKE_C_COMPILER=clang \ -DCMAKE_CXX_COMPILER=clang++ \ -DLLVM_USE_SANITIZER=MemoryWithOrigins cmake --build build-libcxx -- cxx cxxabi export LIBCXX="$HOME/llvm-project/build-libcxx" # for subsequent build steps ``` Upon success, `./build-libcxx/{include,lib}` will contain the resulting headers and shared libraries. ## Linking mold against instrumented libc++ Use the `MOLD_USE_MSAN` and `MOLD_STDLIB_PREFIX` cmake variables to link mold against the instrumented build of libc++: ```sh cd ~ git clone https://github.com/rui314/mold.git cd mold cmake -B build -G Ninja \ -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_C_COMPILER=clang \ -DCMAKE_CXX_COMPILER=clang++ \ -DMOLD_USE_MSAN=ON \ -DMOLD_STDLIB_PREFIX="$LIBCXX" \ -DMOLD_USE_MIMALLOC=OFF \ -DMOLD_USE_SYSTEM_TBB=ON cmake --build build ``` Most of mold's tests (except those for `-flto`) should work at this point. Run them like normal: ```sh ctest --test-dir build ``` Any resulting MemorySanitizer errors should be visible in `./build/Temporary/Testing/LastTest.log`. ## Building instrumented LTO plugin (experimental) Exercising `-flto` with MemorySanitizer (and without false positives) requires instrumenting the transitive dependencies of `lto-unix.cc`: - `libiberty.a` - `liblto_plugin.so` - `LLVMgold.so` > [!NOTE] > This is more involved and time-consuming than building only libc++ from > source. The cost-benefit ratio of this additional instrumentation and > test coverage may be unfavorable in many cases. > [!IMPORTANT] > The following steps are experimental and unlikely to work exactly as-is > under CI runners and individual developer environments. Consider the > following a starting point rather than a complete HOWTO. 
Build an MSan-instrumented GNU libiberty: ```sh cd ~ git clone git@github.com:gcc-mirror/gcc.git cd gcc/libiberty export CC=clang export CFLAGS="-g -Og -fsanitize=memory -fsanitize-memory-track-origins" export LDFLAGS="-fsanitize=memory" ./configure make -j$(nproc) unset CC CFLAGS LDFLAGS ``` Install the resulting `./libiberty.a` into the build toolchain being tested with mold. If the system image is ephemeral or disposable (e.g. a short-lived VM or container), a quick-and-dirty install could look like: ```sh sudo cp /usr/lib/libiberty.a{,.bak} sudo cp ./libiberty.a /usr/lib/ ``` Build an MSan-instrumented GCC LTO plugin library: ```sh cd ~/gcc/lto-plugin # use already-cloned repo from earlier step export CC=clang export CFLAGS="-g -Og -fsanitize=memory -fsanitize-memory-track-origins" export LDFLAGS="-fsanitize=memory" ./configure --with-libiberty=/usr/lib make -j$(nproc) unset CC CFLAGS LDFLAGS ``` Install the resulting `liblto_plugin.so`. With the same caveats discussed above, a simple install into an ephemeral environment could look like: ```sh export GCC_VERSION="$(gcc -dumpversion)" sudo cp /usr/lib/gcc/x86_64-pc-linux-gnu/$GCC_VERSION/liblto_plugin.so{,.bak} sudo cp .libs/liblto_plugin.so /usr/lib/gcc/x86_64-pc-linux-gnu/$GCC_VERSION/ ``` Build an MSan-instrumented LLVM LTO plugin library: ```sh cd ~/llvm-project # use already-cloned repo from earlier step CF="-nostdinc++ -isystem $LIBCXX/include/c++/v1" INCDIR="$(find /usr -name plugin-api.h -type f | head -1 | xargs dirname)" function configure { cmake -S ./llvm -B build-plugin -G Ninja \ -DLLVM_ENABLE_PROJECTS=clang \ -DLLVM_TARGETS_TO_BUILD=X86 \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_C_COMPILER=clang \ -DCMAKE_CXX_COMPILER=clang++ \ -DCMAKE_C_FLAGS="$CF" \ -DCMAKE_CXX_FLAGS="$CF" \ -DCMAKE_EXE_LINKER_FLAGS="$1" \ -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ -DLLVM_BINUTILS_INCDIR="$INCDIR" \ -DLLVM_USE_SANITIZER=MemoryWithOrigins } # workaround for linker issues: configure twice with different LDFLAGS configure 
"-nostdlib++ -L $LIBCXX/lib -Wl,--rpath=$LIBCXX/lib" configure "-nostdlib++ -L $LIBCXX/lib -lc++ -Wl,--rpath=$LIBCXX/lib" cmake --build build-plugin -- LLVMgold.so ``` Install the resulting `LLVMgold.so`. With the same caveats discussed above, a simple install into an ephemeral environment could look like: ```sh sudo cp /usr/lib/LLVMgold.so{,.bak} sudo cp ./build-plugin/lib/LLVMgold.so /usr/lib/ ``` mold's tests for `-flto` should now (mostly) work, though there do seem to be issues around `gcc` inferring `-flto` when not explicitly specified. One workaround is to use `clang` as `TEST_CC`: ```sh cd ~/mold # use already-cloned and built repo from earlier step TEST_CC=clang TEST_CXX=clang++ ctest --test-dir build ``` ## References - - ================================================ FILE: docs/mold.1 ================================================ .\" generated with Ronn-NG/v0.9.1 .\" http://github.com/apjanke/ronn-ng/tree/0.9.1 .TH "MOLD" "1" "December 2025" "" .SH "NAME" \fBmold\fR \- a modern linker .SH "SYNOPSIS" \fBmold\fR [\fIoption\fR\|\.\|\.\|\.] \fIfile\fR\|\.\|\.\|\. .SH "DESCRIPTION" \fBmold\fR is a faster drop\-in replacement for the default GNU ld(1)\. .SS "How to use" See \fIhttps://github\.com/rui314/mold#how\-to\-use\fR\. .SS "Compatibility" \fBmold\fR is designed to be a drop\-in replacement for the GNU linkers for linking user\-land programs\. If your user\-land program cannot be built due to missing command\-line options, please file a bug at \fIhttps://github\.com/rui314/mold/issues\fR\. .P \fBmold\fR supports a very limited set of linker script features, which is just sufficient to read \fB/usr/lib/x86_64\-linux\-gnu/libc\.so\fR on Linux systems (on Linux, that file is contrary to its name not a shared library but an ASCII linker script that loads a real \fBlibc\.so\fR file\.) .P Beyond that, we have no plan to support any additional linker script features\. 
The linker script is an ad\-hoc, over\-designed, complex language which we believe needs to be replaced by a simpler mechanism\. We have a plan to add a replacement for the linker script to \fBmold\fR instead\. .SS "Archive symbol resolution" Traditionally, Unix linkers are sensitive to the order in which input files appear on the command line\. They process input files from the first (leftmost) file to the last (rightmost) file one\-by\-one\. While reading input files, they maintain sets of defined and undefined symbols\. When visiting an archive file (\fB\.a\fR files), they pull out object files to resolve as many undefined symbols as possible and move on to the next input file\. Object files that weren't pulled out will never have a chance for a second look\. .P Due to this behavior, you usually have to add archive files at the end of a command line, so that when a linker reaches archive files, it knows what symbols remain as undefined\. .P If you put archive files at the beginning of a command line, a linker doesn't have any undefined symbols, and thus no object files will be pulled out from archives\. You can change the processing order by using the \fB\-\-start\-group\fR and \fB\-\-end\-group\fR options, though they make a linker slower\. .P \fBmold\fR, as well as the LLVM lld(1) linker, takes a different approach\. They remember which symbols can be resolved from archive files instead of forgetting them after processing each archive\. Therefore, \fBmold\fR and lld(1) can "go back" in a command line to pull out object files from archives if they are needed to resolve remaining undefined symbols\. They are not sensitive to the input file order\. .P \fB\-\-start\-group\fR and \fB\-\-end\-group\fR are still accepted by \fBmold\fR and lld(1) for compatibility with traditional linkers, but they are silently ignored\. 
.SS "Dynamic symbol resolution" Some Unix linker features are difficult to understand without comprehending the semantics of dynamic symbol resolution\. Therefore, even though it's not specific to \fBmold\fR, we'll explain it here\. .P We use "ELF module" or just "module" as a collective term to refer to an executable or a shared library file in the ELF format\. .P An ELF module may have lists of imported symbols and exported symbols, as well as a list of shared library names from which imported symbols should be imported\. The point is that imported symbols are not bound to any specific shared library until runtime\. .P Here is how the Unix dynamic linker resolves dynamic symbols\. Upon the start of an ELF program, the dynamic linker constructs a list of ELF modules which, as a whole, constitute a complete program\. The executable file is always at the beginning of the list followed by its dependent shared libraries\. An imported symbol is searched for from the beginning of the list to the end\. If two or more modules define the same symbol, the one that appears first in the list takes precedence over the others\. .P These Unix semantics are contrary to those of systems such as Windows that have a two\-level namespace for dynamic symbols\. On Windows, for example, dynamic symbols are represented as a tuple of (\fBsymbol\-name\fR, \fBshared\-library\-name\fR), so that each dynamic symbol is guaranteed to be resolved from some specific library\. .P Typically, an ELF module that exports a symbol also imports the same symbol\. Such a symbol is usually resolved to itself, but that's not the case if a module that appears before it in the symbol search list provides another definition of the same symbol\. .P Let's take \fBmalloc\fR as an example\. Assume that you define your version of \fBmalloc\fR in your main executable file\.
Then, all \fBmalloc\fR calls from any module are resolved to your function instead of the one in libc, because the executable is always at the beginning of the dynamic symbol search list\. Note that even \fBmalloc\fR calls within libc are resolved to your definition since libc exports and imports \fBmalloc\fR\. Therefore, by defining \fBmalloc\fR yourself, you can override a library function, and the \fBmalloc\fR in libc becomes dead code\. .P These Unix semantics are tricky and sometimes considered harmful\. For example, assume that you accidentally define \fBatoi\fR as a global function in your executable that behaves completely differently from the one in the C standard\. Then, all \fBatoi\fR function calls from any module (even function calls within libc) are redirected to your function instead of the one in libc, which will very likely cause a problem\. That is a somewhat surprising consequence for an accidental name conflict\. On the other hand, these semantics are sometimes useful because they allow users to override library functions without rebuilding modules containing them\. .P Whether good or bad, you should keep these semantics in mind to understand Unix linkers' behaviors\. .SS "Build reproducibility" \fBmold\fR's output is deterministic\. That is, if you pass the same object files and the same command\-line options to the same version of \fBmold\fR, it is guaranteed that \fBmold\fR produces bit\-for\-bit identical output\. The linker's internal randomness, such as the timing of thread scheduling or iteration orders of hash tables, doesn't affect the output\. .P \fBmold\fR does not have any host\-specific default settings\. This is contrary to the GNU linkers, for which some configurable values, such as system\-dependent library search paths, are hard\-coded\. \fBmold\fR depends only on its command\-line arguments\.
.SH "OPTION NOTATIONS" Multi\-letter long options may be preceded by either a single dash or double dashes, except for those starting with the letter "o"\. For historical reasons, long options beginning with "o" must be preceded by double dashes\. .P For example, you can spell \fB\-\-as\-needed\fR as \fB\-as\-needed\fR, but \fB\-\-omagic\fR must not be spelled as \fB\-omagic\fR\. \fB\-omagic\fR will be interpreted not as \fB\-\-omagic\fR but as \fB\-o magic\fR\. .SH "MOLD\-SPECIFIC OPTIONS" .TP \fB\-\-chroot\fR=\fIdir\fR Set \fIdir\fR as the root directory\. .TP \fB\-\-color\-diagnostics\fR=[ \fIauto\fR | \fIalways\fR | \fInever\fR ] Show diagnostic messages in color using ANSI escape sequences\. \fBauto\fR means that \fBmold\fR prints out messages in color only if the standard output is connected to a TTY\. Default is \fBauto\fR\. .TP \fB\-\-color\-diagnostics\fR Synonym for \fB\-\-color\-diagnostics=auto\fR\. .TP \fB\-\-no\-color\-diagnostics\fR Synonym for \fB\-\-color\-diagnostics=never\fR\. .TP \fB\-\-detach\fR, \fB\-\-no\-detach\fR Permit or do not permit mold to create a debug info file in the background\. .TP \fB\-\-fork\fR, \fB\-\-no\-fork\fR Spawn a child process and let it do the actual linking\. When linking a large program, the OS kernel can take a few hundred milliseconds to terminate a \fBmold\fR process\. \fB\-\-fork\fR hides that latency\. By default, it does fork\. .IP Note that \fB\-\-fork\fR also hides the resource usage statistics reported by time(1), since it doesn't call waitpid(2) on the child process\. If you need those statistics, pass \fB\-\-no\-fork\fR\. .TP \fB\-\-perf\fR Print performance statistics\. .TP \fB\-\-print\-dependencies\fR Print out dependency information for input files\. .IP Each line of the output for this option shows which file depends on which file to use a specific symbol\.
This option is useful for debugging why some object file in a static archive got linked or why some shared library is kept in an output file's dependency list even with \fB\-\-as\-needed\fR\. .TP \fB\-\-relocatable\-merge\-sections\fR By default, \fBmold\fR doesn't merge input sections by name when merging input object files into a single output object file for \fB\-r\fR\. For example, \fB\.text\.foo\fR and \fB\.text\.bar\fR aren't merged for \fB\-r\fR even though they are merged into \fB\.text\fR based on the default section merging rules\. .IP This option changes the behavior so that \fBmold\fR merges input sections by name by the default section merging rules\. .TP \fB\-\-repro\fR Archive input files, as well as a text file containing command line options, in a tar file so that you can run \fBmold\fR with the exact same inputs again\. This is useful for reporting a bug with a reproducer\. The output filename is \fBpath/to/output\.tar\fR, where \fBpath/to/output\fR is an output filename specified by \fB\-o\fR\. .TP \fB\-\-reverse\-sections\fR Reverse the order of input sections before assigning them the offsets in the output file\. .IP This option is useful for finding bugs that depend on the initialization order of global objects\. In C++, constructors of global objects in a single source file are guaranteed to be executed in the source order, but there's no such guarantee across compilation units\. Usually, constructors are executed in the order given to the linker, but depending on it is a mistake\. .IP By reversing the order of input sections using \fB\-\-reverse\-sections\fR, you can easily test that your program works in the reversed initialization order\. .TP \fB\-\-run\fR \fIcommand\fR \fIarg\fR\|\.\|\.\|\. Run \fIcommand\fR with \fBmold\fR as \fB/usr/bin/ld\fR\. 
Specifically, \fBmold\fR runs a given command with the \fBLD_PRELOAD\fR environment set to intercept exec(3) family functions and replaces \fBargv[0]\fR with itself if it is \fBld\fR, \fBld\.gold\fR, or \fBld\.lld\fR\. .TP \fB\-\-separate\-debug\-file\fR, \fB\-\-separate\-debug\-file\fR=\fIfile\fR Bundle debug info sections into a separate file instead of embedding them in an output executable or a shared library\. mold creates a debug info file in the background by default, so that you can start running your executable as soon as possible\. .IP By default, the debug info file is created in the same directory as is the output file, with the \fB\.dbg\fR file extension\. That filename is embedded into the output file so that \fBgdb\fR can automatically find the debug info file for the output file\. For more info about gdb features related to separate debug files, see \fIhttps://sourceware\.org/gdb/current/onlinedocs/gdb\.html/Separate\-Debug\-Files\.html\fR\. .IP mold holds a file lock with flock(2) while creating a debug info file in the background\. .IP If you don't want to create a debug info file in the background, pass the \fB\-\-no\-detach\fR option\. .TP \fB\-\-shuffle\-sections\fR, \fB\-\-shuffle\-sections\fR=\fInumber\fR Randomize the output by shuffling the order of input sections before assigning them the offsets in the output file\. If a \fInumber\fR is given, it's used as a seed for the random number generator, so that the linker produces the same output for the same seed\. If no seed is given, a random number is used as a seed\. .IP This option is useful for benchmarking\. Modern CPUs are sensitive to a program's memory layout\. A seemingly benign change in program layout, such as a small size increase of a function in the middle of a program, can affect the program's performance\. 
Therefore, even if you write new code and get a good benchmark result, it is hard to say whether the new code improves the program's performance; it is possible that the new memory layout happens to perform better\. .IP By running a benchmark multiple times with randomized memory layouts using \fB\-\-shuffle\-sections\fR, you can isolate your program's real performance number from the randomness caused by memory layout changes\. .TP \fB\-\-spare\-program\-headers\fR=\fInumber\fR Append the given number of \fBPT_NULL\fR entries to the end of the program header, so that post\-link processing tools can easily add new segments by overwriting the null entries\. .IP Note that ELF requires all \fBPT_LOAD\fR segments to be sorted by \fBp_vaddr\fR\. Therefore, if you add a new LOAD segment, you may need to sort the entire program header\. .TP \fB\-\-stats\fR Print input statistics\. .TP \fB\-\-thread\-count\fR=\fIcount\fR Use \fIcount\fR number of threads\. .TP \fB\-\-threads\fR, \fB\-\-no\-threads\fR Use multiple threads\. By default, \fBmold\fR uses as many threads as the number of cores or 32, whichever is smaller\. The reason it is capped at 32 is because \fBmold\fR doesn't scale well beyond that point\. To use only one thread, pass \fB\-\-no\-threads\fR or \fB\-\-thread\-count=1\fR\. .TP \fB\-\-quick\-exit\fR, \fB\-\-no\-quick\-exit\fR Use or do not use \fBquick_exit\fR to exit\. .TP \fB\-\-zero\-to\-bss\fR Convert all\-zero data sections into BSS\. .IP When this option is enabled, \fBmold\fR scans input data sections that are not of type \fBSHT_NOBITS\fR and checks whether their contents consist solely of zero bytes\. Such sections are then converted into BSS (\fBSHT_NOBITS\fR) sections\. This reduces the size of the output file, since BSS sections occupy no space in the file image\. .IP This behavior is especially useful for user\-defined sections created with \fB__attribute__((section("\.sectname")))\fR that contain uninitialized global variables\. 
GCC and Clang do not automatically mark such sections as BSS even if their contents are entirely zero, and instead emit them as regular data sections\. .IP For example, consider \fB__attribute__((section("\.sectname"))) int vec[256];\fR\. .IP By default, this results in a \fB\.sectname\fR section of type \fBSHT_PROGBITS\fR filled with zeros\. With \fB\-\-zero\-to\-bss\fR, the linker will recognize it as empty data and convert it to a \fBSHT_NOBITS\fR section, reducing the output file size without changing runtime semantics\. .TP \fB\-z rewrite\-endbr\fR, \fB\-z norewrite\-endbr\fR As a security measure, some CPU instruction sets have recently gained a feature to protect control flow integrity by disallowing indirect branches by default\. If the feature is enabled, the instruction that is executed immediately after an indirect branch must be a branch target marker instruction, or a CPU\-level fault will be raised\. The marker instruction is also known as a "landing pad" instruction, on which indirect branches can land\. This feature makes ROP attacks harder to conduct\. .IP To use the feature, a function whose pointer is taken needs to begin with a landing pad because a function call via a function pointer is compiled to an indirect branch\. On the other hand, if a function is called only directly (i\.e\. referred to only by \fIdirect\fR branch instructions), it doesn't have to begin with it\. .IP By default, the compiler always emits a landing pad at the beginning of each global function because it doesn't know whether or not the function's pointer is taken in another translation unit\. As a result, the resulting binary has more attack surface than necessary\. .IP If \fB\-z rewrite\-endbr\fR is given, mold conducts a whole program analysis to identify functions whose addresses are actually taken and rewrites landing pads with no\-ops for non\-address\-taken functions, reducing the attack surface\. .IP This feature is currently available only on x86\-64\.
.SH "GNU\-COMPATIBLE OPTIONS" .TP \fB\-\-help\fR Report usage information to stdout and exit\. .TP \fB\-v\fR, \fB\-\-version\fR Report version information to stdout\. .TP \fB\-V\fR Report version and target information to stdout\. .TP \fB\-E\fR, \fB\-\-export\-dynamic\fR, \fB\-\-no\-export\-dynamic\fR When creating an executable, using the \fB\-E\fR option causes all global symbols to be put into the dynamic symbol table, so that the symbols are visible from other ELF modules at runtime\. .IP By default, or if \fB\-\-no\-export\-dynamic\fR is given, only symbols that are referenced by DSOs at link\-time are exported from an executable\. .TP \fB\-F\fR \fIlibname\fR, \fB\-\-filter\fR=\fIlibname\fR Set the \fBDT_FILTER\fR dynamic section field to \fIlibname\fR\. .TP \fB\-I\fR \fIfile\fR, \fB\-\-dynamic\-linker\fR=\fIfile\fR, \fB\-\-no\-dynamic\-linker\fR Set the dynamic linker path to \fIfile\fR\. If no \fB\-I\fR option is given, or if \fB\-\-no\-dynamic\-linker\fR is given, no dynamic linker path is set to an output file\. This is contrary to the GNU linkers which set a default dynamic linker path in that case\. This difference doesn't usually make any difference because the compiler driver always passes \fB\-I\fR to the linker\. .TP \fB\-L\fR \fIdir\fR, \fB\-\-library\-path\fR=\fIdir\fR Add \fIdir\fR to the list of library search paths from which \fBmold\fR searches libraries for the \fB\-l\fR option\. .IP Unlike the GNU linkers, \fBmold\fR does not have default search paths\. This difference doesn't usually make any difference because the compiler driver always passes all necessary search paths to the linker\. .TP \fB\-M\fR, \fB\-\-print\-map\fR Write a map file to stdout\. .TP \fB\-N\fR, \fB\-\-omagic\fR, \fB\-\-no\-omagic\fR Force \fBmold\fR to emit an output file with an old\-fashioned memory layout\. First, it makes the first data segment not aligned to a page boundary\. Second, text segments are marked as writable if the option is given\. 
.TP \fB\-S\fR, \fB\-\-strip\-debug\fR Omit \fB\.debug_*\fR sections from the output file\. .TP \fB\-T\fR \fIfile\fR, \fB\-\-script\fR=\fIfile\fR Read linker script from \fIfile\fR\. .TP \fB\-X\fR, \fB\-\-discard\-locals\fR Discard temporary local symbols to reduce the sizes of the symbol table and the string table\. Temporary local symbols are local symbols starting with \fB\.L\fR\. Compilers usually generate such symbols for unnamed program elements such as string literals or floating\-point literals\. .TP \fB\-e\fR \fIsymbol\fR, \fB\-\-entry\fR=\fIsymbol\fR Use \fIsymbol\fR as the entry point of the program\. .TP \fB\-f\fR \fIshlib\fR, \fB\-\-auxiliary\fR=\fIshlib\fR Set the \fBDT_AUXILIARY\fR dynamic section field to \fIshlib\fR\. .TP \fB\-h\fR \fIlibname\fR, \fB\-\-soname\fR=\fIlibname\fR Set the \fBDT_SONAME\fR dynamic section field to \fIlibname\fR\. This option is used when creating a shared object file\. Typically, when you create \fBlibfoo\.so\fR, you want to pass \fB\-\-soname=foo\fR to a linker\. .TP \fB\-l\fR \fIlibname\fR Search for \fBlib\fR\fIlibname\fR\fB\.so\fR or \fBlib\fR\fIlibname\fR\fB\.a\fR from library search paths\. .TP \fB\-m\fR \fItarget\fR Choose a \fItarget\fR\. .TP \fB\-o\fR \fIfile\fR, \fB\-\-output\fR=\fIfile\fR Use \fIfile\fR as the output file name instead of the default name \fBa\.out\fR\. .TP \fB\-r\fR, \fB\-\-relocatable\fR Instead of generating an executable or a shared object file, combine input object files to generate another object file that can be used as an input to a linker\. .TP \fB\-s\fR, \fB\-\-strip\-all\fR Omit \fB\.symtab\fR section from the output file\. .TP \fB\-u\fR \fIsymbol\fR, \fB\-\-undefined\fR=\fIsymbol\fR If \fIsymbol\fR remains as an undefined symbol after reading all object files, and if there is a static archive that contains an object file defining \fIsymbol\fR, pull out the object file and link it so that the output file contains a definition of \fIsymbol\fR\.
.TP \fB\-y\fR \fIsymbol\fR, \fB\-\-trace\-symbol\fR=\fIsymbol\fR Trace references to \fIsymbol\fR\. .TP \fB\-\-Bdynamic\fR Link against shared libraries\. .TP \fB\-\-Bstatic\fR Do not link against shared libraries\. .TP \fB\-\-Bsymbolic\fR When creating a shared library, make global symbols export\-only (i\.e\. do not import the same symbol)\. As a result, references within a shared library are always resolved locally, negating symbol override at runtime\. See "Dynamic symbol resolution" for more information about symbol imports and exports\. .TP \fB\-\-Bsymbolic\-functions\fR This option has the same effect as \fB\-\-Bsymbolic\fR but works only for function symbols\. Data symbols remain being both imported and exported\. .TP \fB\-\-Bsymbolic\-non\-weak\fR This option has the same effect as \fB\-\-Bsymbolic\fR but works only for non\-weak symbols\. Weak symbols remain being both imported and exported\. .TP \fB\-\-Bsymbolic\-non\-weak\-functions\fR This option has the same effect as \fB\-\-Bsymbolic\fR but works only for non\-weak function symbols\. Data symbols and weak function symbols remain being both imported and exported\. .TP \fB\-\-Bno\-symbolic\fR Cancel \fB\-\-Bsymbolic\fR, \fB\-\-Bsymbolic\-functions\fR, \fB\-\-Bsymbolic\-non\-weak\fR and \fB\-\-Bsymbolic\-non\-weak\-functions\fR\. .TP \fB\-\-Map\fR=\fIfile\fR Write map file to \fIfile\fR\. .TP \fB\-\-Tbss\fR=\fIaddress\fR Alias for \fB\-\-section\-start=\.bss=\fR\fIaddress\fR\. .TP \fB\-\-Tdata\fR=\fIaddress\fR Alias for \fB\-\-section\-start=\.data=\fR\fIaddress\fR\. .TP \fB\-\-Ttext\fR=\fIaddress\fR Alias for \fB\-\-section\-start=\.text=\fR\fIaddress\fR\. .TP \fB\-\-allow\-multiple\-definition\fR Normally, the linker reports an error if there is more than one definition of a symbol\. This option changes the default behavior so that it doesn't report an error for duplicate definitions and instead uses the first definition\.
.TP \fB\-\-allow\-shlib\-undefined\fR, \fB\-\-no\-allow\-shlib\-undefined\fR Even if mold succeeds in linking a main executable without undefined symbol errors, you may still encounter symbol lookup errors at runtime because the dynamic linker cannot find some symbols in shared libraries in any ELF module\. This occurs because mold ignores undefined symbols in shared libraries by default\. .IP If you pass \fB\-\-no\-allow\-shlib\-undefined\fR, mold verifies that undefined symbols in shared libraries given to the linker can be resolved at link\-time\. In other words, this converts the runtime error to a link\-time error\. .IP Note that you need to pass all shared libraries, including indirectly dependent ones, to the linker as arguments for \fB\-l\fR\. If a shared library depends on a library that's not passed to the linker, the verification will be skipped for that file\. .TP \fB\-\-as\-needed\fR, \fB\-\-no\-as\-needed\fR By default, shared libraries given to the linker are unconditionally added to the list of required libraries in an output file\. However, shared libraries after \fB\-\-as\-needed\fR are added to the list only when at least one symbol is actually used by the output file\. In other words, shared libraries after \fB\-\-as\-needed\fR are not added to the list of needed libraries if they are not needed by a program\. .IP The \fB\-\-no\-as\-needed\fR option restores the default behavior for subsequent files\. .TP \fB\-\-build\-id\fR=[ \fBmd5\fR | \fBsha1\fR | \fBsha256\fR | \fBfast\fR | \fBuuid\fR | \fB0x\fR\fIhexstring\fR | \fBnone\fR ] Create a \fB\.note\.gnu\.build\-id\fR section containing a byte string to uniquely identify an output file\. \fBsha256\fR computes a 256\-bit cryptographic hash of an output file and sets it as the build\-id\. \fBmd5\fR and \fBsha1\fR compute the same hash but truncate it to 128 and 160 bits, respectively, before setting it as the build\-id\. \fBuuid\fR sets a random 128\-bit UUID\. \fB0x\fR\fIhexstring\fR sets \fIhexstring\fR\.
\fBfast\fR is a synonym for \fBsha256\fR\. .TP \fB\-\-build\-id\fR Synonym for \fB\-\-build\-id=sha256\fR\. .TP \fB\-\-no\-build\-id\fR Synonym for \fB\-\-build\-id=none\fR\. .TP \fB\-\-compress\-debug\-sections\fR=[ \fBzlib\fR | \fBzlib\-gabi\fR | \fBzstd\fR | \fBnone\fR ] Compress DWARF debug info (\fB\.debug_*\fR sections) using the zlib or zstd compression algorithm\. \fBzlib\-gabi\fR is an alias for \fBzlib\fR\. .TP \fB\-\-defsym\fR=\fIsymbol\fR=\fIvalue\fR Define \fIsymbol\fR as an alias for \fIvalue\fR\. .IP \fIvalue\fR is either an integer (in decimal or hexadecimal with \fB0x\fR prefix) or a symbol name\. If an integer is given as a value, \fIsymbol\fR is defined as an absolute symbol with the given value\. .TP \fB\-\-default\-symver\fR Use soname as a symbol version and append that version to all symbols\. .TP \fB\-\-demangle\fR, \fB\-\-no\-demangle\fR Demangle C++ and Rust symbols in log messages\. .TP \fB\-\-dependency\-file\fR=\fIfile\fR Write a dependency file to \fIfile\fR\. The contents of the written file are readable by make(1); the file defines only one rule with the linker's output file as a target and all input files as its prerequisites\. Users are expected to include the generated dependency file into a Makefile to automate the dependency management\. This option is analogous to the compiler's \fB\-MM \-MF\fR options\. .TP \fB\-\-dynamic\-list\fR=\fIfile\fR Read a list of dynamic symbols from \fIfile\fR\. Same as \fB\-\-export\-dynamic\-symbol\-list\fR, except that it implies \fB\-\-Bsymbolic\fR\. If \fIfile\fR does not exist in the current directory, it is searched from library search paths for the sake of compatibility with GNU ld\. .TP \fB\-\-eh\-frame\-hdr\fR, \fB\-\-no\-eh\-frame\-hdr\fR Create \fB\.eh_frame_hdr\fR section\. .TP \fB\-\-emit\-relocs\fR The linker usually "consumes" relocation sections\. That is, the linker applies relocations to other sections, and relocation sections themselves are discarded\.
.IP The \fB\-\-emit\-relocs\fR instructs the linker to leave relocation sections in the output file\. Some post\-link binary analysis or optimization tools such as LLVM Bolt need them\. .TP \fB\-\-enable\-new\-dtags\fR, \fB\-\-disable\-new\-dtags\fR By default, \fBmold\fR emits \fBDT_RUNPATH\fR for \fB\-\-rpath\fR\. If you pass \fB\-\-disable\-new\-dtags\fR, \fBmold\fR emits \fBDT_RPATH\fR for \fB\-\-rpath\fR instead\. .TP \fB\-\-execute\-only\fR Make executable segments unreadable\. .TP \fB\-\-exclude\-libs\fR=\fIlibraries\fR \|\.\|\.\|\. Mark all symbols in the given \fIlibraries\fR hidden\. .TP \fB\-\-export\-dynamic\-symbol\fR=\fIsymbol\fR Put symbols matching \fIsymbol\fR in the dynamic symbol table\. \fIsymbol\fR may be a glob pattern in the same syntax as for the \fB\-\-export\-dynamic\-symbol\-list\fR or \fB\-\-version\-script\fR options\. .TP \fB\-\-export\-dynamic\-symbol\-list\fR=\fIfile\fR Read a list of dynamic symbols from \fIfile\fR\. .TP \fB\-\-fatal\-warnings\fR, \fB\-\-no\-fatal\-warnings\fR Treat warnings as errors\. .TP \fB\-\-fini\fR=\fIsymbol\fR Call \fIsymbol\fR at unload\-time\. .TP \fB\-\-gc\-sections\fR, \fB\-\-no\-gc\-sections\fR Remove unreferenced sections\. .TP \fB\-\-gdb\-index\fR Create a \fB\.gdb_index\fR section to speed up GNU debugger\. To use this, you need to compile source files with the \fB\-ggnu\-pubnames\fR compiler flag\. .TP \fB\-\-hash\-style\fR=[ \fBsysv\fR | \fBgnu\fR | \fBboth\fR | \fBnone\fR ] Set hash style\. .TP \fB\-\-icf\fR=[ \fBsafe\fR | \fBall\fR | \fBnone\fR ], \fB\-\-no\-icf\fR It is not uncommon for a program to contain many identical functions that differ only in name\. For example, a C++ template \fBstd::vector\fR is very likely to be instantiated to the identical code for \fBstd::vector<int>\fR and \fBstd::vector<unsigned>\fR because the container cares only about the size of the parameter type\. Identical Code Folding (ICF) is a size optimization to identify and merge such identical functions\.
.IP If \fB\-\-icf=all\fR is given, \fBmold\fR tries to merge all identical functions\. This reduces the size of the output most, but it is not a "safe" optimization\. It is guaranteed in C and C++ that two pointers pointing to two different functions will never be equal, but \fB\-\-icf=all\fR breaks that assumption as two identical functions have the same address after merging\. So care must be taken when you use this flag that your program does not depend on function pointer uniqueness\. .IP \fB\-\-icf=safe\fR is a flag to merge functions only when it is safe to do so\. That is, if a program does not take the address of a function, it is safe to merge that function with another function, as you cannot compare a function pointer with something else without taking the address of a function\. .IP \fB\-\-icf=safe\fR needs to be used with a compiler that supports the \fB\.llvm_addrsig\fR section, which contains the information as to what symbols are address\-taken\. LLVM/Clang supports that section by default\. Since GCC does not support it yet, you cannot use \fB\-\-icf=safe\fR with GCC (it doesn't do any harm but can't optimize at all\.) .IP \fB\-\-icf=none\fR and \fB\-\-no\-icf\fR disable ICF\. .TP \fB\-\-ignore\-data\-address\-equality\fR Make ICF merge not only functions but also data\. This option should be used in combination with \fB\-\-icf=all\fR\. .TP \fB\-\-image\-base\fR=\fIaddr\fR Set the base address to \fIaddr\fR\. .TP \fB\-\-init\fR=\fIsymbol\fR Call \fIsymbol\fR at load\-time\. .TP \fB\-\-no\-undefined\fR Report undefined symbols (even with \fB\-\-shared\fR)\. .TP \fB\-\-noinhibit\-exec\fR Create an output file even if errors occur\. .TP \fB\-\-package\-metadata\fR=\fIpercent\-encoded\-string\fR Embed a specified string into the \fB\.note\.package\fR section\. This option is designed for build scripts that generate binary packages, such as \fB\.rpm\fR or \fB\.deb\fR, to include package metadata in each executable\.
It simplifies the process of identifying the corresponding package for a given executable or core file\. .IP An argument to this option is treated as percent\-encoded and decoded before being inserted into the section, allowing you to avoid the use of the comma (\fB,\fR) character in the argument\. This is useful because the compiler replaces all occurrences of commas in \fB\-Wl,\fR with spaces before forwarding them to the linker\. Note that \fBmold\fR always interprets the argument as percent\-encoded, so you also need to escape all occurrences of \fB%\fR as \fB%25\fR\. .TP \fB\-\-pack\-dyn\-relocs\fR=[ \fBrelr\fR | \fBnone\fR ] If \fBrelr\fR is specified, all \fBR_*_RELATIVE\fR relocations are put into \fB\.relr\.dyn\fR section instead of \fB\.rel\.dyn\fR or \fB\.rela\.dyn\fR section\. Since \fB\.relr\.dyn\fR section uses a space\-efficient encoding scheme, specifying this flag can reduce the size of the output\. This is typically most effective for position\-independent executable\. .IP Note that a runtime loader has to support \fB\.relr\.dyn\fR to run executables or shared libraries linked with \fB\-\-pack\-dyn\-relocs=relr\fR\. As of 2022, only ChromeOS, Android and Fuchsia support it\. .TP \fB\-\-pie\fR, \fB\-\-pic\-executable\fR, \fB\-\-no\-pie\fR, \fB\-\-no\-pic\-executable\fR Create a position\-independent executable\. .TP \fB\-\-print\-gc\-sections\fR, \fB\-\-no\-print\-gc\-sections\fR Print removed unreferenced sections\. .TP \fB\-\-print\-icf\-sections\fR, \fB\-\-no\-print\-icf\-sections\fR Print folded identical sections\. .TP \fB\-\-push\-state\fR, \fB\-\-pop\-state\fR \fB\-\-push\-state\fR saves the current values of \fB\-\-as\-needed\fR, \fB\-\-whole\-archive\fR, \fB\-\-static\fR, and \fB\-\-start\-lib\fR\. The saved values can be restored by pop\-state\. .IP \fB\-\-push\-state\fR and \fB\-\-pop\-state\fR pairs can nest\. .IP These options are useful when you want to construct linker command line options programmatically\. 
For example, if you want to link \fBlibfoo\.so\fR by as\-needed basis but don't want to change the global state of \fB\-\-as\-needed\fR, you can append \fB\-\-push\-state \-\-as\-needed \-lfoo \-\-pop\-state\fR to the linker command line options\. .TP \fB\-\-relax, \-\-no\-relax\fR Rewrite machine instructions with more efficient ones for some relocations\. The feature is enabled by default\. .TP \fB\-\-require\-defined\fR=\fIsymbol\fR Like \fB\-\-undefined\fR, except the new symbol must be defined by the end of the link\. .TP \fB\-\-retain\-symbols\-file\fR=\fIfile\fR Keep only symbols listed in \fIfile\fR\. \fIfile\fR is a text file containing a symbol name on each line\. \fBmold\fR discards all local symbols as well as global symbol that are not in \fIfile\fR\. Note that this option removes symbols only from \fB\.symtab\fR section and does not affect \fB\.dynsym\fR section, which is used for dynamic linking\. .TP \fB\-\-rpath\fR=\fIdir\fR Add \fIdir\fR to runtime search paths\. .TP \fB\-\-section\-start\fR=\fIsection\fR=\fIaddress\fR Set \fIaddress\fR to section\. \fIaddress\fR is a hexadecimal number that may start with an optional \fB0x\fR\. .TP \fB\-\-shared\fR, \fB\-\-Bshareable\fR Create a share library\. .TP \fB\-\-spare\-dynamic\-tags\fR=\fInumber\fR Append the given number of \fBDT_NULL\fR entries to the end of the \fB\.dynamic\fR section, so that post\-link processing tools can easily add new dynamic tags by overwriting the null entries\. .TP \fB\-\-start\-lib\fR, \fB\-\-end\-lib\fR Handle object files between \fB\-\-start\-lib\fR and \fB\-\-end\-lib\fR as if they were in an archive file\. That means object files between them are linked only when they are needed to resolve undefined symbols\. The options are useful if you want to link object files only when they are needed but want to avoid the overhead of running ar(3)\. .TP \fB\-\-static\fR Do not link against shared libraries\. 
.TP \fB\-\-sysroot\fR=\fIdir\fR Set target system root directory to \fIdir\fR\. .TP \fB\-\-trace\fR Print name of each input file\. .TP \fB\-\-undefined\-glob\fR=\fIpattern\fR Synonym for \fB\-\-undefined\fR, except that \fB\-\-undefined\-glob\fR takes a glob pattern instead of just a single symbol name\. .TP \fB\-\-undefined\-version\fR, \fB\-\-no\-undefined\-version\fR By default, \fBmold\fR warns on a symbol specified by a version script or by \fB\-\-export\-dynamic\-symbol\fR if it is not defined\. You can silence the warning by \fB\-\-undefined\-version\fR\. .TP \fB\-\-unique\fR=\fIpattern\fR Don't merge input sections that match the given glob pattern \fIpattern\fR\. .TP \fB\-\-unresolved\-symbols\fR=[ \fBreport\-all\fR | \fBignore\-all\fR | \fBignore\-in\-object\-files\fR | \fBignore\-in\-shared\-libs\fR ] How to handle undefined symbols\. .TP \fB\-\-version\-script\fR=\fIfile\fR Read version script from \fIfile\fR\. If \fIfile\fR does not exist in the current directory, it is searched from library search paths for the sake of compatibility with GNU ld\. .TP \fB\-\-warn\-common\fR, \fB\-\-no\-warn\-common\fR Warn about common symbols\. .TP \fB\-\-warn\-once\fR Only warn once for each undefined symbol instead of warn for each relocation referring an undefined symbol\. .TP \fB\-\-warn\-unresolved\-symbols\fR, \fB\-\-error\-unresolved\-symbols\fR Normally, the linker reports an error for unresolved symbols\. \fB\-\-warn\-unresolved\-symbols\fR option turns it into a warning\. \fB\-\-error\-unresolved\-symbols\fR option restores the default behavior\. .TP \fB\-\-whole\-archive\fR, \fB\-\-no\-whole\-archive\fR When archive files (\fB\.a\fR files) are given to the linker, only object files that are needed to resolve undefined symbols are extracted from them and linked to an output file\. \fB\-\-whole\-archive\fR changes that behavior for subsequent archives so that the linker extracts all object files and links them to an output\. 
For example, if you are creating a shared object file and you want to include all archive members to the output, you should pass \fB\-\-whole\-archive\fR\. \fB\-\-no\-whole\-archive\fR restores the default behavior for subsequent archives\. .TP \fB\-\-wrap\fR=\fIsymbol\fR Make \fIsymbol\fR be resolved to \fB__wrap_\fR\fIsymbol\fR\. The original symbol can be resolved as \fB__real_\fR\fIsymbol\fR\. This option is typically used for wrapping an existing function\. .TP \fB\-z cet\-report\fR=[ \fBwarning\fR | \fBerror\fR | \fBnone\fR ] Intel Control\-flow Enforcement Technology (CET) is a new x86 feature available since Tiger Lake which is released in 2020\. It defines new instructions to harden security to protect programs from control hijacking attacks\. You can tell the compiler to use the feature by specifying the \fB\-fcf\-protection\fR flag\. .IP \fB\-z cet\-report\fR flag is used to make sure that all object files were compiled with a correct \fB\-fcf\-protection\fR flag\. If \fBwarning\fR or \fBerror\fR are given, \fBmold\fR prints out a warning or an error message if an object file was not compiled with the compiler flag\. .IP \fBmold\fR looks for \fBGNU_PROPERTY_X86_FEATURE_1_IBT\fR bit and \fBGNU_PROPERTY_X86_FEATURE_1_SHSTK\fR bit in \fB\.note\.gnu\.property\fR section to determine whether or not an object file was compiled with \fB\-fcf\-protection\fR\. .TP \fB\-z now\fR, \fB\-z lazy\fR By default, functions referring to other ELF modules are resolved by the dynamic linker when they are called for the first time\. \fB\-z now\fR marks an executable or a shared library file so that all dynamic symbols are resolved when a file is loaded to memory\. \fB\-z lazy\fR restores the default behavior\. .TP \fB\-z origin\fR Mark object requiring immediate \fB$ORIGIN\fR processing at runtime\. .TP \fB\-z ibt\fR Turn on \fBGNU_PROPERTY_X86_FEATURE_1_IBT\fR bit in \fB\.note\.gnu\.property\fR section to indicate that the output uses IBT\-enabled PLT\. 
This option implies \fB\-z ibtplt\fR\. .TP \fB\-z ibtplt\fR Generate Intel Branch Tracking (IBT)\-enabled PLT which is the default on x86\-64\. This is the default\. .TP \fB\-z execstack\fR, \fB\-z noexecstack\fR By default, the pages for the stack area (i\.e\. the pages where local variables reside) are not executable for security reasons\. \fB\-z execstack\fR makes it executable\. \fB\-z noexecstack\fR restores the default behavior\. .TP \fB\-z keep\-text\-section\-prefix\fR, \fB\-z nokeep\-text\-section\-prefix\fR Keep \fB\.text\.hot\fR, \fB\.text\.unknown\fR, \fB\.text\.unlikely\fR, \fB\.text\.startup\fR, and \fB\.text\.exit\fR as separate sections in the final binary instead of merging them as \fB\.text\fR\. .TP \fB\-z rodynamic\fR Make the \fB\.dynamic\fR section read\-only\. .TP \fB\-z relro\fR, \fB\-z norelro\fR Some sections such as \fB\.dynamic\fR have to be writable only during a module is being loaded to memory\. Once the dynamic linker finishes its job, such sections won't be mutated by anyone\. As a security mitigation, it is preferred to make such segments read\-only during program execution\. .IP \fB\-z relro\fR puts such sections into a special segment called \fBrelro\fR\. The dynamic linker makes a relro segment read\-only after it finishes its job\. .IP By default, \fBmold\fR generates a relro segment\. \fB\-z norelro\fR disables the feature\. .TP \fB\-z sectionheader\fR, \fB\-z nosectionheader\fR \fB\-z nosectionheader\fR tell the linker to omit the section header\. By default, the linker does not omit the section header\. .TP \fB\-z separate\-loadable\-segments\fR, \fB\-z separate\-code\fR, \fB\-z noseparate\-code\fR If one memory page contains multiple segments, the page protection bits are set in such a way that the needed attributes (writable or executable) are satisfied for all segments\. This usually happens at a boundary of two segments with two different attributes\. 
.IP \fBseparate\-loadable\-segments\fR adds paddings between segments with different attributes so that they do not share the same page\. .IP \fBseparate\-code\fR adds paddings only between executable and non\-executable segments\. This is the default\. .IP \fBnoseparate\-code\fR does not add any paddings between segments\. .TP \fB\-z defs\fR, \fB\-z nodefs\fR Report undefined symbols (even with \fB\-\-shared\fR)\. .TP \fB\-z shstk\fR Enforce shadow stack by turning \fBGNU_PROPERTY_X86_FEATURE_1_SHSTK\fR bit in \fB\.note\.gnu\.property\fR output section\. Shadow stack is part of Intel Control\-flow Enforcement Technology (CET), which is available since Tiger Lake (2020)\. .TP \fB\-z start_stop_visibility\fR=[ \fBhidden\fR | \fBprotected\fR ] If a section name is valid as a C identifier (i\.e\., it matches \fB/^[_a\-zA\-Z][_a\-zA\-Z0\-9]*$/\fR), mold creates \fB__start_SECNAME\fR and \fB__stop_SECNAME\fR symbols to mark the beginning and end of the section, where \fBSECNAME\fR is the section name\. .IP You can make these marker symbols visible from other ELF modules by passing \fB\-z start_stop_visibility=protected\fR\. Default is \fBhidden\fR\. .TP \fB\-z text\fR, \fB\-z notext\fR, \fB\-z textoff\fR \fBmold\fR by default reports an error if dynamic relocations are created in read\-only sections\. If \fB\-z notext\fR or \fB\-z textoff\fR are given, \fBmold\fR creates such dynamic relocations without reporting an error\. \fB\-z text\fR restores the default behavior\. .TP \fB\-z max\-page\-size\fR=\fInumber\fR Some CPU ISAs support multiple memory page sizes\. This option specifies the maximum page size that an output binary can run on\. In general, binaries built for a larger page size can run on a system with a smaller page size, but not vice versa\. The default value is 4 KiB for i386, x86\-64, and RISC\-V, and 64 KiB for ARM64\. .TP \fB\-z nodefaultlib\fR Make the dynamic loader ignore default search paths\. 
.TP \fB\-z nodelete\fR Mark DSO non\-deletable at runtime\. .TP \fB\-z nodlopen\fR Mark DSO not available to dlopen(3)\. This option makes it possible for the linker to optimize thread\-local variable accesses by rewriting instructions for some targets\. .TP \fB\-z nodump\fR Mark DSO not available to dldump(3)\. .TP \fB\-z nocopyreloc\fR Do not create copy relocations\. .TP \fB\-z initfirst\fR Mark DSO to be initialized first at runtime\. .TP \fB\-z interpose\fR Mark object to interpose all DSOs but executable\. .TP \fB\-(\fR, \fB\-)\fR, \fB\-EL\fR, \fB\-O\fR\fInumber\fR, \fB\-\-dc\fR, \fB\-\-dp\fR, \fB\-\-end\-group\fR, \fB\-\-no\-add\-needed\fR, \fB\-\-no\-copy\-dt\-needed\-entries\fR, \fB\-\-nostdlib\fR, \fB\-\-rpath\-link=Ar dir\fR, \fB\-\-sort\-common\fR, \fB\-\-sort\-section\fR, \fB\-\-start\-group\fR, \fB\-\-warn\-constructors\fR, \fB\-\-warn\-once\fR, \fB\-\-fix\-cortex\-a53\-835769\fR, \fB\-\-fix\-cortex\-a53\-843419\fR, \fB\-z combreloc\fR, \fB\-z common\-page\-size\fR, \fB\-z nocombreloc\fR Ignored .SH "ENVIRONMENT VARIABLES" .TP \fBMOLD_JOBS\fR If this variable is set to \fB1\fR, only one \fBmold\fR process will run at a time\. If a new mold process is initiated while another is already active, the new process will wait until the active one completes before starting\. .IP The primary reason for this environment variable is to minimize peak memory usage\. Since mold is designed to operate with high parallelism, running multiple mold instances simultaneously may not be beneficial\. If you execute N instances of mold concurrently, it could require N times the time and N times the memory\. On the other hand, running them one after the other might still take N times longer, but the peak memory usage would be the same as running just a single instance\. 
.IP If your build system invokes multiple linker processes simultaneously and some of them often get killed due to out\-of\-memory errors, you might consider setting this environment variable to \fB1\fR to see if it addresses the OOM issue\. .IP Currently, any value other than \fB1\fR is silently ignored\. .TP \fBMOLD_DEBUG\fR If this variable is set to a non\-empty string, \fBmold\fR embeds its command\-line options in the output file's \fB\.comment\fR section\. .TP \fBMOLD_REPRO\fR Setting this variable to a non\-empty string has the same effect as passing the \fB\-\-repro\fR option\. .SH "SEE ALSO" gold(1), ld(1), elf(5), ld\.so(8) .SH "AUTHOR" Rui Ueyama \fIruiu@cs\.stanford\.edu\fR .SH "BUGS" Report bugs to \fIhttps://github\.com/rui314/mold/issues\fR\. ================================================ FILE: docs/mold.md ================================================ mold(1) -- a modern linker ========================== ## SYNOPSIS `mold` [_option_...] _file_... ## DESCRIPTION `mold` is a faster drop-in replacement for the default GNU ld(1). ### How to use See . ### Compatibility `mold` is designed to be a drop-in replacement for the GNU linkers for linking user-land programs. If your user-land program cannot be built due to missing command-line options, please file a bug at . `mold` supports a very limited set of linker script features, which is just sufficient to read `/usr/lib/x86_64-linux-gnu/libc.so` on Linux systems (on Linux, that file is contrary to its name not a shared library but an ASCII linker script that loads a real `libc.so` file.) Beyond that, we have no plan to support any additional linker script features. The linker script is an ad-hoc, over-designed, complex language which we believe needs to be replaced by a simpler mechanism. We have a plan to add a replacement for the linker script to `mold` instead. ### Archive symbol resolution Traditionally, Unix linkers are sensitive to the order in which input files appear on the command line. 
They process input files from the first (leftmost) file to the last (rightmost) file one-by-one. While reading input files, they maintain sets of defined and undefined symbols. When visiting an archive file (`.a` files), they pull out object files to resolve as many undefined symbols as possible and move on to the next input file. Object files that weren't pulled out will never have a chance for a second look. Due to this behavior, you usually have to add archive files at the end of a command line, so that when a linker reaches archive files, it knows what symbols remain as undefined. If you put archive files at the beginning of a command line, a linker doesn't have any undefined symbols, and thus no object files will be pulled out from archives. You can change the processing order by using the `--start-group` and `--end-group` options, though they make a linker slower. `mold`, as well as the LLVM lld(1) linker, takes a different approach. They remember which symbols can be resolved from archive files instead of forgetting them after processing each archive. Therefore, `mold` and lld(1) can "go back" in a command line to pull out object files from archives if they are needed to resolve remaining undefined symbols. They are not sensitive to the input file order. `--start-group` and `--end-group` are still accepted by `mold` and lld(1) for compatibility with traditional linkers, but they are silently ignored. ### Dynamic symbol resolution Some Unix linker features are difficult to understand without comprehending the semantics of dynamic symbol resolution. Therefore, even though it's not specific to `mold`, we'll explain it here. We use "ELF module" or just "module" as a collective term to refer to an executable or a shared library file in the ELF format. An ELF module may have lists of imported symbols and exported symbols, as well as a list of shared library names from which imported symbols should be imported. 
The point is that imported symbols are not bound to any specific shared library until runtime. Here is how the Unix dynamic linker resolves dynamic symbols. Upon the start of an ELF program, the dynamic linker constructs a list of ELF modules which, as a whole, consist of a complete program. The executable file is always at the beginning of the list followed by its dependent shared libraries. An imported symbol is searched from the beginning of the list to the end. If two or more modules define the same symbol, the one that appears first in the list takes precedence over the others. This Unix semantics are contrary to systems such as Windows that have a two-level namespace for dynamic symbols. On Windows, for example, dynamic symbols are represented as a tuple of (`symbol-name`, `shared-library-name`), so that each dynamic symbol is guaranteed to be resolved from some specific library. Typically, an ELF module that exports a symbol also imports the same symbol. Such a symbol is usually resolved to itself, but that's not the case if a module that appears before it in the symbol search list provides another definition of the same symbol. Let's take `malloc` as an example. Assume that you define your version of `malloc` in your main executable file. Then, all `malloc` calls from any module are resolved to your function instead of the one in libc, because the executable is always at the beginning of the dynamic symbol search list. Note that even `malloc` calls within libc are resolved to your definition since libc exports and imports `malloc`. Therefore, by defining `malloc` yourself, you can overwrite a library function, and the `malloc` in libc becomes dead code. These Unix semantics are tricky and sometimes considered harmful. For example, assume that you accidentally define `atoi` as a global function in your executable that behaves completely differently from the one in the C standard. 
Then, all `atoi` function calls from any modules (even function calls within libc) are redirected to your function instead of the one in libc, which will very likely cause a problem. That is a somewhat surprising consequence for an accidental name conflict. On the other hand, this semantic is sometimes useful because it allows users to override library functions without rebuilding modules containing them. Whether good or bad, you should keep these semantics in mind to understand Unix linkers' behaviors. ### Build reproducibility `mold`'s output is deterministic. That is, if you pass the same object files and the same command-line options to the same version of `mold`, it is guaranteed that `mold` produces the bit-for-bit identical output. The linker's internal randomness, such as the timing of thread scheduling or iteration orders of hash tables, doesn't affect the output. `mold` does not have any host-specific default settings. This is contrary to the GNU linkers, for which some configurable values, such as system-dependent library search paths, are hard-coded. `mold` depends only on its command-line arguments. ## OPTION NOTATIONS Multi-letter long options may precede either a single dash or double dashes, except for those starting with the letter "o". For historical reasons, long options beginning with "o" must precede double dashes. For example, you can spell `--as-needed` as `-as-needed`, but `--omagic` must not be spelled as `-omagic`. `-omagic` will be interpreted not as `--omagic` but as `-o magic`. ## MOLD-SPECIFIC OPTIONS * `--chroot`=_dir_: Set _dir_ as the root directory. * `--color-diagnostics`=[ _auto_ | _always_ | _never_ ]: Show diagnostic messages in color using ANSI escape sequences. `auto` means that `mold` prints out messages in color only if the standard output is connected to a TTY. Default is `auto`. * `--color-diagnostics`: Synonym for `--color-diagnostics=auto`. * `--no-color-diagnostics`: Synonym for `--color-diagnostics=never`. 
* `--detach`, `--no-detach`: Permit or do not permit mold to create a debug info file in the background. * `--fork`, `--no-fork`: Spawn a child process and let it do the actual linking. When linking a large program, the OS kernel can take a few hundred milliseconds to terminate a `mold` process. `--fork` hides that latency. By default, it does fork. Note that `--fork` also hides the resource usage statistics reported by time(2), since it doesn't call waitpid(2) on the child process. If you need those statistics, pass `--no-fork`. * `--perf`: Print performance statistics. * `--print-dependencies`: Print out dependency information for input files. Each line of the output for this option shows which file depends on which file to use a specific symbol. This option is useful for debugging why some object file in a static archive got linked or why some shared library is kept in an output file's dependency list even with `--as-needed`. * `--relocatable-merge-sections`: By default, `mold` doesn't merge input sections by name when merging input object files into a single output object file for `-r`. For example, `.text.foo` and `.text.bar` aren't merged for `-r` even though they are merged into `.text` based on the default section merging rules. This option changes the behavior so that `mold` merges input sections by name by the default section merging rules. * `--repro`: Archive input files, as well as a text file containing command line options, in a tar file so that you can run `mold` with the exact same inputs again. This is useful for reporting a bug with a reproducer. The output filename is `path/to/output.tar`, where `path/to/output` is an output filename specified by `-o`. * `--reverse-sections`: Reverse the order of input sections before assigning them the offsets in the output file. This option is useful for finding bugs that depend on the initialization order of global objects. 
In C++, constructors of global objects in a single source file are guaranteed to be executed in the source order, but there's no such guarantee across compilation units. Usually, constructors are executed in the order given to the linker, but depending on it is a mistake. By reversing the order of input sections using `--reverse-sections`, you can easily test that your program works in the reversed initialization order. * `--run` _command_ _arg_...: Run _command_ with `mold` as `/usr/bin/ld`. Specifically, `mold` runs a given command with the `LD_PRELOAD` environment set to intercept exec(3) family functions and replaces `argv[0]` with itself if it is `ld`, `ld.gold`, or `ld.lld`. * `--separate-debug-file`, `--separate-debug-file`=_file_: Bundle debug info sections into a separate file instead of embedding them in an output executable or a shared library. mold creates a debug info file in the background by default, so that you can start running your executable as soon as possible. By default, the debug info file is created in the same directory as is the output file, with the `.dbg` file extension. That filename is embedded into the output file so that `gdb` can automatically find the debug info file for the output file. For more info about gdb features related to separate debug files, see . mold holds a file lock with flock(2) while creating a debug info file in the background. If you don't want to create a debug info file in the background, pass the `--no-detach` option. * `--shuffle-sections`, `--shuffle-sections`=_number_: Randomize the output by shuffling the order of input sections before assigning them the offsets in the output file. If a _number_ is given, it's used as a seed for the random number generator, so that the linker produces the same output for the same seed. If no seed is given, a random number is used as a seed. This option is useful for benchmarking. Modern CPUs are sensitive to a program's memory layout. 
A seemingly benign change in program layout, such as a small size increase of a function in the middle of a program, can affect the program's performance. Therefore, even if you write new code and get a good benchmark result, it is hard to say whether the new code improves the program's performance; it is possible that the new memory layout happens to perform better. By running a benchmark multiple times with randomized memory layouts using `--shuffle-sections`, you can isolate your program's real performance number from the randomness caused by memory layout changes. * `--spare-program-headers`=_number_: Append the given number of `PT_NULL` entries to the end of the program header, so that post-link processing tools can easily add new segments by overwriting the null entries. Note that ELF requires all `PT_LOAD` segments to be sorted by `p_vaddr`. Therefore, if you add a new LOAD segment, you may need to sort the entire program header. * `--stats`: Print input statistics. * `--thread-count`=_count_: Use _count_ number of threads. * `--threads`, `--no-threads`: Use multiple threads. By default, `mold` uses as many threads as the number of cores or 32, whichever is smaller. The reason it is capped at 32 is because `mold` doesn't scale well beyond that point. To use only one thread, pass `--no-threads` or `--thread-count=1`. * `--quick-exit`, `--no-quick-exit`: Use or do not use `quick_exit` to exit. * `--zero-to-bss`: Convert all-zero data sections into BSS. When this option is enabled, `mold` scans input data sections that are not of type `SHT_NOBITS` and checks whether their contents consist solely of zero bytes. Such sections are then converted into BSS (`SHT_NOBITS`) sections. This reduces the size of the output file, since BSS sections occupy no space in the file image. This behavior is especially useful for user-defined sections created with `__attribute__((section(".sectname")))` that contain uninitialized global variables. 
GCC and Clang do not automatically mark such sections as BSS even if their contents are entirely zero, and instead emit them as regular data sections. For example, consider `__attribute__((section(".sectname"))) int vec[256];`. By default, this results in a `.sectname` section of type `SHT_PROGBITS` filled with zeros. With `--zero-to-bss`, the linker will recognize it as empty data and convert it to a `SHT_NOBITS` section, reducing the output file size without changing runtime semantics. * `-z rewrite-endbr`, `-z norewrite-endbr`: As a security measure, some CPU instruction sets have recently gained a feature to protect control flow integrity by disallowing indirect branches by default. If the feature is enabled, the instruction that is executed immediately after an indirect branch must be an branch target marker instruction, or a CPU-level fault will raise. The marker instruction is also known as "landing pad" instruction, to which indirect branches can land. This feature makes ROP attacks harder to conduct. To use the feature, a function whose pointer is taken needs to begin with a landing pad because a function call via a function pointer is compiled to an indirect branch. On the other hand, if a function is called only directly (i.e. referred to only by _direct_ branch instructions), it doesn't have to begin with it. By default, the compiler always emits a landing pad at the beginning of each global function because it doesn't know whether or not the function's pointer is taken in another translation unit. As a result, the resulting binary has more attack surface than necessary. If `--rewrite-endbr` is given, mold conducts a whole program analysis to identify functions whose addresses are actually taken and rewrites landing pads with no-ops for non-address-taken functions, reducing the attack surface. This feature is currently available only on x86-64. ## GNU-COMPATIBLE OPTIONS * `--help`: Report usage information to stdout and exit. 
* `-v`, `--version`: Report version information to stdout. * `-V`: Report version and target information to stdout. * `-E`, `--export-dynamic`, `--no-export-dynamic`: When creating an executable, using the `-E` option causes all global symbols to be put into the dynamic symbol table, so that the symbols are visible from other ELF modules at runtime. By default, or if `--no-export-dynamic` is given, only symbols that are referenced by DSOs at link-time are exported from an executable. * `-F` _libname_, `--filter`=_libname_: Set the `DT_FILTER` dynamic section field to _libname_. * `-I` _file_, `--dynamic-linker`=_file_, `--no-dynamic-linker`: Set the dynamic linker path to _file_. If no `-I` option is given, or if `--no-dynamic-linker` is given, no dynamic linker path is set to an output file. This is contrary to the GNU linkers which set a default dynamic linker path in that case. This difference doesn't usually make any difference because the compiler driver always passes `-I` to the linker. * `-L` _dir_, `--library-path`=_dir_: Add _dir_ to the list of library search paths from which `mold` searches libraries for the `-l` option. Unlike the GNU linkers, `mold` does not have default search paths. This difference doesn't usually make any difference because the compiler driver always passes all necessary search paths to the linker. * `-M`, `--print-map`: Write a map file to stdout. * `-N`, `--omagic`, `--no-omagic`: Force `mold` to emit an output file with an old-fashioned memory layout. First, it makes the first data segment not aligned to a page boundary. Second, text segments are marked as writable if the option is given. * `-S`, `--strip-debug`: Omit `.debug_*` sections from the output file. * `-T` _file_, `--script`=_file_: Read linker script from _file_. * `-X`, `--discard-locals`: Discard temporary local symbols to reduce the sizes of the symbol table and the string table. Temporary local symbols are local symbols starting with `.L`. 
Compilers usually generate such symbols for unnamed program elements such as string literals or floating-point literals. * `-e` _symbol_, `--entry`=_symbol_: Use _symbol_ as the entry point symbol instead of the default entry point symbol _start. * `-f` _shlib_, `--auxiliary`=_shlib_: Set the `DT_AUXILIARY` dynamic section field to _shlib_. * `-h` _libname_, `--soname`=_libname_: Set the `DT_SONAME` dynamic section field to _libname_. This option is used when creating a shared object file. Typically, when you create `libfoo.so`, you want to pass `--soname=foo` to a linker. * `-l` _libname_: Search for `lib`_libname_`.so` or `lib`_libname_`.a` from library search paths. * `-m` _target_: Choose a _target_. * `-o` _file_, `--output`=_file_: Use _file_ as the output file name instead of the default name `a.out`. * `-r`, `--relocatable`: Instead of generating an executable or a shared object file, combine input object files to generate another object file that can be used as an input to a linker. * `-s`, `--strip-all`: Omit `.symtab` section from the output file. * `-u` _symbol_, `--undefined`=_symbol_: If _symbol_ remains as an undefined symbol after reading all object files, and if there is a static archive that contains an object file defining _symbol_, pull out the object file and link it so that the output file contains a definition of _symbol_. * `-y` _symbol_, `--trace-symbol`=_symbol_: Trace references to _symbol_. * `--Bdynamic`: Link against shared libraries. * `--Bstatic`: Do not link against shared libraries. * `--Bsymbolic`: When creating a shared library, make global symbols export-only (i.e. do not import the same symbol). As a result, references within a shared library are always resolved locally, negating symbol override at runtime. See "Dynamic symbol resolution" for more information about symbol imports and exports. * `--Bsymbolic-functions`: This option has the same effect as `--Bsymbolic` but works only for function symbols. 
Data symbols remain being both imported and exported. * `--Bsymbolic-non-weak`: This option has the same effect as `--Bsymbolic` but works only for non-weak symbols. Weak symbols remain being both imported and exported. * `--Bsymbolic-non-weak-functions`: This option has the same effect as `--Bsymbolic` but works only for non-weak function symbols. Data symbols and weak function symbols remain being both imported and exported. * `--Bno-symbolic`: Cancel `--Bsymbolic`, `--Bsymbolic-functions`, `--Bsymbolic-non-weak` and `--Bsymbolic-non-weak-functions`. * `--Map`=_file_: Write map file to _file_. * `--Tbss`=_address_: Alias for `--section-start=.bss=`_address_. * `--Tdata`=_address_: Alias for `--section-start=.data=`_address_. * `--Ttext`=_address_: Alias for `--section-start=.text=`_address_. * `--allow-multiple-definition`: Normally, the linker reports an error if there are more than one definition of a symbol. This option changes the default behavior so that it doesn't report an error for duplicate definitions and instead use the first definition. * `--allow-shlib-undefined`, `--no-allow-shlib-undefined`: Even if mold succeeds in linking a main executable without undefined symbol errors, you may still encounter symbol lookup errors at runtime because the dynamic linker cannot find some symbols in shared libraries in any ELF module. This occurs because mold ignores undefined symbols in shared libraries by default. If you pass `--no-allow-shlib-undefined`, mold verifies that undefined symbols in shared libraries given to the linker can be resolved at link-time. In other words, this converts the runtime error to a link-time error. Note that you need to pass all shared libraries, including indirectly dependent ones, to the linker as arguments for `-l`. If a shared library depends on a library that's not passed to the linker, the verification will be skipped for that file. 
* `--as-needed`, `--no-as-needed`: By default, shared libraries given to the linker are unconditionally added to the list of required libraries in an output file. However, shared libraries after `--as-needed` are added to the list only when at least one symbol is actually used by the output file. In other words, shared libraries after `--as-needed` are not added to the list of needed libraries if they are not needed by a program. The `--no-as-needed` option restores the default behavior for subsequent files. * `--build-id`=[ `md5` | `sha1` | `sha256` | `fast` | `uuid` | `0x`_hexstring_ | `none` ]: Create a `.note.gnu.build-id` section containing a byte string to uniquely identify an output file. `sha256` computes a 256-bit cryptographic hash of an output file and sets it to build-id. `md5` and `sha1` compute the same hash but truncate it to 128 and 160 bits, respectively, before setting it to build-id. `uuid` sets a random 128-bit UUID. `0x`_hexstring_ sets _hexstring_. `fast` is a synonym for `sha256`. * `--build-id`: Synonym for `--build-id=sha256`. * `--no-build-id`: Synonym for `--build-id=none`. * `--compress-debug-sections`=[ `zlib` | `zlib-gabi` | `zstd` | `none` ]: Compress DWARF debug info (`.debug_*` sections) using the zlib or zstd compression algorithm. `zlib-gabi` is an alias for `zlib`. * `--defsym`=_symbol_=_value_: Define _symbol_ as an alias for _value_. _value_ is either an integer (in decimal or hexadecimal with `0x` prefix) or a symbol name. If an integer is given as a value, _symbol_ is defined as an absolute symbol with the given value. * `--default-symver`: Use soname as a symbol version and append that version to all symbols. * `--demangle`, `--no-demangle`: Demangle C++ and Rust symbols in log messages. * `--dependency-file`=_file_: Write a dependency file to _file_. The contents of the written file are readable by make(1), which defines only one rule with the linker's output file as a target and all input files as its prerequisites. 
Users are expected to include the generated dependency file into a Makefile to automate the dependency management. This option is analogous to the compiler's `-MM -MF` options. * `--dynamic-list`=_file_: Read a list of dynamic symbols from _file_. Same as `--export-dynamic-symbol-list`, except that it implies `--Bsymbolic`. If _file_ does not exist in the current directory, it is searched from library search paths for the sake of compatibility with GNU ld. * `--eh-frame-hdr`, `--no-eh-frame-hdr`: Create `.eh_frame_hdr` section. * `--emit-relocs`: The linker usually "consumes" relocation sections. That is, the linker applies relocations to other sections, and relocation sections themselves are discarded. The `--emit-relocs` instructs the linker to leave relocation sections in the output file. Some post-link binary analysis or optimization tools such as LLVM Bolt need them. * `--enable-new-dtags`, `--disable-new-dtags`: By default, `mold` emits `DT_RUNPATH` for `--rpath`. If you pass `--disable-new-dtags`, `mold` emits `DT_RPATH` for `--rpath` instead. * `--execute-only`: Traditionally, setting the executable bit to 1 for a memory page implies that the page also becomes readable, which allows machine code to be read as data at runtime. That is actually what an attacker often does after gaining limited control of a process to find pieces of machine code they can use to gain full control of the process. As a mitigation, recent processors including some ARM64 ones allow "execute-only" pages. If a page is execute-only, you can call a function there as long as you know its address but can't read it as data. This option marks text segments as execute-only by setting just the "X" bit instead of "RX". Note that on most systems, the absence of the "R" bit in the text segment serves just as a hint. 
If you run a program linked with `--execute-only` on a processor that doesn't support execute-only pages, your executable will likely still function normally, but the text segment will remain readable. * `--exclude-libs`=_libraries_ ...: Mark all symbols in the given _libraries_ hidden. * `--export-dynamic-symbol`=_symbol_: Put symbols matching _symbol_ in the dynamic symbol table. _symbol_ may be a glob pattern in the same syntax as for the `--export-dynamic-symbol-list` or `--version-script` options. * `--export-dynamic-symbol-list`=_file_: Read a list of dynamic symbols from _file_. * `--fatal-warnings`, `--no-fatal-warnings`: Treat warnings as errors. * `--fini`=_symbol_: Call _symbol_ at unload-time. * `--gc-sections`, `--no-gc-sections`: Remove unreferenced sections. * `--gdb-index`: Create a `.gdb_index` section to speed up GNU debugger. To use this, you need to compile source files with the `-ggnu-pubnames` compiler flag. * `--hash-style`=[ `sysv` | `gnu` | `both` | `none` ]: Set hash style. * `--icf`=[ `safe` | `all` | `none` ], `--no-icf`: It is not uncommon for a program to contain many identical functions that differ only in name. For example, a C++ template `std::vector` is very likely to be instantiated to the identical code for `std::vector<int>` and `std::vector<unsigned>` because the container cares only about the size of the parameter type. Identical Code Folding (ICF) is a size optimization to identify and merge such identical functions. If `--icf=all` is given, `mold` tries to merge all identical functions. This reduces the size of the output most, but it is not a "safe" optimization. It is guaranteed in C and C++ that two pointers pointing to two different functions will never be equal, but `--icf=all` breaks that assumption as two identical functions have the same address after merging. So care must be taken when you use this flag that your program does not depend on the function pointer uniqueness. 
`--icf=safe` is a flag to merge functions only when it is safe to do so. That is, if a program does not take an address of a function, it is safe to merge that function with other functions, as you cannot compare a function pointer with something else without taking an address of a function. `--icf=safe` needs to be used with a compiler that supports `.llvm_addrsig` section which contains the information as to what symbols are address-taken. LLVM/Clang supports that section by default. Since GCC does not support it yet, you cannot use `--icf=safe` with GCC (it doesn't do any harm but can't optimize at all.) `--icf=none` and `--no-icf` disable ICF. * `--ignore-data-address-equality`: Make ICF merge not only functions but also data. This option should be used in combination with `--icf=all`. * `--image-base`=_addr_: Set the base address to _addr_. * `--init`=_symbol_: Call _symbol_ at load-time. * `--no-undefined`: Report undefined symbols (even with `--shared`). * `--noinhibit-exec`: Create an output file even if errors occur. * `--package-metadata`=_percent-encoded-string_: Embed a specified string into the `.note.package` section. This option is designed for build scripts that generate binary packages, such as `.rpm` or `.deb`, to include package metadata in each executable. It simplifies the process of identifying the corresponding package for a given executable or core file. An argument to this option is treated as percent-encoded and decoded before being inserted into the section, allowing you to avoid the use of the comma (`,`) character in the argument. This is useful because the compiler replaces all occurrences of commas in `-Wl,` with spaces before forwarding them to the linker. Note that `mold` always interprets the argument as percent-encoded, so you also need to escape all occurrences of `%` as `%25`. 
* `--pack-dyn-relocs`=[ `relr` | `none` ]: If `relr` is specified, all `R_*_RELATIVE` relocations are put into `.relr.dyn` section instead of `.rel.dyn` or `.rela.dyn` section. Since `.relr.dyn` section uses a space-efficient encoding scheme, specifying this flag can reduce the size of the output. This is typically most effective for position-independent executables. Note that a runtime loader has to support `.relr.dyn` to run executables or shared libraries linked with `--pack-dyn-relocs=relr`. As of 2022, only ChromeOS, Android and Fuchsia support it. * `--pie`, `--pic-executable`, `--no-pie`, `--no-pic-executable`: Create a position-independent executable. * `--print-gc-sections`, `--no-print-gc-sections`: Print removed unreferenced sections. * `--print-icf-sections`, `--no-print-icf-sections`: Print folded identical sections. * `--push-state`, `--pop-state`: `--push-state` saves the current values of `--as-needed`, `--whole-archive`, `--static`, and `--start-lib`. The saved values can be restored by `--pop-state`. `--push-state` and `--pop-state` pairs can nest. These options are useful when you want to construct linker command line options programmatically. For example, if you want to link `libfoo.so` on an as-needed basis but don't want to change the global state of `--as-needed`, you can append `--push-state --as-needed -lfoo --pop-state` to the linker command line options. * `--relax`, `--no-relax`: Rewrite machine instructions with more efficient ones for some relocations. The feature is enabled by default. * `--require-defined`=_symbol_: Like `--undefined`, except the new symbol must be defined by the end of the link. * `--retain-symbols-file`=_file_: Keep only symbols listed in _file_. _file_ is a text file containing a symbol name on each line. `mold` discards all local symbols as well as global symbols that are not in _file_. Note that this option removes symbols only from `.symtab` section and does not affect `.dynsym` section, which is used for dynamic linking. 
* `--rpath`=_dir_: Add _dir_ to runtime search paths. * `--section-start`=_section_=_address_: Set the address of _section_ to _address_. _address_ is a hexadecimal number that may start with an optional `0x`. * `--shared`, `--Bshareable`: Create a shared library. * `--spare-dynamic-tags`=_number_: Append the given number of `DT_NULL` entries to the end of the `.dynamic` section, so that post-link processing tools can easily add new dynamic tags by overwriting the null entries. * `--start-lib`, `--end-lib`: Handle object files between `--start-lib` and `--end-lib` as if they were in an archive file. That means object files between them are linked only when they are needed to resolve undefined symbols. The options are useful if you want to link object files only when they are needed but want to avoid the overhead of running ar(1). * `--static`: Do not link against shared libraries. * `--sysroot`=_dir_: Set target system root directory to _dir_. * `--trace`: Print name of each input file. * `--undefined-glob`=_pattern_: Synonym for `--undefined`, except that `--undefined-glob` takes a glob pattern instead of just a single symbol name. * `--undefined-version`, `--no-undefined-version`: By default, `mold` warns on a symbol specified by a version script or by `--export-dynamic-symbol` if it is not defined. You can silence the warning by `--undefined-version`. * `--unique`=_pattern_: Don't merge input sections that match the given glob pattern _pattern_. * `--unresolved-symbols`=[ `report-all` | `ignore-all` | `ignore-in-object-files` | `ignore-in-shared-libs` ]: How to handle undefined symbols. * `--version-script`=_file_: Read version script from _file_. If _file_ does not exist in the current directory, it is searched from library search paths for the sake of compatibility with GNU ld. * `--warn-common`, `--no-warn-common`: Warn about common symbols. * `--warn-once`: Only warn once for each undefined symbol instead of warning for each relocation referring to an undefined symbol. 
* `--warn-unresolved-symbols`, `--error-unresolved-symbols`: Normally, the linker reports an error for unresolved symbols. `--warn-unresolved-symbols` option turns it into a warning. `--error-unresolved-symbols` option restores the default behavior. * `--whole-archive`, `--no-whole-archive`: When archive files (`.a` files) are given to the linker, only object files that are needed to resolve undefined symbols are extracted from them and linked to an output file. `--whole-archive` changes that behavior for subsequent archives so that the linker extracts all object files and links them to an output. For example, if you are creating a shared object file and you want to include all archive members to the output, you should pass `--whole-archive`. `--no-whole-archive` restores the default behavior for subsequent archives. * `--wrap`=_symbol_: Make _symbol_ be resolved to `__wrap_`_symbol_. The original symbol can be resolved as `__real_`_symbol_. This option is typically used for wrapping an existing function. * `-z cet-report`=[ `warning` | `error` | `none` ]: Intel Control-flow Enforcement Technology (CET) is a new x86 feature available since Tiger Lake which is released in 2020. It defines new instructions to harden security to protect programs from control hijacking attacks. You can tell the compiler to use the feature by specifying the `-fcf-protection` flag. `-z cet-report` flag is used to make sure that all object files were compiled with a correct `-fcf-protection` flag. If `warning` or `error` are given, `mold` prints out a warning or an error message if an object file was not compiled with the compiler flag. `mold` looks for `GNU_PROPERTY_X86_FEATURE_1_IBT` bit and `GNU_PROPERTY_X86_FEATURE_1_SHSTK` bit in `.note.gnu.property` section to determine whether or not an object file was compiled with `-fcf-protection`. 
* `-z now`, `-z lazy`: By default, functions referring to other ELF modules are resolved by the dynamic linker when they are called for the first time. `-z now` marks an executable or a shared library file so that all dynamic symbols are resolved when a file is loaded to memory. `-z lazy` restores the default behavior. * `-z origin`: Mark object requiring immediate `$ORIGIN` processing at runtime. * `-z ibt`: Turn on `GNU_PROPERTY_X86_FEATURE_1_IBT` bit in `.note.gnu.property` section to indicate that the output uses IBT-enabled PLT. This option implies `-z ibtplt`. * `-z ibtplt`: Generate Intel Branch Tracking (IBT)-enabled PLT. This is the default on x86-64. * `-z execstack`, `-z noexecstack`: By default, the pages for the stack area (i.e. the pages where local variables reside) are not executable for security reasons. `-z execstack` makes it executable. `-z noexecstack` restores the default behavior. * `-z keep-text-section-prefix`, `-z nokeep-text-section-prefix`: Keep `.text.hot`, `.text.unknown`, `.text.unlikely`, `.text.startup`, and `.text.exit` as separate sections in the final binary instead of merging them as `.text`. * `-z rodynamic`: Make the `.dynamic` section read-only. * `-z relro`, `-z norelro`: Some sections such as `.dynamic` have to be writable only while a module is being loaded to memory. Once the dynamic linker finishes its job, such sections won't be mutated by anyone. As a security mitigation, it is preferred to make such segments read-only during program execution. `-z relro` puts such sections into a special segment called `relro`. The dynamic linker makes a relro segment read-only after it finishes its job. By default, `mold` generates a relro segment. `-z norelro` disables the feature. * `-z sectionheader`, `-z nosectionheader`: `-z nosectionheader` tells the linker to omit the section header. By default, the linker does not omit the section header. 
* `-z separate-loadable-segments`, `-z separate-code`, `-z noseparate-code`: If one memory page contains multiple segments, the page protection bits are set in such a way that the needed attributes (writable or executable) are satisfied for all segments. This usually happens at a boundary of two segments with two different attributes. `separate-loadable-segments` adds paddings between segments with different attributes so that they do not share the same page. `separate-code` adds paddings only between executable and non-executable segments. This is the default. `noseparate-code` does not add any paddings between segments. * `-z defs`, `-z nodefs`: Report undefined symbols (even with `--shared`). * `-z shstk`: Enforce shadow stack by turning on the `GNU_PROPERTY_X86_FEATURE_1_SHSTK` bit in `.note.gnu.property` output section. Shadow stack is part of Intel Control-flow Enforcement Technology (CET), which is available since Tiger Lake (2020). * `-z start_stop_visibility`=[ `hidden` | `protected` ]: If a section name is valid as a C identifier (i.e., it matches `/^[_a-zA-Z][_a-zA-Z0-9]*$/`), mold creates `__start_SECNAME` and `__stop_SECNAME` symbols to mark the beginning and end of the section, where `SECNAME` is the section name. You can make these marker symbols visible from other ELF modules by passing `-z start_stop_visibility=protected`. Default is `hidden`. * `-z text`, `-z notext`, `-z textoff`: `mold` by default reports an error if dynamic relocations are created in read-only sections. If `-z notext` or `-z textoff` is given, `mold` creates such dynamic relocations without reporting an error. `-z text` restores the default behavior. * `-z max-page-size`=_number_: Some CPU ISAs support multiple memory page sizes. This option specifies the maximum page size that an output binary can run on. In general, binaries built for a larger page size can run on a system with a smaller page size, but not vice versa. 
The default value is 4 KiB for i386, x86-64, and RISC-V, and 64 KiB for ARM64. * `-z nodefaultlib`: Make the dynamic loader ignore default search paths. * `-z nodelete`: Mark DSO non-deletable at runtime. * `-z nodlopen`: Mark DSO not available to dlopen(3). This option makes it possible for the linker to optimize thread-local variable accesses by rewriting instructions for some targets. * `-z nodump`: Mark DSO not available to dldump(3). * `-z nocopyreloc`: Do not create copy relocations. * `-z initfirst`: Mark DSO to be initialized first at runtime. * `-z interpose`: Mark object to interpose all DSOs but executable. * `-(`, `-)`, `-EL`, `-O`_number_, `--dc`, `--dp`, `--end-group`, `--no-add-needed`, `--no-copy-dt-needed-entries`, `--nostdlib`, `--rpath-link`=_dir_, `--sort-common`, `--sort-section`, `--start-group`, `--warn-constructors`, `--warn-once`, `--fix-cortex-a53-835769`, `--fix-cortex-a53-843419`, `-z combreloc`, `-z common-page-size`, `-z nocombreloc`: Ignored ## ENVIRONMENT VARIABLES * `MOLD_JOBS`: If this variable is set to `1`, only one `mold` process will run at a time. If a new mold process is initiated while another is already active, the new process will wait until the active one completes before starting. The primary reason for this environment variable is to minimize peak memory usage. Since mold is designed to operate with high parallelism, running multiple mold instances simultaneously may not be beneficial. If you execute N instances of mold concurrently, it could require N times the time and N times the memory. On the other hand, running them one after the other might still take N times longer, but the peak memory usage would be the same as running just a single instance. If your build system invokes multiple linker processes simultaneously and some of them often get killed due to out-of-memory errors, you might consider setting this environment variable to `1` to see if it addresses the OOM issue. 
Currently, any value other than `1` is silently ignored. * `MOLD_DEBUG`: If this variable is set to a non-empty string, `mold` embeds its command-line options in the output file's `.comment` section. * `MOLD_REPRO`: Setting this variable to a non-empty string has the same effect as passing the `--repro` option. ## SEE ALSO gold(1), ld(1), elf(5), ld.so(8) ## AUTHOR Rui Ueyama ## BUGS Report bugs to <https://github.com/rui314/mold/issues>. ================================================ FILE: install-build-deps.sh ================================================ #!/bin/sh # This script installs binary packages needed to build mold. # Feel free to send me a PR if your OS is not on this list. set -e . /etc/os-release set -x case "$ID" in ubuntu | pop | linuxmint | debian | raspbian | neon | zorin) apt-get update apt-get install -y cmake gcc g++ clang gdb ;; fedora | fedora-* | amzn | rhel | centos) dnf install -y gcc-c++ cmake glibc-static libstdc++-static diffutils util-linux tar ;; rocky | ol) dnf install -y gcc-c++ cmake diffutils util-linux ;; opensuse-*) zypper install -y make cmake gcc-c++ glibc-devel-static tar diffutils util-linux ;; gentoo) emerge-webrsync FEATURES='getbinpkg binpkg-request-signature' emerge dev-build/cmake ;; arch | archarm | artix | endeavouros | manjaro | cachyos) pacman -Sy --needed --noconfirm base-devel cmake util-linux ;; void) xbps-install -Sy xbps bash make cmake gcc tar diffutils util-linux ;; alpine) apk update apk add bash make linux-headers cmake gcc g++ ;; clear-linux-os) swupd update swupd bundle-add c-basic diffutils ;; almalinux) dnf install -y gcc-toolset-13-gcc-c++ gcc-toolset-13-libstdc++-devel cmake diffutils ;; altlinux) apt-get update apt-get install -y gcc-c++ make cmake ctest diffutils util-linux ;; freebsd) pkg update pkg install -y cmake bash binutils gcc ;; *) echo "Error: don't know anything about build dependencies on $ID-$VERSION_ID" exit 1 esac ================================================ FILE: install-cross-tools.sh 
================================================ #!/bin/bash set -e . /etc/os-release set -x # This script install packages for -DMOLD_ENABLE_QEMU_TESTS=1 # to enable cross-target tests. # # Feel free to send me a PR if your OS is not on this list. case "$ID" in ubuntu | pop | linuxmint | debian | raspbian) apt-get install -y qemu-user {gcc,g++}-{i686,aarch64,riscv64,powerpc,powerpc64,powerpc64le,s390x,sparc64,m68k,sh4}-linux-gnu {gcc,g++}-arm-linux-gnueabihf ;; *) echo "Error: don't know anything about build dependencies on $ID-$VERSION_ID" exit 1 esac ================================================ FILE: lib/aho-corasick.cc ================================================ // This file implements the Aho-Corasick algorithm to search multiple // strings within an input string simultaneously. It is essentially a // trie with additional links. For details, see // https://en.wikipedia.org/wiki/Aho-Corasick_algorithm. // // We use it for simple glob patterns in version scripts or dynamic // list files. Here are some examples of glob patterns: // // qt_private_api_tag* // *16QAccessibleCache* // *32QAbstractFileIconProviderPrivate* // *17QPixmapIconEngine* // // Aho-Corasick can do only substring search, so it cannot handle // complex glob patterns such as `*foo*bar*`. We handle such patterns // with the Glob class. 
#include "lib.h"

#include <queue>

namespace mold {

// Returns true if `pat` can be handled by this class. That is the case
// if, after dropping one optional leading '*' and one optional trailing
// '*', the pattern contains no glob metacharacter ('*', '?' or '[').
bool AhoCorasick::can_handle(std::string_view pat) {
  if (pat.starts_with('*'))
    pat.remove_prefix(1);
  if (pat.ends_with('*'))
    pat.remove_suffix(1);
  return pat.find_first_of("*?[") == pat.npos;
}

// Feeds "\0" + str + "\0" to the automaton and returns the maximum of
// -1 and the values of all nodes reached on the way. Returns -1 if no
// pattern has been added. compile() is expected to have been called
// after the last add(); otherwise suffix links and propagated values
// are not in place.
i64 AhoCorasick::find(std::string_view str) {
  if (nodes.empty())
    return -1;

  i64 idx = 0;
  i64 val = -1;

  // Advance the automaton by one byte. If the current node has no edge
  // for `c`, retry from progressively shorter suffixes by following the
  // suffix links, and fall back to the root if none of them has one.
  auto walk = [&](u8 c) {
    for (i64 j = idx; j != -1; j = nodes[j].suffix_link) {
      i64 child = nodes[j].children[c];
      if (child != -1) {
        idx = child;
        val = std::max<i64>(val, nodes[child].value);
        return;
      }
    }
    idx = 0;
  };

  walk('\0');
  for (u8 c : str)
    walk(c);
  walk('\0');
  return val;
}

// Inserts `pat` into the trie and associates it with `val`. If the same
// pattern is added more than once, the largest value wins. `pat` must
// satisfy can_handle(). Always returns true.
bool AhoCorasick::add(std::string_view pat, i64 val) {
  assert(can_handle(pat));

  // Node 0 is the root; create it lazily.
  if (nodes.empty())
    nodes.resize(1);

  i64 idx = 0;

  // Follow the edge for `c` from the current node, creating the child
  // node if it does not exist yet. Nodes are always accessed by index
  // because resize() may reallocate the vector.
  auto walk = [&](u8 c) {
    if (nodes[idx].children[c] == -1) {
      nodes[idx].children[c] = nodes.size();
      nodes.resize(nodes.size() + 1);
    }
    idx = nodes[idx].children[c];
  };

  // We handle "foo" as if "\0foo\0", "*foo" as if "foo\0", "foo*" as
  // if "\0foo", and "*foo*" as if "foo". Aho-Corasick can do only
  // substring matching, so we use \0 as a beginning/end-of-string
  // markers.
  if (!pat.starts_with('*'))
    walk('\0');
  for (u8 c : pat)
    if (c != '*')
      walk(c);
  if (!pat.ends_with('*'))
    walk('\0');

  nodes[idx].value = std::max<i64>(nodes[idx].value, val);
  return true;
}

// Finalizes the automaton. Must be called after the last add() and
// before the first find().
void AhoCorasick::compile() {
  if (nodes.empty())
    return;
  fix_suffix_links(0);
  fix_values();
}

// Computes the suffix link of every node below `idx`. The suffix link
// of a node is the node that represents the longest proper suffix of
// its string that is also present in the trie (the root if there is no
// such suffix).
//
// The trie is traversed breadth-first. A node's link is derived by
// following the suffix links of strictly shallower nodes, so those links
// must already be final when the node is visited. A depth-first
// traversal does not guarantee that.
void AhoCorasick::fix_suffix_links(i64 idx) {
  std::queue<i64> queue;
  queue.push(idx);

  while (!queue.empty()) {
    i64 parent = queue.front();
    queue.pop();

    for (i64 i = 0; i < 256; i++) {
      i64 child = nodes[parent].children[i];
      if (child == -1)
        continue;

      // Find the longest suffix of the parent that can be extended by
      // byte `i`. The link must point to the extended node itself, not
      // to the suffix node it was extended from, because find() resumes
      // matching from the link target.
      i64 link = 0;
      for (i64 j = nodes[parent].suffix_link; j != -1;
           j = nodes[j].suffix_link) {
        i64 next = nodes[j].children[i];
        if (next != -1) {
          link = next;
          break;
        }
      }

      nodes[child].suffix_link = link;
      queue.push(child);
    }
  }
}

// Propagates values along suffix links so that each node carries the
// largest value of all patterns that end at that node, including the
// ones that are proper suffixes of the node's string. Nodes are visited
// breadth-first, so the value of a suffix (which is always shallower) is
// final before it is read.
void AhoCorasick::fix_values() {
  std::queue<i64> queue;
  queue.push(0);

  do {
    i64 idx = queue.front();
    queue.pop();

    for (i64 child : nodes[idx].children) {
      if (child != -1) {
        i64 suffix = nodes[child].suffix_link;
        nodes[child].value = std::max(nodes[child].value, nodes[suffix].value);
        queue.push(child);
      }
    }
  } while (!queue.empty());
}

} // namespace mold

================================================
FILE: lib/atomics.h
================================================
// This is the same as std::atomic except that the default memory
// order is relaxed instead of sequential consistency.
#pragma once #include namespace mold { template struct Atomic : std::atomic { static constexpr std::memory_order relaxed = std::memory_order_relaxed; using std::atomic::atomic; Atomic(const Atomic &other) : std::atomic(other.load()) {} Atomic &operator=(const Atomic &other) { store(other.load()); return *this; } void operator=(T val) { store(val); } operator T() const { return load(); } void store(T val, std::memory_order order = relaxed) { std::atomic::store(val, order); } T load(std::memory_order order = relaxed) const { return std::atomic::load(order); } T exchange(T val) { return std::atomic::exchange(val, relaxed); } T operator|=(T val) { return std::atomic::fetch_or(val, relaxed); } T operator++() { return std::atomic::fetch_add(1, relaxed) + 1; } T operator--() { return std::atomic::fetch_sub(1, relaxed) - 1; } T operator++(int) { return std::atomic::fetch_add(1, relaxed); } T operator--(int) { return std::atomic::fetch_sub(1, relaxed); } bool test_and_set() { // A relaxed load + branch (assuming miss) takes only around 20 cycles, // while an atomic RMW can easily take hundreds on x86. We note that it's // common that another thread beat us in marking, so doing an optimistic // early test tends to improve performance in the ~20% ballpark. 
return load() || exchange(true); } }; } // namespace mold ================================================ FILE: lib/bitvector.h ================================================ #pragma once #include "integers.h" #include #include namespace mold { class BitvectorProxy { public: BitvectorProxy(u64 &word, size_t pos) : word(word), mask(1ULL << pos) {} BitvectorProxy &operator=(bool val) { if (val) word |= mask; else word &= ~mask; return *this; } BitvectorProxy &operator=(const BitvectorProxy &other) { return *this = (bool)other; } operator bool() const { return word & mask; } private: u64 &word; u64 mask; }; class Bitvector { public: Bitvector() = default; Bitvector(i64 n) : size(n), words((n + 63) / 64) {} void resize(i64 n) { words.clear(); words.resize((n + 63) / 64); size = n; } Bitvector &operator|=(const Bitvector &x) { assert(size == x.size); for (i64 i = 0; i < words.size(); i++) words[i] |= x.words[i]; return *this; } Bitvector &operator&=(const Bitvector &x) { assert(size == x.size); for (i64 i = 0; i < words.size(); i++) words[i] &= x.words[i]; return *this; } Bitvector &operator<<=(size_t n) { assert(n == 1); for (i64 i = words.size() - 1; i > 0; i--) words[i] = (words[i] << 1) | (words[i - 1] >> 63); words[0] <<= 1; return *this; } BitvectorProxy operator[](size_t pos) { assert(pos < size); return {words[pos / 64], pos % 64}; } i64 size = 0; std::vector words; }; } // namespace mold ================================================ FILE: lib/compress.cc ================================================ // This file implements a multi-threaded zlib and zstd compression // routine. // // zlib-compressed data can be merged just by concatenation as long as // each piece of data is flushed with Z_SYNC_FLUSH. In this file, we // split input data into multiple shards, compress them individually // and concatenate them. We then append a header, a trailer and a // checksum so that the concatenated data is valid zlib-format data. 
// // zstd-compressed data can be merged in the same way. // // Using threads to compress data has a downside. Since the dictionary // is reset on boundaries of shards, compression ratio is sacrificed // a little bit. However, if a shard size is large enough, that loss // is negligible in practice. #include "lib.h" #include #include #include #define CHECK(fn) \ do { \ [[maybe_unused]] int r = (fn); \ assert(r == Z_OK); \ } while (0) namespace mold { static constexpr i64 SHARD_SIZE = 1024 * 1024; Compressor::~Compressor() { for (std::span shard : shards) delete[] shard.data(); } static std::vector> split(std::span input) { std::vector> vec; while (!input.empty()) { i64 sz = std::min(SHARD_SIZE, input.size()); vec.push_back(input.subspan(0, sz)); input = input.subspan(sz); } return vec; } static std::span zlib_compress(std::span input) { // Initialize zlib stream. Since debug info is generally compressed // pretty well with lower compression levels, we chose compression // level 1. z_stream strm = {}; CHECK(deflateInit2(&strm, 1, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY)); // Set an input buffer strm.avail_in = input.size(); strm.next_in = input.data(); // Set an output buffer. deflateBound() returns an upper bound // on the compression size. +16 for Z_SYNC_FLUSH. i64 bufsize = deflateBound(&strm, strm.avail_in) + 16; u8 *buf = new u8[bufsize]; // Compress data. It writes all compressed bytes except the last // partial byte, so up to 7 bits can be held to be written to the // buffer. strm.avail_out = bufsize; strm.next_out = buf; CHECK(deflate(&strm, Z_BLOCK)); // This is a workaround for libbacktrace before 2022-04-06. // // Zlib is a bit stream, and what Z_SYNC_FLUSH does is to write a // three bit value indicating the start of an uncompressed data // block followed by four byte data 00 00 ff ff which indicates that // the length of the block is zero. 
libbacktrace uses its own zlib // inflate routine, and it had a bug that if that particular three // bit value happens to end at a byte boundary, it accidentally // skipped the next byte. // // In order to avoid triggering that bug, we should avoid calling // deflate() with Z_SYNC_FLUSH if the current bit position is 5. // If it's 5, we insert an empty block consisting of 10 bits so // that the bit position is 7 in the next byte. // // https://github.com/ianlancetaylor/libbacktrace/pull/87 int nbits = 0; deflatePending(&strm, Z_NULL, &nbits); if (nbits == 5) CHECK(deflatePrime(&strm, 10, 2)); CHECK(deflate(&strm, Z_SYNC_FLUSH)); deflateEnd(&strm); return {buf, (size_t)(bufsize - strm.avail_out)}; } ZlibCompressor::ZlibCompressor(u8 *buf, i64 size) { std::vector> inputs = split(std::span(buf, size)); std::vector adlers(inputs.size()); shards.resize(inputs.size()); // Compress each shard tbb::parallel_for((i64)0, (i64)inputs.size(), [&](i64 i) { std::span in = inputs[i]; adlers[i] = adler32(1, in.data(), in.size()); shards[i] = zlib_compress(in); }); // Combine checksums checksum = adlers[0]; for (i64 i = 1; i < inputs.size(); i++) checksum = adler32_combine(checksum, adlers[i], inputs[i].size()); // Comput the total size compressed_size = 8; // the header and the trailer for (std::span &shard : shards) compressed_size += shard.size(); } void ZlibCompressor::write_to(u8 *buf) { // Write a zlib-format header buf[0] = 0x78; buf[1] = 0x9c; // Copy compressed data std::vector offsets(shards.size()); offsets[0] = 2; // +2 for the header for (i64 i = 1; i < shards.size(); i++) offsets[i] = offsets[i - 1] + shards[i - 1].size(); tbb::parallel_for((i64)0, (i64)shards.size(), [&](i64 i) { memcpy(buf + offsets[i], shards[i].data(), shards[i].size()); }); // Write a trailer u8 *end = buf + compressed_size; end[-6] = 3; end[-5] = 0; *(ub32 *)(end - 4) = checksum; } static std::span zstd_compress(std::span input) { i64 bufsize = ZSTD_COMPRESSBOUND(input.size()); u8 *buf = new 
u8[bufsize]; int level = 3; // compression level; must be between 1 to 22 size_t sz = ZSTD_compress(buf, bufsize, input.data(), input.size(), level); assert(!ZSTD_isError(sz)); return {buf, sz}; } ZstdCompressor::ZstdCompressor(u8 *buf, i64 size) { std::vector> inputs = split(std::span(buf, size)); shards.resize(inputs.size()); // Compress each shard tbb::parallel_for((i64)0, (i64)inputs.size(), [&](i64 i) { shards[i] = zstd_compress(inputs[i]); }); compressed_size = 0; for (std::span &shard : shards) compressed_size += shard.size(); } void ZstdCompressor::write_to(u8 *buf) { // Copy compressed data std::vector offsets(shards.size()); for (i64 i = 1; i < shards.size(); i++) offsets[i] = offsets[i - 1] + shards[i - 1].size(); tbb::parallel_for((i64)0, (i64)shards.size(), [&](i64 i) { memcpy(buf + offsets[i], shards[i].data(), shards[i].size()); }); } } // namespace mold ================================================ FILE: lib/config.h.in ================================================ #define MOLD_VERSION "@mold_VERSION@" #define MOLD_LIBDIR "@CMAKE_INSTALL_FULL_LIBDIR@" #define MOLD_FIRST_TARGET @MOLD_FIRST_TARGET@ #cmakedefine01 HAVE_MADVISE #cmakedefine01 HAVE_UNAME #cmakedefine01 MOLD_USE_MIMALLOC #cmakedefine01 MOLD_USE_SYSTEM_MIMALLOC #cmakedefine01 HAVE_TARGET_X86_64 #cmakedefine01 HAVE_TARGET_I386 #cmakedefine01 HAVE_TARGET_ARM64LE #cmakedefine01 HAVE_TARGET_ARM64BE #cmakedefine01 HAVE_TARGET_ARM32LE #cmakedefine01 HAVE_TARGET_ARM32BE #cmakedefine01 HAVE_TARGET_RV32LE #cmakedefine01 HAVE_TARGET_RV32BE #cmakedefine01 HAVE_TARGET_RV64LE #cmakedefine01 HAVE_TARGET_RV64BE #cmakedefine01 HAVE_TARGET_PPC32 #cmakedefine01 HAVE_TARGET_PPC64V1 #cmakedefine01 HAVE_TARGET_PPC64V2 #cmakedefine01 HAVE_TARGET_S390X #cmakedefine01 HAVE_TARGET_SPARC64 #cmakedefine01 HAVE_TARGET_M68K #cmakedefine01 HAVE_TARGET_SH4LE #cmakedefine01 HAVE_TARGET_SH4BE #cmakedefine01 HAVE_TARGET_LOONGARCH64 #cmakedefine01 HAVE_TARGET_LOONGARCH32 
================================================ FILE: lib/crc32.cc ================================================ #include "lib.h" #include #include namespace mold { // This function "forges" a CRC. That is, given the current and a desired // CRC32 value, crc32_solve() returns a binary blob to add to the end of // the original data to yield the desired CRC. Trailing garbage is ignored // by many bianry file formats, so you can create a file with a desired // CRC using crc32_solve(). We need it for --separate-debug-file. std::vector crc32_solve(u32 current, u32 desired) { constexpr u32 poly = 0xedb88320; u32 x = ~desired; // Each iteration computes x = (x * x^-1) mod poly. for (i64 i = 0; i < 32; i++) { x = std::rotl(x, 1); x ^= (x & 1) * (poly << 1); } x ^= ~current; std::vector out(4); out[0] = x; out[1] = x >> 8; out[2] = x >> 16; out[3] = x >> 24; return out; } // Compute a CRC for given data in parallel u32 compute_crc32(u32 crc, u8 *buf, i64 len) { struct Shard { u8 *buf; i64 len; u32 crc; }; constexpr i64 shard_size = 1024 * 1024; // 1 MiB std::vector shards; while (len > 0) { i64 sz = std::min(len, shard_size); shards.push_back({buf, sz, 0}); buf += sz; len -= sz; } tbb::parallel_for_each(shards, [](Shard &shard) { shard.crc = crc32(0, shard.buf, shard.len); }); for (Shard &shard : shards) crc = crc32_combine(crc, shard.crc, shard.len); return crc; } } // namespace mold ================================================ FILE: lib/demangle.cc ================================================ #include "lib.h" #include #ifndef _WIN32 #include #endif #include "../third-party/rust-demangle/rust-demangle.h" namespace mold { std::optional demangle_cpp(std::string_view name) { static thread_local char *buf; static thread_local size_t buflen; // TODO(cwasser): Actually demangle Symbols on Windows using e.g. // `UnDecorateSymbolName` from Dbghelp, maybe even Itanium symbols? 
#ifndef _WIN32 if (name.starts_with("_Z")) { int status; char *p = abi::__cxa_demangle(std::string(name).c_str(), buf, &buflen, &status); if (status == 0) { buf = p; return p; } } #endif return {}; } std::optional demangle_rust(std::string_view name) { static thread_local char *buf; free(buf); buf = rust_demangle(std::string(name).c_str(), 0); if (buf) return buf; return {}; } } // namespace mold ================================================ FILE: lib/filepath.cc ================================================ #include "lib.h" #include #include #ifdef __APPLE__ # include #endif #ifdef __FreeBSD__ # include #endif namespace mold { // Returns the path of the mold executable itself std::string get_self_path() { #if __APPLE__ || _WIN32 fprintf(stderr, "mold: get_self_path is not supported"); exit(1); #elif __FreeBSD__ // /proc may not be mounted on FreeBSD. The proper way to get the // current executable's path is to use sysctl(2). int mib[4]; mib[0] = CTL_KERN; mib[1] = KERN_PROC; mib[2] = KERN_PROC_PATHNAME; mib[3] = -1; size_t size; sysctl(mib, 4, NULL, &size, NULL, 0); std::string path; path.resize(size); sysctl(mib, 4, path.data(), &size, NULL, 0); return path; #else return std::filesystem::read_symlink("/proc/self/exe").string(); #endif } } // namespace mold ================================================ FILE: lib/gentoo-test.sh ================================================ #!/bin/bash # # This test script takes a Gentoo package name and tries to build it # using mold in a Podman environment. We chose Gentoo Linux as a test # target, because its source-based package allows us to build programs # locally and run their test suites without any hassle. 
# # You can get a complete list of Gentoo packages availalbe for testing # with the following command: # # podman run --rm mold-gentoo emerge --color n -s '' | \ # perl -ne 'next unless m!^\*\s+(\S+/\S+)!; print "$1\n"' package="$1" if [ "$package" = "" ]; then echo "Usage: $0 gentoo-package-name" exit 1 fi set -x # Create a Podman image if ! podman image ls mold-gentoo | grep -q mold-gentoo; then set -e cat <> /etc/portage/make.conf && \ echo 'ACCEPT_KEYWORDS="~amd64"' >> /etc/portage/make.conf && \ echo 'ACCEPT_LICENSE="* -@EULA"' >> /etc/portage/make.conf && \ echo 'FEATURES="\${FEATURE} noclean nostrip ccache -ipc-sandbox -network-sandbox -pid-sandbox -sandbox"' >> /etc/portage/make.conf && \ echo 'CCACHE_DIR="/ccache"' >> /etc/portage/make.conf && \ FEATURES='getbinpkg binpkg-request-signature' emerge gdb lld llvm-core/clang vim emacs strace ccache xeyes dev-build/cmake dev-vcs/git && \ rm -rf /var/tmp/portage EOF set +e fi git_hash=$(./dist/mold --version | perl -ne '/\((\w+)/; print $1;') if [ "$package" = dev-libs/concurrencykit ]; then echo "Skipping known broken package: $package" exit 0 fi # Build a given package in Podman cmd1='(cd /usr/bin; ln -sf /mold/dist/mold $(realpath ld))' cmd2="MAKEOPTS=-'j$(nproc) --load-average=100' emerge --onlydeps $package" cmd3="MAKEOPTS='-j$(nproc) --load-average=100' FEATURES=test emerge $package" filename=`echo "$package" | sed 's!/!_!g'` podman="podman run --rm --pids-limit=-1 --cap-add=SYS_PTRACE -v `pwd`:/mold:ro -v /var/cache/ccache-gentoo:/ccache mold-gentoo timeout -v -k 15s 3h" dir=gentoo/$git_hash mkdir -p "$dir"/success "$dir"/failure $podman chrt --idle 0 nice -n 19 bash -c "$cmd1 && $cmd2 && $cmd3" >& "$dir"/"$filename".mold if [ $? = 0 ]; then mv "$dir"/"$filename".mold "$dir"/success else mv "$dir"/"$filename".mold "$dir"/failure fi $podman chrt --idle 0 nice -n 19 bash -c "$cmd2 && $cmd3" >& "$dir"/"$filename".ld if [ $? 
= 0 ]; then mv "$dir"/"$filename".ld "$dir"/success else mv "$dir"/"$filename".ld "$dir"/failure fi ================================================ FILE: lib/glob.cc ================================================ // This file implements a glob matcher that can run multiple glob patterns // against an input string. mold uses the glob matcher for symbol name // patterns in a version script or a dynamic list file. Since we may need to // match hundreds of glob patterns against millions of symbol names, the // speed of the matcher is very important. // // The pattern match implemented in this file is NFA-based, although the // cost of the function is O(n*m), where n is the number of NFA states and m // is the length of the input string. We do not use recursion or // backtracking, unlike a generic NFA-based regular expression matcher. This // is doable because glob patterns are very limited subsets of regexes. // // Here is the explanation of the algorithm. Observe that the only "tricky" // meta-character in a glob pattern is "*", which matches zero or more // characters. Other characters and meta-characters always match a single // input character. So the key of the algorithm is to handle "*" efficiently. // // We can represent a glob pattern "a*b*" with three NFA states: q_start, q1 // and q_accept, with the following transition functions: // // δ(q_start, "a") = q1 // δ(q1, ) = q1 // δ(q1, "b") = q_accept // δ(q_accept, ) = q_accept // // We can construct such an NFA in a straightforward manner. We maintain NFA // states as a list, with the initial contents being the start state. Each // character except for "*" creates a new NFA state, adds a transition from // the last state in the list to the new one, and appends the new state at // the end of the list. "*" sets the "is_star" flag on the last NFA state. // The flag indicates that the state machine can remain in the state for any // input character. 
//
// An NFA constructed this way doesn't have any complicated loops,
// ε-transitions, or anything like that. Each state has only one incoming
// edge. The only loops in the state transition are the self-loops on states
// followed by a "*". Aside from that, the state machine progresses linearly
// from the start state to the accept state.
//
// Each state of an NFA can be represented by a single bit. If a bit is 1,
// the non-deterministic state machine is in that state. Otherwise, it's
// not. Observe that a state with the "is_star" flag will continue to be 1
// once it becomes 1, since the state machine can loop over the state on any
// input character.
//
// With that observation, we can represent an NFA with a bit vector of N
// bits, where N is the number of NFA states. For each input character, bit
// M becomes 1 if
//
//  - bit M-1 is 1 and there's a transition from state_{M-1} to state_M
//    with the given character, or
//  - bit M is 1 and state_M's "is_star" flag is 1.
//
// Initially, the 0th bit is 1 for the start state. At each step, the bits
// propagate from least significant to most significant positions, at most
// one bit at a time. If the most significant bit is 1 after the entire
// input has been processed, the string matches.
//
// This propagation can be implemented with bitwise OR, bitwise AND, and a
// one-bit bit shift on the bit vector. All these operations are very cheap.
//
// We can combine multiple glob matchers into a single matcher by simply
// concatenating the bit vectors of their state machines.
#include "lib.h"

#include <bitset>

namespace mold {

// Translates a glob pattern into a list of NFA states. The first element
// is the start state; every pattern element other than "*" appends one
// state whose `incoming_edge` is the set of bytes that lead into it, and
// "*" sets `is_star` on the most recently added state.
//
// Returns an empty vector if the pattern is malformed (a trailing
// backslash, an unterminated bracket expression or a reversed range).
static std::vector<MultiGlob::State> parse_glob(std::string_view pat) {
  std::vector<MultiGlob::State> vec(1);

  while (!pat.empty()) {
    u8 c = pat[0];
    pat = pat.substr(1);
    std::bitset<256> chars;

    switch (c) {
    case '*':
      vec.back().is_star = true;
      continue;
    case '?':
      chars.set();
      break;
    case '\\':
      if (pat.empty())
        return {};
      // Index with an unsigned byte. A plain `char` may be signed, in
      // which case a byte >= 0x80 would become an out-of-range index.
      chars[(u8)pat[0]] = true;
      pat = pat.substr(1);
      break;
    case '[': {
      // Here are a few bracket pattern examples:
      //
      // [abc]: a, b or c
      // [$\]!]: $, ] or !
      // [a-czg-i]: a, b, c, z, g, h, or i
      // [^a-z]: Any character except lowercase letters
      bool negate = false;
      bool closed = false;

      if (!pat.empty() && pat[0] == '^') {
        negate = true;
        pat = pat.substr(1);
      }

      while (!pat.empty()) {
        if (pat[0] == ']') {
          pat = pat.substr(1);
          closed = true;
          break;
        }

        // A backslash makes the next character literal
        if (pat[0] == '\\') {
          pat = pat.substr(1);
          if (pat.empty())
            return {};
        }

        if (pat.size() >= 3 && pat[1] == '-') {
          // A range such as "a-z". The upper bound may be escaped.
          u8 start = pat[0];
          u8 end = pat[2];
          pat = pat.substr(3);

          if (end == '\\') {
            if (pat.empty())
              return {};
            end = pat[0];
            pat = pat.substr(1);
          }

          if (end < start)
            return {};

          for (i64 i = start; i <= end; i++)
            chars[i] = true;
        } else {
          // A single character. See above for why we cast to u8.
          chars[(u8)pat[0]] = true;
          pat = pat.substr(1);
        }
      }

      if (!closed)
        return {};

      if (negate)
        chars.flip();
      break;
    }
    default:
      chars[c] = true;
      break;
    }

    vec.push_back({chars, false});
  }

  return vec;
}

// Instead of returning just a match/no match boolean value, our glob
// matcher returns an integer value associated with each given pattern.
// If multiple patterns match at the same time, the largest associated
// value will be returned by find().
bool MultiGlob::add(std::string_view pat, i64 val) { std::vector vec = parse_glob(pat); if (vec.empty()) return false; patterns.push_back({std::move(vec), val}); return true; } void MultiGlob::compile() { if (patterns.empty()) return; ranges::stable_sort(patterns, ranges::greater(), &GlobPattern::value); std::vector states; for (GlobPattern &p : patterns) append(states, p.states); i64 sz = states.size(); start_states.resize(sz); for (i64 pos = 0; GlobPattern &p : patterns) { start_states[pos] = true; pos += p.states.size(); } star_mask.resize(sz); for (i64 i = 0; i < sz; i++) if (states[i].is_star) star_mask[i] = true; for (i64 i = 0; i < 256; i++) { char_mask[i].resize(sz); for (i64 j = 0; j < sz; j++) if (states[j].incoming_edge[i]) char_mask[i][j] = true; } } i64 MultiGlob::find(std::string_view str) { if (patterns.empty()) return -1; Bitvector bits = start_states; Bitvector tmp; for (u8 c : str) { // This is equivalent to // // bits = (bits & star_mask) | ((bits << 1) & char_mask[c]) // // but we update the existing objects in place to avoid allocating // temporary objects. tmp = bits; tmp &= star_mask; bits <<= 1; bits &= char_mask[c]; bits |= tmp; } for (i64 pos = 0; GlobPattern &p : patterns) { pos += p.states.size(); if (bits[pos - 1]) return p.value; } return -1; } bool Glob::add(std::string_view pat, i64 val) { assert(val >= 0); assert(!is_compiled); // If the pattern requires only a single substring search, the // Aho-Corasick algorithm is even faster than our glob matcher. 
if (aho_corasick.can_handle(pat)) return aho_corasick.add(pat, val); return multi_glob.add(pat, val); } i64 Glob::find(std::string_view str) { std::call_once(once, [&] { multi_glob.compile(); aho_corasick.compile(); is_compiled = true; }); return std::max(multi_glob.find(str), aho_corasick.find(str)); } } // namespace mold ================================================ FILE: lib/hyperloglog.cc ================================================ // This file implements HyperLogLog algorithm, which estimates // the number of unique items in a given multiset. // // For more info, read // https://engineering.fb.com/2018/12/13/data-infrastructure/hyperloglog #include "lib.h" #include namespace mold { i64 HyperLogLog::get_cardinality() const { double z = 0; for (i64 val : buckets) z += std::ldexp(1.0, -val); return ALPHA * NBUCKETS * NBUCKETS / z; } } // namespace mold ================================================ FILE: lib/integers.h ================================================ // This file defines integral types for file input/output. We need to use // these types instead of the plain integers (such as uint32_t or int32_t) // when reading from/writing to an mmap'ed file area for the following // reasons: // // 1. mold is always a cross linker and should not depend on what host it // is running on. For example, users should be able to run mold on a // little-endian x86 machine to create a big-endian s390x binary. // // 2. Even though data members in all ELF data strucutres are naturally // aligned, they are not guaranteed to be aligned on memory because of // archive files. Archive files (.a files) align each file only to a // 2 byte boundary, so anything larger than 2 bytes may be misaligned // in an mmap'ed memory. Misaligned access is an undefined behavior in // C/C++, so we shouldn't cast an arbitrary pointer to a uint32_t, for // example, to read a 32 bit value. 
// // The data types defined in this file are independent of the host byte // order and are designed to avoid unaligned access. // // Note that in C/C++, memcpy is a portable and efficient way to access // unaligned data, as it is typically treated as an intrinsic. Compilers // can easily optimize memcpy calls in this file into a single load or // store instruction. #pragma once #include #include #include #include #if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) # if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ # define __LITTLE_ENDIAN__ 1 # elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ # define __BIG_ENDIAN__ 1 # else # error "unknown host byte order" # endif #endif namespace mold { template class Integer { public: constexpr Integer() = default; constexpr Integer(T v) { if (std::is_constant_evaluated() || size == 3) { for (int i = 0; i < size; i++) buf[is_le ? i : (size - i - 1)] = v >> (i * 8); } else { if (!is_native) v = bswap(v); memcpy(buf, &v, size); } } operator T() const { if (size == 3) { if (is_le) return buf[2] << 16 | buf[1] << 8 | buf[0]; return buf[0] << 16 | buf[1] << 8 | buf[2]; } T v; memcpy(&v, buf, size); return is_native ? v : bswap(v); } Integer &operator=(T v) { new (this) Integer(v); return *this; } Integer &operator++() { return *this = *this + 1; } Integer operator++(int) { auto x = *this; ++*this; return x; } Integer &operator--() { return *this = *this - 1; } Integer operator--(int) { auto x = *this; --*this; return x; } Integer &operator+=(T v) { return *this = *this + v; } Integer &operator-=(T v) { return *this = *this - v; } Integer &operator&=(T v) { return *this = *this & v; } Integer &operator|=(T v) { return *this = *this | v; } private: static constexpr bool is_native = (std::endian::native == (is_le ? 
std::endian::little : std::endian::big)); static T bswap(T v) { switch (size) { case 2: return __builtin_bswap16(v); case 4: return __builtin_bswap32(v); case 8: return __builtin_bswap64(v); } __builtin_unreachable(); } uint8_t buf[size]; }; using i8 = int8_t; using i16 = int16_t; using i32 = int32_t; using i64 = int64_t; using u8 = uint8_t; using u16 = uint16_t; using u32 = uint32_t; using u64 = uint64_t; using il16 = Integer; using il32 = Integer; using il64 = Integer; using ul16 = Integer; using ul24 = Integer; using ul32 = Integer; using ul64 = Integer; using ib16 = Integer; using ib32 = Integer; using ib64 = Integer; using ub16 = Integer; using ub24 = Integer; using ub32 = Integer; using ub64 = Integer; } // namespace mold ================================================ FILE: lib/lib.h ================================================ #pragma once #include "atomics.h" #include "integers.h" #include "bitvector.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _WIN32 # include #else # include # include #endif #define XXH_INLINE_ALL 1 #include "../third-party/xxhash/xxhash.h" #ifdef NDEBUG # define unreachable() __builtin_unreachable() #else # define unreachable() assert(0 && "unreachable") #endif inline uint64_t hash_string(std::string_view str) { return XXH3_64bits(str.data(), str.size()); } class HashCmp { public: static size_t hash(const std::string_view &k) { return hash_string(k); } static bool equal(const std::string_view &k1, const std::string_view &k2) { return k1 == k2; } }; namespace mold { namespace ranges = std::ranges; using namespace std::literals::string_literals; using namespace std::literals::string_view_literals; inline u64 combine_hash(u64 a, u64 b) { return a ^ (b + 0x9e3779b9 + (a << 6) + (a >> 2)); } // // perf.cc // // Counter is used to collect statistics numbers. 
class Counter { public: Counter(std::string_view name, i64 value = 0) : name(name), values(value) { static std::mutex mu; std::scoped_lock lock(mu); instances.push_back(this); } Counter &operator++(int) { if (enabled) [[unlikely]] values.local()++; return *this; } Counter &operator+=(int delta) { if (enabled) [[unlikely]] values.local() += delta; return *this; } static void print(); static inline bool enabled = false; private: i64 get_value(); std::string_view name; tbb::enumerable_thread_specific values; static inline std::vector instances; }; // Timer and TimeRecord records elapsed time (wall clock time) // used by each pass of the linker. struct TimerRecord { TimerRecord(std::string name, TimerRecord *parent = nullptr); void stop(); std::string name; TimerRecord *parent; tbb::concurrent_vector children; i64 start; i64 end; i64 user; i64 sys; bool stopped = false; }; void print_timer_records(tbb::concurrent_vector> &); template class Timer { public: Timer(Context &ctx, std::string name, Timer *parent = nullptr) { record = new TimerRecord(name, parent ? parent->record : nullptr); ctx.timer_records.emplace_back(record); } Timer(const Timer &) = delete; ~Timer() { record->stop(); } void stop() { record->stop(); } private: TimerRecord *record; }; // // Utility functions // // Some C++ libraries haven't implemented std::has_single_bit yet. inline bool has_single_bit(u64 val) { return std::popcount(val) == 1; } // Some C++ libraries haven't implemented std::bit_ceil yet. inline u64 bit_ceil(u64 val) { if (has_single_bit(val)) return val; return 1LL << (64 - std::countl_zero(val)); } inline u64 align_to(u64 val, u64 align) { if (align == 0) return val; assert(has_single_bit(align)); return (val + align - 1) & ~(align - 1); } inline u64 align_down(u64 val, u64 align) { assert(has_single_bit(align)); return val & ~(align - 1); } inline u64 bit(u64 val, i64 pos) { return (val >> pos) & 1; }; // Returns [hi:lo] bits of val. 
inline u64 bits(u64 val, u64 hi, u64 lo) { return (val >> lo) & ((1LL << (hi - lo + 1)) - 1); } // Cast val to a signed N bit integer. // For example, sign_extend(x, 32) == (i32)x for any integer x. inline i64 sign_extend(u64 val, i64 n) { return (i64)(val << (64 - n)) >> (64 - n); } inline bool is_int(u64 val, i64 n) { return sign_extend(val, n) == val; } template > void update_minimum(std::atomic &atomic, u64 new_val, Compare cmp = {}) { T old_val = atomic.load(std::memory_order_relaxed); while (cmp(new_val, old_val) && !atomic.compare_exchange_weak(old_val, new_val, std::memory_order_relaxed)); } template > void update_maximum(std::atomic &atomic, u64 new_val, Compare cmp = {}) { T old_val = atomic.load(std::memory_order_relaxed); while (cmp(old_val, new_val) && !atomic.compare_exchange_weak(old_val, new_val, std::memory_order_relaxed)); } template inline void append(std::vector &x, const auto &y) { x.insert(x.end(), y.begin(), y.end()); } template inline std::vector flatten(std::vector> &vec) { i64 size = 0; for (std::vector &v : vec) size += v.size(); std::vector ret; ret.reserve(size); for (std::vector &v : vec) append(ret, v); return ret; } template inline void remove_duplicates(std::vector &vec) { vec.erase(std::unique(vec.begin(), vec.end()), vec.end()); } inline i64 write_string(void *buf, std::string_view str) { memcpy(buf, str.data(), str.size()); *((u8 *)buf + str.size()) = '\0'; return str.size() + 1; } template inline void write_vector(void *buf, const std::vector &vec) { if (!vec.empty()) memcpy(buf, vec.data(), vec.size() * sizeof(T)); } inline void encode_uleb(std::vector &vec, u64 val) { do { u8 byte = val & 0x7f; val >>= 7; vec.push_back(val ? 
(byte | 0x80) : byte); } while (val); } inline void encode_sleb(std::vector &vec, i64 val) { for (;;) { u8 byte = val & 0x7f; val >>= 7; bool neg = (byte & 0x40); if ((val == 0 && !neg) || (val == -1 && neg)) { vec.push_back(byte); break; } vec.push_back(byte | 0x80); } } inline i64 write_uleb(u8 *buf, u64 val) { i64 i = 0; do { u8 byte = val & 0x7f; val >>= 7; buf[i++] = val ? (byte | 0x80) : byte; } while (val); return i; } inline u64 read_uleb(u8 **buf) { u64 val = 0; u8 shift = 0; u8 byte; do { byte = *(*buf)++; val |= (byte & 0x7f) << shift; shift += 7; } while (byte & 0x80); return val; } inline u64 read_uleb(u8 *buf) { u8 *tmp = buf; return read_uleb(&tmp); } inline i64 read_sleb(u8 **buf) { u64 val = 0; u8 shift = 0; u8 byte; do { byte = *(*buf)++; val |= (byte & 0x7f) << shift; shift += 7; } while (byte & 0x80); return sign_extend(val, shift); } inline i64 read_sleb(u8 *buf) { u8 *tmp = buf; return read_sleb(&tmp); } inline u64 read_uleb(std::string_view *str) { u8 *start = (u8 *)str->data(); u8 *ptr = start; u64 val = read_uleb(&ptr); *str = str->substr(ptr - start); return val; } inline u64 read_uleb(std::string_view str) { std::string_view tmp = str; return read_uleb(&tmp); } inline i64 uleb_size(u64 val) { for (int i = 1; i < 9; i++) if (val < (1LL << (7 * i))) return i; return 9; } inline void overwrite_uleb(u8 *loc, u64 val) { while (*loc & 0b1000'0000) { *loc++ = 0b1000'0000 | (val & 0b0111'1111); val >>= 7; } *loc = val & 0b0111'1111; } static inline void pause() { #if defined(__x86_64__) asm volatile("pause"); #elif defined(__aarch64__) asm volatile("yield"); #elif defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_8A__) asm volatile("yield"); #endif } // // Concurrent Map // // This is an implementation of a fast concurrent hash map. Unlike // ordinary hash tables, this impl just aborts if it becomes full. // So you need to give a correct estimation of the final size before // using it. 
// We use this hash map to uniquify pieces of data in
// mergeable sections.
//
// We've implemented this ourselves because the performance of
// concurrent hash map is critical for our linker.
//
// Keys are stored as (pointer, length) pairs: insert() records the
// pointer it was given rather than copying the key bytes.
//
// NOTE(review): several template parameter/argument lists in this part
// of the file look like they were lost (e.g. `template class`,
// `Atomic key`, `std::pair insert`, `std::vector get_sorted_entries`)
// -- confirm against the upstream header before relying on the
// declarations exactly as written here.
template class ConcurrentMap {
public:
  ConcurrentMap() = default;

  // Creates a map and immediately allocates its bucket array.
  ConcurrentMap(i64 nbuckets) {
    resize(nbuckets);
  }

  // Releases the bucket array with the deallocator that matches the
  // allocation made in resize().
  ~ConcurrentMap() {
    if (entries) {
#ifdef _WIN32
      _aligned_free(entries);
#else
      munmap(entries, sizeof(Entry) * nbuckets);
#endif
    }
  }

  // In order to avoid unnecessary cache-line false sharing, we want
  // to make this object to be aligned to a reasonably large
  // power-of-two address.
  //
  // `key` doubles as the slot state: nullptr means the slot is free and
  // (char *)-1 means another thread is currently filling it in.
  struct alignas(32) Entry {
    Atomic key;
    u32 keylen;
    T value;
  };

  // Allocates the bucket array. This may be done only once per map
  // (asserted). The bucket count is rounded up to a power of two and is
  // never smaller than MIN_NBUCKETS.
  //
  // NOTE(review): neither the mmap() nor the _aligned_malloc() result is
  // checked for failure here.
  void resize(i64 nbuckets) {
    assert(!entries);
    this->nbuckets = std::max(MIN_NBUCKETS, bit_ceil(nbuckets));
    i64 bufsize = sizeof(Entry) * this->nbuckets;

    // Allocate a zero-initialized buffer. We use mmap() if available
    // because it's faster than malloc() and memset().
#ifdef _WIN32
    entries = (Entry *)_aligned_malloc(bufsize, alignof(Entry));
    memset((void *)entries, 0, bufsize);
#else
    entries = (Entry *)mmap(nullptr, bufsize, PROT_READ | PROT_WRITE,
                            MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
#endif
  }

  // Inserts `key` with value `val` unless the key is already present.
  // Returns a pointer to the value stored for `key` and a flag that is
  // true only if this call created the entry.
  //
  // `hash` selects the starting bucket. Probing wraps around inside the
  // shard that contains that bucket and visits at most MAX_RETRY slots;
  // if none of them can be used, the process is aborted.
  std::pair insert(std::string_view key, u64 hash, const T &val) {
    assert(has_single_bit(nbuckets));

    u64 begin = hash & (nbuckets - 1);
    u64 mask = nbuckets / NUM_SHARDS - 1;

    for (i64 i = 0; i < MAX_RETRY; i++) {
      // Keep the shard bits of `begin` and advance only within the shard.
      u64 idx = (begin & ~mask) | ((begin + i) & mask);
      Entry &ent = entries[idx];

      // It seems avoiding compare-and-swap is faster overall at least
      // on my Zen4 machine, so do it.
      if (const char *ptr = ent.key.load(std::memory_order_acquire);
          ptr != nullptr && ptr != (char *)-1) {
        if (key == std::string_view(ptr, ent.keylen))
          return {&ent.value, false};
        continue;
      }

      // Otherwise, use CAS to atomically claim the ownership of the slot.
      const char *ptr = nullptr;
      bool claimed = ent.key.compare_exchange_strong(ptr, (char *)-1,
                                                     std::memory_order_acquire);

      // If we successfully claimed the ownership of the slot,
      // copy values to it.
      if (claimed) {
        new (&ent.value) T(val);
        ent.keylen = key.size();
        // Publishing the real key pointer last makes the slot visible
        // to readers only after `value` and `keylen` are written.
        ent.key.store(key.data(), std::memory_order_release);
        return {&ent.value, true};
      }

      // If someone is copying values to the slot, do busy wait.
      while (ptr == (char *)-1) {
        pause();
        ptr = ent.key.load(std::memory_order_acquire);
      }

      // If the same key is already present, this is the slot we are
      // looking for.
      if (key == std::string_view(ptr, ent.keylen))
        return {&ent.value, false};
    }

    std::cerr << "ConcurrentMap is full\n";
    abort();
  }

  // Returns the bucket index of the entry that contains `value`.
  // The address is rounded down to a multiple of sizeof(Entry);
  // presumably this relies on the bucket array itself starting at such
  // a multiple -- verify against the allocators used in resize().
  i64 get_idx(T *value) const {
    uintptr_t addr = (uintptr_t)value - (uintptr_t)value % sizeof(Entry);
    return (Entry *)addr - entries;
  }

  // Return a list of map entries sorted in a deterministic order.
  // Only the occupied slots of shard `shard_idx` are returned. Each run
  // of adjacent occupied slots is sorted by key length and then by key
  // bytes.
  std::vector get_sorted_entries(i64 shard_idx) {
    if (nbuckets == 0)
      return {};

    i64 shard_size = nbuckets / NUM_SHARDS;
    i64 begin = shard_idx * shard_size;
    i64 end = begin + shard_size;

    // Count occupied slots first so that the vector is allocated once.
    i64 sz = 0;
    for (i64 i = begin; i < end; i++)
      if (entries[i].key)
        sz++;

    std::vector vec;
    vec.reserve(sz);

    // Since the shard is circular, we need to handle the last entries
    // as if they were next to the first entries.
    while (begin < end && entries[end - 1].key)
      vec.push_back(entries + --end);

    // Find entries contiguous in the buckets and sort them.
    i64 last = 0;
    for (i64 i = begin; i < end;) {
      while (i < end && entries[i].key)
        vec.push_back(entries + i++);

      std::sort(vec.begin() + last, vec.end(), [](Entry *a, Entry *b) {
        if (a->keylen != b->keylen)
          return a->keylen < b->keylen;
        return memcmp(a->key, b->key, a->keylen) < 0;
      });

      last = vec.size();

      // Skip the gap of empty slots before the next run.
      while (i < end && !entries[i].key)
        i++;
    }
    return vec;
  }

  // Collects get_sorted_entries() for every shard in parallel and
  // concatenates the results in shard order.
  std::vector get_sorted_entries_all() {
    std::vector> vec(NUM_SHARDS);
    tbb::parallel_for((i64)0, NUM_SHARDS, [&](i64 i) {
      vec[i] = get_sorted_entries(i);
    });
    return flatten(vec);
  }

  static constexpr i64 MIN_NBUCKETS = 4096;
  static constexpr i64 NUM_SHARDS = 16;
  static constexpr i64 MAX_RETRY = 256;

  Entry *entries = nullptr;
  u64 nbuckets = 0;
};

//
// random.cc
//

// Fills `buf` with `size` random bytes.
void get_random_bytes(u8 *buf, i64 size);

//
// hyperloglog.cc
//

class HyperLogLog {
public:
  // Records one hashed item: the bucket is chosen by the low bits of
  // `hash` and is raised to (number of leading zero bits + 1) if that
  // is larger than its current value.
  void insert(u64 hash) {
    update_maximum(buckets[hash & (NBUCKETS - 1)], std::countl_zero(hash) + 1);
  }

  i64 get_cardinality() const;

  // Folds `other` into this sketch by taking the bucket-wise maximum.
  void merge(const HyperLogLog &other) {
    for (i64 i = 0; i < NBUCKETS; i++)
      update_maximum(buckets[i], other.buckets[i]);
  }

private:
  static constexpr i64 NBUCKETS = 2048;
  static constexpr double ALPHA = 0.79402;

  Atomic buckets[NBUCKETS];
};

//
// aho-corasick.cc
//

class AhoCorasick {
public:
  bool add(std::string_view pat, i64 val);
  bool empty() const { return nodes.empty(); }
  void compile();
  i64 find(std::string_view str);

  static bool can_handle(std::string_view str);

private:
  // A trie node. Every child slot starts out as -1, as do `value` and
  // `suffix_link`.
  struct TrieNode {
    TrieNode() { children.fill(-1); }

    i64 value = -1;
    i32 suffix_link = -1;
    std::array children;
  };

  void fix_suffix_links(i64 idx);
  void fix_values();

  std::vector nodes;
};

//
// glob.cc
//

class MultiGlob {
public:
  bool add(std::string_view pat, i64 val);
  bool empty() const { return patterns.empty(); }
  void compile();
  i64 find(std::string_view str);

  struct State {
    std::bitset<256> incoming_edge;
    bool is_star = false;
  };

  struct GlobPattern {
    std::vector states;
    i64 value = -1;
  };

private:
  std::vector patterns;
  Bitvector start_states;
  Bitvector star_mask;
  Bitvector char_mask[256];
};

// Combines a MultiGlob and an AhoCorasick matcher behind one interface;
// the set is empty only when both underlying matchers are empty.
class Glob {
public:
  bool add(std::string_view pat, i64 val);
  bool empty() const { return multi_glob.empty() && aho_corasick.empty(); }
  i64 find(std::string_view str);

private:
  std::once_flag once;
  bool is_compiled = false;
  MultiGlob multi_glob;
  AhoCorasick aho_corasick;
};

//
// filepath.cc
//

// Returns the parent directory of `path`.
inline std::filesystem::path path_dirname(std::string_view path) {
  return std::filesystem::path(path).parent_path();
}

// Returns the last component of `path`.
inline std::string path_filename(std::string_view path) {
  return std::filesystem::path(path).filename().string();
}

// Returns `path` in lexically normalized form; the file system is not
// consulted.
inline std::string path_clean(std::string_view path) {
  return std::filesystem::path(path).lexically_normal().string();
}

std::string get_self_path();

//
// demangle.cc
//

std::optional demangle_cpp(std::string_view name);
std::optional demangle_rust(std::string_view name);

//
// crc32.cc
//

u32 compute_crc32(u32 crc, u8 *buf, i64 len);
std::vector crc32_solve(u32 current, u32 desired);

//
// compress.cc
//

// Common interface of the compressors below. `compressed_size` holds
// the size of the compressed output and write_to() emits it to `buf`.
// NOTE(review): when `compressed_size` becomes valid is not visible
// here; presumably the derived-class constructors set it -- confirm.
class Compressor {
public:
  virtual void write_to(u8 *buf) = 0;
  virtual ~Compressor();
  i64 compressed_size = 0;

protected:
  std::vector> shards;
};

class ZlibCompressor : public Compressor {
public:
  ZlibCompressor(u8 *buf, i64 size);
  void write_to(u8 *buf) override;

private:
  u32 checksum = 0;
};

class ZstdCompressor : public Compressor {
public:
  ZstdCompressor(u8 *buf, i64 size);
  void write_to(u8 *buf) override;
};

//
// tar.cc
//

// TarWriter is a class to create a tar file.
//
// If you pass `--repro` to mold, mold collects all input files and
// put them into `.repro.tar`, so that it is easy to
// run the same command with the same command line arguments.
class TarWriter { public: static std::unique_ptr open(std::string output_path, std::string basedir); ~TarWriter(); void append(std::string path, std::string_view data); private: TarWriter(FILE *out, std::string basedir) : out(out), basedir(basedir) {} FILE *out = nullptr; std::string basedir; }; } // namespace mold ================================================ FILE: lib/perf.cc ================================================ #include "lib.h" #include #include #include #include #ifndef _WIN32 #include #include #endif namespace mold { i64 Counter::get_value() { return values.combine(std::plus()); } void Counter::print() { ranges::stable_sort(instances, {}, [](Counter *x) { return x->get_value(); }); for (Counter *c : instances) std::cout << std::setw(20) << std::right << c->name << "=" << c->get_value() << "\n"; } static i64 now_nsec() { return (i64)std::chrono::steady_clock::now().time_since_epoch().count(); } static std::pair get_usage() { #ifdef _WIN32 auto to_nsec = [](FILETIME t) -> i64 { return (((u64)t.dwHighDateTime << 32) + (u64)t.dwLowDateTime) * 100; }; FILETIME creation, exit, kernel, user; GetProcessTimes(GetCurrentProcess(), &creation, &exit, &kernel, &user); return {to_nsec(user), to_nsec(kernel)}; #else auto to_nsec = [](struct timeval t) -> i64 { return (i64)t.tv_sec * 1'000'000'000 + t.tv_usec * 1'000; }; struct rusage ru; getrusage(RUSAGE_SELF, &ru); return {to_nsec(ru.ru_utime), to_nsec(ru.ru_stime)}; #endif } TimerRecord::TimerRecord(std::string name, TimerRecord *parent) : name(name), parent(parent) { start = now_nsec(); std::tie(user, sys) = get_usage(); if (parent) parent->children.push_back(this); } void TimerRecord::stop() { if (stopped) return; stopped = true; i64 user2; i64 sys2; std::tie(user2, sys2) = get_usage(); end = now_nsec(); user = user2 - user; sys = sys2 - sys; } static void print_rec(TimerRecord &rec, i64 indent) { printf(" % 8.3f % 8.3f % 8.3f %s%s\n", ((double)rec.user / 1'000'000'000), ((double)rec.sys / 1'000'000'000), 
(((double)rec.end - rec.start) / 1'000'000'000), std::string(indent * 2, ' ').c_str(), rec.name.c_str()); ranges::stable_sort(rec.children, {}, &TimerRecord::start); for (TimerRecord *child : rec.children) print_rec(*child, indent + 1); } void print_timer_records( tbb::concurrent_vector> &records) { for (i64 i = records.size() - 1; i >= 0; i--) records[i]->stop(); for (i64 i = 0; i < records.size(); i++) { TimerRecord &inner = *records[i]; if (inner.parent) continue; for (i64 j = i - 1; j >= 0; j--) { TimerRecord &outer = *records[j]; if (outer.start <= inner.start && inner.end <= outer.end) { inner.parent = &outer; outer.children.push_back(&inner); break; } } } std::cout << " User System Real Name\n"; for (std::unique_ptr &rec : records) if (!rec->parent) print_rec(*rec, 0); std::cout << std::flush; } } // namespace mold ================================================ FILE: lib/random.cc ================================================ #include "lib.h" #include namespace mold { void get_random_bytes(u8 *buf, i64 size) { std::random_device rand; i64 i = 0; for (; i < size - 4; i += 4) { u32 val = rand(); memcpy(buf + i, &val, 4); } u32 val = rand(); memcpy(buf + i, &val, size - i); } } // namespace mold ================================================ FILE: lib/siphash.h ================================================ // This is a header-only C++20 implementation of SipHash based on the // reference implementation. To use, just copy this header file into // your project and #include it. 
// // https://github.com/rui314/siphash/blob/main/siphash.h #include #include #include template class SipHashTmpl { public: static_assert(OUTLEN == 64 || OUTLEN == 128); SipHashTmpl(void *key) { uint64_t k0 = read64(key); uint64_t k1 = read64((char *)key + 8); v0 = 0x736f6d6570736575 ^ k0; v1 = 0x646f72616e646f6d ^ k1; v2 = 0x6c7967656e657261 ^ k0; v3 = 0x7465646279746573 ^ k1; if (OUTLEN == 128) v1 ^= 0xee; } void update(void *msgp, int64_t msglen) { char *msg = (char *)msgp; sum += msglen; if (buflen) { if (buflen + msglen < 8) { memcpy(buf + buflen, msg, msglen); buflen += msglen; return; } int j = 8 - buflen; memcpy(buf + buflen, msg, j); compress(read64(buf)); msg += j; msglen -= j; buflen = 0; } while (msglen >= 8) { compress(read64(msg)); msg += 8; msglen -= 8; } memcpy(buf, msg, msglen); buflen = msglen; } void finish(void *out) { memset(buf + buflen, 0, 8 - buflen); compress(((uint64_t)sum << 56) | read64(buf)); v2 ^= (OUTLEN == 128) ? 0xee : 0xff; finalize(); write64(out, v0 ^ v1 ^ v2 ^ v3); if (OUTLEN == 128) { v1 ^= 0xdd; finalize(); write64((char *)out + 8, v0 ^ v1 ^ v2 ^ v3); } } static void hash(void *out, void *key, void *in, int inlen) { SipHashTmpl h(key); h.update(in, inlen); h.finish(out); } private: uint64_t v0, v1, v2, v3; uint8_t buf[8]; uint8_t buflen = 0; uint8_t sum = 0; uint64_t read64(void *loc) { uint64_t val; memcpy(&val, loc, 8); if (std::endian::native == std::endian::big) val = bswap(val); return val; } void write64(void *loc, uint64_t val) { if (std::endian::native == std::endian::big) val = bswap(val); memcpy(loc, &val, 8); } uint64_t bswap(uint64_t val) { return ((val << 56) & 0xff00000000000000) | ((val << 40) & 0x00ff000000000000) | ((val << 24) & 0x0000ff0000000000) | ((val << 8) & 0x000000ff00000000) | ((val >> 8) & 0x00000000ff000000) | ((val >> 24) & 0x0000000000ff0000) | ((val >> 40) & 0x000000000000ff00) | ((val >> 56) & 0x00000000000000ff); } void round() { v0 += v1; v1 = std::rotl(v1, 13); v1 ^= v0; v0 = std::rotl(v0, 
32); v2 += v3; v3 = std::rotl(v3, 16); v3 ^= v2; v0 += v3; v3 = std::rotl(v3, 21); v3 ^= v0; v2 += v1; v1 = std::rotl(v1, 17); v1 ^= v2; v2 = std::rotl(v2, 32); } void compress(uint64_t m) { v3 ^= m; for (int i = 0; i < C_ROUNDS; i++) round(); v0 ^= m; } void finalize() { for (int i = 0; i < D_ROUNDS; i++) round(); } }; using SipHash = SipHashTmpl<2, 4, 64>; using SipHash128 = SipHashTmpl<2, 4, 128>; using SipHash13 = SipHashTmpl<1, 3, 64>; using SipHash13_128 = SipHashTmpl<1, 3, 128>; ================================================ FILE: lib/tar.cc ================================================ // This file contains functions to create a tar file. #include "lib.h" #ifdef _WIN32 # define ftruncate _chsize_s #endif namespace mold { static constexpr i64 BLOCK_SIZE = 512; // A tar file consists of one or more Ustar header followed by data. // Each Ustar header represents a single file in an archive. // // tar is an old file format, and its `name` field is only 100 bytes long. // If `name` is longer than 100 bytes, we can emit a PAX header before a // Ustar header to store a long filename. // // For simplicity, we always emit a PAX header even for a short filename. struct UstarHeader { char name[100]; char mode[8]; char uid[8]; char gid[8]; char size[12]; char mtime[12]; char checksum[8]; char typeflag[1]; char linkname[100]; char magic[6]; char version[2]; char uname[32]; char gname[32]; char devmajor[8]; char devminor[8]; char prefix[155]; char pad[12]; }; static_assert(sizeof(UstarHeader) == BLOCK_SIZE); static void finalize(UstarHeader &hdr) { memset(hdr.checksum, ' ', sizeof(hdr.checksum)); memcpy(hdr.magic, "ustar", 5); memcpy(hdr.version, "00", 2); // Compute checksum int sum = 0; for (i64 i = 0; i < sizeof(hdr); i++) sum += ((u8 *)&hdr)[i]; // We need to convince the compiler that sum isn't too big to silence // -Werror=format-truncation. 
if (sum >= 01'000'000) unreachable(); snprintf(hdr.checksum, sizeof(hdr.checksum), "%06o", sum); } static std::string encode_path(std::string basedir, std::string path) { path = path_clean(basedir + "/" + path); // Construct a string which contains something like // "16 path=foo/bar\n" where 16 is the size of the string // including the size string itself. i64 len = std::string(" path=\n").size() + path.size(); i64 total = std::to_string(len).size() + len; total = std::to_string(total).size() + len; return std::to_string(total) + " path=" + path + "\n"; } std::unique_ptr TarWriter::open(std::string output_path, std::string basedir) { FILE *out = fopen(output_path.c_str(), "w"); if (!out) return nullptr; return std::unique_ptr(new TarWriter(out, basedir)); } TarWriter::~TarWriter() { fclose(out); } void TarWriter::append(std::string path, std::string_view data) { // Write PAX header UstarHeader pax = {}; std::string attr = encode_path(basedir, path); snprintf(pax.size, sizeof(pax.size), "%011zo", attr.size()); pax.name[0] = '/'; pax.typeflag[0] = 'x'; finalize(pax); fwrite(&pax, sizeof(pax), 1, out); // Write pathname fwrite(attr.data(), attr.size(), 1, out); fseek(out, align_to(ftell(out), BLOCK_SIZE), SEEK_SET); // Write Ustar header UstarHeader ustar = {}; memcpy(ustar.mode, "0000664", 8); snprintf(ustar.size, sizeof(ustar.size), "%011zo", data.size()); finalize(ustar); fwrite(&ustar, sizeof(ustar), 1, out); // Write file contents fwrite(data.data(), data.size(), 1, out); fseek(out, align_to(ftell(out), BLOCK_SIZE), SEEK_SET); // A tar file must ends with two empty blocks (void)!ftruncate(fileno(out), ftell(out) + BLOCK_SIZE * 2); } } // namespace mold ================================================ FILE: lib/update-git-hash.cmake ================================================ # Get a git hash value. We do not want to use git command here # because we don't want to make git a build-time dependency. 
if(EXISTS "${SOURCE_DIR}/.git/HEAD")
  file(READ "${SOURCE_DIR}/.git/HEAD" HASH)
  string(STRIP "${HASH}" HASH)

  # HEAD holds either a commit hash or a symbolic reference of the
  # form "ref: <refname>".
  if(HASH MATCHES "^ref: (.*)")
    set(HEAD "${CMAKE_MATCH_1}")

    # If the reference cannot be resolved below, report no hash at all
    # rather than leaving the "ref: ..." text in HASH.
    set(HASH "")

    if(EXISTS "${SOURCE_DIR}/.git/${HEAD}")
      file(READ "${SOURCE_DIR}/.git/${HEAD}" HASH)
      string(STRIP "${HASH}" HASH)
    elseif(EXISTS "${SOURCE_DIR}/.git/packed-refs")
      # packed-refs lists one "<hash> <refname>" pair per line. Search
      # for the name literally so that characters that are special in
      # regular expressions need no escaping. The leading "\n"
      # guarantees that every entry is preceded by a newline.
      file(READ "${SOURCE_DIR}/.git/packed-refs" PACKED_REFS)
      set(PACKED_REFS "\n${PACKED_REFS}")

      string(FIND "${PACKED_REFS}" " ${HEAD}\n" REF_POS)
      if(NOT REF_POS EQUAL -1)
        # The hash is the text between the preceding newline and the
        # space in front of the ref name.
        string(SUBSTRING "${PACKED_REFS}" 0 ${REF_POS} BEFORE_REF)
        string(FIND "${BEFORE_REF}" "\n" LINE_START REVERSE)
        math(EXPR LINE_START "${LINE_START} + 1")
        string(SUBSTRING "${BEFORE_REF}" ${LINE_START} -1 CANDIDATE)

        if(CANDIDATE MATCHES "^[0-9a-f]+$")
          set(HASH "${CANDIDATE}")
        endif()
      endif()
    endif()
  endif()
endif()

# Create new file contents and update a given file if necessary.
if("${HASH}" STREQUAL "")
  set(NEW_CONTENTS "")
else()
  set(NEW_CONTENTS "#define MOLD_GIT_HASH \"${HASH}\"\n")
endif()

# Rewrite the file only when its contents change so that an unchanged
# hash does not update the file's timestamp.
if(EXISTS "${OUTPUT_FILE}")
  file(READ "${OUTPUT_FILE}" OLD_CONTENTS)
  if(NOT "${NEW_CONTENTS}" STREQUAL "${OLD_CONTENTS}")
    file(WRITE "${OUTPUT_FILE}" "${NEW_CONTENTS}")
  endif()
else()
  file(WRITE "${OUTPUT_FILE}" "${NEW_CONTENTS}")
endif()

================================================
FILE: src/arch-arm32.cc
================================================
// ARM32 is a bit special from the linker's viewpoint because ARM
// processors support two different instruction encodings: Thumb and
// ARM (in a narrower sense). Thumb instructions are either 16 bits or
// 32 bits, while ARM instructions are all 32 bits. Feature-wise,
// Thumb is a subset of ARM, so not all ARM instructions are
// representable in Thumb.
//
// ARM processors originally supported only ARM instructions. Thumb
// instructions were later added to increase code density.
//
// ARM processors run in either ARM mode or Thumb mode. The mode can
// be switched using BX (branch and mode exchange)-family instructions.
// We need to use those instructions to, for example, call a function
// encoded in Thumb from a function encoded in ARM. Sometimes, the
// linker even has to emit interworking thunk code to switch mode.
//
// ARM instructions are aligned to 4 byte boundaries. Thumb are to 2
// byte boundaries. So the least significant bit of a function address
// is always 0.
// // To distinguish Thumb functions from ARM fucntions, the LSB of a // function address is repurposed as a boolean flag. If the LSB is 0, // the function referred to by the address is encoded in ARM; // otherwise, Thumb. // // For example, if a symbol `foo` is of type STT_FUNC and has value // 0x2001, `foo` is a function using Thumb instructions whose address // is 0x2000 (not 0x2001, as Thumb instructions are always 2-byte // aligned). Likewise, if a function pointer has value 0x2001, it // refers a Thumb function at 0x2000. // // https://github.com/ARM-software/abi-aa/blob/main/aaelf32/aaelf32.rst #if MOLD_ARM32LE || MOLD_ARM32BE #include "mold.h" #include #include namespace mold { using E = MOLD_TARGET; template <> i64 get_addend(u8 *loc, const ElfRel &rel) { U32 *arm = (U32 *)loc; U16 *thm = (U16 *)loc; switch (rel.r_type) { case R_ARM_ABS32: case R_ARM_REL32: case R_ARM_BASE_PREL: case R_ARM_GOTOFF32: case R_ARM_GOT_PREL: case R_ARM_GOT_BREL: case R_ARM_TLS_GD32: case R_ARM_TLS_LDM32: case R_ARM_TLS_LDO32: case R_ARM_TLS_IE32: case R_ARM_TLS_LE32: case R_ARM_TLS_GOTDESC: case R_ARM_TARGET1: case R_ARM_TARGET2: return (I32)*arm; case R_ARM_THM_JUMP8: return sign_extend(thm[0], 8) << 1; case R_ARM_THM_JUMP11: return sign_extend(thm[0], 11) << 1; case R_ARM_THM_JUMP19: { // https://developer.arm.com/documentation/ddi0597/2024-12/Base-Instructions/B--Branch- u32 S = bit(thm[0], 10); u32 J2 = bit(thm[1], 11); u32 J1 = bit(thm[1], 13); u32 imm6 = bits(thm[0], 5, 0); u32 imm11 = bits(thm[1], 10, 0); u32 val = (S << 20) | (J2 << 19) | (J1 << 18) | (imm6 << 12) | (imm11 << 1); return sign_extend(val, 21); } case R_ARM_THM_CALL: case R_ARM_THM_JUMP24: case R_ARM_THM_TLS_CALL: { // https://developer.arm.com/documentation/ddi0597/2024-12/Base-Instructions/BL--BLX--immediate---Branch-with-Link-and-optional-Exchange--immediate-- u32 S = bit(thm[0], 10); u32 J1 = bit(thm[1], 13); u32 J2 = bit(thm[1], 11); u32 I1 = !(J1 ^ S); u32 I2 = !(J2 ^ S); u32 imm10 = bits(thm[0], 9, 
0); u32 imm11 = bits(thm[1], 10, 0); u32 val = (S << 24) | (I1 << 23) | (I2 << 22) | (imm10 << 12) | (imm11 << 1); return sign_extend(val, 25); } case R_ARM_CALL: case R_ARM_JUMP24: case R_ARM_PLT32: case R_ARM_TLS_CALL: return sign_extend(*arm, 24) << 2; case R_ARM_MOVW_PREL_NC: case R_ARM_MOVW_ABS_NC: case R_ARM_MOVT_PREL: case R_ARM_MOVT_ABS: { // https://developer.arm.com/documentation/ddi0597/2024-12/Base-Instructions/MOV--MOVS--immediate---Move--immediate-- u32 imm4 = bits(*arm, 19, 16); u32 imm12 = bits(*arm, 11, 0); u32 val = (imm4 << 12) | imm12; return sign_extend(val, 16); } case R_ARM_PREL31: return sign_extend(*arm, 31); case R_ARM_THM_MOVW_PREL_NC: case R_ARM_THM_MOVW_ABS_NC: case R_ARM_THM_MOVT_PREL: case R_ARM_THM_MOVT_ABS: { // https://developer.arm.com/documentation/ddi0597/2024-12/Base-Instructions/MOVT--Move-Top- u32 imm4 = bits(thm[0], 3, 0); u32 i = bit(thm[0], 10); u32 imm3 = bits(thm[1], 14, 12); u32 imm8 = bits(thm[1], 7, 0); u32 val = (imm4 << 12) | (i << 11) | (imm3 << 8) | imm8; return sign_extend(val, 16); } default: return 0; } } static void write_arm_mov(u8 *loc, u32 val) { u32 imm12 = bits(val, 11, 0); u32 imm4 = bits(val, 15, 12); *(U32 *)loc = (*(U32 *)loc & 0xfff0'f000) | (imm4 << 16) | imm12; } static void write_thm_b21(u8 *loc, u32 val) { u32 S = bit(val, 20); u32 J2 = bit(val, 19); u32 J1 = bit(val, 18); u32 imm6 = bits(val, 17, 12); u32 imm11 = bits(val, 11, 1); U16 *buf = (U16 *)loc; buf[0] = (buf[0] & 0b1111'1011'1100'0000) | (S << 10) | imm6; buf[1] = (buf[1] & 0b1101'0000'0000'0000) | (J1 << 13) | (J2 << 11) | imm11; } static void write_thm_b25(u8 *loc, u32 val) { u32 S = bit(val, 24); u32 I1 = bit(val, 23); u32 I2 = bit(val, 22); u32 J1 = !I1 ^ S; u32 J2 = !I2 ^ S; u32 imm10 = bits(val, 21, 12); u32 imm11 = bits(val, 11, 1); U16 *buf = (U16 *)loc; buf[0] = (buf[0] & 0b1111'1000'0000'0000) | (S << 10) | imm10; buf[1] = (buf[1] & 0b1101'0000'0000'0000) | (J1 << 13) | (J2 << 11) | imm11; } static void write_thm_mov(u8 *loc, 
u32 val) { u32 imm4 = bits(val, 15, 12); u32 i = bit(val, 11); u32 imm3 = bits(val, 10, 8); u32 imm8 = bits(val, 7, 0); U16 *buf = (U16 *)loc; buf[0] = (buf[0] & 0b1111'1011'1111'0000) | (i << 10) | imm4; buf[1] = (buf[1] & 0b1000'1111'0000'0000) | (imm3 << 12) | imm8; } template <> void write_addend(u8 *loc, i64 val, const ElfRel &rel) { switch (rel.r_type) { case R_ARM_NONE: break; case R_ARM_ABS32: case R_ARM_REL32: case R_ARM_BASE_PREL: case R_ARM_GOTOFF32: case R_ARM_GOT_PREL: case R_ARM_GOT_BREL: case R_ARM_TLS_GD32: case R_ARM_TLS_LDM32: case R_ARM_TLS_LDO32: case R_ARM_TLS_IE32: case R_ARM_TLS_LE32: case R_ARM_TLS_GOTDESC: case R_ARM_TARGET1: case R_ARM_TARGET2: *(U32 *)loc = val; break; case R_ARM_THM_JUMP8: *(U16 *)loc = (*(U16 *)loc & 0xff00) | bits(val, 8, 1); break; case R_ARM_THM_JUMP11: *(U16 *)loc = (*(U16 *)loc & 0xf800) | bits(val, 11, 1); break; case R_ARM_THM_CALL: case R_ARM_THM_JUMP24: case R_ARM_THM_TLS_CALL: write_thm_b25(loc, val); break; case R_ARM_CALL: case R_ARM_JUMP24: case R_ARM_PLT32: *(U32 *)loc = (*(U32 *)loc & 0xff00'0000) | bits(val, 25, 2); break; case R_ARM_MOVW_PREL_NC: case R_ARM_MOVW_ABS_NC: case R_ARM_MOVT_PREL: case R_ARM_MOVT_ABS: write_arm_mov(loc, val); break; case R_ARM_PREL31: *(U32 *)loc = (*(U32 *)loc & 0x8000'0000) | (val & 0x7fff'ffff); break; case R_ARM_THM_MOVW_PREL_NC: case R_ARM_THM_MOVW_ABS_NC: case R_ARM_THM_MOVT_PREL: case R_ARM_THM_MOVT_ABS: write_thm_mov(loc, val); break; default: unreachable(); } } template <> void write_plt_header(Context &ctx, u8 *buf) { constexpr ul32 insn[] = { 0xe52d'e004, // push {lr} 0xe59f'e004, // ldr lr, 2f 0xe08f'e00e, // 1: add lr, pc, lr 0xe5be'f008, // ldr pc, [lr, #8]! 
0x0000'0000, // 2: .word .got.plt - 1b - 8 0x0000'0000, // (padding) 0x0000'0000, // (padding) 0x0000'0000, // (padding) }; memcpy(buf, insn, sizeof(insn)); *(U32 *)(buf + 16) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 16; } static constexpr ul32 plt_entry[] = { 0xe59f'c004, // 1: ldr ip, 2f 0xe08c'c00f, // add ip, ip, pc 0xe59c'f000, // ldr pc, [ip] 0x0000'0000, // 2: .word sym@GOT - 1b }; template <> void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { memcpy(buf, plt_entry, sizeof(plt_entry)); *(U32 *)(buf + 12) = sym.get_gotplt_addr(ctx) - sym.get_plt_addr(ctx) - 12; } template <> void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { memcpy(buf, plt_entry, sizeof(plt_entry)); *(U32 *)(buf + 12) = sym.get_got_pltgot_addr(ctx) - sym.get_plt_addr(ctx) - 12; } template <> void EhFrameSection::apply_eh_reloc(Context &ctx, const ElfRel &rel, u64 offset, u64 val) { u8 *loc = ctx.buf + this->shdr.sh_offset + offset; switch (rel.r_type) { case R_NONE: break; case R_ARM_ABS32: *(U32 *)loc = val; break; case R_ARM_REL32: *(U32 *)loc = val - this->shdr.sh_addr - offset; break; default: Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel; } } template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || rel.r_type == R_ARM_V4BX) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = base + rel.r_offset; u64 S = sym.get_addr(ctx); u64 A = get_addend(*this, rel); u64 P = get_addr() + rel.r_offset; u64 T = S & 1; u64 G = sym.get_got_idx(ctx) * sizeof(Word); u64 GOT = ctx.got->shdr.sh_addr; auto check = [&](i64 val, i64 lo, i64 hi) { check_range(ctx, i, val, lo, hi); }; auto get_thumb_thunk_addr = [&] { return sym.get_thunk_addr(ctx, P); }; auto get_arm_thunk_addr = [&] { return sym.get_thunk_addr(ctx, P) + 4; }; auto get_tlsdesc_trampoline_addr = [&] { auto it = 
ranges::upper_bound(output_section->thunks, P, {}, [](std::unique_ptr> &thunk) { return thunk->get_addr(); }); return (*it)->get_addr(); }; switch (rel.r_type) { case R_ARM_ABS32: case R_ARM_TARGET1: break; case R_ARM_REL32: *(U32 *)loc = S + A - P; break; case R_ARM_THM_CALL: { if (sym.is_remaining_undef_weak()) { // On ARM, calling an weak undefined symbol jumps to the // next instruction. *(U32 *)loc = 0x8000'f3af; // NOP.W break; } // THM_CALL relocation refers to either BL or BLX instruction. // They are different in only one bit. We need to use BL if // the jump target is Thumb. Otherwise, use BLX. i64 val1 = S + A - P; i64 val2 = align_to(S + A - P, 4); if (T && is_int(val1, 25)) { *(U16 *)(loc + 2) |= 0x1000; // BL write_thm_b25(loc, val1); } else if (!T && is_int(val2, 25)) { *(U16 *)(loc + 2) &= ~0x1000; // BLX write_thm_b25(loc, val2); } else { *(U16 *)(loc + 2) |= 0x1000; // BL write_thm_b25(loc, get_thumb_thunk_addr() + A - P); } break; } case R_ARM_BASE_PREL: *(U32 *)loc = GOT + A - P; break; case R_ARM_GOTOFF32: *(U32 *)loc = ((S + A) | T) - GOT; break; case R_ARM_GOT_PREL: case R_ARM_TARGET2: *(U32 *)loc = GOT + G + A - P; break; case R_ARM_GOT_BREL: *(U32 *)loc = G + A; break; case R_ARM_CALL: { if (sym.is_remaining_undef_weak()) { *(U32 *)loc = 0xe320'f000; // NOP break; } // Just like THM_CALL, ARM_CALL relocation refers to either BL or // BLX instruction. We may need to rewrite BL → BLX or BLX → BL. 
bool is_bl = ((*(U32 *)loc & 0xff00'0000) == 0xeb00'0000); bool is_blx = ((*(U32 *)loc & 0xfe00'0000) == 0xfa00'0000); if (!is_bl && !is_blx) Fatal(ctx) << *this << ": R_ARM_CALL refers to neither BL nor BLX"; i64 val = S + A - P; if (is_int(val, 26)) { if (T) { *(U32 *)loc = 0xfa00'0000; // BLX *(U32 *)loc |= (bit(val, 1) << 24) | bits(val, 25, 2); } else { *(U32 *)loc = 0xeb00'0000; // BL *(U32 *)loc |= bits(val, 25, 2); } } else { *(U32 *)loc = 0xeb00'0000; // BL *(U32 *)loc |= bits(get_arm_thunk_addr() + A - P, 25, 2); } break; } case R_ARM_JUMP24: { if (sym.is_remaining_undef_weak()) { *(U32 *)loc = 0xe320'f000; // NOP break; } // These relocs refers to a B (unconditional branch) instruction. // Unlike BL or BLX, we can't rewrite B to BX in place when the // processor mode switch is required because BX doesn't takes an // immediate; it takes only a register. So if mode switch is // required, we jump to a linker-synthesized thunk which does the // job with a longer code sequence. i64 val = S + A - P; if (T || !is_int(val, 26)) val = get_arm_thunk_addr() + A - P; *(U32 *)loc = (*(U32 *)loc & 0xff00'0000) | bits(val, 25, 2); break; } case R_ARM_PLT32: if (sym.is_remaining_undef_weak()) { *(U32 *)loc = 0xe320'f000; // NOP } else { u64 val = (T ? get_arm_thunk_addr() : S) + A - P; *(U32 *)loc = (*(U32 *)loc & 0xff00'0000) | bits(val, 25, 2); } break; case R_ARM_THM_JUMP8: check(S + A - P, -(1 << 8), 1 << 8); *(U16 *)loc &= 0xff00; *(U16 *)loc |= bits(S + A - P, 8, 1); break; case R_ARM_THM_JUMP11: check(S + A - P, -(1 << 11), 1 << 11); *(U16 *)loc &= 0xf800; *(U16 *)loc |= bits(S + A - P, 11, 1); break; case R_ARM_THM_JUMP19: check(S + A - P, -(1 << 20), 1 << 20); write_thm_b21(loc, S + A - P); break; case R_ARM_THM_JUMP24: { if (sym.is_remaining_undef_weak()) { *(U32 *)loc = 0x8000'f3af; // NOP break; } // Just like R_ARM_JUMP24, we need to jump to a thunk if we need to // switch processor mode. 
i64 val = S + A - P; if (!T || !is_int(val, 25)) val = get_thumb_thunk_addr() + A - P; write_thm_b25(loc, val); break; } case R_ARM_MOVW_PREL_NC: write_arm_mov(loc, ((S + A) | T) - P); break; case R_ARM_MOVW_ABS_NC: write_arm_mov(loc, (S + A) | T); break; case R_ARM_THM_MOVW_PREL_NC: write_thm_mov(loc, ((S + A) | T) - P); break; case R_ARM_PREL31: check(S + A - P, -(1LL << 30), 1LL << 30); *(U32 *)loc &= 0x8000'0000; *(U32 *)loc |= (S + A - P) & 0x7fff'ffff; break; case R_ARM_THM_MOVW_ABS_NC: write_thm_mov(loc, (S + A) | T); break; case R_ARM_MOVT_PREL: write_arm_mov(loc, (S + A - P) >> 16); break; case R_ARM_THM_MOVT_PREL: write_thm_mov(loc, (S + A - P) >> 16); break; case R_ARM_MOVT_ABS: write_arm_mov(loc, (S + A) >> 16); break; case R_ARM_THM_MOVT_ABS: write_thm_mov(loc, (S + A) >> 16); break; case R_ARM_TLS_GD32: *(U32 *)loc = sym.get_tlsgd_addr(ctx) + A - P; break; case R_ARM_TLS_LDM32: *(U32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - P; break; case R_ARM_TLS_LDO32: *(U32 *)loc = S + A - ctx.dtp_addr; break; case R_ARM_TLS_IE32: *(U32 *)loc = sym.get_gottp_addr(ctx) + A - P; break; case R_ARM_TLS_LE32: *(U32 *)loc = S + A - ctx.tp_addr; break; case R_ARM_TLS_GOTDESC: // ARM32 TLSDESC uses the following code sequence to materialize // a TP-relative address in r0. // // ldr r0, .L2 // .L1: bl foo // R_ARM_TLS_CALL // .L2: .word foo + . - .L1 // R_ARM_TLS_GOTDESC // // We may relax the instructions to the following if its TP-relative // address is known at link-time // // ldr r0, .L2 // .L1: nop // ... // .L2: .word foo(tpoff) // // or to the following if the TP-relative address is known at // process startup time. // // ldr r0, .L2 // .L1: ldr r0, [pc, r0] // ... // .L2: .word foo(gottpoff) + . - .L1 if (sym.has_tlsdesc(ctx)) { // A is odd if the corresponding TLS_CALL is Thumb. *(U32 *)loc = sym.get_tlsdesc_addr(ctx) - P + A - ((A & 1) ? 6 : 4); } else if (sym.has_gottp(ctx)) { *(U32 *)loc = sym.get_gottp_addr(ctx) - P + A - ((A & 1) ? 
5 : 8); } else { *(U32 *)loc = S - ctx.tp_addr; } break; case R_ARM_TLS_CALL: if (sym.has_tlsdesc(ctx)) { *(U32 *)loc = 0xeb00'0000; // bl 0 *(U32 *)loc |= bits(get_tlsdesc_trampoline_addr() - P - 8, 25, 2); } else if (sym.has_gottp(ctx)) { *(U32 *)loc = 0xe79f'0000; // ldr r0, [pc, r0] } else { *(U32 *)loc = 0xe320'f000; // nop } break; case R_ARM_THM_TLS_CALL: if (sym.has_tlsdesc(ctx)) { u64 val = align_to(get_tlsdesc_trampoline_addr() - P - 4, 4); write_thm_b25(loc, val); *(U16 *)(loc + 2) &= ~0x1000; // rewrite BL with BLX } else if (sym.has_gottp(ctx)) { // Since `ldr r0, [pc, r0]` is not representable in Thumb, // we use two instructions instead. *(U16 *)loc = 0x4478; // add r0, pc *(U16 *)(loc + 2) = 0x6800; // ldr r0, [r0] } else { *(U32 *)loc = 0x8000'f3af; // nop.w } break; default: Error(ctx) << *this << ": unknown relocation: " << rel; } } } template <> void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = base + rel.r_offset; SectionFragment *frag; i64 frag_addend; std::tie(frag, frag_addend) = get_fragment(ctx, rel); u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx); u64 A = frag ? 
frag_addend : get_addend(*this, rel); switch (rel.r_type) { case R_ARM_ABS32: if (std::optional val = get_tombstone(sym, frag)) *(U32 *)loc = *val; else *(U32 *)loc = S + A; break; case R_ARM_TLS_LDO32: if (std::optional val = get_tombstone(sym, frag)) *(U32 *)loc = *val; else *(U32 *)loc = S + A - ctx.dtp_addr; break; default: Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: " << rel; break; } } } template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); std::span> rels = get_rels(ctx); // Scan relocations for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; if (sym.is_ifunc()) sym.flags |= NEEDS_GOT | NEEDS_PLT; switch (rel.r_type) { case R_ARM_MOVW_ABS_NC: case R_ARM_THM_MOVW_ABS_NC: scan_absrel(ctx, sym, rel); break; case R_ARM_THM_CALL: case R_ARM_CALL: case R_ARM_JUMP24: case R_ARM_PLT32: case R_ARM_THM_JUMP24: if (sym.is_imported) sym.flags |= NEEDS_PLT; break; case R_ARM_GOT_PREL: case R_ARM_GOT_BREL: case R_ARM_TARGET2: sym.flags |= NEEDS_GOT; break; case R_ARM_MOVT_PREL: case R_ARM_THM_MOVT_PREL: case R_ARM_PREL31: scan_pcrel(ctx, sym, rel); break; case R_ARM_TLS_GD32: sym.flags |= NEEDS_TLSGD; break; case R_ARM_TLS_LDM32: ctx.needs_tlsld = true; break; case R_ARM_TLS_IE32: sym.flags |= NEEDS_GOTTP; break; case R_ARM_TLS_CALL: case R_ARM_THM_TLS_CALL: scan_tlsdesc(ctx, sym); break; case R_ARM_TLS_LE32: check_tlsle(ctx, sym, rel); break; case R_ARM_ABS32: case R_ARM_TARGET1: case R_ARM_MOVT_ABS: case R_ARM_THM_MOVT_ABS: case R_ARM_REL32: case R_ARM_BASE_PREL: case R_ARM_GOTOFF32: case R_ARM_THM_JUMP8: case R_ARM_THM_JUMP11: case R_ARM_THM_JUMP19: case R_ARM_MOVW_PREL_NC: case R_ARM_THM_MOVW_PREL_NC: case R_ARM_TLS_LDO32: case R_ARM_V4BX: case R_ARM_TLS_GOTDESC: break; default: Error(ctx) << *this << ": unknown relocation: " << rel; } } } template <> void 
Thunk::copy_buf(Context &ctx) { // TLS trampoline code. ARM32's TLSDESC is designed so that this // common piece of code is factored out from object files to reduce // output size. Since no one provide, the linker has to synthesize it. constexpr ul32 hdr[] = { 0xe08e'0000, // add r0, lr, r0 0xe590'1004, // ldr r1, [r0, #4] 0xe12f'ff11, // bx r1 0xe320'f000, // nop }; // This is a range extension and mode switch thunk. // It has two entry points: +0 for Thumb and +4 for ARM. constexpr u8 entry[] = { // .thumb 0x78, 0x47, // bx pc # jumps to 1f 0xc0, 0x46, // nop // .arm 0x00, 0xc0, 0x9f, 0xe5, // 1: ldr ip, 3f 0x0f, 0xf0, 0x8c, 0xe0, // 2: add pc, ip, pc 0x00, 0x00, 0x00, 0x00, // 3: .word sym - 2b }; static_assert(E::thunk_hdr_size == sizeof(hdr)); static_assert(E::thunk_size == sizeof(entry)); u8 *base = ctx.buf + output_section.shdr.sh_offset + offset; memcpy(base, hdr, sizeof(hdr)); for (i64 i = 0; i < symbols.size(); i++) { u64 S = symbols[i]->get_addr(ctx); u64 P = get_addr() + offsets[i]; u8 *buf = base + offsets[i]; memcpy(buf, entry, sizeof(entry)); *(U32 *)(buf + 12) = S - P - 16; } } template <> u64 get_eflags(Context &ctx) { if constexpr (E::is_le) return EF_ARM_EABI_VER5; else return EF_ARM_EABI_VER5 | EF_ARM_BE8; } template <> void create_arm_exidx_section(Context &ctx) { for (i64 i = 0; i < ctx.chunks.size(); i++) { OutputSection *osec = ctx.chunks[i]->to_osec(); if (osec && osec->shdr.sh_type == SHT_ARM_EXIDX) { auto *sec = new Arm32ExidxSection(*osec); ctx.extra.exidx = sec; ctx.chunks[i] = sec; ctx.chunk_pool.emplace_back(sec); for (InputSection *isec : osec->members) isec->is_alive = false; break; } } } template <> void Arm32ExidxSection::compute_section_size(Context &ctx) { output_section.compute_section_size(ctx); this->shdr.sh_size = output_section.shdr.sh_size + 8; // +8 for sentinel } template <> void Arm32ExidxSection::update_shdr(Context &ctx) { // .ARM.exidx's sh_link should be set to the .text section index. 
// Runtime doesn't care about it, but the binutils's strip command does. if (Chunk *chunk = find_chunk(ctx, ".text")) this->shdr.sh_link = chunk->shndx; } // Returns the end of the text segment static u64 get_text_end(Context &ctx) { u64 ret = 0; for (Chunk *chunk : ctx.chunks) if (chunk->shdr.sh_flags & SHF_EXECINSTR) ret = std::max(ret, chunk->shdr.sh_addr + chunk->shdr.sh_size); return ret; } // ARM executables use an .ARM.exidx section to look up an exception // handling record for the current instruction pointer. The table needs // to be sorted by their addresses. // // Other target uses .eh_frame_hdr instead for the same purpose. // I don't know why only ARM uses the different mechanism, but it's // likely that it's due to some historical reason. // // This function returns contents of .ARM.exidx. template <> std::vector Arm32ExidxSection::get_contents(Context &ctx) { // .ARM.exidx records consists of a signed 31-bit relative address // and a 32-bit value. The relative address indicates the start // address of a function that the record covers. The value is one of // the followings: // // 1. CANTUNWIND indicating that there's no unwinding info for the function, // 2. a compact unwinding record encoded into a 32-bit value, or // 3. a 31-bit relative address which points to a larger record in // the .ARM.extab section. // // CANTUNWIND is value 1. The most significant bit is set in (2) but // not in (3). So we can distinguished them just by looking at a value. 
const u32 CANTUNWIND = 1; struct Entry { U32 addr; U32 val; }; // We reserve one extra slot for the sentinel i64 num_entries = output_section.shdr.sh_size / sizeof(Entry) + 1; std::vector buf(num_entries * sizeof(Entry)); Entry *ent = (Entry *)buf.data(); // Write section contents to the buffer output_section.shdr.sh_addr = this->shdr.sh_addr; output_section.write_to(ctx, buf.data()); // Fill in sentinel fields u64 sentinel_addr = this->shdr.sh_addr + sizeof(Entry) * (num_entries - 1); ent[num_entries - 1].addr = get_text_end(ctx) - sentinel_addr; ent[num_entries - 1].val = CANTUNWIND; // Entry's addresses are relative to themselves. In order to sort // records by address, we first translate them so that the addresses // are relative to the beginning of the section. auto is_relative = [](u32 val) { return val != CANTUNWIND && !(val & 0x8000'0000); }; tbb::parallel_for((i64)0, num_entries, [&](i64 i) { i64 offset = sizeof(Entry) * i; ent[i].addr = sign_extend(ent[i].addr, 31) + offset; if (is_relative(ent[i].val)) ent[i].val = 0x7fff'ffff & (ent[i].val + offset); }); ranges::sort(ent, ent + num_entries, {}, &Entry::addr); // Remove duplicate adjacent entries. That is, if two adjacent functions // have the same compact unwind info or are both CANTUNWIND, we can // merge them into a single address range. auto tail = ranges::unique(ent, ent + num_entries, {}, &Entry::val); num_entries -= tail.size(); buf.resize(num_entries * sizeof(Entry)); // Make addresses relative to themselves. 
tbb::parallel_for((i64)0, num_entries, [&](i64 i) { i64 offset = sizeof(Entry) * i; ent[i].addr = 0x7fff'ffff & (ent[i].addr - offset); if (is_relative(ent[i].val)) ent[i].val = 0x7fff'ffff & (ent[i].val - offset); }); return buf; } template <> void Arm32ExidxSection::remove_duplicate_entries(Context &ctx) { this->shdr.sh_size = get_contents(ctx).size(); } template <> void Arm32ExidxSection::copy_buf(Context &ctx) { std::vector contents = get_contents(ctx); assert(this->shdr.sh_size == contents.size()); write_vector(ctx.buf + this->shdr.sh_offset, contents); } // Even though using ARM32 in big-endian mode is very rare, the processor // technically supports both little- and big-endian modes. There are two // variants of big-endian mode: BE32 and BE8. In BE32, instructions and // data are encoded in big-endian. In BE8, instructions are encoded in // little-endian, and only data is in big-endian. BE8 is the de facto // standard for ARMv6 or later. We support only BE8. // // A tricky thing is that instructions in an object file are always // big-endian if the file is compiled for big-endian mode. In other words, // the compiler always emit code in BE32 if -mbig-endian is specified. It // is the linker's responsibility to rewrite instructions from big-endian // to little-endian for an BE8 output. This function does that. // // The text section may contain a mix of 32-bit ARM instructions, 16-bit // Thumb instructions, and data. We need to distinguish them to swap 4 // bytes, 2 bytes, or not swap bytes, respectively. The beginning of ARM // code, Thumb code, and data is labeled with a mapping symbol of $a, $t, // and $d, respectively. We use mapping symbols to determine what to do // with the text section. // // This function is called after we copy the input section contents to the // output file. We rewrite instructions in the output buffer in place. 
#if MOLD_ARM32BE void arm32be_swap_bytes(Context &ctx) { tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { // Collect mapping symbols std::vector *> syms; for (Symbol *sym : file->get_local_syms()) if (InputSection *isec = sym->get_input_section()) if (isec->is_alive && (isec->shdr().sh_flags & SHF_EXECINSTR)) if (std::string_view x = sym->name(); x == "$a" || x.starts_with("$a.") || x == "$t" || x.starts_with("$t.") || x == "$d" || x.starts_with("$d.")) syms.push_back(sym); // Group mapping symbols by input section and sort by address ranges::stable_sort(syms, {}, [](const Symbol *sym) { return std::tuple{(uintptr_t)sym->get_input_section(), sym->value}; }); // Swap bytes for (i64 i = 0; i < syms.size(); i++) { Symbol &sym = *syms[i]; if (sym.name().starts_with("$d")) continue; InputSection &isec = *sym.get_input_section(); u8 *base = ctx.buf + isec.output_section->shdr.sh_offset + isec.offset; u8 *start = base + sym.value; u8 *end; if (i + 1 < syms.size() && syms[i + 1]->get_input_section() == &isec) end = base + syms[i + 1]->value; else end = base + isec.sh_size; i64 sz = sym.name().starts_with("$a") ? 4 : 2; for (u8 *p = start; p < end; p += sz) std::reverse(p, p + sz); } }); } #endif } // namespace mold #endif ================================================ FILE: src/arch-arm64.cc ================================================ // This file contains ARM64-specific code. Being new, the ARM64's ELF // psABI doesn't have anything peculiar. ARM64 is a clean RISC // instruction set that supports PC-relative load/store instructions. // // Unlike ARM32, instructions length doesn't vary. All ARM64 // instructions are 4 bytes long. // // Branch instructions used for function call can jump within ±128 MiB. // We need to create range extension thunks to support binaries whose // .text is larger than that. 
// // Unlike most other targets, the TLSDESC access model is used by default // for -fPIC to access thread-local variables instead of the less // efficient GD model. You can still enable GD but it needs the // -mtls-dialect=trad flag. Since GD is used rarely, we don't need to // implement GD → LE relaxation. // // https://github.com/ARM-software/abi-aa/blob/main/aaelf64/aaelf64.rst #if MOLD_ARM64LE || MOLD_ARM64BE #include "mold.h" namespace mold { using E = MOLD_TARGET; static void write_adrp(u8 *buf, u64 val) { *(ul32 *)buf |= (bits(val, 13, 12) << 29) | (bits(val, 32, 14) << 5); } static void write_adr(u8 *buf, u64 val) { *(ul32 *)buf |= (bits(val, 1, 0) << 29) | (bits(val, 20, 2) << 5); } static void write_movn_movz(u8 *buf, i64 val) { *(ul32 *)buf &= 0b0000'0000'0110'0000'0000'0000'0001'1111; if (val >= 0) *(ul32 *)buf |= 0xd280'0000 | (bits(val, 15, 0) << 5); // rewrite to movz else *(ul32 *)buf |= 0x9280'0000 | (bits(~val, 15, 0) << 5); // rewrite to movn } static u64 page(u64 val) { return val & 0xffff'ffff'ffff'f000; } template <> void write_plt_header(Context &ctx, u8 *buf) { constexpr ul32 insn[] = { 0xa9bf'7bf0, // stp x16, x30, [sp,#-16]! 
0x9000'0010, // adrp x16, .got.plt[2] 0xf940'0211, // ldr x17, [x16, .got.plt[2]] 0x9100'0210, // add x16, x16, .got.plt[2] 0xd61f'0220, // br x17 0xd420'7d00, // brk 0xd420'7d00, // brk 0xd420'7d00, // brk }; u64 gotplt = ctx.gotplt->shdr.sh_addr + 16; u64 plt = ctx.plt->shdr.sh_addr; memcpy(buf, insn, sizeof(insn)); write_adrp(buf + 4, page(gotplt) - page(plt + 4)); *(ul32 *)(buf + 8) |= bits(gotplt, 11, 3) << 10; *(ul32 *)(buf + 12) |= (gotplt & 0xfff) << 10; } template <> void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { constexpr ul32 insn[] = { 0x9000'0010, // adrp x16, .got.plt[n] 0xf940'0211, // ldr x17, [x16, .got.plt[n]] 0x9100'0210, // add x16, x16, .got.plt[n] 0xd61f'0220, // br x17 }; u64 gotplt = sym.get_gotplt_addr(ctx); u64 plt = sym.get_plt_addr(ctx); memcpy(buf, insn, sizeof(insn)); write_adrp(buf, page(gotplt) - page(plt)); *(ul32 *)(buf + 4) |= bits(gotplt, 11, 3) << 10; *(ul32 *)(buf + 8) |= (gotplt & 0xfff) << 10; } template <> void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { constexpr ul32 insn[] = { 0x9000'0010, // adrp x16, GOT[n] 0xf940'0211, // ldr x17, [x16, GOT[n]] 0xd61f'0220, // br x17 0xd420'7d00, // brk }; u64 got = sym.get_got_pltgot_addr(ctx); u64 plt = sym.get_plt_addr(ctx); memcpy(buf, insn, sizeof(insn)); write_adrp(buf, page(got) - page(plt)); *(ul32 *)(buf + 4) |= bits(got, 11, 3) << 10; } template <> void EhFrameSection::apply_eh_reloc(Context &ctx, const ElfRel &rel, u64 offset, u64 val) { u8 *loc = ctx.buf + this->shdr.sh_offset + offset; switch (rel.r_type) { case R_NONE: break; case R_AARCH64_ABS64: *(U64 *)loc = val; break; case R_AARCH64_PREL32: *(U32 *)loc = val - this->shdr.sh_addr - offset; break; case R_AARCH64_PREL64: *(U64 *)loc = val - this->shdr.sh_addr - offset; break; default: Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel; } } static bool is_adrp(u8 *loc) { // 
https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADRP--Form-PC-relative-address-to-4KB-page- u32 insn = *(ul32 *)loc; return (bits(insn, 31, 24) & 0b1001'1111) == 0b1001'0000; } static bool is_ldr(u8 *loc) { // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDR--immediate---Load-Register--immediate-- u32 insn = *(ul32 *)loc; return (bits(insn, 31, 20) & 0b1111'1111'1100) == 0b1111'1001'0100; } static bool is_add(u8 *loc) { // https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/ADD--immediate---Add--immediate-- u32 insn = *(ul32 *)loc; return (bits(insn, 31, 20) & 0b1111'1111'1100) == 0b1001'0001'0000; } template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = base + rel.r_offset; u64 S = sym.get_addr(ctx); u64 A = rel.r_addend; u64 P = get_addr() + rel.r_offset; u64 G = sym.get_got_idx(ctx) * sizeof(Word); u64 GOT = ctx.got->shdr.sh_addr; auto check = [&](i64 val, i64 lo, i64 hi) { check_range(ctx, i, val, lo, hi); }; switch (rel.r_type) { case R_AARCH64_ABS64: break; case R_AARCH64_LDST8_ABS_LO12_NC: case R_AARCH64_ADD_ABS_LO12_NC: *(ul32 *)loc |= bits(S + A, 11, 0) << 10; break; case R_AARCH64_LDST16_ABS_LO12_NC: *(ul32 *)loc |= bits(S + A, 11, 1) << 10; break; case R_AARCH64_LDST32_ABS_LO12_NC: *(ul32 *)loc |= bits(S + A, 11, 2) << 10; break; case R_AARCH64_LDST64_ABS_LO12_NC: *(ul32 *)loc |= bits(S + A, 11, 3) << 10; break; case R_AARCH64_LDST128_ABS_LO12_NC: *(ul32 *)loc |= bits(S + A, 11, 4) << 10; break; case R_AARCH64_MOVW_UABS_G0: check(S + A, 0, 1 << 16); *(ul32 *)loc |= bits(S + A, 15, 0) << 5; break; case R_AARCH64_MOVW_UABS_G0_NC: *(ul32 *)loc |= bits(S + A, 15, 0) << 5; break; case R_AARCH64_MOVW_UABS_G1: check(S + A, 0, 1LL << 32); *(ul32 *)loc |= bits(S + A, 31, 16) << 5; 
break; case R_AARCH64_MOVW_UABS_G1_NC: *(ul32 *)loc |= bits(S + A, 31, 16) << 5; break; case R_AARCH64_MOVW_UABS_G2: check(S + A, 0, 1LL << 48); *(ul32 *)loc |= bits(S + A, 47, 32) << 5; break; case R_AARCH64_MOVW_UABS_G2_NC: *(ul32 *)loc |= bits(S + A, 47, 32) << 5; break; case R_AARCH64_MOVW_UABS_G3: *(ul32 *)loc |= bits(S + A, 63, 48) << 5; break; case R_AARCH64_ADR_GOT_PAGE: if (sym.has_got(ctx)) { i64 val = page(G + GOT + A) - page(P); check(val, -(1LL << 32), 1LL << 32); write_adrp(loc, val); } else { // Relax GOT-loading ADRP+LDR to an immediate ADRP+ADD i64 val = page(S + A) - page(P); check(val, -(1LL << 32), 1LL << 32); write_adrp(loc, val); u32 reg = bits(*(ul32 *)loc, 4, 0); *(ul32 *)(loc + 4) = 0x9100'0000 | (reg << 5) | reg; // ADD *(ul32 *)(loc + 4) |= bits(S + A, 11, 0) << 10; i++; } break; case R_AARCH64_ADR_PREL_PG_HI21: case R_AARCH64_ADR_PREL_PG_HI21_NC: { // The ARM64 psABI defines that an `ADRP x0, foo` and `ADD x0, x0, // :lo12: foo` instruction pair to materialize a PC-relative address // in a register can be relaxed to `NOP` followed by `ADR x0, foo` // if foo is in PC ± 1 MiB. 
if (ctx.arg.relax && sym.is_pcrel_linktime_const(ctx) && i + 1 < rels.size()) { i64 val = S + A - P - 4; const ElfRel &rel2 = rels[i + 1]; if (is_int(val, 21) && rel2.r_type == R_AARCH64_ADD_ABS_LO12_NC && rel2.r_sym == rel.r_sym && rel2.r_offset == rel.r_offset + 4 && rel2.r_addend == rel.r_addend && is_adrp(loc) && is_add(loc + 4)) { u32 reg1 = bits(*(ul32 *)loc, 4, 0); u32 reg2 = bits(*(ul32 *)(loc + 4), 4, 0); if (reg1 == reg2) { *(ul32 *)loc = 0xd503'201f; // nop *(ul32 *)(loc + 4) = 0x1000'0000 | reg1; // adr write_adr(loc + 4, val); i++; break; } } } i64 val = page(S + A) - page(P); if (rel.r_type == R_AARCH64_ADR_PREL_PG_HI21) check(val, -(1LL << 32), 1LL << 32); write_adrp(loc, val); break; } case R_AARCH64_ADR_PREL_LO21: check(S + A - P, -(1LL << 20), 1LL << 20); write_adr(loc, S + A - P); break; case R_AARCH64_CALL26: case R_AARCH64_JUMP26: { if (sym.is_remaining_undef_weak()) { // On ARM, calling an weak undefined symbol jumps to the // next instruction. *(ul32 *)loc = 0xd503'201f; // nop break; } i64 val = S + A - P; if (!is_int(val, 28)) val = sym.get_thunk_addr(ctx, P) + A - P; *(ul32 *)loc |= bits(val, 27, 2); break; } case R_AARCH64_PLT32: check(S + A - P, -(1LL << 31), 1LL << 31); *(U32 *)loc = S + A - P; break; case R_AARCH64_CONDBR19: case R_AARCH64_LD_PREL_LO19: check(S + A - P, -(1LL << 20), 1LL << 20); *(ul32 *)loc |= bits(S + A - P, 20, 2) << 5; break; case R_AARCH64_PREL16: check(S + A - P, -(1LL << 15), 1LL << 16); *(U16 *)loc = S + A - P; break; case R_AARCH64_PREL32: check(S + A - P, -(1LL << 31), 1LL << 32); *(U32 *)loc = S + A - P; break; case R_AARCH64_PREL64: *(U64 *)loc = S + A - P; break; case R_AARCH64_LD64_GOT_LO12_NC: *(ul32 *)loc |= bits(G + GOT + A, 11, 3) << 10; break; case R_AARCH64_LD64_GOTPAGE_LO15: { i64 val = G + GOT + A - page(GOT); check(val, 0, 1 << 15); *(ul32 *)loc |= bits(val, 14, 3) << 10; break; } case R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: { i64 val = page(sym.get_gottp_addr(ctx) + A) - page(P); check(val, -(1LL 
<< 32), 1LL << 32); write_adrp(loc, val); break; } case R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: *(ul32 *)loc |= bits(sym.get_gottp_addr(ctx) + A, 11, 3) << 10; break; case R_AARCH64_TLSLE_MOVW_TPREL_G0: { i64 val = S + A - ctx.tp_addr; check(val, -(1 << 15), 1 << 15); write_movn_movz(loc, val); break; } case R_AARCH64_TLSLE_MOVW_TPREL_G0_NC: *(ul32 *)loc |= bits(S + A - ctx.tp_addr, 15, 0) << 5; break; case R_AARCH64_TLSLE_MOVW_TPREL_G1: { i64 val = S + A - ctx.tp_addr; check(val, -(1LL << 31), 1LL << 31); write_movn_movz(loc, val >> 16); break; } case R_AARCH64_TLSLE_MOVW_TPREL_G1_NC: *(ul32 *)loc |= bits(S + A - ctx.tp_addr, 31, 16) << 5; break; case R_AARCH64_TLSLE_MOVW_TPREL_G2: { i64 val = S + A - ctx.tp_addr; check(val, -(1LL << 47), 1LL << 47); write_movn_movz(loc, val >> 32); break; } case R_AARCH64_TLSLE_ADD_TPREL_HI12: { i64 val = S + A - ctx.tp_addr; check(val, 0, 1LL << 24); *(ul32 *)loc |= bits(val, 23, 12) << 10; break; } case R_AARCH64_TLSLE_ADD_TPREL_LO12: check(S + A - ctx.tp_addr, 0, 1 << 12); *(ul32 *)loc |= bits(S + A - ctx.tp_addr, 11, 0) << 10; break; case R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: *(ul32 *)loc |= bits(S + A - ctx.tp_addr, 11, 0) << 10; break; case R_AARCH64_TLSGD_ADR_PAGE21: { i64 val = page(sym.get_tlsgd_addr(ctx) + A) - page(P); check(val, -(1LL << 32), 1LL << 32); write_adrp(loc, val); break; } case R_AARCH64_TLSGD_ADD_LO12_NC: *(ul32 *)loc |= bits(sym.get_tlsgd_addr(ctx) + A, 11, 0) << 10; break; case R_AARCH64_TLSDESC_ADR_PAGE21: // ARM64 TLSDESC uses the following code sequence to materialize // a TP-relative address in x0. 
// // adrp x0, 0 // R_AARCH64_TLSDESC_ADR_PAGE21 foo // ldr x1, [x0] // R_AARCH64_TLSDESC_LD64_LO12 foo // add x0, x0, #0 // R_AARCH64_TLSDESC_ADD_LO12 foo // blr x1 // R_AARCH64_TLSDESC_CALL foo // // We may relax the instructions to the following if its TP-relative // address is known at link-time // // nop // nop // movz x0, :tls_offset_hi:foo, lsl #16 // movk x0, :tls_offset_lo:foo // // or to the following if the TP-relative address is known at // process startup time. // // nop // nop // adrp x0, :gottprel:foo // ldr x0, [x0, :gottprel_lo12:foo] if (sym.has_tlsdesc(ctx)) { i64 val = page(sym.get_tlsdesc_addr(ctx) + A) - page(P); check(val, -(1LL << 32), 1LL << 32); write_adrp(loc, val); } else { *(ul32 *)loc = 0xd503'201f; // nop } break; case R_AARCH64_TLSDESC_LD64_LO12: if (sym.has_tlsdesc(ctx)) *(ul32 *)loc |= bits(sym.get_tlsdesc_addr(ctx) + A, 11, 3) << 10; else *(ul32 *)loc = 0xd503'201f; // nop break; case R_AARCH64_TLSDESC_ADD_LO12: if (sym.has_tlsdesc(ctx)) { *(ul32 *)loc |= bits(sym.get_tlsdesc_addr(ctx) + A, 11, 0) << 10; } else if (sym.has_gottp(ctx)) { *(ul32 *)loc = 0x9000'0000; // adrp x0, 0 write_adrp(loc, page(sym.get_gottp_addr(ctx) + A) - page(P)); } else { *(ul32 *)loc = 0xd2a0'0000; // movz x0, 0, lsl #16 *(ul32 *)loc |= bits(S + A - ctx.tp_addr, 32, 16) << 5; } break; case R_AARCH64_TLSDESC_CALL: if (sym.has_tlsdesc(ctx)) { // Do nothing } else if (sym.has_gottp(ctx)) { *(ul32 *)loc = 0xf940'0000; // ldr x0, [x0, 0] *(ul32 *)loc |= bits(sym.get_gottp_addr(ctx) + A, 11, 3) << 10; } else { *(ul32 *)loc = 0xf280'0000; // movk x0, 0 *(ul32 *)loc |= bits(S + A - ctx.tp_addr, 15, 0) << 5; } break; default: unreachable(); } } } template <> void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = base + rel.r_offset; 
SectionFragment *frag; i64 frag_addend; std::tie(frag, frag_addend) = get_fragment(ctx, rel); u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx); u64 A = frag ? frag_addend : (i64)rel.r_addend; auto check = [&](i64 val, i64 lo, i64 hi) { check_range(ctx, val, i, lo, hi); }; switch (rel.r_type) { case R_AARCH64_ABS64: if (std::optional val = get_tombstone(sym, frag)) *(U64 *)loc = *val; else *(U64 *)loc = S + A; break; case R_AARCH64_ABS32: check(S + A, 0, 1LL << 32); *(U32 *)loc = S + A; break; default: Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: " << rel; break; } } } template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); std::span> rels = get_rels(ctx); // Scan relocations for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = (u8 *)(contents.data() + rel.r_offset); if (sym.is_ifunc()) sym.flags |= NEEDS_GOT | NEEDS_PLT; switch (rel.r_type) { case R_AARCH64_MOVW_UABS_G3: scan_absrel(ctx, sym, rel); break; case R_AARCH64_ADR_GOT_PAGE: // An ADR_GOT_PAGE and GOT_LO12_NC relocation pair is used to load a // symbol's address from GOT. If the GOT value is a link-time // constant, we may be able to rewrite the ADRP+LDR instruction pair // with an ADRP+ADD, eliminating a GOT memory load. if (ctx.arg.relax && sym.is_pcrel_linktime_const(ctx) && i + 1 < rels.size()) { // ADRP+LDR must be consecutive and use the same register to relax. 
const ElfRel &rel2 = rels[i + 1]; if (rel2.r_type == R_AARCH64_LD64_GOT_LO12_NC && rel2.r_offset == rel.r_offset + 4 && rel2.r_sym == rel.r_sym && rel.r_addend == 0 && rel2.r_addend == 0 && is_adrp(loc) && is_ldr(loc + 4)) { u32 rd = bits(*(ul32 *)loc, 4, 0); u32 rn = bits(*(ul32 *)(loc + 4), 9, 5); u32 rt = bits(*(ul32 *)(loc + 4), 4, 0); if (rd == rn && rn == rt) { i++; break; } } } sym.flags |= NEEDS_GOT; break; case R_AARCH64_LD64_GOT_LO12_NC: case R_AARCH64_LD64_GOTPAGE_LO15: sym.flags |= NEEDS_GOT; break; case R_AARCH64_CALL26: case R_AARCH64_JUMP26: case R_AARCH64_PLT32: if (sym.is_imported) sym.flags |= NEEDS_PLT; break; case R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: case R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: sym.flags |= NEEDS_GOTTP; break; case R_AARCH64_ADR_PREL_PG_HI21: case R_AARCH64_ADR_PREL_PG_HI21_NC: scan_pcrel(ctx, sym, rel); break; case R_AARCH64_TLSGD_ADR_PAGE21: sym.flags |= NEEDS_TLSGD; break; case R_AARCH64_TLSDESC_CALL: scan_tlsdesc(ctx, sym); break; case R_AARCH64_TLSLE_MOVW_TPREL_G2: case R_AARCH64_TLSLE_ADD_TPREL_LO12: case R_AARCH64_TLSLE_ADD_TPREL_LO12_NC: check_tlsle(ctx, sym, rel); break; case R_AARCH64_ABS64: case R_AARCH64_ADD_ABS_LO12_NC: case R_AARCH64_ADR_PREL_LO21: case R_AARCH64_CONDBR19: case R_AARCH64_LD_PREL_LO19: case R_AARCH64_LDST16_ABS_LO12_NC: case R_AARCH64_LDST32_ABS_LO12_NC: case R_AARCH64_LDST64_ABS_LO12_NC: case R_AARCH64_LDST128_ABS_LO12_NC: case R_AARCH64_LDST8_ABS_LO12_NC: case R_AARCH64_MOVW_UABS_G0: case R_AARCH64_MOVW_UABS_G0_NC: case R_AARCH64_MOVW_UABS_G1: case R_AARCH64_MOVW_UABS_G1_NC: case R_AARCH64_MOVW_UABS_G2: case R_AARCH64_MOVW_UABS_G2_NC: case R_AARCH64_PREL16: case R_AARCH64_PREL32: case R_AARCH64_PREL64: case R_AARCH64_TLSGD_ADD_LO12_NC: case R_AARCH64_TLSLE_MOVW_TPREL_G0: case R_AARCH64_TLSLE_MOVW_TPREL_G0_NC: case R_AARCH64_TLSLE_MOVW_TPREL_G1: case R_AARCH64_TLSLE_MOVW_TPREL_G1_NC: case R_AARCH64_TLSLE_ADD_TPREL_HI12: case R_AARCH64_TLSDESC_ADR_PAGE21: case R_AARCH64_TLSDESC_LD64_LO12: case 
R_AARCH64_TLSDESC_ADD_LO12: break; default: Error(ctx) << *this << ": unknown relocation: " << rel; } } } // The size of a thunk entry varies on ARM64 depending on the distance to // the branch target. This function computes the size of each thunk entry. template <> void Thunk::shrink_size(Context &ctx) { offsets.clear(); offsets.push_back(0); i64 off = 0; // The distance between S and P is only reduced by shrink_size(), but // page(S) – page(P) may still increase by one page due to address // changes, so we add a safety margin. // // For example, page(0x1200) – page(0x1000) is 0, whereas // page(0x1100) – page(0xfff) is 0x1000, even though the latter // distance is shorter than the former. auto is_small = [](i64 prel) { return is_int(prel + 0x1000, 33) && is_int(prel - 0x1000, 33); }; for (Symbol *sym : symbols) { u64 S = sym->get_addr(ctx); u64 P = get_addr() + off; i64 prel = page(S) - page(P); off += is_small(prel) ? 16 : 32; offsets.push_back(off); } } template <> void Thunk::copy_buf(Context &ctx) { // Short thunk with a 33 bit displacement constexpr ul32 insn1[] = { 0x9000'0010, // adrp x16, 0 0x9100'0210, // add x16, x16 0xd61f'0200, // br x16 0xd420'7d00, // brk }; // Long thunk with a 64 bit displacement constexpr ul32 insn2[] = { 0x1000'0010, // adr x16, 0 0xd2a0'0011, // movz x17, 0, lsl #16 0xf2c0'0011, // movk x17, 0, lsl #32 0xf2e0'0011, // movk x17, 0, lsl #48 0x8b11'0210, // add x16, x16, x17 0xd61f'0200, // br x16 0xd420'7d00, // brk 0xd420'7d00, // brk }; u8 *base = ctx.buf + output_section.shdr.sh_offset + offset; for (i64 i = 0; i < symbols.size(); i++) { u64 S = symbols[i]->get_addr(ctx); u64 P = get_addr() + offsets[i]; u8 *buf = base + offsets[i]; if (offsets[i + 1] - offsets[i] == 16) { i64 prel = page(S) - page(P); assert(is_int(prel, 33)); memcpy(buf, insn1, sizeof(insn1)); write_adrp(buf, prel); *(ul32 *)(buf + 4) |= bits(S, 11, 0) << 10; } else { memcpy(buf, insn2, sizeof(insn2)); write_adr(buf, bits(S - P, 15, 0)); *(ul32 *)(buf + 4) 
|= bits(S - P, 31, 16) << 5; *(ul32 *)(buf + 8) |= bits(S - P, 47, 32) << 5; *(ul32 *)(buf + 12) |= bits(S - P, 63, 48) << 5; } } } template class Thunk; } // namespace mold #endif ================================================ FILE: src/arch-i386.cc ================================================ // i386 is similar to x86-64 but lacks PC-relative memory access // instructions. So it's not straightforward to support position- // independent code (PIC) on that target. // // If an object file is compiled with -fPIC, a function that needs to load // a value from memory first obtains its own address with the following // code // // call __x86.get_pc_thunk.bx // // where __x86.get_pc_thunk.bx is defined as // // __x86.get_pc_thunk.bx: // mov (%esp), %ebx # move the return address to %ebx // ret // // . With the function's own address (or, more precisely, the address // immediately after the call instruction), the function can compute an // absolute address of a variable with its address + link-time constant. // // Executing call-mov-ret isn't very cheap, and allocating one register to // store PC isn't cheap too, especially given that i386 has only 8 // general-purpose registers. But that's the cost of PIC on i386. You need // to pay it when creating a .so and a position-independent executable. // // When a position-independent function calls another function, it sets // %ebx to the address of .got. Position-independent PLT entries use that // register to load values from .got.plt/.got. // // If we are creating a position-dependent executable (PDE), we can't // assume that %ebx is set to .got. For PDE, we need to create position- // dependent PLT entries which don't use %ebx. 
// // https://github.com/rui314/psabi/blob/main/i386.pdf #if MOLD_I386 #include "mold.h" namespace mold { using E = I386; template <> i64 get_addend(u8 *loc, const ElfRel &rel) { switch (rel.r_type) { case R_386_8: case R_386_PC8: return *loc; case R_386_16: case R_386_PC16: return *(ul16 *)loc; case R_386_32: case R_386_PC32: case R_386_GOT32: case R_386_GOT32X: case R_386_PLT32: case R_386_GOTOFF: case R_386_GOTPC: case R_386_TLS_LDM: case R_386_TLS_GOTIE: case R_386_TLS_LE: case R_386_TLS_IE: case R_386_TLS_GD: case R_386_TLS_LDO_32: case R_386_SIZE32: case R_386_TLS_GOTDESC: return *(ul32 *)loc; default: return 0; } } template <> void write_addend(u8 *loc, i64 val, const ElfRel &rel) { switch (rel.r_type) { case R_386_NONE: break; case R_386_8: case R_386_PC8: *loc = val; break; case R_386_16: case R_386_PC16: *(ul16 *)loc = val; break; case R_386_32: case R_386_PC32: case R_386_GOT32: case R_386_GOT32X: case R_386_PLT32: case R_386_GOTOFF: case R_386_GOTPC: case R_386_TLS_LDM: case R_386_TLS_GOTIE: case R_386_TLS_LE: case R_386_TLS_IE: case R_386_TLS_GD: case R_386_TLS_LDO_32: case R_386_SIZE32: case R_386_TLS_GOTDESC: *(ul32 *)loc = val; break; default: unreachable(); } } template <> void write_plt_header(Context &ctx, u8 *buf) { if (ctx.arg.pic) { static const u8 insn[] = { 0x51, // push %ecx 0x8d, 0x8b, 0, 0, 0, 0, // lea GOTPLT+4(%ebx), %ecx 0xff, 0x31, // push (%ecx) 0xff, 0x61, 0x04, // jmp *0x4(%ecx) 0xcc, 0xcc, 0xcc, 0xcc, // (padding) }; memcpy(buf, insn, sizeof(insn)); *(ul32 *)(buf + 3) = ctx.gotplt->shdr.sh_addr - ctx.got->shdr.sh_addr + 4; } else { static const u8 insn[] = { 0x51, // push %ecx 0xb9, 0, 0, 0, 0, // mov GOTPLT+4, %ecx 0xff, 0x31, // push (%ecx) 0xff, 0x61, 0x04, // jmp *0x4(%ecx) 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // (padding) }; memcpy(buf, insn, sizeof(insn)); *(ul32 *)(buf + 2) = ctx.gotplt->shdr.sh_addr + 4; } } template <> void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { if (ctx.arg.pic) { static const u8 insn[] = { 0xb9, 
0, 0, 0, 0, // mov $reloc_offset, %ecx 0xff, 0xa3, 0, 0, 0, 0, // jmp *foo@GOT(%ebx) 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // (padding) }; memcpy(buf, insn, sizeof(insn)); *(ul32 *)(buf + 1) = sym.get_plt_idx(ctx) * sizeof(ElfRel); *(ul32 *)(buf + 7) = sym.get_gotplt_addr(ctx) - ctx.got->shdr.sh_addr; } else { static const u8 insn[] = { 0xb9, 0, 0, 0, 0, // mov $reloc_offset, %ecx 0xff, 0x25, 0, 0, 0, 0, // jmp *foo@GOT 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, // (padding) }; memcpy(buf, insn, sizeof(insn)); *(ul32 *)(buf + 1) = sym.get_plt_idx(ctx) * sizeof(ElfRel); *(ul32 *)(buf + 7) = sym.get_gotplt_addr(ctx); } } template <> void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { if (ctx.arg.pic) { static const u8 insn[] = { 0xff, 0xa3, 0, 0, 0, 0, // jmp *foo@GOT(%ebx) 0xcc, 0xcc, // (padding) }; memcpy(buf, insn, sizeof(insn)); *(ul32 *)(buf + 2) = sym.get_got_pltgot_addr(ctx) - ctx.got->shdr.sh_addr; } else { static const u8 insn[] = { 0xff, 0x25, 0, 0, 0, 0, // jmp *foo@GOT 0xcc, 0xcc, // (padding) }; memcpy(buf, insn, sizeof(insn)); *(ul32 *)(buf + 2) = sym.get_got_pltgot_addr(ctx); } } template <> void EhFrameSection::apply_eh_reloc(Context &ctx, const ElfRel &rel, u64 offset, u64 val) { u8 *loc = ctx.buf + this->shdr.sh_offset + offset; switch (rel.r_type) { case R_NONE: break; case R_386_32: *(ul32 *)loc = val; break; case R_386_PC32: *(ul32 *)loc = val - this->shdr.sh_addr - offset; break; default: Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel; } } static u32 relax_got32x(u8 *loc) { // mov imm(%reg1), %reg2 -> lea imm(%reg1), %reg2 if (loc[0] == 0x8b) return 0x8d00 | loc[1]; return 0; } // Relax GD to LE static void relax_gd_to_le(u8 *loc, ElfRel rel, u64 val) { static const u8 insn[] = { 0x65, 0xa1, 0, 0, 0, 0, // mov %gs:0, %eax 0x81, 0xc0, 0, 0, 0, 0, // add $tp_offset, %eax }; switch (rel.r_type) { case R_386_PLT32: case R_386_PC32: memcpy(loc - 3, insn, sizeof(insn)); *(ul32 *)(loc + 5) = val; break; case R_386_GOT32: case R_386_GOT32X: 
memcpy(loc - 2, insn, sizeof(insn)); *(ul32 *)(loc + 6) = val; break; default: unreachable(); } } // Relax LD to LE static void relax_ld_to_le(u8 *loc, ElfRel rel, u64 tls_size) { switch (rel.r_type) { case R_386_PLT32: case R_386_PC32: { static const u8 insn[] = { 0x65, 0xa1, 0, 0, 0, 0, // mov %gs:0, %eax 0x2d, 0, 0, 0, 0, // sub $tls_size, %eax }; memcpy(loc - 2, insn, sizeof(insn)); *(ul32 *)(loc + 5) = tls_size; break; } case R_386_GOT32: case R_386_GOT32X: { static const u8 insn[] = { 0x65, 0xa1, 0, 0, 0, 0, // mov %gs:0, %eax 0x81, 0xe8, 0, 0, 0, 0, // sub $tls_size, %eax }; memcpy(loc - 2, insn, sizeof(insn)); *(ul32 *)(loc + 6) = tls_size; break; } default: unreachable(); } } static u32 relax_tlsdesc_to_ie(u8 *loc) { switch ((loc[0] << 8) | loc[1]) { case 0x8d83: return 0x8b83; // lea 0(%ebx), %eax -> mov 0(%ebx), %eax case 0x8d9b: return 0x8b9b; // lea 0(%ebx), %ebx -> mov 0(%ebx), %ebx case 0x8d8b: return 0x8b8b; // lea 0(%ebx), %ecx -> mov 0(%ebx), %ecx case 0x8d93: return 0x8b93; // lea 0(%ebx), %edx -> mov 0(%ebx), %edx case 0x8db3: return 0x8bb3; // lea 0(%ebx), %esi -> mov 0(%ebx), %esi case 0x8dbb: return 0x8bbb; // lea 0(%ebx), %edi -> mov 0(%ebx), %edi case 0x8da3: return 0x8ba3; // lea 0(%ebx), %esp -> mov 0(%ebx), %esp case 0x8dab: return 0x8bab; // lea 0(%ebx), %ebp -> mov 0(%ebx), %ebp } return 0; } static u32 relax_tlsdesc_to_le(u8 *loc) { switch ((loc[0] << 8) | loc[1]) { case 0x8d83: return 0x90b8; // lea 0(%ebx), %eax -> mov $0, %eax case 0x8d9b: return 0x90bb; // lea 0(%ebx), %ebx -> mov $0, %ebx case 0x8d8b: return 0x90b9; // lea 0(%ebx), %ecx -> mov $0, %ecx case 0x8d93: return 0x90ba; // lea 0(%ebx), %edx -> mov $0, %edx case 0x8db3: return 0x90be; // lea 0(%ebx), %esi -> mov $0, %esi case 0x8dbb: return 0x90bf; // lea 0(%ebx), %edi -> mov $0, %edi case 0x8da3: return 0x90bc; // lea 0(%ebx), %esp -> mov $0, %esp case 0x8dab: return 0x90bd; // lea 0(%ebx), %ebp -> mov $0, %ebp } return 0; } template <> void 
InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = base + rel.r_offset; u64 S = sym.get_addr(ctx); u64 A = get_addend(*this, rel); u64 P = get_addr() + rel.r_offset; u64 G = sym.get_got_idx(ctx) * sizeof(Word); u64 GOT = ctx.got->shdr.sh_addr; auto check = [&](i64 val, i64 lo, i64 hi) { check_range(ctx, i, val, lo, hi); }; switch (rel.r_type) { case R_386_8: check(S + A, 0, 1 << 8); *loc = S + A; break; case R_386_16: check(S + A, 0, 1 << 16); *(ul16 *)loc = S + A; break; case R_386_32: break; case R_386_PC8: check(S + A - P, -(1 << 7), 1 << 7); *loc = S + A - P; break; case R_386_PC16: check(S + A - P, -(1 << 15), 1 << 15); *(ul16 *)loc = S + A - P; break; case R_386_PC32: case R_386_PLT32: *(ul32 *)loc = S + A - P; break; case R_386_GOT32: *(ul32 *)loc = G + A; break; case R_386_GOT32X: if (sym.has_got(ctx)) { *(ul32 *)loc = G + A; } else { u32 insn = relax_got32x(loc - 2); assert(insn); loc[-2] = insn >> 8; loc[-1] = insn; *(ul32 *)loc = S + A - GOT; } break; case R_386_GOTOFF: *(ul32 *)loc = S + A - GOT; break; case R_386_GOTPC: *(ul32 *)loc = GOT + A - P; break; case R_386_TLS_GOTIE: *(ul32 *)loc = sym.get_gottp_addr(ctx) + A - GOT; break; case R_386_TLS_LE: *(ul32 *)loc = S + A - ctx.tp_addr; break; case R_386_TLS_IE: *(ul32 *)loc = sym.get_gottp_addr(ctx) + A; break; case R_386_TLS_GD: if (sym.has_tlsgd(ctx)) *(ul32 *)loc = sym.get_tlsgd_addr(ctx) + A - GOT; else relax_gd_to_le(loc, rels[++i], S - ctx.tp_addr); break; case R_386_TLS_LDM: if (ctx.got->has_tlsld(ctx)) *(ul32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT; else relax_ld_to_le(loc, rels[++i], ctx.tp_addr - ctx.tls_begin); break; case R_386_TLS_LDO_32: *(ul32 *)loc = S + A - ctx.dtp_addr; break; case R_386_SIZE32: *(ul32 *)loc = sym.esym().st_size + A; break; case R_386_TLS_GOTDESC: // i386 TLSDESC uses 
the following code sequence to materialize // a TP-relative address in %eax. // // lea 0(%ebx), %eax // R_386_TLS_GOTDESC foo // call *(%eax) // R_386_TLS_DESC_CALL foo // // We may relax the instructions to the following if its TP-relative // address is known at link-time // // mov $foo@TPOFF, %eax // nop // // or to the following if the TP-relative address is known at // process startup time. // // mov foo@GOTTPOFF(%ebx), %eax // nop // // We allow the following alternative code sequence too because // LLVM emits such code. // // lea 0(%ebx), %reg // R_386_TLS_GOTDESC foo // mov %reg, %eax // call *(%eax) // R_386_TLS_DESC_CALL foo // // Note that the compiler always uses the local-exec TLS model // for -fno-pic, so TLSDESC code is always PIC (i.e. uses %ebx to // store the address of GOT.) if (sym.has_tlsdesc(ctx)) { *(ul32 *)loc = sym.get_tlsdesc_addr(ctx) + A - GOT; } else if (sym.has_gottp(ctx)) { u32 insn = relax_tlsdesc_to_ie(loc - 2); if (!insn) Fatal(ctx) << *this << ": illegal instruction sequence for TLSDESC"; loc[-2] = insn >> 8; loc[-1] = insn; *(ul32 *)loc = sym.get_gottp_addr(ctx) + A - GOT; } else { u32 insn = relax_tlsdesc_to_le(loc - 2); if (!insn) Fatal(ctx) << *this << ": illegal instruction sequence for TLSDESC"; loc[-2] = insn >> 8; loc[-1] = insn; *(ul32 *)loc = S + A - ctx.tp_addr; } break; case R_386_TLS_DESC_CALL: if (!sym.has_tlsdesc(ctx)) { // call *(%eax) -> nop loc[0] = 0x66; loc[1] = 0x90; } break; default: unreachable(); } } } template <> void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = base + rel.r_offset; SectionFragment *frag; i64 frag_addend; std::tie(frag, frag_addend) = get_fragment(ctx, rel); u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx); u64 A = frag ? 
frag_addend : get_addend(*this, rel); u64 GOT = ctx.got->shdr.sh_addr; auto check = [&](i64 val, i64 lo, i64 hi) { check_range(ctx, val, i, lo, hi); }; switch (rel.r_type) { case R_386_8: check(S + A, 0, 1 << 8); *loc = S + A; break; case R_386_16: check(S + A, 0, 1 << 16); *(ul16 *)loc = S + A; break; case R_386_32: if (std::optional val = get_tombstone(sym, frag)) *(ul32 *)loc = *val; else *(ul32 *)loc = S + A; break; case R_386_PC8: check(S + A, -(1 << 7), 1 << 7); *loc = S + A; break; case R_386_PC16: check(S + A, -(1 << 15), 1 << 15); *(ul16 *)loc = S + A; break; case R_386_PC32: *(ul32 *)loc = S + A; break; case R_386_GOTPC: *(ul32 *)loc = GOT + A; break; case R_386_GOTOFF: *(ul32 *)loc = S + A - GOT; break; case R_386_TLS_LDO_32: if (std::optional val = get_tombstone(sym, frag)) *(ul32 *)loc = *val; else *(ul32 *)loc = S + A - ctx.dtp_addr; break; case R_386_SIZE32: *(ul32 *)loc = sym.esym().st_size + A; break; default: unreachable(); } } } template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); std::span> rels = get_rels(ctx); // Scan relocations for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = (u8 *)(contents.data() + rel.r_offset); if (sym.is_ifunc()) sym.flags |= NEEDS_GOT | NEEDS_PLT; if (rel.r_type == R_386_TLS_GD || rel.r_type == R_386_TLS_LDM) { if (i + 1 == rels.size()) Fatal(ctx) << *this << ": " << rel << " must be followed by PLT or GOT32"; if (u32 ty = rels[i + 1].r_type; ty != R_386_PLT32 && ty != R_386_PC32 && ty != R_386_GOT32 && ty != R_386_GOT32X) Fatal(ctx) << *this << ": " << rel << " must be followed by PLT or GOT32"; } switch (rel.r_type) { case R_386_8: case R_386_16: scan_absrel(ctx, sym, rel); break; case R_386_PC8: case R_386_PC16: case R_386_PC32: scan_pcrel(ctx, sym, rel); break; case R_386_GOT32: case R_386_GOTPC: sym.flags |= NEEDS_GOT; break; case 
R_386_GOT32X: // We always want to relax GOT32X even if --no-relax is given // because static PIE doesn't work without it. if (sym.is_pcrel_linktime_const(ctx) && relax_got32x(loc - 2)) { // Do nothing } else { sym.flags |= NEEDS_GOT; } break; case R_386_PLT32: if (sym.is_imported) sym.flags |= NEEDS_PLT; break; case R_386_TLS_GOTIE: case R_386_TLS_IE: sym.flags |= NEEDS_GOTTP; break; case R_386_TLS_GD: // We always relax if -static because libc.a doesn't contain // __tls_get_addr(). if (ctx.arg.static_ || (ctx.arg.relax && sym.is_tprel_linktime_const(ctx))) i++; else sym.flags |= NEEDS_TLSGD; break; case R_386_TLS_LDM: // We always relax if -static because libc.a doesn't contain // __tls_get_addr(). if (ctx.arg.static_ || (ctx.arg.relax && !ctx.arg.shared)) i++; else ctx.needs_tlsld = true; break; case R_386_TLS_GOTDESC: scan_tlsdesc(ctx, sym); break; case R_386_TLS_LE: check_tlsle(ctx, sym, rel); break; case R_386_32: case R_386_GOTOFF: case R_386_TLS_LDO_32: case R_386_SIZE32: case R_386_TLS_DESC_CALL: break; default: Error(ctx) << *this << ": unknown relocation: " << rel; } } } } // namespace mold #endif ================================================ FILE: src/arch-loongarch.cc ================================================ // LoongArch is a new RISC ISA announced in 2021 by Loongson. The ISA // feels like a modernized MIPS with a hint of RISC-V flavor, although // it's not compatible with either one. // // While LoongArch is a fresh and clean ISA, its technological advantage // over other modern RISC ISAs such as RISC-V doesn't seem to be very // significant. It appears that the real selling point of LoongArch is // that the ISA is developed and controlled by a Chinese company, // reflecting a desire for domestic CPUs. Loongson is actively working on // bootstrapping the entire ecosystem for LoongArch, sending patches to // Linux, GCC, LLVM, etc. // // Speaking of the ISA, all instructions are 4 byte long and aligned to 4 // byte boundaries in LoongArch. 
It has 32 general-purpose registers. // Among these, $t0 - $t8 (aliases for $r12 - $r20) are temporary // registers that we can use in our PLT. // // Just like RISC-V, LoongArch supports section-shrinking relaxations. // That is, it allows linkers to rewrite certain instruction sequences to // shorter ones. Sections are not an atomic unit of copying. // // https://github.com/loongson/la-abi-specs/blob/release/laelf.adoc #if MOLD_LOONGARCH64 || MOLD_LOONGARCH32 #include "mold.h" namespace mold { using E = MOLD_TARGET; static u64 page(u64 val) { return val & 0xffff'ffff'ffff'f000; } static u64 hi20(u64 val, u64 pc) { // A PC-relative address with a 32 bit offset is materialized in a // register with the following instructions: // // pcalau12i $rN, %pc_hi20(sym) // addi.d $rN, $rN, %lo12(sym) // // PCALAU12I materializes bits [63:12] by computing (pc + imm << 12) // and zero-clear [11:0]. ADDI.D sign-extends its 12 bit immediate and // add it to the register. To compensate the sign-extension, PCALAU12I // needs to materialize a 0x1000 larger value than the desired [63:12] // if [11:0] is sign-extended. // // This is similar but different from RISC-V because RISC-V's AUIPC // doesn't zero-clear [11:0]. return bits(page(val + 0x800) - page(pc), 31, 12); } static u64 higher(u64 val, u64 pc) { // A PC-relative 64-bit address is materialized with the following // instructions for the large code model: // // pcalau12i $rN, %pc_hi20(sym) // addi.d $rM, $zero, %lo12(sym) // lu32i.d $rM, %pc64_lo20(sym) // lu52i.d $rM, $r12, %pc64_hi12(sym) // add.d $rN, $rN, $rM // // PCALAU12I computes (pc + imm << 12) to materialize a 64-bit value. // ADDI.D adds a sign-extended 12 bit value to a register. LU32I.D and // LU52I.D simply set bits to [51:31] and to [63:53], respectively. // // Compensating all the sign-extensions is a bit complicated. The // psABI gives the following formula. val = val + 0x8000'0000 + ((val & 0x800) ? 
(0x1000 - 0x1'0000'0000) : 0);
  return page(val) - page(pc - 8);
}

// Returns the [51:32] field of higher(val, pc).
static u64 higher20(u64 val, u64 pc) {
  return bits(higher(val, pc), 51, 32);
}

// Returns the [63:52] field of higher(val, pc).
static u64 highest12(u64 val, u64 pc) {
  return bits(higher(val, pc), 63, 52);
}

// The write_* helpers below patch an immediate into the 32-bit
// instruction word at loc. Each one first clears the immediate field(s)
// with a mask and then ORs in the new value, so the remaining bits of
// the instruction are left untouched.

// Stores val[11:0] in instruction bits [21:10].
static void write_k12(u8 *loc, u32 val) {
  // opcode, [11:0], rj, rd
  *(ul32 *)loc &= 0b1111111111'000000000000'11111'11111;
  *(ul32 *)loc |= bits(val, 11, 0) << 10;
}

// Stores val[15:0] in instruction bits [25:10].
static void write_k16(u8 *loc, u32 val) {
  // opcode, [15:0], rj, rd
  *(ul32 *)loc &= 0b111111'0000000000000000'11111'11111;
  *(ul32 *)loc |= bits(val, 15, 0) << 10;
}

// Stores val[19:0] in instruction bits [24:5].
static void write_j20(u8 *loc, u32 val) {
  // opcode, [19:0], rd
  *(ul32 *)loc &= 0b1111111'00000000000000000000'11111;
  *(ul32 *)loc |= bits(val, 19, 0) << 5;
}

// Stores a 21-bit value split across two fields: val[15:0] goes to
// instruction bits [25:10] and val[20:16] goes to bits [4:0].
static void write_d5k16(u8 *loc, u32 val) {
  // opcode, [15:0], rj, [20:16]
  *(ul32 *)loc &= 0b111111'0000000000000000'11111'00000;
  *(ul32 *)loc |= bits(val, 15, 0) << 10;
  *(ul32 *)loc |= bits(val, 20, 16);
}

// Stores a 26-bit value split across two fields: val[15:0] goes to
// instruction bits [25:10] and val[25:16] goes to bits [9:0].
static void write_d10k16(u8 *loc, u32 val) {
  // opcode, [15:0], [25:16]
  *(ul32 *)loc &= 0b111111'0000000000000000'0000000000;
  *(ul32 *)loc |= bits(val, 15, 0) << 10;
  *(ul32 *)loc |= bits(val, 25, 16);
}

// Returns the rd field (bits [4:0]) of an instruction.
static u32 get_rd(u32 insn) {
  return bits(insn, 4, 0);
}

// Returns the rj field (bits [9:5]) of an instruction.
static u32 get_rj(u32 insn) {
  return bits(insn, 9, 5);
}

// Replaces the rj field (bits [9:5]) of the instruction at loc with rj.
// rj must be a valid register number, i.e. less than 32.
static void set_rj(u8 *loc, u32 rj) {
  assert(rj < 32);
  *(ul32 *)loc &= 0b111111'1111111111111111'00000'11111;
  *(ul32 *)loc |= rj << 5;
}

// Returns true if isec's i'th relocation refers to the following
// relaxable instruction pair.
// // pcalau12i $t0, 0 # R_LARCH_GOT_PC_HI20, R_LARCH_RELAX // ld.d $t0, $t0, 0 # R_LARCH_GOT_PC_LO12, R_LARCH_RELAX static bool is_relaxable_got_load(Context &ctx, InputSection &isec, i64 i) { std::span> rels = isec.get_rels(ctx); Symbol &sym = *isec.file.symbols[rels[i].r_sym]; u8 *buf = (u8 *)isec.contents.data(); if (ctx.arg.relax && sym.is_pcrel_linktime_const(ctx) && i + 3 < rels.size() && rels[i + 1].r_type == R_LARCH_RELAX && rels[i + 2].r_type == R_LARCH_GOT_PC_LO12 && rels[i + 2].r_offset == rels[i].r_offset + 4 && rels[i + 3].r_type == R_LARCH_RELAX) { u32 insn1 = *(ul32 *)(buf + rels[i].r_offset); u32 insn2 = *(ul32 *)(buf + rels[i].r_offset + 4); bool is_ld_d = (insn2 & 0xffc0'0000) == 0x28c0'0000; return get_rd(insn1) == get_rd(insn2) && get_rd(insn2) == get_rj(insn2) && is_ld_d; } return false; } template <> void write_plt_header(Context &ctx, u8 *buf) { constexpr ul32 insn_64[] = { 0x1a00'000e, // pcalau12i $t2, %pc_hi20(.got.plt) 0x0011'bdad, // sub.d $t1, $t1, $t3 0x28c0'01cf, // ld.d $t3, $t2, %lo12(.got.plt) # _dl_runtime_resolve 0x02ff'51ad, // addi.d $t1, $t1, -44 # .plt entry 0x02c0'01cc, // addi.d $t0, $t2, %lo12(.got.plt) # &.got.plt 0x0045'05ad, // srli.d $t1, $t1, 1 # .plt entry offset 0x28c0'218c, // ld.d $t0, $t0, 8 # link map 0x4c00'01e0, // jr $t3 }; constexpr ul32 insn_32[] = { 0x1a00'000e, // pcalau12i $t2, %pc_hi20(.got.plt) 0x0011'3dad, // sub.w $t1, $t1, $t3 0x2880'01cf, // ld.w $t3, $t2, %lo12(.got.plt) # _dl_runtime_resolve 0x02bf'51ad, // addi.w $t1, $t1, -44 # .plt entry 0x0280'01cc, // addi.w $t0, $t2, %lo12(.got.plt) # &.got.plt 0x0044'89ad, // srli.w $t1, $t1, 2 # .plt entry offset 0x2880'118c, // ld.w $t0, $t0, 4 # link map 0x4c00'01e0, // jr $t3 }; u64 gotplt = ctx.gotplt->shdr.sh_addr; u64 plt = ctx.plt->shdr.sh_addr; memcpy(buf, E::is_64 ? 
insn_64 : insn_32, E::plt_hdr_size); write_j20(buf, hi20(gotplt, plt)); write_k12(buf + 8, gotplt); write_k12(buf + 16, gotplt); } static constexpr ul32 plt_entry_64[] = { 0x1a00'000f, // pcalau12i $t3, %pc_hi20(func@.got.plt) 0x28c0'01ef, // ld.d $t3, $t3, %lo12(func@.got.plt) 0x4c00'01ed, // jirl $t1, $t3, 0 0x002a'0000, // break }; static constexpr ul32 plt_entry_32[] = { 0x1a00'000f, // pcalau12i $t3, %pc_hi20(func@.got.plt) 0x2880'01ef, // ld.w $t3, $t3, %lo12(func@.got.plt) 0x4c00'01ed, // jirl $t1, $t3, 0 0x002a'0000, // break }; template <> void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { u64 gotplt = sym.get_gotplt_addr(ctx); u64 plt = sym.get_plt_addr(ctx); memcpy(buf, E::is_64 ? plt_entry_64 : plt_entry_32, E::plt_size); write_j20(buf, hi20(gotplt, plt)); write_k12(buf + 4, gotplt); } template <> void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { u64 got = sym.get_got_pltgot_addr(ctx); u64 plt = sym.get_plt_addr(ctx); memcpy(buf, E::is_64 ? plt_entry_64 : plt_entry_32, E::plt_size); write_j20(buf, hi20(got, plt)); write_k12(buf + 4, got); } template <> void EhFrameSection::apply_eh_reloc(Context &ctx, const ElfRel &rel, u64 offset, u64 val) { u8 *loc = ctx.buf + this->shdr.sh_offset + offset; switch (rel.r_type) { case R_NONE: break; case R_LARCH_ADD6: *loc = (*loc & 0b1100'0000) | ((*loc + val) & 0b0011'1111); break; case R_LARCH_ADD8: *loc += val; break; case R_LARCH_ADD16: *(ul16 *)loc += val; break; case R_LARCH_ADD32: *(ul32 *)loc += val; break; case R_LARCH_ADD64: *(ul64 *)loc += val; break; case R_LARCH_SUB6: *loc = (*loc & 0b1100'0000) | ((*loc - val) & 0b0011'1111); break; case R_LARCH_SUB8: *loc -= val; break; case R_LARCH_SUB16: *(ul16 *)loc -= val; break; case R_LARCH_SUB32: *(ul32 *)loc -= val; break; case R_LARCH_SUB64: *(ul64 *)loc -= val; break; case R_LARCH_32_PCREL: *(ul32 *)loc = val - this->shdr.sh_addr - offset; break; case R_LARCH_64_PCREL: *(ul64 *)loc = val - this->shdr.sh_addr - offset; break; default: 
Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel; } } template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); std::span deltas = extra.r_deltas; i64 k = 0; u8 *buf = (u8 *)contents.data(); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || rel.r_type == R_LARCH_RELAX || rel.r_type == R_LARCH_MARK_LA || rel.r_type == R_LARCH_MARK_PCREL || rel.r_type == R_LARCH_ALIGN) continue; i64 removed_bytes = 0; i64 r_delta = 0; if (!deltas.empty()) { while (k < deltas.size() && deltas[k].offset < rel.r_offset) k++; if (k < deltas.size() && deltas[k].offset == rel.r_offset) removed_bytes = get_removed_bytes(deltas, k); if (k > 0) r_delta = deltas[k - 1].delta; } Symbol &sym = *file.symbols[rel.r_sym]; i64 r_offset = rel.r_offset - r_delta; u8 *loc = base + r_offset; // Unlike other psABIs, the LoongArch ABI uses the same relocation // types to refer to GOT entries for thread-local symbols and regular // ones. Therefore, G may refer to a TLSGD or a regular GOT slot // depending on the symbol type. // // Note that even though LoongArch defines relocations for TLSLD, TLSLD // is not actually supported on it. GCC and LLVM emit identical machine // code for -ftls-model=global-dynamic and -ftls-model=local-dynamic, // and we need to handle TLSLD relocations as equivalent to TLSGD // relocations. This is clearly a compiler bug, but it's too late to // fix. The only way to fix it would be to define a new set of // relocations for true TLSLD and deprecate the current ones. But it // appears that migrating to TLSDESC is a better choice, so it's // unlikely to happen. i64 got_idx = sym.has_tlsgd(ctx) ? 
sym.get_tlsgd_idx(ctx) : sym.get_got_idx(ctx); u64 S = sym.get_addr(ctx); u64 A = rel.r_addend; u64 P = get_addr() + r_offset; u64 G = got_idx * sizeof(Word); u64 GOT = ctx.got->shdr.sh_addr; auto check = [&](i64 val, i64 lo, i64 hi) { check_range(ctx, i, val, lo, hi); }; auto check_branch = [&](i64 val, i64 lo, i64 hi) { check(val, lo, hi); if (val & 0b11) Error(ctx) << *this << ": misaligned symbol " << sym << " for relocation " << rel; }; switch (rel.r_type) { case R_LARCH_32: assert(E::is_64); *(ul32 *)loc = S + A; break; case R_LARCH_B16: check_branch(S + A - P, -(1 << 17), 1 << 17); write_k16(loc, (S + A - P) >> 2); break; case R_LARCH_B21: check_branch(S + A - P, -(1 << 22), 1 << 22); write_d5k16(loc, (S + A - P) >> 2); break; case R_LARCH_B26: check_branch(S + A - P, -(1 << 27), 1 << 27); write_d10k16(loc, (S + A - P) >> 2); break; case R_LARCH_ABS_LO12: write_k12(loc, S + A); break; case R_LARCH_ABS_HI20: write_j20(loc, (S + A) >> 12); break; case R_LARCH_ABS64_LO20: write_j20(loc, (S + A) >> 32); break; case R_LARCH_ABS64_HI12: write_k12(loc, (S + A) >> 52); break; case R_LARCH_PCALA_LO12: // It looks like R_LARCH_PCALA_LO12 is sometimes used for JIRL even // though the instruction takes a 16 bit immediate rather than 12 bits. // It is contrary to the psABI document, but GNU ld has special // code to handle it, so we accept it too. 
if ((*(ul32 *)loc & 0xfc00'0000) == 0x4c00'0000) write_k16(loc, sign_extend(S + A, 12) >> 2); else write_k12(loc, S + A); break; case R_LARCH_PCALA_HI20: if (removed_bytes == 0) { write_j20(loc, hi20(S + A, P)); } else { // Rewrite pcalau12i + addi.d with pcaddi assert(removed_bytes == 4); *(ul32 *)loc = 0x1800'0000 | get_rd(*(ul32 *)loc); // pcaddi write_j20(loc, (S + A - P) >> 2); i += 3; } break; case R_LARCH_PCALA64_LO20: write_j20(loc, higher20(S + A, P)); break; case R_LARCH_PCALA64_HI12: write_k12(loc, highest12(S + A, P)); break; case R_LARCH_GOT_PC_LO12: write_k12(loc, GOT + G + A); break; case R_LARCH_GOT_PC_HI20: if (removed_bytes == 0) { // If the PC-relative symbol address is known at link-time, we can // rewrite the following GOT load // // pcalau12i $t0, 0 # R_LARCH_GOT_PC_HI20 // ld.d $t0, $t0, 0 # R_LARCH_GOT_PC_LO12 // // with the following address materialization // // pcalau12i $t0, 0 // addi.d $t0, $t0, 0 if (is_relaxable_got_load(ctx, *this, i)) { i64 dist = compute_distance(ctx, sym, *this, rel); if (is_int(dist, 32)) { u32 rd = get_rd(*(ul32 *)loc); *(ul32 *)(loc + 4) = 0x02c0'0000 | (rd << 5) | rd; // addi.d write_j20(loc, hi20(S + A, P)); write_k12(loc + 4, S + A); i += 3; break; } } write_j20(loc, hi20(GOT + G + A, P)); } else { // Rewrite pcalau12i + ld.d with pcaddi assert(removed_bytes == 4); *(ul32 *)loc = 0x1800'0000 | get_rd(*(ul32 *)loc); // pcaddi write_j20(loc, (S + A - P) >> 2); i += 3; } break; case R_LARCH_GOT64_PC_LO20: write_j20(loc, higher20(GOT + G + A, P)); break; case R_LARCH_GOT64_PC_HI12: write_k12(loc, highest12(GOT + G + A, P)); break; case R_LARCH_GOT_LO12: write_k12(loc, GOT + G + A); break; case R_LARCH_GOT_HI20: write_j20(loc, (GOT + G + A) >> 12); break; case R_LARCH_GOT64_LO20: write_j20(loc, (GOT + G + A) >> 32); break; case R_LARCH_GOT64_HI12: write_k12(loc, (GOT + G + A) >> 52); break; case R_LARCH_TLS_LE_LO12: write_k12(loc, S + A - ctx.tp_addr); break; case R_LARCH_TLS_LE_HI20: write_j20(loc, (S + A - 
ctx.tp_addr) >> 12); break; case R_LARCH_TLS_LE64_LO20: write_j20(loc, (S + A - ctx.tp_addr) >> 32); break; case R_LARCH_TLS_LE64_HI12: write_k12(loc, (S + A - ctx.tp_addr) >> 52); break; case R_LARCH_TLS_IE_PC_LO12: write_k12(loc, sym.get_gottp_addr(ctx) + A); break; case R_LARCH_TLS_IE_PC_HI20: write_j20(loc, hi20(sym.get_gottp_addr(ctx) + A, P)); break; case R_LARCH_TLS_IE64_PC_LO20: write_j20(loc, higher20(sym.get_gottp_addr(ctx) + A, P)); break; case R_LARCH_TLS_IE64_PC_HI12: write_k12(loc, highest12(sym.get_gottp_addr(ctx) + A, P)); break; case R_LARCH_TLS_IE_LO12: write_k12(loc, sym.get_gottp_addr(ctx) + A); break; case R_LARCH_TLS_IE_HI20: write_j20(loc, (sym.get_gottp_addr(ctx) + A) >> 12); break; case R_LARCH_TLS_IE64_LO20: write_j20(loc, (sym.get_gottp_addr(ctx) + A) >> 32); break; case R_LARCH_TLS_IE64_HI12: write_k12(loc, (sym.get_gottp_addr(ctx) + A) >> 52); break; case R_LARCH_TLS_GD_PC_HI20: case R_LARCH_TLS_LD_PC_HI20: check(sym.get_tlsgd_addr(ctx) + A - P, -(1LL << 31), 1LL << 31); write_j20(loc, hi20(sym.get_tlsgd_addr(ctx) + A, P)); break; case R_LARCH_TLS_GD_HI20: case R_LARCH_TLS_LD_HI20: write_j20(loc, (sym.get_tlsgd_addr(ctx) + A) >> 12); break; case R_LARCH_ADD6: *loc = (*loc & 0b1100'0000) | ((*loc + S + A) & 0b0011'1111); break; case R_LARCH_ADD8: *loc += S + A; break; case R_LARCH_ADD16: *(ul16 *)loc += S + A; break; case R_LARCH_ADD32: *(ul32 *)loc += S + A; break; case R_LARCH_ADD64: *(ul64 *)loc += S + A; break; case R_LARCH_SUB6: *loc = (*loc & 0b1100'0000) | ((*loc - S - A) & 0b0011'1111); break; case R_LARCH_SUB8: *loc -= S + A; break; case R_LARCH_SUB16: *(ul16 *)loc -= S + A; break; case R_LARCH_SUB32: *(ul32 *)loc -= S + A; break; case R_LARCH_SUB64: *(ul64 *)loc -= S + A; break; case R_LARCH_32_PCREL: check(S + A - P, -(1LL << 31), 1LL << 31); *(ul32 *)loc = S + A - P; break; case R_LARCH_64_PCREL: *(ul64 *)loc = S + A - P; break; case R_LARCH_CALL36: if (removed_bytes == 0) { i64 val = S + A - P; check_branch(val, -(1LL << 37) 
- 0x20000, (1LL << 37) - 0x20000); write_j20(loc, (val + 0x20000) >> 18); write_k16(loc + 4, val >> 2); } else { // Rewrite PCADDU18I + JIRL to B or BL assert(removed_bytes == 4); if (get_rd(*(ul32 *)(buf + rel.r_offset + 4)) == 0) *(ul32 *)loc = 0x5000'0000; // B else *(ul32 *)loc = 0x5400'0000; // BL write_d10k16(loc, (S + A - P) >> 2); } break; case R_LARCH_ADD_ULEB128: overwrite_uleb(loc, read_uleb(loc) + S + A); break; case R_LARCH_SUB_ULEB128: overwrite_uleb(loc, read_uleb(loc) - S - A); break; case R_LARCH_TLS_DESC_PC_HI20: // LoongArch TLSDESC uses the following code sequence to materialize // a TP-relative address in a0. // // pcalau12i $a0, 0 // R_LARCH_TLS_DESC_PC_HI20 foo // addi.[dw] $a0, $a0, 0 // R_LARCH_TLS_DESC_PC_LO12 foo // ld.d $ra, $a0, 0 // R_LARCH_TLS_DESC_LD foo // jirl $ra, $ra, 0 // R_LARCH_TLS_DESC_CALL foo // // We may relax the instructions to the following if its TP-relative // address is known at link-time // // // // lu12i.w $a0, foo@TPOFF // addi.w $a0, $a0, foo@TPOFF // // or to the following if the TP offset is small enough. // // // // // ori $a0, $zero, foo@TPOFF // // If the TP-relative address is known at process startup time, we // may relax the instructions to the following. // // // // pcalau12i $a0, foo@GOTTP // ld.[dw] $a0, $a0, foo@GOTTP // // If we don't know anything about the symbol, we can still relax // the first two instructions to a single pcaddi as shown below. 
// // // pcaddi $a0, foo@GOTDESC // ld.d $ra, $a0, 0 // jirl $ra, $ra, 0 if (sym.has_tlsdesc(ctx) && removed_bytes == 0) write_j20(loc, hi20(sym.get_tlsdesc_addr(ctx) + A, P)); break; case R_LARCH_TLS_DESC_PC_LO12: if (sym.has_tlsdesc(ctx) && removed_bytes == 0) { i64 dist = sym.get_tlsdesc_addr(ctx) + A - P; if (is_int(dist, 22)) { *(ul32 *)loc = 0x1800'0000 | get_rd(*(ul32 *)loc); // pcaddi write_j20(loc, dist >> 2); } else { write_k12(loc, sym.get_tlsdesc_addr(ctx) + A); } } break; case R_LARCH_TLS_DESC_LD: if (sym.has_tlsdesc(ctx) || removed_bytes == 4) { // Do nothing } else if (sym.has_gottp(ctx)) { *(ul32 *)loc = 0x1a00'0004; // pcalau12i $a0, 0 write_j20(loc, hi20(sym.get_gottp_addr(ctx) + A, P)); } else { *(ul32 *)loc = 0x1400'0004; // lu12i.w $a0, 0 write_j20(loc, (S + A + 0x800 - ctx.tp_addr) >> 12); } break; case R_LARCH_TLS_DESC_CALL: if (sym.has_tlsdesc(ctx)) { // Do nothing } else if (sym.has_gottp(ctx)) { if (E::is_64) *(ul32 *)loc = 0x28c0'0084; // ld.d $a0, $a0, 0 else *(ul32 *)loc = 0x2880'0084; // ld.w $a0, $a0, 0 write_k12(loc, sym.get_gottp_addr(ctx) + A); } else { i64 val = S + A - ctx.tp_addr; if (0 <= val && val < 0x1000) *(ul32 *)loc = 0x0380'0004; // ori $a0, $zero, 0 else *(ul32 *)loc = 0x0280'0084; // addi.w $a0, $a0, 0 write_k12(loc, val); } break; case R_LARCH_TLS_LE_HI20_R: if (removed_bytes == 0) write_j20(loc, (S + A + 0x800 - ctx.tp_addr) >> 12); break; case R_LARCH_TLS_LE_LO12_R: { i64 val = S + A - ctx.tp_addr; write_k12(loc, val); // Rewrite `addi.d $t0, $t0, ` with `addi.d $t0, $tp, ` // if the offset is directly accessible using tp. tp is r2. 
if (is_int(val, 12)) set_rj(loc, 2); break; } case R_LARCH_64: case R_LARCH_TLS_LE_ADD_R: break; default: unreachable(); } } } template <> void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = base + rel.r_offset; if (!sym.file) { record_undef_error(ctx, rel); continue; } SectionFragment *frag; i64 frag_addend; std::tie(frag, frag_addend) = get_fragment(ctx, rel); u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx); u64 A = frag ? frag_addend : (i64)rel.r_addend; switch (rel.r_type) { case R_LARCH_32: *(ul32 *)loc = S + A; break; case R_LARCH_64: if (std::optional val = get_tombstone(sym, frag)) *(ul64 *)loc = *val; else *(ul64 *)loc = S + A; break; case R_LARCH_ADD6: *loc = (*loc & 0b1100'0000) | ((*loc + S + A) & 0b0011'1111); break; case R_LARCH_ADD8: *loc += S + A; break; case R_LARCH_ADD16: *(ul16 *)loc += S + A; break; case R_LARCH_ADD32: *(ul32 *)loc += S + A; break; case R_LARCH_ADD64: *(ul64 *)loc += S + A; break; case R_LARCH_SUB6: *loc = (*loc & 0b1100'0000) | ((*loc - S - A) & 0b0011'1111); break; case R_LARCH_SUB8: *loc -= S + A; break; case R_LARCH_SUB16: *(ul16 *)loc -= S + A; break; case R_LARCH_SUB32: *(ul32 *)loc -= S + A; break; case R_LARCH_SUB64: *(ul64 *)loc -= S + A; break; case R_LARCH_TLS_DTPREL32: if (std::optional val = get_tombstone(sym, frag)) *(ul32 *)loc = *val; else *(ul32 *)loc = S + A - ctx.dtp_addr; break; case R_LARCH_TLS_DTPREL64: if (std::optional val = get_tombstone(sym, frag)) *(ul64 *)loc = *val; else *(ul64 *)loc = S + A - ctx.dtp_addr; break; case R_LARCH_ADD_ULEB128: overwrite_uleb(loc, read_uleb(loc) + S + A); break; case R_LARCH_SUB_ULEB128: overwrite_uleb(loc, read_uleb(loc) - S - A); break; default: Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: " << rel; break; } } } template <> void 
InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); std::span> rels = get_rels(ctx); // Scan relocations for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || rel.r_type == R_LARCH_RELAX || rel.r_type == R_LARCH_MARK_LA || rel.r_type == R_LARCH_MARK_PCREL || rel.r_type == R_LARCH_ALIGN) continue; if (record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; if (sym.is_ifunc()) sym.flags |= NEEDS_GOT | NEEDS_PLT; switch (rel.r_type) { case R_LARCH_32: if constexpr (E::is_64) scan_absrel(ctx, sym, rel); break; case R_LARCH_B26: case R_LARCH_PCALA_HI20: case R_LARCH_CALL36: if (sym.is_imported) sym.flags |= NEEDS_PLT; break; case R_LARCH_GOT_HI20: case R_LARCH_GOT_PC_HI20: sym.flags |= NEEDS_GOT; break; case R_LARCH_TLS_IE_HI20: case R_LARCH_TLS_IE_PC_HI20: sym.flags |= NEEDS_GOTTP; break; case R_LARCH_TLS_GD_PC_HI20: case R_LARCH_TLS_LD_PC_HI20: case R_LARCH_TLS_GD_HI20: case R_LARCH_TLS_LD_HI20: sym.flags |= NEEDS_TLSGD; break; case R_LARCH_32_PCREL: case R_LARCH_64_PCREL: scan_pcrel(ctx, sym, rel); break; case R_LARCH_TLS_LE_HI20: case R_LARCH_TLS_LE_LO12: case R_LARCH_TLS_LE64_LO20: case R_LARCH_TLS_LE64_HI12: case R_LARCH_TLS_LE_HI20_R: case R_LARCH_TLS_LE_LO12_R: check_tlsle(ctx, sym, rel); break; case R_LARCH_TLS_DESC_CALL: scan_tlsdesc(ctx, sym); break; case R_LARCH_64: case R_LARCH_B16: case R_LARCH_B21: case R_LARCH_ABS_HI20: case R_LARCH_ABS_LO12: case R_LARCH_ABS64_LO20: case R_LARCH_ABS64_HI12: case R_LARCH_PCALA_LO12: case R_LARCH_PCALA64_LO20: case R_LARCH_PCALA64_HI12: case R_LARCH_GOT_PC_LO12: case R_LARCH_GOT64_PC_LO20: case R_LARCH_GOT64_PC_HI12: case R_LARCH_GOT_LO12: case R_LARCH_GOT64_LO20: case R_LARCH_GOT64_HI12: case R_LARCH_TLS_IE_PC_LO12: case R_LARCH_TLS_IE64_PC_LO20: case R_LARCH_TLS_IE64_PC_HI12: case R_LARCH_TLS_IE_LO12: case R_LARCH_TLS_IE64_LO20: case R_LARCH_TLS_IE64_HI12: case R_LARCH_ADD6: case R_LARCH_SUB6: case R_LARCH_ADD8: case 
R_LARCH_SUB8: case R_LARCH_ADD16: case R_LARCH_SUB16: case R_LARCH_ADD32: case R_LARCH_SUB32: case R_LARCH_ADD64: case R_LARCH_SUB64: case R_LARCH_ADD_ULEB128: case R_LARCH_SUB_ULEB128: case R_LARCH_TLS_DESC_PC_HI20: case R_LARCH_TLS_DESC_PC_LO12: case R_LARCH_TLS_DESC_LD: case R_LARCH_TLS_LE_ADD_R: break; default: Error(ctx) << *this << ": unknown relocation: " << rel; } } } template <> void shrink_section(Context &ctx, InputSection &isec) { std::span> rels = isec.get_rels(ctx); std::vector &deltas = isec.extra.r_deltas; i64 r_delta = 0; u8 *buf = (u8 *)isec.contents.data(); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &r = rels[i]; Symbol &sym = *isec.file.symbols[r.r_sym]; auto remove = [&](i64 d) { r_delta += d; deltas.push_back(RelocDelta{r.r_offset, r_delta}); }; // A R_LARCH_ALIGN relocation refers to the beginning of a nop // sequence. We need to remove some or all of them so that the // instruction that immediately follows that is aligned to a specified // boundary. To allow that, a R_LARCH_ALIGN relocation that requests // 2^n alignment refers to 2^n - 4 bytes of nop instructions. if (r.r_type == R_LARCH_ALIGN) { // The actual rule for storing the alignment size is a bit weird. // In particular, the most significant 56 bits of r_addend is // sometimes used to store the upper limit of the alignment, // allowing the instruction that follows nops _not_ to be aligned at // all. I think that's a spec bug, so we don't want to support that. 
i64 alignment; if (r.r_sym) { if (r.r_addend >> 8) Fatal(ctx) << isec << ": ternary R_LARCH_ALIGN is not supported: " << i; alignment = 1 << r.r_addend; } else { if (!has_single_bit(r.r_addend + 4)) Fatal(ctx) << isec << ": R_LARCH_ALIGN: invalid alignment requirement: " << i; alignment = r.r_addend + 4; } u64 P = isec.get_addr() + r.r_offset - r_delta; u64 desired = align_to(P, alignment); u64 actual = P + alignment - 4; if (desired != actual) remove(actual - desired); continue; } // Handling other relocations is optional. if (!ctx.arg.relax || i == rels.size() - 1 || rels[i + 1].r_type != R_LARCH_RELAX) continue; // Skip linker-synthesized symbols because their final addresses // are not fixed yet. if (sym.file == ctx.internal_obj) continue; switch (r.r_type) { case R_LARCH_TLS_LE_HI20_R: case R_LARCH_TLS_LE_ADD_R: // LoongArch uses the following three instructions to access // TP ± 2 GiB. // // lu12i.w $t0, 0 # R_LARCH_TLS_LE_HI20_R // add.d $t0, $t0, $tp # R_LARCH_TLS_LE_ADD_R // addi.d $t0, $t0, 0 # R_LARCH_TLS_LE_LO12_R // // If the thread-local variable is within TP ± 2 KiB, we can // relax them into the following single instruction. // // addi.d $t0, $tp, if (i64 val = sym.get_addr(ctx) + r.r_addend - ctx.tp_addr; is_int(val, 12)) remove(4); break; case R_LARCH_PCALA_HI20: // The following two instructions are used to materialize a // PC-relative address with a 32 bit displacement. // // pcalau12i $t0, 0 # R_LARCH_PCALA_HI20 // addi.d $t0, $t0, 0 # R_LARCH_PCALA_LO12 // // If the displacement is within ±2 MiB, we can relax them to // the following instruction. 
// // pcaddi $t0, if (i + 3 < rels.size() && rels[i + 2].r_type == R_LARCH_PCALA_LO12 && rels[i + 2].r_offset == rels[i].r_offset + 4 && rels[i + 3].r_type == R_LARCH_RELAX) { i64 dist = compute_distance(ctx, sym, isec, r); u32 insn1 = *(ul32 *)(buf + rels[i].r_offset); u32 insn2 = *(ul32 *)(buf + rels[i].r_offset + 4); bool is_addi_d = (insn2 & 0xffc0'0000) == 0x02c0'0000; if ((dist & 0b11) == 0 && is_int(dist, 22) && is_addi_d && get_rd(insn1) == get_rd(insn2) && get_rd(insn2) == get_rj(insn2)) remove(4); } break; case R_LARCH_CALL36: // A CALL36 relocation referes to the following instruction pair // to jump to PC ± 128 GiB. // // pcaddu18i $t0, 0 # R_LARCH_CALL36 // jirl $zero/$ra, $t0, 0 // // If the displacement is PC ± 128 MiB, we can use B or BL instead. // Note that $zero is $r0 and $ra is $r1. if (i64 dist = compute_distance(ctx, sym, isec, r); is_int(dist, 28)) if (u32 jirl = *(ul32 *)(buf + rels[i].r_offset + 4); get_rd(jirl) == 0 || get_rd(jirl) == 1) remove(4); break; case R_LARCH_GOT_PC_HI20: // The following two instructions are used to load a symbol address // from the GOT. // // pcalau12i $t0, 0 # R_LARCH_GOT_PC_HI20 // ld.d $t0, $t0, 0 # R_LARCH_GOT_PC_LO12 // // If the PC-relative symbol address is known at link-time, we can // relax them to the following instruction. 
// // pcaddi $t0, if (is_relaxable_got_load(ctx, isec, i)) if (i64 dist = compute_distance(ctx, sym, isec, r); is_int(dist, 22)) remove(4); break; case R_LARCH_TLS_DESC_PC_HI20: if (sym.has_tlsdesc(ctx)) { u64 P = isec.get_addr() + r.r_offset; i64 dist = sym.get_tlsdesc_addr(ctx) + r.r_addend - P; if (is_int(dist, 22)) remove(4); } else { remove(4); } break; case R_LARCH_TLS_DESC_PC_LO12: if (!sym.has_tlsdesc(ctx)) remove(4); break; case R_LARCH_TLS_DESC_LD: if (!sym.has_tlsdesc(ctx) && !sym.has_gottp(ctx)) if (i64 val = sym.get_addr(ctx) + r.r_addend - ctx.tp_addr; 0 <= val && val < 0x1000) remove(4); break; } } isec.sh_size -= r_delta; } } // namespace mold #endif ================================================ FILE: src/arch-m68k.cc ================================================ // This file contains code for the Motorola 68000 series microprocessors, // which is often abbreviated as m68k. Running a Unix-like system on a // m68k-based machine today is a retro-computing hobby activity, but the // processor was a popular choice to build Unix computers during '80s. // Early Sun workstations for example used m68k. Macintosh until 1994 were // based on m68k as well until they switched to PowerPC (and then to x86 // and to ARM.) // // From the linker's point of view, it is not hard to support m68k. It's // just a 32-bit big-endian CISC ISA. Compared to comtemporary i386, // m68k's psABI is actually simpler because m68k has PC-relative memory // access instructions and therefore can support position-independent // code without too much hassle. 
// // https://github.com/rui314/psabi/blob/main/m68k.pdf #if MOLD_M68K #include "mold.h" namespace mold { using E = M68K; template <> void write_plt_header(Context &ctx, u8 *buf) { static const u8 insn[] = { 0x2f, 0x00, // move.l %d0, -(%sp) 0x2f, 0x3b, 0x01, 0x70, 0, 0, 0, 0, // move.l (GOTPLT+4, %pc), -(%sp) 0x4e, 0xfb, 0x01, 0x71, 0, 0, 0, 0, // jmp ([GOTPLT+8, %pc]) }; memcpy(buf, insn, sizeof(insn)); *(ub32 *)(buf + 6) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr; *(ub32 *)(buf + 14) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 4; } template <> void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { static const u8 insn[] = { 0x20, 0x3c, 0, 0, 0, 0, // move.l PLT_OFFSET, %d0 0x4e, 0xfb, 0x01, 0x71, 0, 0, 0, 0, // jmp ([GOTPLT_ENTRY, %pc]) }; memcpy(buf, insn, sizeof(insn)); *(ub32 *)(buf + 2) = sym.get_plt_idx(ctx) * sizeof(ElfRel); *(ub32 *)(buf + 10) = sym.get_gotplt_addr(ctx) - sym.get_plt_addr(ctx) - 8; } template <> void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { static const u8 insn[] = { 0x4e, 0xfb, 0x01, 0x71, 0, 0, 0, 0, // jmp ([GOT_ENTRY, %pc]) }; memcpy(buf, insn, sizeof(insn)); *(ub32 *)(buf + 4) = sym.get_got_pltgot_addr(ctx) - sym.get_plt_addr(ctx) - 2; } template <> void EhFrameSection::apply_eh_reloc(Context &ctx, const ElfRel &rel, u64 offset, u64 val) { u8 *loc = ctx.buf + this->shdr.sh_offset + offset; switch (rel.r_type) { case R_NONE: break; case R_68K_32: *(ub32 *)loc = val; break; case R_68K_PC32: *(ub32 *)loc = val - this->shdr.sh_addr - offset; break; default: Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel; } } template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = base + rel.r_offset; u64 S = sym.get_addr(ctx); u64 A = rel.r_addend; u64 P = get_addr() + rel.r_offset; u64 G = 
sym.get_got_idx(ctx) * sizeof(Word); u64 GOT = ctx.got->shdr.sh_addr; auto check = [&](i64 val, i64 lo, i64 hi) { check_range(ctx, i, val, lo, hi); }; auto write16 = [&](u64 val) { check(val, 0, 1 << 16); *(ub16 *)loc = val; }; auto write16s = [&](u64 val) { check(val, -(1 << 15), 1 << 15); *(ub16 *)loc = val; }; auto write8 = [&](u64 val) { check(val, 0, 1 << 8); *loc = val; }; auto write8s = [&](u64 val) { check(val, -(1 << 7), 1 << 7); *loc = val; }; switch (rel.r_type) { case R_68K_32: break; case R_68K_16: write16(S + A); break; case R_68K_8: write8(S + A); break; case R_68K_PC32: case R_68K_PLT32: *(ub32 *)loc = S + A - P; break; case R_68K_PC16: case R_68K_PLT16: write16s(S + A - P); break; case R_68K_PC8: case R_68K_PLT8: write8s(S + A - P); break; case R_68K_GOTPCREL32: *(ub32 *)loc = GOT + A - P; break; case R_68K_GOTPCREL16: write16s(GOT + A - P); break; case R_68K_GOTPCREL8: write8s(GOT + A - P); break; case R_68K_GOTOFF32: *(ub32 *)loc = G + A; break; case R_68K_GOTOFF16: write16(G + A); break; case R_68K_GOTOFF8: write8(G + A); break; case R_68K_TLS_GD32: *(ub32 *)loc = sym.get_tlsgd_addr(ctx) + A - GOT; break; case R_68K_TLS_GD16: write16(sym.get_tlsgd_addr(ctx) + A - GOT); break; case R_68K_TLS_GD8: write8(sym.get_tlsgd_addr(ctx) + A - GOT); break; case R_68K_TLS_LDM32: *(ub32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT; break; case R_68K_TLS_LDM16: write16(ctx.got->get_tlsld_addr(ctx) + A - GOT); break; case R_68K_TLS_LDM8: write8(ctx.got->get_tlsld_addr(ctx) + A - GOT); break; case R_68K_TLS_LDO32: *(ub32 *)loc = S + A - ctx.dtp_addr; break; case R_68K_TLS_LDO16: write16s(S + A - ctx.dtp_addr); break; case R_68K_TLS_LDO8: write8s(S + A - ctx.dtp_addr); break; case R_68K_TLS_IE32: *(ub32 *)loc = sym.get_gottp_addr(ctx) + A - GOT; break; case R_68K_TLS_IE16: write16(sym.get_gottp_addr(ctx) + A - GOT); break; case R_68K_TLS_IE8: write8(sym.get_gottp_addr(ctx) + A - GOT); break; case R_68K_TLS_LE32: *(ub32 *)loc = S + A - ctx.tp_addr; break; case 
R_68K_TLS_LE16: write16(S + A - ctx.tp_addr); break; case R_68K_TLS_LE8: write8(S + A - ctx.tp_addr); break; default: unreachable(); } } } template <> void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = base + rel.r_offset; SectionFragment *frag; i64 frag_addend; std::tie(frag, frag_addend) = get_fragment(ctx, rel); u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx); u64 A = frag ? frag_addend : (i64)rel.r_addend; switch (rel.r_type) { case R_68K_32: if (std::optional val = get_tombstone(sym, frag)) *(ub32 *)loc = *val; else *(ub32 *)loc = S + A; break; default: Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: " << rel; } } } template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; if (sym.is_ifunc()) Error(ctx) << sym << ": GNU ifunc symbol is not supported on m68k"; switch (rel.r_type) { case R_68K_16: case R_68K_8: scan_absrel(ctx, sym, rel); break; case R_68K_PC32: case R_68K_PC16: case R_68K_PC8: scan_pcrel(ctx, sym, rel); break; case R_68K_GOTPCREL32: case R_68K_GOTPCREL16: case R_68K_GOTPCREL8: case R_68K_GOTOFF32: case R_68K_GOTOFF16: case R_68K_GOTOFF8: sym.flags |= NEEDS_GOT; break; case R_68K_PLT32: case R_68K_PLT16: case R_68K_PLT8: if (sym.is_imported) sym.flags |= NEEDS_PLT; break; case R_68K_TLS_GD32: case R_68K_TLS_GD16: case R_68K_TLS_GD8: sym.flags |= NEEDS_TLSGD; break; case R_68K_TLS_LDM32: case R_68K_TLS_LDM16: case R_68K_TLS_LDM8: ctx.needs_tlsld = true; break; case R_68K_TLS_IE32: case R_68K_TLS_IE16: case R_68K_TLS_IE8: sym.flags |= 
NEEDS_GOTTP; break; case R_68K_TLS_LE32: case R_68K_TLS_LE16: case R_68K_TLS_LE8: check_tlsle(ctx, sym, rel); break; case R_68K_32: case R_68K_TLS_LDO32: case R_68K_TLS_LDO16: case R_68K_TLS_LDO8: break; default: Error(ctx) << *this << ": unknown relocation: " << rel; } } } } // namespace mold #endif ================================================ FILE: src/arch-ppc32.cc ================================================ // This file implements the PowerPC 32-bit ISA. For 64-bit PowerPC, see // arch-ppc64v1.cpp and arch-ppc64v2.cpp. // // PPC32 is a RISC ISA. It has 32 general-purpose registers (GPRs). // r0, r11 and r12 are reserved for static linkers, so we can use these // registers in PLTs and range extension thunks. In addition to that, it // has a few special registers. Notable ones are LR which holds a return // address and CTR which we can use to store a branch target address. // // It feels that the PPC32 psABI is unnecessarily complicated at first // glance, but that is mainly stemmed from the fact that the ISA lacks // PC-relative load/store instructions. Since machine instructions cannot // load data relative to its own address, it is not straightforward to // support position-independent code (PIC) on PPC32. // // A position-independent function typically contains the following code // in the prologue to obtain its own address: // // mflr r0 // save the current return address to %r0 // bcl 20, 31, 4 // call the next instruction as if it were a function // mtlr r12 // save the return address to %r12 // mtlr r0 // restore the original return address // // An object file compiled with -fPIC contains a data section named // `.got2` to store addresses of locally-defined global variables and // constants. A PIC function usually computes its .got2+0x8000 and set it // to %r30. 
This scheme allows the function to access global objects // defined in the same input file with a single %r30-relative load/store // instruction with a 16-bit offset, given that .got2 is smaller than // 0x10000 (or 65536) bytes. // // Since each object file has its own .got2, %r30 refers to different // places in a merged .got2 for two functions that came from different // input files. Therefore, %r30 makes sense only within a single function. // // Technically, we can reuse a %r30 value in our PLT if we create a PLT // _for each input file_ (that's what GNU ld seems to be doing), but that // doesn't seems to be worth its complexity. Our PLT simply doesn't rely // on a %r30 value. // // https://github.com/rui314/psabi/blob/main/ppc32.pdf #if MOLD_PPC32 #include "mold.h" namespace mold { using E = PPC32; static u64 lo(u64 x) { return x & 0xffff; } static u64 hi(u64 x) { return x >> 16; } static u64 ha(u64 x) { return (x + 0x8000) >> 16; } static u64 high(u64 x) { return (x >> 16) & 0xffff; } static u64 higha(u64 x) { return ((x + 0x8000) >> 16) & 0xffff; } template <> void write_plt_header(Context &ctx, u8 *buf) { constexpr ub32 insn[] = { // Get the address of this PLT section 0x7c08'02a6, // mflr r0 0x429f'0005, // bcl 20, 31, 4 0x7d88'02a6, // 1: mflr r12 0x7c08'03a6, // mtlr r0 // Compute the runtime address of GOTPLT+12 0x3d8c'0000, // addis r12, r12, (GOTPLT - 1b)@higha 0x398c'0000, // addi r12, r12, (GOTPLT - 1b)@lo // Compute the PLT entry offset 0x7d6c'5850, // sub r11, r11, r12 0x1d6b'0003, // mulli r11, r11, 3 // Load GOTPLT[2] and branch to GOTPLT[1] 0x800c'fff8, // lwz r0, -8(r12) 0x7c09'03a6, // mtctr r0 0x818c'fffc, // lwz r12, -4(r12) 0x4e80'0420, // bctr 0x6000'0000, // nop 0x6000'0000, // nop 0x6000'0000, // nop 0x6000'0000, // nop }; static_assert(sizeof(insn) == E::plt_hdr_size); memcpy(buf, insn, sizeof(insn)); ub32 *loc = (ub32 *)buf; loc[4] |= higha(ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr + 4); loc[5] |= lo(ctx.gotplt->shdr.sh_addr - 
ctx.plt->shdr.sh_addr + 4); } static constexpr ub32 plt_entry[] = { // Get the address of this PLT entry 0x7c08'02a6, // mflr r0 0x429f'0005, // bcl 20, 31, 4 0x7d88'02a6, // mflr r12 0x7c08'03a6, // mtlr r0 // Load an address from the GOT/GOTPLT entry and jump to that address 0x3d6c'0000, // addis r11, r12, OFFSET@higha 0x396b'0000, // addi r11, r11, OFFSET@lo 0x818b'0000, // lwz r12, 0(r11) 0x7d89'03a6, // mtctr r12 0x4e80'0420, // bctr }; template <> void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { static_assert(E::plt_size == sizeof(plt_entry)); memcpy(buf, plt_entry, sizeof(plt_entry)); ub32 *loc = (ub32 *)buf; i64 offset = sym.get_gotplt_addr(ctx) - sym.get_plt_addr(ctx) - 8; loc[4] |= higha(offset); loc[5] |= lo(offset); } template <> void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { static_assert(E::pltgot_size == sizeof(plt_entry)); memcpy(buf, plt_entry, sizeof(plt_entry)); ub32 *loc = (ub32 *)buf; i64 offset = sym.get_got_pltgot_addr(ctx) - sym.get_plt_addr(ctx) - 8; loc[4] |= higha(offset); loc[5] |= lo(offset); } template <> void EhFrameSection::apply_eh_reloc(Context &ctx, const ElfRel &rel, u64 offset, u64 val) { u8 *loc = ctx.buf + this->shdr.sh_offset + offset; switch (rel.r_type) { case R_NONE: break; case R_PPC_ADDR32: *(ub32 *)loc = val; break; case R_PPC_REL32: *(ub32 *)loc = val - this->shdr.sh_addr - offset; break; default: Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel; } } template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); u64 GOT2 = file.extra.got2 ? 
file.extra.got2->get_addr() : 0; for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = base + rel.r_offset; u64 S = sym.get_addr(ctx); u64 A = rel.r_addend; u64 P = get_addr() + rel.r_offset; u64 G = sym.get_got_idx(ctx) * sizeof(Word); u64 GOT = ctx.got->shdr.sh_addr; switch (rel.r_type) { case R_PPC_ADDR14: *(ub32 *)loc |= bits(S + A, 15, 2) << 2; break; case R_PPC_ADDR16: case R_PPC_UADDR16: case R_PPC_ADDR16_LO: *(ub16 *)loc = lo(S + A); break; case R_PPC_ADDR16_HI: *(ub16 *)loc = hi(S + A); break; case R_PPC_ADDR16_HA: *(ub16 *)loc = ha(S + A); break; case R_PPC_ADDR24: *(ub32 *)loc |= bits(S + A, 25, 2) << 2; break; case R_PPC_ADDR30: *(ub32 *)loc |= bits(S + A, 31, 2) << 2; break; case R_PPC_PLT16_LO: *(ub16 *)loc = lo(G + GOT - A - GOT2); break; case R_PPC_PLT16_HI: *(ub16 *)loc = hi(G + GOT - A - GOT2); break; case R_PPC_PLT16_HA: *(ub16 *)loc = ha(G + GOT - A - GOT2); break; case R_PPC_PLT32: *(ub32 *)loc = G + GOT - A - GOT2; break; case R_PPC_REL14: *(ub32 *)loc |= bits(S + A - P, 15, 2) << 2; break; case R_PPC_REL16: case R_PPC_REL16_LO: *(ub16 *)loc = lo(S + A - P); break; case R_PPC_REL16_HI: *(ub16 *)loc = hi(S + A - P); break; case R_PPC_REL16_HA: *(ub16 *)loc = ha(S + A - P); break; case R_PPC_REL24: case R_PPC_LOCAL24PC: { i64 val = S + A - P; if (!is_int(val, 26)) val = sym.get_thunk_addr(ctx, P) - P; *(ub32 *)loc |= bits(val, 25, 2) << 2; break; } case R_PPC_PLTREL24: { i64 val = S - P; if (sym.has_plt(ctx) || !is_int(val, 26)) val = sym.get_thunk_addr(ctx, P) - P; *(ub32 *)loc |= bits(val, 25, 2) << 2; break; } case R_PPC_REL32: case R_PPC_PLTREL32: *(ub32 *)loc = S + A - P; break; case R_PPC_GOT16: case R_PPC_GOT16_LO: *(ub16 *)loc = lo(G + A); break; case R_PPC_GOT16_HI: *(ub16 *)loc = hi(G + A); break; case R_PPC_GOT16_HA: *(ub16 *)loc = ha(G + A); break; case R_PPC_TPREL16_LO: *(ub16 *)loc = lo(S + A - ctx.tp_addr); break; case 
R_PPC_TPREL16_HI: *(ub16 *)loc = hi(S + A - ctx.tp_addr); break; case R_PPC_TPREL16_HA: *(ub16 *)loc = ha(S + A - ctx.tp_addr); break; case R_PPC_DTPREL16_LO: *(ub16 *)loc = lo(S + A - ctx.dtp_addr); break; case R_PPC_DTPREL16_HI: *(ub16 *)loc = hi(S + A - ctx.dtp_addr); break; case R_PPC_DTPREL16_HA: *(ub16 *)loc = ha(S + A - ctx.dtp_addr); break; case R_PPC_GOT_TLSGD16: *(ub16 *)loc = sym.get_tlsgd_addr(ctx) - GOT; break; case R_PPC_GOT_TLSLD16: *(ub16 *)loc = ctx.got->get_tlsld_addr(ctx) - GOT; break; case R_PPC_GOT_TPREL16: *(ub16 *)loc = sym.get_gottp_addr(ctx) - GOT; break; case R_PPC_ADDR32: case R_PPC_UADDR32: case R_PPC_TLS: case R_PPC_TLSGD: case R_PPC_TLSLD: case R_PPC_PLTSEQ: case R_PPC_PLTCALL: break; default: unreachable(); } } } template <> void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = base + rel.r_offset; SectionFragment *frag; i64 frag_addend; std::tie(frag, frag_addend) = get_fragment(ctx, rel); u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx); u64 A = frag ? 
frag_addend : (i64)rel.r_addend; switch (rel.r_type) { case R_PPC_ADDR32: if (std::optional val = get_tombstone(sym, frag)) *(ub32 *)loc = *val; else *(ub32 *)loc = S + A; break; case R_PPC_DTPREL32: if (std::optional val = get_tombstone(sym, frag)) *(ub32 *)loc = *val; else *(ub32 *)loc = S + A - ctx.dtp_addr; break; default: Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: " << rel; } } } template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); std::span> rels = get_rels(ctx); // Scan relocations for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; if (sym.is_ifunc()) sym.flags |= NEEDS_GOT | NEEDS_PLT; switch (rel.r_type) { case R_PPC_ADDR14: case R_PPC_ADDR16: case R_PPC_UADDR16: case R_PPC_ADDR16_LO: case R_PPC_ADDR16_HI: case R_PPC_ADDR16_HA: case R_PPC_ADDR24: case R_PPC_ADDR30: scan_absrel(ctx, sym, rel); break; case R_PPC_REL14: case R_PPC_REL16: case R_PPC_REL16_LO: case R_PPC_REL16_HI: case R_PPC_REL16_HA: case R_PPC_REL32: scan_pcrel(ctx, sym, rel); break; case R_PPC_GOT16: case R_PPC_GOT16_LO: case R_PPC_GOT16_HI: case R_PPC_GOT16_HA: case R_PPC_PLT16_LO: case R_PPC_PLT16_HI: case R_PPC_PLT16_HA: case R_PPC_PLT32: sym.flags |= NEEDS_GOT; break; case R_PPC_REL24: case R_PPC_PLTREL24: case R_PPC_PLTREL32: if (sym.is_imported) sym.flags |= NEEDS_PLT; break; case R_PPC_GOT_TLSGD16: sym.flags |= NEEDS_TLSGD; break; case R_PPC_GOT_TLSLD16: ctx.needs_tlsld = true; break; case R_PPC_GOT_TPREL16: sym.flags |= NEEDS_GOTTP; break; case R_PPC_TPREL16_LO: case R_PPC_TPREL16_HI: case R_PPC_TPREL16_HA: check_tlsle(ctx, sym, rel); break; case R_PPC_ADDR32: case R_PPC_UADDR32: case R_PPC_LOCAL24PC: case R_PPC_TLS: case R_PPC_TLSGD: case R_PPC_TLSLD: case R_PPC_DTPREL16_LO: case R_PPC_DTPREL16_HI: case R_PPC_DTPREL16_HA: case R_PPC_PLTSEQ: case R_PPC_PLTCALL: break; default: 
Error(ctx) << *this << ": unknown relocation: " << rel; } } } template <> void Thunk::copy_buf(Context &ctx) { constexpr ub32 local_thunk[] = { // Get this thunk's address 0x7c08'02a6, // mflr r0 0x429f'0005, // bcl 20, 31, 4 0x7d88'02a6, // mflr r12 0x7c08'03a6, // mtlr r0 // Materialize the destination's address in %r11 and jump to that address 0x3d6c'0000, // addis r11, r12, OFFSET@higha 0x396b'0000, // addi r11, r11, OFFSET@lo 0x7d69'03a6, // mtctr r11 0x4e80'0420, // bctr 0x6000'0000, // nop }; static_assert(E::thunk_size == sizeof(plt_entry)); static_assert(E::thunk_size == sizeof(local_thunk)); u8 *base = ctx.buf + output_section.shdr.sh_offset + offset; for (i64 i = 0; i < symbols.size(); i++) { Symbol &sym = *symbols[i]; u64 P = get_addr() + offsets[i]; u8 *buf = base + offsets[i]; if (sym.has_plt(ctx)) { u64 got = sym.has_got(ctx) ? sym.get_got_addr(ctx) : sym.get_gotplt_addr(ctx); i64 val = got - P - 8; memcpy(buf, plt_entry, sizeof(plt_entry)); *(ub32 *)(buf + 16) |= higha(val); *(ub32 *)(buf + 20) |= lo(val); } else { i64 val = sym.get_addr(ctx) - P - 8; memcpy(buf, local_thunk, sizeof(local_thunk)); *(ub32 *)(buf + 16) |= higha(val); *(ub32 *)(buf + 20) |= lo(val); } } } } // namespace mold #endif ================================================ FILE: src/arch-ppc64v1.cc ================================================ // This file contains code for the 64-bit PowerPC ELFv1 ABI that is // commonly used for big-endian PPC systems. Modern PPC systems that use // the processor in the little-endian mode use the ELFv2 ABI instead. For // ELFv2, see arch-ppc64v2.cc. // // Even though they are similiar, ELFv1 isn't only different from ELFv2 in // endianness. The most notable difference is, in ELFv1, a function // pointer doesn't directly refer to the entry point of a function but // instead refers to a data structure so-called "function descriptor". 
//
// The function descriptor is essentially a pair of a function entry point
// address and a value that should be set to %r2 before calling that
// function. There is also a third member for "the environment pointer for
// languages such as Pascal and PL/1" according to the psABI, but it looks
// like no one actually uses it. In total, the function descriptor is 24
// bytes long. Here is why we need it.
//
// PPC generally lacks PC-relative data access instructions. Position-
// independent code sets %r2 to GOT + 0x8000 and accesses global variables
// relative to %r2.
//
// Each ELF file has its own GOT. If a function calls another function in
// the same ELF file, it doesn't have to reset %r2. However, if the callee
// is in another file (e.g. another .so), %r2 has to be set to a new value
// so that the register contains the callee's GOT + 0x8000.
//
// In this way, you can't call a function just by knowing the function's
// entry point address. You also need to know a proper %r2 value for the
// function. This is why a function pointer refers to a tuple of an
// address and a %r2 value.
//
// If a function call is made through PLT, PLT takes care of restoring %r2.
// Therefore, the caller has to restore %r2 only for function calls
// through function pointers.
//
// .opd (short for "official procedure descriptors") contains function
// descriptors.
//
// You can think of OPD like this: even in other targets, a function can
// have a few different addresses for different purposes. It may not only
// have an entry point address but may also have PLT and/or GOT addresses.
// In PPCV1, it may have an OPD address in addition to these. An OPD
// address is used for relocations that refer to the address of a function
// as a function pointer.
// // https://github.com/rui314/psabi/blob/main/ppc64v1.pdf #if MOLD_PPC64V1 #include "mold.h" #include #include namespace mold { using E = PPC64V1; static u64 lo(u64 x) { return x & 0xffff; } static u64 hi(u64 x) { return x >> 16; } static u64 ha(u64 x) { return (x + 0x8000) >> 16; } static u64 high(u64 x) { return (x >> 16) & 0xffff; } static u64 higha(u64 x) { return ((x + 0x8000) >> 16) & 0xffff; } // .plt is used only for lazy symbol resolution on PPC64. All PLT // calls are made via range extension thunks even if they are within // reach. Thunks read addresses from .got.plt and jump there. // Therefore, once PLT symbols are resolved and final addresses are // written to .got.plt, thunks just skip .plt and directly jump to the // resolved addresses. template <> void write_plt_header(Context &ctx, u8 *buf) { constexpr ub32 insn[] = { 0x7d88'02a6, // mflr r12 0x429f'0005, // bcl 20, 31, 4 // obtain PC 0x7d68'02a6, // mflr r11 0x7d88'03a6, // mtlr r12 0x3d6b'0000, // addis r11, r11, GOTPLT_OFFSET@ha 0x396b'0000, // addi r11, r11, GOTPLT_OFFSET@lo 0xe98b'0000, // ld r12,0(r11) 0xe84b'0008, // ld r2,8(r11) 0x7d89'03a6, // mtctr r12 0xe96b'0010, // ld r11,16(r11) 0x4e80'0420, // bctr }; static_assert(sizeof(insn) == E::plt_hdr_size); memcpy(buf, insn, sizeof(insn)); i64 val = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 8; *(ub32 *)(buf + 16) |= higha(val); *(ub32 *)(buf + 20) |= lo(val); } template <> void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { ub32 *loc = (ub32 *)buf; i64 idx = sym.get_plt_idx(ctx); // The PPC64 ELFv1 ABI requires PLT entries to be vary in size depending // on their indices. Unlike other targets, .got.plt is filled not by us // but by the loader, so we don't have a control over where the initial // call to the PLT entry jumps to. So we need to strictly follow the PLT // section layout as the loader expect it to be. 
if (idx < 0x8000) { constexpr ub32 insn[] = { 0x3800'0000, // li r0, PLT_INDEX 0x4b00'0000, // b plt0 }; memcpy(loc, insn, sizeof(insn)); loc[0] |= idx; loc[1] |= (ctx.plt->shdr.sh_addr - sym.get_plt_addr(ctx) - 4) & 0x00ff'ffff; } else { constexpr ub32 insn[] = { 0x3c00'0000, // lis r0, PLT_INDEX@high 0x6000'0000, // ori r0, r0, PLT_INDEX@lo 0x4b00'0000, // b plt0 }; memcpy(loc, insn, sizeof(insn)); loc[0] |= high(idx); loc[1] |= lo(idx); loc[2] |= (ctx.plt->shdr.sh_addr - sym.get_plt_addr(ctx) - 8) & 0x00ff'ffff; } } // .plt.got is not necessary on PPC64 because range extension thunks // directly read GOT entries and jump there. template <> void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) {} template <> void EhFrameSection::apply_eh_reloc(Context &ctx, const ElfRel &rel, u64 offset, u64 val) { u8 *loc = ctx.buf + this->shdr.sh_offset + offset; switch (rel.r_type) { case R_NONE: break; case R_PPC64_ADDR64: *(ub64 *)loc = val; break; case R_PPC64_REL32: *(ub32 *)loc = val - this->shdr.sh_addr - offset; break; case R_PPC64_REL64: *(ub64 *)loc = val - this->shdr.sh_addr - offset; break; default: Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel; } } template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = base + rel.r_offset; u64 S = sym.get_addr(ctx); u64 A = rel.r_addend; u64 P = get_addr() + rel.r_offset; u64 G = sym.get_got_idx(ctx) * sizeof(Word); u64 GOT = ctx.got->shdr.sh_addr; u64 TOC = ctx.extra.TOC->value; auto check = [&](i64 val, i64 lo, i64 hi) { check_range(ctx, i, val, lo, hi); }; switch (rel.r_type) { case R_PPC64_TOC: break; case R_PPC64_TOC16_HA: *(ub16 *)loc = ha(S + A - TOC); break; case R_PPC64_TOC16_LO: *(ub16 *)loc = lo(S + A - TOC); break; case R_PPC64_TOC16_DS: check(S + A - TOC, -(1 << 15), 1 << 15); *(ub16 *)loc |= 
(S + A - TOC) & 0xfffc;
      break;
    case R_PPC64_TOC16_LO_DS:
      // Same field layout as TOC16_DS, but no range check is performed.
      *(ub16 *)loc |= (S + A - TOC) & 0xfffc;
      break;
    case R_PPC64_REL24: {
      // Branch displacement to the callee, computed with NO_OPD. If the
      // callee has a PLT entry or the displacement does not fit in 26
      // bits, branch to the symbol's thunk instead.
      i64 val = sym.get_addr(ctx, NO_OPD) + A - P;
      if (sym.has_plt(ctx) || !is_int(val, 26))
        val = sym.get_thunk_addr(ctx, P) + A - P;

      check(val, -(1 << 25), 1 << 25);
      *(ub32 *)loc |= bits(val, 25, 2) << 2;

      // If a callee is an external function, PLT saves %r2 to the
      // caller's r2 save slot. We need to restore it after function
      // return. To do so, there's usually a NOP as a placeholder
      // after a BL. 0x6000'0000 is a NOP.
      if (sym.has_plt(ctx) && *(ub32 *)(loc + 4) == 0x6000'0000)
        *(ub32 *)(loc + 4) = 0xe841'0028; // ld r2, 40(r1)
      break;
    }
    case R_PPC64_REL32:
      *(ub32 *)loc = S + A - P;
      break;
    case R_PPC64_REL64:
      *(ub64 *)loc = S + A - P;
      break;
    case R_PPC64_REL16_HA:
      *(ub16 *)loc = ha(S + A - P);
      break;
    case R_PPC64_REL16_LO:
      *(ub16 *)loc = lo(S + A - P);
      break;
    case R_PPC64_GOT16:
      // G is only the symbol's byte offset within .got (index * word
      // size), whereas TOC is an absolute address. The GOT base must be
      // added first to obtain a TOC-relative value, exactly as the
      // R_PPC64_PLT16_* cases below do.
      *(ub16 *)loc = G + GOT - TOC;
      break;
    case R_PPC64_PLT16_HA:
      *(ub16 *)loc = ha(G + GOT - TOC);
      break;
    case R_PPC64_PLT16_HI:
      *(ub16 *)loc = hi(G + GOT - TOC);
      break;
    case R_PPC64_PLT16_LO:
      *(ub16 *)loc = lo(G + GOT - TOC);
      break;
    case R_PPC64_PLT16_LO_DS:
      *(ub16 *)loc |= (G + GOT - TOC) & 0xfffc;
      break;
    case R_PPC64_GOT_TPREL16_HA:
      *(ub16 *)loc = ha(sym.get_gottp_addr(ctx) - TOC);
      break;
    case R_PPC64_GOT_TLSGD16_HA:
      *(ub16 *)loc = ha(sym.get_tlsgd_addr(ctx) - TOC);
      break;
    case R_PPC64_GOT_TLSGD16_LO:
      *(ub16 *)loc = lo(sym.get_tlsgd_addr(ctx) - TOC);
      break;
    case R_PPC64_GOT_TLSLD16_HA:
      *(ub16 *)loc = ha(ctx.got->get_tlsld_addr(ctx) - TOC);
      break;
    case R_PPC64_GOT_TLSLD16_LO:
      *(ub16 *)loc = lo(ctx.got->get_tlsld_addr(ctx) - TOC);
      break;
    case R_PPC64_DTPREL16_HA:
      *(ub16 *)loc = ha(S + A - ctx.dtp_addr);
      break;
    case R_PPC64_DTPREL16_LO:
      *(ub16 *)loc = lo(S + A - ctx.dtp_addr);
      break;
    case R_PPC64_DTPREL16_LO_DS:
      *(ub16 *)loc |= (S + A - ctx.dtp_addr) & 0xfffc;
      break;
    case R_PPC64_TPREL16_HA:
      *(ub16 *)loc = ha(S + A - ctx.tp_addr);
      break;
    case R_PPC64_TPREL16_LO:
      *(ub16 *)loc =
lo(S + A - ctx.tp_addr); break; case R_PPC64_TPREL16_LO_DS: *(ub16 *)loc |= (S + A - ctx.tp_addr) & 0xfffc; break; case R_PPC64_GOT_TPREL16_LO_DS: *(ub16 *)loc |= (sym.get_gottp_addr(ctx) - TOC) & 0xfffc; break; case R_PPC64_ADDR64: case R_PPC64_PLTSEQ: case R_PPC64_PLTCALL: case R_PPC64_TLS: case R_PPC64_TLSGD: case R_PPC64_TLSLD: break; default: unreachable(); } } } template <> void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = base + rel.r_offset; SectionFragment *frag; i64 frag_addend; std::tie(frag, frag_addend) = get_fragment(ctx, rel); u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx); u64 A = frag ? frag_addend : (i64)rel.r_addend; auto check = [&](i64 val, i64 lo, i64 hi) { check_range(ctx, val, i, lo, hi); }; switch (rel.r_type) { case R_PPC64_ADDR64: if (std::optional val = get_tombstone(sym, frag)) *(ub64 *)loc = *val; else *(ub64 *)loc = S + A; break; case R_PPC64_ADDR32: check(S + A, 0, 1LL << 32); *(ub32 *)loc = S + A; break; case R_PPC64_DTPREL64: *(ub64 *)loc = S + A - ctx.dtp_addr; break; default: Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: " << rel; } } } template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); std::span> rels = get_rels(ctx); // Scan relocations for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; if (sym.is_ifunc()) sym.flags |= NEEDS_GOT | NEEDS_PLT | NEEDS_PPC_OPD; // Any relocation except R_PPC64_REL24 is considered as an // address-taking relocation. 
if (rel.r_type != R_PPC64_REL24 && sym.get_type() == STT_FUNC) sym.flags |= NEEDS_PPC_OPD; switch (rel.r_type) { case R_PPC64_GOT_TPREL16_HA: sym.flags |= NEEDS_GOTTP; break; case R_PPC64_REL24: if (sym.is_imported) sym.flags |= NEEDS_PLT; break; case R_PPC64_GOT16: case R_PPC64_PLT16_HA: sym.flags |= NEEDS_GOT; break; case R_PPC64_GOT_TLSGD16_HA: sym.flags |= NEEDS_TLSGD; break; case R_PPC64_GOT_TLSLD16_HA: ctx.needs_tlsld = true; break; case R_PPC64_TPREL16_HA: case R_PPC64_TPREL16_LO: case R_PPC64_TPREL16_LO_DS: check_tlsle(ctx, sym, rel); break; case R_PPC64_ADDR64: case R_PPC64_TOC: case R_PPC64_REL32: case R_PPC64_REL64: case R_PPC64_TOC16_HA: case R_PPC64_TOC16_LO: case R_PPC64_TOC16_LO_DS: case R_PPC64_TOC16_DS: case R_PPC64_REL16_HA: case R_PPC64_REL16_LO: case R_PPC64_PLT16_HI: case R_PPC64_PLT16_LO: case R_PPC64_PLT16_LO_DS: case R_PPC64_PLTSEQ: case R_PPC64_PLTCALL: case R_PPC64_GOT_TPREL16_LO_DS: case R_PPC64_GOT_TLSGD16_LO: case R_PPC64_GOT_TLSLD16_LO: case R_PPC64_TLS: case R_PPC64_TLSGD: case R_PPC64_TLSLD: case R_PPC64_DTPREL16_HA: case R_PPC64_DTPREL16_LO: case R_PPC64_DTPREL16_LO_DS: break; default: Error(ctx) << *this << ": unknown relocation: " << rel; } } } template <> void Thunk::copy_buf(Context &ctx) { // If the destination is .plt.got, we save the current r2, read an // address of a function descriptor from .got, restore %r2 and jump // to the function. constexpr ub32 pltgot_thunk[] = { // Store the caller's %r2 0xf841'0028, // std %r2, 40(%r1) // Load an address of a function descriptor 0x3d82'0000, // addis %r12, %r2, foo@got@toc@ha 0xe98c'0000, // ld %r12, foo@got@toc@lo(%r12) // Restore the callee's %r2 0xe84c'0008, // ld %r2, 8(%r12) // Jump to the function 0xe98c'0000, // ld %r12, 0(%r12) 0x7d89'03a6, // mtctr %r12 0x4e80'0420, // bctr }; // If the destination is .plt, read a function descriptor from .got.plt. 
constexpr ub32 plt_thunk[] = { // Store the caller's %r2 0xf841'0028, // std %r2, 40(%r1) // Materialize an address of a function descriptor 0x3d82'0000, // addis %r12, %r2, foo@gotplt@toc@ha 0x398c'0000, // addi %r12, %r12, foo@gotplt@toc@lo // Restore the callee's %r2 0xe84c'0008, // ld %r2, 8(%r12) // Jump to the function 0xe98c'0000, // ld %r12, 0(%r12) 0x7d89'03a6, // mtctr %r12 0x4e80'0420, // bctr }; // If the destination is a non-imported function, we directly jump // to the function entry address. constexpr ub32 local_thunk[] = { 0x3d82'0000, // addis r12, r2, foo@toc@ha 0x398c'0000, // addi r12, r12, foo@toc@lo 0x7d89'03a6, // mtctr r12 0x4e80'0420, // bctr 0x6000'0000, // nop 0x6000'0000, // nop 0x6000'0000, // nop }; static_assert(E::thunk_size == sizeof(pltgot_thunk)); static_assert(E::thunk_size == sizeof(plt_thunk)); static_assert(E::thunk_size == sizeof(local_thunk)); u8 *base = ctx.buf + output_section.shdr.sh_offset + offset; u64 TOC = ctx.extra.TOC->value; for (i64 i = 0; i < symbols.size(); i++) { Symbol &sym = *symbols[i]; u8 *buf = base + offsets[i]; if (sym.has_got(ctx)) { memcpy(buf, pltgot_thunk, sizeof(pltgot_thunk)); i64 val = sym.get_got_addr(ctx) - TOC; *(ub32 *)(buf + 4) |= higha(val); *(ub32 *)(buf + 8) |= lo(val); } else if(sym.has_plt(ctx)) { memcpy(buf, plt_thunk, sizeof(plt_thunk)); i64 val = sym.get_gotplt_addr(ctx) - TOC; *(ub32 *)(buf + 4) |= higha(val); *(ub32 *)(buf + 8) |= lo(val); } else { memcpy(buf, local_thunk, sizeof(local_thunk)); i64 val = sym.get_addr(ctx, NO_OPD) - TOC; *(ub32 *)buf |= higha(val); *(ub32 *)(buf + 4) |= lo(val); } } } static InputSection *get_opd_section(ObjectFile &file) { for (std::unique_ptr> &isec : file.sections) if (isec && isec->name() == ".opd") return isec.get(); return nullptr; } static ElfRel * get_relocation_at(Context &ctx, InputSection &isec, u64 offset) { std::span> rels = isec.get_rels(ctx); auto it = ranges::lower_bound(rels, offset, {}, [](const ElfRel &r) { return (u64)r.r_offset; 
}); if (it == rels.end() || it->r_offset != offset) return nullptr; return &*it; } namespace { struct OpdSymbol { u64 r_offset = 0; Symbol *sym = nullptr; }; } static Symbol * get_opd_sym_at(std::span syms, u64 offset) { auto it = ranges::lower_bound(syms, offset, {}, &OpdSymbol::r_offset); if (it == syms.end() || it->r_offset != offset) return nullptr; return it->sym; } // Compiler creates an .opd entry for each function symbol. The intention // is to make it possible to create an output .opd section just by linking // input .opd sections in the same manner as we do to other normal input // sections. // // However, in reality, .opd isn't a normal input section. It needs many // special treatments as follows: // // 1. A function symbol refers to not a .text but an .opd. Its address // works fine for address-taking relocations such as R_PPC64_ADDR64. // However, R_PPC64_REL24 (which is used for branch instruction) needs // a function's real address instead of the function's .opd address. // We need to read .opd contents to find out a function entry point // address to apply R_PPC64_REL24. // // 2. Output .opd entries are needed only for functions whose addresses // are taken. Just copying input .opd sections to an output would // produces lots of dead .opd entries. // // 3. In this design, all function symbols refer to an .opd section, and // that doesn't work well with graph traversal optimizations such as // garbage collection or identical comdat folding. For example, garbage // collector would mark an .opd alive which in turn mark all functions // thatare referenced by .opd as alive, effectively keeping all // functions as alive. // // The problem is that the compiler creates a half-baked .opd section, and // the linker has to figure out what all these .opd entries and // relocations are trying to achieve. It's like the compiler would emit a // half-baked .plt section in an object file and the linker has to deal // with that. That's not a good design. 
// // So, in this function, we undo what the compiler did to .opd. We remove // function symbols from .opd and reattach them to their function entry // points. We also rewrite relocations that directly refer to an input // .opd section so that they refer to function symbols instead. We then // mark input .opd sections as dead. // // After this function, we mark symbols with the NEEDS_PPC_OPD flag if the // symbol needs an .opd entry. We then create an output .opd just like we // do for .plt or .got. void ppc64v1_rewrite_opd(Context &ctx) { tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { InputSection *opd = get_opd_section(*file); if (!opd) return; opd->is_alive = false; // Move symbols from .opd to .text. std::vector opd_syms; for (Symbol *sym : file->symbols) { if (sym->file != file || sym->get_input_section() != opd) continue; if (u32 ty = sym->get_type(); ty != STT_FUNC && ty != STT_GNU_IFUNC) continue; ElfRel *rel = get_relocation_at(ctx, *opd, sym->value); if (!rel) Fatal(ctx) << *file << ": cannot find a relocation in .opd for " << *sym << " at offset 0x" << std::hex << (u64)sym->value; Symbol *sym2 = file->symbols[rel->r_sym]; if (sym2->get_type() != STT_SECTION) Fatal(ctx) << *file << ": bad relocation in .opd referring to " << *sym2; opd_syms.push_back({sym->value, sym}); sym->set_input_section(sym2->get_input_section()); sym->value = rel->r_addend; } // Sort symbols so that get_opd_sym_at() can do binary search. ranges::stable_sort(opd_syms, {}, &OpdSymbol::r_offset); // Rewrite relocations so that they directly refer to .opd. 
for (std::unique_ptr> &isec : file->sections) { if (!isec || !isec->is_alive || isec.get() == opd) continue; for (ElfRel &r : isec->get_rels(ctx)) { Symbol &sym = *file->symbols[r.r_sym]; if (sym.get_input_section() != opd) continue; Symbol *real_sym = get_opd_sym_at(opd_syms, r.r_addend); if (!real_sym) Fatal(ctx) << *isec << ": cannot find a symbol in .opd for " << r << " at offset 0x" << std::hex << (u64)r.r_addend; r.r_sym = real_sym->sym_idx; r.r_addend = 0; } } }); } // When a function is exported, the dynamic symbol for the function should // refers to the function's .opd entry. This function marks such symbols // with NEEDS_PPC_OPD. void ppc64v1_scan_symbols(Context &ctx) { tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { for (Symbol *sym : file->symbols) if (sym->file == file && sym->is_exported) if (u32 ty = sym->get_type(); ty == STT_FUNC || ty == STT_GNU_IFUNC) sym->flags |= NEEDS_PPC_OPD; }); // Functions referenced by the ELF header also have to have .opd entries. if (!ctx.arg.entry->is_imported) ctx.arg.entry->flags |= NEEDS_PPC_OPD; if (!ctx.arg.init->is_imported) ctx.arg.init->flags |= NEEDS_PPC_OPD; if (!ctx.arg.fini->is_imported) ctx.arg.fini->flags |= NEEDS_PPC_OPD; } void PPC64OpdSection::add_symbol(Context &ctx, Symbol *sym) { sym->set_opd_idx(ctx, symbols.size()); symbols.push_back(sym); this->shdr.sh_size += ENTRY_SIZE; } i64 PPC64OpdSection::get_reldyn_size(Context &ctx) const { if (ctx.arg.pic) return symbols.size() * 2; return 0; } void PPC64OpdSection::copy_buf(Context &ctx) { ub64 *buf = (ub64 *)(ctx.buf + this->shdr.sh_offset); ElfRel *rel = nullptr; if (ctx.arg.pic) rel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + reldyn_offset); for (Symbol *sym : symbols) { u64 addr = sym->get_addr(ctx, NO_PLT | NO_OPD); *buf++ = addr; *buf++ = ctx.extra.TOC->value; *buf++ = 0; if (ctx.arg.pic) { u64 loc = sym->get_opd_addr(ctx); *rel++ = ElfRel(loc, E::R_RELATIVE, 0, addr); *rel++ = ElfRel(loc + 8, E::R_RELATIVE, 0, 
ctx.extra.TOC->value); } } } } // namespace mold #endif ================================================ FILE: src/arch-ppc64v2.cc ================================================ // This file implements the PowerPC ELFv2 ABI which was standardized in // 2014. Modern little-endian PowerPC systems are based on this ABI. // The ABI is often referred to as "ppc64le". This shouldn't be confused // with "ppc64" which refers to the original, big-endian PowerPC systems. // // PPC64 is a bit tricky to support because PC-relative load/store // instructions hadn't been available until Power10 which debuted in 2021. // Prior to Power10, it wasn't trivial for position-independent code (PIC) // to load a value from, for example, .got, as we can't do that with [PC + // the offset to the .got entry]. // // In the following, I'll explain how PIC is supported on pre-Power10 // systems first and then explain what has changed with Power10. // // // Position-independent code on Power9 or earlier: // // We can get the program counter on older PPC64 systems with the // following four instructions // // mflr r1 // save the current link register to r1 // bl .+4 // branch to the next instruction as if it were a function // mflr r0 // copy the return address to r0 // mtlr r1 // restore the original link register value // // , but it's too expensive to do if we do this for each load/store. // // As a workaround, most functions are compiled in such a way that r2 is // assumed to always contain the address of .got + 0x8000. With this, we // can for example load the first entry of .got with a single instruction // `lw r0, -0x8000(r2)`. r2 is called the TOC pointer. // // There's only one .got for each ELF module. Therefore, if a callee is in // the same ELF module, r2 doesn't have to be recomputed. Most function // calls are usually within the same ELF module, so this mechanism is // efficient. // // A function compiled for pre-Power10 usually has two entry points, // global and local. 
The global entry point usually precedes // the local entry point by 8 bytes. In between are the following instructions: // // addis r2, r12, .TOC.@ha // addi r2, r2, .TOC.@lo + 4; // // The global entry point assumes that the address of itself is in r12, // and it computes its own TOC pointer from r12. It's easy to do so for // the callee because the offset between its .got + 0x8000 and the // function is known at link-time. The above code sequence then falls // through to the local entry point that assumes r2 is .got + 0x8000. // // So, if a callee's TOC pointer is different from the current one // (e.g. calling a function in another .so), we first load the callee's // address to r12 (e.g. from .got.plt with a r2-relative load) and branch // to that address. Then the callee computes its own TOC pointer using // r12. // // // Position-independent code on Power10: // // Power10 added 8-byte-long instructions to the ISA. Some of them are // PC-relative load/store instructions that take 34-bit offsets. // Functions compiled with `-mcpu=power10` use these instructions for PIC. // r2 does not have a special meaning in such functions. // // When a function compiled for Power10 calls a function that uses the TOC // pointer, we need to compute a correct value for TOC and set it to r2 // before transferring the control to the callee. Thunks are responsible // for doing it. // // `_NOTOC` relocations such as `R_PPC64_REL24_NOTOC` indicate that the // caller does not use TOC (i.e. compiled with `-mcpu=power10`). If a // function using TOC is referenced via a `_NOTOC` relocation, that call // is made through a range extension thunk. // // // Note on section names: the PPC64 psABI uses a weird naming convention // which calls .got.plt .plt. We ignored that part because it's just // confusing. Since the runtime only cares about segments, we should be // able to name sections whatever we want.
// // https://github.com/rui314/psabi/blob/main/ppc64v2.pdf #if MOLD_PPC64V2 #include "mold.h" namespace mold { using E = PPC64V2; static u64 lo(u64 x) { return x & 0xffff; } static u64 hi(u64 x) { return x >> 16; } static u64 ha(u64 x) { return (x + 0x8000) >> 16; } static u64 high(u64 x) { return (x >> 16) & 0xffff; } static u64 higha(u64 x) { return ((x + 0x8000) >> 16) & 0xffff; } static void write34(u8 *loc, u64 x) { ul32 *buf = (ul32 *)loc; buf[0] = (buf[0] & 0xfffc'0000) | bits(x, 33, 16); buf[1] = (buf[1] & 0xffff'0000) | bits(x, 15, 0); } // .plt is used only for lazy symbol resolution on PPC64. All PLT // calls are made via range extension thunks even if they are within // reach. Thunks read addresses from .got.plt and jump there. // Therefore, once PLT symbols are resolved and final addresses are // written to .got.plt, thunks just skip .plt and directly jump to the // resolved addresses. template <> void write_plt_header(Context &ctx, u8 *buf) { constexpr ul32 insn[] = { // Get PC 0x7c08'02a6, // mflr r0 0x429f'0005, // bcl 20, 31, 4 // obtain PC 0x7d68'02a6, // mflr r11 0x7c08'03a6, // mtlr r0 // Compute the PLT entry index 0x398c'ffd4, // addi r12, r12, -44 0x7c0b'6050, // subf r0, r11, r12 0x7800'f082, // rldicl r0, r0, 62, 2 // Compute the address of .got.plt 0x3d6b'0000, // addis r11, r11, GOTPLT_OFFSET@ha 0x396b'0000, // addi r11, r11, GOTPLT_OFFSET@lo // Load .got.plt[0] and .got.plt[1] and branch to .got.plt[0] 0xe98b'0000, // ld r12, 0(r11) 0x7d89'03a6, // mtctr r12 0xe96b'0008, // ld r11, 8(r11) 0x4e80'0420, // bctr }; memcpy(buf, insn, sizeof(insn)); i64 val = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 8; *(ul32 *)(buf + 28) |= higha(val); *(ul32 *)(buf + 32) |= lo(val); } template <> void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { // When the control is transferred to a PLT entry, the PLT entry's // address is already set to %r12 by the caller. 
i64 offset = ctx.plt->shdr.sh_addr - sym.get_plt_addr(ctx); *(ul32 *)buf = 0x4b00'0000 | (offset & 0x00ff'ffff); // b plt0 } // .plt.got is not necessary on PPC64 because range extension thunks // directly read GOT entries and jump there. template <> void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) {} template <> void EhFrameSection::apply_eh_reloc(Context &ctx, const ElfRel &rel, u64 offset, u64 val) { u8 *loc = ctx.buf + this->shdr.sh_offset + offset; switch (rel.r_type) { case R_NONE: break; case R_PPC64_ADDR64: *(ul64 *)loc = val; break; case R_PPC64_REL32: *(ul32 *)loc = val - this->shdr.sh_addr - offset; break; case R_PPC64_REL64: *(ul64 *)loc = val - this->shdr.sh_addr - offset; break; default: Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel; } } static u64 get_local_entry_offset(Context &ctx, Symbol &sym) { i64 val = sym.esym().ppc64_local_entry; assert(val <= 7); if (val == 7) Fatal(ctx) << sym << ": local entry offset 7 is reserved"; if (val == 0 || val == 1) return 0; return 1 << val; } template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = base + rel.r_offset; u64 S = sym.get_addr(ctx); u64 A = rel.r_addend; u64 P = get_addr() + rel.r_offset; u64 G = sym.get_got_idx(ctx) * sizeof(Word); u64 GOT = ctx.got->shdr.sh_addr; u64 TOC = ctx.extra.TOC->value; auto r2save_thunk_addr = [&] { return sym.get_thunk_addr(ctx, P); }; auto no_r2save_thunk_addr = [&] { return sym.get_thunk_addr(ctx, P) + 8; }; switch (rel.r_type) { case R_PPC64_TOC16_HA: *(ul16 *)loc = ha(S + A - TOC); break; case R_PPC64_TOC16_LO: *(ul16 *)loc = lo(S + A - TOC); break; case R_PPC64_TOC16_DS: case R_PPC64_TOC16_LO_DS: *(ul16 *)loc |= (S + A - TOC) & 0xfffc; break; case R_PPC64_REL24: if (sym.has_plt(ctx) || !sym.esym().ppc64_preserves_r2()) { i64 val = 
r2save_thunk_addr() + A - P; *(ul32 *)loc |= bits(val, 25, 2) << 2; // The thunk saves %r2 to the caller's r2 save slot. We need to // restore it after function return. To do so, there's usually a // NOP as a placeholder after a BL. 0x6000'0000 is a NOP. if (*(ul32 *)(loc + 4) == 0x6000'0000) *(ul32 *)(loc + 4) = 0xe841'0018; // ld r2, 24(r1) } else { i64 val = S + get_local_entry_offset(ctx, sym) + A - P; if (!is_int(val, 26)) val = no_r2save_thunk_addr() + A - P; *(ul32 *)loc |= bits(val, 25, 2) << 2; } break; case R_PPC64_REL24_NOTOC: if (sym.has_plt(ctx) || sym.esym().ppc64_uses_toc()) { i64 val = no_r2save_thunk_addr() + A - P; *(ul32 *)loc |= bits(val, 25, 2) << 2; } else { i64 val = S + A - P; if (!is_int(val, 26)) val = no_r2save_thunk_addr() + A - P; *(ul32 *)loc |= bits(val, 25, 2) << 2; } break; case R_PPC64_REL14: *(ul32 *)loc |= bits(S + A - P, 15, 2) << 2; break; case R_PPC64_REL32: *(ul32 *)loc = S + A - P; break; case R_PPC64_REL64: *(ul64 *)loc = S + A - P; break; case R_PPC64_REL16_HA: *(ul16 *)loc = ha(S + A - P); break; case R_PPC64_REL16_LO: *(ul16 *)loc = lo(S + A - P); break; case R_PPC64_GOT16: *(ul16 *)loc = G - TOC; break; case R_PPC64_GOT16_LO: *(ul16 *)loc = lo(G - TOC); break; case R_PPC64_GOT16_HI: *(ul16 *)loc = hi(G - TOC); break; case R_PPC64_GOT16_HA: *(ul16 *)loc = ha(G - TOC); break; case R_PPC64_PLT16_HA: *(ul16 *)loc = ha(G + GOT - TOC); break; case R_PPC64_PLT16_HI: *(ul16 *)loc = hi(G + GOT - TOC); break; case R_PPC64_PLT16_LO: *(ul16 *)loc = lo(G + GOT - TOC); break; case R_PPC64_PLT16_LO_DS: *(ul16 *)loc |= (G + GOT - TOC) & 0xfffc; break; case R_PPC64_PLT_PCREL34: case R_PPC64_PLT_PCREL34_NOTOC: case R_PPC64_GOT_PCREL34: write34(loc, G + GOT - P); break; case R_PPC64_PCREL34: write34(loc, S + A - P); break; case R_PPC64_GOT_TPREL16_HA: *(ul16 *)loc = ha(sym.get_gottp_addr(ctx) - TOC); break; case R_PPC64_GOT_TPREL16_LO_DS: *(ul16 *)loc |= (sym.get_gottp_addr(ctx) - TOC) & 0xfffc; break; case R_PPC64_GOT_TPREL_PCREL34: 
write34(loc, sym.get_gottp_addr(ctx) - P); break; case R_PPC64_GOT_TLSGD16_HA: *(ul16 *)loc = ha(sym.get_tlsgd_addr(ctx) - TOC); break; case R_PPC64_GOT_TLSGD16_LO: *(ul16 *)loc = lo(sym.get_tlsgd_addr(ctx) - TOC); break; case R_PPC64_GOT_TLSGD_PCREL34: write34(loc, sym.get_tlsgd_addr(ctx) - P); break; case R_PPC64_GOT_TLSLD16_HA: *(ul16 *)loc = ha(ctx.got->get_tlsld_addr(ctx) - TOC); break; case R_PPC64_GOT_TLSLD16_LO: *(ul16 *)loc = lo(ctx.got->get_tlsld_addr(ctx) - TOC); break; case R_PPC64_GOT_TLSLD_PCREL34: write34(loc, ctx.got->get_tlsld_addr(ctx) - P); break; case R_PPC64_DTPREL16_HA: *(ul16 *)loc = ha(S + A - ctx.dtp_addr); break; case R_PPC64_DTPREL16_LO: *(ul16 *)loc = lo(S + A - ctx.dtp_addr); break; case R_PPC64_DTPREL16_LO_DS: *(ul16 *)loc |= (S + A - ctx.dtp_addr) & 0xfffc; break; case R_PPC64_DTPREL34: write34(loc, S + A - ctx.dtp_addr); break; case R_PPC64_TPREL16_HA: *(ul16 *)loc = ha(S + A - ctx.tp_addr); break; case R_PPC64_TPREL16_LO: *(ul16 *)loc = lo(S + A - ctx.tp_addr); break; case R_PPC64_TPREL16_LO_DS: *(ul16 *)loc |= (S + A - ctx.tp_addr) & 0xfffc; break; case R_PPC64_TPREL34: write34(loc, S + A - ctx.tp_addr); break; case R_PPC64_ADDR64: case R_PPC64_PLTSEQ: case R_PPC64_PLTSEQ_NOTOC: case R_PPC64_PLTCALL: case R_PPC64_PLTCALL_NOTOC: case R_PPC64_TLS: case R_PPC64_TLSGD: case R_PPC64_TLSLD: case R_PPC64_ENTRY: break; default: unreachable(); } } } template <> void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = base + rel.r_offset; SectionFragment *frag; i64 frag_addend; std::tie(frag, frag_addend) = get_fragment(ctx, rel); u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx); u64 A = frag ? 
frag_addend : (i64)rel.r_addend; auto check = [&](i64 val, i64 lo, i64 hi) { check_range(ctx, i, val, lo, hi); }; switch (rel.r_type) { case R_PPC64_ADDR64: if (std::optional val = get_tombstone(sym, frag)) *(ul64 *)loc = *val; else *(ul64 *)loc = S + A; break; case R_PPC64_ADDR32: check(S + A, 0, 1LL << 32); *(ul32 *)loc = S + A; break; case R_PPC64_DTPREL64: *(ul64 *)loc = S + A - ctx.dtp_addr; break; default: Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: " << rel; } } } template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); std::span> rels = get_rels(ctx); // Scan relocations for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; if (sym.is_ifunc()) sym.flags |= NEEDS_GOT | NEEDS_PLT; switch (rel.r_type) { case R_PPC64_GOT_TPREL16_HA: case R_PPC64_GOT_TPREL_PCREL34: sym.flags |= NEEDS_GOTTP; break; case R_PPC64_REL24: if (sym.is_imported) sym.flags |= NEEDS_PLT; break; case R_PPC64_REL24_NOTOC: if (sym.is_imported) sym.flags |= NEEDS_PLT; ctx.extra.is_power10 = true; break; case R_PPC64_GOT16: case R_PPC64_GOT16_LO: case R_PPC64_GOT16_HI: case R_PPC64_GOT16_HA: case R_PPC64_PLT16_HA: case R_PPC64_PLT_PCREL34: case R_PPC64_PLT_PCREL34_NOTOC: case R_PPC64_GOT_PCREL34: sym.flags |= NEEDS_GOT; break; case R_PPC64_GOT_TLSGD16_HA: case R_PPC64_GOT_TLSGD_PCREL34: sym.flags |= NEEDS_TLSGD; break; case R_PPC64_GOT_TLSLD16_HA: case R_PPC64_GOT_TLSLD_PCREL34: ctx.needs_tlsld = true; break; case R_PPC64_TPREL16_HA: case R_PPC64_TPREL34: check_tlsle(ctx, sym, rel); break; case R_PPC64_ADDR64: case R_PPC64_REL14: case R_PPC64_REL32: case R_PPC64_REL64: case R_PPC64_TOC16_HA: case R_PPC64_TOC16_LO: case R_PPC64_TOC16_LO_DS: case R_PPC64_TOC16_DS: case R_PPC64_REL16_HA: case R_PPC64_REL16_LO: case R_PPC64_PLT16_HI: case R_PPC64_PLT16_LO: case R_PPC64_PLT16_LO_DS: case R_PPC64_PCREL34: 
case R_PPC64_PLTSEQ: case R_PPC64_PLTSEQ_NOTOC: case R_PPC64_PLTCALL: case R_PPC64_PLTCALL_NOTOC: case R_PPC64_GOT_TPREL16_LO_DS: case R_PPC64_GOT_TLSGD16_LO: case R_PPC64_GOT_TLSLD16_LO: case R_PPC64_TPREL16_LO: case R_PPC64_TPREL16_LO_DS: case R_PPC64_TLS: case R_PPC64_TLSGD: case R_PPC64_TLSLD: case R_PPC64_DTPREL16_HA: case R_PPC64_DTPREL16_LO: case R_PPC64_DTPREL16_LO_DS: case R_PPC64_DTPREL34: case R_PPC64_ENTRY: break; default: Error(ctx) << *this << ": unknown relocation: " << rel; } } } template <> void Thunk::copy_buf(Context &ctx) { // If the destination is PLT, we read an address from .got.plt or .got // and jump there. constexpr ul32 plt_thunk[] = { 0xf841'0018, // std r2, 24(r1) 0x6000'0000, // nop 0x3d82'0000, // addis r12, r2, foo@gotplt@toc@ha 0xe98c'0000, // ld r12, foo@gotplt@toc@lo(r12) 0x7d89'03a6, // mtctr r12 0x4e80'0420, // bctr }; constexpr ul32 plt_thunk_power10[] = { 0xf841'0018, // std r2, 24(r1) 0x6000'0000, // nop 0x0410'0000, // pld r12, foo@gotplt@pcrel 0xe580'0000, 0x7d89'03a6, // mtctr r12 0x4e80'0420, // bctr }; // If the destination is a non-imported function, we directly jump // to its local entry point. 
constexpr ul32 local_thunk[] = { 0xf841'0018, // std r2, 24(r1) 0x6000'0000, // nop 0x3d82'0000, // addis r12, r2, foo@toc@ha 0x398c'0000, // addi r12, r12, foo@toc@lo 0x7d89'03a6, // mtctr r12 0x4e80'0420, // bctr }; constexpr ul32 local_thunk_power10[] = { 0xf841'0018, // std r2, 24(r1) 0x6000'0000, // nop 0x0610'0000, // pla r12, foo@pcrel 0x3980'0000, 0x7d89'03a6, // mtctr r12 0x4e80'0420, // bctr }; static_assert(E::thunk_size == sizeof(plt_thunk)); static_assert(E::thunk_size == sizeof(plt_thunk_power10)); static_assert(E::thunk_size == sizeof(local_thunk)); static_assert(E::thunk_size == sizeof(local_thunk_power10)); u8 *base = ctx.buf + output_section.shdr.sh_offset + offset; u64 TOC = ctx.extra.TOC->value; for (i64 i = 0; i < symbols.size(); i++) { Symbol &sym = *symbols[i]; u64 P = get_addr() + offsets[i]; u8 *buf = base + offsets[i]; if (sym.has_plt(ctx)) { u64 got = sym.has_got(ctx) ? sym.get_got_addr(ctx) : sym.get_gotplt_addr(ctx); if (ctx.extra.is_power10) { memcpy(buf, plt_thunk_power10, E::thunk_size); write34(buf + 8, got - P - 8); } else { memcpy(buf, plt_thunk, E::thunk_size); *(ul32 *)(buf + 8) |= higha(got - TOC); *(ul32 *)(buf + 12) |= lo(got - TOC); } } else { u64 S = sym.get_addr(ctx); if (ctx.extra.is_power10) { memcpy(buf, local_thunk_power10, E::thunk_size); write34(buf + 8, S - P - 8); } else { memcpy(buf, local_thunk, E::thunk_size); *(ul32 *)(buf + 8) |= higha(S - TOC); *(ul32 *)(buf + 12) |= lo(S - TOC); } } } } // GCC may emit references to the following functions in function prologue // and epiilogue if -Os is specified. For some reason, these functions are // not in libgcc.a and expected to be synthesized by the linker. 
const std::vector> ppc64_save_restore_insns = { { "_savegpr0_14", 0xf9c1ff70 }, // std r14,-144(r1) { "_savegpr0_15", 0xf9e1ff78 }, // std r15,-136(r1) { "_savegpr0_16", 0xfa01ff80 }, // std r16,-128(r1) { "_savegpr0_17", 0xfa21ff88 }, // std r17,-120(r1) { "_savegpr0_18", 0xfa41ff90 }, // std r18,-112(r1) { "_savegpr0_19", 0xfa61ff98 }, // std r19,-104(r1) { "_savegpr0_20", 0xfa81ffa0 }, // std r20,-96(r1) { "_savegpr0_21", 0xfaa1ffa8 }, // std r21,-88(r1) { "_savegpr0_22", 0xfac1ffb0 }, // std r22,-80(r1) { "_savegpr0_23", 0xfae1ffb8 }, // std r23,-72(r1) { "_savegpr0_24", 0xfb01ffc0 }, // std r24,-64(r1) { "_savegpr0_25", 0xfb21ffc8 }, // std r25,-56(r1) { "_savegpr0_26", 0xfb41ffd0 }, // std r26,-48(r1) { "_savegpr0_27", 0xfb61ffd8 }, // std r27,-40(r1) { "_savegpr0_28", 0xfb81ffe0 }, // std r28,-32(r1) { "_savegpr0_29", 0xfba1ffe8 }, // std r29,-24(r1) { "_savegpr0_30", 0xfbc1fff0 }, // std r30,-16(r1) { "_savegpr0_31", 0xfbe1fff8 }, // std r31,-8(r1) { "", 0xf8010010 }, // std r0,16(r1) { "", 0x4e800020 }, // blr { "_restgpr0_14", 0xe9c1ff70 }, // ld r14,-144(r1) { "_restgpr0_15", 0xe9e1ff78 }, // ld r15,-136(r1) { "_restgpr0_16", 0xea01ff80 }, // ld r16,-128(r1) { "_restgpr0_17", 0xea21ff88 }, // ld r17,-120(r1) { "_restgpr0_18", 0xea41ff90 }, // ld r18,-112(r1) { "_restgpr0_19", 0xea61ff98 }, // ld r19,-104(r1) { "_restgpr0_20", 0xea81ffa0 }, // ld r20,-96(r1) { "_restgpr0_21", 0xeaa1ffa8 }, // ld r21,-88(r1) { "_restgpr0_22", 0xeac1ffb0 }, // ld r22,-80(r1) { "_restgpr0_23", 0xeae1ffb8 }, // ld r23,-72(r1) { "_restgpr0_24", 0xeb01ffc0 }, // ld r24,-64(r1) { "_restgpr0_25", 0xeb21ffc8 }, // ld r25,-56(r1) { "_restgpr0_26", 0xeb41ffd0 }, // ld r26,-48(r1) { "_restgpr0_27", 0xeb61ffd8 }, // ld r27,-40(r1) { "_restgpr0_28", 0xeb81ffe0 }, // ld r28,-32(r1) { "_restgpr0_29", 0xe8010010 }, // ld r0,16(r1) { "", 0xeba1ffe8 }, // ld r29,-24(r1) { "", 0x7c0803a6 }, // mtlr r0 { "", 0xebc1fff0 }, // ld r30,-16(r1) { "", 0xebe1fff8 }, // ld r31,-8(r1) { "", 0x4e800020 
}, // blr { "_restgpr0_30", 0xebc1fff0 }, // ld r30,-16(r1) { "_restgpr0_31", 0xe8010010 }, // ld r0,16(r1) { "", 0xebe1fff8 }, // ld r31,-8(r1) { "", 0x7c0803a6 }, // mtlr r0 { "", 0x4e800020 }, // blr { "_savegpr1_14", 0xf9ccff70 }, // std r14,-144(r12) { "_savegpr1_15", 0xf9ecff78 }, // std r15,-136(r12) { "_savegpr1_16", 0xfa0cff80 }, // std r16,-128(r12) { "_savegpr1_17", 0xfa2cff88 }, // std r17,-120(r12) { "_savegpr1_18", 0xfa4cff90 }, // std r18,-112(r12) { "_savegpr1_19", 0xfa6cff98 }, // std r19,-104(r12) { "_savegpr1_20", 0xfa8cffa0 }, // std r20,-96(r12) { "_savegpr1_21", 0xfaacffa8 }, // std r21,-88(r12) { "_savegpr1_22", 0xfaccffb0 }, // std r22,-80(r12) { "_savegpr1_23", 0xfaecffb8 }, // std r23,-72(r12) { "_savegpr1_24", 0xfb0cffc0 }, // std r24,-64(r12) { "_savegpr1_25", 0xfb2cffc8 }, // std r25,-56(r12) { "_savegpr1_26", 0xfb4cffd0 }, // std r26,-48(r12) { "_savegpr1_27", 0xfb6cffd8 }, // std r27,-40(r12) { "_savegpr1_28", 0xfb8cffe0 }, // std r28,-32(r12) { "_savegpr1_29", 0xfbacffe8 }, // std r29,-24(r12) { "_savegpr1_30", 0xfbccfff0 }, // std r30,-16(r12) { "_savegpr1_31", 0xfbecfff8 }, // std r31,-8(r12) { "", 0x4e800020 }, // blr { "_restgpr1_14", 0xe9ccff70 }, // ld r14,-144(r12) { "_restgpr1_15", 0xe9ecff78 }, // ld r15,-136(r12) { "_restgpr1_16", 0xea0cff80 }, // ld r16,-128(r12) { "_restgpr1_17", 0xea2cff88 }, // ld r17,-120(r12) { "_restgpr1_18", 0xea4cff90 }, // ld r18,-112(r12) { "_restgpr1_19", 0xea6cff98 }, // ld r19,-104(r12) { "_restgpr1_20", 0xea8cffa0 }, // ld r20,-96(r12) { "_restgpr1_21", 0xeaacffa8 }, // ld r21,-88(r12) { "_restgpr1_22", 0xeaccffb0 }, // ld r22,-80(r12) { "_restgpr1_23", 0xeaecffb8 }, // ld r23,-72(r12) { "_restgpr1_24", 0xeb0cffc0 }, // ld r24,-64(r12) { "_restgpr1_25", 0xeb2cffc8 }, // ld r25,-56(r12) { "_restgpr1_26", 0xeb4cffd0 }, // ld r26,-48(r12) { "_restgpr1_27", 0xeb6cffd8 }, // ld r27,-40(r12) { "_restgpr1_28", 0xeb8cffe0 }, // ld r28,-32(r12) { "_restgpr1_29", 0xebacffe8 }, // ld r29,-24(r12) { 
"_restgpr1_30", 0xebccfff0 }, // ld r30,-16(r12) { "_restgpr1_31", 0xebecfff8 }, // ld r31,-8(r12) { "", 0x4e800020 }, // blr }; void PPC64SaveRestoreSection::copy_buf(Context &ctx) { ul32 *buf = (ul32 *)(ctx.buf + this->shdr.sh_offset); for (auto [label, insn] : ppc64_save_restore_insns) *buf++ = insn; } template <> u64 get_eflags(Context &ctx) { return 2; } } // namespace mold #endif ================================================ FILE: src/arch-riscv.cc ================================================ // RISC-V is a clean RISC ISA. It supports PC-relative load/store for // position-independent code. Its 32-bit and 64-bit ISAs are almost // identical. That is, you can think RV32 as a RV64 without 64-bit // operations. In this file, we support both RV64 and RV32. // // RISC-V is essentially little-endian, but the big-endian version is // available as an extension. GCC supports `-mbig-endian` to generate // big-endian code. Even in big-endian mode, machine instructions are // defined to be encoded in little-endian, though. Only the behavior of // load/store instructions are different between LE RISC-V and BE RISC-V. // // From the linker's point of view, the RISC-V's psABI is unique because // sections in input object files can be shrunk while being copied to the // output file. That is contrary to other psABIs in which sections are an // atomic unit of copying. See file comments in shrink-sections.cc for // details. 
// // https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-elf.adoc #if MOLD_RV64LE || MOLD_RV64BE || MOLD_RV32LE || MOLD_RV32BE #include "mold.h" #include namespace mold { using E = MOLD_TARGET; static void write_itype(u8 *loc, u32 val) { *(ul32 *)loc &= 0b000000'00000'11111'111'11111'1111111; *(ul32 *)loc |= bits(val, 11, 0) << 20; } static void write_stype(u8 *loc, u32 val) { *(ul32 *)loc &= 0b000000'11111'11111'111'00000'1111111; *(ul32 *)loc |= bits(val, 11, 5) << 25 | bits(val, 4, 0) << 7; } static void write_btype(u8 *loc, u32 val) { *(ul32 *)loc &= 0b000000'11111'11111'111'00000'1111111; *(ul32 *)loc |= bit(val, 12) << 31 | bits(val, 10, 5) << 25 | bits(val, 4, 1) << 8 | bit(val, 11) << 7; } static void write_utype(u8 *loc, u32 val) { *(ul32 *)loc &= 0b000000'00000'00000'000'11111'1111111; // U-type instructions are used in combination with I-type // instructions. U-type insn sets an immediate to the upper 20-bits // of a register. I-type insn sign-extends a 12-bits immediate and // adds it to a register value to construct a complete value. 0x800 // is added here to compensate for the sign-extension. 
*(ul32 *)loc |= (val + 0x800) & 0xffff'f000; } static void write_jtype(u8 *loc, u32 val) { *(ul32 *)loc &= 0b000000'00000'00000'000'11111'1111111; *(ul32 *)loc |= bit(val, 20) << 31 | bits(val, 10, 1) << 21 | bit(val, 11) << 20 | bits(val, 19, 12) << 12; } static void write_citype(u8 *loc, u32 val) { *(ul16 *)loc &= 0b111'0'11111'00000'11; *(ul16 *)loc |= bit(val, 5) << 12 | bits(val, 4, 0) << 2; } static void write_cbtype(u8 *loc, u32 val) { *(ul16 *)loc &= 0b111'000'111'00000'11; *(ul16 *)loc |= bit(val, 8) << 12 | bit(val, 4) << 11 | bit(val, 3) << 10 | bit(val, 7) << 6 | bit(val, 6) << 5 | bit(val, 2) << 4 | bit(val, 1) << 3 | bit(val, 5) << 2; } static void write_cjtype(u8 *loc, u32 val) { *(ul16 *)loc &= 0b111'00000000000'11; *(ul16 *)loc |= bit(val, 11) << 12 | bit(val, 4) << 11 | bit(val, 9) << 10 | bit(val, 8) << 9 | bit(val, 10) << 8 | bit(val, 6) << 7 | bit(val, 7) << 6 | bit(val, 3) << 5 | bit(val, 2) << 4 | bit(val, 1) << 3 | bit(val, 5) << 2; } static void set_rs1(u8 *loc, u32 rs1) { assert(rs1 < 32); *(ul32 *)loc &= 0b111111'11111'00000'111'11111'1111111; *(ul32 *)loc |= rs1 << 15; } static u32 get_rd(u8 *loc) { return bits(*(u32 *)loc, 11, 7); }; template <> void write_plt_header(Context &ctx, u8 *buf) { constexpr ul32 insn_64[] = { 0x0000'0397, // auipc t2, %pcrel_hi(.got.plt) 0x41c3'0333, // sub t1, t1, t3 # .plt entry + hdr + 12 0x0003'be03, // ld t3, %pcrel_lo(1b)(t2) # _dl_runtime_resolve 0xfd43'0313, // addi t1, t1, -44 # .plt entry 0x0003'8293, // addi t0, t2, %pcrel_lo(1b) # &.got.plt 0x0013'5313, // srli t1, t1, 1 # .plt entry offset 0x0082'b283, // ld t0, 8(t0) # link map 0x000e'0067, // jr t3 }; constexpr ul32 insn_32[] = { 0x0000'0397, // auipc t2, %pcrel_hi(.got.plt) 0x41c3'0333, // sub t1, t1, t3 # .plt entry + hdr + 12 0x0003'ae03, // lw t3, %pcrel_lo(1b)(t2) # _dl_runtime_resolve 0xfd43'0313, // addi t1, t1, -44 # .plt entry 0x0003'8293, // addi t0, t2, %pcrel_lo(1b) # &.got.plt 0x0023'5313, // srli t1, t1, 2 # .plt entry offset 
0x0042'a283, // lw t0, 4(t0) # link map 0x000e'0067, // jr t3 }; u64 gotplt = ctx.gotplt->shdr.sh_addr; u64 plt = ctx.plt->shdr.sh_addr; memcpy(buf, E::is_64 ? insn_64 : insn_32, E::plt_hdr_size); write_utype(buf, gotplt - plt); write_itype(buf + 8, gotplt - plt); write_itype(buf + 16, gotplt - plt); } static constexpr ul32 plt_entry_64[] = { 0x0000'0e17, // auipc t3, %pcrel_hi(function@.got.plt) 0x000e'3e03, // ld t3, %pcrel_lo(1b)(t3) 0x000e'0367, // jalr t1, t3 0x0010'0073, // ebreak }; static constexpr ul32 plt_entry_32[] = { 0x0000'0e17, // auipc t3, %pcrel_hi(function@.got.plt) 0x000e'2e03, // lw t3, %pcrel_lo(1b)(t3) 0x000e'0367, // jalr t1, t3 0x0010'0073, // ebreak }; template <> void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { u64 gotplt = sym.get_gotplt_addr(ctx); u64 plt = sym.get_plt_addr(ctx); memcpy(buf, E::is_64 ? plt_entry_64 : plt_entry_32, E::plt_size); write_utype(buf, gotplt - plt); write_itype(buf + 4, gotplt - plt); } template <> void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { u64 got = sym.get_got_pltgot_addr(ctx); u64 plt = sym.get_plt_addr(ctx); memcpy(buf, E::is_64 ? 
plt_entry_64 : plt_entry_32, E::plt_size); write_utype(buf, got - plt); write_itype(buf + 4, got - plt); } template <> void EhFrameSection::apply_eh_reloc(Context &ctx, const ElfRel &rel, u64 offset, u64 val) { u8 *loc = ctx.buf + this->shdr.sh_offset + offset; switch (rel.r_type) { case R_NONE: break; case R_RISCV_ADD32: *(U32 *)loc += val; break; case R_RISCV_SUB8: *loc -= val; break; case R_RISCV_SUB16: *(U16 *)loc -= val; break; case R_RISCV_SUB32: *(U32 *)loc -= val; break; case R_RISCV_SUB6: *loc = (*loc & 0b1100'0000) | ((*loc - val) & 0b0011'1111); break; case R_RISCV_SET6: *loc = (*loc & 0b1100'0000) | (val & 0b0011'1111); break; case R_RISCV_SET8: *loc = val; break; case R_RISCV_SET16: *(U16 *)loc = val; break; case R_RISCV_SET32: *(U32 *)loc = val; break; case R_RISCV_32_PCREL: *(U32 *)loc = val - this->shdr.sh_addr - offset; break; default: Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel; } } // RISC-V generally uses the AUIPC + ADDI/LW/SW/etc instruction pair // to access the AUIPC's address ± 2 GiB. AUIPC materializes the most // significant 52 bits in a PC-relative manner, and the following // instruction specifies the remaining least significant 12 bits. // There are several HI20 and LO12 relocation types for them. // // LO12 relocations need to materialize an address relative to AUIPC's // address, not relative to the instruction that the relocation // directly refers to. // // The problem here is that the instruction pair may not always be // adjacent. We need a mechanism to find a paired AUIPC for a given // LO12 relocation. For this purpose, the compiler creates a local // symbol for each location to which HI20 refers, and the LO12 // relocation refers to that symbol. // // This function returns a paired HI20 relocation for a given LO12. // Since the instructions are typically adjacent, we do a linear // search. 
static const ElfRel & find_paired_reloc(Context &ctx, InputSection &isec, std::span> rels, Symbol &sym, i64 i) { auto is_hi20 = [](u32 ty) { return ty == R_RISCV_GOT_HI20 || ty == R_RISCV_TLS_GOT_HI20 || ty == R_RISCV_TLS_GD_HI20 || ty == R_RISCV_PCREL_HI20 || ty == R_RISCV_TLSDESC_HI20; }; u64 value = sym.esym().st_value; if (value <= rels[i].r_offset) { for (i64 j = i - 1; j >= 0; j--) if (is_hi20(rels[j].r_type) && value == rels[j].r_offset) return rels[j]; } else { for (i64 j = i + 1; j < rels.size(); j++) if (is_hi20(rels[j].r_type) && value == rels[j].r_offset) return rels[j]; } Fatal(ctx) << isec << ": paired relocation is missing: " << i; } // Returns true if isec's i'th relocation refers to the following // GOT-load instructioon pair, which is an expeanded form of // `la t0, foo` pseudo assembly instruction. // // .L0 // auipc t0, 0 # R_RISCV_GOT_HI20(foo), R_RISCV_RELAX // ld t0, 0(t0) # R_RISCV_PCREL_LO12_I(.L0), R_RISCV_RELAX static bool is_got_load_pair(Context &ctx, InputSection &isec, std::span> rels, i64 i) { u8 *buf = (u8 *)isec.contents.data(); return i + 3 < rels.size() && rels[i].r_type == R_RISCV_GOT_HI20 && rels[i + 1].r_type == R_RISCV_RELAX && rels[i + 2].r_type == R_RISCV_PCREL_LO12_I && rels[i + 3].r_type == R_RISCV_RELAX && rels[i].r_offset == rels[i + 2].r_offset - 4 && rels[i].r_offset == isec.file.symbols[rels[i + 2].r_sym]->value && get_rd(buf + rels[i].r_offset) == get_rd(buf + rels[i + 2].r_offset); } template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); std::span deltas = extra.r_deltas; i64 k = 0; u8 *buf = (u8 *)contents.data(); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || rel.r_type == R_RISCV_RELAX) continue; i64 removed_bytes = 0; i64 r_delta = 0; if (!deltas.empty()) { while (k < deltas.size() && deltas[k].offset < rel.r_offset) k++; if (k < deltas.size() && deltas[k].offset == rel.r_offset) removed_bytes = 
get_removed_bytes(deltas, k); if (k > 0) r_delta = deltas[k - 1].delta; } Symbol &sym = *file.symbols[rel.r_sym]; i64 r_offset = rel.r_offset - r_delta; u8 *loc = base + r_offset; u64 S = sym.get_addr(ctx); u64 A = rel.r_addend; u64 P = get_addr() + r_offset; u64 G = sym.get_got_idx(ctx) * sizeof(Word); u64 GOT = ctx.got->shdr.sh_addr; auto check = [&](i64 val, i64 lo, i64 hi) { check_range(ctx, i, val, lo, hi); }; auto utype = [&](i64 val) { check(val, -(1LL << 31) - 0x800, (1LL << 31) - 0x800); write_utype(loc, val); }; switch (rel.r_type) { case R_RISCV_32: if (E::is_64) *(U32 *)loc = S + A; break; case R_RISCV_64: break; case R_RISCV_BRANCH: check(S + A - P, -(1 << 12), 1 << 12); write_btype(loc, S + A - P); break; case R_RISCV_JAL: check(S + A - P, -(1 << 20), 1 << 20); write_jtype(loc, S + A - P); break; case R_RISCV_CALL: case R_RISCV_CALL_PLT: { i64 val = S + A - P; i64 rd = get_rd(buf + rel.r_offset + 4); if (removed_bytes == 4) { // auipc + jalr -> jal *(ul32 *)loc = (rd << 7) | 0b1101111; write_jtype(loc, val); } else if (removed_bytes == 6 && rd == 0) { // auipc + jalr -> c.j *(ul16 *)loc = 0b101'00000000000'01; write_cjtype(loc, val); } else if (removed_bytes == 6 && rd == 1) { // auipc + jalr -> c.jal assert(!E::is_64); *(ul16 *)loc = 0b001'00000000000'01; write_cjtype(loc, val); } else { assert(removed_bytes == 0); utype(val); write_itype(loc + 4, val); } break; } case R_RISCV_GOT_HI20: { // This relocation usually refers to an AUIPC + LD instruction // pair to load a symbol value from the GOT. If the symbol value // is actually a link-time constant, we can materialize the value // directly into a register to eliminate a memory load. 
i64 rd = get_rd(buf + rel.r_offset); if (removed_bytes == 6) { // c.li , val *(ul16 *)loc = 0b010'0'00000'00000'01 | (rd << 7); write_citype(loc, sym.get_addr(ctx)); i += 3; } else if (removed_bytes == 4) { // addi , zero, val *(ul32 *)loc = 0b0010011 | (rd << 7); write_itype(loc, sym.get_addr(ctx)); i += 3; } else { assert(removed_bytes == 0); i64 val = S + A - P; if (ctx.arg.relax && sym.is_pcrel_linktime_const(ctx) && is_got_load_pair(ctx, *this, rels, i) && is_int(val, 32)) { // auipc , %hi20(val) utype(val); // addi , , %lo12(val) *(ul32 *)(loc + 4) = 0b0010011 | (rd << 15) | (rd << 7); write_itype(loc + 4, val); i += 3; } else { utype(G + GOT + A - P); } } break; } case R_RISCV_TLS_GOT_HI20: utype(sym.get_gottp_addr(ctx) + A - P); break; case R_RISCV_TLS_GD_HI20: utype(sym.get_tlsgd_addr(ctx) + A - P); break; case R_RISCV_PCREL_HI20: utype(S + A - P); break; case R_RISCV_PCREL_LO12_I: case R_RISCV_PCREL_LO12_S: { const ElfRel &rel2 = find_paired_reloc(ctx, *this, rels, sym, i); Symbol &sym2 = *file.symbols[rel2.r_sym]; auto write = (rel.r_type == R_RISCV_PCREL_LO12_I) ? 
write_itype : write_stype; u64 S = sym2.get_addr(ctx); u64 A = rel2.r_addend; u64 P = get_addr() + rel2.r_offset - get_r_delta(*this, rel2.r_offset); u64 G = sym2.get_got_idx(ctx) * sizeof(Word); switch (rel2.r_type) { case R_RISCV_GOT_HI20: write(loc, G + GOT + A - P); break; case R_RISCV_TLS_GOT_HI20: write(loc, sym2.get_gottp_addr(ctx) + A - P); break; case R_RISCV_TLS_GD_HI20: write(loc, sym2.get_tlsgd_addr(ctx) + A - P); break; case R_RISCV_PCREL_HI20: write(loc, S + A - P); break; } break; } case R_RISCV_HI20: if (removed_bytes == 2) { // Rewrite LUI with C.LUI i64 rd = get_rd(buf + rel.r_offset); *(ul16 *)loc = 0b011'0'00000'00000'01 | (rd << 7); write_citype(loc, (S + A + 0x800) >> 12); } else if (removed_bytes == 0) { utype(S + A); } break; case R_RISCV_LO12_I: case R_RISCV_LO12_S: if (rel.r_type == R_RISCV_LO12_I) write_itype(loc, S + A); else write_stype(loc, S + A); // Rewrite `lw t1, 0(t0)` with `lw t1, 0(x0)` if the address is // accessible relative to the zero register because if that's the // case, corresponding LUI might have been removed by relaxation. if (is_int(S + A, 12)) set_rs1(loc, 0); break; case R_RISCV_TPREL_HI20: assert(removed_bytes == 0 || removed_bytes == 4); if (removed_bytes == 0) utype(S + A - ctx.tp_addr); break; case R_RISCV_TPREL_ADD: // This relocation just annotates an ADD instruction that can be // removed when a TPREL is relaxed. No value is needed to be // written. assert(removed_bytes == 0 || removed_bytes == 4); break; case R_RISCV_TPREL_LO12_I: case R_RISCV_TPREL_LO12_S: { i64 val = S + A - ctx.tp_addr; if (rel.r_type == R_RISCV_TPREL_LO12_I) write_itype(loc, val); else write_stype(loc, val); // Rewrite `lw t1, 0(t0)` with `lw t1, 0(tp)` if the address is // directly accessible using tp. tp is x4. if (is_int(val, 12)) set_rs1(loc, 4); break; } case R_RISCV_TLSDESC_HI20: // RISC-V TLSDESC uses the following code sequence to materialize // a TP-relative address in a0. 
// // .L0: // auipc tX, 0 // R_RISCV_TLSDESC_HI20 foo // l[d|w] tY, tX, 0 // R_RISCV_TLSDESC_LOAD_LO12_I .L0 // addi a0, tX, 0 // R_RISCV_TLSDESC_ADD_LO12_I .L0 // jalr t0, tY // R_RISCV_TLSDESC_CALL .L0 // // For non-dlopen'd DSO, we may relax the instructions to the following: // // // // auipc a0, %gottp_hi(a0) // l[d|w] a0, %gottp_lo(a0) // // For executable, if the TP offset is small enough, we'll relax // it to the following: // // // // // addi a0, zero, %tpoff_lo(a0) // // Otherwise, the following sequence is used: // // // // lui a0, %tpoff_hi(a0) // addi a0, a0, %tpoff_lo(a0) // // If the code-shrinking relaxation is disabled, we may leave // original useless instructions instead of deleting them, but we // accept that because relaxations are enabled by default. if (sym.has_tlsdesc(ctx) && removed_bytes == 0) utype(sym.get_tlsdesc_addr(ctx) + A - P); break; case R_RISCV_TLSDESC_LOAD_LO12: case R_RISCV_TLSDESC_ADD_LO12: case R_RISCV_TLSDESC_CALL: { if (removed_bytes == 4) break; const ElfRel &rel2 = find_paired_reloc(ctx, *this, rels, sym, i); Symbol &sym2 = *file.symbols[rel2.r_sym]; u64 S = sym2.get_addr(ctx); u64 A = rel2.r_addend; u64 P = get_addr() + rel2.r_offset - get_r_delta(*this, rel2.r_offset); switch (rel.r_type) { case R_RISCV_TLSDESC_LOAD_LO12: if (sym2.has_tlsdesc(ctx)) write_itype(loc, sym2.get_tlsdesc_addr(ctx) + A - P); else *(ul32 *)loc = 0x13; // nop break; case R_RISCV_TLSDESC_ADD_LO12: if (sym2.has_tlsdesc(ctx)) { write_itype(loc, sym2.get_tlsdesc_addr(ctx) + A - P); } else if (sym2.has_gottp(ctx)) { *(ul32 *)loc = 0x517; // auipc a0, utype(sym2.get_gottp_addr(ctx) + A - P); } else { *(ul32 *)loc = 0x537; // lui a0, utype(S + A - ctx.tp_addr); } break; case R_RISCV_TLSDESC_CALL: if (sym2.has_tlsdesc(ctx)) { // Do nothing } else if (sym2.has_gottp(ctx)) { // l[d|w] a0, *(ul32 *)loc = E::is_64 ? 
0x53503 : 0x52503; write_itype(loc, sym2.get_gottp_addr(ctx) + A - P); } else { i64 val = S + A - ctx.tp_addr; if (is_int(val, 12)) *(ul32 *)loc = 0x513; // addi a0,zero, else *(ul32 *)loc = 0x50513; // addi a0,a0, write_itype(loc, val); } break; } break; } case R_RISCV_ADD8: loc += S + A; break; case R_RISCV_ADD16: *(U16 *)loc += S + A; break; case R_RISCV_ADD32: *(U32 *)loc += S + A; break; case R_RISCV_ADD64: *(U64 *)loc += S + A; break; case R_RISCV_SUB8: loc -= S + A; break; case R_RISCV_SUB16: *(U16 *)loc -= S + A; break; case R_RISCV_SUB32: *(U32 *)loc -= S + A; break; case R_RISCV_SUB64: *(U64 *)loc -= S + A; break; case R_RISCV_ALIGN: { // A R_RISCV_ALIGN is followed by a NOP sequence. We need to remove // zero or more bytes so that the instruction after R_RISCV_ALIGN is // aligned to a given alignment boundary. // // We need to guarantee that the NOP sequence is valid after byte // removal (e.g. we can't remove the first 2 bytes of a 4-byte NOP). // For the sake of simplicity, we always rewrite the entire NOP sequence. 
i64 padding_bytes = rel.r_addend - removed_bytes; assert((padding_bytes & 1) == 0); i64 i = 0; for (; i <= padding_bytes - 4; i += 4) *(ul32 *)(loc + i) = 0x0000'0013; // nop if (i < padding_bytes) *(ul16 *)(loc + i) = 0x0001; // c.nop break; } case R_RISCV_RVC_BRANCH: check(S + A - P, -(1 << 8), 1 << 8); write_cbtype(loc, S + A - P); break; case R_RISCV_RVC_JUMP: check(S + A - P, -(1 << 11), 1 << 11); write_cjtype(loc, S + A - P); break; case R_RISCV_SUB6: *loc = (*loc & 0b1100'0000) | ((*loc - S - A) & 0b0011'1111); break; case R_RISCV_SET6: *loc = (*loc & 0b1100'0000) | ((S + A) & 0b0011'1111); break; case R_RISCV_SET8: *loc = S + A; break; case R_RISCV_SET16: *(U16 *)loc = S + A; break; case R_RISCV_SET32: *(U32 *)loc = S + A; break; case R_RISCV_PLT32: case R_RISCV_32_PCREL: *(U32 *)loc = S + A - P; break; case R_RISCV_SET_ULEB128: overwrite_uleb(loc, S + A); break; case R_RISCV_SUB_ULEB128: overwrite_uleb(loc, read_uleb(loc) - S - A); break; default: unreachable(); } } } template <> void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = base + rel.r_offset; SectionFragment *frag; i64 frag_addend; std::tie(frag, frag_addend) = get_fragment(ctx, rel); u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx); u64 A = frag ? 
frag_addend : (i64)rel.r_addend; switch (rel.r_type) { case R_RISCV_32: *(U32 *)loc = S + A; break; case R_RISCV_64: if (std::optional val = get_tombstone(sym, frag)) *(U64 *)loc = *val; else *(U64 *)loc = S + A; break; case R_RISCV_ADD8: *loc += S + A; break; case R_RISCV_ADD16: *(U16 *)loc += S + A; break; case R_RISCV_ADD32: *(U32 *)loc += S + A; break; case R_RISCV_ADD64: *(U64 *)loc += S + A; break; case R_RISCV_SUB8: *loc -= S + A; break; case R_RISCV_SUB16: *(U16 *)loc -= S + A; break; case R_RISCV_SUB32: *(U32 *)loc -= S + A; break; case R_RISCV_SUB64: *(U64 *)loc -= S + A; break; case R_RISCV_SUB6: *loc = (*loc & 0b1100'0000) | ((*loc - S - A) & 0b0011'1111); break; case R_RISCV_SET6: *loc = (*loc & 0b1100'0000) | ((S + A) & 0b0011'1111); break; case R_RISCV_SET8: *loc = S + A; break; case R_RISCV_SET16: *(U16 *)loc = S + A; break; case R_RISCV_SET32: *(U32 *)loc = S + A; break; case R_RISCV_SET_ULEB128: overwrite_uleb(loc, S + A); break; case R_RISCV_SUB_ULEB128: overwrite_uleb(loc, read_uleb(loc) - S - A); break; default: Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: " << rel; break; } } } template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); std::span> rels = get_rels(ctx); // Scan relocations for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; if (sym.is_ifunc()) sym.flags |= NEEDS_GOT | NEEDS_PLT; switch (rel.r_type) { case R_RISCV_32: if constexpr (E::is_64) scan_absrel(ctx, sym, rel); break; case R_RISCV_HI20: scan_absrel(ctx, sym, rel); break; case R_RISCV_CALL: case R_RISCV_CALL_PLT: case R_RISCV_PLT32: if (sym.is_imported) sym.flags |= NEEDS_PLT; break; case R_RISCV_GOT_HI20: sym.flags |= NEEDS_GOT; break; case R_RISCV_TLS_GOT_HI20: sym.flags |= NEEDS_GOTTP; break; case R_RISCV_TLS_GD_HI20: sym.flags |= NEEDS_TLSGD; break; case R_RISCV_TLSDESC_HI20: 
scan_tlsdesc(ctx, sym); break; case R_RISCV_32_PCREL: case R_RISCV_PCREL_HI20: scan_pcrel(ctx, sym, rel); break; case R_RISCV_TPREL_HI20: check_tlsle(ctx, sym, rel); break; case R_RISCV_64: case R_RISCV_BRANCH: case R_RISCV_JAL: case R_RISCV_PCREL_LO12_I: case R_RISCV_PCREL_LO12_S: case R_RISCV_LO12_I: case R_RISCV_LO12_S: case R_RISCV_TPREL_LO12_I: case R_RISCV_TPREL_LO12_S: case R_RISCV_TPREL_ADD: case R_RISCV_TLSDESC_LOAD_LO12: case R_RISCV_TLSDESC_ADD_LO12: case R_RISCV_TLSDESC_CALL: case R_RISCV_ADD8: case R_RISCV_ADD16: case R_RISCV_ADD32: case R_RISCV_ADD64: case R_RISCV_SUB8: case R_RISCV_SUB16: case R_RISCV_SUB32: case R_RISCV_SUB64: case R_RISCV_ALIGN: case R_RISCV_RVC_BRANCH: case R_RISCV_RVC_JUMP: case R_RISCV_RELAX: case R_RISCV_SUB6: case R_RISCV_SET6: case R_RISCV_SET8: case R_RISCV_SET16: case R_RISCV_SET32: case R_RISCV_SET_ULEB128: case R_RISCV_SUB_ULEB128: break; default: Error(ctx) << *this << ": unknown relocation: " << rel; } } } template <> u64 get_eflags(Context &ctx) { std::vector *> objs = ctx.objs; std::erase(objs, ctx.internal_obj); if (objs.empty()) return 0; u32 ret = objs[0]->get_eflags(); for (i64 i = 1; i < objs.size(); i++) { u32 flags = objs[i]->get_eflags(); if (flags & EF_RISCV_RVC) ret |= EF_RISCV_RVC; if ((flags & EF_RISCV_FLOAT_ABI) != (ret & EF_RISCV_FLOAT_ABI)) Error(ctx) << *objs[i] << ": cannot link object files with different" << " floating-point ABI from " << *objs[0]; if ((flags & EF_RISCV_RVE) != (ret & EF_RISCV_RVE)) Error(ctx) << *objs[i] << ": cannot link object files with different" << " EF_RISCV_RVE from " << *objs[0]; } return ret; } // Scan relocations to shrink a given section. template <> void shrink_section(Context &ctx, InputSection &isec) { std::span> rels = isec.get_rels(ctx); std::vector &deltas = isec.extra.r_deltas; i64 r_delta = 0; u8 *buf = (u8 *)isec.contents.data(); // True if we can use 2-byte instructions. 
This is usually true on // Unix because RV64GC is generally considered the baseline hardware. bool use_rvc = isec.file.get_eflags() & EF_RISCV_RVC; for (i64 i = 0; i < rels.size(); i++) { const ElfRel &r = rels[i]; Symbol &sym = *isec.file.symbols[r.r_sym]; auto remove = [&](i64 d) { r_delta += d; deltas.push_back(RelocDelta{r.r_offset, r_delta}); }; // Handling R_RISCV_ALIGN is mandatory. // // R_RISCV_ALIGN refers to NOP instructions. We need to eliminate some // or all of the instructions so that the instruction that immediately // follows the NOPs is aligned to a specified alignment boundary. if (r.r_type == R_RISCV_ALIGN) { // The total bytes of NOPs is stored to r_addend, so the next // instruction is r_addend away. u64 P = isec.get_addr() + r.r_offset - r_delta; u64 desired = align_to(P, bit_ceil(r.r_addend)); u64 actual = P + r.r_addend; if (desired != actual) remove(actual - desired); continue; } // Handling other relocations is optional. if (!ctx.arg.relax || i == rels.size() - 1 || rels[i + 1].r_type != R_RISCV_RELAX) continue; // Linker-synthesized symbols haven't been assigned their final // values when we are shrinking sections because actual values can // be computed only after we fix the file layout. Therefore, we // assume that relocations against such symbols are always // non-relaxable. if (sym.file == ctx.internal_obj) continue; switch (r.r_type) { case R_RISCV_CALL: case R_RISCV_CALL_PLT: { // These relocations refer to an AUIPC + JALR instruction pair to // allow to jump to anywhere in PC ± 2 GiB. If the jump target is // close enough to PC, we can use C.J, C.JAL or JAL instead. i64 dist = compute_distance(ctx, sym, isec, r); if (dist & 1) break; i64 rd = get_rd(buf + r.r_offset + 4); if (use_rvc && rd == 0 && is_int(dist, 12)) { // If rd is x0 and the jump target is within ±2 KiB, we can use // C.J, saving 6 bytes. 
remove(6); } else if (use_rvc && !E::is_64 && rd == 1 && is_int(dist, 12)) { // If rd is x1 and the jump target is within ±2 KiB, we can use // C.JAL. This is RV32 only because C.JAL is RV32-only instruction. remove(6); } else if (is_int(dist, 21)) { // If the jump target is within ±1 MiB, we can use JAL. remove(4); } break; } case R_RISCV_GOT_HI20: // A GOT_HI20 followed by a PCREL_LO12_I is used to load a value from // GOT. If the loaded value is a link-time constant, we can rewrite // the instructions to directly materialize the value, eliminating a // memory load. if (sym.is_absolute() && is_got_load_pair(ctx, isec, rels, i)) { u64 val = sym.get_addr(ctx) + r.r_addend; if (use_rvc && is_int(val, 6) && get_rd(buf + r.r_offset) != 0) { // Replace AUIPC + LD with C.LI. remove(6); } else if (is_int(val, 12)) { // Replace AUIPC + LD with ADDI. remove(4); } } break; case R_RISCV_HI20: { u64 val = sym.get_addr(ctx) + r.r_addend; i64 rd = get_rd(buf + r.r_offset); if (is_int(val, 12)) { // We can replace `lui t0, %hi(foo)` and `add t0, t0, %lo(foo)` // instruction pair with `add t0, x0, %lo(foo)` if foo's bits // [32:11] are all one or all zero. remove(4); } else if (use_rvc && rd != 0 && rd != 2 && is_int(val + 0x800, 18)) { // If the upper 20 bits can actually be represented in 6 bits, // we can use C.LUI instead of LUI. remove(2); } break; } case R_RISCV_TPREL_HI20: case R_RISCV_TPREL_ADD: // These relocations are used to add a high 20-bit value to the // thread pointer. The following two instructions materializes // TP + %tprel_hi20(foo) in %t0, for example. 
// // lui t0, %tprel_hi(foo) # R_RISCV_TPREL_HI20 // add t0, t0, tp # R_RISCV_TPREL_ADD // // Then thread-local variable `foo` is accessed with the low // 12-bit offset like this: // // sw t0, %tprel_lo(foo)(t0) # R_RISCV_TPREL_LO12_S // // However, if the variable is at TP ± 2 KiB, TP + %tprel_hi20(foo) // is the same as TP, so we can instead access the thread-local // variable directly using TP like this: // // sw t0, %tprel_lo(foo)(tp) // // Here, we remove `lui` and `add` if the offset is within ±2 KiB. if (i64 val = sym.get_addr(ctx) + r.r_addend - ctx.tp_addr; is_int(val, 12)) remove(4); break; case R_RISCV_TLSDESC_HI20: if (!sym.has_tlsdesc(ctx)) remove(4); break; case R_RISCV_TLSDESC_LOAD_LO12: case R_RISCV_TLSDESC_ADD_LO12: { const ElfRel &rel2 = find_paired_reloc(ctx, isec, rels, sym, i); Symbol &sym2 = *isec.file.symbols[rel2.r_sym]; if (r.r_type == R_RISCV_TLSDESC_LOAD_LO12) { if (!sym2.has_tlsdesc(ctx)) remove(4); } else { assert(r.r_type == R_RISCV_TLSDESC_ADD_LO12); if (!sym2.has_tlsdesc(ctx) && !sym2.has_gottp(ctx)) if (i64 val = sym2.get_addr(ctx) + rel2.r_addend - ctx.tp_addr; is_int(val, 12)) remove(4); } break; } } } isec.sh_size -= r_delta; } // ISA name handlers // // An example of ISA name is "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zicsr2p0". // An ISA name starts with the base name (e.g. "rv64i2p1") followed by // ISA extensions separated by underscores. // // There are lots of ISA extensions defined for RISC-V, and they are // identified by name. Some extensions are of single-letter alphabet such // as "m" or "q". Newer extension names start with "z" followed by one or // more alphabets (i.e. "zicsr"). "s" and "x" prefixes are reserved // for supervisor-level extensions and private extensions, respectively. // // Each extension consists of a name, a major version and a minor version. // For example, "m2p0" indicates the "m" extension of version 2.0. "p" is // just a separator. 
Versions are often omitted in documents, but they are // mandatory in .riscv.attributes. Likewise, abbreviations such as "G" // (which is short for "IMAFD") are not allowed in .riscv.attributes. // // Each RISC-V object file contains an ISA string enumerating extensions // used by the object file. We need to merge input objects' ISA strings // into a single ISA string. // // In order to guarantee string uniqueness, extensions have to be ordered // in a specific manner. The exact rule is unfortunately a bit complicated. // // The following functions take care of ISA strings. namespace { struct Extn { std::string name; i64 major; i64 minor; }; } // As per the RISC-V spec, the extension names must be sorted in a very // specific way, and unfortunately that's not just an alphabetical order. // For example, rv64imafd is a legal ISA string, whereas rv64iafdm is not. // The exact rule is somewhat arbitrary. // // This function returns true if the first extension name should precede // the second one as per the rule. 
static bool extn_name_less(std::string_view x, std::string_view y) { auto get_single_letter_rank = [](char c) -> i64 { std::string_view exts = "iemafdqlcbkjtpvnh"; size_t pos = exts.find_first_of(c); if (pos != exts.npos) return pos; return c - 'a' + exts.size(); }; auto get_rank = [&](std::string_view str) -> i64 { switch (str[0]) { case 'x': return 1 << 20; case 's': return 1 << 19; case 'z': return (1 << 18) + get_single_letter_rank(str[1]); default: return get_single_letter_rank(str[0]); } }; return std::tuple{get_rank(x), x} < std::tuple{get_rank(y), y}; } static std::vector parse_arch_string(std::string_view str) { auto flags = std::regex_constants::optimize | std::regex_constants::ECMAScript; static std::regex re(R"(^([a-z]|[a-z][a-z0-9]*[a-z])(\d+)p(\d+)(_|$))", flags); std::vector vec; for (;;) { std::cmatch m; if (!std::regex_search(str.data(), str.data() + str.size(), m, re)) return {}; vec.push_back(Extn{m[1], (i64)std::stoul(m[2]), (i64)std::stoul(m[3])}); if (m[4].length() == 0) return vec; str = str.substr(m.length()); } } static std::vector merge_extensions(std::span x, std::span y) { std::vector vec; // The base part (i.e. "rv64i" or "rv32i") must match. 
if (x[0].name != y[0].name) return {}; // Merge ISA extension strings while (!x.empty() && !y.empty()) { if (x[0].name == y[0].name) { if (std::tuple{x[0].major, x[0].minor} < std::tuple{y[0].major, y[0].minor}) vec.push_back(y[0]); else vec.push_back(x[0]); x = x.subspan(1); y = y.subspan(1); } else if (extn_name_less(x[0].name, y[0].name)) { vec.push_back(x[0]); x = x.subspan(1); } else { vec.push_back(y[0]); y = y.subspan(1); } } append(vec, x); append(vec, y); return vec; } static std::string to_string(std::span v) { std::ostringstream os; os << v[0].name << v[0].major << 'p' << v[0].minor; for (Extn &e : v.subspan(1)) os << '_' << e.name << e.major << 'p' << e.minor; return os.str(); } // // Output .riscv.attributes class // template <> void RiscvAttributesSection::update_shdr(Context &ctx) { if (!contents.empty()) return; i64 stack = -1; std::vector arch; bool unaligned = false; for (ObjectFile *file : ctx.objs) { if (file->extra.stack_align) { i64 val = *file->extra.stack_align; if (stack != -1 && stack != val) Error(ctx) << *file << ": stack alignment requirement mistmatch"; stack = val; } if (file->extra.arch) { std::vector arch2 = parse_arch_string(*file->extra.arch); if (arch2.empty()) Error(ctx) << *file << ": corrupted .riscv.attributes ISA string: " << *file->extra.arch; if (arch.empty()) { arch = arch2; } else { arch = merge_extensions(arch, arch2); if (arch.empty()) Error(ctx) << *file << ": incompatible .riscv.attributes ISA string: " << *file->extra.arch; } } if (file->extra.unaligned_access) unaligned = true; } if (arch.empty()) return; std::string arch_str = to_string(arch); contents.resize(arch_str.size() + 100); u8 *p = (u8 *)contents.data(); *p++ = 'A'; // Format version U32 *sub_sz = (U32 *)p; // Sub-section length p += 4; p += write_string(p, "riscv"); // Vendor name u8 *sub_sub_start = p; *p++ = ELF_TAG_FILE; // Sub-section tag U32 *sub_sub_sz = (U32 *)p; // Sub-sub-section length p += 4; if (stack != -1) { p += write_uleb(p, 
ELF_TAG_RISCV_STACK_ALIGN); p += write_uleb(p, stack); } p += write_uleb(p, ELF_TAG_RISCV_ARCH); p += write_string(p, arch_str); if (unaligned) { p += write_uleb(p, ELF_TAG_RISCV_UNALIGNED_ACCESS); p += write_uleb(p, 1); } i64 sz = p - (u8 *)contents.data(); *sub_sz = sz - 1; *sub_sub_sz = p - sub_sub_start; contents.resize(sz); this->shdr.sh_size = sz; } template <> void RiscvAttributesSection::copy_buf(Context &ctx) { write_vector(ctx.buf + this->shdr.sh_offset, contents); } } // namespace mold #endif ================================================ FILE: src/arch-s390x.cc ================================================ // This file contains code for the IBM z/Architecture 64-bit ISA, which is // commonly referred to as "s390x" on Linux. // // z/Architecture is a 64-bit CISC ISA developed by IBM around 2000 for // IBM's "big iron" mainframe computers. The computers are direct // descendents of IBM System/360 all the way back in 1966. I've never // actually seen a mainframe, and you probaly haven't either, but it looks // like the mainframe market is still large enough to sustain its ecosystem. // Ubuntu for example provides the official support for s390x as of 2022. // Since they are being actively maintained, we need to support them. // // As an instruction set, s390x isn't particularly odd. It has 16 general- // purpose registers. Instructions are 2, 4 or 6 bytes long and always // aligned to 2 bytes boundaries. Despite unfamiliarty, I found that it // just feels like an x86-64 in a parallel universe. 
// // Here is the register usage in this ABI: // // r0-r1: reserved as scratch registers so we can use them in our PLT // r2: parameter passing and return values // r3-r6: parameter passing // r12: address of GOT if position-independent code // r14: return address // r15: stack pointer // a1: upper 32 bits of TP (thread pointer) // a2: lower 32 bits of TP (thread pointer) // // Thread-local storage (TLS) is supported on s390x in the same way as it // is on other targets with one exeption. On other targets, __tls_get_addr // is used to get an address of a thread-local variable. On s390x, // __tls_get_offset is used instead. The difference is __tls_get_offset // returns an address of a thread-local variable as an offset from TP. So // we need to add TP to a return value before use. I don't know why it is // different, but that is the way it is. // // https://github.com/IBM/s390x-abi/releases/download/v1.6.1/lzsabi_s390x.pdf #if MOLD_S390X #include "mold.h" namespace mold { using E = S390X; static void write_mid20(u8 *loc, u64 val) { *(ub32 *)loc |= (bits(val, 11, 0) << 16) | (bits(val, 19, 12) << 8); } template <> void write_plt_header(Context &ctx, u8 *buf) { static u8 insn[] = { // Compute the offset into .rela.plt. This is equivalent to // (%r0 - %r1 - 48 - 14) * 3/2 where %r0 is the PLT entry address // plus 14, %r1 is the start address of .plt, and 48 is the size // of this PLT header. We multiply by 3/2 because each PLT entry // is 16 bytes, whereas each .rela.plt entry is 24 bytes. 0xb9, 0x09, 0x00, 0x01, // sgr %r0, %r1 0xa7, 0x0b, 0xff, 0xc2, // aghi %r0, -62 0xeb, 0x10, 0x00, 0x01, 0x00, 0x0c, // srlg %r1, %r0, 1 0xb9, 0x08, 0x00, 0x01, // agr %r0, %r1 // Store the computed value to 56(%r15) and .got.plt[1] to 48(%15) // where %r15 is the stack pointer. 
0xe3, 0x00, 0xf0, 0x38, 0x00, 0x24, // stg %r0, 56(%r15) 0xc0, 0x10, 0, 0, 0, 0, // larl %r1, GOTPLT_OFFSET 0xd2, 0x07, 0xf0, 0x30, 0x10, 0x08, // mvc 48(8, %r15), 8(%r1) // Branch to _dl_runtime_resolve 0xe3, 0x10, 0x10, 0x10, 0x00, 0x04, // lg %r1, 16(%r1) 0x07, 0xf1, // br %r1 0x00, 0x00, 0x00, 0x00, // (filler) }; memcpy(buf, insn, sizeof(insn)); *(ub32 *)(buf + 26) = (ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 24) >> 1; } template <> void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { static u8 insn[] = { 0xc0, 0x10, 0, 0, 0, 0, // larl %r1, GOTPLT_ENTRY_OFFSET 0xe3, 0x10, 0x10, 0x00, 0x00, 0x04, // lg %r1, (%r1) 0x0d, 0x01, // basr %r0, %r1 0x00, 0x00, // (filler) }; memcpy(buf, insn, sizeof(insn)); *(ub32 *)(buf + 2) = (sym.get_gotplt_addr(ctx) - sym.get_plt_addr(ctx)) >> 1; } template <> void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { static u8 insn[] = { 0xc0, 0x10, 0, 0, 0, 0, // larl %r1, GOT_ENTRY_OFFSET 0xe3, 0x10, 0x10, 0x00, 0x00, 0x04, // lg %r1, (%r1) 0x07, 0xf1, // br %r1 0x00, 0x00, // (filler) }; memcpy(buf, insn, sizeof(insn)); *(ub32 *)(buf + 2) = (sym.get_got_pltgot_addr(ctx) - sym.get_plt_addr(ctx)) >> 1; } template <> void EhFrameSection::apply_eh_reloc(Context &ctx, const ElfRel &rel, u64 offset, u64 val) { u8 *loc = ctx.buf + this->shdr.sh_offset + offset; switch (rel.r_type) { case R_NONE: break; case R_390_PC32: *(ub32 *)loc = val - this->shdr.sh_addr - offset; break; case R_390_64: *(ub64 *)loc = val; break; default: Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel; } } template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = base + rel.r_offset; u64 S = sym.get_addr(ctx); u64 A = rel.r_addend; u64 P = get_addr() + rel.r_offset; u64 G = sym.get_got_idx(ctx) * sizeof(Word); u64 GOT = 
ctx.got->shdr.sh_addr; auto check = [&](i64 val, i64 lo, i64 hi) { check_range(ctx, i, val, lo, hi); }; auto check_dbl = [&](i64 val, i64 lo, i64 hi) { // R_390_*DBL relocs should never refer to a symbol at an odd address check(val, lo, hi); if (val & 1) Error(ctx) << *this << ": misaligned symbol " << sym << " for relocation " << rel; }; switch (rel.r_type) { case R_390_64: break; case R_390_8: check(S + A, 0, 1 << 8); *loc = S + A; break; case R_390_12: check(S + A, 0, 1 << 12); *(ul16 *)loc |= bits(S + A, 11, 0); break; case R_390_16: check(S + A, 0, 1 << 16); *(ub16 *)loc = S + A; break; case R_390_20: check(S + A, 0, 1 << 20); write_mid20(loc, S + A); break; case R_390_32: case R_390_PLT32: check(S + A, 0, 1LL << 32); *(ub32 *)loc = S + A; break; case R_390_PC12DBL: case R_390_PLT12DBL: check_dbl(S + A - P, -(1 << 12), 1 << 12); *(ul16 *)loc |= bits(S + A - P, 12, 1); break; case R_390_PC16: check(S + A - P, -(1 << 15), 1 << 15); *(ub16 *)loc = S + A - P; break; case R_390_PC32: check(S + A - P, -(1LL << 31), 1LL << 31); *(ub32 *)loc = S + A - P; break; case R_390_PC64: case R_390_PLT64: *(ub64 *)loc = S + A - P; break; case R_390_PC16DBL: case R_390_PLT16DBL: check_dbl(S + A - P, -(1 << 16), 1 << 16); *(ub16 *)loc = (S + A - P) >> 1; break; case R_390_PC24DBL: case R_390_PLT24DBL: check_dbl(S + A - P, -(1 << 24), 1 << 24); *(ub32 *)loc |= bits(S + A - P, 24, 1); break; case R_390_PC32DBL: case R_390_PLT32DBL: check_dbl(S + A - P, -(1LL << 32), 1LL << 32); *(ub32 *)loc = (S + A - P) >> 1; break; case R_390_GOT12: case R_390_GOTPLT12: check(G + A, 0, 1 << 12); *(ul16 *)loc |= bits(G + A, 11, 0); break; case R_390_GOT16: case R_390_GOTPLT16: check(G + A, 0, 1 << 16); *(ub16 *)loc = G + A; break; case R_390_GOT20: case R_390_GOTPLT20: check(G + A, 0, 1 << 20); write_mid20(loc, G + A); break; case R_390_GOT32: case R_390_GOTPLT32: check(G + A, 0, 1LL << 32); *(ub32 *)loc = G + A; break; case R_390_GOT64: case R_390_GOTPLT64: *(ub64 *)loc = G + A; break; case 
R_390_GOTOFF16: case R_390_PLTOFF16: check(S + A - GOT, -(1 << 15), 1 << 15); *(ub16 *)loc = S + A - GOT; break; case R_390_GOTOFF32: case R_390_PLTOFF32: check(S + A - GOT, -(1LL << 31), 1LL << 31); *(ub32 *)loc = S + A - GOT; break; case R_390_GOTOFF64: case R_390_PLTOFF64: *(ub64 *)loc = S + A - GOT; break; case R_390_GOTPC: *(ub64 *)loc = GOT + A - P; break; case R_390_GOTPCDBL: check_dbl(GOT + A - P, -(1LL << 32), 1LL << 32); *(ub32 *)loc = (GOT + A - P) >> 1; break; case R_390_GOTENT: // If we can relax a GOT-loading LGRL to an address-materializing // LARL, do that. The format of LGRL is 0xc 0x4 0x8 followed // by a 32-bit offset. LARL is 0xc 0x0 0x0. if (ctx.arg.relax && sym.is_pcrel_linktime_const(ctx)) { u64 op = *(ub16 *)(loc - 2); u64 val = S + A - P; if ((op & 0xff0f) == 0xc408 && A == 2 && (val & 1) == 0 && is_int(val, 33)) { *(ub16 *)(loc - 2) = 0xc000 | (op & 0x00f0); *(ub32 *)loc = val >> 1; break; } } check_dbl(GOT + G + A - P, -(1LL << 32), 1LL << 32); *(ub32 *)loc = (GOT + G + A - P) >> 1; break; case R_390_TLS_LE32: *(ub32 *)loc = S + A - ctx.tp_addr; break; case R_390_TLS_LE64: *(ub64 *)loc = S + A - ctx.tp_addr; break; case R_390_TLS_GOTIE20: write_mid20(loc, sym.get_gottp_addr(ctx) + A - GOT); break; case R_390_TLS_IEENT: *(ub32 *)loc = (sym.get_gottp_addr(ctx) + A - P) >> 1; break; case R_390_TLS_GD32: if (sym.has_tlsgd(ctx)) *(ub32 *)loc = sym.get_tlsgd_addr(ctx) + A - GOT; else if (sym.has_gottp(ctx)) *(ub32 *)loc = sym.get_gottp_addr(ctx) + A - GOT; else *(ub32 *)loc = S + A - ctx.tp_addr; break; case R_390_TLS_GD64: if (sym.has_tlsgd(ctx)) *(ub64 *)loc = sym.get_tlsgd_addr(ctx) + A - GOT; else if (sym.has_gottp(ctx)) *(ub64 *)loc = sym.get_gottp_addr(ctx) + A - GOT; else *(ub64 *)loc = S + A - ctx.tp_addr; break; case R_390_TLS_GDCALL: if (sym.has_tlsgd(ctx)) { // do nothing } else if (sym.has_gottp(ctx)) { // lg %r2, 0(%r2, %r12) static u8 insn[] = { 0xe3, 0x22, 0xc0, 0x00, 0x00, 0x04 }; memcpy(loc, insn, sizeof(insn)); } else { // nop 
static u8 insn[] = { 0xc0, 0x04, 0x00, 0x00, 0x00, 0x00 }; memcpy(loc, insn, sizeof(insn)); } break; case R_390_TLS_LDM32: if (ctx.got->has_tlsld(ctx)) *(ub32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT; else *(ub32 *)loc = ctx.dtp_addr - ctx.tp_addr; break; case R_390_TLS_LDM64: if (ctx.got->has_tlsld(ctx)) *(ub64 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT; else *(ub64 *)loc = ctx.dtp_addr - ctx.tp_addr; break; case R_390_TLS_LDCALL: if (!ctx.got->has_tlsld(ctx)) { // nop static u8 insn[] = { 0xc0, 0x04, 0x00, 0x00, 0x00, 0x00 }; memcpy(loc, insn, sizeof(insn)); } break; case R_390_TLS_LDO32: *(ub32 *)loc = S + A - ctx.dtp_addr; break; case R_390_TLS_LDO64: *(ub64 *)loc = S + A - ctx.dtp_addr; break; default: unreachable(); } } }

// Applies this section's relocations to the section image at `base`.
// Judging by its name this is the path for sections without SHF_ALLOC
// (scan_relocations() asserts SHF_ALLOC for the other path) -- confirm
// against the caller.
//
// Only R_390_32, R_390_64 and R_390_TLS_LDO64 are accepted here; any
// other relocation type is reported with Fatal().
template <>
void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) {
  std::span> rels = get_rels(ctx);

  for (i64 i = 0; i < rels.size(); i++) {
    const ElfRel &rel = rels[i];
    if (rel.r_type == R_NONE || record_undef_error(ctx, rel))
      continue;

    Symbol &sym = *file.symbols[rel.r_sym];
    u8 *loc = base + rel.r_offset;

    // If get_fragment() yields a section fragment for this relocation,
    // its address and addend are used instead of the symbol's.
    SectionFragment *frag;
    i64 frag_addend;
    std::tie(frag, frag_addend) = get_fragment(ctx, rel);

    u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx);
    u64 A = frag ? frag_addend : (i64)rel.r_addend;

    // The relocation index goes first and the value to be checked second.
    // Passing them the other way round would range-check the index `i`
    // instead of `val`, so an overflowing value would never be reported.
    auto check = [&](i64 val, i64 lo, i64 hi) {
      check_range(ctx, i, val, lo, hi);
    };

    switch (rel.r_type) {
    case R_390_32:
      check(S + A, 0, 1LL << 32);
      *(ub32 *)loc = S + A;
      break;
    case R_390_64:
      // get_tombstone() may supply a replacement value; if it does,
      // that value is written instead of S + A.
      if (std::optional val = get_tombstone(sym, frag))
        *(ub64 *)loc = *val;
      else
        *(ub64 *)loc = S + A;
      break;
    case R_390_TLS_LDO64:
      if (std::optional val = get_tombstone(sym, frag))
        *(ub64 *)loc = *val;
      else
        *(ub64 *)loc = S + A - ctx.dtp_addr;
      break;
    default:
      Fatal(ctx) << *this << ": apply_reloc_nonalloc: " << rel;
    }
  }
}

template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); std::span> rels = get_rels(ctx); // Scan relocations for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; if (sym.is_ifunc()) sym.flags |= NEEDS_GOT | NEEDS_PLT; switch (rel.r_type) { case R_390_8: case R_390_12: case R_390_16: case R_390_20: case R_390_32: scan_absrel(ctx, sym, rel); break; case R_390_PC12DBL: case R_390_PC16: case R_390_PC16DBL: case R_390_PC24DBL: case R_390_PC32: case R_390_PC32DBL: case R_390_PC64: scan_pcrel(ctx, sym, rel); break; case R_390_GOT12: case R_390_GOT16: case R_390_GOT20: case R_390_GOT32: case R_390_GOT64: case R_390_GOTOFF16: case R_390_GOTOFF32: case R_390_GOTOFF64: case R_390_GOTPLT12: case R_390_GOTPLT16: case R_390_GOTPLT20: case R_390_GOTPLT32: case R_390_GOTPLT64: case R_390_GOTPC: case R_390_GOTPCDBL: case R_390_GOTENT: sym.flags |= NEEDS_GOT; break; case R_390_PLT12DBL: case R_390_PLT16DBL: case R_390_PLT24DBL: case R_390_PLT32: case R_390_PLT32DBL: case R_390_PLT64: case R_390_PLTOFF16: case R_390_PLTOFF32: case R_390_PLTOFF64: if (sym.is_imported) sym.flags |= NEEDS_PLT; break; case R_390_TLS_GOTIE20: case R_390_TLS_IEENT: sym.flags |= NEEDS_GOTTP; break; case R_390_TLS_GD32: case R_390_TLS_GD64: // We always want to relax calls to __tls_get_offset() in statically-
// linked executables because __tls_get_offset() in libc.a just calls // abort(). if (ctx.arg.static_ || (ctx.arg.relax && sym.is_tprel_linktime_const(ctx))) { // Do nothing } else if (ctx.arg.relax && sym.is_tprel_runtime_const(ctx)) { sym.flags |= NEEDS_GOTTP; } else { sym.flags |= NEEDS_TLSGD; } break; case R_390_TLS_LDM32: case R_390_TLS_LDM64: if (ctx.arg.static_ || (ctx.arg.relax && !ctx.arg.shared)) { // Do nothing } else { ctx.needs_tlsld = true; } break; case R_390_TLS_LE32: case R_390_TLS_LE64: check_tlsle(ctx, sym, rel); break; case R_390_64: case R_390_TLS_LDO32: case R_390_TLS_LDO64: case R_390_TLS_GDCALL: case R_390_TLS_LDCALL: break; default: Error(ctx) << *this << ": unknown relocation: " << rel; } } } } // namespace mold #endif ================================================ FILE: src/arch-sh4.cc ================================================ // SH-4 (SuperH 4) is a 32-bit RISC ISA developed by Hitachi in the early // '90s. Some relatively powerful systems were developed with SH-4. // A notable example is Sega's Dreamcast game console which debuted in 1998. // Hitachi later spun off its semiconductor division as an independent // company, Renesas, and Renesas is still selling SH-4 processors for the // embedded market. It has never been as popular as ARM is, and its // popularity continues to decline though. // // SH-4's most distinctive feature compared to other RISC ISAs is that its // instructions are 16 bits in length instead of more common 32 bits for // better code density. This difference affects various aspects of its // instruction set as shown below: // // - SH-4 has 16 general-purpose registers (GPRs) instead of the most // common 32 GPR configuration to save one bit to specify a register. // // - Binary instructions such as ADD normally take three registers in // RISC ISAs (e.g. x ← y ⊕ z where x, y and z are registers), but // SH-4's instructions take only two registers. 
The result of an // operation is written to one of the source registers (e.g. x ← x ⊕ y). // // - Usual RISC ISAs have "load high" and "load low" instructions to set // an immediate to most significant and least significant bits in a // register to construct a full 32-bit value in a register. This // technique is hard to use in SH-4, as 16 bit instructions are too // small to contain large immediates. On SH-4, large immediates are // loaded from memory using `mov.l` PC-relative load instruction. // // - Many RISC ISAs are, despite their name, actually fairly complex. // They tend to have hundreds if not thousands of different instructions. // SH-4 doesn't really have that many instructions because its 16-bit // machine code simply can't encode many different opcodes. As a // result, the number of relocations the linker has to support is also // small. // // Beside these, SH-4 has a delay branch slot just like contemporary MIPS // and SPARC. That is, one instruction after a branch instruction will // always be executed even if the branch is taken. Delay branch slot allows // a pipelined CPU to start and finish executing an instruction after a // branch regardless of the branch's condition, simplifying the processor's // implementation. It's considered a bad premature optimization nowadays, // though. Modern RISC processors don't have it. // // Here are notes about the SH-4 psABI: // // - If a source file is compiled with -fPIC, each function starts // with a piece of code to store the address of .got to %r12. // We can use the register in our PLT for position-independent output. // // - Even though it uses the RELA-type relocations, relocation addends // are stored not to the r_addend field but to the relocated section // contents for some reason. Therefore, it's effectively REL. // // - It looks like the ecosystem has bit-rotted. Some tests, especially // one using C++ exceptions, don't pass even with GNU ld. 
// // - GCC/SH4 tends to write dynamically-relocated data into .text, so the // output from the linker contains lots of text relocations. That's not // a problem with embedded programming, I guess. #if MOLD_SH4LE || MOLD_SH4BE #include "mold.h" namespace mold { using E = MOLD_TARGET; // Even though SH-4 uses RELA-type relocations, addends are stored to // relocated places for some reason. template <> i64 get_addend(u8 *loc, const ElfRel &rel) { switch (rel.r_type) { case R_SH_DIR32: case R_SH_REL32: case R_SH_TLS_GD_32: case R_SH_TLS_LD_32: case R_SH_TLS_LDO_32: case R_SH_TLS_IE_32: case R_SH_TLS_LE_32: case R_SH_TLS_DTPMOD32: case R_SH_TLS_DTPOFF32: case R_SH_TLS_TPOFF32: case R_SH_GOT32: case R_SH_PLT32: case R_SH_GOTOFF: case R_SH_GOTPC: case R_SH_GOTPLT32: return *(U32 *)loc; default: return 0; } } template <> void write_addend(u8 *loc, i64 val, const ElfRel &rel) { switch (rel.r_type) { case R_SH_DIR32: case R_SH_REL32: case R_SH_TLS_GD_32: case R_SH_TLS_LD_32: case R_SH_TLS_LDO_32: case R_SH_TLS_IE_32: case R_SH_TLS_LE_32: case R_SH_TLS_DTPMOD32: case R_SH_TLS_DTPOFF32: case R_SH_TLS_TPOFF32: case R_SH_GOT32: case R_SH_PLT32: case R_SH_GOTOFF: case R_SH_GOTPC: case R_SH_GOTPLT32: *(U32 *)loc = val; } } template <> void write_plt_header(Context &ctx, u8 *buf) { if (ctx.arg.pic) { constexpr U16 insn[] = { 0xd202, // mov.l 1f, r2 0x32cc, // add r12, r2 0x5022, // mov.l @(8, r2), r0 0x5221, // mov.l @(4, r2), r2 0x402b, // jmp @r0 0xe000, // mov #0, r0 0, 0, // 1: .long GOTPLT }; static_assert(sizeof(insn) == E::plt_hdr_size); memcpy(buf, insn, sizeof(insn)); *(U32 *)(buf + 12) = ctx.gotplt->shdr.sh_addr - ctx.got->shdr.sh_addr; } else { constexpr U16 insn[] = { 0xd202, // mov.l 1f, r2 0x5022, // mov.l @(8, r2), r0 0x5221, // mov.l @(4, r2), r2 0x402b, // jmp @r0 0xe000, // mov #0, r0 0xfffd, // (illegal) 0, 0, // 1: .long GOTPLT }; static_assert(sizeof(insn) == E::plt_hdr_size); memcpy(buf, insn, sizeof(insn)); *(U32 *)(buf + 12) = ctx.gotplt->shdr.sh_addr; } } 
template <> void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { if (ctx.arg.pic) { constexpr U16 insn[] = { 0xd001, // mov.l 1f, r0 0x00ce, // mov.l @(r0, r12), r0 0x402b, // jmp @r0 0xd101, // mov.l 2f, r1 0, 0, // 1: .long GOTPLT_ENTRY 0, 0, // 2: .long INDEX_IN_RELPLT }; static_assert(sizeof(insn) == E::plt_size); memcpy(buf, insn, sizeof(insn)); *(U32 *)(buf + 8) = sym.get_gotplt_addr(ctx) - ctx.got->shdr.sh_addr; *(U32 *)(buf + 12) = sym.get_plt_idx(ctx) * sizeof(ElfRel); } else { constexpr U16 insn[] = { 0xd001, // mov.l 1f, r0 0x6002, // mov.l @r0, r0 0x402b, // jmp @r0 0xd101, // mov.l 2f, r1 0, 0, // 1: .long GOTPLT_ENTRY 0, 0, // 2: .long INDEX_IN_RELPLT }; static_assert(sizeof(insn) == E::plt_size); memcpy(buf, insn, sizeof(insn)); *(U32 *)(buf + 8) = sym.get_gotplt_addr(ctx); *(U32 *)(buf + 12) = sym.get_plt_idx(ctx) * sizeof(ElfRel); } } template <> void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { if (ctx.arg.pic) { constexpr U16 insn[] = { 0xd001, // mov.l 1f, r0 0x00ce, // mov.l @(r0, r12), r0 0x402b, // jmp @r0 0x0009, // nop 0, 0, // 1: .long GOT_ENTRY }; static_assert(sizeof(insn) == E::pltgot_size); memcpy(buf, insn, sizeof(insn)); *(U32 *)(buf + 8) = sym.get_got_pltgot_addr(ctx) - ctx.got->shdr.sh_addr; } else { constexpr U16 insn[] = { 0xd001, // mov.l 1f, r0 0x6002, // mov.l @r0, r0 0x402b, // jmp @r0 0x0009, // nop 0, 0, // 1: .long GOT_ENTRY }; static_assert(sizeof(insn) == E::pltgot_size); memcpy(buf, insn, sizeof(insn)); *(U32 *)(buf + 8) = sym.get_got_pltgot_addr(ctx); } } template <> void EhFrameSection::apply_eh_reloc(Context &ctx, const ElfRel &rel, u64 offset, u64 val) { u8 *loc = ctx.buf + this->shdr.sh_offset + offset; switch (rel.r_type) { case R_NONE: break; case R_SH_DIR32: *(U32 *)loc = val; break; case R_SH_REL32: *(U32 *)loc = val - this->shdr.sh_addr - offset; break; default: Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel; } } template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 
*base) { std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = base + rel.r_offset; u64 S = sym.get_addr(ctx); u64 A = get_addend(loc, rel); u64 P = get_addr() + rel.r_offset; u64 G = sym.get_got_idx(ctx) * sizeof(Word); u64 GOT = ctx.got->shdr.sh_addr; switch (rel.r_type) { case R_SH_DIR32: break; case R_SH_REL32: case R_SH_PLT32: *(U32 *)loc = S + A - P; break; case R_SH_GOT32: *(U32 *)loc = G; break; case R_SH_GOTPC: *(U32 *)loc = GOT + A - P; break; case R_SH_GOTOFF: *(U32 *)loc = S + A - GOT; break; case R_SH_TLS_GD_32: *(U32 *)loc = sym.get_tlsgd_addr(ctx) + A - GOT; break; case R_SH_TLS_LD_32: *(U32 *)loc = ctx.got->get_tlsld_addr(ctx) + A - GOT; break; case R_SH_TLS_LDO_32: *(U32 *)loc = S + A - ctx.dtp_addr; break; case R_SH_TLS_IE_32: *(U32 *)loc = sym.get_gottp_addr(ctx) + A - GOT; break; case R_SH_TLS_LE_32: *(U32 *)loc = S + A - ctx.tp_addr; break; default: unreachable(); } } } template <> void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = base + rel.r_offset; SectionFragment *frag; i64 frag_addend; std::tie(frag, frag_addend) = get_fragment(ctx, rel); u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx); u64 A = frag ? 
frag_addend : get_addend(loc, rel); switch (rel.r_type) { case R_SH_DIR32: if (std::optional val = get_tombstone(sym, frag)) *(U32 *)loc = *val; else *(U32 *)loc = S + A; break; default: Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: " << rel; } } } template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; if (sym.is_ifunc()) Error(ctx) << sym << ": GNU ifunc symbol is not supported on sh4"; switch (rel.r_type) { case R_SH_REL32: scan_pcrel(ctx, sym, rel); break; case R_SH_GOT32: sym.flags |= NEEDS_GOT; break; case R_SH_PLT32: if (sym.is_imported) sym.flags |= NEEDS_PLT; break; case R_SH_TLS_GD_32: sym.flags |= NEEDS_TLSGD; break; case R_SH_TLS_LD_32: ctx.needs_tlsld = true; break; case R_SH_TLS_IE_32: sym.flags |= NEEDS_GOTTP; break; case R_SH_TLS_LE_32: check_tlsle(ctx, sym, rel); break; case R_SH_DIR32: case R_SH_GOTPC: case R_SH_GOTOFF: case R_SH_TLS_LDO_32: break; default: Fatal(ctx) << *this << ": unknown relocation: " << rel; } } } } // namespace mold #endif ================================================ FILE: src/arch-sparc64.cc ================================================ // SPARC is a RISC ISA developed by Sun Microsystems. // // The byte order of the processor is big-endian. Anything larger than a // byte is stored in the "reverse" order compared to little-endian // processors such as x86-64. // // All instructions are 4 bytes long and aligned to 4 bytes boundaries. // // A notable feature of SPARC is that, unlike other RISC ISAs, it doesn't // need range extension thunks. It is because the SPARC's CALL instruction // contains a whopping 30 bits immediate. 
The processor scales it by 4 to // extend it to 32 bits (this is doable because all instructions are // aligned to 4 bytes boundaries, so the least significant two bits are // always zero). That means CALL's reach is PC ± 2 GiB, eliminating the // need of range extension thunks. It comes with the cost that the CALL // instruction alone takes 1/4th of the instruction encoding space, // though. // // SPARC has 32 general purpose registers. CALL instruction saves a return // address to %o7, which is an alias for %r15. Thread pointer is stored to // %g7 which is %r7. // // SPARC does not have PC-relative load/store instructions. To access data // in the position-independent manner, we usually first set the address of // .got to, for example, %l7, with the following piece of code // // sethi %hi(. - _GLOBAL_OFFSET_TABLE_), %l7 // add %l7, %lo(. - _GLOBAL_OFFSET_TABLE_), %l7 // call __sparc_get_pc_thunk.l7 // nop // // where __sparc_get_pc_thunk.l7 is defined as // // retl // add %o7, %l7, %l7 // // . SETHI and the following ADD materialize a 32 bits offset to .got. // CALL instruction sets a return address to %o7, and the subsequent ADD // adds it to the GOT offset to materialize the absolute address of .got. // // Note that we have a NOP after CALL and an ADD after RETL because of // SPARC's delay branch slots. That is, the SPARC processor always // executes one instruction after a branch even if the branch is taken. // This may seem like an odd behavior, and indeed it is considered as such // (that's a premature optimization for the early pipelined SPARC // processors), but that's been a part of the ISA's spec so that's what it // is. // // Note also that the .got address obtained this way is not shared between // functions, so functions can use an arbitrary register to hold the .got // address. That also means each function needs to execute the above piece // of code to become position-independent. 
// // https://github.com/rui314/psabi/blob/main/sparc.pdf #if MOLD_SPARC64 #include "mold.h" namespace mold { using E = SPARC64; // SPARC's PLT section is writable despite containing executable code. // We don't need to write the PLT header entry because the dynamic loader // will do that for us. // // We also don't need a .got.plt section to store the result of lazy PLT // symbol resolution because the dynamic symbol resolver directly mutates // instructions in PLT so that they jump to the right places next time. // That's why each PLT entry contains lots of NOPs; they are a placeholder // for the runtime to add more instructions. // // Self-modifying code is nowadays considered really bad from the security // point of view, though. template <> void write_plt_header(Context &ctx, u8 *buf) { memset(buf, 0, E::plt_hdr_size); } template <> void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { static ub32 insn[] = { 0x0300'0000, // sethi (. - .PLT0), %g1 0x3068'0000, // ba,a %xcc, .PLT1 0x0100'0000, // nop 0x0100'0000, // nop 0x0100'0000, // nop 0x0100'0000, // nop 0x0100'0000, // nop 0x0100'0000, // nop }; u64 plt0 = ctx.plt->shdr.sh_addr; u64 plt1 = ctx.plt->shdr.sh_addr + E::plt_size; u64 entry = sym.get_plt_addr(ctx); memcpy(buf, insn, sizeof(insn)); *(ub32 *)buf |= bits(entry - plt0, 21, 0); *(ub32 *)(buf + 4) |= bits(plt1 - entry - 4, 20, 2); } template <> void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { static ub32 entry[] = { 0x8a10'000f, // mov %o7, %g5 0x4000'0002, // call . 
+ 8 0xc25b'e014, // ldx [ %o7 + 20 ], %g1 0xc25b'c001, // ldx [ %o7 + %g1 ], %g1 0x81c0'4000, // jmp %g1 0x9e10'0005, // mov %g5, %o7 0x0000'0000, // .quad $plt_entry - $got_entry 0x0000'0000, }; memcpy(buf, entry, sizeof(entry)); *(ub64 *)(buf + 24) = sym.get_got_pltgot_addr(ctx) - sym.get_plt_addr(ctx) - 4; } template <> void EhFrameSection::apply_eh_reloc(Context &ctx, const ElfRel &rel, u64 offset, u64 val) { u8 *loc = ctx.buf + this->shdr.sh_offset + offset; switch (rel.r_type) { case R_NONE: break; case R_SPARC_64: case R_SPARC_UA64: *(ub64 *)loc = val; break; case R_SPARC_DISP32: *(ub32 *)loc = val - this->shdr.sh_addr - offset; break; default: Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel; } } template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); // We iterate over relocations in reverse order so that it is easy // to swap instructions for R_SPARC_TLS_GD_CALL. for (i64 i = (i64)rels.size() - 1; i >= 0; i--) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = base + rel.r_offset; u64 S = sym.get_addr(ctx); u64 A = rel.r_addend; u64 P = get_addr() + rel.r_offset; u64 G = sym.get_got_idx(ctx) * sizeof(Word); u64 GOT = ctx.got->shdr.sh_addr; auto check = [&](i64 val, i64 lo, i64 hi) { check_range(ctx, i, val, lo, hi); }; auto rs1 = [&] { return *(ub32 *)loc & (0b11111 << 14); }; auto rs2 = [&] { return *(ub32 *)loc & 0b11111; }; auto rd = [&] { return *(ub32 *)loc & (0b11111 << 25); }; switch (rel.r_type) { case R_SPARC_5: check(S + A, 0, 1 << 5); *(ub32 *)loc |= bits(S + A, 4, 0); break; case R_SPARC_6: check(S + A, 0, 1 << 6); *(ub32 *)loc |= bits(S + A, 5, 0); break; case R_SPARC_7: check(S + A, 0, 1 << 7); *(ub32 *)loc |= bits(S + A, 6, 0); break; case R_SPARC_8: check(S + A, 0, 1 << 8); *loc = S + A; break; case R_SPARC_10: check(S + A, 0, 1 << 10); *(ub32 *)loc |= bits(S + A, 9, 0); break; case R_SPARC_LO10: case 
R_SPARC_LOPLT10: *(ub32 *)loc |= bits(S + A, 9, 0); break; case R_SPARC_11: check(S + A, 0, 1 << 11); *(ub32 *)loc |= bits(S + A, 10, 0); break; case R_SPARC_13: check(S + A, 0, 1 << 13); *(ub32 *)loc |= bits(S + A, 12, 0); break; case R_SPARC_16: case R_SPARC_UA16: check(S + A, 0, 1 << 16); *(ub16 *)loc = S + A; break; case R_SPARC_22: check(S + A, 0, 1 << 22); *(ub32 *)loc |= bits(S + A, 21, 0); break; case R_SPARC_32: case R_SPARC_UA32: case R_SPARC_PLT32: check(S + A, 0, 1LL << 32); *(ub32 *)loc = S + A; break; case R_SPARC_PLT64: case R_SPARC_REGISTER: *(ub64 *)loc = S + A; break; case R_SPARC_DISP8: check(S + A - P, -(1 << 7), 1 << 7); *loc = S + A - P; break; case R_SPARC_DISP16: check(S + A - P, -(1 << 15), 1 << 15); *(ub16 *)loc = S + A - P; break; case R_SPARC_DISP32: case R_SPARC_PCPLT32: check(S + A - P, -(1LL << 31), 1LL << 31); *(ub32 *)loc = S + A - P; break; case R_SPARC_DISP64: *(ub64 *)loc = S + A - P; break; case R_SPARC_WDISP16: { i64 val = S + A - P; check(val, -(1 << 16), 1 << 16); *(ub16 *)loc |= (bit(val, 16) << 21) | bits(val, 15, 2); break; } case R_SPARC_WDISP19: check(S + A - P, -(1 << 20), 1 << 20); *(ub32 *)loc |= bits(S + A - P, 20, 2); break; case R_SPARC_WDISP22: check(S + A - P, -(1 << 23), 1 << 23); *(ub32 *)loc |= bits(S + A - P, 23, 2); break; case R_SPARC_WDISP30: case R_SPARC_WPLT30: check(S + A - P, -(1LL << 31), 1LL << 31); *(ub32 *)loc |= bits(S + A - P, 31, 2); break; case R_SPARC_HI22: case R_SPARC_HIPLT22: case R_SPARC_LM22: *(ub32 *)loc |= bits(S + A, 31, 10); break; case R_SPARC_GOT10: *(ub32 *)loc |= bits(G, 9, 0); break; case R_SPARC_GOT13: check(G, 0, 1 << 12); *(ub32 *)loc |= bits(G, 12, 0); break; case R_SPARC_GOT22: *(ub32 *)loc |= bits(G, 31, 10); break; case R_SPARC_GOTDATA_HIX22: { i64 val = S + A - GOT; *(ub32 *)loc |= bits(val < 0 ? ~val : val, 31, 10); break; } case R_SPARC_GOTDATA_LOX10: { i64 val = S + A - GOT; *(ub32 *)loc |= bits(val, 9, 0) | (val < 0 ? 
0b1'1100'0000'0000 : 0); break; } case R_SPARC_GOTDATA_OP_HIX22: // We always have to relax a GOT load to a load immediate if a // symbol is local, because R_SPARC_GOTDATA_OP cannot represent // an addend for a local symbol. if (sym.is_absolute()) { i64 val = S + A; *(ub32 *)loc |= bits(val < 0 ? ~val : val, 31, 10); } else if (sym.is_pcrel_linktime_const(ctx)) { i64 val = S + A - GOT; *(ub32 *)loc |= bits(val < 0 ? ~val : val, 31, 10); } else { *(ub32 *)loc |= bits(G, 31, 10); } break; case R_SPARC_GOTDATA_OP_LOX10: if (sym.is_absolute()) { i64 val = S + A; *(ub32 *)loc |= bits(val, 9, 0) | (val < 0 ? 0b1'1100'0000'0000 : 0); } else if (sym.is_pcrel_linktime_const(ctx)) { i64 val = S + A - GOT; *(ub32 *)loc |= bits(val, 9, 0) | (val < 0 ? 0b1'1100'0000'0000 : 0); } else { *(ub32 *)loc |= bits(G, 9, 0); } break; case R_SPARC_GOTDATA_OP: if (sym.is_absolute()) { // ldx [ %rs1 + %rs2 ], %rd → mov %rs2, %rd *(ub32 *)loc = 0x8010'0000 | rs2() | rd(); } else if (sym.is_pcrel_linktime_const(ctx)) { // ldx [ %rs1 + %rs2 ], %rd → add %rs1, %rs2, %rd *(ub32 *)loc = 0x8000'0000 | rs1() | rs2() | rd(); } break; case R_SPARC_PC10: case R_SPARC_PCPLT10: *(ub32 *)loc |= bits(S + A - P, 9, 0); break; case R_SPARC_PC22: case R_SPARC_PCPLT22: case R_SPARC_PC_LM22: *(ub32 *)loc |= bits(S + A - P, 31, 10); break; case R_SPARC_OLO10: *(ub32 *)loc |= bits(bits(S + A, 9, 0) + rel.r_type_data, 12, 0); break; case R_SPARC_HH22: *(ub32 *)loc |= bits(S + A, 63, 42); break; case R_SPARC_HM10: *(ub32 *)loc |= bits(S + A, 41, 32); break; case R_SPARC_PC_HH22: *(ub32 *)loc |= bits(S + A - P, 63, 42); break; case R_SPARC_PC_HM10: *(ub32 *)loc |= bits(S + A - P, 41, 32); break; case R_SPARC_HIX22: *(ub32 *)loc |= bits(~(S + A), 31, 10); break; case R_SPARC_LOX10: *(ub32 *)loc |= bits(S + A, 9, 0) | 0b1'1100'0000'0000; break; case R_SPARC_H44: *(ub32 *)loc |= bits(S + A, 43, 22); break; case R_SPARC_M44: *(ub32 *)loc |= bits(S + A, 21, 12); break; case R_SPARC_L44: *(ub32 *)loc |= bits(S + A, 11, 
0); break; case R_SPARC_TLS_GD_HI22: if (sym.has_tlsgd(ctx)) { *(ub32 *)loc |= bits(sym.get_tlsgd_addr(ctx) + A - GOT, 31, 10); } else if (sym.has_gottp(ctx)) { *(ub32 *)loc |= bits(sym.get_gottp_addr(ctx) + A - GOT, 31, 10); } else { *(ub32 *)loc |= bits(~(S + A - ctx.tp_addr), 31, 10); } break; case R_SPARC_TLS_GD_LO10: if (sym.has_tlsgd(ctx)) { *(ub32 *)loc |= bits(sym.get_tlsgd_addr(ctx) + A - GOT, 9, 0); } else if (sym.has_gottp(ctx)) { // add %rs1, %rs2, %rd → or %rs1, $imm, %rd *(ub32 *)loc = 0x8010'2000 | rs1() | rd(); *(ub32 *)loc |= bits(sym.get_gottp_addr(ctx) + A - GOT, 9, 0); } else { // add %rs1, %rs2, %rd → xor %rs1, $imm, %rd *(ub32 *)loc = 0x8018'2000 | rs1() | rd(); *(ub32 *)loc |= bits(S + A - ctx.tp_addr, 9, 0) | 0b1'1100'0000'0000; } break; case R_SPARC_TLS_GD_ADD: if (sym.has_tlsgd(ctx)) { // do nothing } else if (sym.has_gottp(ctx)) { // add %rs1, %rs2, %rd → ldx [ %rs1 + %rs2 ], %rd *(ub32 *)loc = 0xc058'0000 | rs1() | rs2() | rd(); } else { // add %rs1, %rs2, %rd → add %g7, %rs2, %rd *(ub32 *)loc = 0x8001'c000 | rs2() | rd(); } break; case R_SPARC_TLS_GD_CALL: if (sym.has_tlsgd(ctx)) { u64 addr = ctx.extra.tls_get_addr->get_addr(ctx); *(ub32 *)loc |= bits(addr + A - P, 31, 2); } else if (sym.has_gottp(ctx)) { // When we rewrite a branch instruction with a non-branch one, // we need to swap the instruction and the following one so that // the original execution order, which is inverted due to the // branch delay slot, is preserved. // // Since we apply relocations from the end to the beginning, // the instruction at loc + 4 is already complete. 
memcpy(loc, loc + 4, 4); *(ub32 *)(loc + 4) = 0x9001'c008; // add %g7, %o0, %o0 } else { *(ub32 *)loc = 0x0100'0000; // call → nop } break; case R_SPARC_TLS_LDM_HI22: if (ctx.got->has_tlsld(ctx)) *(ub32 *)loc |= bits(ctx.got->get_tlsld_addr(ctx) + A - GOT, 31, 10); else *(ub32 *)loc |= bits(ctx.tp_addr - ctx.tls_begin, 31, 10); break; case R_SPARC_TLS_LDM_LO10: if (ctx.got->has_tlsld(ctx)) *(ub32 *)loc |= bits(ctx.got->get_tlsld_addr(ctx) + A - GOT, 9, 0); else *(ub32 *)loc |= bits(ctx.tp_addr - ctx.tls_begin, 9, 0); break; case R_SPARC_TLS_LDM_ADD: if (!ctx.got->has_tlsld(ctx)) *(ub32 *)loc = 0x8021'c000 | rs2() | rd(); // sub %g7, %rs2, %rd break; case R_SPARC_TLS_LDM_CALL: if (ctx.got->has_tlsld(ctx)) { u64 addr = ctx.extra.tls_get_addr->get_addr(ctx); *(ub32 *)loc |= bits(addr + A - P, 31, 2); } else { *(ub32 *)loc = 0x0100'0000; // nop } break; case R_SPARC_TLS_LDO_HIX22: *(ub32 *)loc |= bits(S + A - ctx.dtp_addr, 31, 10); break; case R_SPARC_TLS_LDO_LOX10: *(ub32 *)loc |= bits(S + A - ctx.dtp_addr, 9, 0); break; case R_SPARC_TLS_IE_HI22: *(ub32 *)loc |= bits(sym.get_gottp_addr(ctx) + A - GOT, 31, 10); break; case R_SPARC_TLS_IE_LO10: *(ub32 *)loc |= bits(sym.get_gottp_addr(ctx) + A - GOT, 9, 0); break; case R_SPARC_TLS_LE_HIX22: *(ub32 *)loc |= bits(~(S + A - ctx.tp_addr), 31, 10); break; case R_SPARC_TLS_LE_LOX10: *(ub32 *)loc |= bits(S + A - ctx.tp_addr, 9, 0) | 0b1'1100'0000'0000; break; case R_SPARC_SIZE32: *(ub32 *)loc = sym.esym().st_size + A; break; case R_SPARC_64: case R_SPARC_UA64: case R_SPARC_TLS_LDO_ADD: case R_SPARC_TLS_IE_LD: case R_SPARC_TLS_IE_LDX: case R_SPARC_TLS_IE_ADD: break; default: unreachable(); } } } template <> void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = base + rel.r_offset; SectionFragment 
*frag; i64 frag_addend; std::tie(frag, frag_addend) = get_fragment(ctx, rel); u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx); u64 A = frag ? frag_addend : (i64)rel.r_addend; auto check = [&](i64 val, i64 lo, i64 hi) { check_range(ctx, val, i, lo, hi); }; switch (rel.r_type) { case R_SPARC_64: case R_SPARC_UA64: if (std::optional val = get_tombstone(sym, frag)) *(ub64 *)loc = *val; else *(ub64 *)loc = S + A; break; case R_SPARC_32: case R_SPARC_UA32: check(S + A, 0, 1LL << 32); *(ub32 *)loc = S + A; break; case R_SPARC_TLS_DTPOFF32: *(ub32 *)loc = S + A - ctx.dtp_addr; break; case R_SPARC_TLS_DTPOFF64: *(ub64 *)loc = S + A - ctx.dtp_addr; break; default: Fatal(ctx) << *this << ": apply_reloc_nonalloc: " << rel; } } } template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); std::span> rels = get_rels(ctx); bool needs_tlsgd = false; // Scan relocations for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; if (sym.is_ifunc()) sym.flags |= NEEDS_GOT | NEEDS_PLT; switch (rel.r_type) { case R_SPARC_8: case R_SPARC_5: case R_SPARC_6: case R_SPARC_7: case R_SPARC_10: case R_SPARC_11: case R_SPARC_13: case R_SPARC_16: case R_SPARC_22: case R_SPARC_32: case R_SPARC_REGISTER: case R_SPARC_UA16: case R_SPARC_UA32: case R_SPARC_PC_HM10: case R_SPARC_OLO10: case R_SPARC_LOX10: case R_SPARC_HM10: case R_SPARC_M44: case R_SPARC_HIX22: case R_SPARC_LO10: case R_SPARC_L44: case R_SPARC_LM22: case R_SPARC_HI22: case R_SPARC_H44: case R_SPARC_HH22: scan_absrel(ctx, sym, rel); break; case R_SPARC_PLT32: case R_SPARC_WPLT30: case R_SPARC_WDISP30: case R_SPARC_HIPLT22: case R_SPARC_LOPLT10: case R_SPARC_PCPLT32: case R_SPARC_PCPLT22: case R_SPARC_PCPLT10: case R_SPARC_PLT64: if (sym.is_imported) sym.flags |= NEEDS_PLT; break; case R_SPARC_GOT13: case R_SPARC_GOT10: case R_SPARC_GOT22: case R_SPARC_GOTDATA_HIX22: 
sym.flags |= NEEDS_GOT; break; case R_SPARC_GOTDATA_OP_HIX22: if (sym.is_imported) sym.flags |= NEEDS_GOT; break; case R_SPARC_DISP16: case R_SPARC_DISP32: case R_SPARC_DISP64: case R_SPARC_DISP8: case R_SPARC_PC10: case R_SPARC_PC22: case R_SPARC_PC_LM22: case R_SPARC_WDISP16: case R_SPARC_WDISP19: case R_SPARC_WDISP22: case R_SPARC_PC_HH22: scan_pcrel(ctx, sym, rel); break; case R_SPARC_TLS_GD_HI22: if (ctx.arg.static_ || (ctx.arg.relax && sym.is_tprel_linktime_const(ctx))) { // We always relax if -static because libc.a doesn't contain // __tls_get_addr(). } else if (ctx.arg.relax && sym.is_tprel_runtime_const(ctx)) { sym.flags |= NEEDS_GOTTP; } else { sym.flags |= NEEDS_TLSGD; needs_tlsgd = true; } break; case R_SPARC_TLS_LDM_HI22: if (ctx.arg.static_ || (ctx.arg.relax && !ctx.arg.shared)) { // We always relax if -static because libc.a doesn't contain // __tls_get_addr(). } else { ctx.needs_tlsld = true; } break; case R_SPARC_TLS_IE_HI22: sym.flags |= NEEDS_GOTTP; break; case R_SPARC_TLS_LE_HIX22: case R_SPARC_TLS_LE_LOX10: check_tlsle(ctx, sym, rel); break; case R_SPARC_64: case R_SPARC_UA64: case R_SPARC_GOTDATA_OP_LOX10: case R_SPARC_GOTDATA_OP: case R_SPARC_GOTDATA_LOX10: case R_SPARC_TLS_GD_LO10: case R_SPARC_TLS_GD_ADD: case R_SPARC_TLS_GD_CALL: case R_SPARC_TLS_LDM_LO10: case R_SPARC_TLS_LDM_ADD: case R_SPARC_TLS_LDM_CALL: case R_SPARC_TLS_LDO_HIX22: case R_SPARC_TLS_LDO_LOX10: case R_SPARC_TLS_LDO_ADD: case R_SPARC_TLS_IE_ADD: case R_SPARC_TLS_IE_LD: case R_SPARC_TLS_IE_LDX: case R_SPARC_TLS_IE_LO10: case R_SPARC_SIZE32: break; default: Error(ctx) << *this << ": unknown relocation: " << rel; } } // TLS_GD_CALL and TLS_LDM_CALL relocations implicitly refer to // __tls_get_addr, which may be dynamically linked from libc.so. 
Symbol &sym = *ctx.extra.tls_get_addr; if (sym.is_imported && (needs_tlsgd || ctx.needs_tlsld)) sym.flags |= NEEDS_PLT; } } // namespace mold #endif ================================================ FILE: src/arch-x86-64.cc ================================================ // Supporting x86-64 is straightforward. Unlike its predecessor, i386, // x86-64 supports PC-relative addressing for position-independent code. // Being CISC, its instructions are variable in size. Branch instructions // take 4 bytes offsets, so we don't need range extension thunks. // // The psABI specifies %r11 as neither caller- nor callee-saved. It's // intentionally left out so that we can use it as a scratch register in // PLT. // // Thread Pointer (TP) is stored not to a general-purpose register but to // FS segment register. Segment register is a 64-bits register which can // be used as a base address for memory access. Each thread has a unique // FS value, and they access their thread-local variables relative to FS // as %fs:offset_from_tp. // // The value of a segment register itself is not generally readable from // the user space. As a workaround, libc initializes %fs:0 (the first word // referenced by FS) to the value of %fs itself. So we can obtain TP just // by `mov %fs:0, %rax` if we need it. // // For historical reasons, TP points past the end of the TLS block on x86. // This is contrary to other psABIs which usually use the beginning of the // TLS block as TP (with some addend). As a result, offsets from TP to // thread-local variables (TLVs) in the main executable are all negative. // // https://gitlab.com/x86-psABIs/x86-64-ABI #if MOLD_X86_64 #include "mold.h" #include namespace mold { using E = X86_64; // This is a security-enhanced version of the regular PLT. The PLT // header and each PLT entry starts with endbr64 for the Intel's // control-flow enforcement security mechanism. 
// // Note that our IBT-enabled PLT instruction sequence is different // from the one used in GNU ld. GNU's IBTPLT implementation uses two // separate sections (.plt and .plt.sec) in which one PLT entry takes // 32 bytes in total. Our IBTPLT consists of just .plt and each entry // is 16 bytes long. // // Our PLT entry clobbers %r11, but that's fine because the resolver // function (_dl_runtime_resolve) clobbers %r11 anyway. template <> void write_plt_header(Context &ctx, u8 *buf) { static const u8 insn[] = { 0xf3, 0x0f, 0x1e, 0xfa, // endbr64 0x41, 0x53, // push %r11 0xff, 0x35, 0, 0, 0, 0, // push GOTPLT+8(%rip) 0xff, 0x25, 0, 0, 0, 0, // jmp *GOTPLT+16(%rip) 0xcc, 0xcc, 0xcc, 0xcc, // (padding) 0xcc, 0xcc, 0xcc, 0xcc, // (padding) 0xcc, 0xcc, 0xcc, 0xcc, // (padding) 0xcc, 0xcc, // (padding) }; memcpy(buf, insn, sizeof(insn)); *(ul32 *)(buf + 8) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 4; *(ul32 *)(buf + 14) = ctx.gotplt->shdr.sh_addr - ctx.plt->shdr.sh_addr - 2; } template <> void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { // Only a canonical PLT can be address-taken; there's no way to take // an address of a non-canonical PLT. Therefore, a non-canonical PLT // doesn't have to start with an endbr64. 
if (sym.is_canonical) { static const u8 insn[] = { 0xf3, 0x0f, 0x1e, 0xfa, // endbr64 0x41, 0xbb, 0, 0, 0, 0, // mov $index_in_relplt, %r11d 0xff, 0x25, 0, 0, 0, 0, // jmp *foo@GOTPLT }; memcpy(buf, insn, sizeof(insn)); *(ul32 *)(buf + 6) = sym.get_plt_idx(ctx); *(ul32 *)(buf + 12) = sym.get_gotplt_addr(ctx) - sym.get_plt_addr(ctx) - 16; } else { static const u8 insn[] = { 0x41, 0xbb, 0, 0, 0, 0, // mov $index_in_relplt, %r11d 0xff, 0x25, 0, 0, 0, 0, // jmp *foo@GOTPLT 0xcc, 0xcc, 0xcc, 0xcc, // (padding) }; memcpy(buf, insn, sizeof(insn)); *(ul32 *)(buf + 2) = sym.get_plt_idx(ctx); *(ul32 *)(buf + 8) = sym.get_gotplt_addr(ctx) - sym.get_plt_addr(ctx) - 12; } } template <> void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym) { static const u8 insn[] = { 0xff, 0x25, 0, 0, 0, 0, // jmp *foo@GOT 0xcc, 0xcc, // (padding) }; memcpy(buf, insn, sizeof(insn)); *(ul32 *)(buf + 2) = sym.get_got_pltgot_addr(ctx) - sym.get_plt_addr(ctx) - 6; } template <> void EhFrameSection::apply_eh_reloc(Context &ctx, const ElfRel &rel, u64 offset, u64 val) { u8 *loc = ctx.buf + this->shdr.sh_offset + offset; switch (rel.r_type) { case R_NONE: break; case R_X86_64_32: *(ul32 *)loc = val; break; case R_X86_64_64: *(ul64 *)loc = val; break; case R_X86_64_PC32: *(ul32 *)loc = val - this->shdr.sh_addr - offset; break; case R_X86_64_PC64: *(ul64 *)loc = val - this->shdr.sh_addr - offset; break; default: Fatal(ctx) << "unsupported relocation in .eh_frame: " << rel; } } static u32 relax_gotpcrelx(u8 *loc, const ElfRel &rel) { if (rel.r_type == R_X86_64_GOTPCRELX) { switch ((loc[-2] << 8) | loc[-1]) { case 0xff15: return 0x40e8; // call *0(%rip) -> call 0 case 0xff25: return 0x40e9; // jmp *0(%rip) -> jmp 0 } } else { assert(rel.r_type == R_X86_64_REX_GOTPCRELX || rel.r_type == R_X86_64_CODE_4_GOTPCRELX); switch ((loc[-3] << 16) | (loc[-2] << 8) | loc[-1]) { case 0x488b05: return 0x8d05; // mov 0(%rip), %rax -> lea 0(%rip), %rax case 0x488b0d: return 0x8d0d; // mov 0(%rip), %rcx -> lea 
0(%rip), %rcx case 0x488b15: return 0x8d15; // mov 0(%rip), %rdx -> lea 0(%rip), %rdx case 0x488b1d: return 0x8d1d; // mov 0(%rip), %rbx -> lea 0(%rip), %rbx case 0x488b25: return 0x8d25; // mov 0(%rip), %rsp -> lea 0(%rip), %rsp case 0x488b2d: return 0x8d2d; // mov 0(%rip), %rbp -> lea 0(%rip), %rbp case 0x488b35: return 0x8d35; // mov 0(%rip), %rsi -> lea 0(%rip), %rsi case 0x488b3d: return 0x8d3d; // mov 0(%rip), %rdi -> lea 0(%rip), %rdi case 0x4c8b05: return 0x8d05; // mov 0(%rip), %r8 -> lea 0(%rip), %r8 case 0x4c8b0d: return 0x8d0d; // mov 0(%rip), %r9 -> lea 0(%rip), %r9 case 0x4c8b15: return 0x8d15; // mov 0(%rip), %r10 -> lea 0(%rip), %r10 case 0x4c8b1d: return 0x8d1d; // mov 0(%rip), %r11 -> lea 0(%rip), %r11 case 0x4c8b25: return 0x8d25; // mov 0(%rip), %r12 -> lea 0(%rip), %r12 case 0x4c8b2d: return 0x8d2d; // mov 0(%rip), %r13 -> lea 0(%rip), %r13 case 0x4c8b35: return 0x8d35; // mov 0(%rip), %r14 -> lea 0(%rip), %r14 case 0x4c8b3d: return 0x8d3d; // mov 0(%rip), %r15 -> lea 0(%rip), %r15 } } return 0; } static u32 relax_gottpoff(u8 *loc, const ElfRel &rel) { if (rel.r_type == R_X86_64_GOTTPOFF) { switch ((loc[-3] << 16) | (loc[-2] << 8) | loc[-1]) { case 0x488b05: return 0x48c7c0; // mov 0(%rip), %rax -> mov $0, %rax case 0x488b0d: return 0x48c7c1; // mov 0(%rip), %rcx -> mov $0, %rcx case 0x488b15: return 0x48c7c2; // mov 0(%rip), %rdx -> mov $0, %rdx case 0x488b1d: return 0x48c7c3; // mov 0(%rip), %rbx -> mov $0, %rbx case 0x488b25: return 0x48c7c4; // mov 0(%rip), %rsp -> mov $0, %rsp case 0x488b2d: return 0x48c7c5; // mov 0(%rip), %rbp -> mov $0, %rbp case 0x488b35: return 0x48c7c6; // mov 0(%rip), %rsi -> mov $0, %rsi case 0x488b3d: return 0x48c7c7; // mov 0(%rip), %rdi -> mov $0, %rdi case 0x4c8b05: return 0x49c7c0; // mov 0(%rip), %r8 -> mov $0, %r8 case 0x4c8b0d: return 0x49c7c1; // mov 0(%rip), %r9 -> mov $0, %r9 case 0x4c8b15: return 0x49c7c2; // mov 0(%rip), %r10 -> mov $0, %r10 case 0x4c8b1d: return 0x49c7c3; // mov 0(%rip), %r11 -> mov 
$0, %r11 case 0x4c8b25: return 0x49c7c4; // mov 0(%rip), %r12 -> mov $0, %r12 case 0x4c8b2d: return 0x49c7c5; // mov 0(%rip), %r13 -> mov $0, %r13 case 0x4c8b35: return 0x49c7c6; // mov 0(%rip), %r14 -> mov $0, %r14 case 0x4c8b3d: return 0x49c7c7; // mov 0(%rip), %r15 -> mov $0, %r15 } } else { assert(rel.r_type == R_X86_64_CODE_4_GOTTPOFF); switch ((loc[-3] << 16) | (loc[-2] << 8) | loc[-1]) { case 0x488b05: return 0x18c7c0; // mov 0(%rip), %r16 -> mov $0, %r16 case 0x488b0d: return 0x18c7c1; // mov 0(%rip), %r17 -> mov $0, %r17 case 0x488b15: return 0x18c7c2; // mov 0(%rip), %r18 -> mov $0, %r18 case 0x488b1d: return 0x18c7c3; // mov 0(%rip), %r19 -> mov $0, %r19 case 0x488b25: return 0x18c7c4; // mov 0(%rip), %r20 -> mov $0, %r20 case 0x488b2d: return 0x18c7c5; // mov 0(%rip), %r21 -> mov $0, %r21 case 0x488b35: return 0x18c7c6; // mov 0(%rip), %r22 -> mov $0, %r22 case 0x488b3d: return 0x18c7c7; // mov 0(%rip), %r23 -> mov $0, %r23 case 0x4c8b05: return 0x19c7c0; // mov 0(%rip), %r24 -> mov $0, %r24 case 0x4c8b0d: return 0x19c7c1; // mov 0(%rip), %r25 -> mov $0, %r25 case 0x4c8b15: return 0x19c7c2; // mov 0(%rip), %r26 -> mov $0, %r26 case 0x4c8b1d: return 0x19c7c3; // mov 0(%rip), %r27 -> mov $0, %r27 case 0x4c8b25: return 0x19c7c4; // mov 0(%rip), %r28 -> mov $0, %r28 case 0x4c8b2d: return 0x19c7c5; // mov 0(%rip), %r29 -> mov $0, %r29 case 0x4c8b35: return 0x19c7c6; // mov 0(%rip), %r30 -> mov $0, %r30 case 0x4c8b3d: return 0x19c7c7; // mov 0(%rip), %r31 -> mov $0, %r31 } } return 0; } static u32 relax_tlsdesc_to_ie(u8 *loc, const ElfRel &rel) { if (rel.r_type == R_X86_64_GOTPC32_TLSDESC) { switch ((loc[-3] << 16) | (loc[-2] << 8) | loc[-1]) { case 0x488d05: return 0x488b05; // lea 0(%rip), %rax -> mov 0(%rip), %rax case 0x488d0d: return 0x488b0d; // lea 0(%rip), %rcx -> mov 0(%rip), %rcx case 0x488d15: return 0x488b15; // lea 0(%rip), %rdx -> mov 0(%rip), %rdx case 0x488d1d: return 0x488b1d; // lea 0(%rip), %rbx -> mov 0(%rip), %rbx case 0x488d25: return 
0x488b25; // lea 0(%rip), %rsp -> mov 0(%rip), %rsp case 0x488d2d: return 0x488b2d; // lea 0(%rip), %rbp -> mov 0(%rip), %rbp case 0x488d35: return 0x488b35; // lea 0(%rip), %rsi -> mov 0(%rip), %rsi case 0x488d3d: return 0x488b3d; // lea 0(%rip), %rdi -> mov 0(%rip), %rdi case 0x4c8d05: return 0x4c8b05; // lea 0(%rip), %r8 -> mov 0(%rip), %r8 case 0x4c8d0d: return 0x4c8b0d; // lea 0(%rip), %r9 -> mov 0(%rip), %r9 case 0x4c8d15: return 0x4c8b15; // lea 0(%rip), %r10 -> mov 0(%rip), %r10 case 0x4c8d1d: return 0x4c8b1d; // lea 0(%rip), %r11 -> mov 0(%rip), %r11 case 0x4c8d25: return 0x4c8b25; // lea 0(%rip), %r12 -> mov 0(%rip), %r12 case 0x4c8d2d: return 0x4c8b2d; // lea 0(%rip), %r13 -> mov 0(%rip), %r13 case 0x4c8d35: return 0x4c8b35; // lea 0(%rip), %r14 -> mov 0(%rip), %r14 case 0x4c8d3d: return 0x4c8b3d; // lea 0(%rip), %r15 -> mov 0(%rip), %r15 } } else { assert(rel.r_type == R_X86_64_CODE_4_GOTPC32_TLSDESC); switch ((loc[-3] << 16) | (loc[-2] << 8) | loc[-1]) { case 0x488d05: return 0x488b05; // lea 0(%rip), %r16 -> mov 0(%rip), %r16 case 0x488d0d: return 0x488b0d; // lea 0(%rip), %r17 -> mov 0(%rip), %r17 case 0x488d15: return 0x488b15; // lea 0(%rip), %r18 -> mov 0(%rip), %r18 case 0x488d1d: return 0x488b1d; // lea 0(%rip), %r19 -> mov 0(%rip), %r19 case 0x488d25: return 0x488b25; // lea 0(%rip), %r20 -> mov 0(%rip), %r20 case 0x488d2d: return 0x488b2d; // lea 0(%rip), %r21 -> mov 0(%rip), %r21 case 0x488d35: return 0x488b35; // lea 0(%rip), %r22 -> mov 0(%rip), %r22 case 0x488d3d: return 0x488b3d; // lea 0(%rip), %r23 -> mov 0(%rip), %r23 case 0x4c8d05: return 0x4c8b05; // lea 0(%rip), %r24 -> mov 0(%rip), %r24 case 0x4c8d0d: return 0x4c8b0d; // lea 0(%rip), %r25 -> mov 0(%rip), %r25 case 0x4c8d15: return 0x4c8b15; // lea 0(%rip), %r26 -> mov 0(%rip), %r26 case 0x4c8d1d: return 0x4c8b1d; // lea 0(%rip), %r27 -> mov 0(%rip), %r27 case 0x4c8d25: return 0x4c8b25; // lea 0(%rip), %r28 -> mov 0(%rip), %r28 case 0x4c8d2d: return 0x4c8b2d; // lea 0(%rip), %r29 -> 
mov 0(%rip), %r29 case 0x4c8d35: return 0x4c8b35; // lea 0(%rip), %r30 -> mov 0(%rip), %r30 case 0x4c8d3d: return 0x4c8b3d; // lea 0(%rip), %r31 -> mov 0(%rip), %r31 } } return 0; } static u32 relax_tlsdesc_to_le(u8 *loc, const ElfRel &rel) { if (rel.r_type == R_X86_64_GOTPC32_TLSDESC) { switch ((loc[-3] << 16) | (loc[-2] << 8) | loc[-1]) { case 0x488d05: return 0x48c7c0; // lea 0(%rip), %rax -> mov $0, %rax case 0x488d0d: return 0x48c7c1; // lea 0(%rip), %rcx -> mov $0, %rcx case 0x488d15: return 0x48c7c2; // lea 0(%rip), %rdx -> mov $0, %rdx case 0x488d1d: return 0x48c7c3; // lea 0(%rip), %rbx -> mov $0, %rbx case 0x488d25: return 0x48c7c4; // lea 0(%rip), %rsp -> mov $0, %rsp case 0x488d2d: return 0x48c7c5; // lea 0(%rip), %rbp -> mov $0, %rbp case 0x488d35: return 0x48c7c6; // lea 0(%rip), %rsi -> mov $0, %rsi case 0x488d3d: return 0x48c7c7; // lea 0(%rip), %rdi -> mov $0, %rdi case 0x4c8d05: return 0x49c7c0; // lea 0(%rip), %r8 -> mov $0, %r8 case 0x4c8d0d: return 0x49c7c1; // lea 0(%rip), %r9 -> mov $0, %r9 case 0x4c8d15: return 0x49c7c2; // lea 0(%rip), %r10 -> mov $0, %r10 case 0x4c8d1d: return 0x49c7c3; // lea 0(%rip), %r11 -> mov $0, %r11 case 0x4c8d25: return 0x49c7c4; // lea 0(%rip), %r12 -> mov $0, %r12 case 0x4c8d2d: return 0x49c7c5; // lea 0(%rip), %r13 -> mov $0, %r13 case 0x4c8d35: return 0x49c7c6; // lea 0(%rip), %r14 -> mov $0, %r14 case 0x4c8d3d: return 0x49c7c7; // lea 0(%rip), %r15 -> mov $0, %r15 } } else { assert(rel.r_type == R_X86_64_CODE_4_GOTPC32_TLSDESC); switch ((loc[-3] << 16) | (loc[-2] << 8) | loc[-1]) { case 0x488d05: return 0x18c7c0; // lea 0(%rip), %r16 -> mov $0, %r16 case 0x488d0d: return 0x18c7c1; // lea 0(%rip), %r17 -> mov $0, %r17 case 0x488d15: return 0x18c7c2; // lea 0(%rip), %r18 -> mov $0, %r18 case 0x488d1d: return 0x18c7c3; // lea 0(%rip), %r19 -> mov $0, %r19 case 0x488d25: return 0x18c7c4; // lea 0(%rip), %r20 -> mov $0, %r20 case 0x488d2d: return 0x18c7c5; // lea 0(%rip), %r21 -> mov $0, %r21 case 0x488d35: return 
0x18c7c6; // lea 0(%rip), %r22 -> mov $0, %r22 case 0x488d3d: return 0x18c7c7; // lea 0(%rip), %r23 -> mov $0, %r23 case 0x4c8d05: return 0x19c7c0; // lea 0(%rip), %r24 -> mov $0, %r24 case 0x4c8d0d: return 0x19c7c1; // lea 0(%rip), %r25 -> mov $0, %r25 case 0x4c8d15: return 0x19c7c2; // lea 0(%rip), %r26 -> mov $0, %r26 case 0x4c8d1d: return 0x19c7c3; // lea 0(%rip), %r27 -> mov $0, %r27 case 0x4c8d25: return 0x19c7c4; // lea 0(%rip), %r28 -> mov $0, %r28 case 0x4c8d2d: return 0x19c7c5; // lea 0(%rip), %r29 -> mov $0, %r29 case 0x4c8d35: return 0x19c7c6; // lea 0(%rip), %r30 -> mov $0, %r30 case 0x4c8d3d: return 0x19c7c7; // lea 0(%rip), %r31 -> mov $0, %r31 } } return 0; } // Rewrite a function call to __tls_get_addr to a cheaper instruction // sequence. We can do this when we know the thread-local variable's TP- // relative address at link-time. static void relax_gd_to_le(u8 *loc, const ElfRel &rel, u64 val) { switch (rel.r_type) { case R_X86_64_PLT32: case R_X86_64_PC32: case R_X86_64_GOTPCREL: case R_X86_64_GOTPCRELX: { // The original instructions are the following: // // 66 48 8d 3d 00 00 00 00 lea foo@tlsgd(%rip), %rdi // 66 66 48 e8 00 00 00 00 call __tls_get_addr // // or // // 66 48 8d 3d 00 00 00 00 lea foo@tlsgd(%rip), %rdi // 66 48 ff 15 00 00 00 00 call *__tls_get_addr@GOT(%rip) static const u8 insn[] = { 0x64, 0x48, 0x8b, 0x04, 0x25, 0, 0, 0, 0, // mov %fs:0, %rax 0x48, 0x81, 0xc0, 0, 0, 0, 0, // add $tp_offset, %rax }; memcpy(loc - 4, insn, sizeof(insn)); *(ul32 *)(loc + 8) = val; break; } case R_X86_64_PLTOFF64: { // The original instructions are the following: // // 48 8d 3d 00 00 00 00 lea foo@tlsgd(%rip), %rdi // 48 b8 00 00 00 00 00 00 00 00 movabs __tls_get_addr, %rax // 48 01 d8 add %rbx, %rax // ff d0 call *%rax static const u8 insn[] = { 0x64, 0x48, 0x8b, 0x04, 0x25, 0, 0, 0, 0, // mov %fs:0, %rax 0x48, 0x81, 0xc0, 0, 0, 0, 0, // add $tp_offset, %rax 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00, // nop }; memcpy(loc - 3, insn, sizeof(insn)); *(ul32 
*)(loc + 9) = val; break; } default: unreachable(); } } static void relax_gd_to_ie(u8 *loc, const ElfRel &rel, u64 val) { switch (rel.r_type) { case R_X86_64_PLT32: case R_X86_64_PC32: case R_X86_64_GOTPCREL: case R_X86_64_GOTPCRELX: { static const u8 insn[] = { 0x64, 0x48, 0x8b, 0x04, 0x25, 0, 0, 0, 0, // mov %fs:0, %rax 0x48, 0x03, 0x05, 0, 0, 0, 0, // add foo@gottpoff(%rip), %rax }; memcpy(loc - 4, insn, sizeof(insn)); *(ul32 *)(loc + 8) = val - 12; break; } case R_X86_64_PLTOFF64: { static const u8 insn[] = { 0x64, 0x48, 0x8b, 0x04, 0x25, 0, 0, 0, 0, // mov %fs:0, %rax 0x48, 0x03, 0x05, 0, 0, 0, 0, // add foo@gottpoff(%rip), %rax 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00, // nop }; memcpy(loc - 3, insn, sizeof(insn)); *(ul32 *)(loc + 9) = val - 13; break; } default: unreachable(); } } // Rewrite a function call to __tls_get_addr to a cheaper instruction // sequence. The difference from relax_gd_to_le is that we are materializing // the address of the beginning of TLS block instead of an address of a // particular thread-local variable. static void relax_ld_to_le(u8 *loc, const ElfRel &rel, i64 tls_size) { switch (rel.r_type) { case R_X86_64_PLT32: case R_X86_64_PC32: { // The original instructions are the following: // // 48 8d 3d 00 00 00 00 lea foo@tlsld(%rip), %rdi // e8 00 00 00 00 call __tls_get_addr // // Because the original instruction sequence is so short that we need a // little bit of code golfing here. "mov %fs:0, %rax" is 9 byte long, so // xor + mov is shorter. Note that `xor %eax, %eax` zero-clears %eax. 
static const u8 insn[] = { 0x31, 0xc0, // xor %eax, %eax 0x64, 0x48, 0x8b, 0x00, // mov %fs:(%rax), %rax 0x48, 0x2d, 0, 0, 0, 0, // sub $tls_size, %rax }; memcpy(loc - 3, insn, sizeof(insn)); *(ul32 *)(loc + 5) = tls_size; break; } case R_X86_64_GOTPCREL: case R_X86_64_GOTPCRELX: { // The original instructions are the following: // // 48 8d 3d 00 00 00 00 lea foo@tlsld(%rip), %rdi // ff 15 00 00 00 00 call *__tls_get_addr@GOT(%rip) static const u8 insn[] = { 0x48, 0x31, 0xc0, // xor %rax, %rax 0x64, 0x48, 0x8b, 0x00, // mov %fs:(%rax), %rax 0x48, 0x2d, 0, 0, 0, 0, // sub $tls_size, %rax }; memcpy(loc - 3, insn, sizeof(insn)); *(ul32 *)(loc + 6) = tls_size; break; } case R_X86_64_PLTOFF64: { // The original instructions are the following: // // 48 8d 3d 00 00 00 00 lea foo@tlsld(%rip), %rdi // 48 b8 00 00 00 00 00 00 00 00 movabs __tls_get_addr@GOTOFF, %rax // 48 01 d8 add %rbx, %rax // ff d0 call *%rax static const u8 insn[] = { 0x64, 0x48, 0x8b, 0x04, 0x25, 0, 0, 0, 0, // mov %fs:0, %rax 0x48, 0x2d, 0, 0, 0, 0, // sub $tls_size, %rax 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00, // nop }; memcpy(loc - 3, insn, sizeof(insn)); *(ul32 *)(loc + 8) = tls_size; break; } default: unreachable(); } } // Apply relocations to SHF_ALLOC sections (i.e. sections that are // mapped to memory at runtime) based on the result of // scan_relocations(). 
template <> void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = base + rel.r_offset; u64 S = sym.get_addr(ctx); u64 A = rel.r_addend; u64 P = get_addr() + rel.r_offset; u64 G = sym.get_got_addr(ctx) - ctx.gotplt->shdr.sh_addr; u64 GOT = ctx.gotplt->shdr.sh_addr; auto check = [&](i64 val, i64 lo, i64 hi) { check_range(ctx, i, val, lo, hi); }; auto write32 = [&](u64 val) { check(val, 0, 1LL << 32); *(ul32 *)loc = val; }; auto write32s = [&](u64 val) { check(val, -(1LL << 31), 1LL << 31); *(ul32 *)loc = val; }; switch (rel.r_type) { case R_X86_64_8: check(S + A, 0, 1 << 8); *loc = S + A; break; case R_X86_64_16: check(S + A, 0, 1 << 16); *(ul16 *)loc = S + A; break; case R_X86_64_32: write32(S + A); break; case R_X86_64_32S: write32s(S + A); break; case R_X86_64_64: break; case R_X86_64_PC8: check(S + A - P, -(1 << 7), 1 << 7); *loc = S + A - P; break; case R_X86_64_PC16: check(S + A - P, -(1 << 15), 1 << 15); *(ul16 *)loc = S + A - P; break; case R_X86_64_PC32: case R_X86_64_PLT32: write32s(S + A - P); break; case R_X86_64_PC64: *(ul64 *)loc = S + A - P; break; case R_X86_64_GOT32: write32(G + A); break; case R_X86_64_GOT64: *(ul64 *)loc = G + A; break; case R_X86_64_GOTOFF64: case R_X86_64_PLTOFF64: *(ul64 *)loc = S + A - GOT; break; case R_X86_64_GOTPC32: write32s(GOT + A - P); break; case R_X86_64_GOTPC64: *(ul64 *)loc = GOT + A - P; break; case R_X86_64_GOTPCREL: write32s(G + GOT + A - P); break; case R_X86_64_GOTPCREL64: *(ul64 *)loc = G + GOT + A - P; break; case R_X86_64_GOTPCRELX: case R_X86_64_REX_GOTPCRELX: case R_X86_64_CODE_4_GOTPCRELX: // We always want to relax GOTPCRELX relocs even if --no-relax // was given because some static PIE runtime code depends on these // relaxations. 
if (sym.is_pcrel_linktime_const(ctx) && is_int(S + A - P, 32)) { if (u32 insn = relax_gotpcrelx(loc, rel)) { loc[-2] = insn >> 8; loc[-1] = insn; *(ul32 *)loc = S + A - P; break; } } write32s(G + GOT + A - P); break; case R_X86_64_TLSGD: if (sym.has_tlsgd(ctx)) write32s(sym.get_tlsgd_addr(ctx) + A - P); else if (sym.has_gottp(ctx)) relax_gd_to_ie(loc, rels[++i], sym.get_gottp_addr(ctx) - P); else relax_gd_to_le(loc, rels[++i], S - ctx.tp_addr); break; case R_X86_64_TLSLD: if (ctx.got->has_tlsld(ctx)) write32s(ctx.got->get_tlsld_addr(ctx) + A - P); else relax_ld_to_le(loc, rels[++i], ctx.tp_addr - ctx.tls_begin); break; case R_X86_64_DTPOFF32: write32s(S + A - ctx.dtp_addr); break; case R_X86_64_DTPOFF64: *(ul64 *)loc = S + A - ctx.dtp_addr; break; case R_X86_64_TPOFF32: write32s(S + A - ctx.tp_addr); break; case R_X86_64_TPOFF64: *(ul64 *)loc = S + A - ctx.tp_addr; break; case R_X86_64_GOTTPOFF: case R_X86_64_CODE_4_GOTTPOFF: if (sym.has_gottp(ctx)) { write32s(sym.get_gottp_addr(ctx) + A - P); } else { u32 insn = relax_gottpoff(loc, rel); loc[-3] = insn >> 16; loc[-2] = insn >> 8; loc[-1] = insn; write32s(S - ctx.tp_addr); } break; case R_X86_64_CODE_6_GOTTPOFF: write32s(sym.get_gottp_addr(ctx) + A - P); break; case R_X86_64_GOTPC32_TLSDESC: case R_X86_64_CODE_4_GOTPC32_TLSDESC: // x86-64 TLSDESC uses the following code sequence to materialize // a TP-relative address in %rax. // // lea 0(%rip), %rax // R_X86_64_GOTPC32_TLSDESC foo // call *(%rax) // R_X86_64_TLSDESC_CALL foo // // We may relax the instructions to the following if its TP-relative // address is known at link-time // // mov $foo@TPOFF, %rax // nop // // or to the following if the TP-relative address is known at // process startup time. // // mov foo@GOTTPOFF(%rip), %rax // nop // // We allow the following alternative code sequence too because // LLVM emits such code. 
// // lea 0(%rip), %reg // R_X86_64_GOTPC32_TLSDESC foo // mov %reg, %rax // call *(%rax) // R_X86_64_TLSDESC_CALL foo if (sym.has_tlsdesc(ctx)) { write32s(sym.get_tlsdesc_addr(ctx) + A - P); } else if (sym.has_gottp(ctx)) { u32 insn = relax_tlsdesc_to_ie(loc, rel); if (!insn) Fatal(ctx) << *this << ": illegal instruction sequence for " << rel; loc[-3] = insn >> 16; loc[-2] = insn >> 8; loc[-1] = insn; write32s(sym.get_gottp_addr(ctx) + A - P); } else { u32 insn = relax_tlsdesc_to_le(loc, rel); if (!insn) Fatal(ctx) << *this << ": illegal instruction sequence for " << rel; loc[-3] = insn >> 16; loc[-2] = insn >> 8; loc[-1] = insn; write32s(S - ctx.tp_addr); } break; case R_X86_64_TLSDESC_CALL: if (!sym.has_tlsdesc(ctx)) { // call *(%rax) -> nop loc[0] = 0x66; loc[1] = 0x90; } break; case R_X86_64_SIZE32: write32(sym.esym().st_size + A); break; case R_X86_64_SIZE64: *(ul64 *)loc = sym.esym().st_size + A; break; default: unreachable(); } } } // This function is responsible for applying relocations against // non-SHF_ALLOC sections (i.e. sections that are not mapped to memory // at runtime). // // Relocations against non-SHF_ALLOC sections are much easier to // handle than that against SHF_ALLOC sections. It is because, since // they are not mapped to memory, they don't contain any variable or // function and never need PLT or GOT. Non-SHF_ALLOC sections are // mostly debug info sections. // // Relocations against non-SHF_ALLOC sections are not scanned by // scan_relocations. template <> void InputSection::apply_reloc_nonalloc(Context &ctx, u8 *base) { std::span> rels = get_rels(ctx); for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = base + rel.r_offset; SectionFragment *frag; i64 frag_addend; std::tie(frag, frag_addend) = get_fragment(ctx, rel); u64 S = frag ? frag->get_addr(ctx) : sym.get_addr(ctx); u64 A = frag ? 
frag_addend : (i64)rel.r_addend; auto check = [&](i64 val, i64 lo, i64 hi) { check_range(ctx, i, val, lo, hi); }; auto write32 = [&](u64 val) { check(val, 0, 1LL << 32); *(ul32 *)loc = val; }; auto write32s = [&](u64 val) { check(val, -(1LL << 31), 1LL << 31); *(ul32 *)loc = val; }; switch (rel.r_type) { case R_X86_64_8: check(S + A, 0, 1 << 8); *loc = S + A; break; case R_X86_64_16: check(S + A, 0, 1 << 16); *(ul16 *)loc = S + A; break; case R_X86_64_32: write32(S + A); break; case R_X86_64_32S: write32s(S + A); break; case R_X86_64_64: if (std::optional val = get_tombstone(sym, frag)) *(ul64 *)loc = *val; else *(ul64 *)loc = S + A; break; case R_X86_64_DTPOFF32: if (std::optional val = get_tombstone(sym, frag)) *(ul32 *)loc = *val; else write32s(S + A - ctx.dtp_addr); break; case R_X86_64_DTPOFF64: if (std::optional val = get_tombstone(sym, frag)) *(ul64 *)loc = *val; else *(ul64 *)loc = S + A - ctx.dtp_addr; break; case R_X86_64_GOTOFF64: *(ul64 *)loc = S + A - ctx.gotplt->shdr.sh_addr; break; case R_X86_64_GOTPC64: // PC-relative relocation doesn't make sense for non-memory-allocated // section, but GCC 6.3.0 seems to create this reloc for // _GLOBAL_OFFSET_TABLE_. *(ul64 *)loc = ctx.gotplt->shdr.sh_addr + A; break; case R_X86_64_SIZE32: write32(sym.esym().st_size + A); break; case R_X86_64_SIZE64: *(ul64 *)loc = sym.esym().st_size + A; break; default: Fatal(ctx) << *this << ": invalid relocation for non-allocated sections: " << rel; break; } } } // Linker has to create data structures in an output file to apply // some type of relocations. For example, if a relocation refers a GOT // or a PLT entry of a symbol, linker has to create an entry in .got // or in .plt for that symbol. In order to fix the file layout, we // need to scan relocations. 
template <> void InputSection::scan_relocations(Context &ctx) { assert(shdr().sh_flags & SHF_ALLOC); std::span> rels = get_rels(ctx); // Scan relocations for (i64 i = 0; i < rels.size(); i++) { const ElfRel &rel = rels[i]; if (rel.r_type == R_NONE || record_undef_error(ctx, rel)) continue; Symbol &sym = *file.symbols[rel.r_sym]; u8 *loc = (u8 *)(contents.data() + rel.r_offset); if (sym.is_ifunc()) sym.flags |= NEEDS_GOT | NEEDS_PLT; if (rel.r_type == R_X86_64_TLSGD || rel.r_type == R_X86_64_TLSLD) { if (i + 1 == rels.size()) Fatal(ctx) << *this << ": " << rel << " must be followed by PLT or GOTPCREL"; if (u32 ty = rels[i + 1].r_type; ty != R_X86_64_PLT32 && ty != R_X86_64_PC32 && ty != R_X86_64_PLTOFF64 && ty != R_X86_64_GOTPCREL && ty != R_X86_64_GOTPCRELX) Fatal(ctx) << *this << ": " << rel << " must be followed by PLT or GOTPCREL"; } switch (rel.r_type) { case R_X86_64_8: case R_X86_64_16: case R_X86_64_32: case R_X86_64_32S: scan_absrel(ctx, sym, rel); break; case R_X86_64_PC8: case R_X86_64_PC16: case R_X86_64_PC32: case R_X86_64_PC64: scan_pcrel(ctx, sym, rel); break; case R_X86_64_GOT32: case R_X86_64_GOT64: case R_X86_64_GOTPC32: case R_X86_64_GOTPC64: case R_X86_64_GOTPCREL: case R_X86_64_GOTPCREL64: case R_X86_64_GOTPCRELX: case R_X86_64_REX_GOTPCRELX: case R_X86_64_CODE_4_GOTPCRELX: sym.flags |= NEEDS_GOT; break; case R_X86_64_PLT32: case R_X86_64_PLTOFF64: if (sym.is_imported) sym.flags |= NEEDS_PLT; break; case R_X86_64_TLSGD: if (ctx.arg.static_ || (ctx.arg.relax && sym.is_tprel_linktime_const(ctx))) { // We always relax if -static because libc.a doesn't contain // __tls_get_addr(). i++; } else if (ctx.arg.relax && sym.is_tprel_runtime_const(ctx)) { sym.flags |= NEEDS_GOTTP; i++; } else { sym.flags |= NEEDS_TLSGD; } break; case R_X86_64_TLSLD: // We always relax if -static because libc.a doesn't contain // __tls_get_addr(). 
if (ctx.arg.static_ || (ctx.arg.relax && !ctx.arg.shared)) i++; else ctx.needs_tlsld = true; break; case R_X86_64_GOTTPOFF: case R_X86_64_CODE_4_GOTTPOFF: if (!ctx.arg.relax || !sym.is_tprel_linktime_const(ctx) || !relax_gottpoff(loc, rel)) sym.flags |= NEEDS_GOTTP; break; case R_X86_64_CODE_6_GOTTPOFF: sym.flags |= NEEDS_GOTTP; break; case R_X86_64_TLSDESC_CALL: scan_tlsdesc(ctx, sym); break; case R_X86_64_TPOFF32: case R_X86_64_TPOFF64: check_tlsle(ctx, sym, rel); break; case R_X86_64_64: case R_X86_64_GOTOFF64: case R_X86_64_DTPOFF32: case R_X86_64_DTPOFF64: case R_X86_64_SIZE32: case R_X86_64_SIZE64: case R_X86_64_GOTPC32_TLSDESC: case R_X86_64_CODE_4_GOTPC32_TLSDESC: break; default: Error(ctx) << *this << ": unknown relocation: " << rel; } } } // Intel CET is a relatively new CPU feature to enhance security by // protecting control flow integrity. If the feature is enabled, indirect // branches (i.e. branch instructions that take a register instead of an // immediate) must land on a "landing pad" instruction, or a CPU-level fault // will raise. That prevents an attacker to branch to a middle of a random // function, making ROP or JOP much harder to conduct. // // On x86-64, the landing pad instruction is ENDBR64. That is actually a // repurposed NOP instruction to provide binary compatibility with older // hardware that doesn't support CET. // // The problem here is that the compiler always emits a landing pad at the // beginning fo a global function because it doesn't know whether or not the // function's address is taken in other translation units. As a result, the // resulting binary contains more landing pads than necessary. // // This function rewrites a landing pad with a nop if the function's address // was not actually taken. We can do what the compiler cannot because we // know about all translation units. 
void rewrite_endbr(Context &ctx) { Timer t(ctx, "rewrite_endbr"); constexpr u8 endbr64[] = {0xf3, 0x0f, 0x1e, 0xfa}; constexpr u8 nop[] = {0x0f, 0x1f, 0x40, 0x00}; // Rewrite all endbr64 instructions referred to by function symbols with // NOPs. We handle only global symbols because the compiler doesn't emit // an endbr64 for a file-scoped function in the first place if its address // is not taken within the file. tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { for (Symbol *sym : file->get_global_syms()) { if (sym->file == file && sym->esym().st_type == STT_FUNC) { if (InputSection *isec = sym->get_input_section(); isec && (isec->shdr().sh_flags & SHF_EXECINSTR)) { if (OutputSection *osec = isec->output_section) { u8 *buf = ctx.buf + osec->shdr.sh_offset + isec->offset + sym->value; if (memcmp(buf, endbr64, 4) == 0) memcpy(buf, nop, 4); } } } } }); auto write_back = [&](InputSection *isec, i64 offset) { // If isec has an endbr64 at a given offset, copy that instruction to // the output buffer, possibly overwriting a nop written in the above // loop. if (isec && isec->output_section && (isec->shdr().sh_flags & SHF_EXECINSTR) && 0 <= offset && offset <= isec->contents.size() - 4 && memcmp(isec->contents.data() + offset, endbr64, 4) == 0) memcpy(ctx.buf + isec->output_section->shdr.sh_offset + isec->offset + offset, endbr64, 4); }; // Write back endbr64 instructions if they are referred to by address-taking // relocations. tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { for (std::unique_ptr> &isec : file->sections) { if (isec && isec->is_alive && (isec->shdr().sh_flags & SHF_ALLOC)) { for (const ElfRel &rel : isec->get_rels(ctx)) { if (!is_func_call_rel(rel)) { Symbol *sym = file->symbols[rel.r_sym]; if (sym->esym().st_type == STT_SECTION) write_back(sym->get_input_section(), rel.r_addend); else write_back(sym->get_input_section(), sym->value); } } } } }); // We record addresses of some symbols in the ELF header, .dynamic or in // .dynsym. 
We need to retain endbr64s for such symbols. auto keep = [&](Symbol *sym) { if (sym) write_back(sym->get_input_section(), sym->value); }; keep(ctx.arg.entry); keep(ctx.arg.init); keep(ctx.arg.fini); if (ctx.dynsym) for (Symbol *sym : ctx.dynsym->symbols) if (sym && sym->is_exported) keep(sym); } } // namespace mold #endif ================================================ FILE: src/archive-file.cc ================================================ // This file contains functions to read an archive file (.a file). // An archive file is just a bundle of object files. It's similar to // tar or zip, but the contents are not compressed. // // An archive file is either "regular" or "thin". A regular archive // contains object files directly, while a thin archive contains only // pathnames. In the latter case, actual file contents have to be read // from given pathnames. A regular archive is sometimes called "fat" // archive as opposed to "thin". // // If an archive file is given to the linker, the linker pulls out // object files that are needed to resolve undefined symbols. So, // bunding object files as an archive and giving that archive to the // linker has a different meaning than directly giving the same set of // object files to the linker. The former links only needed object // files, while the latter links all the given object files. // // Therefore, if you link libc.a for example, not all the libc // functions are linked to your binary. Instead, only object files // that provides functions and variables used in your program get // linked. To make this efficient, static library functions are // usually separated to each object file in an archive file. You can // see the contents of libc.a by running `ar t // /usr/lib/x86_64-linux-gnu/libc.a`. 
#include "mold.h"

namespace mold {

namespace {

// Header that precedes every archive member. All fields are fixed-width
// text; only ar_name and ar_size are interpreted in this file.
struct ArHdr {
  char ar_name[16];
  char ar_date[12];
  char ar_uid[6];
  char ar_gid[6];
  char ar_mode[8];
  char ar_size[10];
  char ar_fmag[2];

  // Returns true if the name field begins with `s`.
  bool starts_with(std::string_view s) const {
    return std::string_view(ar_name, s.size()) == s;
  }

  bool is_strtab() const {
    return starts_with("// ");
  }

  bool is_symtab() const {
    return starts_with("/ ") || starts_with("/SYM64/ ");
  }

  // Returns the member's filename. `strtab` is the archive's long-name
  // table (used by SysV-style names). For a BSD-style name, which is
  // stored in front of the member data, `ptr` is advanced past the name.
  std::string read_name(std::string_view strtab, u8 *&ptr) const {
    // BSD-style long filename
    if (starts_with("#1/")) {
      int namelen = atoi(ar_name + 3);
      std::string name{(char *)ptr, (size_t)namelen};
      ptr += namelen;

      // The stored name may be NUL-padded; cut it at the first NUL.
      // find() returns npos, not 0, when there is no NUL, so test for
      // npos explicitly. A NUL at index 0 yields an empty name.
      if (size_t pos = name.find('\0'); pos != name.npos)
        name.resize(pos);
      return name;
    }

    // SysV-style long filename
    if (starts_with("/")) {
      // The name is an offset into the string table and is terminated by
      // "/\n". Stay within the table: it is not NUL-terminated, so a
      // strstr() search could run past its end or find no terminator.
      size_t off = atoi(ar_name + 1);
      if (off > strtab.size())
        off = strtab.size();
      std::string_view rest = strtab.substr(off);
      return std::string(rest.substr(0, rest.find("/\n")));
    }

    // Short filename
    if (const char *end = (char *)memchr(ar_name, '/', sizeof(ar_name)))
      return {ar_name, end};
    return {ar_name, sizeof(ar_name)};
  }
};

} // anonymous namespace

// Reads a thin archive: every regular member is only a pathname, which is
// opened relative to the archive's directory unless it is absolute.
template <typename E>
std::vector<MappedFile *>
read_thin_archive_members(Context<E> &ctx, MappedFile *mf) {
  u8 *begin = mf->data;
  u8 *data = begin + 8;
  std::vector<MappedFile *> vec;
  std::string_view strtab;

  // Require at least two bytes so that a lone trailing padding byte does
  // not make us read a header at the end of the file.
  while (begin + mf->size - data >= 2) {
    // Each header is aligned to a 2 byte boundary.
    if ((begin - data) % 2)
      data++;

    ArHdr &hdr = *(ArHdr *)data;
    u8 *body = data + sizeof(hdr);
    u64 size = atol(hdr.ar_size);

    // Where the next header starts if this member's data is stored inline.
    u8 *inline_end = body + size;

    // Read a string table.
    if (hdr.is_strtab()) {
      strtab = {(char *)body, (size_t)size};
      data = inline_end;
      continue;
    }

    // Skip a symbol table.
    if (hdr.is_symtab()) {
      data = inline_end;
      continue;
    }

    if (!hdr.starts_with("#1/") && !hdr.starts_with("/"))
      Fatal(ctx) << mf->name << ": filename is not stored as a long filename";

    std::string name = hdr.read_name(strtab, body);

    // Skip if symbol table. `data` must be advanced here; otherwise the
    // loop would read the same header again forever.
    if (name == "__.SYMDEF" || name == "__.SYMDEF SORTED") {
      data = inline_end;
      continue;
    }

    std::string path = name.starts_with('/') ?
      name : (path_dirname(mf->name) / name).string();
    vec.push_back(must_open_file(ctx, path));
    vec.back()->thin_parent = mf;

    // A regular member of a thin archive has no data after its header
    // (and BSD-style name), so the next header follows immediately.
    data = body;
  }
  return vec;
}

// Reads a regular ("fat") archive and returns one slice of `mf` per
// member, skipping the string table and symbol tables.
template <typename E>
std::vector<MappedFile *>
read_fat_archive_members(Context<E> &ctx, MappedFile *mf) {
  u8 *begin = mf->data;
  u8 *data = begin + 8;
  std::vector<MappedFile *> vec;
  std::string_view strtab;

  while (begin + mf->size - data >= 2) {
    // Each header is aligned to a 2 byte boundary.
    if ((begin - data) % 2)
      data++;

    ArHdr &hdr = *(ArHdr *)data;
    u8 *body = data + sizeof(hdr);
    u64 size = atol(hdr.ar_size);
    data = body + size;

    // Read if string table
    if (hdr.is_strtab()) {
      strtab = {(char *)body, (size_t)size};
      continue;
    }

    // Skip if symbol table
    if (hdr.is_symtab())
      continue;

    // Read the name field. For a BSD-style name this advances `body`
    // past the name so that it points to the member's contents.
    std::string name = hdr.read_name(strtab, body);

    // Skip if symbol table
    if (name == "__.SYMDEF" || name == "__.SYMDEF SORTED")
      continue;

    vec.push_back(mf->slice(ctx, name, body - begin, data - body));
  }
  return vec;
}

// Dispatches on the 8-byte magic at the start of the file.
template <typename E>
std::vector<MappedFile *>
read_archive_members(Context<E> &ctx, MappedFile *mf) {
  std::string_view str = mf->get_contents();
  if (str.starts_with("!<arch>\n"))
    return read_fat_archive_members(ctx, mf);
  assert(str.starts_with("!<thin>\n"));
  return read_thin_archive_members(ctx, mf);
}

using E = MOLD_TARGET;

template std::vector<MappedFile *>
read_thin_archive_members(Context<E> &, MappedFile *);

template std::vector<MappedFile *>
read_fat_archive_members(Context<E> &, MappedFile *);

template std::vector<MappedFile *>
read_archive_members(Context<E> &, MappedFile *);

} // namespace mold

================================================
FILE: src/cmdline.cc
================================================
#include "config.h"
#include "mold.h"

// NOTE(review): the header names below were missing from this copy of the
// file and are reconstructed from what the code uses; confirm against
// upstream.
#include <filesystem>
#include <regex>
#include <sstream>
#include <sys/stat.h>
#include <sys/types.h>
#include <tbb/global_control.h>
#include <unordered_set>

#if __has_include(<sys/utsname.h>)
# include <sys/utsname.h>
#endif

#if __has_include(<unistd.h>)
# include <unistd.h>
#else
# include <io.h>
# define isatty _isatty
# define chdir _chdir
# define STDERR_FILENO (_fileno(stderr))
#endif

namespace mold {

static const char helpmsg[] = R"( Options: --help Report usage information -v, --version Report version information -V Report version and target information -(, --start-group
Ignored -), --end-group Ignored -C DIR, --directory DIR Change to DIR before doing anything -E, --export-dynamic Put symbols in the dynamic symbol table --no-export-dynamic -F LIBNAME, --filter LIBNAME Set DT_FILTER to the specified value -I FILE, --dynamic-linker FILE Set dynamic linker path --no-dynamic-linker -L DIR, --library-path DIR Add DIR to library search path -M, --print-map Write map file to stdout -N, --omagic Do not page align data; do not make text readonly --no-omagic -O NUMBER Ignored -P AUDITLIB, --depaudit AUDITLIB Set DT_DEPAUDIT to the specified value -S, --strip-debug Strip .debug_* sections -T FILE, --script FILE Read linker script -X, --discard-locals Discard temporary local symbols -e SYMBOL, --entry SYMBOL Set program entry point -f SHLIB, --auxiliary SHLIB Set DT_AUXILIARY to the specified value -h LIBNAME, --soname LIBNAME Set shared library name -l LIBNAME, --library LIBNAME Search for a given library -m TARGET Set target -o FILE, --output FILE Set output filename -q, --emit-relocs Leaves relocation sections in the output -r, --relocatable Generate relocatable output -s, --strip-all Strip .symtab section -u SYMBOL, --undefined SYMBOL Force to resolve SYMBOL -y SYMBOL, --trace-symbol SYMBOL Trace references to SYMBOL --Bdynamic, --dy Link against shared libraries (default) --Bstatic, --dn, --static Do not link against shared libraries --Bsymbolic Bind all symbols locally --Bsymbolic-functions Bind function symbols locally --Bsymbolic-non-weak Bind all but weak symbols locally --Bsymbolic-non-weak-functions Bind all but weak function symbols locally --Bno-symbolic Cancel --Bsymbolic options --Map FILE Write map file to a given file --Tbss=ADDR Set address to .bss --Tdata=ADDR Set address to .data --Ttext=ADDR Set address to .text --allow-multiple-definition Allow multiple definitions --apply-dynamic-relocs Apply link-time values for dynamic relocations (default) --no-apply-dynamic-relocs --as-needed Only set DT_NEEDED if used 
--no-as-needed --audit LIBNAME Set DT_AUDIT to the specified value --build-id [none,md5,sha1,sha256,fast,uuid,HEXSTRING] Generate build ID --no-build-id --chroot DIR Set a given path to the root directory --color-diagnostics=[auto,always,never] Use colors in diagnostics --color-diagnostics Alias for --color-diagnostics=always --compress-debug-sections [none,zlib,zlib-gabi,zstd] Compress .debug_* sections --dc Ignored --dependency-file=FILE Write Makefile-style dependency rules to FILE --defsym=SYMBOL=VALUE Define a symbol alias --demangle Demangle C++ symbols in log messages (default) --no-demangle --detach Create separate debug info file in the background (default) --no-detach --enable-new-dtags Emit DT_RUNPATH for --rpath (default) --disable-new-dtags Emit DT_RPATH for --rpath --execute-only Make executable segments unreadable --dp Ignored --dynamic-list=FILE Read a list of dynamic symbols (implies -Bsymbolic) --dynamic-list-data Add data symbols to dynamic symbols --eh-frame-hdr Create .eh_frame_hdr section --no-eh-frame-hdr --exclude-libs LIB,LIB,.. 
Mark all symbols in given libraries as hidden --export-dynamic-symbol Put symbols matching glob in the dynamic symbol table --export-dynamic-symbol-list=FILE Read a list of dynamic symbols --fatal-warnings Treat warnings as errors --no-fatal-warnings Do not treat warnings as errors (default) --fini SYMBOL Call SYMBOL at unload-time --fork Spawn a child process (default) --no-fork --gc-sections Remove unreferenced sections --no-gc-sections --gdb-index Create .gdb_index for faster gdb startup --hash-style [sysv,gnu,both,none] Set hash style --icf=[all,safe,none] Fold identical code --no-icf --ignore-data-address-equality Allow merging non-executable sections with --icf --image-base ADDR Set the base address to a given value --init SYMBOL Call SYMBOL at load-time --nmagic Do not page align sections --no-nmagic --no-undefined Report undefined symbols (even with --shared) --noinhibit-exec Create an output file even if errors occur --oformat=binary Omit ELF, section, and program headers --pack-dyn-relocs=[relr,none] Pack dynamic relocations --package-metadata=PERCENT_ENCODED_STRING Set a given string to .note.package --perf Print performance statistics --pie, --pic-executable Create a position-independent executable --no-pie, --no-pic-executable --pop-state Restore the state of flags governing input file handling --print-gc-sections[=FILE] Print, or save in FILE, removed unreferenced sections --no-print-gc-sections --print-icf-sections[=FILE] Print, or save in FILE, folded identical sections --no-print-icf-sections --push-state Save the state of flags governing input file handling --quick-exit Use quick_exit to exit (default) --no-quick-exit --relax Optimize instructions (default) --no-relax --repro Embed input files in .repro section --require-defined SYMBOL Require SYMBOL be defined in the final output --retain-symbols-file FILE Keep only symbols listed in FILE --reverse-sections Reverse input sections in the output file --rosegment Put read-only non-executable 
sections in their own segment (default) --no-rosegment Put read-only non-executable sections in an executable segment --rpath DIR Add DIR to the runtime search path --rpath-link DIR Ignored --run COMMAND ARG... Run COMMAND with mold as /usr/bin/ld --section-start=SECTION=ADDR Set address for section --separate-debug-file[=FILE] Separate debug info to the specified file --no-separate-debug-file --shared, --Bshareable Create a shared library --shuffle-sections[=SEED] Randomize the output by shuffling input sections --sort-common Ignored --sort-section Ignored --spare-dynamic-tags NUMBER Reserve the given number of tags in the .dynamic section --spare-program-headers NUMBER Reserve the given number of slots in the program header --start-lib Give following object files in-archive-file semantics --end-lib End the effect of --start-lib --stats Print input statistics --sysroot DIR Set the target system root directory --thread-count COUNT, --threads=COUNT Use COUNT number of threads --threads Use multiple threads (default) --no-threads --trace Print the name of each input file --undefined-glob PATTERN Force to resolve all symbols that match a given pattern --undefined-version Do not report version scripts that refer to undefined symbols --no-undefined-version Report version scripts that refer to undefined symbols (default) --unique PATTERN Don't merge input sections that match a given pattern --unresolved-symbols [report-all,ignore-all,ignore-in-object-files,ignore-in-shared-libs] Handle unresolved symbols --version-script FILE Read version script --warn-common Warn about common symbols --no-warn-common --warn-once Only warn once for each undefined symbol --warn-shared-textrel Warn if the output .so needs text relocations --warn-textrel Warn if the output file needs text relocations --warn-unresolved-symbols Report unresolved symbols as warnings --error-unresolved-symbols Report unresolved symbols as errors (default) --whole-archive Include all objects from static archives 
--no-whole-archive --wrap SYMBOL Use a wrapper function for a given symbol --zero-to-bss Convert all-zero data sections into BSS -z defs Report undefined symbols (even with --shared) -z nodefs -z common-page-size=VALUE Ignored -z execstack Require an executable stack -z noexecstack -z execstack-if-needed Make the stack area executable if an input file explicitly requests it -z initfirst Mark DSO to be initialized first at runtime -z interpose Mark object to interpose all DSOs but the executable -z keep-text-section-prefix Keep .text.{hot,unknown,unlikely,startup,exit} as separate sections in the final binary -z nokeep-text-section-prefix -z lazy Enable lazy function resolution (default) -z max-page-size=VALUE Use VALUE as the memory page size -z nocopyreloc Do not create copy relocations -z nodefaultlib Make the dynamic loader ignore default search paths -z nodelete Mark DSO non-deletable at runtime -z nodlopen Mark DSO not available to dlopen -z nodump Mark DSO not available to dldump -z now Disable lazy function resolution -z origin Mark object requiring immediate $ORIGIN processing at runtime -z pack-relative-relocs Alias for --pack-dyn-relocs=relr -z nopack-relative-relocs -z sectionheader Do not omit section header (default) -z nosectionheader Omit section header -z start_stop_visibility=[hidden,protected] Specify symbol visibility for "__start_SECNAME" and "__stop_SECNAME" symbols -z separate-loadable-segments Separate all loadable segments onto different pages -z separate-code Separate code and data onto different pages -z noseparate-code Allow overlap in pages -z stack-size=VALUE Set the size of the stack segment -z relro Make some sections read-only after relocation (default) -z norelro -z rewrite-endbr Rewrite indirect branch target instructions with NOPs -z norewrite-endbr -z rodynamic Make the .dynamic section read-only -z text Report error if DT_TEXTREL is set -z notext -z textoff mold: supported targets: elf32-i386 elf64-x86-64 elf32-littlearm 
elf64-littleaarch64 elf64-bigaarch64 elf32-littleriscv elf32-bigriscv elf64-littleriscv elf64-bigriscv elf32-powerpc elf64-powerpc elf64-powerpc elf64-powerpcle elf64-s390 elf64-sparc elf32-m68k elf32-sh-linux elf64-loongarch elf32-loongarch mold: supported emulations: elf_i386 elf_x86_64 armelf_linux_eabi aarch64elf aarch64linux aarch64elfb aarch64linuxb elf32lriscv elf32briscv elf64lriscv elf64briscv elf32ppc elf32ppclinux elf64ppc elf64lppc elf64_s390 elf64_sparc m68kelf shlelf_linux shelf_linux elf64loongarch elf32loongarch)"; // If a command line argument is in the form of `@path/to/some/file` (i.e. // it starts with an atsign), the linker reads the given file and // interprets its contents as a list of command line arguments. A file // containing command line arguments is called a "response file". // // A response file is often used to pass a very large number of arguments // to the linker without exceeding the kernel's command line length limit. // // This function opens a given file, tokenizes its contents, and returns a // list of tokens. template static std::vector read_response_file(Context &ctx, std::string_view path, i64 depth) { if (depth > 10) Fatal(ctx) << path << ": response file nesting too deep"; MappedFile *mf = must_open_file(ctx, std::string(path)); mf->is_dependency = false; std::vector vec; std::ostringstream os; char quote = 0; // Each state represents the type of characters currently being read. // SPACE indicates blank characters between tokens, BARE indicates an // unquoted token, and QUOTED indicates a quoted token. enum { SPACE, BARE, QUOTED } state = SPACE; for (i64 i = 0; i <= mf->size; i++) { char c = (i < mf->size) ? mf->data[i] : 0; char c2 = (i + 1 < mf->size) ? 
mf->data[i + 1] : 0; if (c == '\\' && c2 == 0) Fatal(ctx) << path << ": premature end of input"; switch (state) { case SPACE: if (c == 0 || isspace(c)) break; if (c == '\\') { os << c2; state = BARE; i++; break; } if (c == '\'' || c == '"') { quote = c; state = QUOTED; break; } os << c; state = BARE; break; case BARE: if (c == 0 || isspace(c)) { vec.push_back(os.str()); os = {}; state = SPACE; break; } if (c == '\\') { os << c2; i++; break; } if (c == '\'' || c == '"') { quote = c; state = QUOTED; break; } os << c; break; case QUOTED: if (c == 0) Fatal(ctx) << path << ": premature end of input"; if (c == '\\') { os << c2; i++; break; } if (c == quote) { state = BARE; break; } os << c; break; } } std::vector vec2; for (std::string &tok : vec) { if (tok.starts_with('@')) append(vec2, read_response_file(ctx, tok.substr(1), depth + 1)); else vec2.push_back(save_string(ctx, tok)); } return vec2; } // Replace "@path/to/some/text/file" with its file contents. template std::vector expand_response_files(Context &ctx, char **argv) { std::vector vec; for (i64 i = 0; argv[i]; i++) { if (argv[i][0] == '@') append(vec, read_response_file(ctx, argv[i] + 1, 1)); else vec.push_back(argv[i]); } return vec; } static std::string_view string_trim(std::string_view str) { size_t pos = str.find_first_not_of(" \t"); if (pos == str.npos) return ""; str = str.substr(pos); pos = str.find_last_not_of(" \t"); if (pos == str.npos) return str; return str.substr(0, pos + 1); } static std::vector add_dashes(std::string name) { // Single-letter option if (name.size() == 1) return {"-" + name}; // Multi-letter linker options can be preceded by either a single // dash or double dashes except ones starting with "o", which must // be preceded by double dashes. For example, "-omagic" is // interpreted as "-o magic". If you really want to specify the // "omagic" option, you have to pass "--omagic". 
if (name[0] == 'o') return {"--" + name}; return {"-" + name, "--" + name}; } template static i64 parse_hex(Context &ctx, std::string opt, std::string_view value) { auto flags = std::regex_constants::optimize | std::regex_constants::ECMAScript; static std::regex re(R"((?:0x|0X)?([0-9a-fA-F]+))", flags); std::cmatch m; if (!std::regex_match(value.data(), value.data() + value.size(), m, re)) Fatal(ctx) << "option -" << opt << ": not a hexadecimal number"; return std::stoul(m[1], nullptr, 16); } template static i64 parse_number(Context &ctx, std::string opt, std::string_view value) { size_t nread; if (value.starts_with('-')) { i64 ret = std::stoul(std::string(value.substr(1)), &nread, 0); if (value.size() - 1 != nread) Fatal(ctx) << "option -" << opt << ": not a number: " << value; return -ret; } i64 ret = std::stoul(std::string(value), &nread, 0); if (value.size() != nread) Fatal(ctx) << "option -" << opt << ": not a number: " << value; return ret; } static char from_hex(char c) { if ('0' <= c && c <= '9') return c - '0'; if ('a' <= c && c <= 'f') return c - 'a' + 10; assert('A' <= c && c <= 'F'); return c - 'A' + 10; } template static std::vector parse_hex_build_id(Context &ctx, std::string_view arg) { auto flags = std::regex_constants::optimize | std::regex_constants::ECMAScript; static std::regex re(R"(0[xX]([0-9a-fA-F][0-9a-fA-F])+)", flags); if (!std::regex_match(arg.begin(), arg.end(), re)) Fatal(ctx) << "invalid build-id: " << arg; std::vector vec; for (i64 i = 2; i < arg.size(); i += 2) vec.push_back((from_hex(arg[i]) << 4) | from_hex(arg[i + 1])); return vec; } template static std::string parse_package_metadata(Context &ctx, std::string_view arg) { auto flags = std::regex_constants::optimize | std::regex_constants::ECMAScript; static std::regex re(R"(([^%]|%[0-9a-fA-F][0-9a-fA-F])*)", flags); if (!std::regex_match(arg.begin(), arg.end(), re)) Fatal(ctx) << "--package-metadata: invalid string: " << arg; std::ostringstream out; while (!arg.empty()) { if 
(arg[0] == '%') { out << (char)((from_hex(arg[1]) << 4) | from_hex(arg[2])); arg = arg.substr(3); } else { out << arg[0]; arg = arg.substr(1); } } return out.str(); } static std::vector split_string(std::string_view str, std::string_view sep) { std::vector vec; for (;;) { i64 pos = str.find_first_of(sep); if (pos == str.npos) { vec.push_back(str); break; } vec.push_back(str.substr(0, pos)); str = str.substr(pos + 1); } return vec; } template static void read_retain_symbols_file(Context &ctx, std::string_view path) { MappedFile *mf = must_open_file(ctx, std::string(path)); std::string_view data((char *)mf->data, mf->size); std::vector *> vec; while (!data.empty()) { size_t pos = data.find('\n'); std::string_view name; if (pos == data.npos) { name = data; data = ""; } else { name = data.substr(0, pos); data = data.substr(pos + 1); } name = string_trim(name); if (!name.empty()) vec.push_back(get_symbol(ctx, name)); } ctx.arg.retain_symbols_file = std::move(vec); } static bool is_file(const std::filesystem::path& path) { std::error_code error; return !std::filesystem::is_directory(path, error) && !error; } template static std::vector parse_section_order(Context &ctx, std::string_view arg) { auto flags = std::regex_constants::ECMAScript | std::regex_constants::icase | std::regex_constants::optimize; static std::regex re1(R"(TEXT|DATA|RODATA|BSS)", flags); static std::regex re2(R"([a-zA-Z0-9_.]\S*|EHDR|PHDR)", flags); static std::regex re3(R"(=(0x[0-9a-f]+|\d+))", flags); static std::regex re4(R"(%(0x[0-9a-f]+|\d+))", flags); static std::regex re5(R"(!(\S+))", flags); std::vector vec; for (std::string_view tok : split_string(arg, " \t")) { if (tok.empty()) continue; vec.push_back(SectionOrder{ .token = tok }); SectionOrder &ord = vec.back(); std::cmatch m; if (std::regex_match(tok.data(), tok.data() + tok.size(), m, re1)) { ord.type = SectionOrder::GROUP; ord.name = m[0].str(); } else if (std::regex_match(tok.data(), tok.data() + tok.size(), m, re2)) { ord.type = 
SectionOrder::SECTION; ord.name = m[0].str(); } else if (std::regex_match(tok.data(), tok.data() + tok.size(), m, re3)) { ord.type = SectionOrder::ADDR; std::string s = m[1]; ord.value = std::stoull(s, nullptr, s.starts_with("0x") ? 16 : 10); } else if (std::regex_match(tok.data(), tok.data() + tok.size(), m, re4)) { ord.type = SectionOrder::ALIGN; std::string s = m[1]; ord.value = std::stoull(s, nullptr, s.starts_with("0x") ? 16 : 10); } else if (std::regex_match(tok.data(), tok.data() + tok.size(), m, re5)) { ord.type = SectionOrder::SYMBOL; ord.name = m[1].str(); } else { Fatal(ctx) << "--section-order: parse error: " << arg; } } bool is_first = true; for (SectionOrder &ord : vec) { if (ord.type == SectionOrder::SECTION) { if (is_first) { is_first = false; } else if (ord.name == "EHDR") { Fatal(ctx) << "--section-order: EHDR must be the first " << "section specifier: " << arg; } } } return vec; } template static std::variant *, u64> parse_defsym_value(Context &ctx, std::string_view s) { if (s.starts_with("0x") || s.starts_with("0X")) { size_t nread; u64 addr = std::stoull(std::string(s), &nread, 16); if (s.size() != nread) return {}; return addr; } if (s.find_first_not_of("0123456789") == s.npos) return (u64)std::stoull(std::string(s), nullptr, 10); return get_symbol(ctx, s); } // Parses a kernel version string, e.g. "6.8.0-47-generic". static std::tuple parse_kernel_version(std::string str) { auto flags = std::regex_constants::optimize | std::regex_constants::ECMAScript; static std::regex re(R"(^(\d+)\.(\d+)\.(\d+))", flags); std::smatch m; if (!std::regex_search(str, m, re)) return {0, 0, 0}; return {std::stoi(m[1]), std::stoi(m[2]), std::stoi(m[3])}; } // Version 6.11 and 6.12 of the Linux kernel does not return ETXTBSY for // open(2) on an executable file that is currently running. This function // returns true if we are running on a Linux kernel older than 6.11 or newer // than 6.12. 
static bool returns_etxtbsy() { #if HAVE_UNAME struct utsname buf; if (uname(&buf) == 0 && strcmp(buf.sysname, "Linux") == 0) { std::tuple ver = parse_kernel_version(buf.release); return ver < std::tuple{6, 11, 0} || std::tuple{6, 13, 0} <= ver; } #endif return false; } template std::vector parse_nonpositional_args(Context &ctx) { std::span args = ctx.cmdline_args; args = args.subspan(1); std::vector remaining; std::string_view arg; ctx.arg.color_diagnostics = isatty(STDERR_FILENO); bool version_shown = false; bool warn_shared_textrel = false; bool error_unresolved_symbols = true; std::optional z_separate_code; std::optional allow_shlib_undefined; std::optional report_undefined; std::optional z_relro; std::optional z_dynamic_undefined_weak; std::optional separate_debug_file; std::optional shuffle_sections_seed; std::unordered_set rpaths; std::vector version_scripts; auto add_rpath = [&](std::string_view arg) { if (rpaths.insert(arg).second) { if (!ctx.arg.rpaths.empty()) ctx.arg.rpaths += ':'; ctx.arg.rpaths += arg; } }; // RISC-V and LoongArch object files contains lots of local symbols, // so by default we discard them. This is compatible with GNU ld. if constexpr (is_riscv || is_loongarch) ctx.arg.discard_locals = true; // We generally don't need to write addends to relocated places if the // relocation type is RELA because RELA records contain addends. // However, there are too much code that wrongly assumes that addends // are written to both RELA records and relocated places, so we write // addends to relocated places by default. There are a few exceptions: // // - It looks like the SPARC's dynamic linker takes both RELA's r_addend // and the value at the relocated place. So we don't want to write // values to relocated places. // // - Static PIE binaries crash on startup in some RISC-V environment if // we write addends to relocated places. 
ctx.arg.apply_dynamic_relocs = !is_sparc && !is_riscv; auto read_arg = [&](std::string name) { for (const std::string &opt : add_dashes(name)) { if (args[0] == opt) { if (args.size() == 1) Fatal(ctx) << "option -" << name << ": argument missing"; arg = args[1]; args = args.subspan(2); return true; } std::string prefix = (name.size() == 1) ? opt : opt + "="; if (args[0].starts_with(prefix)) { arg = args[0].substr(prefix.size()); args = args.subspan(1); return true; } } return false; }; auto read_eq = [&](std::string name) { for (const std::string &opt : add_dashes(name)) { if (args[0].starts_with(opt + "=")) { arg = args[0].substr(opt.size() + 1); args = args.subspan(1); return true; } } return false; }; auto read_flag = [&](std::string name) { for (const std::string &opt : add_dashes(name)) { if (args[0] == opt) { args = args.subspan(1); return true; } } return false; }; auto read_z_flag = [&](std::string name) { if (args.size() >= 2 && args[0] == "-z" && args[1] == name) { args = args.subspan(2); return true; } if (!args.empty() && args[0] == "-z" + name) { args = args.subspan(1); return true; } return false; }; auto read_z_arg = [&](std::string name) { if (args.size() >= 2 && args[0] == "-z" && args[1].starts_with(name + "=")) { arg = args[1].substr(name.size() + 1); args = args.subspan(2); return true; } if (!args.empty() && args[0].starts_with("-z" + name + "=")) { arg = args[0].substr(name.size() + 3); args = args.subspan(1); return true; } return false; }; while (!args.empty()) { if (read_flag("help")) { Out(ctx) << "Usage: " << ctx.cmdline_args[0] << " [options] file...\n" << helpmsg; exit(0); } if (read_arg("o") || read_arg("output")) { ctx.arg.output = arg; } else if (read_arg("dynamic-linker") || read_arg("I")) { ctx.arg.dynamic_linker = arg; } else if (read_flag("no-dynamic-linker")) { ctx.arg.dynamic_linker = ""; } else if (read_flag("v")) { Out(ctx) << mold_version; version_shown = true; } else if (read_flag("version")) { Out(ctx) << mold_version; 
exit(0); } else if (read_flag("V")) { Out(ctx) << mold_version << "\n Supported emulations:\n elf_x86_64\n elf_i386\n" << " aarch64elf\n aarch64linux\n aarch64elfb\n" << " aarch64linuxb\n armelf_linux_eabi\n elf64lriscv\n" << " elf64briscv\n elf32lriscv\n elf32briscv\n" << " elf32ppc\n elf64ppc\n elf64lppc\n elf64_s390\n" << " elf64_sparc\n m68kelf\n shlelf_linux\n" << " shelf_linux\n elf64loongarch\n elf32loongarch"; version_shown = true; } else if (read_arg("mllvm")) { ctx.arg.plugin_opt.emplace_back(arg); } else if (read_arg("m")) { auto check = [&](bool supported, std::string_view name) { if (!supported) Fatal(ctx) << "'-m " << arg << "' is not supported; you may want to" << " rebuild mold with " << name << " support"; }; if (arg == "elf_x86_64") { check(HAVE_TARGET_X86_64, X86_64::name); ctx.arg.emulation = X86_64::name; } else if (arg == "elf_i386") { check(HAVE_TARGET_I386, I386::name); ctx.arg.emulation = I386::name; } else if (arg == "aarch64elf" || arg == "aarch64linux") { check(HAVE_TARGET_ARM64LE, ARM64LE::name); ctx.arg.emulation = ARM64LE::name; } else if (arg == "aarch64elfb" || arg == "aarch64linuxb") { check(HAVE_TARGET_ARM64BE, ARM64BE::name); ctx.arg.emulation = ARM64BE::name; } else if (arg == "armelf_linux_eabi") { check(HAVE_TARGET_ARM32LE, ARM32LE::name); ctx.arg.emulation = ARM32LE::name; } else if (arg == "armelfb_linux_eabi") { check(HAVE_TARGET_ARM32BE, ARM32BE::name); ctx.arg.emulation = ARM32BE::name; } else if (arg == "elf64lriscv") { check(HAVE_TARGET_RV64LE, RV64LE::name); ctx.arg.emulation = RV64LE::name; } else if (arg == "elf64briscv") { check(HAVE_TARGET_RV64BE, RV64BE::name); ctx.arg.emulation = RV64BE::name; } else if (arg == "elf32lriscv") { check(HAVE_TARGET_RV32LE, RV32LE::name); ctx.arg.emulation = RV32LE::name; } else if (arg == "elf32briscv") { check(HAVE_TARGET_RV32BE, RV32BE::name); ctx.arg.emulation = RV32BE::name; } else if (arg == "elf32ppc" || arg == "elf32ppclinux") { check(HAVE_TARGET_PPC32, PPC32::name); 
ctx.arg.emulation = PPC32::name; } else if (arg == "elf64ppc") { check(HAVE_TARGET_PPC64V1, PPC64V1::name); ctx.arg.emulation = PPC64V1::name; } else if (arg == "elf64lppc") { check(HAVE_TARGET_PPC64V2, PPC64V2::name); ctx.arg.emulation = PPC64V2::name; } else if (arg == "elf64_s390") { check(HAVE_TARGET_S390X, S390X::name); ctx.arg.emulation = S390X::name; } else if (arg == "elf64_sparc") { check(HAVE_TARGET_SPARC64, SPARC64::name); ctx.arg.emulation = SPARC64::name; } else if (arg == "m68kelf") { check(HAVE_TARGET_M68K, M68K::name); ctx.arg.emulation = M68K::name; } else if (arg == "shlelf" || arg == "shlelf_linux") { check(HAVE_TARGET_SH4LE, SH4LE::name); ctx.arg.emulation = SH4LE::name; } else if (arg == "shelf" || arg == "shelf_linux") { check(HAVE_TARGET_SH4BE, SH4BE::name); ctx.arg.emulation = SH4BE::name; } else if (arg == "elf64loongarch") { check(HAVE_TARGET_LOONGARCH64, LOONGARCH64::name); ctx.arg.emulation = LOONGARCH64::name; } else if (arg == "elf32loongarch") { check(HAVE_TARGET_LOONGARCH32, LOONGARCH32::name); ctx.arg.emulation = LOONGARCH32::name; } else { Fatal(ctx) << "unknown -m argument: " << arg; } } else if (read_flag("end-lib")) { remaining.emplace_back("--end-lib"); } else if (read_flag("export-dynamic") || read_flag("E")) { ctx.arg.export_dynamic = true; } else if (read_flag("no-export-dynamic")) { ctx.arg.export_dynamic = false; } else if (read_flag("Bsymbolic")) { ctx.arg.Bsymbolic = BSYMBOLIC_ALL; } else if (read_flag("Bsymbolic-functions")) { ctx.arg.Bsymbolic = BSYMBOLIC_FUNCTIONS; } else if (read_flag("Bsymbolic-non-weak")) { ctx.arg.Bsymbolic = BSYMBOLIC_NON_WEAK; } else if (read_flag("Bsymbolic-non-weak-functions")) { ctx.arg.Bsymbolic = BSYMBOLIC_NON_WEAK_FUNCTIONS; } else if (read_flag("Bno-symbolic")) { ctx.arg.Bsymbolic = BSYMBOLIC_NONE; } else if (read_arg("exclude-libs")) { for (std::string_view lib : split_string(arg, ",:")) ctx.arg.exclude_libs.insert(lib); } else if (read_flag("q") || read_flag("emit-relocs")) { 
ctx.arg.emit_relocs = true; ctx.arg.discard_locals = false; } else if (read_arg("e") || read_arg("entry")) { ctx.arg.entry = get_symbol(ctx, arg); } else if (read_arg("Map")) { ctx.arg.Map = arg; ctx.arg.print_map = true; } else if (read_flag("print-dependencies")) { ctx.arg.print_dependencies = true; } else if (read_flag("print-map") || read_flag("M")) { ctx.arg.print_map = true; } else if (read_flag("Bstatic") || read_flag("dn") || read_flag("static")) { remaining.emplace_back("--Bstatic"); } else if (read_flag("Bdynamic") || read_flag("dy")) { remaining.emplace_back("--Bdynamic"); } else if (read_flag("shared") || read_flag("Bshareable")) { ctx.arg.shared = true; } else if (read_arg("spare-dynamic-tags")) { ctx.arg.spare_dynamic_tags = parse_number(ctx, "spare-dynamic-tags", arg); } else if (read_arg("spare-program-headers")) { ctx.arg.spare_program_headers = parse_number(ctx, "spare-program-headers", arg); } else if (read_flag("start-lib")) { remaining.emplace_back("--start-lib"); } else if (read_flag("start-stop")) { ctx.arg.start_stop = true; } else if (read_arg("dependency-file")) { ctx.arg.dependency_file = arg; } else if (read_arg("defsym")) { size_t pos = arg.find('='); if (pos == arg.npos || pos == arg.size() - 1) Fatal(ctx) << "-defsym: syntax error: " << arg; ctx.arg.defsyms.emplace_back(get_symbol(ctx, arg.substr(0, pos)), parse_defsym_value(ctx, arg.substr(pos + 1))); } else if (read_flag(":lto-pass2")) { ctx.arg.lto_pass2 = true; } else if (read_arg(":ignore-ir-file")) { ctx.arg.ignore_ir_file.insert(arg); } else if (read_flag("demangle")) { ctx.arg.demangle = true; } else if (read_flag("no-demangle")) { ctx.arg.demangle = false; } else if (read_flag("detach")) { ctx.arg.detach = true; } else if (read_flag("no-detach")) { ctx.arg.detach = false; } else if (read_flag("default-symver")) { ctx.arg.default_symver = true; } else if (read_flag("noinhibit-exec")) { ctx.arg.noinhibit_exec = true; } else if (read_flag("shuffle-sections")) { 
ctx.arg.shuffle_sections = SHUFFLE_SECTIONS_SHUFFLE; } else if (read_eq("shuffle-sections")) { ctx.arg.shuffle_sections = SHUFFLE_SECTIONS_SHUFFLE; shuffle_sections_seed = parse_number(ctx, "shuffle-sections", arg); } else if (read_flag("reverse-sections")) { ctx.arg.shuffle_sections = SHUFFLE_SECTIONS_REVERSE; } else if (read_flag("rosegment")) { ctx.arg.rosegment = true; } else if (read_flag("no-rosegment")) { ctx.arg.rosegment = false; } else if (read_arg("y") || read_arg("trace-symbol")) { ctx.arg.trace_symbol.push_back(arg); } else if (read_arg("filler")) { ctx.arg.filler = parse_hex(ctx, "filler", arg); } else if (read_arg("L") || read_arg("library-path")) { ctx.arg.library_paths.emplace_back(arg); } else if (read_arg("sysroot")) { ctx.arg.sysroot = arg; } else if (read_arg("unique")) { if (!ctx.arg.unique.add(arg, 1)) Fatal(ctx) << "-unique: invalid glob pattern: " << arg; } else if (read_arg("unresolved-symbols")) { if (arg == "report-all" || arg == "ignore-in-shared-libs") report_undefined = true; else if (arg == "ignore-all" || arg == "ignore-in-object-files") report_undefined = false; else Fatal(ctx) << "unknown --unresolved-symbols argument: " << arg; } else if (read_arg("undefined") || read_arg("u")) { ctx.arg.undefined.push_back(get_symbol(ctx, arg)); } else if (read_arg("undefined-glob")) { if (!ctx.arg.undefined_glob.add(arg, 0)) Fatal(ctx) << "--undefined-glob: invalid pattern: " << arg; } else if (read_arg("require-defined")) { ctx.arg.require_defined.push_back(get_symbol(ctx, arg)); } else if (read_arg("init")) { ctx.arg.init = get_symbol(ctx, arg); } else if (read_arg("fini")) { ctx.arg.fini = get_symbol(ctx, arg); } else if (read_arg("hash-style")) { if (arg == "sysv") { ctx.arg.hash_style_sysv = true; ctx.arg.hash_style_gnu = false; } else if (arg == "gnu") { ctx.arg.hash_style_sysv = false; ctx.arg.hash_style_gnu = true; } else if (arg == "both") { ctx.arg.hash_style_sysv = true; ctx.arg.hash_style_gnu = true; } else if (arg == "none") { 
ctx.arg.hash_style_sysv = false; ctx.arg.hash_style_gnu = false; } else { Fatal(ctx) << "invalid --hash-style argument: " << arg; } } else if (read_arg("soname") || read_arg("h")) { ctx.arg.soname = arg; } else if (read_arg("audit")) { if (!ctx.arg.audit.empty()) ctx.arg.audit += ':'; ctx.arg.audit += std::string(arg); } else if (read_arg("depaudit") || read_arg("P")) { if (!ctx.arg.depaudit.empty()) ctx.arg.depaudit += ':'; ctx.arg.depaudit += std::string(arg); } else if (read_flag("allow-multiple-definition")) { ctx.arg.allow_multiple_definition = true; } else if (read_flag("apply-dynamic-relocs")) { ctx.arg.apply_dynamic_relocs = true; } else if (read_flag("no-apply-dynamic-relocs")) { ctx.arg.apply_dynamic_relocs = false; } else if (read_flag("trace")) { ctx.arg.trace = true; } else if (read_flag("eh-frame-hdr")) { ctx.arg.eh_frame_hdr = true; } else if (read_flag("no-eh-frame-hdr")) { ctx.arg.eh_frame_hdr = false; } else if (read_flag("pie") || read_flag("pic-executable")) { ctx.arg.pic = true; ctx.arg.pie = true; } else if (read_flag("no-pie") || read_flag("no-pic-executable") || read_flag("nopie")) { ctx.arg.pic = false; ctx.arg.pie = false; } else if (read_flag("relax")) { ctx.arg.relax = true; } else if (read_flag("no-relax")) { ctx.arg.relax = false; } else if (read_flag("gdb-index")) { ctx.arg.gdb_index = true; } else if (read_flag("no-gdb-index")) { ctx.arg.gdb_index = false; } else if (read_flag("r") || read_flag("relocatable")) { ctx.arg.relocatable = true; ctx.arg.emit_relocs = true; ctx.arg.discard_locals = false; } else if (read_flag("relocatable-merge-sections")) { ctx.arg.relocatable_merge_sections = true; } else if (read_flag("perf")) { ctx.arg.perf = true; } else if (read_flag("pack-dyn-relocs=relr") || read_z_flag("pack-relative-relocs")) { ctx.arg.pack_dyn_relocs_relr = true; } else if (read_flag("pack-dyn-relocs=none") || read_z_flag("nopack-relative-relocs")) { ctx.arg.pack_dyn_relocs_relr = false; } else if (read_arg("package-metadata")) { 
ctx.arg.package_metadata = parse_package_metadata(ctx, arg); } else if (read_flag("stats")) { ctx.arg.stats = true; Counter::enabled = true; } else if (read_arg("C") || read_arg("directory")) { ctx.arg.directory = arg; } else if (read_arg("chroot")) { ctx.arg.chroot = arg; } else if (read_flag("color-diagnostics") || read_flag("color-diagnostics=auto")) { ctx.arg.color_diagnostics = isatty(STDERR_FILENO); } else if (read_flag("color-diagnostics=always")) { ctx.arg.color_diagnostics = true; } else if (read_flag("color-diagnostics=never")) { ctx.arg.color_diagnostics = false; } else if (read_flag("warn-common")) { ctx.arg.warn_common = true; } else if (read_flag("no-warn-common")) { ctx.arg.warn_common = false; } else if (read_flag("warn-once")) { ctx.arg.warn_once = true; } else if (read_flag("warn-shared-textrel")) { warn_shared_textrel = true; } else if (read_flag("warn-textrel")) { ctx.arg.warn_textrel = true; } else if (read_flag("enable-new-dtags")) { ctx.arg.enable_new_dtags = true; } else if (read_flag("disable-new-dtags")) { ctx.arg.enable_new_dtags = false; } else if (read_flag("execute-only")) { ctx.arg.execute_only = true; } else if (read_flag("zero-to-bss")) { ctx.arg.zero_to_bss = true; } else if (read_arg("compress-debug-sections")) { if (arg == "zlib" || arg == "zlib-gabi") ctx.arg.compress_debug_sections = ELFCOMPRESS_ZLIB; else if (arg == "zstd") ctx.arg.compress_debug_sections = ELFCOMPRESS_ZSTD; else if (arg == "none") ctx.arg.compress_debug_sections = ELFCOMPRESS_NONE; else Fatal(ctx) << "invalid --compress-debug-sections argument: " << arg; } else if (read_arg("wrap")) { ctx.arg.wrap.insert(arg); } else if (read_flag("omagic") || read_flag("N")) { ctx.arg.omagic = true; } else if (read_flag("no-omagic")) { ctx.arg.omagic = false; } else if (read_arg("oformat")) { if (arg != "binary") Fatal(ctx) << "-oformat: " << arg << " is not supported"; ctx.arg.oformat_binary = true; } else if (read_arg("retain-symbols-file")) { read_retain_symbols_file(ctx, 
arg); } else if (read_arg("section-align")) { size_t pos = arg.find('='); if (pos == arg.npos || pos == arg.size() - 1) Fatal(ctx) << "--section-align: syntax error: " << arg; i64 value = parse_number(ctx, "section-align", arg.substr(pos + 1)); if (!has_single_bit(value)) Fatal(ctx) << "--section-align=" << arg << ": value must be a power of 2"; ctx.arg.section_align[arg.substr(0, pos)] = value; } else if (read_arg("section-start")) { size_t pos = arg.find('='); if (pos == arg.npos || pos == arg.size() - 1) Fatal(ctx) << "--section-start: syntax error: " << arg; ctx.arg.section_start[arg.substr(0, pos)] = parse_hex(ctx, "section-start", arg.substr(pos + 1)); } else if (read_arg("section-order")) { ctx.arg.section_order = parse_section_order(ctx, arg); } else if (read_arg("Tbss")) { ctx.arg.section_start[".bss"] = parse_hex(ctx, "Tbss", arg); } else if (read_arg("Tdata")) { ctx.arg.section_start[".data"] = parse_hex(ctx, "Tdata", arg); } else if (read_arg("Ttext")) { ctx.arg.section_start[".text"] = parse_hex(ctx, "Ttext", arg); } else if (read_flag("repro")) { ctx.arg.repro = true; } else if (read_z_flag("now")) { ctx.arg.z_now = true; } else if (read_z_flag("lazy")) { ctx.arg.z_now = false; } else if (read_z_flag("cet-report=none")) { ctx.arg.z_cet_report = CET_REPORT_NONE; } else if (read_z_flag("cet-report=warning")) { ctx.arg.z_cet_report = CET_REPORT_WARNING; } else if (read_z_flag("cet-report=error")) { ctx.arg.z_cet_report = CET_REPORT_ERROR; } else if (read_z_flag("execstack")) { ctx.arg.z_execstack = true; } else if (read_z_flag("execstack-if-needed")) { ctx.arg.z_execstack_if_needed = true; } else if (read_z_arg("max-page-size")) { ctx.page_size = parse_number(ctx, "-z max-page-size", arg); if (!has_single_bit(ctx.page_size)) Fatal(ctx) << "-z max-page-size " << arg << ": value must be a power of 2"; } else if (read_z_flag("start-stop-visibility=protected")) { ctx.arg.z_start_stop_visibility_protected = true; } else if 
(read_z_flag("start-stop-visibility=hidden")) { ctx.arg.z_start_stop_visibility_protected = false; } else if (read_z_flag("noexecstack")) { ctx.arg.z_execstack = false; } else if (read_z_flag("relro")) { z_relro = true; } else if (read_z_flag("norelro")) { z_relro = false; } else if (read_z_flag("defs") || read_flag("no-undefined")) { report_undefined = true; } else if (read_z_flag("undefs")) { report_undefined = false; } else if (read_z_flag("nodlopen")) { ctx.arg.z_dlopen = false; } else if (read_z_flag("nodelete")) { ctx.arg.z_delete = false; } else if (read_z_flag("nocopyreloc")) { ctx.arg.z_copyreloc = false; } else if (read_z_flag("nodump")) { ctx.arg.z_dump = false; } else if (read_z_flag("initfirst")) { ctx.arg.z_initfirst = true; } else if (read_z_flag("interpose")) { ctx.arg.z_interpose = true; } else if (read_z_flag("ibt")) { ctx.arg.z_ibt = true; } else if (read_z_flag("ibtplt")) { } else if (read_z_flag("muldefs")) { ctx.arg.allow_multiple_definition = true; } else if (read_z_flag("keep-text-section-prefix")) { ctx.arg.z_keep_text_section_prefix = true; } else if (read_z_flag("nokeep-text-section-prefix")) { ctx.arg.z_keep_text_section_prefix = false; } else if (read_z_flag("shstk")) { ctx.arg.z_shstk = true; } else if (read_z_flag("text")) { ctx.arg.z_text = true; } else if (read_z_flag("notext") || read_z_flag("textoff")) { ctx.arg.z_text = false; } else if (read_z_flag("origin")) { ctx.arg.z_origin = true; } else if (read_z_flag("nodefaultlib")) { ctx.arg.z_nodefaultlib = true; } else if (read_eq("separate-debug-file")) { separate_debug_file = arg; } else if (read_flag("separate-debug-file")) { separate_debug_file = ""; } else if (read_flag("no-separate-debug-file")) { separate_debug_file.reset(); } else if (read_z_flag("separate-loadable-segments")) { z_separate_code = SEPARATE_LOADABLE_SEGMENTS; } else if (read_z_flag("separate-code")) { z_separate_code = SEPARATE_CODE; } else if (read_z_flag("noseparate-code")) { z_separate_code = 
NOSEPARATE_CODE; } else if (read_z_arg("stack-size")) { ctx.arg.z_stack_size = parse_number(ctx, "-z stack-size", arg); } else if (read_z_flag("dynamic-undefined-weak")) { z_dynamic_undefined_weak = true; } else if (read_z_flag("nodynamic-undefined-weak")) { z_dynamic_undefined_weak = false; } else if (read_z_flag("sectionheader")) { ctx.arg.z_sectionheader = true; } else if (read_z_flag("nosectionheader")) { ctx.arg.z_sectionheader = false; } else if (read_z_flag("rodynamic")) { ctx.arg.z_rodynamic = true; } else if (read_z_flag("x86-64-v2")) { ctx.arg.z_x86_64_isa_level |= GNU_PROPERTY_X86_ISA_1_V2; } else if (read_z_flag("x86-64-v3")) { ctx.arg.z_x86_64_isa_level |= GNU_PROPERTY_X86_ISA_1_V3; } else if (read_z_flag("x86-64-v4")) { ctx.arg.z_x86_64_isa_level |= GNU_PROPERTY_X86_ISA_1_V4; } else if (read_z_flag("rewrite-endbr")) { if constexpr (!is_x86_64) Fatal(ctx) << "-z rewrite-endbr is supported only on x86-64"; ctx.arg.z_rewrite_endbr = true; } else if (read_z_flag("norewrite-endbr")) { ctx.arg.z_rewrite_endbr = false; } else if (read_flag("nmagic")) { ctx.arg.nmagic = true; } else if (read_flag("no-nmagic")) { ctx.arg.nmagic = false; } else if (read_flag("fatal-warnings")) { ctx.arg.fatal_warnings = true; } else if (read_flag("no-fatal-warnings")) { ctx.arg.fatal_warnings = false; } else if (read_flag("fork")) { ctx.arg.fork = true; } else if (read_flag("no-fork")) { ctx.arg.fork = false; } else if (read_flag("gc-sections")) { ctx.arg.gc_sections = true; } else if (read_flag("no-gc-sections")) { ctx.arg.gc_sections = false; } else if (read_flag("print-gc-sections")) { ctx.arg.print_gc_sections = "-"; } else if (read_eq("print-gc-sections")) { ctx.arg.print_gc_sections = arg; } else if (read_flag("no-print-gc-sections")) { ctx.arg.print_gc_sections = ""; } else if (read_arg("discard-section")) { ctx.arg.discard_section.insert(arg); } else if (read_arg("no-discard-section")) { ctx.arg.discard_section.erase(arg); } else if (read_arg("icf")) { if (arg == "all") 
{ ctx.arg.icf = true; ctx.arg.icf_all = true; } else if (arg == "safe") { ctx.arg.icf = true; } else if (arg == "none") { ctx.arg.icf = false; } else { Fatal(ctx) << "unknown --icf argument: " << arg; } } else if (read_flag("no-icf")) { ctx.arg.icf = false; } else if (read_flag("ignore-data-address-equality")) { ctx.arg.ignore_data_address_equality = true; } else if (read_arg("image-base")) { ctx.arg.image_base = parse_number(ctx, "image-base", arg); } else if (read_arg("physical-image-base")) { ctx.arg.physical_image_base = parse_number(ctx, "physical-image-base", arg); } else if (read_flag("print-icf-sections")) { ctx.arg.print_icf_sections = "-"; } else if (read_eq("print-icf-sections")) { ctx.arg.print_icf_sections = arg; } else if (read_flag("no-print-icf-sections")) { ctx.arg.print_icf_sections = ""; } else if (read_flag("quick-exit")) { ctx.arg.quick_exit = true; } else if (read_flag("no-quick-exit")) { ctx.arg.quick_exit = false; } else if (read_arg("plugin")) { ctx.arg.plugin = arg; } else if (read_arg("plugin-opt")) { ctx.arg.plugin_opt.emplace_back(arg); } else if (read_flag("lto-cs-profile-generate")) { ctx.arg.plugin_opt.emplace_back("cs-profile-generate"); } else if (read_arg("lto-cs-profile-file")) { ctx.arg.plugin_opt.push_back("cs-profile-path=" + std::string(arg)); } else if (read_flag("lto-debug-pass-manager")) { ctx.arg.plugin_opt.emplace_back("debug-pass-manager"); } else if (read_flag("disable-verify")) { ctx.arg.plugin_opt.emplace_back("disable-verify"); } else if (read_flag("lto-emit-asm")) { ctx.arg.plugin_opt.emplace_back("emit-asm"); } else if (read_flag("no-legacy-pass-manager")) { ctx.arg.plugin_opt.emplace_back("legacy-pass-manager"); } else if (read_arg("lto-partitions")) { ctx.arg.plugin_opt.push_back("lto-partitions=" + std::string(arg)); } else if (read_flag("no-lto-legacy-pass-manager")) { ctx.arg.plugin_opt.emplace_back("new-pass-manager"); } else if (read_arg("lto-obj-path")) { ctx.arg.plugin_opt.push_back("obj-path=" + 
std::string(arg)); } else if (read_arg("opt-remarks-filename")) { ctx.arg.plugin_opt.push_back("opt-remarks-filename=" + std::string(arg)); } else if (read_arg("opt-remarks-format")) { ctx.arg.plugin_opt.push_back("opt-remarks-format=" + std::string(arg)); } else if (read_arg("opt-remarks-hotness-threshold")) { ctx.arg.plugin_opt.push_back("opt-remarks-hotness-threshold=" + std::string(arg)); } else if (read_arg("opt-remarks-passes")) { ctx.arg.plugin_opt.push_back("opt-remarks-passes=" + std::string(arg)); } else if (read_flag("opt-remarks-with_hotness")) { ctx.arg.plugin_opt.emplace_back("opt-remarks-with-hotness"); } else if (args[0].starts_with("-lto-O")) { ctx.arg.plugin_opt.push_back("O" + std::string(args[0].substr(6))); args = args.subspan(1); } else if (args[0].starts_with("--lto-O")) { ctx.arg.plugin_opt.push_back("O" + std::string(args[0].substr(7))); args = args.subspan(1); } else if (read_arg("lto-pseudo-probe-for-profiling")) { ctx.arg.plugin_opt.push_back("pseudo-probe-for-profiling=" + std::string(arg)); } else if (read_arg("lto-sample-profile")) { ctx.arg.plugin_opt.push_back("sample-profile=" + std::string(arg)); } else if (read_flag("save-temps")) { ctx.arg.plugin_opt.emplace_back("save-temps"); } else if (read_flag("thinlto-emit-imports-files")) { ctx.arg.plugin_opt.emplace_back("thinlto-emit-imports-files"); } else if (read_arg("thinlto-index-only")) { ctx.arg.plugin_opt.push_back("thinlto-index-only=" + std::string(arg)); } else if (read_flag("thinlto-index-only")) { ctx.arg.plugin_opt.emplace_back("thinlto-index-only"); } else if (read_arg("thinlto-object-suffix-replace")) { ctx.arg.plugin_opt.push_back("thinlto-object-suffix-replace=" + std::string(arg)); } else if (read_arg("thinlto-prefix-replace")) { ctx.arg.plugin_opt.push_back("thinlto-prefix-replace=" + std::string(arg)); } else if (read_arg("thinlto-cache-dir")) { ctx.arg.plugin_opt.push_back("cache-dir=" + std::string(arg)); } else if (read_arg("thinlto-cache-policy")) { 
ctx.arg.plugin_opt.push_back("cache-policy=" + std::string(arg)); } else if (read_arg("thinlto-jobs")) { ctx.arg.plugin_opt.push_back("jobs=" + std::string(arg)); } else if (read_arg("thread-count")) { ctx.arg.thread_count = parse_number(ctx, "thread-count", arg); } else if (read_flag("threads")) { ctx.arg.thread_count.reset(); } else if (read_flag("no-threads")) { ctx.arg.thread_count = 1; } else if (read_eq("threads")) { ctx.arg.thread_count = parse_number(ctx, "threads", arg); } else if (read_flag("discard-all") || read_flag("x")) { ctx.arg.discard_all = true; } else if (read_flag("discard-locals") || read_flag("X")) { ctx.arg.discard_locals = true; } else if (read_flag("strip-all") || read_flag("s")) { ctx.arg.strip_all = true; } else if (read_flag("strip-debug") || read_flag("S")) { ctx.arg.strip_debug = true; } else if (read_flag("warn-unresolved-symbols")) { error_unresolved_symbols = false; } else if (read_flag("error-unresolved-symbols")) { error_unresolved_symbols = true; } else if (read_arg("rpath")) { add_rpath(arg); } else if (read_arg("R")) { if (is_file(arg)) Fatal(ctx) << "-R" << arg << ": -R as an alias for --just-symbols is not supported"; add_rpath(arg); } else if (read_flag("undefined-version")) { ctx.arg.undefined_version = true; } else if (read_flag("no-undefined-version")) { ctx.arg.undefined_version = false; } else if (read_flag("build-id")) { ctx.arg.build_id.kind = BuildId::HASH; ctx.arg.build_id.hash_size = 20; } else if (read_arg("build-id")) { if (arg == "none") { ctx.arg.build_id.kind = BuildId::NONE; } else if (arg == "uuid") { ctx.arg.build_id.kind = BuildId::UUID; } else if (arg == "md5") { ctx.arg.build_id.kind = BuildId::HASH; ctx.arg.build_id.hash_size = 16; } else if (arg == "sha1") { ctx.arg.build_id.kind = BuildId::HASH; ctx.arg.build_id.hash_size = 20; } else if (arg == "sha256" || arg == "fast") { ctx.arg.build_id.kind = BuildId::HASH; ctx.arg.build_id.hash_size = 32; } else if (arg.starts_with("0x") || 
arg.starts_with("0X")) { ctx.arg.build_id.kind = BuildId::HEX; ctx.arg.build_id.value = parse_hex_build_id(ctx, arg); } else { Fatal(ctx) << "invalid --build-id argument: " << arg; } } else if (read_flag("no-build-id")) { ctx.arg.build_id.kind = BuildId::NONE; } else if (read_flag("be8")) { ctx.arg.be8 = true; } else if (read_flag("be32")) { ctx.arg.be8 = false; } else if (read_arg("format") || read_arg("b")) { if (arg == "binary") Fatal(ctx) << "mold does not support `-b binary`. If you want to convert a" << " binary file into an object file, use `objcopy -I binary -O" << " default ` instead."; Fatal(ctx) << "unknown command line option: -b " << arg; } else if (read_arg("fuse-ld")) { } else if (read_arg("auxiliary") || read_arg("f")) { ctx.arg.auxiliary.push_back(arg); } else if (read_arg("filter") || read_arg("F")) { ctx.arg.filter.push_back(arg); } else if (read_flag("allow-shlib-undefined")) { allow_shlib_undefined = true; } else if (read_flag("no-allow-shlib-undefined")) { allow_shlib_undefined = false; } else if (read_arg("O")) { } else if (read_flag("EB")) { } else if (read_flag("EL")) { } else if (read_flag("O0")) { } else if (read_flag("O1")) { } else if (read_flag("O2")) { } else if (read_flag("verbose")) { } else if (read_flag("color-diagnostics")) { } else if (read_flag("eh-frame-hdr")) { } else if (read_flag("start-group")) { } else if (read_flag("end-group")) { } else if (read_flag("(")) { } else if (read_flag(")")) { } else if (read_flag("fatal-warnings")) { } else if (read_flag("enable-new-dtags")) { } else if (read_flag("disable-new-dtags")) { } else if (read_flag("nostdlib")) { } else if (read_flag("no-add-needed")) { } else if (read_flag("no-call-graph-profile-sort")) { } else if (read_flag("no-copy-dt-needed-entries")) { } else if (read_arg("sort-section")) { } else if (read_flag("sort-common")) { } else if (read_flag("dc")) { } else if (read_flag("dp")) { } else if (read_flag("fix-cortex-a53-835769")) { } else if 
(read_flag("fix-cortex-a53-843419")) { } else if (read_flag("EL")) { } else if (read_flag("warn-once")) { } else if (read_flag("nodefaultlibs")) { } else if (read_flag("warn-constructors")) { } else if (read_flag("warn-execstack")) { } else if (read_flag("no-warn-execstack")) { } else if (read_flag("long-plt")) { } else if (read_flag("secure-plt")) { } else if (read_arg("rpath-link")) { } else if (read_z_flag("combreloc")) { } else if (read_z_flag("nocombreloc")) { } else if (read_z_arg("common-page-size")) { } else if (read_flag("no-keep-memory")) { } else if (read_arg("max-cache-size")) { } else if (read_flag("mmap-output-file")) { } else if (read_flag("no-mmap-output-file")) { } else if (read_arg("version-script")) { version_scripts.push_back(arg); } else if (read_arg("dynamic-list")) { ctx.arg.Bsymbolic = BSYMBOLIC_ALL; append(ctx.dynamic_list_patterns, parse_dynamic_list(ctx, arg)); } else if (read_arg("dynamic-list-data")) { ctx.arg.dynamic_list_data = true; } else if (read_arg("export-dynamic-symbol")) { ctx.dynamic_list_patterns.push_back({arg, ""}); } else if (read_arg("export-dynamic-symbol-list")) { append(ctx.dynamic_list_patterns, parse_dynamic_list(ctx, arg)); } else if (read_flag("as-needed")) { remaining.emplace_back("--as-needed"); } else if (read_flag("no-as-needed")) { remaining.emplace_back("--no-as-needed"); } else if (read_flag("whole-archive")) { remaining.emplace_back("--whole-archive"); } else if (read_flag("no-whole-archive")) { remaining.emplace_back("--no-whole-archive"); } else if (read_arg("l") || read_arg("library")) { remaining.push_back("-l" + std::string(arg)); } else if (read_arg("script") || read_arg("T")) { remaining.emplace_back(arg); } else if (read_flag("push-state")) { remaining.emplace_back("--push-state"); } else if (read_flag("pop-state")) { remaining.emplace_back("--pop-state"); } else if (args[0].starts_with("-z") && args[0].size() > 2) { Warn(ctx) << "unknown command line option: " << args[0]; args = args.subspan(1); } 
else if (args[0] == "-z" && args.size() >= 2) { Warn(ctx) << "unknown command line option: -z " << args[1]; args = args.subspan(2); } else if (args[0] == "-dynamic") { Fatal(ctx) << "unknown command line option: -dynamic; -dynamic is a " << "macOS linker's option. mold does not support macOS."; } else { if (args[0].starts_with('-')) Fatal(ctx) << "unknown command line option: " << args[0]; remaining.emplace_back(args[0]); args = args.subspan(1); } } if (!ctx.arg.chroot.empty()) { if (!ctx.arg.Map.empty()) ctx.arg.Map = ctx.arg.chroot + "/" + ctx.arg.Map; if (!ctx.arg.dependency_file.empty()) ctx.arg.dependency_file = ctx.arg.chroot + "/" + ctx.arg.dependency_file; } if (!ctx.arg.directory.empty()) if (chdir(ctx.arg.directory.c_str()) == -1) Fatal(ctx) << "chdir failed: " << ctx.arg.directory << ": " << errno_string(); if (!ctx.arg.sysroot.empty()) { for (std::string &path : ctx.arg.library_paths) { if (std::string_view(path).starts_with('=')) path = ctx.arg.sysroot + path.substr(1); else if (std::string_view(path).starts_with("$SYSROOT")) path = ctx.arg.sysroot + path.substr(8); } } // Clean library paths by removing redundant `/..` and `/.` // so that they are easier to read in log messages. 
for (std::string &path : ctx.arg.library_paths) path = path_clean(path); if (ctx.arg.shared) ctx.arg.pic = true; if (ctx.arg.pic) ctx.arg.image_base = 0; if (allow_shlib_undefined) ctx.arg.allow_shlib_undefined = *allow_shlib_undefined; else ctx.arg.allow_shlib_undefined = ctx.arg.shared; if (!report_undefined) report_undefined = !ctx.arg.shared; if (*report_undefined) { if (error_unresolved_symbols) ctx.arg.unresolved_symbols = UNRESOLVED_ERROR; else ctx.arg.unresolved_symbols = UNRESOLVED_WARN; } else { ctx.arg.unresolved_symbols = UNRESOLVED_IGNORE; } if (ctx.arg.retain_symbols_file) { ctx.arg.strip_all = false; ctx.arg.discard_all = false; } if (ctx.arg.shuffle_sections == SHUFFLE_SECTIONS_SHUFFLE) { if (shuffle_sections_seed) ctx.arg.shuffle_sections_seed = *shuffle_sections_seed; else ctx.arg.shuffle_sections_seed = ((u64)std::random_device()() << 32) | std::random_device()(); } // --section-order implies `-z separate-loadable-segments` if (z_separate_code) ctx.arg.z_separate_code = *z_separate_code; else if (!ctx.arg.section_order.empty()) ctx.arg.z_separate_code = SEPARATE_LOADABLE_SEGMENTS; // `-z dynamic-undefined-weak` is enabled by default for DSOs. if (z_dynamic_undefined_weak) ctx.arg.z_dynamic_undefined_weak = *z_dynamic_undefined_weak; else ctx.arg.z_dynamic_undefined_weak = ctx.arg.shared; // --section-order implies `-z norelro` if (z_relro) ctx.arg.z_relro = *z_relro; else if (!ctx.arg.section_order.empty()) ctx.arg.z_relro = false; if (ctx.arg.nmagic) ctx.arg.z_relro = false; if (!ctx.arg.shared) { if (!ctx.arg.filter.empty()) Fatal(ctx) << "-filter may not be used without -shared"; if (!ctx.arg.auxiliary.empty()) Fatal(ctx) << "-auxiliary may not be used without -shared"; } // Even though SH4 is RELA, addends in its relocation records are always // zero, and actual addends are written to relocated places. So we need // to handle it as an exception. 
if constexpr (!E::is_rela || is_sh4) if (!ctx.arg.apply_dynamic_relocs) Fatal(ctx) << "--no-apply-dynamic-relocs may not be used on " << E::name; if constexpr (is_sparc) if (ctx.arg.apply_dynamic_relocs) Fatal(ctx) << "--apply-dynamic-relocs may not be used on SPARC64"; if (!ctx.arg.section_start.empty() && !ctx.arg.section_order.empty()) Fatal(ctx) << "--section-start may not be used with --section-order"; if (ctx.arg.image_base % ctx.page_size) Fatal(ctx) << "-image-base must be a multiple of -max-page-size"; if (ctx.arg.emulation == ARM32BE::name && !ctx.arg.be8) Fatal(ctx) << "--be32 is not supported"; if (char *env = getenv("MOLD_REPRO"); env && env[0]) ctx.arg.repro = true; if (ctx.arg.default_symver) { std::string ver = ctx.arg.soname; if (ver.empty()) ver = path_filename(ctx.arg.output); ctx.arg.version_definitions.push_back(ver); } for (std::string_view path : version_scripts) { auto open = [&] { if (MappedFile *mf = open_file(ctx, std::string(path))) return mf; for (std::string_view dir : ctx.arg.library_paths) if (MappedFile *mf = open_file(ctx, std::string(dir) + "/" + std::string(path))) return mf; Fatal(ctx) << "--version-script: file not found: " << path; }; ReaderContext rctx; Script(ctx, rctx, open()).parse_version_script(); } if (separate_debug_file) { if (separate_debug_file->empty()) ctx.arg.separate_debug_file = ctx.arg.output + ".dbg"; else ctx.arg.separate_debug_file = *separate_debug_file; } if (ctx.arg.shared && warn_shared_textrel) ctx.arg.warn_textrel = true; // We don't want the background process to write to stdout if (ctx.arg.stats || ctx.arg.perf) ctx.arg.detach = false; ctx.arg.undefined.push_back(ctx.arg.entry); for (i64 i = 0; i < ctx.arg.defsyms.size(); i++) { std::variant *, u64> &val = ctx.arg.defsyms[i].second; if (Symbol **sym = std::get_if *>(&val)) ctx.arg.undefined.push_back(*sym); } // --oformat=binary implies --strip-all because without a section // header, there's no way to identify the locations of a symbol // table in 
// an output file in the first place.
  if (ctx.arg.oformat_binary)
    ctx.arg.strip_all = true;

  // By default, mold tries to overwrite an output file if it exists
  // because at least on Linux, writing to an existing file is much
  // faster than creating a fresh file and writing to it.
  //
  // However, if an existing file is in use, writing to it will mess
  // up processes that are executing that file. Linux prevents a write
  // to a running executable file; it returns ETXTBSY on open(2).
  // However, that mechanism doesn't protect .so files. Therefore, we
  // want to disable this optimization if we are creating a shared
  // object file.
  ctx.overwrite_output_file = (!ctx.arg.shared && returns_etxtbsy());

  // Mark GC root symbols
  for (Symbol *sym : ctx.arg.undefined)
    sym->gc_root = true;

  for (Symbol *sym : ctx.arg.require_defined)
    sym->gc_root = true;

  ctx.arg.entry->gc_root = true;

  // Exit successfully when version_shown is set and nothing is left in
  // `remaining`; otherwise return the remaining arguments to the caller.
  // (version_shown is assigned outside this excerpt.)
  if (version_shown && remaining.empty())
    exit(0);
  return remaining;
}

// Explicit instantiations for the target selected by MOLD_TARGET.
// NOTE(review): the template argument lists are not visible in this
// text (they look stripped by extraction); presumably both are
// parameterized on E -- confirm against the original file.
using E = MOLD_TARGET;

template std::vector expand_response_files(Context &, char **);

template std::vector parse_nonpositional_args(Context &ctx);

} // namespace mold

================================================ FILE: src/elf.cc ================================================
#include "elf.h"
// NOTE(review): the header name is missing after the #include below.
// unknown_type() uses std::stringstream, which needs <sstream> --
// confirm against the original file.
#include

namespace mold {

// Returns a placeholder name of the form "unknown (0x<hex>)" for a
// relocation type number. The rel_to_string tables below call this
// when r_type matches none of their cases.
static std::string unknown_type(u32 r_type) {
  std::stringstream ss;
  // std::hex makes r_type print in hexadecimal after the "0x" prefix.
  ss << "unknown (0x" << std::hex << r_type << ")";
  return ss.str();
}

// Expands to a case label that returns the enumerator's own spelling:
// the preprocessor's # operator turns `x` into a string literal. The
// terminating semicolon is supplied at each use site.
#define CASE(x) case x: return #x

// Returns the symbolic name of an R_X86_64_* relocation type, or the
// unknown_type() placeholder if r_type matches no case.
// NOTE(review): the explicit template arguments of this and the
// following specializations are not visible in this text; which target
// type each one is specialized for must be confirmed against the
// original file.
template <>
std::string rel_to_string(u32 r_type) {
  switch (r_type) {
  CASE(R_X86_64_NONE);
  CASE(R_X86_64_64);
  CASE(R_X86_64_PC32);
  CASE(R_X86_64_GOT32);
  CASE(R_X86_64_PLT32);
  CASE(R_X86_64_COPY);
  CASE(R_X86_64_GLOB_DAT);
  CASE(R_X86_64_JUMP_SLOT);
  CASE(R_X86_64_RELATIVE);
  CASE(R_X86_64_GOTPCREL);
  CASE(R_X86_64_32);
  CASE(R_X86_64_32S);
  CASE(R_X86_64_16);
  CASE(R_X86_64_PC16);
  CASE(R_X86_64_8);
  CASE(R_X86_64_PC8);
  CASE(R_X86_64_DTPMOD64);
  CASE(R_X86_64_DTPOFF64);
  CASE(R_X86_64_TPOFF64);
  CASE(R_X86_64_TLSGD);
  CASE(R_X86_64_TLSLD);
  CASE(R_X86_64_DTPOFF32);
  CASE(R_X86_64_GOTTPOFF);
  CASE(R_X86_64_TPOFF32);
  CASE(R_X86_64_PC64);
  CASE(R_X86_64_GOTOFF64);
  CASE(R_X86_64_GOTPC32);
  CASE(R_X86_64_GOT64);
  CASE(R_X86_64_GOTPCREL64);
  CASE(R_X86_64_GOTPC64);
  CASE(R_X86_64_GOTPLT64);
  CASE(R_X86_64_PLTOFF64);
  CASE(R_X86_64_SIZE32);
  CASE(R_X86_64_SIZE64);
  CASE(R_X86_64_GOTPC32_TLSDESC);
  CASE(R_X86_64_TLSDESC_CALL);
  CASE(R_X86_64_TLSDESC);
  CASE(R_X86_64_IRELATIVE);
  CASE(R_X86_64_GOTPCRELX);
  CASE(R_X86_64_REX_GOTPCRELX);
  CASE(R_X86_64_CODE_4_GOTPCRELX);
  CASE(R_X86_64_CODE_4_GOTTPOFF);
  CASE(R_X86_64_CODE_4_GOTPC32_TLSDESC);
  CASE(R_X86_64_CODE_5_GOTPCRELX);
  CASE(R_X86_64_CODE_5_GOTTPOFF);
  CASE(R_X86_64_CODE_5_GOTPC32_TLSDESC);
  CASE(R_X86_64_CODE_6_GOTPCRELX);
  CASE(R_X86_64_CODE_6_GOTTPOFF);
  CASE(R_X86_64_CODE_6_GOTPC32_TLSDESC);
  }
  return unknown_type(r_type);
}

// Returns the symbolic name of an R_386_* relocation type, or the
// unknown_type() placeholder if r_type matches no case.
template <>
std::string rel_to_string(u32 r_type) {
  switch (r_type) {
  CASE(R_386_NONE);
  CASE(R_386_32);
  CASE(R_386_PC32);
  CASE(R_386_GOT32);
  CASE(R_386_PLT32);
  CASE(R_386_COPY);
  CASE(R_386_GLOB_DAT);
  CASE(R_386_JUMP_SLOT);
  CASE(R_386_RELATIVE);
  CASE(R_386_GOTOFF);
  CASE(R_386_GOTPC);
  CASE(R_386_32PLT);
  CASE(R_386_TLS_TPOFF);
  CASE(R_386_TLS_IE);
  CASE(R_386_TLS_GOTIE);
  CASE(R_386_TLS_LE);
  CASE(R_386_TLS_GD);
  CASE(R_386_TLS_LDM);
  CASE(R_386_16);
  CASE(R_386_PC16);
  CASE(R_386_8);
  CASE(R_386_PC8);
  CASE(R_386_TLS_GD_32);
  CASE(R_386_TLS_GD_PUSH);
  CASE(R_386_TLS_GD_CALL);
  CASE(R_386_TLS_GD_POP);
  CASE(R_386_TLS_LDM_32);
  CASE(R_386_TLS_LDM_PUSH);
  CASE(R_386_TLS_LDM_CALL);
  CASE(R_386_TLS_LDM_POP);
  CASE(R_386_TLS_LDO_32);
  CASE(R_386_TLS_IE_32);
  CASE(R_386_TLS_LE_32);
  CASE(R_386_TLS_DTPMOD32);
  CASE(R_386_TLS_DTPOFF32);
  CASE(R_386_TLS_TPOFF32);
  CASE(R_386_SIZE32);
  CASE(R_386_TLS_GOTDESC);
  CASE(R_386_TLS_DESC_CALL);
  CASE(R_386_TLS_DESC);
  CASE(R_386_IRELATIVE);
  CASE(R_386_GOT32X);
  }
  return unknown_type(r_type);
}

// Returns the symbolic name of an R_AARCH64_* relocation type, or the
// unknown_type() placeholder if r_type matches no case.
template <>
std::string rel_to_string(u32 r_type) {
  switch (r_type) {
  CASE(R_AARCH64_NONE);
  CASE(R_AARCH64_ABS64);
  CASE(R_AARCH64_ABS32);
  CASE(R_AARCH64_ABS16);
  CASE(R_AARCH64_PREL64);
  CASE(R_AARCH64_PREL32);
  CASE(R_AARCH64_PREL16);
  CASE(R_AARCH64_MOVW_UABS_G0);
  CASE(R_AARCH64_MOVW_UABS_G0_NC);
  CASE(R_AARCH64_MOVW_UABS_G1);
  CASE(R_AARCH64_MOVW_UABS_G1_NC);
  CASE(R_AARCH64_MOVW_UABS_G2);
  CASE(R_AARCH64_MOVW_UABS_G2_NC);
  CASE(R_AARCH64_MOVW_UABS_G3);
  CASE(R_AARCH64_MOVW_SABS_G0);
  CASE(R_AARCH64_MOVW_SABS_G1);
  CASE(R_AARCH64_MOVW_SABS_G2);
  CASE(R_AARCH64_LD_PREL_LO19);
  CASE(R_AARCH64_ADR_PREL_LO21);
  CASE(R_AARCH64_ADR_PREL_PG_HI21);
  CASE(R_AARCH64_ADR_PREL_PG_HI21_NC);
  CASE(R_AARCH64_ADD_ABS_LO12_NC);
  CASE(R_AARCH64_LDST8_ABS_LO12_NC);
  CASE(R_AARCH64_TSTBR14);
  CASE(R_AARCH64_CONDBR19);
  CASE(R_AARCH64_JUMP26);
  CASE(R_AARCH64_CALL26);
  CASE(R_AARCH64_LDST16_ABS_LO12_NC);
  CASE(R_AARCH64_LDST32_ABS_LO12_NC);
  CASE(R_AARCH64_LDST64_ABS_LO12_NC);
  CASE(R_AARCH64_MOVW_PREL_G0);
  CASE(R_AARCH64_MOVW_PREL_G0_NC);
  CASE(R_AARCH64_MOVW_PREL_G1);
  CASE(R_AARCH64_MOVW_PREL_G1_NC);
  CASE(R_AARCH64_MOVW_PREL_G2);
  CASE(R_AARCH64_MOVW_PREL_G2_NC);
  CASE(R_AARCH64_MOVW_PREL_G3);
  CASE(R_AARCH64_LDST128_ABS_LO12_NC);
  CASE(R_AARCH64_ADR_GOT_PAGE);
  CASE(R_AARCH64_LD64_GOT_LO12_NC);
  CASE(R_AARCH64_LD64_GOTPAGE_LO15);
  CASE(R_AARCH64_PLT32);
  CASE(R_AARCH64_TLSGD_ADR_PREL21);
  CASE(R_AARCH64_TLSGD_ADR_PAGE21);
  CASE(R_AARCH64_TLSGD_ADD_LO12_NC);
  CASE(R_AARCH64_TLSGD_MOVW_G1);
  CASE(R_AARCH64_TLSGD_MOVW_G0_NC);
  CASE(R_AARCH64_TLSLD_ADR_PREL21);
  CASE(R_AARCH64_TLSLD_ADR_PAGE21);
  CASE(R_AARCH64_TLSLD_ADD_LO12_NC);
  CASE(R_AARCH64_TLSLD_MOVW_G1);
  CASE(R_AARCH64_TLSLD_MOVW_G0_NC);
  CASE(R_AARCH64_TLSLD_LD_PREL19);
  CASE(R_AARCH64_TLSLD_MOVW_DTPREL_G2);
  CASE(R_AARCH64_TLSLD_MOVW_DTPREL_G1);
  CASE(R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC);
  CASE(R_AARCH64_TLSLD_MOVW_DTPREL_G0);
  CASE(R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC);
  CASE(R_AARCH64_TLSLD_ADD_DTPREL_HI12);
  CASE(R_AARCH64_TLSLD_ADD_DTPREL_LO12);
  CASE(R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC);
  CASE(R_AARCH64_TLSLD_LDST8_DTPREL_LO12);
  CASE(R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC);
  CASE(R_AARCH64_TLSLD_LDST16_DTPREL_LO12);
  CASE(R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC);
  CASE(R_AARCH64_TLSLD_LDST32_DTPREL_LO12);
  CASE(R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC);
  CASE(R_AARCH64_TLSLD_LDST64_DTPREL_LO12);
  CASE(R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC);
  CASE(R_AARCH64_TLSIE_MOVW_GOTTPREL_G1);
  CASE(R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC);
  CASE(R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21);
  CASE(R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC);
  CASE(R_AARCH64_TLSIE_LD_GOTTPREL_PREL19);
  CASE(R_AARCH64_TLSLE_MOVW_TPREL_G2);
  CASE(R_AARCH64_TLSLE_MOVW_TPREL_G1);
  CASE(R_AARCH64_TLSLE_MOVW_TPREL_G1_NC);
  CASE(R_AARCH64_TLSLE_MOVW_TPREL_G0);
  CASE(R_AARCH64_TLSLE_MOVW_TPREL_G0_NC);
  CASE(R_AARCH64_TLSLE_ADD_TPREL_HI12);
  CASE(R_AARCH64_TLSLE_ADD_TPREL_LO12);
  CASE(R_AARCH64_TLSLE_ADD_TPREL_LO12_NC);
  CASE(R_AARCH64_TLSLE_LDST8_TPREL_LO12);
  CASE(R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC);
  CASE(R_AARCH64_TLSLE_LDST16_TPREL_LO12);
  CASE(R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC);
  CASE(R_AARCH64_TLSLE_LDST32_TPREL_LO12);
  CASE(R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC);
  CASE(R_AARCH64_TLSLE_LDST64_TPREL_LO12);
  CASE(R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC);
  CASE(R_AARCH64_TLSDESC_ADR_PAGE21);
  CASE(R_AARCH64_TLSDESC_LD64_LO12);
  CASE(R_AARCH64_TLSDESC_ADD_LO12);
  CASE(R_AARCH64_TLSDESC_CALL);
  CASE(R_AARCH64_TLSLE_LDST128_TPREL_LO12_NC);
  CASE(R_AARCH64_COPY);
  CASE(R_AARCH64_GLOB_DAT);
  CASE(R_AARCH64_JUMP_SLOT);
  CASE(R_AARCH64_RELATIVE);
  CASE(R_AARCH64_TLS_DTPMOD64);
  CASE(R_AARCH64_TLS_DTPREL64);
  CASE(R_AARCH64_TLS_TPREL64);
  CASE(R_AARCH64_TLSDESC);
  CASE(R_AARCH64_IRELATIVE);
  }
  return unknown_type(r_type);
}

// Forwards to another rel_to_string specialization instead of carrying
// its own table.
// NOTE(review): with the template arguments missing from this text the
// body reads as a self-call; presumably it delegates to a different
// target's specialization so that two targets share one table --
// confirm against the original file.
template <>
std::string rel_to_string(u32 r_type) {
  return rel_to_string(r_type);
}

// Name table for the R_ARM_* relocation types; the switch continues
// beyond this span.
template <>
std::string rel_to_string(u32 r_type) {
  switch (r_type) {
  CASE(R_ARM_NONE);
  CASE(R_ARM_PC24);
  CASE(R_ARM_ABS32);
  CASE(R_ARM_REL32);
  CASE(R_ARM_LDR_PC_G0);
  CASE(R_ARM_ABS16);
  CASE(R_ARM_ABS12);
  CASE(R_ARM_THM_ABS5);
  CASE(R_ARM_ABS8);
  CASE(R_ARM_SBREL32);
  CASE(R_ARM_THM_CALL);
  CASE(R_ARM_THM_PC8);
  CASE(R_ARM_BREL_ADJ);
  CASE(R_ARM_TLS_DESC);
  CASE(R_ARM_THM_SWI8);
  CASE(R_ARM_XPC25);
  CASE(R_ARM_THM_XPC22);
CASE(R_ARM_TLS_DTPMOD32); CASE(R_ARM_TLS_DTPOFF32); CASE(R_ARM_TLS_TPOFF32); CASE(R_ARM_COPY); CASE(R_ARM_GLOB_DAT); CASE(R_ARM_JUMP_SLOT); CASE(R_ARM_RELATIVE); CASE(R_ARM_GOTOFF32); CASE(R_ARM_BASE_PREL); CASE(R_ARM_GOT_BREL); CASE(R_ARM_PLT32); CASE(R_ARM_CALL); CASE(R_ARM_JUMP24); CASE(R_ARM_THM_JUMP24); CASE(R_ARM_BASE_ABS); CASE(R_ARM_ALU_PCREL_7_0); CASE(R_ARM_ALU_PCREL_15_8); CASE(R_ARM_ALU_PCREL_23_15); CASE(R_ARM_LDR_SBREL_11_0_NC); CASE(R_ARM_ALU_SBREL_19_12_NC); CASE(R_ARM_ALU_SBREL_27_20_CK); CASE(R_ARM_TARGET1); CASE(R_ARM_SBREL31); CASE(R_ARM_V4BX); CASE(R_ARM_TARGET2); CASE(R_ARM_PREL31); CASE(R_ARM_MOVW_ABS_NC); CASE(R_ARM_MOVT_ABS); CASE(R_ARM_MOVW_PREL_NC); CASE(R_ARM_MOVT_PREL); CASE(R_ARM_THM_MOVW_ABS_NC); CASE(R_ARM_THM_MOVT_ABS); CASE(R_ARM_THM_MOVW_PREL_NC); CASE(R_ARM_THM_MOVT_PREL); CASE(R_ARM_THM_JUMP19); CASE(R_ARM_THM_JUMP6); CASE(R_ARM_THM_ALU_PREL_11_0); CASE(R_ARM_THM_PC12); CASE(R_ARM_ABS32_NOI); CASE(R_ARM_REL32_NOI); CASE(R_ARM_ALU_PC_G0_NC); CASE(R_ARM_ALU_PC_G0); CASE(R_ARM_ALU_PC_G1_NC); CASE(R_ARM_ALU_PC_G1); CASE(R_ARM_ALU_PC_G2); CASE(R_ARM_LDR_PC_G1); CASE(R_ARM_LDR_PC_G2); CASE(R_ARM_LDRS_PC_G0); CASE(R_ARM_LDRS_PC_G1); CASE(R_ARM_LDRS_PC_G2); CASE(R_ARM_LDC_PC_G0); CASE(R_ARM_LDC_PC_G1); CASE(R_ARM_LDC_PC_G2); CASE(R_ARM_ALU_SB_G0_NC); CASE(R_ARM_ALU_SB_G0); CASE(R_ARM_ALU_SB_G1_NC); CASE(R_ARM_ALU_SB_G1); CASE(R_ARM_ALU_SB_G2); CASE(R_ARM_LDR_SB_G0); CASE(R_ARM_LDR_SB_G1); CASE(R_ARM_LDR_SB_G2); CASE(R_ARM_LDRS_SB_G0); CASE(R_ARM_LDRS_SB_G1); CASE(R_ARM_LDRS_SB_G2); CASE(R_ARM_LDC_SB_G0); CASE(R_ARM_LDC_SB_G1); CASE(R_ARM_LDC_SB_G2); CASE(R_ARM_MOVW_BREL_NC); CASE(R_ARM_MOVT_BREL); CASE(R_ARM_MOVW_BREL); CASE(R_ARM_THM_MOVW_BREL_NC); CASE(R_ARM_THM_MOVT_BREL); CASE(R_ARM_THM_MOVW_BREL); CASE(R_ARM_TLS_GOTDESC); CASE(R_ARM_TLS_CALL); CASE(R_ARM_TLS_DESCSEQ); CASE(R_ARM_THM_TLS_CALL); CASE(R_ARM_PLT32_ABS); CASE(R_ARM_GOT_ABS); CASE(R_ARM_GOT_PREL); CASE(R_ARM_GOT_BREL12); CASE(R_ARM_GOTOFF12); CASE(R_ARM_GOTRELAX); 
CASE(R_ARM_GNU_VTENTRY); CASE(R_ARM_GNU_VTINHERIT); CASE(R_ARM_THM_JUMP11); CASE(R_ARM_THM_JUMP8); CASE(R_ARM_TLS_GD32); CASE(R_ARM_TLS_LDM32); CASE(R_ARM_TLS_LDO32); CASE(R_ARM_TLS_IE32); CASE(R_ARM_TLS_LE32); CASE(R_ARM_TLS_LDO12); CASE(R_ARM_TLS_LE12); CASE(R_ARM_TLS_IE12GP); CASE(R_ARM_PRIVATE_0); CASE(R_ARM_PRIVATE_1); CASE(R_ARM_PRIVATE_2); CASE(R_ARM_PRIVATE_3); CASE(R_ARM_PRIVATE_4); CASE(R_ARM_PRIVATE_5); CASE(R_ARM_PRIVATE_6); CASE(R_ARM_PRIVATE_7); CASE(R_ARM_PRIVATE_8); CASE(R_ARM_PRIVATE_9); CASE(R_ARM_PRIVATE_10); CASE(R_ARM_PRIVATE_11); CASE(R_ARM_PRIVATE_12); CASE(R_ARM_PRIVATE_13); CASE(R_ARM_PRIVATE_14); CASE(R_ARM_PRIVATE_15); CASE(R_ARM_ME_TOO); CASE(R_ARM_THM_TLS_DESCSEQ16); CASE(R_ARM_THM_TLS_DESCSEQ32); CASE(R_ARM_THM_BF16); CASE(R_ARM_THM_BF12); CASE(R_ARM_THM_BF18); CASE(R_ARM_IRELATIVE); } return unknown_type(r_type); }

template <> std::string rel_to_string(u32 r_type) { return rel_to_string(r_type); }

// Returns the symbolic name of a RISC-V relocation type.
//
// `r_type` is the numeric relocation type. Each R_RISCV_* constant listed
// in the switch is handled by its own CASE entry; any value that is not
// listed is handed to unknown_type() to produce the result.
template <>
std::string rel_to_string(u32 r_type) {
  switch (r_type) {
  CASE(R_RISCV_NONE);
  CASE(R_RISCV_32);
  CASE(R_RISCV_64);
  CASE(R_RISCV_RELATIVE);
  CASE(R_RISCV_COPY);
  CASE(R_RISCV_JUMP_SLOT);
  CASE(R_RISCV_TLS_DTPMOD32);
  CASE(R_RISCV_TLS_DTPMOD64);
  CASE(R_RISCV_TLS_DTPREL32);
  CASE(R_RISCV_TLS_DTPREL64);
  CASE(R_RISCV_TLS_TPREL32);
  CASE(R_RISCV_TLS_TPREL64);
  // R_RISCV_TLSDESC (12) is part of the R_RISCV_* enum, between the
  // TLS_TPREL64 and BRANCH values, so it needs an entry here as well;
  // without one it would be reported through unknown_type().
  CASE(R_RISCV_TLSDESC);
  CASE(R_RISCV_BRANCH);
  CASE(R_RISCV_JAL);
  CASE(R_RISCV_CALL);
  CASE(R_RISCV_CALL_PLT);
  CASE(R_RISCV_GOT_HI20);
  CASE(R_RISCV_TLS_GOT_HI20);
  CASE(R_RISCV_TLS_GD_HI20);
  CASE(R_RISCV_PCREL_HI20);
  CASE(R_RISCV_PCREL_LO12_I);
  CASE(R_RISCV_PCREL_LO12_S);
  CASE(R_RISCV_HI20);
  CASE(R_RISCV_LO12_I);
  CASE(R_RISCV_LO12_S);
  CASE(R_RISCV_TPREL_HI20);
  CASE(R_RISCV_TPREL_LO12_I);
  CASE(R_RISCV_TPREL_LO12_S);
  CASE(R_RISCV_TPREL_ADD);
  CASE(R_RISCV_ADD8);
  CASE(R_RISCV_ADD16);
  CASE(R_RISCV_ADD32);
  CASE(R_RISCV_ADD64);
  CASE(R_RISCV_SUB8);
  CASE(R_RISCV_SUB16);
  CASE(R_RISCV_SUB32);
  CASE(R_RISCV_SUB64);
  CASE(R_RISCV_ALIGN);
  CASE(R_RISCV_RVC_BRANCH);
  CASE(R_RISCV_RVC_JUMP);
  CASE(R_RISCV_RELAX);
  CASE(R_RISCV_SUB6);
  CASE(R_RISCV_SET6);
  CASE(R_RISCV_SET8);
  CASE(R_RISCV_SET16);
  CASE(R_RISCV_SET32);
  CASE(R_RISCV_32_PCREL);
  CASE(R_RISCV_IRELATIVE);
  CASE(R_RISCV_PLT32);
  CASE(R_RISCV_SET_ULEB128);
  CASE(R_RISCV_SUB_ULEB128);
  CASE(R_RISCV_TLSDESC_HI20);
  CASE(R_RISCV_TLSDESC_LOAD_LO12);
  CASE(R_RISCV_TLSDESC_ADD_LO12);
  CASE(R_RISCV_TLSDESC_CALL);
  }
  return unknown_type(r_type);
}

template <> std::string rel_to_string(u32 r_type) { return rel_to_string(r_type); }

template <> std::string rel_to_string(u32 r_type) { return rel_to_string(r_type); }

template <> std::string rel_to_string(u32 r_type) { return rel_to_string(r_type); }

template <> std::string rel_to_string(u32 r_type) { switch (r_type) { CASE(R_PPC_NONE); CASE(R_PPC_ADDR32); CASE(R_PPC_ADDR24); CASE(R_PPC_ADDR16); CASE(R_PPC_ADDR16_LO); CASE(R_PPC_ADDR16_HI); CASE(R_PPC_ADDR16_HA); CASE(R_PPC_ADDR14); CASE(R_PPC_ADDR14_BRTAKEN); CASE(R_PPC_ADDR14_BRNTAKEN); CASE(R_PPC_REL24); CASE(R_PPC_REL14); CASE(R_PPC_REL14_BRTAKEN); CASE(R_PPC_REL14_BRNTAKEN); CASE(R_PPC_GOT16); CASE(R_PPC_GOT16_LO); CASE(R_PPC_GOT16_HI); CASE(R_PPC_GOT16_HA); CASE(R_PPC_PLTREL24); CASE(R_PPC_COPY); CASE(R_PPC_GLOB_DAT); CASE(R_PPC_JMP_SLOT); CASE(R_PPC_RELATIVE); CASE(R_PPC_LOCAL24PC); CASE(R_PPC_UADDR32); CASE(R_PPC_UADDR16); CASE(R_PPC_REL32); CASE(R_PPC_PLT32); CASE(R_PPC_PLTREL32); CASE(R_PPC_PLT16_LO); CASE(R_PPC_PLT16_HI); CASE(R_PPC_PLT16_HA); CASE(R_PPC_SDAREL16); CASE(R_PPC_SECTOFF); CASE(R_PPC_SECTOFF_LO); CASE(R_PPC_SECTOFF_HI); CASE(R_PPC_SECTOFF_HA); CASE(R_PPC_ADDR30); CASE(R_PPC_TLS); CASE(R_PPC_DTPMOD32); CASE(R_PPC_TPREL16); CASE(R_PPC_TPREL16_LO); CASE(R_PPC_TPREL16_HI); CASE(R_PPC_TPREL16_HA); CASE(R_PPC_TPREL32); CASE(R_PPC_DTPREL16); CASE(R_PPC_DTPREL16_LO); CASE(R_PPC_DTPREL16_HI); CASE(R_PPC_DTPREL16_HA); CASE(R_PPC_DTPREL32); CASE(R_PPC_GOT_TLSGD16); CASE(R_PPC_GOT_TLSGD16_LO); CASE(R_PPC_GOT_TLSGD16_HI); CASE(R_PPC_GOT_TLSGD16_HA); CASE(R_PPC_GOT_TLSLD16); CASE(R_PPC_GOT_TLSLD16_LO); CASE(R_PPC_GOT_TLSLD16_HI); 
CASE(R_PPC_GOT_TLSLD16_HA); CASE(R_PPC_GOT_TPREL16); CASE(R_PPC_GOT_TPREL16_LO); CASE(R_PPC_GOT_TPREL16_HI); CASE(R_PPC_GOT_TPREL16_HA); CASE(R_PPC_GOT_DTPREL16); CASE(R_PPC_GOT_DTPREL16_LO); CASE(R_PPC_GOT_DTPREL16_HI); CASE(R_PPC_GOT_DTPREL16_HA); CASE(R_PPC_TLSGD); CASE(R_PPC_TLSLD); CASE(R_PPC_PLTSEQ); CASE(R_PPC_PLTCALL); CASE(R_PPC_IRELATIVE); CASE(R_PPC_REL16); CASE(R_PPC_REL16_LO); CASE(R_PPC_REL16_HI); CASE(R_PPC_REL16_HA); } return unknown_type(r_type); } template <> std::string rel_to_string(u32 r_type) { switch (r_type) { CASE(R_PPC64_NONE); CASE(R_PPC64_ADDR32); CASE(R_PPC64_ADDR24); CASE(R_PPC64_ADDR16); CASE(R_PPC64_ADDR16_LO); CASE(R_PPC64_ADDR16_HI); CASE(R_PPC64_ADDR16_HA); CASE(R_PPC64_ADDR14); CASE(R_PPC64_ADDR14_BRTAKEN); CASE(R_PPC64_ADDR14_BRNTAKEN); CASE(R_PPC64_REL24); CASE(R_PPC64_REL14); CASE(R_PPC64_REL14_BRTAKEN); CASE(R_PPC64_REL14_BRNTAKEN); CASE(R_PPC64_GOT16); CASE(R_PPC64_GOT16_LO); CASE(R_PPC64_GOT16_HI); CASE(R_PPC64_GOT16_HA); CASE(R_PPC64_COPY); CASE(R_PPC64_GLOB_DAT); CASE(R_PPC64_JMP_SLOT); CASE(R_PPC64_RELATIVE); CASE(R_PPC64_REL32); CASE(R_PPC64_PLT16_LO); CASE(R_PPC64_PLT16_HI); CASE(R_PPC64_PLT16_HA); CASE(R_PPC64_ADDR64); CASE(R_PPC64_ADDR16_HIGHER); CASE(R_PPC64_ADDR16_HIGHERA); CASE(R_PPC64_ADDR16_HIGHEST); CASE(R_PPC64_ADDR16_HIGHESTA); CASE(R_PPC64_REL64); CASE(R_PPC64_TOC16); CASE(R_PPC64_TOC16_LO); CASE(R_PPC64_TOC16_HI); CASE(R_PPC64_TOC16_HA); CASE(R_PPC64_TOC); CASE(R_PPC64_ADDR16_DS); CASE(R_PPC64_ADDR16_LO_DS); CASE(R_PPC64_GOT16_DS); CASE(R_PPC64_GOT16_LO_DS); CASE(R_PPC64_PLT16_LO_DS); CASE(R_PPC64_TOC16_DS); CASE(R_PPC64_TOC16_LO_DS); CASE(R_PPC64_TLS); CASE(R_PPC64_DTPMOD64); CASE(R_PPC64_TPREL16); CASE(R_PPC64_TPREL16_LO); CASE(R_PPC64_TPREL16_HI); CASE(R_PPC64_TPREL16_HA); CASE(R_PPC64_TPREL64); CASE(R_PPC64_DTPREL16); CASE(R_PPC64_DTPREL16_LO); CASE(R_PPC64_DTPREL16_HI); CASE(R_PPC64_DTPREL16_HA); CASE(R_PPC64_DTPREL64); CASE(R_PPC64_GOT_TLSGD16); CASE(R_PPC64_GOT_TLSGD16_LO); 
CASE(R_PPC64_GOT_TLSGD16_HI); CASE(R_PPC64_GOT_TLSGD16_HA); CASE(R_PPC64_GOT_TLSLD16); CASE(R_PPC64_GOT_TLSLD16_LO); CASE(R_PPC64_GOT_TLSLD16_HI); CASE(R_PPC64_GOT_TLSLD16_HA); CASE(R_PPC64_GOT_TPREL16_DS); CASE(R_PPC64_GOT_TPREL16_LO_DS); CASE(R_PPC64_GOT_TPREL16_HI); CASE(R_PPC64_GOT_TPREL16_HA); CASE(R_PPC64_GOT_DTPREL16_DS); CASE(R_PPC64_GOT_DTPREL16_LO_DS); CASE(R_PPC64_GOT_DTPREL16_HI); CASE(R_PPC64_GOT_DTPREL16_HA); CASE(R_PPC64_TPREL16_DS); CASE(R_PPC64_TPREL16_LO_DS); CASE(R_PPC64_TPREL16_HIGHER); CASE(R_PPC64_TPREL16_HIGHERA); CASE(R_PPC64_TPREL16_HIGHEST); CASE(R_PPC64_TPREL16_HIGHESTA); CASE(R_PPC64_DTPREL16_DS); CASE(R_PPC64_DTPREL16_LO_DS); CASE(R_PPC64_DTPREL16_HIGHER); CASE(R_PPC64_DTPREL16_HIGHERA); CASE(R_PPC64_DTPREL16_HIGHEST); CASE(R_PPC64_DTPREL16_HIGHESTA); CASE(R_PPC64_TLSGD); CASE(R_PPC64_TLSLD); CASE(R_PPC64_ADDR16_HIGH); CASE(R_PPC64_ADDR16_HIGHA); CASE(R_PPC64_TPREL16_HIGH); CASE(R_PPC64_TPREL16_HIGHA); CASE(R_PPC64_DTPREL16_HIGH); CASE(R_PPC64_DTPREL16_HIGHA); CASE(R_PPC64_REL24_NOTOC); CASE(R_PPC64_ENTRY); CASE(R_PPC64_PLTSEQ); CASE(R_PPC64_PLTCALL); CASE(R_PPC64_PLTSEQ_NOTOC); CASE(R_PPC64_PLTCALL_NOTOC); CASE(R_PPC64_PCREL_OPT); CASE(R_PPC64_PCREL34); CASE(R_PPC64_GOT_PCREL34); CASE(R_PPC64_PLT_PCREL34); CASE(R_PPC64_PLT_PCREL34_NOTOC); CASE(R_PPC64_TPREL34); CASE(R_PPC64_DTPREL34); CASE(R_PPC64_GOT_TLSGD_PCREL34); CASE(R_PPC64_GOT_TLSLD_PCREL34); CASE(R_PPC64_GOT_TPREL_PCREL34); CASE(R_PPC64_IRELATIVE); CASE(R_PPC64_REL16); CASE(R_PPC64_REL16_LO); CASE(R_PPC64_REL16_HI); CASE(R_PPC64_REL16_HA); } return unknown_type(r_type); } template <> std::string rel_to_string(u32 r_type) { return rel_to_string(r_type); } template <> std::string rel_to_string(u32 r_type) { switch (r_type) { CASE(R_SPARC_NONE); CASE(R_SPARC_8); CASE(R_SPARC_16); CASE(R_SPARC_32); CASE(R_SPARC_DISP8); CASE(R_SPARC_DISP16); CASE(R_SPARC_DISP32); CASE(R_SPARC_WDISP30); CASE(R_SPARC_WDISP22); CASE(R_SPARC_HI22); CASE(R_SPARC_22); CASE(R_SPARC_13); CASE(R_SPARC_LO10); 
CASE(R_SPARC_GOT10); CASE(R_SPARC_GOT13); CASE(R_SPARC_GOT22); CASE(R_SPARC_PC10); CASE(R_SPARC_PC22); CASE(R_SPARC_WPLT30); CASE(R_SPARC_COPY); CASE(R_SPARC_GLOB_DAT); CASE(R_SPARC_JMP_SLOT); CASE(R_SPARC_RELATIVE); CASE(R_SPARC_UA32); CASE(R_SPARC_PLT32); CASE(R_SPARC_HIPLT22); CASE(R_SPARC_LOPLT10); CASE(R_SPARC_PCPLT32); CASE(R_SPARC_PCPLT22); CASE(R_SPARC_PCPLT10); CASE(R_SPARC_10); CASE(R_SPARC_11); CASE(R_SPARC_64); CASE(R_SPARC_OLO10); CASE(R_SPARC_HH22); CASE(R_SPARC_HM10); CASE(R_SPARC_LM22); CASE(R_SPARC_PC_HH22); CASE(R_SPARC_PC_HM10); CASE(R_SPARC_PC_LM22); CASE(R_SPARC_WDISP16); CASE(R_SPARC_WDISP19); CASE(R_SPARC_7); CASE(R_SPARC_5); CASE(R_SPARC_6); CASE(R_SPARC_DISP64); CASE(R_SPARC_PLT64); CASE(R_SPARC_HIX22); CASE(R_SPARC_LOX10); CASE(R_SPARC_H44); CASE(R_SPARC_M44); CASE(R_SPARC_L44); CASE(R_SPARC_REGISTER); CASE(R_SPARC_UA64); CASE(R_SPARC_UA16); CASE(R_SPARC_TLS_GD_HI22); CASE(R_SPARC_TLS_GD_LO10); CASE(R_SPARC_TLS_GD_ADD); CASE(R_SPARC_TLS_GD_CALL); CASE(R_SPARC_TLS_LDM_HI22); CASE(R_SPARC_TLS_LDM_LO10); CASE(R_SPARC_TLS_LDM_ADD); CASE(R_SPARC_TLS_LDM_CALL); CASE(R_SPARC_TLS_LDO_HIX22); CASE(R_SPARC_TLS_LDO_LOX10); CASE(R_SPARC_TLS_LDO_ADD); CASE(R_SPARC_TLS_IE_HI22); CASE(R_SPARC_TLS_IE_LO10); CASE(R_SPARC_TLS_IE_LD); CASE(R_SPARC_TLS_IE_LDX); CASE(R_SPARC_TLS_IE_ADD); CASE(R_SPARC_TLS_LE_HIX22); CASE(R_SPARC_TLS_LE_LOX10); CASE(R_SPARC_TLS_DTPMOD32); CASE(R_SPARC_TLS_DTPMOD64); CASE(R_SPARC_TLS_DTPOFF32); CASE(R_SPARC_TLS_DTPOFF64); CASE(R_SPARC_TLS_TPOFF32); CASE(R_SPARC_TLS_TPOFF64); CASE(R_SPARC_GOTDATA_HIX22); CASE(R_SPARC_GOTDATA_LOX10); CASE(R_SPARC_GOTDATA_OP_HIX22); CASE(R_SPARC_GOTDATA_OP_LOX10); CASE(R_SPARC_GOTDATA_OP); CASE(R_SPARC_IRELATIVE); } return unknown_type(r_type); } template <> std::string rel_to_string(u32 r_type) { switch (r_type) { CASE(R_390_NONE); CASE(R_390_8); CASE(R_390_12); CASE(R_390_16); CASE(R_390_32); CASE(R_390_PC32); CASE(R_390_GOT12); CASE(R_390_GOT32); CASE(R_390_PLT32); CASE(R_390_COPY); 
CASE(R_390_GLOB_DAT); CASE(R_390_JMP_SLOT); CASE(R_390_RELATIVE); CASE(R_390_GOTOFF32); CASE(R_390_GOTPC); CASE(R_390_GOT16); CASE(R_390_PC16); CASE(R_390_PC16DBL); CASE(R_390_PLT16DBL); CASE(R_390_PC32DBL); CASE(R_390_PLT32DBL); CASE(R_390_GOTPCDBL); CASE(R_390_64); CASE(R_390_PC64); CASE(R_390_GOT64); CASE(R_390_PLT64); CASE(R_390_GOTENT); CASE(R_390_GOTOFF16); CASE(R_390_GOTOFF64); CASE(R_390_GOTPLT12); CASE(R_390_GOTPLT16); CASE(R_390_GOTPLT32); CASE(R_390_GOTPLT64); CASE(R_390_GOTPLTENT); CASE(R_390_PLTOFF16); CASE(R_390_PLTOFF32); CASE(R_390_PLTOFF64); CASE(R_390_TLS_LOAD); CASE(R_390_TLS_GDCALL); CASE(R_390_TLS_LDCALL); CASE(R_390_TLS_GD32); CASE(R_390_TLS_GD64); CASE(R_390_TLS_GOTIE12); CASE(R_390_TLS_GOTIE32); CASE(R_390_TLS_GOTIE64); CASE(R_390_TLS_LDM32); CASE(R_390_TLS_LDM64); CASE(R_390_TLS_IE32); CASE(R_390_TLS_IE64); CASE(R_390_TLS_IEENT); CASE(R_390_TLS_LE32); CASE(R_390_TLS_LE64); CASE(R_390_TLS_LDO32); CASE(R_390_TLS_LDO64); CASE(R_390_TLS_DTPMOD); CASE(R_390_TLS_DTPOFF); CASE(R_390_TLS_TPOFF); CASE(R_390_20); CASE(R_390_GOT20); CASE(R_390_GOTPLT20); CASE(R_390_TLS_GOTIE20); CASE(R_390_IRELATIVE); CASE(R_390_PC12DBL); CASE(R_390_PLT12DBL); CASE(R_390_PC24DBL); CASE(R_390_PLT24DBL); } return unknown_type(r_type); } template <> std::string rel_to_string(u32 r_type) { switch (r_type) { CASE(R_68K_NONE); CASE(R_68K_32); CASE(R_68K_16); CASE(R_68K_8); CASE(R_68K_PC32); CASE(R_68K_PC16); CASE(R_68K_PC8); CASE(R_68K_GOTPCREL32); CASE(R_68K_GOTPCREL16); CASE(R_68K_GOTPCREL8); CASE(R_68K_GOTOFF32); CASE(R_68K_GOTOFF16); CASE(R_68K_GOTOFF8); CASE(R_68K_PLT32); CASE(R_68K_PLT16); CASE(R_68K_PLT8); CASE(R_68K_PLTOFF32); CASE(R_68K_PLTOFF16); CASE(R_68K_PLTOFF8); CASE(R_68K_COPY); CASE(R_68K_GLOB_DAT); CASE(R_68K_JMP_SLOT); CASE(R_68K_RELATIVE); CASE(R_68K_TLS_GD32); CASE(R_68K_TLS_GD16); CASE(R_68K_TLS_GD8); CASE(R_68K_TLS_LDM32); CASE(R_68K_TLS_LDM16); CASE(R_68K_TLS_LDM8); CASE(R_68K_TLS_LDO32); CASE(R_68K_TLS_LDO16); CASE(R_68K_TLS_LDO8); 
CASE(R_68K_TLS_IE32); CASE(R_68K_TLS_IE16); CASE(R_68K_TLS_IE8); CASE(R_68K_TLS_LE32); CASE(R_68K_TLS_LE16); CASE(R_68K_TLS_LE8); CASE(R_68K_TLS_DTPMOD32); CASE(R_68K_TLS_DTPREL32); CASE(R_68K_TLS_TPREL32); } return unknown_type(r_type); } template <> std::string rel_to_string(u32 r_type) { switch (r_type) { CASE(R_SH_NONE); CASE(R_SH_DIR32); CASE(R_SH_REL32); CASE(R_SH_DIR8WPN); CASE(R_SH_IND12W); CASE(R_SH_DIR8WPL); CASE(R_SH_DIR8WPZ); CASE(R_SH_DIR8BP); CASE(R_SH_DIR8W); CASE(R_SH_DIR8L); CASE(R_SH_TLS_GD_32); CASE(R_SH_TLS_LD_32); CASE(R_SH_TLS_LDO_32); CASE(R_SH_TLS_IE_32); CASE(R_SH_TLS_LE_32); CASE(R_SH_TLS_DTPMOD32); CASE(R_SH_TLS_DTPOFF32); CASE(R_SH_TLS_TPOFF32); CASE(R_SH_GOT32); CASE(R_SH_PLT32); CASE(R_SH_COPY); CASE(R_SH_GLOB_DAT); CASE(R_SH_JMP_SLOT); CASE(R_SH_RELATIVE); CASE(R_SH_GOTOFF); CASE(R_SH_GOTPC); CASE(R_SH_GOTPLT32); } return unknown_type(r_type); } template <> std::string rel_to_string(u32 r_type) { return rel_to_string(r_type); } template <> std::string rel_to_string(u32 r_type) { switch (r_type) { CASE(R_LARCH_NONE); CASE(R_LARCH_32); CASE(R_LARCH_64); CASE(R_LARCH_RELATIVE); CASE(R_LARCH_COPY); CASE(R_LARCH_JUMP_SLOT); CASE(R_LARCH_TLS_DTPMOD32); CASE(R_LARCH_TLS_DTPMOD64); CASE(R_LARCH_TLS_DTPREL32); CASE(R_LARCH_TLS_DTPREL64); CASE(R_LARCH_TLS_TPREL32); CASE(R_LARCH_TLS_TPREL64); CASE(R_LARCH_IRELATIVE); CASE(R_LARCH_TLS_DESC32); CASE(R_LARCH_TLS_DESC64); CASE(R_LARCH_MARK_LA); CASE(R_LARCH_MARK_PCREL); CASE(R_LARCH_SOP_PUSH_PCREL); CASE(R_LARCH_SOP_PUSH_ABSOLUTE); CASE(R_LARCH_SOP_PUSH_DUP); CASE(R_LARCH_SOP_PUSH_GPREL); CASE(R_LARCH_SOP_PUSH_TLS_TPREL); CASE(R_LARCH_SOP_PUSH_TLS_GOT); CASE(R_LARCH_SOP_PUSH_TLS_GD); CASE(R_LARCH_SOP_PUSH_PLT_PCREL); CASE(R_LARCH_SOP_ASSERT); CASE(R_LARCH_SOP_NOT); CASE(R_LARCH_SOP_SUB); CASE(R_LARCH_SOP_SL); CASE(R_LARCH_SOP_SR); CASE(R_LARCH_SOP_ADD); CASE(R_LARCH_SOP_AND); CASE(R_LARCH_SOP_IF_ELSE); CASE(R_LARCH_SOP_POP_32_S_10_5); CASE(R_LARCH_SOP_POP_32_U_10_12); 
CASE(R_LARCH_SOP_POP_32_S_10_12); CASE(R_LARCH_SOP_POP_32_S_10_16); CASE(R_LARCH_SOP_POP_32_S_10_16_S2); CASE(R_LARCH_SOP_POP_32_S_5_20); CASE(R_LARCH_SOP_POP_32_S_0_5_10_16_S2); CASE(R_LARCH_SOP_POP_32_S_0_10_10_16_S2); CASE(R_LARCH_SOP_POP_32_U); CASE(R_LARCH_ADD8); CASE(R_LARCH_ADD16); CASE(R_LARCH_ADD24); CASE(R_LARCH_ADD32); CASE(R_LARCH_ADD64); CASE(R_LARCH_SUB8); CASE(R_LARCH_SUB16); CASE(R_LARCH_SUB24); CASE(R_LARCH_SUB32); CASE(R_LARCH_SUB64); CASE(R_LARCH_GNU_VTINHERIT); CASE(R_LARCH_GNU_VTENTRY); CASE(R_LARCH_B16); CASE(R_LARCH_B21); CASE(R_LARCH_B26); CASE(R_LARCH_ABS_HI20); CASE(R_LARCH_ABS_LO12); CASE(R_LARCH_ABS64_LO20); CASE(R_LARCH_ABS64_HI12); CASE(R_LARCH_PCALA_HI20); CASE(R_LARCH_PCALA_LO12); CASE(R_LARCH_PCALA64_LO20); CASE(R_LARCH_PCALA64_HI12); CASE(R_LARCH_GOT_PC_HI20); CASE(R_LARCH_GOT_PC_LO12); CASE(R_LARCH_GOT64_PC_LO20); CASE(R_LARCH_GOT64_PC_HI12); CASE(R_LARCH_GOT_HI20); CASE(R_LARCH_GOT_LO12); CASE(R_LARCH_GOT64_LO20); CASE(R_LARCH_GOT64_HI12); CASE(R_LARCH_TLS_LE_HI20); CASE(R_LARCH_TLS_LE_LO12); CASE(R_LARCH_TLS_LE64_LO20); CASE(R_LARCH_TLS_LE64_HI12); CASE(R_LARCH_TLS_IE_PC_HI20); CASE(R_LARCH_TLS_IE_PC_LO12); CASE(R_LARCH_TLS_IE64_PC_LO20); CASE(R_LARCH_TLS_IE64_PC_HI12); CASE(R_LARCH_TLS_IE_HI20); CASE(R_LARCH_TLS_IE_LO12); CASE(R_LARCH_TLS_IE64_LO20); CASE(R_LARCH_TLS_IE64_HI12); CASE(R_LARCH_TLS_LD_PC_HI20); CASE(R_LARCH_TLS_LD_HI20); CASE(R_LARCH_TLS_GD_PC_HI20); CASE(R_LARCH_TLS_GD_HI20); CASE(R_LARCH_32_PCREL); CASE(R_LARCH_RELAX); CASE(R_LARCH_DELETE); CASE(R_LARCH_ALIGN); CASE(R_LARCH_PCREL20_S2); CASE(R_LARCH_CFA); CASE(R_LARCH_ADD6); CASE(R_LARCH_SUB6); CASE(R_LARCH_ADD_ULEB128); CASE(R_LARCH_SUB_ULEB128); CASE(R_LARCH_64_PCREL); CASE(R_LARCH_CALL36); CASE(R_LARCH_TLS_DESC_PC_HI20); CASE(R_LARCH_TLS_DESC_PC_LO12); CASE(R_LARCH_TLS_DESC64_PC_LO20); CASE(R_LARCH_TLS_DESC64_PC_HI12); CASE(R_LARCH_TLS_DESC_HI20); CASE(R_LARCH_TLS_DESC_LO12); CASE(R_LARCH_TLS_DESC64_LO20); CASE(R_LARCH_TLS_DESC64_HI12); 
CASE(R_LARCH_TLS_DESC_LD); CASE(R_LARCH_TLS_DESC_CALL); CASE(R_LARCH_TLS_LE_HI20_R); CASE(R_LARCH_TLS_LE_ADD_R); CASE(R_LARCH_TLS_LE_LO12_R); CASE(R_LARCH_TLS_LD_PCREL20_S2); CASE(R_LARCH_TLS_GD_PCREL20_S2); CASE(R_LARCH_TLS_DESC_PCREL20_S2); } return unknown_type(r_type); } template <> std::string rel_to_string(u32 r_type) { return rel_to_string(r_type); } } // namespace mold ================================================ FILE: src/elf.h ================================================ #pragma once #include "../lib/integers.h" #include #include #include #include namespace mold { struct X86_64; struct I386; struct ARM64LE; struct ARM64BE; struct ARM32LE; struct ARM32BE; struct RV64LE; struct RV64BE; struct RV32LE; struct RV32BE; struct PPC32; struct PPC64V1; struct PPC64V2; struct S390X; struct SPARC64; struct M68K; struct SH4LE; struct SH4BE; struct LOONGARCH64; struct LOONGARCH32; template struct ElfSym; template struct ElfShdr; template struct ElfEhdr; template struct ElfPhdr; template struct ElfRel; template struct ElfDyn; template struct ElfVerneed; template struct ElfVernaux; template struct ElfVerdef; template struct ElfVerdaux; template struct ElfChdr; template struct ElfNhdr; template std::string rel_to_string(u32 r_type); template std::ostream &operator<<(std::ostream &out, const ElfRel &rel) { out << rel_to_string(rel.r_type); return out; } enum : u32 { SHN_UNDEF = 0, SHN_LORESERVE = 0xff00, SHN_ABS = 0xfff1, SHN_COMMON = 0xfff2, SHN_XINDEX = 0xffff, }; enum : u32 { SHT_NULL = 0, SHT_PROGBITS = 1, SHT_SYMTAB = 2, SHT_STRTAB = 3, SHT_RELA = 4, SHT_HASH = 5, SHT_DYNAMIC = 6, SHT_NOTE = 7, SHT_NOBITS = 8, SHT_REL = 9, SHT_SHLIB = 10, SHT_DYNSYM = 11, SHT_INIT_ARRAY = 14, SHT_FINI_ARRAY = 15, SHT_PREINIT_ARRAY = 16, SHT_GROUP = 17, SHT_SYMTAB_SHNDX = 18, SHT_RELR = 19, SHT_CREL = 0x40000014, SHT_LOOS = 0x60000000, SHT_LLVM_ADDRSIG = 0x6fff4c03, SHT_GNU_HASH = 0x6ffffff6, SHT_GNU_VERDEF = 0x6ffffffd, SHT_GNU_VERNEED = 0x6ffffffe, SHT_GNU_VERSYM = 
0x6fffffff, SHT_HIOS = 0x6fffffff, SHT_X86_64_UNWIND = 0x70000001, SHT_ARM_EXIDX = 0x70000001, SHT_ARM_ATTRIBUTES = 0x70000003, SHT_RISCV_ATTRIBUTES = 0x70000003, SHT_LOUSER = 0x80000000, SHT_HIUSER = 0xffffffff, };

// SHF_* constants. Every value has exactly one bit set.
enum : u32 { SHF_WRITE = 0x1, SHF_ALLOC = 0x2, SHF_EXECINSTR = 0x4, SHF_MERGE = 0x10, SHF_STRINGS = 0x20, SHF_INFO_LINK = 0x40, SHF_LINK_ORDER = 0x80, SHF_OS_NONCONFORMING = 0x100, SHF_GROUP = 0x200, SHF_TLS = 0x400, SHF_COMPRESSED = 0x800, SHF_GNU_RETAIN = 0x200000, SHF_EXCLUDE = 0x80000000, };

enum : u32 { GRP_COMDAT = 1, };

// STT_* codes. These are the `st_type` values that stt_to_string() below
// converts back into their names.
enum : u32 { STT_NOTYPE = 0, STT_OBJECT = 1, STT_FUNC = 2, STT_SECTION = 3, STT_FILE = 4, STT_COMMON = 5, STT_TLS = 6, STT_GNU_IFUNC = 10, STT_SPARC_REGISTER = 13, };

// Returns the name of the STT_* constant equal to `st_type`.
//
// The eight codes handled in the switch are returned as their enumerator
// names. STT_SPARC_REGISTER is only recognized inside the `if constexpr`
// branch. Any other value produces "unknown st_type (<N>)", where <N> is
// the decimal value of `st_type`.
template inline std::string stt_to_string(u32 st_type) {
  switch (st_type) {
  case STT_NOTYPE: return "STT_NOTYPE";
  case STT_OBJECT: return "STT_OBJECT";
  case STT_FUNC: return "STT_FUNC";
  case STT_SECTION: return "STT_SECTION";
  case STT_FILE: return "STT_FILE";
  case STT_COMMON: return "STT_COMMON";
  case STT_TLS: return "STT_TLS";
  case STT_GNU_IFUNC: return "STT_GNU_IFUNC";
  }

  // NOTE(review): this compile-time check presumably limits the
  // STT_SPARC_REGISTER name to the SPARC64 target -- confirm.
  if constexpr (std::is_same_v)
    if (st_type == STT_SPARC_REGISTER)
      return "STT_SPARC_REGISTER";
  return "unknown st_type (" + std::to_string(st_type) + ")";
}

enum : u32 { STB_LOCAL = 0, STB_GLOBAL = 1, STB_WEAK = 2, STB_GNU_UNIQUE = 10, };

enum : u32 { STV_DEFAULT = 0, STV_INTERNAL = 1, STV_HIDDEN = 2, STV_PROTECTED = 3, };

// VER_NDX_GLOBAL and VER_NDX_LAST_RESERVED are the same value (1).
enum : u32 { VER_NDX_LOCAL = 0, VER_NDX_GLOBAL = 1, VER_NDX_LAST_RESERVED = 1, VER_NDX_UNSPECIFIED = 0xffff, };

enum : u32 { VER_FLG_BASE = 1, VER_FLG_WEAK = 2, VER_FLG_INFO = 4, };

enum : u32 { VERSYM_HIDDEN = 0x8000, };

enum : u32 { PT_NULL = 0, PT_LOAD = 1, PT_DYNAMIC = 2, PT_INTERP = 3, PT_NOTE = 4, PT_SHLIB = 5, PT_PHDR = 6, PT_TLS = 7, PT_GNU_EH_FRAME = 0x6474e550, PT_GNU_STACK = 0x6474e551, PT_GNU_RELRO = 0x6474e552, PT_GNU_PROPERTY = 0x6474e553, PT_OPENBSD_RANDOMIZE = 0x65a3dbe6, PT_ARM_EXIDX = 0x70000001, PT_RISCV_ATTRIBUTES = 0x70000003, };

enum : u32 { PF_NONE = 0, PF_X 
= 1, PF_W = 2, PF_R = 4, }; enum : u32 { ET_NONE = 0, ET_REL = 1, ET_EXEC = 2, ET_DYN = 3, }; enum : u32 { ELFDATA2LSB = 1, ELFDATA2MSB = 2, }; enum : u32 { ELFCLASS32 = 1, ELFCLASS64 = 2, }; enum : u32 { EV_CURRENT = 1, }; enum : u32 { EM_NONE = 0, EM_386 = 3, EM_68K = 4, EM_PPC = 20, EM_PPC64 = 21, EM_S390X = 22, EM_ARM = 40, EM_SH = 42, EM_SPARC64 = 43, EM_X86_64 = 62, EM_AARCH64 = 183, EM_RISCV = 243, EM_LOONGARCH = 258, }; enum : u32 { EI_CLASS = 4, EI_DATA = 5, EI_VERSION = 6, EI_OSABI = 7, EI_ABIVERSION = 8, }; enum : u32 { DT_NULL = 0, DT_NEEDED = 1, DT_PLTRELSZ = 2, DT_PLTGOT = 3, DT_HASH = 4, DT_STRTAB = 5, DT_SYMTAB = 6, DT_RELA = 7, DT_RELASZ = 8, DT_RELAENT = 9, DT_STRSZ = 10, DT_SYMENT = 11, DT_INIT = 12, DT_FINI = 13, DT_SONAME = 14, DT_RPATH = 15, DT_SYMBOLIC = 16, DT_REL = 17, DT_RELSZ = 18, DT_RELENT = 19, DT_PLTREL = 20, DT_DEBUG = 21, DT_TEXTREL = 22, DT_JMPREL = 23, DT_BIND_NOW = 24, DT_INIT_ARRAY = 25, DT_FINI_ARRAY = 26, DT_INIT_ARRAYSZ = 27, DT_FINI_ARRAYSZ = 28, DT_RUNPATH = 29, DT_FLAGS = 30, DT_PREINIT_ARRAY = 32, DT_PREINIT_ARRAYSZ = 33, DT_RELRSZ = 35, DT_RELR = 36, DT_RELRENT = 37, DT_GNU_HASH = 0x6ffffef5, DT_DEPAUDIT = 0x6ffffefb, DT_AUDIT = 0x6ffffefc, DT_VERSYM = 0x6ffffff0, DT_RELACOUNT = 0x6ffffff9, DT_RELCOUNT = 0x6ffffffa, DT_FLAGS_1 = 0x6ffffffb, DT_VERDEF = 0x6ffffffc, DT_VERDEFNUM = 0x6ffffffd, DT_VERNEED = 0x6ffffffe, DT_VERNEEDNUM = 0x6fffffff, DT_PPC_GOT = 0x70000000, DT_PPC64_GLINK = 0x70000000, DT_RISCV_VARIANT_CC = 0x70000001, DT_AARCH64_VARIANT_PCS = 0x70000005, DT_AUXILIARY = 0x7ffffffd, DT_FILTER = 0x7fffffff, }; enum : u32 { DF_ORIGIN = 0x01, DF_SYMBOLIC = 0x02, DF_TEXTREL = 0x04, DF_BIND_NOW = 0x08, DF_STATIC_TLS = 0x10, }; enum : u32 { DF_1_NOW = 0x00000001, DF_1_NODELETE = 0x00000008, DF_1_INITFIRST = 0x00000020, DF_1_NOOPEN = 0x00000040, DF_1_ORIGIN = 0x00000080, DF_1_INTERPOSE = 0x00000400, DF_1_NODEFLIB = 0x00000800, DF_1_NODUMP = 0x00001000, DF_1_PIE = 0x08000000, }; enum : u32 { NT_GNU_ABI_TAG = 1, 
NT_GNU_HWCAP = 2, NT_GNU_BUILD_ID = 3, NT_GNU_GOLD_VERSION = 4, NT_GNU_PROPERTY_TYPE_0 = 5, NT_FDO_PACKAGING_METADATA = 0xcafe1a7e, }; enum : u32 { GNU_PROPERTY_X86_UINT32_AND_LO = 0xc0000002, GNU_PROPERTY_X86_UINT32_AND_HI = 0xc0007fff, GNU_PROPERTY_X86_UINT32_OR_LO = 0xc0008000, GNU_PROPERTY_X86_UINT32_OR_HI = 0xc000ffff, GNU_PROPERTY_X86_UINT32_OR_AND_LO = 0xc0010000, GNU_PROPERTY_X86_UINT32_OR_AND_HI = 0xc0017fff, GNU_PROPERTY_X86_FEATURE_1_IBT = 1, GNU_PROPERTY_X86_FEATURE_1_SHSTK = 2, GNU_PROPERTY_X86_FEATURE_1_AND = 0xc0000002, GNU_PROPERTY_X86_ISA_1_NEEDED = 0xc0008002, GNU_PROPERTY_X86_ISA_1_BASELINE = 1, GNU_PROPERTY_X86_ISA_1_V2 = 2, GNU_PROPERTY_X86_ISA_1_V3 = 4, GNU_PROPERTY_X86_ISA_1_V4 = 8, }; enum : u32 { ELFCOMPRESS_NONE = 0, ELFCOMPRESS_ZLIB = 1, ELFCOMPRESS_ZSTD = 2, }; enum : u32 { EF_ARM_ABI_FLOAT_SOFT = 0x200, EF_ARM_ABI_FLOAT_HARD = 0x400, EF_ARM_BE8 = 0x800000, EF_ARM_EABI_VER5 = 0x5000000, }; enum : u32 { EF_RISCV_RVC = 1, EF_RISCV_FLOAT_ABI = 6, EF_RISCV_FLOAT_ABI_SOFT = 0, EF_RISCV_FLOAT_ABI_SINGLE = 2, EF_RISCV_FLOAT_ABI_DOUBLE = 4, EF_RISCV_FLOAT_ABI_QUAD = 6, EF_RISCV_RVE = 8, EF_RISCV_TSO = 16, }; enum : u32 { EF_SPARC64_MM = 0x3, EF_SPARC64_TSO = 0x0, EF_SPARC64_PSO = 0x1, EF_SPARC64_RMO = 0x2, EF_SPARC_EXT_MASK = 0xffff00, EF_SPARC_SUN_US1 = 0x000200, EF_SPARC_HAL_R1 = 0x000400, EF_SPARC_SUN_US3 = 0x000800, }; enum : u32 { STO_RISCV_VARIANT_CC = 0x80, }; enum : u32 { ELF_TAG_FILE = 1, ELF_TAG_SECTION = 2, ELF_TAG_SYMBOL = 3, ELF_TAG_RISCV_STACK_ALIGN = 4, ELF_TAG_RISCV_ARCH = 5, ELF_TAG_RISCV_UNALIGNED_ACCESS = 6, }; enum : u32 { EF_LOONGARCH_ABI_SOFT_FLOAT = 0x1, EF_LOONGARCH_ABI_SINGLE_FLOAT = 0x2, EF_LOONGARCH_ABI_DOUBLE_FLOAT = 0x3, EF_LOONGARCH_ABI_MODIFIER_MASK = 0x7, EF_LOONGARCH_OBJABI_V1 = 0x40, EF_LOONGARCH_OBJABI_MASK = 0xC0, }; // // Relocation types // enum : u32 { R_NONE = 0, }; enum : u32 { R_X86_64_NONE = 0, R_X86_64_64 = 1, R_X86_64_PC32 = 2, R_X86_64_GOT32 = 3, R_X86_64_PLT32 = 4, R_X86_64_COPY = 5, 
R_X86_64_GLOB_DAT = 6, R_X86_64_JUMP_SLOT = 7, R_X86_64_RELATIVE = 8, R_X86_64_GOTPCREL = 9, R_X86_64_32 = 10, R_X86_64_32S = 11, R_X86_64_16 = 12, R_X86_64_PC16 = 13, R_X86_64_8 = 14, R_X86_64_PC8 = 15, R_X86_64_DTPMOD64 = 16, R_X86_64_DTPOFF64 = 17, R_X86_64_TPOFF64 = 18, R_X86_64_TLSGD = 19, R_X86_64_TLSLD = 20, R_X86_64_DTPOFF32 = 21, R_X86_64_GOTTPOFF = 22, R_X86_64_TPOFF32 = 23, R_X86_64_PC64 = 24, R_X86_64_GOTOFF64 = 25, R_X86_64_GOTPC32 = 26, R_X86_64_GOT64 = 27, R_X86_64_GOTPCREL64 = 28, R_X86_64_GOTPC64 = 29, R_X86_64_GOTPLT64 = 30, R_X86_64_PLTOFF64 = 31, R_X86_64_SIZE32 = 32, R_X86_64_SIZE64 = 33, R_X86_64_GOTPC32_TLSDESC = 34, R_X86_64_TLSDESC_CALL = 35, R_X86_64_TLSDESC = 36, R_X86_64_IRELATIVE = 37, R_X86_64_GOTPCRELX = 41, R_X86_64_REX_GOTPCRELX = 42, R_X86_64_CODE_4_GOTPCRELX = 43, R_X86_64_CODE_4_GOTTPOFF = 44, R_X86_64_CODE_4_GOTPC32_TLSDESC = 45, R_X86_64_CODE_5_GOTPCRELX = 46, R_X86_64_CODE_5_GOTTPOFF = 47, R_X86_64_CODE_5_GOTPC32_TLSDESC = 48, R_X86_64_CODE_6_GOTPCRELX = 49, R_X86_64_CODE_6_GOTTPOFF = 50, R_X86_64_CODE_6_GOTPC32_TLSDESC = 51, }; enum : u32 { R_386_NONE = 0, R_386_32 = 1, R_386_PC32 = 2, R_386_GOT32 = 3, R_386_PLT32 = 4, R_386_COPY = 5, R_386_GLOB_DAT = 6, R_386_JUMP_SLOT = 7, R_386_RELATIVE = 8, R_386_GOTOFF = 9, R_386_GOTPC = 10, R_386_32PLT = 11, R_386_TLS_TPOFF = 14, R_386_TLS_IE = 15, R_386_TLS_GOTIE = 16, R_386_TLS_LE = 17, R_386_TLS_GD = 18, R_386_TLS_LDM = 19, R_386_16 = 20, R_386_PC16 = 21, R_386_8 = 22, R_386_PC8 = 23, R_386_TLS_GD_32 = 24, R_386_TLS_GD_PUSH = 25, R_386_TLS_GD_CALL = 26, R_386_TLS_GD_POP = 27, R_386_TLS_LDM_32 = 28, R_386_TLS_LDM_PUSH = 29, R_386_TLS_LDM_CALL = 30, R_386_TLS_LDM_POP = 31, R_386_TLS_LDO_32 = 32, R_386_TLS_IE_32 = 33, R_386_TLS_LE_32 = 34, R_386_TLS_DTPMOD32 = 35, R_386_TLS_DTPOFF32 = 36, R_386_TLS_TPOFF32 = 37, R_386_SIZE32 = 38, R_386_TLS_GOTDESC = 39, R_386_TLS_DESC_CALL = 40, R_386_TLS_DESC = 41, R_386_IRELATIVE = 42, R_386_GOT32X = 43, }; enum : u32 { R_AARCH64_NONE = 0, 
R_AARCH64_ABS64 = 0x101, R_AARCH64_ABS32 = 0x102, R_AARCH64_ABS16 = 0x103, R_AARCH64_PREL64 = 0x104, R_AARCH64_PREL32 = 0x105, R_AARCH64_PREL16 = 0x106, R_AARCH64_MOVW_UABS_G0 = 0x107, R_AARCH64_MOVW_UABS_G0_NC = 0x108, R_AARCH64_MOVW_UABS_G1 = 0x109, R_AARCH64_MOVW_UABS_G1_NC = 0x10a, R_AARCH64_MOVW_UABS_G2 = 0x10b, R_AARCH64_MOVW_UABS_G2_NC = 0x10c, R_AARCH64_MOVW_UABS_G3 = 0x10d, R_AARCH64_MOVW_SABS_G0 = 0x10e, R_AARCH64_MOVW_SABS_G1 = 0x10f, R_AARCH64_MOVW_SABS_G2 = 0x110, R_AARCH64_LD_PREL_LO19 = 0x111, R_AARCH64_ADR_PREL_LO21 = 0x112, R_AARCH64_ADR_PREL_PG_HI21 = 0x113, R_AARCH64_ADR_PREL_PG_HI21_NC = 0x114, R_AARCH64_ADD_ABS_LO12_NC = 0x115, R_AARCH64_LDST8_ABS_LO12_NC = 0x116, R_AARCH64_TSTBR14 = 0x117, R_AARCH64_CONDBR19 = 0x118, R_AARCH64_JUMP26 = 0x11a, R_AARCH64_CALL26 = 0x11b, R_AARCH64_LDST16_ABS_LO12_NC = 0x11c, R_AARCH64_LDST32_ABS_LO12_NC = 0x11d, R_AARCH64_LDST64_ABS_LO12_NC = 0x11e, R_AARCH64_MOVW_PREL_G0 = 0x11f, R_AARCH64_MOVW_PREL_G0_NC = 0x120, R_AARCH64_MOVW_PREL_G1 = 0x121, R_AARCH64_MOVW_PREL_G1_NC = 0x122, R_AARCH64_MOVW_PREL_G2 = 0x123, R_AARCH64_MOVW_PREL_G2_NC = 0x124, R_AARCH64_MOVW_PREL_G3 = 0x125, R_AARCH64_LDST128_ABS_LO12_NC = 0x12b, R_AARCH64_ADR_GOT_PAGE = 0x137, R_AARCH64_LD64_GOT_LO12_NC = 0x138, R_AARCH64_LD64_GOTPAGE_LO15 = 0x139, R_AARCH64_PLT32 = 0x13a, R_AARCH64_TLSGD_ADR_PREL21 = 0x200, R_AARCH64_TLSGD_ADR_PAGE21 = 0x201, R_AARCH64_TLSGD_ADD_LO12_NC = 0x202, R_AARCH64_TLSGD_MOVW_G1 = 0x203, R_AARCH64_TLSGD_MOVW_G0_NC = 0x204, R_AARCH64_TLSLD_ADR_PREL21 = 0x205, R_AARCH64_TLSLD_ADR_PAGE21 = 0x206, R_AARCH64_TLSLD_ADD_LO12_NC = 0x207, R_AARCH64_TLSLD_MOVW_G1 = 0x208, R_AARCH64_TLSLD_MOVW_G0_NC = 0x209, R_AARCH64_TLSLD_LD_PREL19 = 0x20a, R_AARCH64_TLSLD_MOVW_DTPREL_G2 = 0x20b, R_AARCH64_TLSLD_MOVW_DTPREL_G1 = 0x20c, R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC = 0x20d, R_AARCH64_TLSLD_MOVW_DTPREL_G0 = 0x20e, R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC = 0x20f, R_AARCH64_TLSLD_ADD_DTPREL_HI12 = 0x210, R_AARCH64_TLSLD_ADD_DTPREL_LO12 = 0x211, 
R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC = 0x212, R_AARCH64_TLSLD_LDST8_DTPREL_LO12 = 0x213, R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC = 0x214, R_AARCH64_TLSLD_LDST16_DTPREL_LO12 = 0x215, R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC = 0x216, R_AARCH64_TLSLD_LDST32_DTPREL_LO12 = 0x217, R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC = 0x218, R_AARCH64_TLSLD_LDST64_DTPREL_LO12 = 0x219, R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC = 0x21a, R_AARCH64_TLSIE_MOVW_GOTTPREL_G1 = 0x21b, R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC = 0x21c, R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21 = 0x21d, R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC = 0x21e, R_AARCH64_TLSIE_LD_GOTTPREL_PREL19 = 0x21f, R_AARCH64_TLSLE_MOVW_TPREL_G2 = 0x220, R_AARCH64_TLSLE_MOVW_TPREL_G1 = 0x221, R_AARCH64_TLSLE_MOVW_TPREL_G1_NC = 0x222, R_AARCH64_TLSLE_MOVW_TPREL_G0 = 0x223, R_AARCH64_TLSLE_MOVW_TPREL_G0_NC = 0x224, R_AARCH64_TLSLE_ADD_TPREL_HI12 = 0x225, R_AARCH64_TLSLE_ADD_TPREL_LO12 = 0x226, R_AARCH64_TLSLE_ADD_TPREL_LO12_NC = 0x227, R_AARCH64_TLSLE_LDST8_TPREL_LO12 = 0x228, R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC = 0x229, R_AARCH64_TLSLE_LDST16_TPREL_LO12 = 0x22a, R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC = 0x22b, R_AARCH64_TLSLE_LDST32_TPREL_LO12 = 0x22c, R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC = 0x22d, R_AARCH64_TLSLE_LDST64_TPREL_LO12 = 0x22e, R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC = 0x22f, R_AARCH64_TLSDESC_ADR_PAGE21 = 0x232, R_AARCH64_TLSDESC_LD64_LO12 = 0x233, R_AARCH64_TLSDESC_ADD_LO12 = 0x234, R_AARCH64_TLSDESC_CALL = 0x239, R_AARCH64_TLSLE_LDST128_TPREL_LO12_NC = 0x23b, R_AARCH64_COPY = 0x400, R_AARCH64_GLOB_DAT = 0x401, R_AARCH64_JUMP_SLOT = 0x402, R_AARCH64_RELATIVE = 0x403, R_AARCH64_TLS_DTPMOD64 = 0x404, R_AARCH64_TLS_DTPREL64 = 0x405, R_AARCH64_TLS_TPREL64 = 0x406, R_AARCH64_TLSDESC = 0x407, R_AARCH64_IRELATIVE = 0x408, }; enum : u32 { R_ARM_NONE = 0x0, R_ARM_PC24 = 0x1, R_ARM_ABS32 = 0x2, R_ARM_REL32 = 0x3, R_ARM_LDR_PC_G0 = 0x4, R_ARM_ABS16 = 0x5, R_ARM_ABS12 = 0x6, R_ARM_THM_ABS5 = 0x7, R_ARM_ABS8 = 0x8, R_ARM_SBREL32 = 0x9, R_ARM_THM_CALL = 0xa, 
R_ARM_THM_PC8 = 0xb, R_ARM_BREL_ADJ = 0xc, R_ARM_TLS_DESC = 0xd, R_ARM_THM_SWI8 = 0xe, R_ARM_XPC25 = 0xf, R_ARM_THM_XPC22 = 0x10, R_ARM_TLS_DTPMOD32 = 0x11, R_ARM_TLS_DTPOFF32 = 0x12, R_ARM_TLS_TPOFF32 = 0x13, R_ARM_COPY = 0x14, R_ARM_GLOB_DAT = 0x15, R_ARM_JUMP_SLOT = 0x16, R_ARM_RELATIVE = 0x17, R_ARM_GOTOFF32 = 0x18, R_ARM_BASE_PREL = 0x19, R_ARM_GOT_BREL = 0x1a, R_ARM_PLT32 = 0x1b, R_ARM_CALL = 0x1c, R_ARM_JUMP24 = 0x1d, R_ARM_THM_JUMP24 = 0x1e, R_ARM_BASE_ABS = 0x1f, R_ARM_ALU_PCREL_7_0 = 0x20, R_ARM_ALU_PCREL_15_8 = 0x21, R_ARM_ALU_PCREL_23_15 = 0x22, R_ARM_LDR_SBREL_11_0_NC = 0x23, R_ARM_ALU_SBREL_19_12_NC = 0x24, R_ARM_ALU_SBREL_27_20_CK = 0x25, R_ARM_TARGET1 = 0x26, R_ARM_SBREL31 = 0x27, R_ARM_V4BX = 0x28, R_ARM_TARGET2 = 0x29, R_ARM_PREL31 = 0x2a, R_ARM_MOVW_ABS_NC = 0x2b, R_ARM_MOVT_ABS = 0x2c, R_ARM_MOVW_PREL_NC = 0x2d, R_ARM_MOVT_PREL = 0x2e, R_ARM_THM_MOVW_ABS_NC = 0x2f, R_ARM_THM_MOVT_ABS = 0x30, R_ARM_THM_MOVW_PREL_NC = 0x31, R_ARM_THM_MOVT_PREL = 0x32, R_ARM_THM_JUMP19 = 0x33, R_ARM_THM_JUMP6 = 0x34, R_ARM_THM_ALU_PREL_11_0 = 0x35, R_ARM_THM_PC12 = 0x36, R_ARM_ABS32_NOI = 0x37, R_ARM_REL32_NOI = 0x38, R_ARM_ALU_PC_G0_NC = 0x39, R_ARM_ALU_PC_G0 = 0x3a, R_ARM_ALU_PC_G1_NC = 0x3b, R_ARM_ALU_PC_G1 = 0x3c, R_ARM_ALU_PC_G2 = 0x3d, R_ARM_LDR_PC_G1 = 0x3e, R_ARM_LDR_PC_G2 = 0x3f, R_ARM_LDRS_PC_G0 = 0x40, R_ARM_LDRS_PC_G1 = 0x41, R_ARM_LDRS_PC_G2 = 0x42, R_ARM_LDC_PC_G0 = 0x43, R_ARM_LDC_PC_G1 = 0x44, R_ARM_LDC_PC_G2 = 0x45, R_ARM_ALU_SB_G0_NC = 0x46, R_ARM_ALU_SB_G0 = 0x47, R_ARM_ALU_SB_G1_NC = 0x48, R_ARM_ALU_SB_G1 = 0x49, R_ARM_ALU_SB_G2 = 0x4a, R_ARM_LDR_SB_G0 = 0x4b, R_ARM_LDR_SB_G1 = 0x4c, R_ARM_LDR_SB_G2 = 0x4d, R_ARM_LDRS_SB_G0 = 0x4e, R_ARM_LDRS_SB_G1 = 0x4f, R_ARM_LDRS_SB_G2 = 0x50, R_ARM_LDC_SB_G0 = 0x51, R_ARM_LDC_SB_G1 = 0x52, R_ARM_LDC_SB_G2 = 0x53, R_ARM_MOVW_BREL_NC = 0x54, R_ARM_MOVT_BREL = 0x55, R_ARM_MOVW_BREL = 0x56, R_ARM_THM_MOVW_BREL_NC = 0x57, R_ARM_THM_MOVT_BREL = 0x58, R_ARM_THM_MOVW_BREL = 0x59, R_ARM_TLS_GOTDESC = 0x5a, 
R_ARM_TLS_CALL = 0x5b, R_ARM_TLS_DESCSEQ = 0x5c, R_ARM_THM_TLS_CALL = 0x5d, R_ARM_PLT32_ABS = 0x5e, R_ARM_GOT_ABS = 0x5f, R_ARM_GOT_PREL = 0x60, R_ARM_GOT_BREL12 = 0x61, R_ARM_GOTOFF12 = 0x62, R_ARM_GOTRELAX = 0x63, R_ARM_GNU_VTENTRY = 0x64, R_ARM_GNU_VTINHERIT = 0x65, R_ARM_THM_JUMP11 = 0x66, R_ARM_THM_JUMP8 = 0x67, R_ARM_TLS_GD32 = 0x68, R_ARM_TLS_LDM32 = 0x69, R_ARM_TLS_LDO32 = 0x6a, R_ARM_TLS_IE32 = 0x6b, R_ARM_TLS_LE32 = 0x6c, R_ARM_TLS_LDO12 = 0x6d, R_ARM_TLS_LE12 = 0x6e, R_ARM_TLS_IE12GP = 0x6f, R_ARM_PRIVATE_0 = 0x70, R_ARM_PRIVATE_1 = 0x71, R_ARM_PRIVATE_2 = 0x72, R_ARM_PRIVATE_3 = 0x73, R_ARM_PRIVATE_4 = 0x74, R_ARM_PRIVATE_5 = 0x75, R_ARM_PRIVATE_6 = 0x76, R_ARM_PRIVATE_7 = 0x77, R_ARM_PRIVATE_8 = 0x78, R_ARM_PRIVATE_9 = 0x79, R_ARM_PRIVATE_10 = 0x7a, R_ARM_PRIVATE_11 = 0x7b, R_ARM_PRIVATE_12 = 0x7c, R_ARM_PRIVATE_13 = 0x7d, R_ARM_PRIVATE_14 = 0x7e, R_ARM_PRIVATE_15 = 0x7f, R_ARM_ME_TOO = 0x80, R_ARM_THM_TLS_DESCSEQ16 = 0x81, R_ARM_THM_TLS_DESCSEQ32 = 0x82, R_ARM_THM_BF16 = 0x88, R_ARM_THM_BF12 = 0x89, R_ARM_THM_BF18 = 0x8a, R_ARM_IRELATIVE = 0xa0, }; enum : u32 { R_RISCV_NONE = 0, R_RISCV_32 = 1, R_RISCV_64 = 2, R_RISCV_RELATIVE = 3, R_RISCV_COPY = 4, R_RISCV_JUMP_SLOT = 5, R_RISCV_TLS_DTPMOD32 = 6, R_RISCV_TLS_DTPMOD64 = 7, R_RISCV_TLS_DTPREL32 = 8, R_RISCV_TLS_DTPREL64 = 9, R_RISCV_TLS_TPREL32 = 10, R_RISCV_TLS_TPREL64 = 11, R_RISCV_TLSDESC = 12, R_RISCV_BRANCH = 16, R_RISCV_JAL = 17, R_RISCV_CALL = 18, R_RISCV_CALL_PLT = 19, R_RISCV_GOT_HI20 = 20, R_RISCV_TLS_GOT_HI20 = 21, R_RISCV_TLS_GD_HI20 = 22, R_RISCV_PCREL_HI20 = 23, R_RISCV_PCREL_LO12_I = 24, R_RISCV_PCREL_LO12_S = 25, R_RISCV_HI20 = 26, R_RISCV_LO12_I = 27, R_RISCV_LO12_S = 28, R_RISCV_TPREL_HI20 = 29, R_RISCV_TPREL_LO12_I = 30, R_RISCV_TPREL_LO12_S = 31, R_RISCV_TPREL_ADD = 32, R_RISCV_ADD8 = 33, R_RISCV_ADD16 = 34, R_RISCV_ADD32 = 35, R_RISCV_ADD64 = 36, R_RISCV_SUB8 = 37, R_RISCV_SUB16 = 38, R_RISCV_SUB32 = 39, R_RISCV_SUB64 = 40, R_RISCV_ALIGN = 43, R_RISCV_RVC_BRANCH = 44, 
R_RISCV_RVC_JUMP = 45, R_RISCV_RELAX = 51, R_RISCV_SUB6 = 52, R_RISCV_SET6 = 53, R_RISCV_SET8 = 54, R_RISCV_SET16 = 55, R_RISCV_SET32 = 56, R_RISCV_32_PCREL = 57, R_RISCV_IRELATIVE = 58, R_RISCV_PLT32 = 59, R_RISCV_SET_ULEB128 = 60, R_RISCV_SUB_ULEB128 = 61, R_RISCV_TLSDESC_HI20 = 62, R_RISCV_TLSDESC_LOAD_LO12 = 63, R_RISCV_TLSDESC_ADD_LO12 = 64, R_RISCV_TLSDESC_CALL = 65, }; enum : u32 { R_PPC_NONE = 0, R_PPC_ADDR32 = 1, R_PPC_ADDR24 = 2, R_PPC_ADDR16 = 3, R_PPC_ADDR16_LO = 4, R_PPC_ADDR16_HI = 5, R_PPC_ADDR16_HA = 6, R_PPC_ADDR14 = 7, R_PPC_ADDR14_BRTAKEN = 8, R_PPC_ADDR14_BRNTAKEN = 9, R_PPC_REL24 = 10, R_PPC_REL14 = 11, R_PPC_REL14_BRTAKEN = 12, R_PPC_REL14_BRNTAKEN = 13, R_PPC_GOT16 = 14, R_PPC_GOT16_LO = 15, R_PPC_GOT16_HI = 16, R_PPC_GOT16_HA = 17, R_PPC_PLTREL24 = 18, R_PPC_COPY = 19, R_PPC_GLOB_DAT = 20, R_PPC_JMP_SLOT = 21, R_PPC_RELATIVE = 22, R_PPC_LOCAL24PC = 23, R_PPC_UADDR32 = 24, R_PPC_UADDR16 = 25, R_PPC_REL32 = 26, R_PPC_PLT32 = 27, R_PPC_PLTREL32 = 28, R_PPC_PLT16_LO = 29, R_PPC_PLT16_HI = 30, R_PPC_PLT16_HA = 31, R_PPC_SDAREL16 = 32, R_PPC_SECTOFF = 33, R_PPC_SECTOFF_LO = 34, R_PPC_SECTOFF_HI = 35, R_PPC_SECTOFF_HA = 36, R_PPC_ADDR30 = 37, R_PPC_TLS = 67, R_PPC_DTPMOD32 = 68, R_PPC_TPREL16 = 69, R_PPC_TPREL16_LO = 70, R_PPC_TPREL16_HI = 71, R_PPC_TPREL16_HA = 72, R_PPC_TPREL32 = 73, R_PPC_DTPREL16 = 74, R_PPC_DTPREL16_LO = 75, R_PPC_DTPREL16_HI = 76, R_PPC_DTPREL16_HA = 77, R_PPC_DTPREL32 = 78, R_PPC_GOT_TLSGD16 = 79, R_PPC_GOT_TLSGD16_LO = 80, R_PPC_GOT_TLSGD16_HI = 81, R_PPC_GOT_TLSGD16_HA = 82, R_PPC_GOT_TLSLD16 = 83, R_PPC_GOT_TLSLD16_LO = 84, R_PPC_GOT_TLSLD16_HI = 85, R_PPC_GOT_TLSLD16_HA = 86, R_PPC_GOT_TPREL16 = 87, R_PPC_GOT_TPREL16_LO = 88, R_PPC_GOT_TPREL16_HI = 89, R_PPC_GOT_TPREL16_HA = 90, R_PPC_GOT_DTPREL16 = 91, R_PPC_GOT_DTPREL16_LO = 92, R_PPC_GOT_DTPREL16_HI = 93, R_PPC_GOT_DTPREL16_HA = 94, R_PPC_TLSGD = 95, R_PPC_TLSLD = 96, R_PPC_PLTSEQ = 119, R_PPC_PLTCALL = 120, R_PPC_IRELATIVE = 248, R_PPC_REL16 = 249, R_PPC_REL16_LO = 
250, R_PPC_REL16_HI = 251, R_PPC_REL16_HA = 252, }; enum : u32 { R_PPC64_NONE = 0, R_PPC64_ADDR32 = 1, R_PPC64_ADDR24 = 2, R_PPC64_ADDR16 = 3, R_PPC64_ADDR16_LO = 4, R_PPC64_ADDR16_HI = 5, R_PPC64_ADDR16_HA = 6, R_PPC64_ADDR14 = 7, R_PPC64_ADDR14_BRTAKEN = 8, R_PPC64_ADDR14_BRNTAKEN = 9, R_PPC64_REL24 = 10, R_PPC64_REL14 = 11, R_PPC64_REL14_BRTAKEN = 12, R_PPC64_REL14_BRNTAKEN = 13, R_PPC64_GOT16 = 14, R_PPC64_GOT16_LO = 15, R_PPC64_GOT16_HI = 16, R_PPC64_GOT16_HA = 17, R_PPC64_COPY = 19, R_PPC64_GLOB_DAT = 20, R_PPC64_JMP_SLOT = 21, R_PPC64_RELATIVE = 22, R_PPC64_REL32 = 26, R_PPC64_PLT16_LO = 29, R_PPC64_PLT16_HI = 30, R_PPC64_PLT16_HA = 31, R_PPC64_ADDR64 = 38, R_PPC64_ADDR16_HIGHER = 39, R_PPC64_ADDR16_HIGHERA = 40, R_PPC64_ADDR16_HIGHEST = 41, R_PPC64_ADDR16_HIGHESTA = 42, R_PPC64_REL64 = 44, R_PPC64_TOC16 = 47, R_PPC64_TOC16_LO = 48, R_PPC64_TOC16_HI = 49, R_PPC64_TOC16_HA = 50, R_PPC64_TOC = 51, R_PPC64_ADDR16_DS = 56, R_PPC64_ADDR16_LO_DS = 57, R_PPC64_GOT16_DS = 58, R_PPC64_GOT16_LO_DS = 59, R_PPC64_PLT16_LO_DS = 60, R_PPC64_TOC16_DS = 63, R_PPC64_TOC16_LO_DS = 64, R_PPC64_TLS = 67, R_PPC64_DTPMOD64 = 68, R_PPC64_TPREL16 = 69, R_PPC64_TPREL16_LO = 70, R_PPC64_TPREL16_HI = 71, R_PPC64_TPREL16_HA = 72, R_PPC64_TPREL64 = 73, R_PPC64_DTPREL16 = 74, R_PPC64_DTPREL16_LO = 75, R_PPC64_DTPREL16_HI = 76, R_PPC64_DTPREL16_HA = 77, R_PPC64_DTPREL64 = 78, R_PPC64_GOT_TLSGD16 = 79, R_PPC64_GOT_TLSGD16_LO = 80, R_PPC64_GOT_TLSGD16_HI = 81, R_PPC64_GOT_TLSGD16_HA = 82, R_PPC64_GOT_TLSLD16 = 83, R_PPC64_GOT_TLSLD16_LO = 84, R_PPC64_GOT_TLSLD16_HI = 85, R_PPC64_GOT_TLSLD16_HA = 86, R_PPC64_GOT_TPREL16_DS = 87, R_PPC64_GOT_TPREL16_LO_DS = 88, R_PPC64_GOT_TPREL16_HI = 89, R_PPC64_GOT_TPREL16_HA = 90, R_PPC64_GOT_DTPREL16_DS = 91, R_PPC64_GOT_DTPREL16_LO_DS = 92, R_PPC64_GOT_DTPREL16_HI = 93, R_PPC64_GOT_DTPREL16_HA = 94, R_PPC64_TPREL16_DS = 95, R_PPC64_TPREL16_LO_DS = 96, R_PPC64_TPREL16_HIGHER = 97, R_PPC64_TPREL16_HIGHERA = 98, R_PPC64_TPREL16_HIGHEST = 99, 
R_PPC64_TPREL16_HIGHESTA = 100, R_PPC64_DTPREL16_DS = 101, R_PPC64_DTPREL16_LO_DS = 102, R_PPC64_DTPREL16_HIGHER = 103, R_PPC64_DTPREL16_HIGHERA = 104, R_PPC64_DTPREL16_HIGHEST = 105, R_PPC64_DTPREL16_HIGHESTA = 106, R_PPC64_TLSGD = 107, R_PPC64_TLSLD = 108, R_PPC64_ADDR16_HIGH = 110, R_PPC64_ADDR16_HIGHA = 111, R_PPC64_TPREL16_HIGH = 112, R_PPC64_TPREL16_HIGHA = 113, R_PPC64_DTPREL16_HIGH = 114, R_PPC64_DTPREL16_HIGHA = 115, R_PPC64_REL24_NOTOC = 116, R_PPC64_ENTRY = 118, R_PPC64_PLTSEQ = 119, R_PPC64_PLTCALL = 120, R_PPC64_PLTSEQ_NOTOC = 121, R_PPC64_PLTCALL_NOTOC = 122, R_PPC64_PCREL_OPT = 123, R_PPC64_PCREL34 = 132, R_PPC64_GOT_PCREL34 = 133, R_PPC64_PLT_PCREL34 = 134, R_PPC64_PLT_PCREL34_NOTOC = 135, R_PPC64_TPREL34 = 146, R_PPC64_DTPREL34 = 147, R_PPC64_GOT_TLSGD_PCREL34 = 148, R_PPC64_GOT_TLSLD_PCREL34 = 149, R_PPC64_GOT_TPREL_PCREL34 = 150, R_PPC64_IRELATIVE = 248, R_PPC64_REL16 = 249, R_PPC64_REL16_LO = 250, R_PPC64_REL16_HI = 251, R_PPC64_REL16_HA = 252, }; enum : u32 { R_SPARC_NONE = 0, R_SPARC_8 = 1, R_SPARC_16 = 2, R_SPARC_32 = 3, R_SPARC_DISP8 = 4, R_SPARC_DISP16 = 5, R_SPARC_DISP32 = 6, R_SPARC_WDISP30 = 7, R_SPARC_WDISP22 = 8, R_SPARC_HI22 = 9, R_SPARC_22 = 10, R_SPARC_13 = 11, R_SPARC_LO10 = 12, R_SPARC_GOT10 = 13, R_SPARC_GOT13 = 14, R_SPARC_GOT22 = 15, R_SPARC_PC10 = 16, R_SPARC_PC22 = 17, R_SPARC_WPLT30 = 18, R_SPARC_COPY = 19, R_SPARC_GLOB_DAT = 20, R_SPARC_JMP_SLOT = 21, R_SPARC_RELATIVE = 22, R_SPARC_UA32 = 23, R_SPARC_PLT32 = 24, R_SPARC_HIPLT22 = 25, R_SPARC_LOPLT10 = 26, R_SPARC_PCPLT32 = 27, R_SPARC_PCPLT22 = 28, R_SPARC_PCPLT10 = 29, R_SPARC_10 = 30, R_SPARC_11 = 31, R_SPARC_64 = 32, R_SPARC_OLO10 = 33, R_SPARC_HH22 = 34, R_SPARC_HM10 = 35, R_SPARC_LM22 = 36, R_SPARC_PC_HH22 = 37, R_SPARC_PC_HM10 = 38, R_SPARC_PC_LM22 = 39, R_SPARC_WDISP16 = 40, R_SPARC_WDISP19 = 41, R_SPARC_7 = 43, R_SPARC_5 = 44, R_SPARC_6 = 45, R_SPARC_DISP64 = 46, R_SPARC_PLT64 = 47, R_SPARC_HIX22 = 48, R_SPARC_LOX10 = 49, R_SPARC_H44 = 50, R_SPARC_M44 = 51, 
R_SPARC_L44 = 52, R_SPARC_REGISTER = 53, R_SPARC_UA64 = 54, R_SPARC_UA16 = 55, R_SPARC_TLS_GD_HI22 = 56, R_SPARC_TLS_GD_LO10 = 57, R_SPARC_TLS_GD_ADD = 58, R_SPARC_TLS_GD_CALL = 59, R_SPARC_TLS_LDM_HI22 = 60, R_SPARC_TLS_LDM_LO10 = 61, R_SPARC_TLS_LDM_ADD = 62, R_SPARC_TLS_LDM_CALL = 63, R_SPARC_TLS_LDO_HIX22 = 64, R_SPARC_TLS_LDO_LOX10 = 65, R_SPARC_TLS_LDO_ADD = 66, R_SPARC_TLS_IE_HI22 = 67, R_SPARC_TLS_IE_LO10 = 68, R_SPARC_TLS_IE_LD = 69, R_SPARC_TLS_IE_LDX = 70, R_SPARC_TLS_IE_ADD = 71, R_SPARC_TLS_LE_HIX22 = 72, R_SPARC_TLS_LE_LOX10 = 73, R_SPARC_TLS_DTPMOD32 = 74, R_SPARC_TLS_DTPMOD64 = 75, R_SPARC_TLS_DTPOFF32 = 76, R_SPARC_TLS_DTPOFF64 = 77, R_SPARC_TLS_TPOFF32 = 78, R_SPARC_TLS_TPOFF64 = 79, R_SPARC_GOTDATA_HIX22 = 80, R_SPARC_GOTDATA_LOX10 = 81, R_SPARC_GOTDATA_OP_HIX22 = 82, R_SPARC_GOTDATA_OP_LOX10 = 83, R_SPARC_GOTDATA_OP = 84, R_SPARC_SIZE32 = 86, R_SPARC_JMP_IREL = 248, R_SPARC_IRELATIVE = 249, }; enum : u32 { R_390_NONE = 0, R_390_8 = 1, R_390_12 = 2, R_390_16 = 3, R_390_32 = 4, R_390_PC32 = 5, R_390_GOT12 = 6, R_390_GOT32 = 7, R_390_PLT32 = 8, R_390_COPY = 9, R_390_GLOB_DAT = 10, R_390_JMP_SLOT = 11, R_390_RELATIVE = 12, R_390_GOTOFF32 = 13, R_390_GOTPC = 14, R_390_GOT16 = 15, R_390_PC16 = 16, R_390_PC16DBL = 17, R_390_PLT16DBL = 18, R_390_PC32DBL = 19, R_390_PLT32DBL = 20, R_390_GOTPCDBL = 21, R_390_64 = 22, R_390_PC64 = 23, R_390_GOT64 = 24, R_390_PLT64 = 25, R_390_GOTENT = 26, R_390_GOTOFF16 = 27, R_390_GOTOFF64 = 28, R_390_GOTPLT12 = 29, R_390_GOTPLT16 = 30, R_390_GOTPLT32 = 31, R_390_GOTPLT64 = 32, R_390_GOTPLTENT = 33, R_390_PLTOFF16 = 34, R_390_PLTOFF32 = 35, R_390_PLTOFF64 = 36, R_390_TLS_LOAD = 37, R_390_TLS_GDCALL = 38, R_390_TLS_LDCALL = 39, R_390_TLS_GD32 = 40, R_390_TLS_GD64 = 41, R_390_TLS_GOTIE12 = 42, R_390_TLS_GOTIE32 = 43, R_390_TLS_GOTIE64 = 44, R_390_TLS_LDM32 = 45, R_390_TLS_LDM64 = 46, R_390_TLS_IE32 = 47, R_390_TLS_IE64 = 48, R_390_TLS_IEENT = 49, R_390_TLS_LE32 = 50, R_390_TLS_LE64 = 51, R_390_TLS_LDO32 = 52, 
R_390_TLS_LDO64 = 53, R_390_TLS_DTPMOD = 54, R_390_TLS_DTPOFF = 55, R_390_TLS_TPOFF = 56, R_390_20 = 57, R_390_GOT20 = 58, R_390_GOTPLT20 = 59, R_390_TLS_GOTIE20 = 60, R_390_IRELATIVE = 61, R_390_PC12DBL = 62, R_390_PLT12DBL = 63, R_390_PC24DBL = 64, R_390_PLT24DBL = 65, }; enum : u32 { R_68K_NONE = 0, R_68K_32 = 1, R_68K_16 = 2, R_68K_8 = 3, R_68K_PC32 = 4, R_68K_PC16 = 5, R_68K_PC8 = 6, R_68K_GOTPCREL32 = 7, R_68K_GOTPCREL16 = 8, R_68K_GOTPCREL8 = 9, R_68K_GOTOFF32 = 10, R_68K_GOTOFF16 = 11, R_68K_GOTOFF8 = 12, R_68K_PLT32 = 13, R_68K_PLT16 = 14, R_68K_PLT8 = 15, R_68K_PLTOFF32 = 16, R_68K_PLTOFF16 = 17, R_68K_PLTOFF8 = 18, R_68K_COPY = 19, R_68K_GLOB_DAT = 20, R_68K_JMP_SLOT = 21, R_68K_RELATIVE = 22, R_68K_TLS_GD32 = 25, R_68K_TLS_GD16 = 26, R_68K_TLS_GD8 = 27, R_68K_TLS_LDM32 = 28, R_68K_TLS_LDM16 = 29, R_68K_TLS_LDM8 = 30, R_68K_TLS_LDO32 = 31, R_68K_TLS_LDO16 = 32, R_68K_TLS_LDO8 = 33, R_68K_TLS_IE32 = 34, R_68K_TLS_IE16 = 35, R_68K_TLS_IE8 = 36, R_68K_TLS_LE32 = 37, R_68K_TLS_LE16 = 38, R_68K_TLS_LE8 = 39, R_68K_TLS_DTPMOD32 = 40, R_68K_TLS_DTPREL32 = 41, R_68K_TLS_TPREL32 = 42, }; enum : u32 { R_SH_NONE = 0, R_SH_DIR32 = 1, R_SH_REL32 = 2, R_SH_DIR8WPN = 3, R_SH_IND12W = 4, R_SH_DIR8WPL = 5, R_SH_DIR8WPZ = 6, R_SH_DIR8BP = 7, R_SH_DIR8W = 8, R_SH_DIR8L = 9, R_SH_TLS_GD_32 = 144, R_SH_TLS_LD_32 = 145, R_SH_TLS_LDO_32 = 146, R_SH_TLS_IE_32 = 147, R_SH_TLS_LE_32 = 148, R_SH_TLS_DTPMOD32 = 149, R_SH_TLS_DTPOFF32 = 150, R_SH_TLS_TPOFF32 = 151, R_SH_GOT32 = 160, R_SH_PLT32 = 161, R_SH_COPY = 162, R_SH_GLOB_DAT = 163, R_SH_JMP_SLOT = 164, R_SH_RELATIVE = 165, R_SH_GOTOFF = 166, R_SH_GOTPC = 167, R_SH_GOTPLT32 = 168, }; enum : u32 { R_LARCH_NONE = 0, R_LARCH_32 = 1, R_LARCH_64 = 2, R_LARCH_RELATIVE = 3, R_LARCH_COPY = 4, R_LARCH_JUMP_SLOT = 5, R_LARCH_TLS_DTPMOD32 = 6, R_LARCH_TLS_DTPMOD64 = 7, R_LARCH_TLS_DTPREL32 = 8, R_LARCH_TLS_DTPREL64 = 9, R_LARCH_TLS_TPREL32 = 10, R_LARCH_TLS_TPREL64 = 11, R_LARCH_IRELATIVE = 12, R_LARCH_TLS_DESC32 = 13, R_LARCH_TLS_DESC64 
= 14, R_LARCH_MARK_LA = 20, R_LARCH_MARK_PCREL = 21, R_LARCH_SOP_PUSH_PCREL = 22, R_LARCH_SOP_PUSH_ABSOLUTE = 23, R_LARCH_SOP_PUSH_DUP = 24, R_LARCH_SOP_PUSH_GPREL = 25, R_LARCH_SOP_PUSH_TLS_TPREL = 26, R_LARCH_SOP_PUSH_TLS_GOT = 27, R_LARCH_SOP_PUSH_TLS_GD = 28, R_LARCH_SOP_PUSH_PLT_PCREL = 29, R_LARCH_SOP_ASSERT = 30, R_LARCH_SOP_NOT = 31, R_LARCH_SOP_SUB = 32, R_LARCH_SOP_SL = 33, R_LARCH_SOP_SR = 34, R_LARCH_SOP_ADD = 35, R_LARCH_SOP_AND = 36, R_LARCH_SOP_IF_ELSE = 37, R_LARCH_SOP_POP_32_S_10_5 = 38, R_LARCH_SOP_POP_32_U_10_12 = 39, R_LARCH_SOP_POP_32_S_10_12 = 40, R_LARCH_SOP_POP_32_S_10_16 = 41, R_LARCH_SOP_POP_32_S_10_16_S2 = 42, R_LARCH_SOP_POP_32_S_5_20 = 43, R_LARCH_SOP_POP_32_S_0_5_10_16_S2 = 44, R_LARCH_SOP_POP_32_S_0_10_10_16_S2 = 45, R_LARCH_SOP_POP_32_U = 46, R_LARCH_ADD8 = 47, R_LARCH_ADD16 = 48, R_LARCH_ADD24 = 49, R_LARCH_ADD32 = 50, R_LARCH_ADD64 = 51, R_LARCH_SUB8 = 52, R_LARCH_SUB16 = 53, R_LARCH_SUB24 = 54, R_LARCH_SUB32 = 55, R_LARCH_SUB64 = 56, R_LARCH_GNU_VTINHERIT = 57, R_LARCH_GNU_VTENTRY = 58, R_LARCH_B16 = 64, R_LARCH_B21 = 65, R_LARCH_B26 = 66, R_LARCH_ABS_HI20 = 67, R_LARCH_ABS_LO12 = 68, R_LARCH_ABS64_LO20 = 69, R_LARCH_ABS64_HI12 = 70, R_LARCH_PCALA_HI20 = 71, R_LARCH_PCALA_LO12 = 72, R_LARCH_PCALA64_LO20 = 73, R_LARCH_PCALA64_HI12 = 74, R_LARCH_GOT_PC_HI20 = 75, R_LARCH_GOT_PC_LO12 = 76, R_LARCH_GOT64_PC_LO20 = 77, R_LARCH_GOT64_PC_HI12 = 78, R_LARCH_GOT_HI20 = 79, R_LARCH_GOT_LO12 = 80, R_LARCH_GOT64_LO20 = 81, R_LARCH_GOT64_HI12 = 82, R_LARCH_TLS_LE_HI20 = 83, R_LARCH_TLS_LE_LO12 = 84, R_LARCH_TLS_LE64_LO20 = 85, R_LARCH_TLS_LE64_HI12 = 86, R_LARCH_TLS_IE_PC_HI20 = 87, R_LARCH_TLS_IE_PC_LO12 = 88, R_LARCH_TLS_IE64_PC_LO20 = 89, R_LARCH_TLS_IE64_PC_HI12 = 90, R_LARCH_TLS_IE_HI20 = 91, R_LARCH_TLS_IE_LO12 = 92, R_LARCH_TLS_IE64_LO20 = 93, R_LARCH_TLS_IE64_HI12 = 94, R_LARCH_TLS_LD_PC_HI20 = 95, R_LARCH_TLS_LD_HI20 = 96, R_LARCH_TLS_GD_PC_HI20 = 97, R_LARCH_TLS_GD_HI20 = 98, R_LARCH_32_PCREL = 99, R_LARCH_RELAX = 100, R_LARCH_DELETE 
= 101, R_LARCH_ALIGN = 102, R_LARCH_PCREL20_S2 = 103, R_LARCH_CFA = 104, R_LARCH_ADD6 = 105, R_LARCH_SUB6 = 106, R_LARCH_ADD_ULEB128 = 107, R_LARCH_SUB_ULEB128 = 108, R_LARCH_64_PCREL = 109, R_LARCH_CALL36 = 110, R_LARCH_TLS_DESC_PC_HI20 = 111, R_LARCH_TLS_DESC_PC_LO12 = 112, R_LARCH_TLS_DESC64_PC_LO20 = 113, R_LARCH_TLS_DESC64_PC_HI12 = 114, R_LARCH_TLS_DESC_HI20 = 115, R_LARCH_TLS_DESC_LO12 = 116, R_LARCH_TLS_DESC64_LO20 = 117, R_LARCH_TLS_DESC64_HI12 = 118, R_LARCH_TLS_DESC_LD = 119, R_LARCH_TLS_DESC_CALL = 120, R_LARCH_TLS_LE_HI20_R = 121, R_LARCH_TLS_LE_ADD_R = 122, R_LARCH_TLS_LE_LO12_R = 123, R_LARCH_TLS_LD_PCREL20_S2 = 124, R_LARCH_TLS_GD_PCREL20_S2 = 125, R_LARCH_TLS_DESC_PCREL20_S2 = 126, }; // Returns true if a given relocation is of type used for direct // function call. template inline bool is_func_call_rel(const ElfRel &r) { for (u32 r_type : E::R_FUNCALL) if (r.r_type == r_type) return true; return false; } // // DWARF data types // enum : u32 { DW_EH_PE_absptr = 0, DW_EH_PE_omit = 0xff, DW_EH_PE_uleb128 = 0x01, DW_EH_PE_udata2 = 0x02, DW_EH_PE_udata4 = 0x03, DW_EH_PE_udata8 = 0x04, DW_EH_PE_signed = 0x08, DW_EH_PE_sleb128 = 0x09, DW_EH_PE_sdata2 = 0x0a, DW_EH_PE_sdata4 = 0x0b, DW_EH_PE_sdata8 = 0x0c, DW_EH_PE_pcrel = 0x10, DW_EH_PE_textrel = 0x20, DW_EH_PE_datarel = 0x30, DW_EH_PE_funcrel = 0x40, DW_EH_PE_aligned = 0x50, }; enum : u32 { DW_AT_low_pc = 0x11, DW_AT_high_pc = 0x12, DW_AT_producer = 0x25, DW_AT_ranges = 0x55, DW_AT_addr_base = 0x73, DW_AT_rnglists_base = 0x74, }; enum : u32 { DW_TAG_compile_unit = 0x11, DW_TAG_skeleton_unit = 0x4a, }; enum : u32 { DW_UT_compile = 0x01, DW_UT_partial = 0x03, DW_UT_skeleton = 0x04, DW_UT_split_compile = 0x05, }; enum : u32 { DW_FORM_addr = 0x01, DW_FORM_block2 = 0x03, DW_FORM_block4 = 0x04, DW_FORM_data2 = 0x05, DW_FORM_data4 = 0x06, DW_FORM_data8 = 0x07, DW_FORM_string = 0x08, DW_FORM_block = 0x09, DW_FORM_block1 = 0x0a, DW_FORM_data1 = 0x0b, DW_FORM_flag = 0x0c, DW_FORM_sdata = 0x0d, DW_FORM_strp = 
0x0e, DW_FORM_udata = 0x0f, DW_FORM_ref_addr = 0x10, DW_FORM_ref1 = 0x11, DW_FORM_ref2 = 0x12, DW_FORM_ref4 = 0x13, DW_FORM_ref8 = 0x14, DW_FORM_ref_udata = 0x15, DW_FORM_indirect = 0x16, DW_FORM_sec_offset = 0x17, DW_FORM_exprloc = 0x18, DW_FORM_flag_present = 0x19, DW_FORM_strx = 0x1a, DW_FORM_addrx = 0x1b, DW_FORM_ref_sup4 = 0x1c, DW_FORM_strp_sup = 0x1d, DW_FORM_data16 = 0x1e, DW_FORM_line_strp = 0x1f, DW_FORM_ref_sig8 = 0x20, DW_FORM_implicit_const = 0x21, DW_FORM_loclistx = 0x22, DW_FORM_rnglistx = 0x23, DW_FORM_ref_sup8 = 0x24, DW_FORM_strx1 = 0x25, DW_FORM_strx2 = 0x26, DW_FORM_strx3 = 0x27, DW_FORM_strx4 = 0x28, DW_FORM_addrx1 = 0x29, DW_FORM_addrx2 = 0x2a, DW_FORM_addrx3 = 0x2b, DW_FORM_addrx4 = 0x2c, }; enum : u32 { DW_RLE_end_of_list = 0x00, DW_RLE_base_addressx = 0x01, DW_RLE_startx_endx = 0x02, DW_RLE_startx_length = 0x03, DW_RLE_offset_pair = 0x04, DW_RLE_base_address = 0x05, DW_RLE_start_end = 0x06, DW_RLE_start_length = 0x07, }; // // ELF types // template using I16 = std::conditional_t; template using I32 = std::conditional_t; template using I64 = std::conditional_t; template using U16 = std::conditional_t; template using U24 = std::conditional_t; template using U32 = std::conditional_t; template using U64 = std::conditional_t; template using Word = std::conditional_t, U32>; template using SWord = std::conditional_t, I32>; template requires E::is_64 struct ElfSym { bool is_undef() const { return st_shndx == SHN_UNDEF; } bool is_abs() const { return st_shndx == SHN_ABS; } bool is_common() const { return st_shndx == SHN_COMMON; } bool is_weak() const { return st_bind == STB_WEAK; } bool is_undef_weak() const { return is_undef() && is_weak(); } bool ppc64_preserves_r2() const { return ppc64_local_entry != 1; } bool ppc64_uses_toc() const { return ppc64_local_entry > 1; } U32 st_name; #ifdef __LITTLE_ENDIAN__ u8 st_type : 4; u8 st_bind : 4; union { u8 st_visibility : 2; struct { u8 : 7; u8 arm64_variant_pcs : 1; }; struct { u8 : 7; u8 riscv_variant_cc 
: 1; }; struct { u8 : 5; u8 ppc64_local_entry : 3; }; }; #else u8 st_bind : 4; u8 st_type : 4; union { struct { u8 : 6; u8 st_visibility : 2; }; u8 arm64_variant_pcs : 1; u8 riscv_variant_cc : 1; u8 ppc64_local_entry : 3; }; #endif U16 st_shndx; U64 st_value; U64 st_size; }; template requires (!E::is_64) struct ElfSym { bool is_undef() const { return st_shndx == SHN_UNDEF; } bool is_abs() const { return st_shndx == SHN_ABS; } bool is_common() const { return st_shndx == SHN_COMMON; } bool is_weak() const { return st_bind == STB_WEAK; } bool is_undef_weak() const { return is_undef() && is_weak(); } U32 st_name; U32 st_value; U32 st_size; #ifdef __LITTLE_ENDIAN__ u8 st_type : 4; u8 st_bind : 4; union { u8 st_visibility : 2; struct { u8 : 7; u8 riscv_variant_cc : 1; }; }; #else u8 st_bind : 4; u8 st_type : 4; union { struct { u8 : 6; u8 st_visibility : 2; }; u8 riscv_variant_cc : 1; }; #endif U16 st_shndx; }; template struct ElfShdr { U32 sh_name; U32 sh_type; Word sh_flags; Word sh_addr; Word sh_offset; Word sh_size; U32 sh_link; U32 sh_info; Word sh_addralign; Word sh_entsize; }; template struct ElfEhdr { u8 e_ident[16]; U16 e_type; U16 e_machine; U32 e_version; Word e_entry; Word e_phoff; Word e_shoff; U32 e_flags; U16 e_ehsize; U16 e_phentsize; U16 e_phnum; U16 e_shentsize; U16 e_shnum; U16 e_shstrndx; }; template requires E::is_64 struct ElfPhdr { U32 p_type; U32 p_flags; U64 p_offset; U64 p_vaddr; U64 p_paddr; U64 p_filesz; U64 p_memsz; U64 p_align; }; template requires (!E::is_64) struct ElfPhdr { U32 p_type; U32 p_offset; U32 p_vaddr; U32 p_paddr; U32 p_filesz; U32 p_memsz; U32 p_flags; U32 p_align; }; // Depending on the target, ElfRel may or may not contain r_addend member. // The relocation record containing r_addend is called RELA, and that // without r_addend is called REL. // // If REL, relocation addends are stored as parts of section contents. // That means we add a computed value to an existing value when writing a // relocated value if REL. 
If RELA, we just overwrite an existing value // with a newly computed value. // // We don't want to have too many `if (REL)`s and `if (RELA)`s in our // codebase, so ElfRel always takes r_addend as a constructor argument. // If it's REL, the argument will simply be ignored. template requires (E::is_le && E::is_rela) struct ElfRel { ElfRel() = default; ElfRel(u64 offset, u32 type, u32 sym, i64 addend) : r_offset(offset), r_type(type), r_sym(sym), r_addend(addend) {} Word r_offset; std::conditional_t, u8> r_type; std::conditional_t, U24> r_sym; SWord r_addend; }; template requires (!E::is_le && E::is_rela) struct ElfRel { ElfRel() = default; ElfRel(u64 offset, u32 type, u32 sym, i64 addend) : r_offset(offset), r_sym(sym), r_type(type), r_addend(addend) {} Word r_offset; std::conditional_t, U24> r_sym; std::conditional_t, u8> r_type; SWord r_addend; }; template requires (E::is_le && !E::is_rela) struct ElfRel { ElfRel() = default; ElfRel(u64 offset, u32 type, u32 sym, i64 addend = 0) : r_offset(offset), r_type(type), r_sym(sym) {} Word r_offset; std::conditional_t, u8> r_type; std::conditional_t, U24> r_sym; }; template requires (!E::is_le && !E::is_rela) struct ElfRel { ElfRel() = default; ElfRel(u64 offset, u32 type, u32 sym, i64 addend = 0) : r_offset(offset), r_sym(sym), r_type(type) {} Word r_offset; std::conditional_t, U24> r_sym; std::conditional_t, u8> r_type; }; template struct ElfDyn { Word d_tag; Word d_val; }; template struct ElfVerneed { U16 vn_version; U16 vn_cnt; U32 vn_file; U32 vn_aux; U32 vn_next; }; template struct ElfVernaux { U32 vna_hash; U16 vna_flags; U16 vna_other; U32 vna_name; U32 vna_next; }; template struct ElfVerdef { U16 vd_version; U16 vd_flags; U16 vd_ndx; U16 vd_cnt; U32 vd_hash; U32 vd_aux; U32 vd_next; }; template struct ElfVerdaux { U32 vda_name; U32 vda_next; }; template requires E::is_64 struct ElfChdr { U32 ch_type; U32 ch_reserved; U64 ch_size; U64 ch_addralign; }; template requires (!E::is_64) struct ElfChdr { U32 ch_type; U32 
ch_size; U32 ch_addralign; }; template struct ElfNhdr { U32 n_namesz; U32 n_descsz; U32 n_type; }; // // Target-specific ELF data types // template <> struct ElfRel { ElfRel() = default; ElfRel(u64 offset, u32 type, u32 sym, i64 addend) : r_offset(offset), r_sym(sym), r_type_data(0), r_type(type), r_addend(addend) {} ub64 r_offset; ub32 r_sym; ub24 r_type_data; // SPARC-specific: used for R_SPARC_OLO10 u8 r_type; ib64 r_addend; }; template <> struct ElfRel { ElfRel() = default; // Addend is ignored except for base relocations because even though // SH4 is RELA, r_addend is ignored in most cases and works as if it // were REL. ElfRel(u64 offset, u32 type, u32 sym, i64 addend) : r_offset(offset), r_type(type), r_sym(sym), r_addend(sym ? 0 : addend) {} ul32 r_offset; u8 r_type; ul24 r_sym; il32 r_addend; }; template <> struct ElfRel { ElfRel() = default; ElfRel(u64 offset, u32 type, u32 sym, i64 addend) : r_offset(offset), r_sym(sym), r_type(type), r_addend(sym ? 0 : addend) {} ub32 r_offset; ub24 r_sym; u8 r_type; ib32 r_addend; }; // // Machine descriptions // template concept supports_ifunc = requires { E::R_IRELATIVE; }; template concept supports_tlsdesc = requires { E::R_TLSDESC; }; template concept needs_thunk = requires { E::thunk_size; }; template concept is_x86_64 = std::same_as; template concept is_i386 = std::same_as; template concept is_arm64le = std::same_as; template concept is_arm64be = std::same_as; template concept is_arm32le = std::same_as; template concept is_arm32be = std::same_as; template concept is_rv64le = std::same_as; template concept is_rv64be = std::same_as; template concept is_rv32le = std::same_as; template concept is_rv32be = std::same_as; template concept is_ppc32 = std::same_as; template concept is_ppc64v1 = std::same_as; template concept is_ppc64v2 = std::same_as; template concept is_s390x = std::same_as; template concept is_sparc64 = std::same_as; template concept is_m68k = std::same_as; template concept is_sh4le = std::same_as; 
template concept is_sh4be = std::same_as; template concept is_loongarch64 = std::same_as; template concept is_loongarch32 = std::same_as; template concept is_x86 = is_x86_64 || is_i386; template concept is_arm32 = is_arm32le || is_arm32be; template concept is_arm64 = is_arm64le || is_arm64be; template concept is_arm = is_arm64 || is_arm32; template concept is_rv64 = is_rv64le || is_rv64be; template concept is_rv32 = is_rv32le || is_rv32be; template concept is_riscv = is_rv64 || is_rv32; template concept is_ppc64 = is_ppc64v1 || is_ppc64v2; template concept is_ppc = is_ppc64 || is_ppc32; template concept is_sparc = is_sparc64; template concept is_sh4 = is_sh4le || is_sh4be; template concept is_loongarch = is_loongarch64 || is_loongarch32; struct X86_64 { static constexpr std::string_view name = "x86_64"; static constexpr bool is_64 = true; static constexpr bool is_le = true; static constexpr bool is_rela = true; static constexpr u32 page_size = 4096; static constexpr u32 e_machine = EM_X86_64; static constexpr u32 plt_hdr_size = 32; static constexpr u32 plt_size = 16; static constexpr u32 pltgot_size = 8; static constexpr u8 trap[] = { 0xcc }; // int3 static constexpr u32 R_COPY = R_X86_64_COPY; static constexpr u32 R_GLOB_DAT = R_X86_64_GLOB_DAT; static constexpr u32 R_JUMP_SLOT = R_X86_64_JUMP_SLOT; static constexpr u32 R_ABS = R_X86_64_64; static constexpr u32 R_RELATIVE = R_X86_64_RELATIVE; static constexpr u32 R_IRELATIVE = R_X86_64_IRELATIVE; static constexpr u32 R_DTPOFF = R_X86_64_DTPOFF64; static constexpr u32 R_TPOFF = R_X86_64_TPOFF64; static constexpr u32 R_DTPMOD = R_X86_64_DTPMOD64; static constexpr u32 R_TLSDESC = R_X86_64_TLSDESC; static constexpr u32 R_FUNCALL[] = { R_X86_64_PLT32, R_X86_64_PLTOFF64 }; }; struct I386 { static constexpr std::string_view name = "i386"; static constexpr bool is_64 = false; static constexpr bool is_le = true; static constexpr bool is_rela = false; static constexpr u32 page_size = 4096; static constexpr u32 e_machine = 
EM_386; static constexpr u32 plt_hdr_size = 16; static constexpr u32 plt_size = 16; static constexpr u32 pltgot_size = 8; static constexpr u8 trap[] = { 0xcc }; // int3 static constexpr u32 R_COPY = R_386_COPY; static constexpr u32 R_GLOB_DAT = R_386_GLOB_DAT; static constexpr u32 R_JUMP_SLOT = R_386_JUMP_SLOT; static constexpr u32 R_ABS = R_386_32; static constexpr u32 R_RELATIVE = R_386_RELATIVE; static constexpr u32 R_IRELATIVE = R_386_IRELATIVE; static constexpr u32 R_DTPOFF = R_386_TLS_DTPOFF32; static constexpr u32 R_TPOFF = R_386_TLS_TPOFF; static constexpr u32 R_DTPMOD = R_386_TLS_DTPMOD32; static constexpr u32 R_TLSDESC = R_386_TLS_DESC; static constexpr u32 R_FUNCALL[] = { R_386_PLT32 }; }; struct ARM64LE { static constexpr std::string_view name = "arm64"; static constexpr bool is_64 = true; static constexpr bool is_le = true; static constexpr bool is_rela = true; static constexpr u32 page_size = 65536; static constexpr u32 e_machine = EM_AARCH64; static constexpr u32 plt_hdr_size = 32; static constexpr u32 plt_size = 16; static constexpr u32 pltgot_size = 16; static constexpr u32 thunk_hdr_size = 0; static constexpr u32 thunk_size = 32; static constexpr u8 trap[] = { 0x00, 0x7d, 0x20, 0xd4 }; // brk static constexpr u32 R_COPY = R_AARCH64_COPY; static constexpr u32 R_GLOB_DAT = R_AARCH64_GLOB_DAT; static constexpr u32 R_JUMP_SLOT = R_AARCH64_JUMP_SLOT; static constexpr u32 R_ABS = R_AARCH64_ABS64; static constexpr u32 R_RELATIVE = R_AARCH64_RELATIVE; static constexpr u32 R_IRELATIVE = R_AARCH64_IRELATIVE; static constexpr u32 R_DTPOFF = R_AARCH64_TLS_DTPREL64; static constexpr u32 R_TPOFF = R_AARCH64_TLS_TPREL64; static constexpr u32 R_DTPMOD = R_AARCH64_TLS_DTPMOD64; static constexpr u32 R_TLSDESC = R_AARCH64_TLSDESC; static constexpr u32 R_FUNCALL[] = { R_AARCH64_JUMP26, R_AARCH64_CALL26 }; }; struct ARM64BE : ARM64LE { static constexpr std::string_view name = "arm64be"; static constexpr bool is_le = false; }; struct ARM32LE { static constexpr 
std::string_view name = "arm32"; static constexpr bool is_64 = false; static constexpr bool is_le = true; static constexpr bool is_rela = false; static constexpr u32 page_size = 65536; static constexpr u32 e_machine = EM_ARM; static constexpr u32 plt_hdr_size = 32; static constexpr u32 plt_size = 16; static constexpr u32 pltgot_size = 16; static constexpr u32 thunk_hdr_size = 16; static constexpr u32 thunk_size = 16; static constexpr u8 trap[] = { 0xff, 0xde }; // udf static constexpr u32 R_COPY = R_ARM_COPY; static constexpr u32 R_GLOB_DAT = R_ARM_GLOB_DAT; static constexpr u32 R_JUMP_SLOT = R_ARM_JUMP_SLOT; static constexpr u32 R_ABS = R_ARM_ABS32; static constexpr u32 R_RELATIVE = R_ARM_RELATIVE; static constexpr u32 R_IRELATIVE = R_ARM_IRELATIVE; static constexpr u32 R_DTPOFF = R_ARM_TLS_DTPOFF32; static constexpr u32 R_TPOFF = R_ARM_TLS_TPOFF32; static constexpr u32 R_DTPMOD = R_ARM_TLS_DTPMOD32; static constexpr u32 R_TLSDESC = R_ARM_TLS_DESC; static constexpr u32 R_FUNCALL[] = { R_ARM_JUMP24, R_ARM_THM_JUMP24, R_ARM_CALL, R_ARM_THM_CALL, R_ARM_PLT32, }; }; struct ARM32BE : ARM32LE { static constexpr std::string_view name = "arm32be"; static constexpr bool is_le = false; }; struct RV64LE { static constexpr std::string_view name = "riscv64"; static constexpr bool is_64 = true; static constexpr bool is_le = true; static constexpr bool is_rela = true; static constexpr u32 page_size = 4096; static constexpr u32 e_machine = EM_RISCV; static constexpr u32 plt_hdr_size = 32; static constexpr u32 plt_size = 16; static constexpr u32 pltgot_size = 16; static constexpr u8 trap[] = { 0x02, 0x90 }; // c.ebreak static constexpr u32 R_COPY = R_RISCV_COPY; static constexpr u32 R_GLOB_DAT = R_RISCV_64; static constexpr u32 R_JUMP_SLOT = R_RISCV_JUMP_SLOT; static constexpr u32 R_ABS = R_RISCV_64; static constexpr u32 R_RELATIVE = R_RISCV_RELATIVE; static constexpr u32 R_IRELATIVE = R_RISCV_IRELATIVE; static constexpr u32 R_DTPOFF = R_RISCV_TLS_DTPREL64; static constexpr u32 
R_TPOFF = R_RISCV_TLS_TPREL64; static constexpr u32 R_DTPMOD = R_RISCV_TLS_DTPMOD64; static constexpr u32 R_TLSDESC = R_RISCV_TLSDESC; static constexpr u32 R_FUNCALL[] = { R_RISCV_CALL, R_RISCV_CALL_PLT }; }; struct RV64BE : RV64LE { static constexpr std::string_view name = "riscv64be"; static constexpr bool is_le = false; }; struct RV32LE { static constexpr std::string_view name = "riscv32"; static constexpr bool is_64 = false; static constexpr bool is_le = true; static constexpr bool is_rela = true; static constexpr u32 page_size = 4096; static constexpr u32 e_machine = EM_RISCV; static constexpr u32 plt_hdr_size = 32; static constexpr u32 plt_size = 16; static constexpr u32 pltgot_size = 16; static constexpr u8 trap[] = { 0x02, 0x90 }; // c.ebreak static constexpr u32 R_COPY = R_RISCV_COPY; static constexpr u32 R_GLOB_DAT = R_RISCV_32; static constexpr u32 R_JUMP_SLOT = R_RISCV_JUMP_SLOT; static constexpr u32 R_ABS = R_RISCV_32; static constexpr u32 R_RELATIVE = R_RISCV_RELATIVE; static constexpr u32 R_IRELATIVE = R_RISCV_IRELATIVE; static constexpr u32 R_DTPOFF = R_RISCV_TLS_DTPREL32; static constexpr u32 R_TPOFF = R_RISCV_TLS_TPREL32; static constexpr u32 R_DTPMOD = R_RISCV_TLS_DTPMOD32; static constexpr u32 R_TLSDESC = R_RISCV_TLSDESC; static constexpr u32 R_FUNCALL[] = { R_RISCV_CALL, R_RISCV_CALL_PLT }; }; struct RV32BE : RV32LE { static constexpr std::string_view name = "riscv32be"; static constexpr bool is_le = false; }; struct PPC32 { static constexpr std::string_view name = "ppc32"; static constexpr bool is_64 = false; static constexpr bool is_le = false; static constexpr bool is_rela = true; static constexpr u32 page_size = 65536; static constexpr u32 e_machine = EM_PPC; static constexpr u32 plt_hdr_size = 64; static constexpr u32 plt_size = 36; static constexpr u32 pltgot_size = 36; static constexpr u32 thunk_hdr_size = 0; static constexpr u32 thunk_size = 36; static constexpr u8 trap[] = { 0x7f, 0xe0, 0x00, 0x08 }; // trap static constexpr u32 R_COPY 
= R_PPC_COPY; static constexpr u32 R_GLOB_DAT = R_PPC_GLOB_DAT; static constexpr u32 R_JUMP_SLOT = R_PPC_JMP_SLOT; static constexpr u32 R_ABS = R_PPC_ADDR32; static constexpr u32 R_RELATIVE = R_PPC_RELATIVE; static constexpr u32 R_IRELATIVE = R_PPC_IRELATIVE; static constexpr u32 R_DTPOFF = R_PPC_DTPREL32; static constexpr u32 R_TPOFF = R_PPC_TPREL32; static constexpr u32 R_DTPMOD = R_PPC_DTPMOD32; static constexpr u32 R_FUNCALL[] = { R_PPC_REL24, R_PPC_PLTREL24, R_PPC_LOCAL24PC, }; }; struct PPC64 { static constexpr bool is_64 = true; static constexpr bool is_rela = true; static constexpr u32 page_size = 65536; static constexpr u32 e_machine = EM_PPC64; static constexpr u32 R_COPY = R_PPC64_COPY; static constexpr u32 R_GLOB_DAT = R_PPC64_GLOB_DAT; static constexpr u32 R_JUMP_SLOT = R_PPC64_JMP_SLOT; static constexpr u32 R_ABS = R_PPC64_ADDR64; static constexpr u32 R_RELATIVE = R_PPC64_RELATIVE; static constexpr u32 R_IRELATIVE = R_PPC64_IRELATIVE; static constexpr u32 R_DTPOFF = R_PPC64_DTPREL64; static constexpr u32 R_TPOFF = R_PPC64_TPREL64; static constexpr u32 R_DTPMOD = R_PPC64_DTPMOD64; static constexpr u32 R_FUNCALL[] = { R_PPC64_REL24, R_PPC64_REL24_NOTOC }; }; struct PPC64V1 : PPC64 { static constexpr std::string_view name = "ppc64v1"; static constexpr bool is_le = false; static constexpr u32 plt_hdr_size = 44; static constexpr u32 pltgot_size = 0; static constexpr u32 thunk_hdr_size = 0; static constexpr u32 thunk_size = 28; static constexpr u8 trap[] = { 0x7f, 0xe0, 0x00, 0x08 }; // trap }; struct PPC64V2 : PPC64 { static constexpr std::string_view name = "ppc64v2"; static constexpr bool is_le = true; static constexpr u32 plt_hdr_size = 52; static constexpr u32 plt_size = 4; static constexpr u32 pltgot_size = 0; static constexpr u32 thunk_hdr_size = 0; static constexpr u32 thunk_size = 24; static constexpr u8 trap[] = { 0x08, 0x00, 0xe0, 0x7f }; // trap }; struct S390X { static constexpr std::string_view name = "s390x"; static constexpr bool is_64 = 
true; static constexpr bool is_le = false; static constexpr bool is_rela = true; static constexpr u32 page_size = 4096; static constexpr u32 e_machine = EM_S390X; static constexpr u32 plt_hdr_size = 48; static constexpr u32 plt_size = 16; static constexpr u32 pltgot_size = 16; static constexpr u8 trap[] = { 0x00, 0x00 }; // invalid static constexpr u32 R_COPY = R_390_COPY; static constexpr u32 R_GLOB_DAT = R_390_GLOB_DAT; static constexpr u32 R_JUMP_SLOT = R_390_JMP_SLOT; static constexpr u32 R_ABS = R_390_64; static constexpr u32 R_RELATIVE = R_390_RELATIVE; static constexpr u32 R_IRELATIVE = R_390_IRELATIVE; static constexpr u32 R_DTPOFF = R_390_TLS_DTPOFF; static constexpr u32 R_TPOFF = R_390_TLS_TPOFF; static constexpr u32 R_DTPMOD = R_390_TLS_DTPMOD; static constexpr u32 R_FUNCALL[] = { R_390_PLT32DBL }; }; struct SPARC64 { static constexpr std::string_view name = "sparc64"; static constexpr bool is_64 = true; static constexpr bool is_le = false; static constexpr bool is_rela = true; static constexpr u32 page_size = 8192; static constexpr u32 e_machine = EM_SPARC64; static constexpr u32 plt_hdr_size = 128; static constexpr u32 plt_size = 32; static constexpr u32 pltgot_size = 32; static constexpr u8 trap[] = { 0x91, 0xd0, 0x20, 0x05 }; // ta 5 static constexpr u32 R_COPY = R_SPARC_COPY; static constexpr u32 R_GLOB_DAT = R_SPARC_GLOB_DAT; static constexpr u32 R_JUMP_SLOT = R_SPARC_JMP_SLOT; static constexpr u32 R_ABS = R_SPARC_64; static constexpr u32 R_RELATIVE = R_SPARC_RELATIVE; static constexpr u32 R_IRELATIVE = R_SPARC_IRELATIVE; static constexpr u32 R_DTPOFF = R_SPARC_TLS_DTPOFF64; static constexpr u32 R_TPOFF = R_SPARC_TLS_TPOFF64; static constexpr u32 R_DTPMOD = R_SPARC_TLS_DTPMOD64; static constexpr u32 R_FUNCALL[] = { R_SPARC_WPLT30, R_SPARC_WDISP30 }; }; struct M68K { static constexpr std::string_view name = "m68k"; static constexpr bool is_64 = false; static constexpr bool is_le = false; static constexpr bool is_rela = true; static constexpr u32 
page_size = 8192; static constexpr u32 e_machine = EM_68K; static constexpr u32 plt_hdr_size = 18; static constexpr u32 plt_size = 14; static constexpr u32 pltgot_size = 8; static constexpr u8 trap[] = { 0x4a, 0xfc }; // illegal static constexpr u32 R_COPY = R_68K_COPY; static constexpr u32 R_GLOB_DAT = R_68K_GLOB_DAT; static constexpr u32 R_JUMP_SLOT = R_68K_JMP_SLOT; static constexpr u32 R_ABS = R_68K_32; static constexpr u32 R_RELATIVE = R_68K_RELATIVE; static constexpr u32 R_DTPOFF = R_68K_TLS_DTPREL32; static constexpr u32 R_TPOFF = R_68K_TLS_TPREL32; static constexpr u32 R_DTPMOD = R_68K_TLS_DTPMOD32; static constexpr u32 R_FUNCALL[] = { R_68K_PLT32 }; }; struct SH4LE { static constexpr std::string_view name = "sh4"; static constexpr bool is_64 = false; static constexpr bool is_le = true; static constexpr bool is_rela = true; static constexpr u32 page_size = 4096; static constexpr u32 e_machine = EM_SH; static constexpr u32 plt_hdr_size = 16; static constexpr u32 plt_size = 16; static constexpr u32 pltgot_size = 12; static constexpr u8 trap[] = { 0xfd, 0xff }; // illegal static constexpr u32 R_COPY = R_SH_COPY; static constexpr u32 R_GLOB_DAT = R_SH_GLOB_DAT; static constexpr u32 R_JUMP_SLOT = R_SH_JMP_SLOT; static constexpr u32 R_ABS = R_SH_DIR32; static constexpr u32 R_RELATIVE = R_SH_RELATIVE; static constexpr u32 R_DTPOFF = R_SH_TLS_DTPOFF32; static constexpr u32 R_TPOFF = R_SH_TLS_TPOFF32; static constexpr u32 R_DTPMOD = R_SH_TLS_DTPMOD32; static constexpr u32 R_FUNCALL[] = { R_SH_PLT32 }; }; struct SH4BE : SH4LE { static constexpr std::string_view name = "sh4be"; static constexpr bool is_le = false; static constexpr u8 trap[] = { 0xff, 0xfd }; // illegal }; struct LOONGARCH64 { static constexpr std::string_view name = "loongarch64"; static constexpr bool is_64 = true; static constexpr bool is_le = true; static constexpr bool is_rela = true; static constexpr u32 page_size = 65536; static constexpr u32 e_machine = EM_LOONGARCH; static constexpr u32 
plt_hdr_size = 32; static constexpr u32 plt_size = 16; static constexpr u32 pltgot_size = 16; static constexpr u8 trap[] = { 0x00, 0x00, 0x2a, 0x00 }; // break 0 static constexpr u32 R_COPY = R_LARCH_COPY; static constexpr u32 R_GLOB_DAT = R_LARCH_64; static constexpr u32 R_JUMP_SLOT = R_LARCH_JUMP_SLOT; static constexpr u32 R_ABS = R_LARCH_64; static constexpr u32 R_RELATIVE = R_LARCH_RELATIVE; static constexpr u32 R_IRELATIVE = R_LARCH_IRELATIVE; static constexpr u32 R_DTPOFF = R_LARCH_TLS_DTPREL64; static constexpr u32 R_TPOFF = R_LARCH_TLS_TPREL64; static constexpr u32 R_DTPMOD = R_LARCH_TLS_DTPMOD64; static constexpr u32 R_TLSDESC = R_LARCH_TLS_DESC64; static constexpr u32 R_FUNCALL[] = { R_LARCH_B26, R_LARCH_CALL36 }; }; struct LOONGARCH32 { static constexpr std::string_view name = "loongarch32"; static constexpr bool is_64 = false; static constexpr bool is_le = true; static constexpr bool is_rela = true; static constexpr u32 page_size = 65536; static constexpr u32 e_machine = EM_LOONGARCH; static constexpr u32 plt_hdr_size = 32; static constexpr u32 plt_size = 16; static constexpr u32 pltgot_size = 16; static constexpr u8 trap[] = { 0x00, 0x00, 0x2a, 0x00 }; // break 0 static constexpr u32 R_COPY = R_LARCH_COPY; static constexpr u32 R_GLOB_DAT = R_LARCH_32; static constexpr u32 R_JUMP_SLOT = R_LARCH_JUMP_SLOT; static constexpr u32 R_ABS = R_LARCH_32; static constexpr u32 R_RELATIVE = R_LARCH_RELATIVE; static constexpr u32 R_IRELATIVE = R_LARCH_IRELATIVE; static constexpr u32 R_DTPOFF = R_LARCH_TLS_DTPREL32; static constexpr u32 R_TPOFF = R_LARCH_TLS_TPREL32; static constexpr u32 R_DTPMOD = R_LARCH_TLS_DTPMOD32; static constexpr u32 R_TLSDESC = R_LARCH_TLS_DESC32; static constexpr u32 R_FUNCALL[] = { R_LARCH_B26, R_LARCH_CALL36 }; }; } // namespace mold ================================================ FILE: src/entry.cc ================================================ #include "config.h" #include "mold.h" #include "mold-git-hash.h" #if MOLD_USE_MIMALLOC # 
include #endif #if MOLD_USE_SYSTEM_MIMALLOC // Including mimalloc-new-delete.h overrides the new/delete operators. // We need it only when using mimalloc as a dynamic library. // This header should be included in only one source file, so we do // it in this file. # include #endif std::string mold::mold_version = #ifdef MOLD_GIT_HASH "mold " MOLD_VERSION " (" MOLD_GIT_HASH "; compatible with GNU ld)"; #else "mold " MOLD_VERSION " (compatible with GNU ld)"; #endif int main(int argc, char **argv) { #if MOLD_USE_MIMALLOC // Silence mimalloc warnings that users can ignore mi_option_disable(mi_option_verbose); mi_option_disable(mi_option_show_errors); #endif return mold::mold_main(argc, argv); } ================================================ FILE: src/error.cc ================================================ #include "mold.h" namespace mold { static std::string_view fatal_mono = "mold: fatal: "; static std::string_view fatal_color = "mold: \033[0;1;31mfatal:\033[0m "; static std::string_view error_mono = "mold: error: "; static std::string_view error_color = "mold: \033[0;1;31merror:\033[0m "; static std::string_view warning_mono = "mold: warning: "; static std::string_view warning_color = "mold: \033[0;1;35mwarning:\033[0m "; template Fatal::Fatal(Context &ctx) { out << (ctx.arg.color_diagnostics ? fatal_color : fatal_mono); } template [[noreturn]] Fatal::~Fatal() { out.emit(); cleanup(); _exit(1); } template Error::Error(Context &ctx) { if (ctx.arg.noinhibit_exec) { out << (ctx.arg.color_diagnostics ? warning_color : warning_mono); } else { out << (ctx.arg.color_diagnostics ? error_color : error_mono); ctx.has_error = true; } } template Warn::Warn(Context &ctx) { if (ctx.arg.suppress_warnings) return; out.emplace(std::cerr); if (ctx.arg.fatal_warnings) { *out << (ctx.arg.color_diagnostics ? error_color : error_mono); ctx.has_error = true; } else { *out << (ctx.arg.color_diagnostics ? 
warning_color : warning_mono); } } using E = MOLD_TARGET; template class Fatal; template class Error; template class Warn; } // namespace mold ================================================ FILE: src/filetype.cc ================================================ #include "mold.h" namespace mold { static bool is_text_file(MappedFile *mf) { auto istext = [](char c) { return isprint(c) || c == '\n' || c == '\t'; }; u8 *data = mf->data; return mf->size >= 4 && istext(data[0]) && istext(data[1]) && istext(data[2]) && istext(data[3]); } template static bool is_gcc_lto_obj(MappedFile *mf, bool has_plugin) { const char *data = mf->get_contents().data(); ElfEhdr &ehdr = *(ElfEhdr *)data; ElfShdr *sh_begin = (ElfShdr *)(data + ehdr.e_shoff); std::span> shdrs{(ElfShdr *)(data + ehdr.e_shoff), ehdr.e_shnum}; // e_shstrndx is a 16-bit field. If .shstrtab's section index is // too large, the actual number is stored to sh_link field. i64 shstrtab_idx = (ehdr.e_shstrndx == SHN_XINDEX) ? sh_begin->sh_link : ehdr.e_shstrndx; for (ElfShdr &sec : shdrs) { // GCC FAT LTO objects contain both regular ELF sections and GCC- // specific LTO sections, so that they can be linked as LTO objects if // the LTO linker plugin is available and falls back as regular // objects otherwise. GCC FAT LTO object can be identified by the // presence of `.gcc.lto_.symtab` section. if (has_plugin) { std::string_view name = data + shdrs[shstrtab_idx].sh_offset + sec.sh_name; if (name.starts_with(".gnu.lto_.symtab.")) return true; } if (sec.sh_type != SHT_SYMTAB) continue; // GCC non-FAT LTO object contains only sections symbols followed by // a common symbol whose name is `__gnu_lto_slim` (or `__gnu_lto_v1` // for older GCC releases). 
std::span> elf_syms{(ElfSym *)(data + sec.sh_offset), (size_t)sec.sh_size / sizeof(ElfSym)}; auto skip = [](u8 type) { return type == STT_NOTYPE || type == STT_FILE || type == STT_SECTION; }; i64 i = 1; while (i < elf_syms.size() && skip(elf_syms[i].st_type)) i++; if (i < elf_syms.size() && elf_syms[i].st_shndx == SHN_COMMON) { std::string_view name = data + shdrs[sec.sh_link].sh_offset + elf_syms[i].st_name; if (name.starts_with("__gnu_lto_")) return true; } break; } return false; } template FileType get_file_type(Context &ctx, MappedFile *mf) { std::string_view data = mf->get_contents(); bool has_plugin = !ctx.arg.plugin.empty(); if (data.empty()) return FileType::EMPTY; if (data.starts_with("\177ELF")) { u8 byte_order = ((ElfEhdr *)data.data())->e_ident[EI_DATA]; if (byte_order == ELFDATA2LSB) { auto &ehdr = *(ElfEhdr *)data.data(); if (ehdr.e_type == ET_REL) { if (ehdr.e_ident[EI_CLASS] == ELFCLASS32) { if (is_gcc_lto_obj(mf, has_plugin)) return FileType::GCC_LTO_OBJ; } else { if (is_gcc_lto_obj(mf, has_plugin)) return FileType::GCC_LTO_OBJ; } return FileType::ELF_OBJ; } if (ehdr.e_type == ET_DYN) return FileType::ELF_DSO; } else { auto &ehdr = *(ElfEhdr *)data.data(); if (ehdr.e_type == ET_REL) { if (ehdr.e_ident[EI_CLASS] == ELFCLASS32) { if (is_gcc_lto_obj(mf, has_plugin)) return FileType::GCC_LTO_OBJ; } else { if (is_gcc_lto_obj(mf, has_plugin)) return FileType::GCC_LTO_OBJ; } return FileType::ELF_OBJ; } if (ehdr.e_type == ET_DYN) return FileType::ELF_DSO; } return FileType::UNKNOWN; } if (data.starts_with("!\n")) return FileType::AR; if (data.starts_with("!\n")) return FileType::THIN_AR; if (is_text_file(mf)) return FileType::TEXT; if (data.starts_with("\xde\xc0\x17\x0b")) return FileType::LLVM_BITCODE; if (data.starts_with("BC\xc0\xde")) return FileType::LLVM_BITCODE; return FileType::UNKNOWN; } static std::string_view get_elf_type(u8 *buf) { bool is_le = (buf[EI_DATA] == ELFDATA2LSB); bool is_64 = (buf[EI_CLASS] == ELFCLASS64); auto *ehdr_le = (ElfEhdr 
*)buf; auto *ehdr_be = (ElfEhdr *)buf; switch (is_le ? ehdr_le->e_machine : ehdr_be->e_machine) { case EM_386: return I386::name; case EM_X86_64: return X86_64::name; case EM_ARM: return is_le ? ARM32LE::name : ARM32BE::name; case EM_AARCH64: return is_le ? ARM64LE::name : ARM64BE::name; case EM_RISCV: if (is_le) return is_64 ? RV64LE::name : RV32LE::name; return is_64 ? RV64BE::name : RV32BE::name; case EM_PPC: return PPC32::name; case EM_PPC64: return is_le ? PPC64V2::name : PPC64V1::name; case EM_S390X: return S390X::name; case EM_SPARC64: return SPARC64::name; case EM_68K: return M68K::name; case EM_SH: return is_le ? SH4LE::name : SH4BE::name; case EM_LOONGARCH: return is_64 ? LOONGARCH64::name : LOONGARCH32::name; default: return ""; } } // Reads the beginning of a given file and returns its machine type as a // target name (e.g. X86_64::name or I386::name). For an archive, the // first ELF object member decides. Returns "" if no type is found. template std::string_view get_machine_type(Context &ctx, ReaderContext &rctx, MappedFile *mf) { switch (get_file_type(ctx, mf)) { case FileType::ELF_OBJ: case FileType::ELF_DSO: case FileType::GCC_LTO_OBJ: return get_elf_type(mf->data); case FileType::AR: for (MappedFile *child : read_fat_archive_members(ctx, mf)) if (FileType ty = get_file_type(ctx, child); ty == FileType::ELF_OBJ || ty == FileType::GCC_LTO_OBJ) return get_elf_type(child->data); return ""; case FileType::THIN_AR: for (MappedFile *child : read_thin_archive_members(ctx, mf)) if (FileType ty = get_file_type(ctx, child); ty == FileType::ELF_OBJ || ty == FileType::GCC_LTO_OBJ) return get_elf_type(child->data); return ""; case FileType::TEXT: return Script(ctx, rctx, mf).get_script_output_type(); default: return ""; } } using E = MOLD_TARGET; template FileType get_file_type(Context &, MappedFile *); template std::string_view get_machine_type(Context &, ReaderContext &, MappedFile *); } // namespace mold ================================================ FILE: src/gc-sections.cc ================================================ // This file implements a mark-sweep garbage 
collector for -gc-sections. // In this algorithm, vertices are sections and edges are relocations. // Any section that is reachable from a root section is considered alive. #include "mold.h" #include #include #include namespace mold { template static bool should_keep(const InputSection &isec) { u32 type = isec.shdr().sh_type; u32 flags = isec.shdr().sh_flags; std::string_view name = isec.name(); if constexpr (is_ppc32) if (name == ".got2") return true; return (flags & SHF_GNU_RETAIN) || type == SHT_NOTE || type == SHT_INIT_ARRAY || type == SHT_FINI_ARRAY || type == SHT_PREINIT_ARRAY || name.starts_with(".ctors") || name.starts_with(".dtors") || name.starts_with(".init") || name.starts_with(".fini") || is_c_identifier(name); } template static bool mark_section(InputSection *isec) { return isec && isec->is_alive && !isec->is_visited.test_and_set(); } template static tbb::concurrent_vector *> collect_root_set(Context &ctx) { Timer t(ctx, "collect_root_set"); tbb::concurrent_vector *> rootset; auto enqueue_section = [&](InputSection *isec) { if (mark_section(isec)) rootset.push_back(isec); }; auto enqueue_symbol = [&](Symbol *sym) { if (sym) { if (SectionFragment *frag = sym->get_frag()) frag->is_alive = true; else enqueue_section(sym->get_input_section()); } }; // Add sections that are not subject to garbage collection. tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { for (std::unique_ptr> &isec : file->sections) { if (!isec || !isec->is_alive) continue; // --gc-sections discards only SHF_ALLOC sections. If you want to // reduce the amount of non-memory-mapped segments, you should // use `strip` command, compile without debug info or use // --strip-all linker option. 
if (!(isec->shdr().sh_flags & SHF_ALLOC)) { isec->is_visited = true; continue; } if (should_keep(*isec)) enqueue_section(isec.get()); } }); // Add sections containing gc root or exported symbols tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { for (Symbol *sym : file->symbols) if (sym->file == file && (sym->gc_root || sym->is_exported)) enqueue_symbol(sym); }); // .eh_frame consists of variable-length records called CIE and FDE // records, and they are a unit of inclusion or exclusion. // We just keep all CIEs and everything that is referenced by them. tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { for (CieRecord &cie : file->cies) for (const ElfRel &rel : cie.get_rels()) enqueue_symbol(file->symbols[rel.r_sym]); }); return rootset; } template static void visit(Context &ctx, InputSection *isec, tbb::feeder *> &feeder, i64 depth) { assert(isec->is_visited); // Mark a section alive. For better performance, we don't call // `feeder.add` too often. auto mark = [&](InputSection *sec) { if (mark_section(sec)) { if (depth < 3) visit(ctx, sec, feeder, depth + 1); else feeder.add(sec); } }; // If this is a text section, .eh_frame may contain records // describing how to handle exceptions for that function. // We want to keep associated .eh_frame records. for (FdeRecord &fde : isec->get_fdes()) for (const ElfRel &rel : fde.get_rels(isec->file).subspan(1)) if (Symbol *sym = isec->file.symbols[rel.r_sym]) mark(sym->get_input_section()); for (const ElfRel &rel : isec->get_rels(ctx)) { // Symbol can refer to either a section fragment or an input section. 
Symbol &sym = *isec->file.symbols[rel.r_sym]; if (SectionFragment *frag = sym.get_frag()) frag->is_alive = true; else mark(sym.get_input_section()); } if constexpr (is_arm32) mark(isec->extra.exidx); } // Mark all reachable sections template static void mark(Context &ctx, tbb::concurrent_vector *> &rootset) { Timer t(ctx, "mark"); tbb::parallel_for_each(rootset, [&](InputSection *isec, tbb::feeder *> &feeder) { visit(ctx, isec, feeder, 0); }); } // Remove unreachable sections template static void sweep(Context &ctx) { Timer t(ctx, "sweep"); std::vector *>> sections(ctx.objs.size()); tbb::parallel_for((i64)0, (i64)ctx.objs.size(), [&](i64 i) { ObjectFile &file = *ctx.objs[i]; for (std::unique_ptr> &isec : file.sections) { if (isec && isec->is_alive && !isec->is_visited) { isec->kill(); sections[i].push_back(isec.get()); } } }); std::string &path = ctx.arg.print_gc_sections; if (!path.empty()) { std::ostream *out = &std::cout; std::ofstream file; if (path != "-") { file.open(path); if (file.fail()) Fatal(ctx) << "--print-gc-sections: cannot open " << path << ": " << errno_string(); out = &file; } i64 saved_bytes = 0; for (std::span *> vec : sections) for (InputSection *isec : vec) *out << "removing unused section " << *isec << '\n'; *out << "GC saved " << saved_bytes << " bytes\n"; } } template void gc_sections(Context &ctx) { Timer t(ctx, "gc"); tbb::concurrent_vector *> rootset = collect_root_set(ctx); mark(ctx, rootset); sweep(ctx); } using E = MOLD_TARGET; template void gc_sections(Context &ctx); } // namespace mold ================================================ FILE: src/gdb-index.cc ================================================ // This file contains code to read DWARF debug info to create .gdb_index. // // .gdb_index is an optional section to speed up GNU debugger. It contains // two maps: 1) a map from function/variable/type names to compunits, and // 2) a map from function address ranges to compunits. 
gdb uses these // maps to quickly find a compunit given a name or an instruction pointer. // // (Terminology: a compilation unit, often abbreviated as compunit or // CU, is a unit of debug info. An input .debug_info section usually // contains one compunit, and thus an output .debug_info contains as // many compunits as the number of input files.) // // .gdb_index is not mandatory. All the information in .gdb_index is // also in other debug info sections. You can actually create an // executable without .gdb_index and later add it using the // `gdb-add-index` post-processing tool that comes with gdb. // // Post-relocated debug section contents are needed to create a // .gdb_index. Therefore, we create it after relocating all the other // sections. The size of the section is also hard to estimate before // applying relocations to debug info sections, so a .gdb_index is // placed at the very end of the output file, even after the section // header. // // The mapping from names to compunits is 1:n while the mapping from // address ranges to compunits is 1:1. That is, two object files may // define the same type name, while there should be no two functions // that overlap with each other in memory. // // .gdb_index contains an on-disk hash table for names, so gdb can // look up names without loading all strings into memory and constructing // an in-memory hash table. // // Names are in .debug_gnu_pubnames and .debug_gnu_pubtypes input // sections. These sections are created if `-ggnu-pubnames` is given. // Besides names, these sections contain attributes for each name so // that gdb can distinguish type names from function names, for example. // // A compunit contains one or more function address ranges. If an // object file is compiled without -ffunction-sections, it contains // only one .text section and therefore contains a single address range. // Such a range is typically stored directly in the compunit. 
// // If an object file is compiled with -ffunction-sections, it contains // more than one .text section, and it has as many address ranges as // the number of .text sections. Such discontiguous address ranges are // stored to .debug_ranges in DWARF 2/3/4 and .debug_rnglists/.debug_addr // in DWARF 5. // // .debug_info section contains DWARF debug info. Although we don't need // to parse the whole .debug_info section to read address ranges, we // have to do a little bit. DWARF is complicated and often handled using // a library such as libdwarf. But we don't use any library because we // don't want to add an extra run-time dependency just for --gdb-index. // // This page explains the format of .gdb_index: // https://sourceware.org/gdb/onlinedocs/gdb/Index-Section-Format.html #include "mold.h" #include namespace mold { enum DwarfKind { DWARF2_32, DWARF5_32, DWARF2_64, DWARF5_64 }; template struct CuHdrDwarf2_32 { U32 size; U16 version; U32 abbrev_offset; u8 address_size; }; template struct CuHdrDwarf5_32 { U32 size; U16 version; u8 unit_type; u8 address_size; U32 abbrev_offset; }; template struct CuHdrDwarf2_64 { U32 magic; U64 size; U16 version; U64 abbrev_offset; u8 address_size; }; template struct CuHdrDwarf5_64 { U32 magic; U64 size; U16 version; u8 unit_type; u8 address_size; U64 abbrev_offset; }; template struct PubnamesHdr32 { U32 size; U16 version; U32 debug_info_offset; U32 debug_info_size; }; template struct PubnamesHdr64 { U32 magic; U64 size; U16 version; U64 debug_info_offset; U64 debug_info_size; }; struct SectionHeader { ul32 version = 7; ul32 cu_list_offset = 0; ul32 cu_types_offset = 0; ul32 ranges_offset = 0; ul32 symtab_offset = 0; ul32 const_pool_offset = 0; }; struct NameType { auto operator<=>(const NameType &) const = default; u64 hash; u8 type; std::string_view name; }; struct MapValue { u32 gdb_hash = 0; Atomic count; u32 name_offset = 0; u32 type_offset = 0; }; struct Compunit { DwarfKind kind; i64 offset; i64 size; std::vector> ranges; 
std::vector nametypes; std::vector entries; }; // The hash function for .gdb_index. static u32 gdb_hash(std::string_view name) { u32 h = 0; for (u8 c : name) { if ('A' <= c && c <= 'Z') c = 'a' + c - 'A'; h = h * 67 + c - 113; } return h; } template static DwarfKind get_dwarf_kind(Context &ctx, u8 *p) { if (*(U32 *)p == 0xffff'ffff) { CuHdrDwarf2_64 &hdr = *(CuHdrDwarf2_64 *)p; if (hdr.version > 5) Fatal(ctx) << "--gdb-index: DWARF version " << hdr.version << " is not supported"; return (hdr.version == 5) ? DWARF5_64 : DWARF2_64; } CuHdrDwarf2_32 &hdr = *(CuHdrDwarf2_32 *)p; if (hdr.version > 5) Fatal(ctx) << "--gdb-index: DWARF version " << hdr.version << " is not supported"; return (hdr.version == 5) ? DWARF5_32 : DWARF2_32; } template u8 *find_cu_abbrev(Context &ctx, u8 **p, const CuHdr &hdr) { if (hdr.address_size != sizeof(Word)) Fatal(ctx) << "--gdb-index: unsupported address size " << hdr.address_size; if constexpr (requires { hdr.unit_type; }) { switch (hdr.unit_type) { case DW_UT_compile: case DW_UT_partial: break; case DW_UT_skeleton: case DW_UT_split_compile: *p += 8; break; default: Fatal(ctx) << "--gdb-index: unknown unit type: 0x" << std::hex << hdr.unit_type; } } i64 abbrev_code = read_uleb(p); // Find a .debug_abbrev record corresponding to the .debug_info record. // We assume the .debug_info record at a given offset is of // DW_TAG_compile_unit which describes a compunit. 
u8 *abbrev = &ctx.debug_abbrev[0] + hdr.abbrev_offset; for (;;) { u32 code = read_uleb(&abbrev); if (code == 0) Fatal(ctx) << "--gdb-index: .debug_abbrev does not contain" << " a record for the first .debug_info record"; if (code == abbrev_code) { // Found a record u64 abbrev_tag = read_uleb(&abbrev); if (abbrev_tag != DW_TAG_compile_unit && abbrev_tag != DW_TAG_skeleton_unit) Fatal(ctx) << "--gdb-index: the first entry's tag is not" << " DW_TAG_compile_unit/DW_TAG_skeleton_unit but 0x" << std::hex << abbrev_tag; break; } // Skip an uninteresting record read_uleb(&abbrev); // tag abbrev++; // has_children byte for (;;) { u64 name = read_uleb(&abbrev); u64 form = read_uleb(&abbrev); if (name == 0 && form == 0) break; if (form == DW_FORM_implicit_const) read_uleb(&abbrev); } } abbrev++; // skip has_children byte return abbrev; } // .debug_info contains variable-length fields. // This function reads one scalar value from a given location. template u64 read_scalar(Context &ctx, u8 **p, u64 form) { switch (form) { case DW_FORM_flag_present: return 0; case DW_FORM_data1: case DW_FORM_flag: case DW_FORM_strx1: case DW_FORM_addrx1: case DW_FORM_ref1: return *(*p)++; case DW_FORM_data2: case DW_FORM_strx2: case DW_FORM_addrx2: case DW_FORM_ref2: { u64 val = *(U16 *)*p; *p += 2; return val; } case DW_FORM_strx3: case DW_FORM_addrx3: { u64 val = *(U24 *)*p; *p += 3; return val; } case DW_FORM_data4: case DW_FORM_strx4: case DW_FORM_addrx4: case DW_FORM_ref4: { u64 val = *(U32 *)*p; *p += 4; return val; } case DW_FORM_data8: case DW_FORM_ref8: { u64 val = *(U64 *)*p; *p += 8; return val; } case DW_FORM_strp: case DW_FORM_sec_offset: case DW_FORM_line_strp: { u64 val = *(Offset *)*p; *p += sizeof(Offset); return val; } case DW_FORM_addr: case DW_FORM_ref_addr: { u64 val = *(Word *)*p; *p += sizeof(Word); return val; } case DW_FORM_strx: case DW_FORM_addrx: case DW_FORM_udata: case DW_FORM_ref_udata: case DW_FORM_loclistx: case DW_FORM_rnglistx: return read_uleb(p); case 
DW_FORM_string: *p += strlen((char *)*p) + 1; return 0; default: Fatal(ctx) << "--gdb-index: unhandled debug info form: 0x" << std::hex << form; } } // Read a range list from .debug_ranges starting at the given offset. template static std::vector> read_debug_range(Word *range, u64 base) { std::vector> vec; for (i64 i = 0; range[i] || range[i + 1]; i += 2) { if (range[i] + 1 == 0) base = range[i + 1]; else vec.emplace_back(range[i] + base, range[i + 1] + base); } return vec; } // Read a range list from .debug_rnglists starting at the given offset. template static void read_rnglist_range(std::vector> &vec, u8 *p, Word *addrx, u64 base) { for (;;) { switch (*p++) { case DW_RLE_end_of_list: return; case DW_RLE_base_addressx: base = addrx[read_uleb(&p)]; break; case DW_RLE_startx_endx: { u64 val1 = read_uleb(&p); u64 val2 = read_uleb(&p); vec.emplace_back(addrx[val1], addrx[val2]); break; } case DW_RLE_startx_length: { u64 val1 = read_uleb(&p); u64 val2 = read_uleb(&p); vec.emplace_back(addrx[val1], addrx[val1] + val2); break; } case DW_RLE_offset_pair: { u64 val1 = read_uleb(&p); u64 val2 = read_uleb(&p); // If the base is 0, this address range is for an eliminated // section. We only emit it if it's alive. if (base) vec.emplace_back(base + val1, base + val2); break; } case DW_RLE_base_address: base = *(Word *)p; p += sizeof(Word); break; case DW_RLE_start_end: { u64 val1 = ((Word *)p)[0]; u64 val2 = ((Word *)p)[1]; p += sizeof(Word) * 2; vec.emplace_back(val1, val2); break; } case DW_RLE_start_length: { u64 val1 = *(Word *)p; p += sizeof(Word); u64 val2 = read_uleb(&p); vec.emplace_back(val1, val1 + val2); break; } } } } // Returns a list of address ranges explained by a compunit at the // `offset` in an output .debug_info section. // // .debug_info contains DWARF debug info records, so this function // parses DWARF. If a designated compunit contains multiple ranges, the // ranges are read from .debug_ranges (or .debug_rnglists for DWARF5). 
// Otherwise, a range is read directly from .debug_info (or possibly // from .debug_addr for DWARF5). template static std::vector> read_address_ranges(Context &ctx, const Compunit &cu) { // Read .debug_info to find the record at a given offset. u8 *p = &ctx.debug_info[0] + cu.offset; CuHdr &hdr = *(CuHdr *)p; p += sizeof(hdr); u8 *abbrev = find_cu_abbrev(ctx, &p, hdr); // Now, read debug info records. struct Record { u64 form = 0; u64 value = 0; }; using Offset = decltype(hdr.size); Record low_pc; Record high_pc; Record ranges; u64 rnglists_base = -1; Word *addrx = nullptr; // Read all interesting debug records. for (;;) { u64 name = read_uleb(&abbrev); u64 form = read_uleb(&abbrev); if (name == 0 && form == 0) break; u64 val = read_scalar(ctx, &p, form); switch (name) { case DW_AT_low_pc: low_pc = {form, val}; break; case DW_AT_high_pc: high_pc = {form, val}; break; case DW_AT_rnglists_base: rnglists_base = val; break; case DW_AT_addr_base: addrx = (Word *)(&ctx.debug_addr[0] + val); break; case DW_AT_ranges: ranges = {form, val}; break; } } // Handle non-contiguous address ranges. if (ranges.form) { if (hdr.version <= 4) { Word *p = (Word *)(&ctx.debug_ranges[0] + ranges.value); return read_debug_range(p, low_pc.value); } assert(hdr.version == 5); std::vector> vec; u8 *buf = &ctx.debug_rnglists[0]; if (ranges.form == DW_FORM_sec_offset) { read_rnglist_range(vec, buf + ranges.value, addrx, low_pc.value); } else { if (rnglists_base == -1) Fatal(ctx) << "--gdb-index: missing DW_AT_rnglists_base"; u8 *base = buf + rnglists_base; i64 num_offsets = *(U32 *)(base - 4); Offset *offsets = (Offset *)base; for (i64 i = 0; i < num_offsets; i++) read_rnglist_range(vec, base + offsets[i], addrx, low_pc.value); } return vec; } // Handle a contiguous address range. 
if (low_pc.form && high_pc.form) { u64 lo; switch (low_pc.form) { case DW_FORM_addr: lo = low_pc.value; break; case DW_FORM_addrx: case DW_FORM_addrx1: case DW_FORM_addrx2: case DW_FORM_addrx4: lo = addrx[low_pc.value]; break; default: Fatal(ctx) << "--gdb-index: unhandled form for DW_AT_low_pc: 0x" << std::hex << high_pc.form; } switch (high_pc.form) { case DW_FORM_addr: return {{lo, high_pc.value}}; case DW_FORM_addrx: case DW_FORM_addrx1: case DW_FORM_addrx2: case DW_FORM_addrx4: return {{lo, addrx[high_pc.value]}}; case DW_FORM_udata: case DW_FORM_data1: case DW_FORM_data2: case DW_FORM_data4: case DW_FORM_data8: return {{lo, lo + high_pc.value}}; default: Fatal(ctx) << "--gdb-index: unhandled form for DW_AT_high_pc: 0x" << std::hex << high_pc.form; } } return {}; } template static i64 read_pubnames_cu(Context &ctx, const PubnamesHdr &hdr, std::vector &cus, ObjectFile &file) { using Offset = decltype(hdr.size); auto get_cu = [&](i64 offset) { for (i64 i = 0; i < cus.size(); i++) if (cus[i].offset == offset) return &cus[i]; Fatal(ctx) << file << ": corrupted debug_info_offset"; }; Compunit *cu = get_cu(file.debug_info->offset + hdr.debug_info_offset); i64 size = hdr.size + offsetof(PubnamesHdr, size) + sizeof(hdr.size); u8 *p = (u8 *)&hdr + sizeof(hdr); u8 *end = (u8 *)&hdr + size; while (p < end) { if (*(Offset *)p == 0) break; p += sizeof(Offset); u8 type = *p++; std::string_view name = (char *)p; p += name.size() + 1; cu->nametypes.push_back(NameType{hash_string(name), type, name}); } return size; } // Parses .debug_gnu_pubnames and .debug_gnu_pubtypes. These sections // start with a 14 bytes header followed by (4-byte offset, 1-byte type, // null-terminated string) tuples. // // The 4-byte offset is an offset into .debug_info that contains details // about the name. The 1-byte type is a type of the corresponding name // (e.g. function, variable or datatype). The string is a name of a // function, a variable or a type. 
template static void read_pubnames(Context &ctx, std::vector &cus, ObjectFile &file) { for (InputSection *isec : { file.debug_pubnames, file.debug_pubtypes }) { if (!isec) continue; isec->uncompress(ctx); if (isec->contents.empty()) continue; u8 *p = (u8*)&isec->contents[0]; u8 *end = p + isec->contents.size(); while (p < end) { if (*(U32 *)p == 0xffff'ffff) p += read_pubnames_cu(ctx, *(PubnamesHdr64 *)p, cus, file); else p += read_pubnames_cu(ctx, *(PubnamesHdr32 *)p, cus, file); } }; } template static std::vector read_compunits(Context &ctx) { std::vector cus; // Read compunits from the output .debug_info section. u8 *begin = &ctx.debug_info[0]; u8 *end = begin + ctx.debug_info.size(); for (u8 *p = begin; p < end;) { DwarfKind kind = get_dwarf_kind(ctx, p); i64 size; if (kind == DWARF2_32 || kind == DWARF5_32) size = ((CuHdrDwarf2_32 *)p)->size + 4; else size = ((CuHdrDwarf2_64 *)p)->size + 12; cus.push_back(Compunit{kind, p - begin, size}); p += size; } // Read address ranges for each compunit. tbb::parallel_for_each(cus, [&](Compunit &cu) { switch (cu.kind) { case DWARF2_32: cu.ranges = read_address_ranges>(ctx, cu); break; case DWARF5_32: cu.ranges = read_address_ranges>(ctx, cu); break; case DWARF2_64: cu.ranges = read_address_ranges>(ctx, cu); break; case DWARF5_64: cu.ranges = read_address_ranges>(ctx, cu); break; } // Remove empty ranges std::erase_if(cu.ranges, [](std::pair p) { return p.first == 0 || p.first == p.second; }); }); // Read symbols from .debug_gnu_pubnames and .debug_gnu_pubtypes. tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { read_pubnames(ctx, cus, *file); }); // Uniquify elements because GCC 11 seems to emit one record for each // comdat group which results in having a lot of duplicate records. 
tbb::parallel_for_each(cus, [](Compunit &cu) { ranges::stable_sort(cu.nametypes); remove_duplicates(cu.nametypes); }); return cus; } template std::span get_buffer(Context &ctx, Chunk *chunk) { if (chunk->is_compressed) { CompressedSection &sec = *(CompressedSection *)chunk; return {sec.uncompressed_data.get(), (size_t)sec.chdr.ch_size}; } return {ctx.buf + chunk->shdr.sh_offset, (size_t)chunk->shdr.sh_size}; } template void write_gdb_index(Context &ctx) { Timer t(ctx, "write_gdb_index"); // Find debug info sections for (Chunk *chunk : ctx.chunks) { std::string_view name = chunk->name; if (name == ".debug_info") ctx.debug_info = get_buffer(ctx, chunk); if (name == ".debug_abbrev") ctx.debug_abbrev = get_buffer(ctx, chunk); if (name == ".debug_ranges") ctx.debug_ranges = get_buffer(ctx, chunk); if (name == ".debug_addr") ctx.debug_addr = get_buffer(ctx, chunk); if (name == ".debug_rnglists") ctx.debug_rnglists = get_buffer(ctx, chunk); } if (ctx.debug_info.empty()) return; // Read debug info std::vector cus = read_compunits(ctx); // Uniquify symbols HyperLogLog estimator; tbb::parallel_for_each(cus, [&](Compunit &cu) { HyperLogLog e; for (NameType &nt : cu.nametypes) e.insert(nt.hash); estimator.merge(e); }); ConcurrentMap map(estimator.get_cardinality() * 3 / 2); tbb::parallel_for_each(cus, [&](Compunit &cu) { cu.entries.reserve(cu.nametypes.size()); for (NameType &nt : cu.nametypes) { MapValue *ent; bool inserted; std::tie(ent, inserted) = map.insert(nt.name, nt.hash, MapValue{gdb_hash(nt.name)}); ent->count++; cu.entries.push_back(ent); } }); // Sort symbols for build reproducibility using Entry = typename decltype(map)::Entry; std::vector entries = map.get_sorted_entries_all(); // Compute sizes of each components SectionHeader hdr; hdr.cu_list_offset = sizeof(hdr); hdr.cu_types_offset = hdr.cu_list_offset + cus.size() * 16; hdr.ranges_offset = hdr.cu_types_offset; hdr.symtab_offset = hdr.ranges_offset; for (Compunit &cu : cus) hdr.symtab_offset += 
cu.ranges.size() * 20; i64 ht_size = bit_ceil(entries.size() * 5 / 4 + 1); hdr.const_pool_offset = hdr.symtab_offset + ht_size * 8; i64 offset = 0; for (Entry *ent : entries) { ent->value.type_offset = offset; offset += ent->value.count * 4 + 4; } for (Entry *ent : entries) { ent->value.name_offset = offset; offset += ent->keylen + 1; } i64 bufsize = hdr.const_pool_offset + offset; // Allocate an output buffer. We use malloc instead of vector to // avoid zero-initializing the entire buffer. ctx.output_file->buf2 = (u8 *)malloc(bufsize); ctx.output_file->buf2_size = bufsize; u8 *buf = ctx.output_file->buf2; // Write a section header memcpy(buf, &hdr, sizeof(hdr)); // Write a CU list u8 *p = buf + sizeof(hdr); for (Compunit &cu : cus) { *(ul64 *)p = cu.offset; *(ul64 *)(p + 8) = cu.size; p += 16; } // Write address areas std::vector range_offsets(cus.size()); for (i64 i = 1; i < cus.size(); i++) range_offsets[i] = range_offsets[i - 1] + cus[i - 1].ranges.size() * 20; tbb::parallel_for_each(cus, [&](Compunit &cu) { i64 i = &cu - cus.data(); u8 *p = buf + hdr.ranges_offset + range_offsets[i]; for (std::pair range : cu.ranges) { *(ul64 *)p = range.first; *(ul64 *)(p + 8) = range.second; *(ul32 *)(p + 16) = i; p += 20; } }); // Write a symbol table u32 mask = ht_size - 1; ul32 *ht = (ul32 *)(buf + hdr.symtab_offset); memset(ht, 0, ht_size * 8); for (Entry *ent : entries) { u32 hash = ent->value.gdb_hash; u32 step = ((hash * 17) & mask) | 1; u32 j = hash & mask; while (ht[j * 2] || ht[j * 2 + 1]) j = (j + step) & mask; ht[j * 2] = ent->value.name_offset; ht[j * 2 + 1] = ent->value.type_offset; } // Write types. Use MapValue::count as an atomic slot counter. 
u8 *base = buf + hdr.const_pool_offset; for (Entry *ent : entries) ent->value.count = 0; tbb::parallel_for_each(cus, [&](Compunit &cu) { i64 i = &cu - cus.data(); for (i64 j = 0; j < cu.nametypes.size(); j++) { MapValue *ent = cu.entries[j]; ul32 *p = (ul32 *)(base + ent->type_offset); i64 idx = ++ent->count; p[idx] = (cu.nametypes[j].type << 24) | i; } }); // Write the final counts into the buffer. for (Entry *ent : entries) *(ul32 *)(base + ent->value.type_offset) = ent->value.count; // Write names tbb::parallel_for_each(entries, [&](Entry *ent) { u8 *dst = buf + hdr.const_pool_offset + ent->value.name_offset; memcpy(dst, ent->key, ent->keylen); dst[ent->keylen] = '\0'; }); // Update the section size and rewrite the section header if (ctx.shdr) { ctx.gdb_index->shdr.sh_size = bufsize; ctx.shdr->copy_buf(ctx); } } using E = MOLD_TARGET; template void write_gdb_index(Context &); } // namespace mold ================================================ FILE: src/icf.cc ================================================ // This file implements the Identical Code Folding feature which can // reduce the output file size of a typical program by a few percent. // ICF identifies read-only input sections that happen to be identical // and thus can be used interchangeably. ICF leaves one of them and discards // the others. // // ICF is usually used in combination with -ffunction-sections and // -fdata-sections compiler options, so that object files have one section // for each function or variable instead of having one large .text or .data. // The unit of ICF merging is section. // // Two sections are considered identical by ICF if they have the exact // same contents, metadata such as section flags, exception handling // records, and relocations. The last one is interesting because two // relocations are considered identical if they point to the _same_ // section in terms of ICF. 
//
// To see what that means, consider two sections, A and B, which are
// identical except for one pair of relocations. Say, A has a relocation to
// section C, and B has a relocation to D. In this case, A and B are
// considered identical if C and D are considered identical. C and D can be
// either really the same section or two different sections that are
// considered identical by ICF. Below is an example of such inputs, A, B, C
// and D:
//
//   void A() { C(); }
//   void B() { D(); }
//   void C() { A(); }
//   void D() { B(); }
//
// If we assume A and B are mergeable, we can merge C and D, which makes A
// and B mergeable. There's no contradiction in our assumption, so we can
// conclude that A and B as well as C and D are mergeable.
//
// This problem boils down to one in graph theory. Input to ICF can be
// considered as a directed graph in which vertices are sections and edges
// are relocations. Vertices have labels (section contents, etc.), and so
// do edges (relocation offsets, etc.). Two vertices are considered
// identical if and only if their (possibly infinite) unfoldings into
// regular trees are equal. Given this formulation, we want to find as
// many identical vertices as possible.
//
// Just like a lot of problems with graphs, this problem doesn't have a
// straightforward "optimal" solution, and we need to resort to heuristics.
//
// mold approaches this problem by hashing program trees with increasing depth
// on each iteration.
// For example, when we start, we only hash individual functions with
// their calls into other functions omitted. From the second iteration, we
// put the functions they call into the hash by appending the hashes of those
// functions from the previous iteration. This means that the nth iteration
// hashes call chains up to (n-1) levels deep.
// We use a cryptographic hash function, so the unique number of hashes will // only monotonically increase as we take into account of deeper trees with // iterations (otherwise, that means we have found a hash collision). We stop // when the unique number of hashes stop increasing; this is based on the fact // that once we observe an iteration with the same amount of unique hashes as // the previous iteration, it will remain unchanged for further iterations. // This is provable, but here we omit the proof for brevity. // // When compared to other approaches, mold's approach has a relatively cheaper // cost per iteration, and as a bonus, is highly parallelizable. // For Chromium, mold's ICF finishes in less than 1 second with 20 threads, // whereas lld takes 5 seconds and gold takes 50 seconds under the same // conditions. #include "mold.h" #include "../lib/siphash.h" #include #include #include #include #include #include #include #include #include static constexpr int64_t HASH_SIZE = 16; using Digest = std::array; namespace std { template <> struct hash { size_t operator()(const Digest &k) const { static_assert(sizeof(size_t) <= HASH_SIZE); size_t val; memcpy(&val, k.data(), sizeof(size_t)); return val; } }; } namespace mold { static u8 hmac_key[16]; template static void uniquify_cies(Context &ctx) { Timer t(ctx, "uniquify_cies"); std::vector *> cies; auto find = [&](CieRecord &cie) -> i64 { for (i64 i = 0; i < cies.size(); i++) if (cie_equals(cie, *cies[i])) return i; return -1; }; for (ObjectFile *file : ctx.objs) { for (CieRecord &cie : file->cies) { if (i64 idx = find(cie); idx != -1) { cie.icf_idx = idx; } else { cie.icf_idx = cies.size(); cies.push_back(&cie); } } } } template static bool is_eligible(Context &ctx, InputSection &isec) { const ElfShdr &shdr = isec.shdr(); std::string_view name = isec.name(); if (shdr.sh_size == 0 || !(shdr.sh_flags & SHF_ALLOC) || shdr.sh_type == SHT_NOBITS || is_c_identifier(name)) return false; if (shdr.sh_flags & 
SHF_EXECINSTR) return (ctx.arg.icf_all || !isec.address_taken) && name != ".init" && name != ".fini"; // .gcc_except_table contains a compiler-generated table. Pointer // equality for the section is not significant because only the C++ // exception handling code will use the table at runtime. if (name == ".gcc_except_table" || name.starts_with(".gcc_except_table.")) return true; bool is_readonly = !(shdr.sh_flags & SHF_WRITE); bool is_relro = isec.name().starts_with(".data.rel.ro"); return (ctx.arg.ignore_data_address_equality || !isec.address_taken) && (is_readonly || is_relro); } template static bool is_leaf(Context &ctx, InputSection &isec) { if (!isec.get_rels(ctx).empty()) return false; for (FdeRecord &fde : isec.get_fdes()) if (fde.get_rels(isec.file).size() > 1) return false; return true; } template struct LeafHasher { size_t operator()(InputSection *isec) const { u64 h = hash_string(isec->contents); for (FdeRecord &fde : isec->get_fdes()) { u64 h2 = hash_string(fde.get_contents(isec->file).substr(8)); h = combine_hash(h, h2); } return h; } }; template struct LeafEq { bool operator()(InputSection *a, InputSection *b) const { if (a->contents != b->contents) return false; std::span> x = a->get_fdes(); std::span> y = b->get_fdes(); if (x.size() != y.size()) return false; for (i64 i = 0; i < x.size(); i++) if (x[i].get_contents(a->file).substr(8) != y[i].get_contents(b->file).substr(8)) return false; return true; } }; // Early merge of leaf nodes, which can be processed without constructing the // entire graph. This reduces the vertex count and improves memory efficiency. 
template static void merge_leaf_nodes(Context &ctx) { Timer t(ctx, "merge_leaf_nodes"); static Counter eligible("icf_eligibles"); static Counter non_eligible("icf_non_eligibles"); static Counter leaf("icf_leaf_nodes"); tbb::concurrent_unordered_map *, Atomic *>, LeafHasher, LeafEq> map; tbb::parallel_for((i64)0, (i64)ctx.objs.size(), [&](i64 i) { for (std::unique_ptr> &isec : ctx.objs[i]->sections) { if (!isec || !isec->is_alive) continue; if (!is_eligible(ctx, *isec)) { non_eligible++; continue; } if (is_leaf(ctx, *isec)) { leaf++; isec->icf_leaf = true; auto [it, inserted] = map.insert({isec.get(), isec.get()}); if (!inserted) { InputSection *isec2 = it->second.load(); while (isec->get_priority() < isec2->get_priority() && !it->second.compare_exchange_strong(isec2, isec.get())); } } else { eligible++; isec->icf_eligible = true; } } }); tbb::parallel_for((i64)0, (i64)ctx.objs.size(), [&](i64 i) { for (std::unique_ptr> &isec : ctx.objs[i]->sections) { if (isec && isec->is_alive && isec->icf_leaf) { auto it = map.find(isec.get()); assert(it != map.end()); isec->leader = it->second; } } }); } template static Digest compute_digest(Context &ctx, InputSection &isec) { SipHash13_128 hasher(hmac_key); auto hash = [&](auto val) { hasher.update((u8 *)&val, sizeof(val)); }; auto hash_string = [&](std::string_view str) { hash(str.size()); hasher.update((u8 *)str.data(), str.size()); }; auto hash_symbol = [&](Symbol &sym) { InputSection *isec = sym.get_input_section(); if (!sym.file) { hash('1'); hash((u64)&sym); } else if (SectionFragment *frag = sym.get_frag()) { hash('2'); hash((u64)frag); } else if (!isec) { hash('3'); } else if (isec->leader) { hash('4'); hash((u64)isec->leader); } else if (isec->icf_eligible) { hash('5'); } else { hash('6'); hash((u64)isec); } hash(sym.value); }; hash_string(isec.contents); hash(isec.shdr().sh_flags); hash(isec.get_fdes().size()); hash(isec.get_rels(ctx).size()); for (FdeRecord &fde : isec.get_fdes()) { 
hash(isec.file.cies[fde.cie_idx].icf_idx); // Bytes 0 to 4 contain the length of this record, and // bytes 4 to 8 contain an offset to CIE. hash_string(fde.get_contents(isec.file).substr(8)); hash(fde.get_rels(isec.file).size()); for (const ElfRel &rel : fde.get_rels(isec.file).subspan(1)) { hash_symbol(*isec.file.symbols[rel.r_sym]); hash(rel.r_type); hash(rel.r_offset - fde.input_offset); hash(get_addend(isec.file.cies[fde.cie_idx].input_section, rel)); } } for (i64 i = 0; i < isec.get_rels(ctx).size(); i++) { const ElfRel &rel = isec.get_rels(ctx)[i]; hash(rel.r_offset); hash(rel.r_type); hash(get_addend(isec, rel)); hash_symbol(*isec.file.symbols[rel.r_sym]); } Digest digest; hasher.finish(digest.data()); return digest; } template static std::vector *> gather_sections(Context &ctx) { Timer t(ctx, "gather_sections"); // Count the number of input sections for each input file. std::vector num_sections(ctx.objs.size()); tbb::parallel_for((i64)0, (i64)ctx.objs.size(), [&](i64 i) { for (std::unique_ptr> &isec : ctx.objs[i]->sections) if (isec && isec->is_alive && isec->icf_eligible) num_sections[i]++; }); std::vector section_indices(ctx.objs.size()); for (i64 i = 0; i < ctx.objs.size() - 1; i++) section_indices[i + 1] = section_indices[i] + num_sections[i]; std::vector *> sections( section_indices.back() + num_sections.back()); // Fill `sections` contents. 
tbb::parallel_for((i64)0, (i64)ctx.objs.size(), [&](i64 i) { i64 idx = section_indices[i]; for (std::unique_ptr> &isec : ctx.objs[i]->sections) if (isec && isec->is_alive && isec->icf_eligible) sections[idx++] = isec.get(); }); tbb::parallel_for((i64)0, (i64)sections.size(), [&](i64 i) { sections[i]->icf_idx = i; }); return sections; } template static std::vector compute_digests(Context &ctx, std::span *> sections) { Timer t(ctx, "compute_digests"); std::vector digests(sections.size()); tbb::parallel_for((i64)0, (i64)sections.size(), [&](i64 i) { digests[i] = compute_digest(ctx, *sections[i]); }); return digests; } // Build a graph, treating every function as a vertex and every function call // as an edge. See the description at the top for a more detailed formulation. // We use u32 indices here to improve cache locality. template static void gather_edges(Context &ctx, std::span *> sections, std::vector &edges, std::vector &edge_indices) { Timer t(ctx, "gather_edges"); if (sections.empty()) return; std::vector num_edges(sections.size()); edge_indices.resize(sections.size()); tbb::parallel_for((i64)0, (i64)sections.size(), [&](i64 i) { InputSection &isec = *sections[i]; assert(isec.icf_eligible); for (i64 j = 0; j < isec.get_rels(ctx).size(); j++) { const ElfRel &rel = isec.get_rels(ctx)[j]; Symbol &sym = *isec.file.symbols[rel.r_sym]; if (!sym.get_frag()) if (InputSection *isec = sym.get_input_section()) if (isec->icf_eligible) num_edges[i]++; } }); for (i64 i = 0; i < num_edges.size() - 1; i++) edge_indices[i + 1] = edge_indices[i] + num_edges[i]; edges.resize(edge_indices.back() + num_edges.back()); tbb::parallel_for((i64)0, (i64)num_edges.size(), [&](i64 i) { InputSection &isec = *sections[i]; i64 idx = edge_indices[i]; for (ElfRel &rel : isec.get_rels(ctx)) { Symbol &sym = *isec.file.symbols[rel.r_sym]; if (InputSection *isec = sym.get_input_section()) if (isec->icf_eligible) edges[idx++] = isec->icf_idx; } }); } template static i64 propagate(std::span> 
digests, std::span edges, std::span edge_indices, bool &slot, std::span converged, tbb::affinity_partitioner &ap) { static Counter round("icf_round"); round++; i64 num_digests = digests[0].size(); tbb::enumerable_thread_specific changed; tbb::parallel_for((i64)0, num_digests, [&](i64 i) { if (converged[i]) return; SipHash13_128 hasher(hmac_key); hasher.update(digests[2][i].data(), HASH_SIZE); i64 begin = edge_indices[i]; i64 end = (i + 1 == num_digests) ? edges.size() : edge_indices[i + 1]; for (i64 j : edges.subspan(begin, end - begin)) hasher.update(digests[slot][j].data(), HASH_SIZE); hasher.finish(digests[!slot][i].data()); if (digests[slot][i] == digests[!slot][i]) { // This node has converged. Skip further iterations as it will // yield the same hash. converged[i] = true; } else { changed.local()++; } }, ap); slot = !slot; return changed.combine(std::plus()); } template static i64 count_num_classes(std::span digests, tbb::affinity_partitioner &ap) { std::vector vec(digests.begin(), digests.end()); tbb::parallel_sort(vec); tbb::enumerable_thread_specific num_classes; tbb::parallel_for((i64)0, (i64)vec.size() - 1, [&](i64 i) { if (vec[i] != vec[i + 1]) num_classes.local()++; }, ap); return num_classes.combine(std::plus()); } template static void print_icf_sections(Context &ctx) { tbb::concurrent_vector *> leaders; tbb::concurrent_unordered_multimap *, InputSection *> map; tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { for (std::unique_ptr> &isec : file->sections) { if (isec && isec->is_alive && isec->leader) { if (isec.get() == isec->leader) leaders.push_back(isec.get()); else map.insert({isec->leader, isec.get()}); } } }); tbb::parallel_sort(leaders.begin(), leaders.end(), [](InputSection *a, InputSection *b) { return a->get_priority() < b->get_priority(); }); std::ostream *out = &std::cout; std::ofstream file; std::string &path = ctx.arg.print_icf_sections; if (path != "-") { file.open(path); if (file.fail()) Fatal(ctx) << "--print-icf-sections: 
cannot open " << path << ": " << errno_string(); out = &file; } i64 saved_bytes = 0; for (InputSection *leader : leaders) { auto [begin, end] = map.equal_range(leader); if (begin != end) { *out << "selected section " << *leader << '\n'; for (auto it = begin; it != end; it++) { *out << " removing identical section " << *it->second << '\n'; saved_bytes += leader->contents.size(); } } } *out << "ICF saved " << saved_bytes << " bytes\n"; } template void icf_sections(Context &ctx) { Timer t(ctx, "icf"); if (ctx.objs.empty()) return; get_random_bytes(hmac_key, sizeof(hmac_key)); uniquify_cies(ctx); merge_leaf_nodes(ctx); // Prepare for the propagation rounds. std::vector *> sections = gather_sections(ctx); // We allocate 3 arrays to store hashes for each vertex. // // Index 0 and 1 are used for tree hashes from the previous // iteration and the current iteration. They switch roles every // iteration. See `slot` below. // // Index 2 stores the initial, single-vertex hash. This is combined // with hashes from the connected vertices to form the tree hash // described above. std::vector> digests(3); digests[0] = compute_digests(ctx, sections); digests[1].resize(digests[0].size()); digests[2] = digests[0]; std::vector edges; std::vector edge_indices; gather_edges(ctx, sections, edges, edge_indices); std::vector converged(digests[0].size()); bool slot = 0; // Execute the propagation rounds until convergence is obtained. { Timer t(ctx, "propagate"); tbb::affinity_partitioner ap; // A cheap test that the graph hasn't converged yet. // The loop after this one uses a strict condition, but it's expensive // as it requires sorting the entire hash collection. // // For nodes that have a cycle in downstream (i.e. recursive // functions and functions that calls recursive functions) will always // change with the iterations. Nodes that doesn't (i.e. non-recursive // functions) will stop changing as soon as the propagation depth reaches // the call tree depth. 
// Here, we test whether we have reached sufficient depth for the latter, // which is a necessary (but not sufficient) condition for convergence. i64 num_changed = -1; for (;;) { i64 n = propagate(digests, edges, edge_indices, slot, converged, ap); if (n == num_changed) break; num_changed = n; } // Run the pass until the unique number of hashes stop increasing, at which // point we have achieved convergence (proof omitted for brevity). i64 num_classes = -1; for (;;) { // count_num_classes requires sorting which is O(n log n), so do a little // more work beforehand to amortize that log factor. for (i64 i = 0; i < 10; i++) propagate(digests, edges, edge_indices, slot, converged, ap); i64 n = count_num_classes(digests[slot], ap); if (n == num_classes) break; num_classes = n; } } // Group sections by hash values. { Timer t(ctx, "group"); auto *map = new tbb::concurrent_unordered_map *>>; std::span digest = digests[slot]; tbb::parallel_for((i64)0, (i64)sections.size(), [&](i64 i) { InputSection *isec = sections[i]; auto [it, inserted] = map->insert({digest[i], isec}); if (!inserted) { InputSection *isec2 = it->second.load(); while (isec->get_priority() < isec2->get_priority() && !it->second.compare_exchange_strong(isec2, isec)); } }); tbb::parallel_for((i64)0, (i64)sections.size(), [&](i64 i) { auto it = map->find(digest[i]); assert(it != map->end()); sections[i]->leader = it->second; }); // Since free'ing the map is slow, postpone it. ctx.on_exit.push_back([=] { delete map; }); } if (!ctx.arg.print_icf_sections.empty()) print_icf_sections(ctx); // Eliminate duplicate sections. // Symbols pointing to eliminated sections will be redirected on the fly when // exporting to the symtab. 
{ Timer t(ctx, "sweep"); static Counter eliminated("icf_eliminated"); tbb::parallel_for_each(ctx.objs, [](ObjectFile *file) { for (std::unique_ptr> &isec : file->sections) { if (isec && isec->is_alive && isec->icf_removed()) { isec->kill(); eliminated++; } } }); } } using E = MOLD_TARGET; template void icf_sections(Context &ctx); } // namespace mold ================================================ FILE: src/input-files.cc ================================================ #include "mold.h" #include #include #include #ifndef _WIN32 # include #endif namespace mold { // If we haven't seen the same `key` before, create a new instance // of Symbol and returns it. Otherwise, returns the previously- // instantiated object. `key` is usually the same as `name`. template Symbol *get_symbol(Context &ctx, std::string_view key, std::string_view name) { typename decltype(ctx.symbol_map)::const_accessor acc; ctx.symbol_map.insert(acc, {key, Symbol(name, ctx.arg.demangle)}); return const_cast *>(&acc->second); } template Symbol *get_symbol(Context &ctx, std::string_view key) { std::string_view name = key.substr(0, key.find('@')); return get_symbol(ctx, key, name); } template static bool is_rust_symbol(const Symbol &sym) { // The legacy Rust mangling scheme is indistinguishtable from C++. // We don't want to accidentally demangle C++ symbols as Rust ones. // So, the legacy mangling scheme will be demangled only when we // know the object file was created by rustc. if (sym.file && !sym.file->is_dso && ((ObjectFile *)sym.file)->is_rust_obj) return true; // "_R" is the prefix of the new Rust mangling scheme. 
return sym.name().starts_with("_R"); } template std::string_view demangle(const Symbol &sym) { if (is_rust_symbol(sym)) { if (std::optional s = demangle_rust(sym.name())) return *s; } else { if (std::optional s = demangle_cpp(sym.name())) return *s; } return sym.name(); } template std::ostream &operator<<(std::ostream &out, const Symbol &sym) { if (sym.demangle) out << demangle(sym); else out << sym.name(); return out; } template InputFile::InputFile(Context &ctx, MappedFile *mf) : mf(mf), filename(mf->name) { if (mf->size < sizeof(ElfEhdr)) Fatal(ctx) << *this << ": file too small"; if (memcmp(mf->data, "\177ELF", 4)) Fatal(ctx) << *this << ": not an ELF file"; ElfEhdr &ehdr = *(ElfEhdr *)mf->data; is_dso = (ehdr.e_type == ET_DYN); ElfShdr *sh_begin = (ElfShdr *)(mf->data + ehdr.e_shoff); // e_shnum contains the total number of sections in an object file. // Since it is a 16-bit integer field, it's not large enough to // represent >65535 sections. If an object file contains more than 65535 // sections, the actual number is stored to sh_size field. i64 num_sections = (ehdr.e_shnum == 0) ? sh_begin->sh_size : ehdr.e_shnum; if (mf->data + mf->size < (u8 *)(sh_begin + num_sections)) Fatal(ctx) << mf->name << ": e_shoff or e_shnum corrupted: " << mf->size << " " << num_sections; elf_sections = {sh_begin, sh_begin + num_sections}; // e_shstrndx is a 16-bit field. If .shstrtab's section index is // too large, the actual number is stored to sh_link field. i64 shstrtab_idx = (ehdr.e_shstrndx == SHN_XINDEX) ? sh_begin->sh_link : ehdr.e_shstrndx; shstrtab = this->get_string(ctx, shstrtab_idx); } template ElfShdr *InputFile::find_section(i64 type) { for (ElfShdr &sec : elf_sections) if (sec.sh_type == type) return &sec; return nullptr; } // Find the source filename. It should be listed in symtab as STT_FILE. 
// Returns the name of the first local symbol (index below first_global)
// whose type is STT_FILE, or an empty string if there is no such symbol.
template
std::string_view InputFile::get_source_name() const {
  for (i64 i = 0; i < first_global; i++)
    if (Symbol *sym = symbols[i]; sym->get_type() == STT_FILE)
      return sym->name();
  return "";
}

// A section counts as a debug section if it is not memory-allocated and
// its name starts with ".debug_".
template
static bool is_debug_section(const ElfShdr &shdr, std::string_view name) {
  return !(shdr.sh_flags & SHF_ALLOC) && name.starts_with(".debug_");
}

// Walks the note records in a .note.gnu.property section. For each note
// named "GNU" with type NT_GNU_PROPERTY_TYPE_0, every 4-byte property
// value is OR-ed into gnu_properties[type].
template
void
ObjectFile::parse_note_gnu_property(Context &ctx, const ElfShdr &shdr) {
  std::string_view data = this->get_string(ctx, shdr);

  while (!data.empty()) {
    ElfNhdr &hdr = *(ElfNhdr *)data.data();
    data = data.substr(sizeof(hdr));

    // n_namesz counts the terminating NUL, hence the "- 1".
    // NOTE(review): n_namesz == 0 is not guarded against here.
    std::string_view name = data.substr(0, hdr.n_namesz - 1);
    data = data.substr(align_to(hdr.n_namesz, 4));

    std::string_view desc = data.substr(0, hdr.n_descsz);
    data = data.substr(align_to(hdr.n_descsz, sizeof(Word)));

    if (hdr.n_type != NT_GNU_PROPERTY_TYPE_0 || name != "GNU")
      continue;

    // Each property is a (type, size) pair of 32-bit words followed by
    // `size` bytes of payload, padded to a multiple of sizeof(Word).
    while (!desc.empty()) {
      u32 type = *(U32 *)desc.data();
      u32 size = *(U32 *)(desc.data() + 4);
      desc = desc.substr(8);

      // The majority of currently defined .note.gnu.property
      // use 32-bit values.
      // We don't know how to handle anything else, so if we encounter
      // one, skip it.
      //
      // The following properties have a different size:
      // - GNU_PROPERTY_STACK_SIZE
      // - GNU_PROPERTY_NO_COPY_ON_PROTECTED
      if (size == 4)
        gnu_properties[type] |= *(U32 *)desc.data();
      desc = desc.substr(align_to(size, sizeof(Word)));
    }
  }
}

// Parses the contents of a .riscv.attributes section. As read below, the
// data starts with a one-byte format version ('A') followed by
// sub-sections. Each sub-section begins with a 32-bit length (which
// covers the length field itself) and a NUL-terminated vendor name. Only
// "riscv" sub-sections are examined; they must start with ELF_TAG_FILE,
// and the remainder is a sequence of ULEB128 tags with their values.
template
static void
read_riscv_attributes(Context &ctx, ObjectFile &file, std::string_view data) {
  if (data.empty())
    Fatal(ctx) << file << ": corrupted .riscv.attributes section";

  // Any format version other than 'A' is silently ignored.
  if (u8 format_version = data[0]; format_version != 'A')
    return;
  data = data.substr(1);

  while (!data.empty()) {
    // NOTE(review): only sz > data.size() is rejected; sz < 4 would make
    // `sz - 4` negative below. Confirm whether that needs a check.
    i64 sz = *(U32 *)data.data();
    if (data.size() < sz)
      Fatal(ctx) << file << ": corrupted .riscv.attributes section";

    std::string_view p(data.data() + 4, sz - 4);
    data = data.substr(sz);

    if (!p.starts_with("riscv\0"sv))
      continue;
    p = p.substr(6);

    if (!p.starts_with(ELF_TAG_FILE))
      Fatal(ctx) << file << ": corrupted .riscv.attributes section";
    p = p.substr(5); // skip the tag and the sub-sub-section size

    while (!p.empty()) {
      i64 tag = read_uleb(&p);

      switch (tag) {
      case ELF_TAG_RISCV_STACK_ALIGN:
        file.extra.stack_align = read_uleb(&p);
        break;
      case ELF_TAG_RISCV_ARCH: {
        // The architecture string is NUL-terminated.
        i64 pos = p.find_first_of('\0');
        file.extra.arch = p.substr(0, pos);
        p = p.substr(pos + 1);
        break;
      }
      case ELF_TAG_RISCV_UNALIGNED_ACCESS:
        file.extra.unaligned_access = read_uleb(&p);
        break;
      default:
        // NOTE(review): the value of an unrecognized tag is not skipped;
        // the next iteration reads it as a tag.
        break;
      }
    }
  }
}

// Returns true if initialize_sections() may treat a section of this type
// as a regular input section. Besides the generic content types, this
// accepts non-allocated user-defined types, OS-specific types without
// SHF_OS_NONCONFORMING, and a few target-specific types.
template
static bool is_known_section_type(const ElfShdr &shdr) {
  u32 ty = shdr.sh_type;
  u32 flags = shdr.sh_flags;

  if (ty == SHT_PROGBITS ||
      ty == SHT_NOTE ||
      ty == SHT_NOBITS ||
      ty == SHT_INIT_ARRAY ||
      ty == SHT_FINI_ARRAY ||
      ty == SHT_PREINIT_ARRAY)
    return true;

  if (SHT_LOUSER <= ty && ty <= SHT_HIUSER && !(flags & SHF_ALLOC))
    return true;
  if (SHT_LOOS <= ty && ty <= SHT_HIOS && !(flags & SHF_OS_NONCONFORMING))
    return true;
  if (is_x86_64 && ty == SHT_X86_64_UNWIND)
    return true;
  if (is_arm32 && (ty == SHT_ARM_EXIDX || ty == SHT_ARM_ATTRIBUTES))
    return true;
  if (is_riscv && ty == SHT_RISCV_ATTRIBUTES)
    return true;
  return false;
}

// SHT_CREL is an
// experimental alternative relocation table format
// designed to reduce the size of the table. Only LLVM supports it
// at the moment.
//
// This function converts a CREL relocation table to a regular one.
//
// The table starts with a ULEB128 header: bits 0-1 hold the offset scale,
// bit 2 says whether addends are present, and the remaining bits hold the
// number of relocations. Each entry then encodes deltas against the
// previous entry's offset, symbol index, type and addend.
template
std::vector>
decode_crel(Context &ctx, ObjectFile &file, const ElfShdr &shdr) {
  u8 *p = (u8 *)file.get_string(ctx, shdr).data();
  u64 hdr = read_uleb(&p);
  i64 nrels = hdr >> 3;
  bool is_rela = hdr & 0b100;
  i64 scale = hdr & 0b11;

  if (is_rela && !E::is_rela)
    Fatal(ctx) << file << ": CREL with addends is not supported for " << E::name;

  // Running values; every entry only stores differences to these.
  u64 offset = 0;
  i64 type = 0;
  i64 symidx = 0;
  i64 addend = 0;

  std::vector> vec;
  vec.reserve(nrels);

  while (vec.size() < nrels) {
    u8 flags = *p++;
    i64 nflags = is_rela ? 3 : 2;

    // The first ULEB-128 encoded value is a concatenation of bit flags and
    // an offset delta. The delta may be very large to decrease the
    // current offset value by wrapping around. Combined, the encoded value
    // can be up to 67 bit long. Thus we can't simply use read_uleb() which
    // returns a u64.
    u64 delta;
    if (flags & 0x80)
      delta = (read_uleb(&p) << (7 - nflags)) | ((flags & 0x7f) >> nflags);
    else
      delta = flags >> nflags;
    offset += delta << scale;

    // The low flag bits say which of the other fields have a delta.
    if (flags & 1)
      symidx += read_sleb(&p);
    if (flags & 2)
      type += read_sleb(&p);
    if (is_rela && (flags & 4))
      addend += read_sleb(&p);
    vec.emplace_back(offset, type, symidx, addend);
  }
  return vec;
}

// Inserts an entry keyed by `name` into ctx.comdat_groups and returns a
// pointer to the entry the accessor refers to afterwards.
template
ComdatGroup *insert_comdat_group(Context &ctx, std::string_view name) {
  typename decltype(ctx.comdat_groups)::const_accessor acc;
  ctx.comdat_groups.insert(acc, {name, ComdatGroup()});
  return const_cast(&acc->second);
}

// Walks the section header table and creates an InputSection for every
// section that should take part in the link. Sections with a special
// meaning (comdat groups, CREL tables, notes, attributes, ...) are
// handled or dropped on the way. Afterwards, relocation sections and, on
// ARM32, .ARM.exidx sections are attached to the sections they refer to.
template
void ObjectFile::initialize_sections(Context &ctx) {
  // Read sections
  for (i64 i = 0; i < this->elf_sections.size(); i++) {
    const ElfShdr &shdr = this->elf_sections[i];
    std::string_view name = this->shstrtab.data() + shdr.sh_name;

    if ((shdr.sh_flags & SHF_EXCLUDE) &&
        name.starts_with(".gnu.offload_lto_.symtab.")) {
      this->is_gcc_offload_obj = true;
      continue;
    }

    // Excluded non-allocated sections are dropped unless we are doing a
    // relocatable link. .llvm_addrsig is kept because it is needed below.
    if ((shdr.sh_flags & SHF_EXCLUDE) && !(shdr.sh_flags & SHF_ALLOC) &&
        shdr.sh_type != SHT_LLVM_ADDRSIG && !ctx.arg.relocatable)
      continue;

    if constexpr (is_arm)
      if (shdr.sh_type == SHT_ARM_ATTRIBUTES)
        continue;

    if constexpr (is_riscv) {
      if (shdr.sh_type == SHT_RISCV_ATTRIBUTES) {
        read_riscv_attributes(ctx, *this, this->get_string(ctx, shdr));
        continue;
      }
    }

    switch (shdr.sh_type) {
    case SHT_GROUP: {
      // Get the signature of this section group.
      if (shdr.sh_info >= this->elf_syms.size())
        Fatal(ctx) << *this << ": invalid symbol index";
      const ElfSym &esym = this->elf_syms[shdr.sh_info];

      std::string_view signature;
      if (esym.st_type == STT_SECTION) {
        signature = this->shstrtab.data() +
                    this->elf_sections[get_shndx(esym)].sh_name;
      } else {
        signature = this->symbol_strtab.data() + esym.st_name;
      }

      // Ignore a broken comdat group GCC emits for .debug_macros.
      // https://github.com/rui314/mold/issues/438
      if (signature.starts_with("wm4."))
        continue;

      // Get comdat group members.
      std::span> entries = this->template get_data>(ctx, shdr);

      // The first word holds the group flags; the rest are member
      // section indices.
      if (entries.empty())
        Fatal(ctx) << *this << ": empty SHT_GROUP";
      if (entries[0] == 0)
        continue;
      if (entries[0] != GRP_COMDAT)
        Fatal(ctx) << *this << ": unsupported SHT_GROUP format";

      ComdatGroup *group = insert_comdat_group(ctx, signature);
      comdat_groups.push_back({group, (i32)i, entries.subspan(1)});
      break;
    }
    case SHT_CREL:
      decoded_crel.resize(i + 1);
      decoded_crel[i] = decode_crel(ctx, *this, shdr);
      break;
    case SHT_REL:
    case SHT_RELA:
    case SHT_SYMTAB:
    case SHT_SYMTAB_SHNDX:
    case SHT_STRTAB:
    case SHT_NULL:
      break;
    default:
      if (!is_known_section_type(shdr))
        Fatal(ctx) << *this << ": " << name << ": unsupported section type: 0x"
                   << std::hex << (u32)shdr.sh_type;

      // .note.GNU-stack section controls executable-ness of the stack
      // area in GNU linkers. We ignore that section because silently
      // making the stack area executable is too dangerous. Tell our
      // users about the difference if that matters.
      if (name == ".note.GNU-stack" && !ctx.arg.relocatable) {
        if (shdr.sh_flags & SHF_EXECINSTR) {
          if (!ctx.arg.z_execstack && !ctx.arg.z_execstack_if_needed)
            Warn(ctx) << *this << ": this file may cause a segmentation"
              " fault because it requires an executable stack. See"
              " https://github.com/rui314/mold/tree/main/docs/execstack.md"
              " for more info.";
          needs_executable_stack = true;
        }
        continue;
      }

      if (name == ".note.gnu.property") {
        parse_note_gnu_property(ctx, shdr);
        continue;
      }

      // Ignore a build-id section in an input file. This doesn't normally
      // happen, but you can create such an object file with
      // `ld.bfd -r --build-id`.
      if (name == ".note.gnu.build-id")
        continue;

      // Ignore these sections for compatibility with old glibc i386 CRT files.
      if (name == ".gnu.linkonce.t.__x86.get_pc_thunk.bx" ||
          name == ".gnu.linkonce.t.__i686.get_pc_thunk.bx")
        continue;

      // Also ignore this for compatibility with ICC
      if (name == ".gnu.linkonce.d.DW.ref.__gxx_personality_v0")
        continue;

      // Ignore debug sections if --strip-all or --strip-debug is given.
      if ((ctx.arg.strip_all || ctx.arg.strip_debug) &&
          is_debug_section(shdr, name))
        continue;

      // Ignore sections specified by --discard-section.
      if (!ctx.arg.discard_section.empty() &&
          ctx.arg.discard_section.contains(name))
        continue;

      if (name == ".comment" &&
          this->get_string(ctx, shdr).starts_with("rustc "))
        this->is_rust_obj = true;

      // If an output file doesn't have a section header (i.e.
      // --oformat=binary is given), we discard all non-memory-allocated
      // sections. This is because without a section header, we can't find
      // their places in an output file in the first place.
      if (ctx.arg.oformat_binary && !(shdr.sh_flags & SHF_ALLOC))
        continue;

      this->sections[i] = std::make_unique>(ctx, *this, i);
      InputSection *isec = this->sections[i].get();

      // Save .llvm_addrsig for --icf=safe.
      if (shdr.sh_type == SHT_LLVM_ADDRSIG && !ctx.arg.relocatable) {
        // sh_link should be the index of the symbol table section.
        // Tools that mutate the symbol table, such as objcopy or `ld -r`,
        // tend to not preserve sh_link, so we ignore such sections.
        if (shdr.sh_link != 0)
          llvm_addrsig = std::move(this->sections[i]);
        continue;
      }

      if (shdr.sh_type == SHT_INIT_ARRAY ||
          shdr.sh_type == SHT_FINI_ARRAY ||
          shdr.sh_type == SHT_PREINIT_ARRAY)
        this->has_init_array = true;

      if (name == ".ctors" || name.starts_with(".ctors.") ||
          name == ".dtors" || name.starts_with(".dtors."))
        this->has_ctors = true;

      if (name == ".eh_frame")
        eh_frame_sections.push_back(isec);

      if (name == ".debug_info" && !(shdr.sh_flags & SHF_ALLOC))
        debug_info = isec;

      if constexpr (is_ppc32)
        if (name == ".got2")
          extra.got2 = isec;

      // Save debug sections for --gdb-index.
      if (ctx.arg.gdb_index) {
        // If --gdb-index is given, contents of .debug_gnu_pubnames and
        // .debug_gnu_pubtypes are copied to .gdb_index, so keeping them
        // in an output file is just a waste of space.
        if (name == ".debug_gnu_pubnames") {
          debug_pubnames = isec;
          isec->is_alive = false;
        }

        if (name == ".debug_gnu_pubtypes") {
          debug_pubtypes = isec;
          isec->is_alive = false;
        }

        // .debug_types is similar to .debug_info but contains type info
        // only. It exists only in DWARF 4, has been removed in DWARF 5 and
        // neither GCC nor Clang generate it by default
        // (-fdebug-types-section is needed). As such there is probably
        // little need to support it.
        if (name == ".debug_types")
          Fatal(ctx) << *this << ": mold's --gdb-index is not compatible"
            " with .debug_types; to fix this error, remove"
            " -fdebug-types-section and recompile";
      }

      static Counter counter("regular_sections");
      counter++;
      break;
    }
  }

  // Attach relocation sections to their target sections.
  for (i64 i = 0; i < this->elf_sections.size(); i++) {
    const ElfShdr &shdr = this->elf_sections[i];
    if (shdr.sh_type == (E::is_rela ? SHT_RELA : SHT_REL) ||
        shdr.sh_type == SHT_CREL) {
      // sh_info of a relocation section is the index of its target.
      if (std::unique_ptr> &target = sections[shdr.sh_info]) {
        assert(target->relsec_idx == -1);
        target->relsec_idx = i;
      }
    }
  }

  // Attach .arm.exidx sections to their corresponding sections
  if constexpr (is_arm32)
    for (std::unique_ptr> &isec : this->sections)
      if (isec && isec->shdr().sh_type == SHT_ARM_EXIDX)
        if (InputSection *target = sections[isec->shdr().sh_link].get())
          target->extra.exidx = isec.get();
}

// .eh_frame contains data records explaining how to handle exceptions.
// When an exception is thrown, the runtime searches a record from
// .eh_frame with the current program counter as a key. A record that
// covers the current PC explains how to find a handler and how to
// transfer the control to it.
//
// Unlike most other sections, the linker has to parse .eh_frame contents
// because of the following reasons:
//
// - There's usually only one .eh_frame section for each object file,
//   which explains how to handle exceptions for all functions in the same
//   object. If we just copy them, the resulting .eh_frame section will
//   contain lots of records for dead sections (i.e. de-duplicated inline
//   functions). We want to copy only records for live functions.
//
// - .eh_frame contains two types of records: CIE and FDE. There's usually
//   only one CIE at beginning of .eh_frame section followed by FDEs.
//   Compiler usually emits the identical CIE record for all object files.
//   We want to merge identical CIEs in an output .eh_frame section to
//   reduce the section size.
//
// - Scanning a .eh_frame section to find a record is an O(n) operation
//   where n is the number of records in the section. To reduce it to
//   O(log n), linker creates a .eh_frame_hdr section. The section
//   contains a sorted list of [an address in .text, an FDE address whose
//   coverage starts at the .text address] to make binary search doable.
//   In order to create .eh_frame_hdr, linker has to read .eh_frame.
//
// This function parses an input .eh_frame section.
template
void ObjectFile::parse_ehframe(Context &ctx) {
  for (InputSection *isec : eh_frame_sections) {
    std::span> rels = isec->get_rels(ctx);
    i64 cies_begin = cies.size();
    i64 fdes_begin = fdes.size();

    // Read CIEs and FDEs until empty.
    std::string_view contents = this->get_string(ctx, isec->shdr());
    i64 rel_idx = 0;

    for (std::string_view data = contents; !data.empty();) {
      // A record is a 4-byte length followed by that many bytes. A zero
      // length ends the list.
      i64 size = *(U32 *)data.data();
      if (size == 0)
        break;

      i64 begin_offset = data.data() - contents.data();
      i64 end_offset = begin_offset + size + 4;
      i64 id = *(U32 *)(data.data() + 4);
      data = data.substr(size + 4);

      // Collect the relocations that fall into this record. This relies
      // on `rels` being ordered by r_offset.
      i64 rel_begin = rel_idx;
      while (rel_idx < rels.size() && rels[rel_idx].r_offset < end_offset)
        rel_idx++;
      assert(rel_idx == rels.size() || begin_offset <= rels[rel_begin].r_offset);

      if (id == 0) {
        // This is CIE.
        cies.emplace_back(ctx, *this, *isec, begin_offset, rels, rel_begin);
      } else {
        // This is FDE.
        if (rel_begin == rel_idx || rels[rel_begin].r_sym == 0) {
          // FDE has no valid relocation, which means FDE is dead from
          // the beginning. Compilers usually don't create such FDE, but
          // `ld -r` tend to generate such dead FDEs.
          continue;
        }

        if (rels[rel_begin].r_offset - begin_offset != 8)
          Fatal(ctx) << *isec << ": FDE's first relocation should have offset 8";

        fdes.emplace_back(begin_offset, rel_begin);
      }
    }

    // Associate CIEs to FDEs.
    auto find_cie = [&](i64 offset) {
      for (i64 i = cies_begin; i < cies.size(); i++)
        if (cies[i].input_offset == offset)
          return i;
      Fatal(ctx) << *isec << ": bad FDE pointer";
    };

    for (i64 i = fdes_begin; i < fdes.size(); i++) {
      // The word at offset 4 of an FDE is the distance from that word
      // back to the start of its CIE.
      i64 cie_offset = *(I32 *)(contents.data() + fdes[i].input_offset + 4);
      fdes[i].cie_idx = find_cie(fdes[i].input_offset + 4 - cie_offset);
    }

    // The raw section is no longer needed once its records are parsed.
    isec->is_alive = false;
  }

  // The section an FDE describes is the one its first relocation's symbol
  // belongs to.
  auto get_isec = [&](const FdeRecord &fde) {
    return get_section(this->elf_syms[fde.get_rels(*this)[0].r_sym]);
  };

  // We assume that FDEs for the same input sections are contiguous
  // in `fdes` vector.
  ranges::stable_sort(fdes, {}, [&](const FdeRecord &x) {
    return get_isec(x)->get_priority();
  });

  // Associate FDEs to input sections.
  for (i64 i = 0; i < fdes.size();) {
    InputSection *isec = get_isec(fdes[i]);
    assert(isec->fde_begin == -1);

    if (isec->is_alive) {
      isec->fde_begin = i++;

      while (i < fdes.size() && isec == get_isec(fdes[i]))
        i++;
      isec->fde_end = i;
    } else {
      fdes[i++].is_alive = false;
    }
  }
}

// Creates Symbol objects for this file's symbol table. Local symbols are
// stored in local_syms; global symbols are looked up by name with
// get_symbol(). Symbol version suffixes and the --wrap option are
// handled here as well.
template
void ObjectFile::initialize_symbols(Context &ctx) {
  if (this->elf_syms.empty())
    return;

  static Counter counter("all_syms");
  counter += this->elf_syms.size();

  // Initialize local symbols
  this->local_syms.resize(this->first_global);
  this->local_syms[0].file = this;
  this->local_syms[0].sym_idx = 0;

  for (i64 i = 1; i < this->first_global; i++) {
    const ElfSym &esym = this->elf_syms[i];
    if (esym.is_common())
      Fatal(ctx) << *this << ": common local symbol?";

    // A section symbol takes the name of its section.
    std::string_view name;
    if (esym.st_type == STT_SECTION)
      name = this->shstrtab.data() + this->elf_sections[get_shndx(esym)].sh_name;
    else
      name = this->symbol_strtab.data() + esym.st_name;

    Symbol &sym = this->local_syms[i];
    sym.set_name(name);
    sym.file = this;
    sym.value = esym.st_value;
    sym.sym_idx = i;

    if (!esym.is_abs())
      sym.set_input_section(sections[get_shndx(esym)].get());
  }

  this->symbols.resize(this->elf_syms.size());

  i64 num_globals = this->elf_syms.size() - this->first_global;
  has_symver.resize(num_globals);

  for (i64 i = 0; i < this->first_global; i++)
    this->symbols[i] = &this->local_syms[i];

  // Initialize global symbols
  for (i64 i = this->first_global; i < this->elf_syms.size(); i++) {
    const ElfSym &esym = this->elf_syms[i];

    if (esym.is_common())
      has_common_symbol = true;

    // Get a symbol name
    std::string_view key = this->symbol_strtab.data() + esym.st_name;
    std::string_view name = key;

    // Parse symbol version after atsign
    if (i64 pos = name.find('@'); pos != name.npos) {
      std::string_view ver = name.substr(pos);
      name = name.substr(0, pos);

      if (ver != "@" && ver != "@@") {
        // For `foo@@VER` the lookup key is the bare name; for `foo@VER`
        // it stays the full string.
        if (ver.starts_with("@@"))
          key = name;
        has_symver[i - this->first_global] = true;
      }
    }

    // Handle --wrap option
    Symbol *sym;
    if (esym.is_undef() && name.starts_with("__real_") &&
        ctx.arg.wrap.contains(name.substr(7))) {
      // An undefined `__real_foo` refers to `foo` (7 is the length of
      // the "__real_" prefix).
      sym = get_symbol(ctx, key.substr(7), name.substr(7));
    } else {
      sym = get_symbol(ctx, key, name);
      // An undefined reference to a wrapped `foo` is redirected to
      // `__wrap_foo`.
      if (esym.is_undef() && sym->is_wrapped) {
        key = save_string(ctx, "__wrap_" + std::string(key));
        name = save_string(ctx, "__wrap_" + std::string(name));
        sym = get_symbol(ctx, key, name);
      }
    }

    this->symbols[i] = sym;
  }
}

// Relocations are usually sorted by r_offset in relocation tables, but
// RISC-V does not follow that convention (the code below applies the
// same treatment to LoongArch). We expect them to be sorted, so sort them
// if necessary.
template
void ObjectFile::sort_relocations(Context &ctx) {
  if constexpr (is_riscv || is_loongarch) {
    for (i64 i = 1; i < sections.size(); i++) {
      std::unique_ptr> &isec = sections[i];
      if (!isec || !isec->is_alive || !(isec->shdr().sh_flags & SHF_ALLOC))
        continue;

      std::span> rels = isec->get_rels(ctx);
      if (!ranges::is_sorted(rels, {}, &ElfRel::r_offset))
        ranges::stable_sort(rels, {}, &ElfRel::r_offset);
    }
  }
}

// Replaces non-empty SHF_MERGE input sections that have no relocation
// section with MergeableSection objects, provided that
// MergedSection::get_instance() returns a parent for them.
template
void ObjectFile::convert_mergeable_sections(Context &ctx) {
  // Convert InputSections to MergeableSections
  for (i64 i = 0; i < this->sections.size(); i++) {
    InputSection *isec = this->sections[i].get();
    if (!isec || isec->sh_size == 0 || isec->relsec_idx != -1)
      continue;

    const ElfShdr &shdr = isec->shdr();
    if (!(shdr.sh_flags & SHF_MERGE))
      continue;

    MergedSection *parent =
      MergedSection::get_instance(ctx, isec->name(), shdr);

    if (parent) {
      this->mergeable_sections[i] =
        std::make_unique>(ctx, *parent, this->sections[i]);
      this->sections[i] = nullptr;
    }
  }
}

// Usually a section is an atomic unit of inclusion or exclusion.
// Linker doesn't care about its contents. However, if a section is a
// mergeable section (a section with SHF_MERGE bit set), the linker is
// expected to split it into smaller pieces and merge each piece with
// other pieces from different object files. In mold, we call the
// atomic unit of mergeable section "section pieces".
//
// This feature is typically used for string literals. String literals
// are usually put into a mergeable section by the compiler. If the same
// string literal happens to occur in two different translation units,
// the linker merges them into a single instance of a string, so that
// the linker's output doesn't contain duplicate string literals.
//
// Handling symbols in the mergeable sections is a bit tricky. Assume
// that we have a mergeable section with the following contents and
// symbols:
//
//   Hello world\0foo bar\0
//   ^            ^
//   .rodata      .L.str1
//   .L.str0
//
// '\0' represents a NUL byte. This mergeable section contains two
// section pieces, "Hello world" and "foo bar". The first string is
// referred to by two symbols, .rodata and .L.str0, and the second by
// .L.str1. .rodata is a section symbol and therefore a local symbol
// and refers to the beginning of the section.
//
// In this example, there are actually two different ways to point to
// string "foo bar", because .rodata+12 and .L.str1+0 refer to the same
// place in the section. This kind of "out-of-bound" reference occurs
// only when a symbol is a section symbol. In other words, the compiler
// may use an offset from the beginning of a section to refer to any
// section piece in a section, but it doesn't do for any other types
// of symbols.
//
// Section garbage collection and Identical Code Folding work on graphs
// where sections or section pieces are vertices and relocations are
// edges. To make it easy to handle them, we rewrite symbols and
// relocations so that each non-absolute symbol always refers to either
// a non-mergeable section or a section piece.
//
// We do that only for SHF_ALLOC sections because GC and ICF work only
// on memory-allocated sections. Non-memory-allocated mergeable sections
// are not handled here for performance reasons.
template
void ObjectFile::reattach_section_pieces(Context &ctx) {
  // Attach section pieces to symbols.
  for (i64 i = 1; i < this->elf_syms.size(); i++) {
    Symbol &sym = *this->symbols[i];
    const ElfSym &esym = this->elf_syms[i];

    if (esym.is_abs() || esym.is_common() || esym.is_undef())
      continue;

    i64 shndx = get_shndx(esym);
    std::unique_ptr> &m = mergeable_sections[shndx];
    if (!m || !m->parent.resolved)
      continue;

    SectionFragment *frag;
    i64 frag_offset;
    std::tie(frag, frag_offset) = m->get_fragment(esym.st_value);

    if (!frag)
      Fatal(ctx) << *this << ": bad symbol value: " << esym.st_value;

    // From now on the symbol's value is relative to the fragment.
    sym.set_frag(frag);
    sym.value = frag_offset;
  }

  // Compute the size of frag_syms.
  i64 nfrag_syms = 0;
  for (std::unique_ptr> &isec : sections)
    if (isec && (isec->shdr().sh_flags & SHF_ALLOC))
      for (ElfRel &r : isec->get_rels(ctx))
        if (const ElfSym &esym = this->elf_syms[r.r_sym];
            esym.st_type == STT_SECTION)
          if (mergeable_sections[get_shndx(esym)])
            nfrag_syms++;

  this->frag_syms.resize(nfrag_syms);

  // For each relocation referring to a mergeable section symbol, we
  // create a new dummy non-section symbol and redirect the relocation
  // to the newly created symbol.
  i64 idx = 0;
  for (std::unique_ptr> &isec : sections) {
    if (isec && (isec->shdr().sh_flags & SHF_ALLOC)) {
      for (ElfRel &r : isec->get_rels(ctx)) {
        const ElfSym &esym = this->elf_syms[r.r_sym];
        if (esym.st_type != STT_SECTION)
          continue;

        i64 shndx = get_shndx(esym);
        std::unique_ptr> &m = mergeable_sections[shndx];
        if (!m)
          continue;

        assert(m->parent.resolved);

        i64 r_addend = get_addend(*isec, r);

        SectionFragment *frag;
        i64 in_frag_offset;
        std::tie(frag, in_frag_offset) = m->get_fragment(esym.st_value + r_addend);

        if (!frag)
          Fatal(ctx) << *this << ": bad relocation at " << r.r_sym;

        Symbol &sym = this->frag_syms[idx];
        sym.file = this;
        sym.set_name("");
        sym.sym_idx = r.r_sym;
        sym.visibility = STV_HIDDEN;
        sym.set_frag(frag);
        // The addend is subtracted here so that symbol value plus addend
        // equals the offset within the fragment.
        sym.value = in_frag_offset - r_addend;
        // Dummy symbols are appended to `symbols` after the regular ones
        // (see the end of this function), hence the index.
        r.r_sym = this->elf_syms.size() + idx;
        idx++;
      }
    }
  }

  assert(idx == this->frag_syms.size());

  for (Symbol &sym : this->frag_syms)
    this->symbols.push_back(&sym);
}

// Reads the symbol table, if there is one, and then initializes sections
// and symbols and sorts relocations, in that order.
template
void ObjectFile::parse(Context &ctx) {
  sections.resize(this->elf_sections.size());
  mergeable_sections.resize(sections.size());

  symtab_sec = this->find_section(SHT_SYMTAB);

  if (symtab_sec) {
    // In ELF, all local symbols precede global symbols in the symbol table.
    // sh_info has an index of the first global symbol.
    this->first_global = symtab_sec->sh_info;
    this->elf_syms = this->template get_data>(ctx, *symtab_sec);
    this->symbol_strtab = this->get_string(ctx, symtab_sec->sh_link);

    if (ElfShdr *shdr = this->find_section(SHT_SYMTAB_SHNDX))
      symtab_shndx_sec = this->template get_data>(ctx, *shdr);
  }

  initialize_sections(ctx);
  initialize_symbols(ctx);
  sort_relocations(ctx);
}

// Symbols with higher priorities overwrite symbols with lower priorities.
// Here is the list of priorities, from the highest to the lowest.
//
//  1. Strong defined symbol
//  2. Weak defined symbol
//  3. Strong defined symbol in a DSO/archive
//  4. Weak defined symbol in a DSO/archive
//  5. Common symbol
//  6. Common symbol in an archive
//  7.
Unclaimed (nonexistent) symbol // // Ties are broken by file priority. // // Note that the above priorities are based on heuristics and not on exact // science. We tried several different orders and settled on the current // one just because it avoids link errors in all programs we've tested. template static u64 get_rank(InputFile *file, const ElfSym &esym, bool is_in_archive) { auto get_sym_rank = [&] { if (esym.is_common()) { assert(!file->is_dso); return is_in_archive ? 6 : 5; } if (file->is_dso || is_in_archive) return (esym.st_bind == STB_WEAK) ? 4 : 3; if (esym.st_bind == STB_WEAK) return 2; return 1; }; return (get_sym_rank() << 24) + file->priority; } template static u64 get_rank(const Symbol &sym) { if (!sym.file) return 7 << 24; return get_rank(sym.file, sym.esym(), !sym.file->is_reachable); } // Symbol's visibility is set to the most restrictive one. For example, // if one input file has a defined symbol `foo` with the default // visibility and the other input file has an undefined symbol `foo` // with the hidden visibility, the resulting symbol is a hidden defined // symbol. 
// Lowers sym.visibility to `visibility` if the latter is more
// restrictive. STV_INTERNAL is treated as STV_HIDDEN. The order, from
// most to least restrictive, is hidden, protected, default.
template
void
ObjectFile::merge_visibility(Context &ctx, Symbol &sym, u8 visibility) {
  // Canonicalize visibility
  if (visibility == STV_INTERNAL)
    visibility = STV_HIDDEN;

  // A smaller number means a more restrictive visibility.
  auto priority = [&](u8 visibility) {
    switch (visibility) {
    case STV_HIDDEN:
      return 1;
    case STV_PROTECTED:
      return 2;
    case STV_DEFAULT:
      return 3;
    }
    Fatal(ctx) << *this << ": unknown symbol visibility: " << sym;
  };

  update_minimum(sym.visibility, visibility, [&](u8 a, u8 b) {
    return priority(a) < priority(b);
  });
}

// Prints one "trace-symbol:" line saying whether `file` defines `sym`,
// weakly references it, or references it.
template
static void print_trace_symbol(Context &ctx, InputFile &file,
                               const ElfSym &esym, Symbol &sym) {
  if (!esym.is_undef())
    Out(ctx) << "trace-symbol: " << file << ": definition of " << sym;
  else if (esym.is_weak())
    Out(ctx) << "trace-symbol: " << file << ": weak reference to " << sym;
  else
    Out(ctx) << "trace-symbol: " << file << ": reference to " << sym;
}

// For every global symbol this file defines, claims the symbol if this
// file's definition ranks better (lower get_rank() value) than the
// definition the symbol is currently bound to. Definitions in sections
// that are missing or not alive are skipped.
template
void ObjectFile::resolve_symbols(Context &ctx) {
  for (i64 i = this->first_global; i < this->elf_syms.size(); i++) {
    Symbol &sym = *this->symbols[i];
    const ElfSym &esym = this->elf_syms[i];

    if (esym.is_undef())
      continue;

    InputSection *isec = nullptr;
    if (!esym.is_abs() && !esym.is_common()) {
      isec = get_section(esym);
      if (!isec || !isec->is_alive)
        continue;
    }

    // The comparison and the update are done while holding sym.mu.
    std::scoped_lock lock(sym.mu);
    if (get_rank(this, esym, !this->is_reachable) < get_rank(sym)) {
      sym.file = this;
      sym.set_input_section(isec);
      sym.value = esym.st_value;
      sym.sym_idx = i;
      sym.ver_idx = ctx.default_version;
      sym.is_weak = esym.is_weak();
      sym.is_versioned_default = false;
    }
  }
}

// Visits this file's global symbols, merges their visibility, and passes
// to `feeder` every file that becomes reachable because this file
// references a symbol it defines.
template
void
ObjectFile::mark_live_objects(Context &ctx,
                              std::function *)> feeder) {
  assert(this->is_reachable);

  for (i64 i = this->first_global; i < this->elf_syms.size(); i++) {
    const ElfSym &esym = this->elf_syms[i];
    Symbol &sym = *this->symbols[i];

    // Definitions in a file with exclude_libs set are forced to hidden.
    if (!esym.is_undef() && exclude_libs)
      merge_visibility(ctx, sym, STV_HIDDEN);
    else
      merge_visibility(ctx, sym, esym.st_visibility);

    if (sym.is_traced)
      print_trace_symbol(ctx, *this, esym, sym);

    if (sym.file) {
      // A weak undefined reference pulls in a file only if it is a DSO.
      bool undef_ref = esym.is_undef() && (!esym.is_weak() || sym.file->is_dso);
      // A common symbol pulls in a file that has a non-common definition.
      bool common_ref = esym.is_common() && !sym.esym().is_common();

      // test_and_set() makes sure each file is fed only once.
      if ((undef_ref || common_ref) && !sym.file->is_reachable.test_and_set()) {
        feeder(sym.file);
        if (sym.is_traced)
          Out(ctx) << "trace-symbol: " << *this << " keeps " << *sym.file
                   << " for " << sym;
      }
    }
  }
}

// Scans relocations of live, memory-allocated sections and of CIE
// records in this file.
template
void ObjectFile::scan_relocations(Context &ctx) {
  // Scan relocations against section contents
  for (std::unique_ptr> &isec : sections)
    if (isec && isec->is_alive && (isec->shdr().sh_flags & SHF_ALLOC))
      isec->scan_relocations(ctx);

  // Scan relocations against exception frames
  for (CieRecord &cie : cies) {
    for (ElfRel &rel : cie.get_rels()) {
      Symbol &sym = *this->symbols[rel.r_sym];

      if (ctx.arg.pic && rel.r_type == E::R_ABS)
        Error(ctx) << *this << ": relocation " << rel << " in .eh_frame can"
                   << " not be used when making a position-independent output;"
                   << " recompile with -fPIE or -fPIC";

      // An imported symbol referenced from a CIE must be a function; it
      // is flagged as needing a PLT entry.
      if (sym.is_imported) {
        if (sym.get_type() != STT_FUNC)
          Fatal(ctx) << *this << ": " << sym
                     << ": .eh_frame CIE record with an external data reference"
                     << " is not supported";
        sym.flags |= NEEDS_PLT;
      }
    }
  }
}

// Common symbols are used by C's tentative definitions. Tentative
// definition is an obscure C feature which allows users to omit `extern`
// from global variable declarations in a header file. For example, if you
// have a tentative definition `int foo;` in a header which is included
// into multiple translation units, `foo` will be included into multiple
// object files, but it won't cause the duplicate symbol error. Instead,
// the linker will merge them into a single instance of `foo`.
//
// If a header file contains a tentative definition `int foo;` and one of
// a C file contains a definition with initial value such as `int foo = 5;`,
// then the "real" definition wins. The symbol for the tentative definition
// will be resolved to the real definition. If there is no "real"
// definition, the tentative definition gets the default initial value 0.
//
// Tentative definitions are represented as "common symbols" in an object
// file. In this function, we allocate spaces in .common or .tls_common
// for remaining common symbols that were not resolved to usual defined
// symbols in previous passes.
template
void ObjectFile::convert_common_symbols(Context &ctx) {
  if (!has_common_symbol)
    return;

  for (i64 i = this->first_global; i < this->elf_syms.size(); i++) {
    if (!this->elf_syms[i].is_common())
      continue;

    // Skip the symbol if some other file owns it.
    Symbol &sym = *this->symbols[i];
    if (sym.file != this) {
      if (ctx.arg.warn_common)
        Warn(ctx) << *this << ": multiple common symbols: " << sym;
      continue;
    }

    // Create a synthetic SHT_NOBITS section that holds just this symbol.
    ElfShdr shdr = {};
    if (sym.get_type() == STT_TLS)
      shdr.sh_flags = SHF_ALLOC | SHF_WRITE | SHF_TLS;
    else
      shdr.sh_flags = SHF_ALLOC | SHF_WRITE;

    // The alignment is taken from the common symbol's st_value.
    shdr.sh_type = SHT_NOBITS;
    shdr.sh_size = this->elf_syms[i].st_size;
    shdr.sh_addralign = this->elf_syms[i].st_value;
    elf_sections2.push_back(shdr);

    // Synthetic section headers are numbered after the real ones.
    i64 idx = this->elf_sections.size() + elf_sections2.size() - 1;
    auto isec = std::make_unique>(ctx, *this, idx);

    sym.set_input_section(isec.get());
    sym.value = 0;
    sym.sym_idx = i;
    sym.ver_idx = ctx.default_version;
    sym.is_weak = false;
    sections.push_back(std::move(isec));
  }
}

// Decides whether a local symbol gets an entry in the output symbol
// table.
template
static bool should_write_to_local_symtab(Context &ctx, Symbol &sym) {
  if (sym.get_type() == STT_SECTION)
    return false;

  // Local symbols are discarded if --discard-local is given or they
  // are in a mergeable section. I *believe* we exclude symbols in
  // mergeable sections because (1) there are too many and (2) they are
  // merged, so their origins shouldn't matter, but I don't really
  // know the rationale. Anyway, this is the behavior of the
  // traditional linkers.
  if (sym.name().starts_with(".L") || sym.name() == "L0\001") {
    if (ctx.arg.discard_locals)
      return false;

    if (InputSection *isec = sym.get_input_section())
      if (isec->shdr().sh_flags & SHF_MERGE)
        return false;
  }

  return true;
}

// Decides which of this file's symbols go to the output symbol table and
// accumulates the resulting counts (num_local_symtab, num_global_symtab)
// and string table size (strtab_size).
template
void ObjectFile::compute_symtab_size(Context &ctx) {
  this->output_sym_indices.resize(this->elf_syms.size(), -1);

  // A symbol is alive if the fragment or section it belongs to is alive.
  // Symbols that belong to neither are always alive.
  auto is_alive = [](Symbol &sym) -> bool {
    if (SectionFragment *frag = sym.get_frag())
      return frag->is_alive;
    if (InputSection *isec = sym.get_input_section())
      return isec->is_alive;
    return true;
  };

  // Compute the size of local symbols
  if (!ctx.arg.discard_all && !ctx.arg.strip_all && !ctx.arg.retain_symbols_file) {
    for (i64 i = 1; i < this->first_global; i++) {
      Symbol &sym = *this->symbols[i];

      if (is_alive(sym) && should_write_to_local_symtab(ctx, sym)) {
        this->strtab_size += sym.name().size() + 1;
        this->output_sym_indices[i] = this->num_local_symtab++;
        sym.write_to_symtab = true;
      }
    }
  }

  // Compute the size of global symbols.
  for (i64 i = this->first_global; i < this->elf_syms.size(); i++) {
    Symbol &sym = *this->symbols[i];

    if (sym.file == this && is_alive(sym) &&
        (!ctx.arg.retain_symbols_file || sym.write_to_symtab)) {
      this->strtab_size += sym.name().size() + 1;
      // Global symbols can be demoted to local symbols based on visibility,
      // version scripts etc.
      if (sym.is_local(ctx))
        this->output_sym_indices[i] = this->num_local_symtab++;
      else
        this->output_sym_indices[i] = this->num_global_symtab++;
      sym.write_to_symtab = true;
    }
  }
}

// Writes the symbols selected by compute_symtab_size() into the output
// symbol table and string table, starting at this file's
// local_symtab_idx, global_symtab_idx and strtab_offset.
template
void ObjectFile::populate_symtab(Context &ctx) {
  ElfSym *symtab_base = (ElfSym *)(ctx.buf + ctx.symtab->shdr.sh_offset);

  u8 *strtab_base = ctx.buf + ctx.strtab->shdr.sh_offset;
  i64 strtab_off = this->strtab_offset;

  auto write_sym = [&](Symbol &sym, i64 idx) {
    // If the output has an extended section index table, pass the slot
    // that corresponds to this symbol.
    U32 *xindex = nullptr;
    if (ctx.symtab_shndx)
      xindex = (U32 *)(ctx.buf + ctx.symtab_shndx->shdr.sh_offset) + idx;

    symtab_base[idx] = *to_output_esym(ctx, sym, strtab_off, xindex);
    strtab_off += write_string(strtab_base + strtab_off, sym.name());
  };

  i64 local_idx = this->local_symtab_idx;
  i64 global_idx = this->global_symtab_idx;

  for (i64 i = 1; i < this->first_global; i++)
    if (Symbol &sym = *this->symbols[i]; sym.write_to_symtab)
      write_sym(sym, local_idx++);

  for (i64 i = this->first_global; i < this->elf_syms.size(); i++) {
    Symbol &sym = *this->symbols[i];
    if (sym.file == this && sym.write_to_symtab)
      write_sym(sym, sym.is_local(ctx) ? local_idx++ : global_idx++);
  }
}

// Prints a file name for diagnostics. An object file that has an archive
// name is printed as "archive(member)".
template
std::ostream &operator<<(std::ostream &out, const InputFile &file) {
  if (file.is_dso) {
    out << path_clean(file.filename);
    return out;
  }

  ObjectFile *obj = (ObjectFile *)&file;
  if (obj->archive_name == "")
    out << path_clean(obj->filename);
  else
    out << path_clean(obj->archive_name) << "(" << obj->filename + ")";
  return out;
}

// Returns the DT_SONAME string if the file has one. Otherwise returns the
// file name: as given if mf->given_fullpath is set, or else only its
// path_filename() part.
template
std::string SharedFile::get_soname(Context &ctx) {
  if (ElfShdr *sec = this->find_section(SHT_DYNAMIC))
    for (ElfDyn &dyn : this->template get_data>(ctx, *sec))
      if (dyn.d_tag == DT_SONAME)
        return this->get_string(ctx, sec->sh_link).data() + dyn.d_val;

  if (this->mf->given_fullpath)
    return this->filename;

  return path_filename(this->filename);
}

// Reads the dynamic symbol table of a shared object and creates Symbol
// objects for its entries, taking symbol versions into account. Does
// nothing if the file has no SHT_DYNSYM section.
template
void SharedFile::parse(Context &ctx) {
  symtab_sec = this->find_section(SHT_DYNSYM);
  if (!symtab_sec)
    return;

  this->symbol_strtab = this->get_string(ctx, symtab_sec->sh_link);
  soname = get_soname(ctx);
  version_strings = read_verdef(ctx);

  // Read a symbol table.
  std::span> esyms = this->template get_data>(ctx, *symtab_sec);

  std::span> vers;
  if (ElfShdr *sec = this->find_section(SHT_GNU_VERSYM))
    vers = this->template get_data>(ctx, *sec);

  for (i64 i = symtab_sec->sh_info; i < esyms.size(); i++) {
    // Version index of the symbol, with the "hidden" bit masked off.
    u16 ver;
    if (vers.empty() || esyms[i].is_undef())
      ver = VER_NDX_GLOBAL;
    else
      ver = vers[i] & ~VERSYM_HIDDEN;

    if (ver == VER_NDX_LOCAL)
      continue;

    this->elf_syms2.push_back(esyms[i]);
    this->versyms.push_back(ver);

    std::string_view name = this->symbol_strtab.data() + esyms[i].st_name;

    // Looks up the symbol under the key "name@VERSION".
    auto get_versioned_sym = [&] {
      std::string_view key = save_string(
        ctx, std::string(name) + "@" + std::string(version_strings[ver]));
      return get_symbol(ctx, key, name);
    };

    // Symbol resolution involving symbol versioning is tricky because one
    // symbol can be resolved with two different identifiers. Among
    // symbols with the same name but different versions, one of them is
    // always marked as the "default" one. This symbol is often denoted
    // with two atsigns as `foo@@VERSION` and can be referred to either
    // as `foo` or `foo@VERSION`. No other symbols have two names like that.
    //
    // On the contrary, a versioned non-default symbol can be referred to
    // only with an explicit version suffix, e.g., `foo@VERSION`.
    //
    // Here is how we resolve versioned default symbols. We resolve `foo`
    // and `foo@VERSION` as usual, but with information to forward
    // references to `foo@VERSION` to `foo`. After name resolution, we
    // visit all symbol references to redirect `foo@VERSION` to `foo`.
    if (vers.empty() || ver == VER_NDX_GLOBAL) {
      // Unversioned symbol
      this->symbols.push_back(get_symbol(ctx, name));
      this->symbols2.push_back(nullptr);
    } else if (vers[i] & VERSYM_HIDDEN) {
      // Versioned non-default symbol
      this->symbols.push_back(get_versioned_sym());
      this->symbols2.push_back(nullptr);
    } else {
      // Versioned default symbol
      this->symbols.push_back(get_symbol(ctx, name));
      this->symbols2.push_back(get_versioned_sym());
    }
  }

  // Only the symbols kept above are visible from here on.
  this->elf_syms = elf_syms2;
  this->first_global = 0;

  static Counter counter("dso_syms");
  counter += this->elf_syms.size();
}

// Returns the strings of all DT_NEEDED entries of this shared object.
template
std::vector SharedFile::get_dt_needed(Context &ctx) {
  std::vector vec;
  if (ElfShdr *sec = this->find_section(SHT_DYNAMIC))
    for (ElfDyn &dyn : this->template get_data>(ctx, *sec))
      if (dyn.d_tag == DT_NEEDED)
        vec.push_back(this->get_string(ctx, sec->sh_link).data() + dyn.d_val);
  return vec;
}

// Returns the string of the first DT_AUDIT entry, or an empty string if
// there is none.
template
std::string_view SharedFile::get_dt_audit(Context &ctx) {
  if (ElfShdr *sec = this->find_section(SHT_DYNAMIC))
    for (ElfDyn &dyn : this->template get_data>(ctx, *sec))
      if (dyn.d_tag == DT_AUDIT)
        return this->get_string(ctx, sec->sh_link).data() + dyn.d_val;
  return "";
}

// Symbol versioning is a GNU extension to the ELF file format. I don't
// particularly like the feature as it complicates the semantics of
// dynamic linking, but we need to support it anyway because it is
// mandatory on glibc-based systems such as most Linux distros.
//
// Let me explain what symbol versioning is. Symbol versioning is a
// mechanism to allow multiple symbols of the same name but of different
// versions to live together in a shared object file. It's convenient if
// you want to make an API-breaking change to some function but want to
// keep old programs working with the newer libraries.
//
// With symbol versioning, dynamic symbols are resolved by (name, version)
// tuple instead of just by name. For example, glibc 2.35 defines two
// different versions of `posix_spawn`, `posix_spawn` of version
// "GLIBC_2.15" and that of version "GLIBC_2.2.5". Any executable that
// uses `posix_spawn` is linked either to that of "GLIBC_2.15" or that of
// "GLIBC_2.2.5".
//
// Versions are just strings, and no ordering is defined between them.
// For example, "GLIBC_2.15" is not considered a newer version of
// "GLIBC_2.2.5" or vice versa. They are considered just different.
//
// If a shared object file has versioned symbols, it contains a parallel
// array for the symbol table. Version strings can be found in that
// parallel table.
//
// One version is considered the "default" version for each shared object.
// If an undefined symbol `foo` is resolved to a symbol defined by the
// shared object, it's marked so that it'll be resolved to (`foo`, the
// default version of the library) at load-time.
template std::vector SharedFile::read_verdef(Context &ctx) { ElfShdr *verdef_sec = this->find_section(SHT_GNU_VERDEF); if (!verdef_sec) return {}; std::string_view verdef = this->get_string(ctx, *verdef_sec); std::string_view strtab = this->get_string(ctx, verdef_sec->sh_link); std::vector vec; u8 *ptr = (u8 *)verdef.data(); for (;;) { ElfVerdef *ver = (ElfVerdef *)ptr; if (ver->vd_ndx == VER_NDX_UNSPECIFIED) Fatal(ctx) << *this << ": symbol version too large"; if (vec.size() <= ver->vd_ndx) vec.resize(ver->vd_ndx + 1); ElfVerdaux *aux = (ElfVerdaux *)(ptr + ver->vd_aux); vec[ver->vd_ndx] = strtab.data() + aux->vda_name; if (!ver->vd_next) break; ptr += ver->vd_next; } return vec; } template void SharedFile::resolve_symbols(Context &ctx) { for (i64 i = 0; i < this->symbols.size(); i++) { Symbol &sym = *this->symbols[i]; const ElfSym &esym = this->elf_syms[i]; if (esym.is_undef() || sym.skip_dso) continue; std::scoped_lock lock(sym.mu); if (get_rank(this, esym, false) < get_rank(sym)) { sym.file = this; sym.origin = 0; sym.value = esym.st_value; sym.sym_idx = i; sym.ver_idx = versyms[i]; sym.is_weak = true; sym.is_versioned_default = false; } // A symbol with the default version is a special case because, unlike // other symbols, the symbol can be referred by two names, `foo` and // `foo@VERSION`. Here, we resolve `foo@VERSOIN` as a proxy of `foo`. Symbol *sym2 = this->symbols2[i]; if (sym2 && sym2 != &sym) { std::scoped_lock lock2(sym2->mu); if (get_rank(this, esym, false) < get_rank(*sym2)) { sym2->file = this; sym2->origin = (uintptr_t)&sym; sym2->sym_idx = i; sym2->is_versioned_default = true; } } } } template void SharedFile::mark_live_objects(Context &ctx, std::function *)> feeder) { for (i64 i = 0; i < this->elf_syms.size(); i++) { const ElfSym &esym = this->elf_syms[i]; Symbol &sym = *this->symbols[i]; if (sym.is_traced) print_trace_symbol(ctx, *this, esym, sym); // We follow undefined symbols in a DSO only to handle // --no-allow-shlib-undefined. 
if (esym.is_undef() && !esym.is_weak() && sym.file && (!sym.file->is_dso || !ctx.arg.allow_shlib_undefined) && !sym.file->is_reachable.test_and_set()) { feeder(sym.file); if (sym.is_traced) Out(ctx) << "trace-symbol: " << *this << " keeps " << *sym.file << " for " << sym; } } } template std::span *> SharedFile::get_symbols_at(Symbol *sym) { assert(sym->file == this); std::call_once(init_sorted_syms, [&] { for (Symbol *sym : this->symbols) if (sym->file == this) sorted_syms.push_back(sym); tbb::parallel_sort(sorted_syms.begin(), sorted_syms.end(), [](Symbol *a, Symbol *b) { const ElfSym &x = a->esym(); const ElfSym &y = b->esym(); return std::tuple{x.st_value, &x} < std::tuple{y.st_value, &y}; }); }); auto [begin, end] = ranges::equal_range(sorted_syms, sym->esym().st_value, {}, [](Symbol *x) { return x->esym().st_value; }); return {&*begin, (size_t)(end - begin)}; } // Infer an alignment of a DSO symbol. An alignment of a symbol in other // .so is not something we usually care about, but when we create a copy // relocation for a symbol, we need to preserve its alignment requirement. // // Symbol alignment is not explicitly represented in an ELF file. In this // function, we conservatively infer it from a symbol address and a // section alignment requirement. 
template i64 SharedFile::get_alignment(Symbol *sym) { ElfShdr &shdr = this->elf_sections[sym->esym().st_shndx]; i64 align = std::max(1, shdr.sh_addralign); if (sym->value) align = std::min(align, 1LL << std::countr_zero(sym->value)); return align; } template bool SharedFile::is_readonly(Symbol *sym) { ElfEhdr &ehdr = *(ElfEhdr *)this->mf->data; std::span> phdrs((ElfPhdr *)(this->mf->data + ehdr.e_phoff), ehdr.e_phnum); u64 val = sym->esym().st_value; for (ElfPhdr &phdr : phdrs) if ((phdr.p_type == PT_LOAD || phdr.p_type == PT_GNU_RELRO) && !(phdr.p_flags & PF_W) && phdr.p_vaddr <= val && val < phdr.p_vaddr + phdr.p_memsz) return true; return false; } template void SharedFile::compute_symtab_size(Context &ctx) { this->output_sym_indices.resize(this->elf_syms.size(), -1); // Compute the size of global symbols. for (i64 i = this->first_global; i < this->symbols.size(); i++) { Symbol &sym = *this->symbols[i]; if (sym.file == this && (sym.is_imported || sym.is_exported) && (!ctx.arg.retain_symbols_file || sym.write_to_symtab)) { this->strtab_size += sym.name().size() + 1; this->output_sym_indices[i] = this->num_global_symtab++; sym.write_to_symtab = true; } } } template void SharedFile::populate_symtab(Context &ctx) { ElfSym *symtab = (ElfSym *)(ctx.buf + ctx.symtab->shdr.sh_offset) + this->global_symtab_idx; u8 *strtab = ctx.buf + ctx.strtab->shdr.sh_offset; i64 strtab_off = this->strtab_offset; for (i64 i = 0; Symbol *sym : this->get_global_syms()) { if (sym->file != this || !sym->write_to_symtab) continue; U32 *xindex = nullptr; if (ctx.symtab_shndx) xindex = (U32 *)(ctx.buf + ctx.symtab_shndx->shdr.sh_offset) + this->global_symtab_idx + i; *symtab++ = *to_output_esym(ctx, *sym, strtab_off, xindex); strtab_off += write_string(strtab + strtab_off, sym->name()); i++; } } using E = MOLD_TARGET; template class InputFile; template class ObjectFile; template class SharedFile; template Symbol *get_symbol(Context &, std::string_view, std::string_view); template Symbol 
*get_symbol(Context &, std::string_view); template std::string_view demangle(const Symbol &); template ComdatGroup *insert_comdat_group(Context &, std::string_view); template std::ostream &operator<<(std::ostream &, const Symbol &); template std::ostream &operator<<(std::ostream &, const InputFile &); } // namespace mold ================================================ FILE: src/input-sections.cc ================================================ #include "mold.h" #include #include namespace mold { static i64 to_p2align(u64 alignment) { if (alignment == 0) return 0; return std::countr_zero(alignment); } template bool cie_equals(const CieRecord &a, const CieRecord &b) { if (a.get_contents() != b.get_contents()) return false; std::span> x = a.get_rels(); std::span> y = b.get_rels(); if (x.size() != y.size()) return false; for (i64 i = 0; i < x.size(); i++) if (x[i].r_offset - a.input_offset != y[i].r_offset - b.input_offset || x[i].r_type != y[i].r_type || a.file.symbols[x[i].r_sym] != b.file.symbols[y[i].r_sym] || get_addend(a.input_section, x[i]) != get_addend(b.input_section, y[i])) return false; return true; } template InputSection::InputSection(Context &ctx, ObjectFile &file, i64 shndx) : file(file), shndx(shndx) { if (shndx < file.elf_sections.size()) contents = {(char *)file.mf->data + shdr().sh_offset, (size_t)shdr().sh_size}; if (shdr().sh_flags & SHF_COMPRESSED) { ElfChdr &chdr = *(ElfChdr *)&contents[0]; sh_size = chdr.ch_size; p2align = to_p2align(chdr.ch_addralign); } else { sh_size = shdr().sh_size; p2align = to_p2align(shdr().sh_addralign); } // Sections may have been compressed. We usually uncompress them // directly into the mmap'ed output file, but we want to uncompress // early for REL-type ELF types to read relocation addends from // section contents. For RELA-type, we don't need to do this because // addends are in relocations. // // SH-4 stores addends to sections despite being RELA, which is a // special (and buggy) case. 
if constexpr (!E::is_rela || is_sh4) uncompress(ctx); } template void InputSection::uncompress(Context &ctx) { if (!(shdr().sh_flags & SHF_COMPRESSED) || uncompressed) return; u8 *buf = new u8[sh_size]; copy_contents_to(ctx, buf, sh_size); contents = std::string_view((char *)buf, sh_size); ctx.string_pool.emplace_back(buf); uncompressed = true; } template void InputSection::copy_contents_to(Context &ctx, u8 *buf, i64 sz) { if (!(shdr().sh_flags & SHF_COMPRESSED) || uncompressed) { memcpy(buf, contents.data(), sz); return; } if (contents.size() < sizeof(ElfChdr)) Fatal(ctx) << *this << ": corrupted compressed section"; ElfChdr &hdr = *(ElfChdr *)&contents[0]; std::string_view data = contents.substr(sizeof(ElfChdr)); switch (hdr.ch_type) { case ELFCOMPRESS_ZLIB: { z_stream s = {}; inflateInit(&s); s.next_in = (u8 *)data.data(); s.avail_in = data.size(); s.next_out = buf; s.avail_out = sz; int r; while (s.total_out < sz && (r = inflate(&s, Z_NO_FLUSH)) == Z_OK); if (s.total_out < sz && r != Z_STREAM_END) Fatal(ctx) << *this << ": uncompress failed: " << s.msg; inflateEnd(&s); msan_unpoison(buf, sz); break; } case ELFCOMPRESS_ZSTD: { ZSTD_DCtx *dctx = ZSTD_createDCtx(); ZSTD_inBuffer in = { data.data(), data.size() }; ZSTD_outBuffer out = { buf, (size_t)sz }; while (out.pos < out.size) { size_t r = ZSTD_decompressStream(dctx, &out, &in); if (ZSTD_isError(r)) Fatal(ctx) << *this << ": uncompress failed: " << ZSTD_getErrorName(r); if (r == 0 && out.pos < out.size) Fatal(ctx) << *this << ": uncompress failed: premature end of input"; } ZSTD_freeDCtx(dctx); msan_unpoison(buf, sz); break; } default: Fatal(ctx) << *this << ": unsupported compression type: 0x" << std::hex << hdr.ch_type; } } typedef enum : u8 { NONE, ERROR, COPYREL, PLT, CPLT } Action; template static void do_action(Context &ctx, Action action, InputSection &isec, Symbol &sym, const ElfRel &rel) { switch (action) { case NONE: break; case ERROR: Error(ctx) << isec << ": " << rel << " relocation at offset 0x" 
<< std::hex << rel.r_offset << " against symbol `" << sym << "' can not be used; recompile with -fPIC"; break; case COPYREL: sym.flags |= NEEDS_COPYREL; break; case PLT: // Create a PLT entry sym.flags |= NEEDS_PLT; break; case CPLT: // Create a canonical PLT entry sym.flags |= NEEDS_CPLT; break; } } template static inline i64 get_output_type(Context &ctx) { if (ctx.arg.shared) return 0; if (ctx.arg.pie) return 1; return 2; } template static inline i64 get_sym_type(Symbol &sym) { if (sym.is_absolute()) return 0; if (!sym.is_imported) return 1; if (sym.get_type() != STT_FUNC) return 2; return 3; } template void InputSection::scan_pcrel(Context &ctx, Symbol &sym, const ElfRel &rel) { // This is for PC-relative relocations (e.g. R_X86_64_PC32). // We cannot promote them to dynamic relocations because the dynamic // linker generally does not support PC-relative relocations. static Action table[][4] = { // Absolute Local Imported data Imported code { ERROR, NONE, ERROR, PLT }, // Shared object { ERROR, NONE, COPYREL, CPLT }, // Position-independent exec { NONE, NONE, COPYREL, CPLT }, // Position-dependent exec }; Action action = table[get_output_type(ctx)][get_sym_type(sym)]; do_action(ctx, action, *this, sym, rel); } template void InputSection::scan_absrel(Context &ctx, Symbol &sym, const ElfRel &rel) { // This is a decision table for absolute relocations that is smaller // than the pointer size (e.g. R_X86_64_32). Since the dynamic linker // generally does not support dynamic relocations smaller than the // pointer size, we need to report an error if a relocation cannot be // resolved at link-time. 
static Action table[][4] = { // Absolute Local Imported data Imported code { NONE, ERROR, ERROR, ERROR }, // Shared object { NONE, ERROR, ERROR, ERROR }, // Position-independent exec { NONE, NONE, COPYREL, CPLT }, // Position-dependent exec }; Action action = table[get_output_type(ctx)][get_sym_type(sym)]; do_action(ctx, action, *this, sym, rel); } template void InputSection::scan_tlsdesc(Context &ctx, Symbol &sym) { if (ctx.arg.static_ || (ctx.arg.relax && sym.is_tprel_linktime_const(ctx))) { // Relax TLSDESC to Local Exec. In this case, we directly materialize // a TP-relative offset, so no dynamic relocation is needed. // // TLSDESC relocs must always be relaxed for statically-linked // executables even if -no-relax is given. It is because a // statically-linked executable doesn't contain a trampoline // function needed for TLSDESC. } else if (ctx.arg.relax && sym.is_tprel_runtime_const(ctx)) { // In this condition, TP-relative offset of a thread-local variable // is known at process startup time, so we can relax TLSDESC to the // code that reads the TP-relative offset from GOT and add TP to it. sym.flags |= NEEDS_GOTTP; } else { // If no relaxation is doable, we simply create a TLSDESC dynamic // relocation. sym.flags |= NEEDS_TLSDESC; } } template void InputSection::check_tlsle(Context &ctx, Symbol &sym, const ElfRel &rel) { if (ctx.arg.shared) Error(ctx) << *this << ": relocation " << rel << " against `" << sym << "` can not be used when making a shared object;" << " recompile with -fPIC"; } template void InputSection::write_to(Context &ctx, u8 *buf) { if (shdr().sh_type == SHT_NOBITS || sh_size == 0) return; // Copy data. In RISC-V and LoongArch object files, sections are not // atomic unit of copying because of relaxation. That is, some // relocations are allowed to remove bytes from the middle of a // section and shrink the overall size of it. 
if constexpr (is_riscv || is_loongarch) { std::span deltas = extra.r_deltas; if (deltas.empty()) { // If a section is not relaxed, we can copy it as a one big chunk. copy_contents_to(ctx, buf, sh_size); } else { // A relaxed section is copied piece-wise. memcpy(buf, contents.data(), deltas[0].offset); for (i64 i = 0; i < deltas.size(); i++) { i64 offset = deltas[i].offset; i64 delta = deltas[i].delta; i64 end = (i + 1 == deltas.size()) ? contents.size() : deltas[i + 1].offset; i64 removed_bytes = get_removed_bytes(deltas, i); memcpy(buf + offset + removed_bytes - delta, contents.data() + offset + removed_bytes, end - offset - removed_bytes); } } } else { copy_contents_to(ctx, buf, sh_size); } // Apply relocations if (!ctx.arg.relocatable) { if (shdr().sh_flags & SHF_ALLOC) apply_reloc_alloc(ctx, buf); else apply_reloc_nonalloc(ctx, buf); } } // Get the name of a function containin a given offset. template std::string_view InputSection::get_func_name(Context &ctx, i64 offset) const { for (Symbol *sym : file.symbols) if (sym->file == &file) if (const ElfSym &esym = sym->esym(); esym.st_shndx == shndx && esym.st_type == STT_FUNC && esym.st_value <= offset && offset < esym.st_value + esym.st_size) return ctx.arg.demangle ? demangle(*sym) : sym->name(); return ""; } // Test if the symbol a given relocation refers to has already been resolved. // If not, record that error and returns true. template bool InputSection::record_undef_error(Context &ctx, const ElfRel &rel) { // If a relocation refers to a linker-synthesized symbol for a // section fragment, it's always been resolved. if (file.elf_syms.size() <= rel.r_sym) return false; Symbol &sym = *file.symbols[rel.r_sym]; const ElfSym &esym = file.elf_syms[rel.r_sym]; // If a symbol is defined in a comdat group, and the comdat group is // discarded, the symbol may not have an owner. It is technically an // violation of the One Definition Rule, so it is a programmer's fault. 
if (!sym.file) { Error(ctx) << *this << ": " << sym << " refers to a discarded COMDAT section" << " probably due to an ODR violation"; return true; } auto record = [&] { std::stringstream ss; if (std::string_view source = file.get_source_name(); !source.empty()) ss << ">>> referenced by " << source << "\n"; else ss << ">>> referenced by " << *this << "\n"; ss << ">>> " << file; if (std::string_view func = get_func_name(ctx, rel.r_offset); !func.empty()) ss << ":(" << func << ")"; ss << '\n'; typename decltype(ctx.undef_errors)::accessor acc; ctx.undef_errors.insert(acc, {&sym, {}}); acc->second.push_back(ss.str()); }; // A non-weak undefined symbol must be promoted to an imported symbol // or resolved to an defined symbol. Otherwise, we need to report an // error or warn on it. // // Every ELF file has an absolute local symbol as its first symbol. // Referring to that symbol is always valid. bool is_undef = esym.is_undef() && !esym.is_weak() && sym.sym_idx; if (is_undef && sym.esym().is_undef()) { if (ctx.arg.unresolved_symbols == UNRESOLVED_ERROR && !sym.is_imported) { record(); return true; } if (ctx.arg.unresolved_symbols == UNRESOLVED_WARN) { record(); return false; } } return false; } template MergeableSection::MergeableSection(Context &ctx, MergedSection &parent, std::unique_ptr> &isec) : parent(parent), p2align(isec->p2align), input_section(std::move(isec)) { input_section->uncompress(ctx); std::scoped_lock lock(parent.mu); parent.members.push_back(this); } static size_t find_null(std::string_view data, i64 pos, i64 entsize) { if (entsize == 1) return data.find('\0', pos); for (; pos <= data.size() - entsize; pos += entsize) if (data.substr(pos, entsize).find_first_not_of('\0') == data.npos) return pos; return data.npos; } // Mergeable sections (sections with SHF_MERGE bit) typically contain // string literals. 
Linker is expected to split the section contents // into null-terminated strings, merge them with mergeable strings // from other object files, and emit uniquified strings to an output // file. // // This mechanism reduces the size of an output file. If two source // files happen to contain the same string literal, the output will // contain only a single copy of it. // // It is less common than string literals, but mergeable sections can // contain fixed-sized read-only records too. // // This function splits the section contents into small pieces that we // call "section fragments". Section fragment is a unit of merging. // // We do not support mergeable sections that have relocations. template void MergeableSection::split_contents(Context &ctx) { std::string_view data = input_section->contents; if (data.size() > UINT32_MAX) Fatal(ctx) << *input_section << ": mergeable section too large"; i64 entsize = parent.shdr.sh_entsize; // Split sections if (parent.shdr.sh_flags & SHF_STRINGS) { for (i64 pos = 0; pos < data.size();) { frag_offsets.push_back(pos); size_t end = find_null(data, pos, entsize); if (end == data.npos) Fatal(ctx) << *input_section << ": string is not null terminated"; pos = end + entsize; } } else { if (data.size() % entsize) Fatal(ctx) << *input_section << ": section size is not multiple of sh_entsize"; frag_offsets.reserve(data.size() / entsize); for (i64 pos = 0; pos < data.size(); pos += entsize) frag_offsets.push_back(pos); } // Compute hashes for section pieces HyperLogLog estimator; hashes.reserve(frag_offsets.size()); for (i64 i = 0; i < frag_offsets.size(); i++) { u64 hash = hash_string(get_contents(i)); hashes.push_back(hash); estimator.insert(hash); } parent.estimator.merge(estimator); static Counter counter("string_fragments"); counter += frag_offsets.size(); } template void MergeableSection::resolve_contents(Context &ctx) { fragments.reserve(frag_offsets.size()); for (i64 i = 0; i < frag_offsets.size(); i++) 
fragments.push_back(parent.insert(ctx, get_contents(i), hashes[i], p2align)); // Reclaim memory as we'll never use this vector again hashes.clear(); hashes.shrink_to_fit(); } using E = MOLD_TARGET; template bool cie_equals(const CieRecord &, const CieRecord &); template class InputSection; template class MergeableSection; } // namespace mold ================================================ FILE: src/jobs-unix.cc ================================================ // Many build systems attempt to invoke as many linker processes as there // are cores, based on the assumption that the linker is single-threaded. // However, since mold is multi-threaded, such build systems' behavior is // not beneficial and just increases the overall peak memory usage. // On machines with limited memory, this could lead to an out-of-memory // error. // // This file implements a feature that limits the number of concurrent // mold processes to just 1 for each user. It is intended to be used as // `MOLD_JOBS=1 ninja` or `MOLD_JOBS=1 make -j$(nproc)`. 
#include "mold.h"

#include <fcntl.h>
#include <pwd.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

namespace mold {

// File descriptor holding the per-user lock, or -1 if no lock is held.
static int lock_fd = -1;

// Acquires a per-user lock file if the MOLD_JOBS environment variable is
// set to "1". Blocks until the lock becomes available. This is best
// effort: if the lock file cannot be opened or locked, we silently
// continue without a lock.
void acquire_global_lock() {
  char *jobs = getenv("MOLD_JOBS");
  if (!jobs || jobs != "1"s)
    return;

  std::string path;
  if (char *dir = getenv("XDG_RUNTIME_DIR")) {
    path = dir + "/mold-lock"s;
  } else if (struct passwd *pw = getpwuid(getuid())) {
    path = "/tmp/mold-lock-"s + pw->pw_name;
  } else {
    // getpwuid() returns null if the current uid has no passwd entry
    // (which is common in containers). Fall back to the numeric uid
    // instead of dereferencing a null pointer.
    path = "/tmp/mold-lock-"s + std::to_string(getuid());
  }

  int fd = open(path.c_str(), O_WRONLY | O_CREAT | O_CLOEXEC, 0600);
  if (fd == -1)
    return;

  if (lockf(fd, F_LOCK, 0) == -1) {
    // Do not leak the descriptor if we failed to take the lock.
    close(fd);
    return;
  }

  lock_fd = fd;
}

// Releases the lock taken by acquire_global_lock(), if any. Closing the
// file descriptor drops the lock. Safe to call more than once.
void release_global_lock() {
  if (lock_fd != -1) {
    close(lock_fd);
    lock_fd = -1;
  }
}

} // namespace mold

================================================
FILE: src/jobs-win32.cc
================================================
namespace mold {

// The global lock is not implemented on Windows; both functions are no-ops.
void acquire_global_lock() {}
void release_global_lock() {}

} // namespace mold

================================================
FILE: src/linker-script.cc
================================================
// On Linux, /usr/lib/x86_64-linux-gnu/libc.so is not actually
// a shared object file but an ASCII text file containing a linker
// script to include a "real" libc.so file. Therefore, we need to
// support a (very limited) subset of the linker script language.
#include "mold.h"

#include <cctype>

namespace mold {

// Returns the line of `input` that contains `pos`, without the trailing
// newline. `pos` must point into `input`.
static std::string_view get_line(std::string_view input, const char *pos) {
  assert(input.data() <= pos);
  assert(pos < input.data() + input.size());

  i64 start = input.rfind('\n', pos - input.data());
  if (start == input.npos)
    start = 0;
  else
    start++;

  i64 end = input.find('\n', pos - input.data());
  if (end == input.npos)
    end = input.size();

  return input.substr(start, end - start);
}

// Reports a fatal error with the offending line and a caret under the
// position `pos`, which must be a view into the script's contents.
template <typename E>
void Script<E>::error(std::string_view pos, std::string msg) {
  std::string_view input = mf->get_contents();
  std::string_view line = get_line(input, pos.data());

  i64 lineno = 1;
  for (i64 i = 0; input.data() + i < line.data(); i++)
    if (input[i] == '\n')
      lineno++;

  std::string label = mf->name + ":" + std::to_string(lineno) + ": ";
  i64 indent = strlen("mold: fatal: ") + label.size();
  i64 column = pos.data() - line.data();

  Fatal(ctx) << label << line << "\n"
             << std::string(indent + column, ' ') << "^ " << msg;
}

// Splits the script into tokens, skipping whitespace, `/* */` comments
// and `#` line comments. Quoted strings are kept with their quotes.
template <typename E>
void Script<E>::tokenize() {
  std::string_view input = mf->get_contents();

  while (!input.empty()) {
    // isspace() has undefined behavior for negative values other than
    // EOF, so a char must be converted to unsigned char first.
    if (isspace((unsigned char)input[0])) {
      input = input.substr(1);
      continue;
    }

    if (input.starts_with("/*")) {
      i64 pos = input.find("*/", 2);
      if (pos == std::string_view::npos)
        error(input, "unclosed comment");
      input = input.substr(pos + 2);
      continue;
    }

    if (input[0] == '#') {
      i64 pos = input.find("\n", 1);
      if (pos == std::string_view::npos)
        break;
      input = input.substr(pos + 1);
      continue;
    }

    if (input[0] == '"') {
      i64 pos = input.find('"', 1);
      if (pos == std::string_view::npos)
        error(input, "unclosed string literal");
      tokens.push_back(input.substr(0, pos + 1));
      input = input.substr(pos + 1);
      continue;
    }

    i64 pos = input.find_first_not_of(
      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
      "0123456789_.$/\\~=+[]*?-!^:");

    // A character outside the set above forms a single-character token.
    if (pos == 0)
      pos = 1;
    else if (pos == input.npos)
      pos = input.size();

    tokens.push_back(input.substr(0, pos));
    input = input.substr(pos);
  }
}

// Consumes the token `str` from the head of `tok` or reports an error.
template <typename E>
std::span<std::string_view>
Script<E>::skip(std::span<std::string_view> tok, std::string_view str) {
  if (tok.empty())
    Fatal(ctx) << mf->name << ": expected '" << str << "', but got EOF";
  if (tok[0] != str)
    error(tok[0], "expected '" + std::string(str) + "'");
  return tok.subspan(1);
}

// Strips the surrounding double quotes from a quoted token.
static std::string_view unquote(std::string_view s) {
  if (s.size() > 0 && s[0] == '"') {
    assert(s[s.size() - 1] == '"');
    return s.substr(1, s.size() - 2);
  }
  return s;
}

// Skips an OUTPUT_FORMAT(...) command without interpreting its arguments.
template <typename E>
std::span<std::string_view>
Script<E>::read_output_format(std::span<std::string_view> tok) {
  tok = skip(tok, "(");
  while (!tok.empty() && tok[0] != ")")
    tok = tok.subspan(1);
  if (tok.empty())
    Fatal(ctx) << mf->name << ": expected ')', but got EOF";
  return tok.subspan(1);
}

// Returns true if `path` is located strictly under the sysroot.
template <typename E>
static bool is_in_sysroot(Context<E> &ctx, std::string path) {
  std::string sysroot = ctx.arg.sysroot;
  if (sysroot.starts_with('/') && !ctx.arg.chroot.empty())
    sysroot = ctx.arg.chroot + "/" + path_clean(sysroot);

  std::string rel = std::filesystem::relative(path, sysroot).string();
  return rel != "." && !rel.starts_with("../");
}

// Resolves a file name appearing in INPUT or GROUP to an opened file.
// If `check_target` is true, files for other targets are skipped with
// a warning.
template <typename E>
MappedFile *Script<E>::resolve_path(std::string_view tok, bool check_target) {
  std::string str(unquote(tok));

  auto open = [&](const std::string &path) -> MappedFile * {
    MappedFile *mf = open_file(ctx, path);
    if (!mf)
      return nullptr;

    if (check_target) {
      std::string_view target = get_machine_type(ctx, rctx, mf);
      if (!target.empty() && target != E::name) {
        Warn(ctx) << path << ": skipping incompatible file: " << target
                  << " (e_machine " << (int)E::e_machine << ")";
        return nullptr;
      }
    }
    return mf;
  };

  // GNU ld prepends the sysroot if a pathname starts with '/' and the
  // script being processed is in the sysroot. We do the same.
  if (str.starts_with('/') && is_in_sysroot(ctx, mf->name))
    return must_open_file(ctx, ctx.arg.sysroot + str);

  if (str.starts_with('=')) {
    std::string path;
    if (ctx.arg.sysroot.empty())
      path = str.substr(1);
    else
      path = ctx.arg.sysroot + str.substr(1);
    return must_open_file(ctx, path);
  }

  if (str.starts_with("-l"))
    return find_library(ctx, rctx, str.substr(2));

  // Try the directory containing the script first for a relative path.
  if (!str.starts_with('/'))
    if (MappedFile *mf2 = open(path_clean(mf->name + "/../" + str)))
      return mf2;

  if (MappedFile *mf = open(str))
    return mf;

  for (std::string_view dir : ctx.arg.library_paths) {
    std::string path = std::string(dir) + "/" + str;
    if (MappedFile *mf = open(path))
      return mf;
  }

  error(tok, "library not found: " + str);
}

// Reads the parenthesized file list of INPUT, GROUP or AS_NEEDED and
// reads each file in it.
template <typename E>
std::span<std::string_view>
Script<E>::read_group(std::span<std::string_view> tok) {
  tok = skip(tok, "(");

  while (!tok.empty() && tok[0] != ")") {
    if (tok[0] == "AS_NEEDED") {
      bool orig = rctx.as_needed;
      rctx.as_needed = true;
      tok = read_group(tok.subspan(1));
      rctx.as_needed = orig;
      continue;
    }

    MappedFile *mf = resolve_path(tok[0], true);
    read_file(ctx, rctx, mf);
    tok = tok.subspan(1);
  }

  if (tok.empty())
    Fatal(ctx) << mf->name << ": expected ')', but got EOF";
  return tok.subspan(1);
}

// Parses the file as a linker script and executes its commands.
template <typename E>
void Script<E>::parse_linker_script() {
  std::call_once(once, [&] { tokenize(); });
  std::span<std::string_view> tok = tokens;

  while (!tok.empty()) {
    if (tok[0] == "OUTPUT_FORMAT") {
      tok = read_output_format(tok.subspan(1));
    } else if (tok[0] == "INPUT" || tok[0] == "GROUP") {
      tok = read_group(tok.subspan(1));
    } else if (tok[0] == "VERSION") {
      tok = tok.subspan(1);
      tok = skip(tok, "{");
      tok = read_version_script(tok);
      tok = skip(tok, "}");
    } else if (tok.size() > 3 && tok[1] == "=" && tok[3] == ";") {
      // `sym1 = sym2;` is handled in the same way as --defsym.
      ctx.arg.defsyms.emplace_back(get_symbol(ctx, unquote(tok[0])),
                                   get_symbol(ctx, unquote(tok[2])));
      tok = tok.subspan(4);
    } else if (tok[0] == ";") {
      tok = tok.subspan(1);
    } else {
      error(tok[0], "unknown linker script token");
    }
  }
}

// Returns the target name deduced from the beginning of the script,
// either from OUTPUT_FORMAT or from the first file of INPUT/GROUP.
// Returns an empty string if it cannot be deduced.
template <typename E>
std::string_view Script<E>::get_script_output_type() {
  std::call_once(once, [&] { tokenize(); });
  std::span<std::string_view> tok = tokens;

  if (tok.size() >= 3 && tok[0] == "OUTPUT_FORMAT" && tok[1] == "(") {
    if (tok[2] == "elf64-x86-64")
      return X86_64::name;
    if (tok[2] == "elf32-i386")
      return I386::name;
  }

  if (tok.size() >= 3 && (tok[0] == "INPUT" || tok[0] == "GROUP") &&
      tok[1] == "(")
    if (MappedFile *mf = resolve_path(tok[2], false))
      return get_machine_type(ctx, rctx, mf);
  return "";
}

// Consumes `label:` (written as either one or two tokens) from the head
// of `tok`. Returns true if the label was found.
static bool read_label(std::span<std::string_view> &tok, std::string label) {
  if (tok.size() >= 1 && tok[0] == label + ":") {
    tok = tok.subspan(1);
    return true;
  }

  if (tok.size() >= 2 && tok[0] == label && tok[1] == ":") {
    tok = tok.subspan(2);
    return true;
  }
  return false;
}

// Reads the contents of a version block up to (but not including) the
// closing "}" and registers the patterns in it.
template <typename E>
std::span<std::string_view>
Script<E>::read_version_script_commands(std::span<std::string_view> tok,
                                        std::string_view ver_str, u16 ver_idx,
                                        bool is_global, bool is_cpp) {
  while (!tok.empty() && tok[0] != "}") {
    if (read_label(tok, "global")) {
      is_global = true;
      continue;
    }

    if (read_label(tok, "local")) {
      is_global = false;
      continue;
    }

    if (tok[0] == "extern") {
      tok = tok.subspan(1);

      if (!tok.empty() && tok[0] == "\"C\"") {
        tok = tok.subspan(1);
        tok = skip(tok, "{");
        tok = read_version_script_commands(tok, ver_str, ver_idx, is_global, false);
      } else {
        tok = skip(tok, "\"C++\"");
        tok = skip(tok, "{");
        tok = read_version_script_commands(tok, ver_str, ver_idx, is_global, true);
      }

      tok = skip(tok, "}");
      tok = skip(tok, ";");
      continue;
    }

    if (tok[0] == "*") {
      ctx.default_version = (is_global ? ver_idx : (u32)VER_NDX_LOCAL);
    } else if (is_global) {
      ctx.version_patterns.push_back({unquote(tok[0]), mf->name, ver_str,
                                      ver_idx, is_cpp});
    } else {
      ctx.version_patterns.push_back({unquote(tok[0]), mf->name, ver_str,
                                      VER_NDX_LOCAL, is_cpp});
    }

    tok = tok.subspan(1);

    // The semicolon after the last pattern of a block may be omitted.
    if (!tok.empty() && tok[0] == "}")
      break;
    tok = skip(tok, ";");
  }
  return tok;
}

// Reads a sequence of version blocks. A named block defines a new
// version; an anonymous block applies to VER_NDX_GLOBAL.
template <typename E>
std::span<std::string_view>
Script<E>::read_version_script(std::span<std::string_view> tok) {
  u16 next_ver = VER_NDX_LAST_RESERVED + ctx.arg.version_definitions.size() + 1;

  while (!tok.empty() && tok[0] != "}") {
    std::string_view ver_str;
    u16 ver_idx;

    if (tok[0] == "{") {
      ver_str = "global";
      ver_idx = VER_NDX_GLOBAL;
    } else {
      ver_str = tok[0];
      ver_idx = next_ver++;
      ctx.arg.version_definitions.emplace_back(tok[0]);
      tok = tok.subspan(1);
    }

    tok = skip(tok, "{");
    tok = read_version_script_commands(tok, ver_str, ver_idx, true, false);
    tok = skip(tok, "}");

    // Skip the token between "}" and ";" if there is one.
    if (!tok.empty() && tok[0] != ";")
      tok = tok.subspan(1);
    tok = skip(tok, ";");
  }
  return tok;
}

// Parses the whole file as a version script.
template <typename E>
void Script<E>::parse_version_script() {
  std::call_once(once, [&] { tokenize(); });
  std::span<std::string_view> tok = tokens;
  tok = read_version_script(tok);
  if (!tok.empty())
    error(tok[0], "trailing garbage token");
}

// Reads the contents of a dynamic list block up to (but not including)
// the closing "}" and appends the patterns to `result`.
template <typename E>
std::span<std::string_view>
Script<E>::read_dynamic_list_commands(std::span<std::string_view> tok,
                                      std::vector<DynamicPattern> &result,
                                      bool is_cpp) {
  while (!tok.empty() && tok[0] != "}") {
    if (tok[0] == "extern") {
      tok = tok.subspan(1);

      if (!tok.empty() && tok[0] == "\"C\"") {
        tok = tok.subspan(1);
        tok = skip(tok, "{");
        tok = read_dynamic_list_commands(tok, result, false);
      } else {
        tok = skip(tok, "\"C++\"");
        tok = skip(tok, "{");
        tok = read_dynamic_list_commands(tok, result, true);
      }

      tok = skip(tok, "}");
      tok = skip(tok, ";");
      continue;
    }

    result.push_back({unquote(tok[0]), "", is_cpp});
    tok = skip(tok.subspan(1), ";");
  }
  return tok;
}

// Parses the whole file as a dynamic list and returns its patterns,
// each tagged with the script's file name as its source.
template <typename E>
std::vector<DynamicPattern> Script<E>::parse_dynamic_list() {
  std::call_once(once, [&] { tokenize(); });
  std::span<std::string_view> tok = tokens;
  std::vector<DynamicPattern> result;

  tok = skip(tok, "{");
  tok = read_dynamic_list_commands(tok, result, false);
  tok = skip(tok, "}");
  tok = skip(tok, ";");

  if (!tok.empty())
    error(tok[0], "trailing garbage token");

  for (DynamicPattern &p : result)
    p.source = mf->name;
  return result;
}

// Opens `path` and parses it as a dynamic list.
template <typename E>
std::vector<DynamicPattern>
parse_dynamic_list(Context<E> &ctx, std::string_view path) {
  ReaderContext rctx;
  MappedFile *mf = must_open_file(ctx, std::string(path));
  return Script(ctx, rctx, mf).parse_dynamic_list();
}

using E = MOLD_TARGET;

template class Script<E>;

template
std::vector<DynamicPattern> parse_dynamic_list(Context<E> &, std::string_view);

} // namespace mold

================================================
FILE: src/lto-unix.cc
================================================
// This file handles the linker plugin to support LTO (Link-Time
// Optimization).
//
// LTO is a technique to do whole-program optimization to a program. Since
// a linker sees the whole program as opposed to a single compilation
// unit, it in theory can do some optimizations that cannot be done in the
// usual separate compilation model. For example, LTO should be able to
// inline functions that are defined in other compilation unit.
//
// In GCC and Clang, all you have to do to enable LTO is adding the
// `-flto` flag to the compiler and the linker command lines. If `-flto`
// is given, the compiler generates a file that contains not machine code
// but the compiler's IR (intermediate representation). In GCC, the output
// is an ELF file which wraps GCC's IR. In LLVM, it's not even an ELF file
// but just a raw LLVM IR file.
//
// Here is what we have to do if at least one input file is not a usual
// ELF file but an IR object file:
//
//  1. Read symbols both from usual ELF files and from IR object files and
//     resolve symbols as usual.
//
//  2. Pass all IR objects to the compiler backend. The compiler backend
//     compiles the IRs and returns a few big ELF object files as a
//     result.
//
//  3. Parse the returned ELF files and overwrite IR object symbols with
//     the returned ones, discarding IR object files.
//
//  4.
Continue the rest of the linking process as usual. // // When gcc or clang invokes ld, they pass `-plugin /path/to/linker-plugin.so` // to the linker. The given .so file provides a way to call the compiler // backend. // // The linker plugin API is documented at // https://gcc.gnu.org/wiki/whopr/driver, though the document is a bit // outdated. // // Frankly, the linker plugin API is peculiar and is not very easy to use. // For some reason, the API functions don't return the result of a // function call as a return value but instead call another function with // the result as its argument to "return" the result. // // For example, the first thing you need to do after dlopen()'ing a linker // plugin .so is to call `onload` function with a list of callback // functions. `onload` calls callbacks to notify about the pointers to // other functions the linker plugin provides. I don't know why `onload` // can't just return a list of functions or why the linker plugin can't // define not only `onload` but other functions, but that's what it is. // // Here are the steps to use the linker plugin: // // 1. dlopen() the linker plugin .so and call `onload` to obtain pointers // to other functions provided by the plugin. // // 2. Call `claim_file_hook` with an IR object file to read its symbol // table. `claim_file_hook` calls the `add_symbols` callback to // "return" a list of symbols. // // 3. `claim_file_hook` returns LDPS_OK only when the plugin wants to // handle a given file. Since we pass only IR object files to the // plugin in mold, it always returns LDPS_OK in our case. // // 4. Once we made a decision as to which object file to include into the // output file, we call `all_symbols_read_hook` to compile IR objects // into a few big ELF files. That function calls the `get_symbols` // callback to ask us about the symbol resolution results.
(The // compiler backend needs to know whether an undefined symbol in an IR // object was resolved to a regular object file or a shared object to // do whole program optimization, for example.) // // 5. `all_symbols_read_hook` "returns" the result by calling the // `add_input_file` callback. The callback is called with a path to an // LTO'ed ELF file. We parse that ELF file and override symbols // defined by IR objects with the ELF file's ones. // // 6. Lastly, we call `cleanup_hook` to remove temporary files created by // the compiler backend. #include "mold.h" #include "lto.h" #include #include #include #include #include #include #include #if 0 # define LOG std::cerr #else # define LOG std::ostringstream() #endif namespace mold { // Global variables // We store LTO-related information to global variables, // as the LTO plugin is not thread-safe by design anyway. template static Context *gctx; template static std::vector *> lto_objects; static int phase = 0; static std::vector plugin_symbols; static ClaimFileHandler *claim_file_hook; static AllSymbolsReadHandler *all_symbols_read_hook; static CleanupHandler *cleanup_hook; static bool is_gcc_linker_api_v1 = false; // Event handlers template static PluginStatus message(PluginLevel level, const char *fmt, ...) 
{ LOG << "message\n"; Context &ctx = *gctx; char buf[1000]; va_list ap; va_start(ap, fmt); vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); switch (level) { case LDPL_INFO: Out(ctx) << buf; break; case LDPL_WARNING: Warn(ctx) << buf; break; case LDPL_ERROR: case LDPL_FATAL: Fatal(ctx) << buf; } return LDPS_OK; } template static PluginStatus register_claim_file_hook(ClaimFileHandler fn) { LOG << "register_claim_file_hook\n"; claim_file_hook = fn; return LDPS_OK; } template static PluginStatus register_all_symbols_read_hook(AllSymbolsReadHandler fn) { LOG << "register_all_symbols_read_hook\n"; all_symbols_read_hook = fn; return LDPS_OK; } template static PluginStatus register_cleanup_hook(CleanupHandler fn) { LOG << "register_cleanup_hook\n"; cleanup_hook = fn; return LDPS_OK; } static PluginStatus add_symbols(void *handle, int nsyms, const PluginSymbol *psyms) { LOG << "add_symbols: " << nsyms << "\n"; assert(phase == 1); plugin_symbols = {psyms, psyms + nsyms}; return LDPS_OK; } template static PluginStatus add_input_file(const char *path) { LOG << "add_input_file: " << path << "\n"; Context &ctx = *gctx; static i64 file_priority = 100; MappedFile *mf = must_open_file(ctx, path); mf->is_dependency = false; ObjectFile *file = new ObjectFile(ctx, mf, ""); ctx.obj_pool.emplace_back(file); lto_objects.push_back(file); file->priority = file_priority++; file->is_reachable = true; file->parse(ctx); file->resolve_symbols(ctx); return LDPS_OK; } static PluginStatus get_input_file(const void *handle, struct PluginInputFile *file) { LOG << "get_input_file\n"; return LDPS_OK; } template static PluginStatus release_input_file(const void *handle) { LOG << "release_input_file\n"; ObjectFile &file = *(ObjectFile *)handle; file.mf->close_fd(); return LDPS_OK; } static PluginStatus add_input_library(const char *path) { LOG << "add_input_library\n"; return LDPS_OK; } static PluginStatus set_extra_library_path(const char *path) { LOG << "set_extra_library_path\n"; return LDPS_OK; } 
template static PluginStatus get_view(const void *handle, const void **view) { LOG << "get_view\n"; ObjectFile &file = *(ObjectFile *)handle; *view = (void *)file.mf->data; return LDPS_OK; } static PluginStatus get_input_section_count(const void *handle, int *count) { LOG << "get_input_section_count\n"; return LDPS_OK; } static PluginStatus get_input_section_type(const PluginSection section, int *type) { LOG << "get_input_section_type\n"; return LDPS_OK; } static PluginStatus get_input_section_name(const PluginSection section, char **section_name) { LOG << "get_input_section_name\n"; return LDPS_OK; } static PluginStatus get_input_section_contents(const PluginSection section, const char **section_contents, size_t *len) { LOG << "get_input_section_contents\n"; return LDPS_OK; } static PluginStatus update_section_order(const PluginSection *section_list, int num_sections) { LOG << "update_section_order\n"; return LDPS_OK; } static PluginStatus allow_section_ordering() { LOG << "allow_section_ordering\n"; return LDPS_OK; } static PluginStatus get_symbols_v1(const void *handle, int nsyms, PluginSymbol *psyms) { unreachable(); } // get_symbols teaches the LTO plugin as to how we have resolved symbols. // The plugin uses the symbol resolution info to optimize the program. // // For example, if a definition in an IR file is not referenced by // non-IR objects at all, the plugin may choose to completely inline // that definition within the IR objects and remove the symbol from the // LTO result. On the other hand, if a definition is referenced by a // non-IR object, it has to keep the symbol in the LTO result. template static PluginStatus get_symbols(const void *handle, int nsyms, PluginSymbol *psyms, bool is_v2) { ObjectFile &file = *(ObjectFile *)handle; assert(file.is_lto_obj); // If file is an archive member which was not chose to be included in // to the final result, we need to make the plugin to ignore all // symbols. 
if (!file.is_reachable) { assert(!is_v2); for (int i = 0; i < nsyms; i++) psyms[i].resolution = LDPR_PREEMPTED_REG; return LDPS_NO_SYMS; } auto get_resolution = [&](ElfSym &esym, Symbol &sym) { if (!sym.file) return LDPR_UNDEF; if (sym.file == &file) { if (sym.referenced_by_regular_obj) return LDPR_PREVAILING_DEF; if (sym.is_exported) return is_v2 ? LDPR_PREVAILING_DEF : LDPR_PREVAILING_DEF_IRONLY_EXP; return LDPR_PREVAILING_DEF_IRONLY; } if (sym.file->is_dso) return LDPR_RESOLVED_DYN; if (((ObjectFile *)sym.file)->is_lto_obj && !sym.is_wrapped) return esym.is_undef() ? LDPR_RESOLVED_IR : LDPR_PREEMPTED_IR; return esym.is_undef() ? LDPR_RESOLVED_EXEC : LDPR_PREEMPTED_REG; }; // Set the symbol resolution results to psyms. for (i64 i = 0; i < nsyms; i++) { ElfSym &esym = file.elf_syms[i + 1]; Symbol &sym = *file.symbols[i + 1]; psyms[i].resolution = get_resolution(esym, sym); } return LDPS_OK; } // This function restarts mold itself with `--:lto-pass2` and // `--:ignore-ir-file` flags. We do this as a workaround for the old // linker plugins that do not support the get_symbols_v3 API. // // get_symbols_v1 and get_symbols_v2 don't provide a way to ignore an // object file we previously passed to the linker plugin. So we can't // "unload" object files in archives that we ended up not choosing to // include into the final output. // // As a workaround, we restart the linker with a list of object files // the linker has to ignore, so that it won't read the object files // from archives next time. // // This is an ugly hack and should be removed once GCC adopts the v3 API. 
template static void restart_process(Context &ctx) { std::vector args; for (std::string_view arg : ctx.cmdline_args) args.push_back(strdup(std::string(arg).c_str())); for (std::unique_ptr> &file : ctx.obj_pool) if (file->is_lto_obj && !file->is_reachable) args.push_back(strdup(("--:ignore-ir-file=" + file->mf->get_identifier()).c_str())); args.push_back("--:lto-pass2"); args.push_back(nullptr); std::cout << std::flush; std::cerr << std::flush; std::string self = get_self_path(); execv(self.c_str(), (char * const *)args.data()); std::cerr << "execv failed: " << errno_string() << "\n"; _exit(1); } template static PluginStatus get_symbols_v2(const void *handle, int nsyms, PluginSymbol *psyms) { LOG << "get_symbols_v2\n"; return get_symbols(handle, nsyms, psyms, true); } template static PluginStatus get_symbols_v3(const void *handle, int nsyms, PluginSymbol *psyms) { LOG << "get_symbols_v3\n"; return get_symbols(handle, nsyms, psyms, false); } static PluginStatus allow_unique_segment_for_sections() { LOG << "allow_unique_segment_for_sections\n"; return LDPS_OK; } static PluginStatus unique_segment_for_sections(const char *segment_name, uint64_t flags, uint64_t align, const PluginSection *section_list, int num_sections) { LOG << "unique_segment_for_sections\n"; return LDPS_OK; } static PluginStatus get_input_section_alignment(const PluginSection section, int *addralign) { LOG << "get_input_section_alignment\n"; return LDPS_OK; } static PluginStatus get_input_section_size(const PluginSection section, uint64_t *size) { LOG << "get_input_section_size\n"; return LDPS_OK; } template static PluginStatus register_new_input_hook(NewInputHandler fn) { LOG << "register_new_input_hook\n"; return LDPS_OK; } static PluginStatus get_wrap_symbols(uint64_t *num_symbols, const char ***wrap_symbols) { LOG << "get_wrap_symbols\n"; return LDPS_OK; } template static PluginLinkerAPIVersion get_api_version(const char *plugin_identifier, unsigned plugin_version, int minimal_api_supported, int 
maximal_api_supported, const char **linker_identifier, const char **linker_version) { if (LAPI_V1 < minimal_api_supported) Fatal(*gctx) << "LTO plugin does not support V0 or V1 API"; std::string version = mold_version + "\0"s; *linker_identifier = "mold"; *linker_version = version.data(); if (LAPI_V1 <= maximal_api_supported) { is_gcc_linker_api_v1 = true; return LAPI_V1; } return LAPI_V0; } // dlopen the linker plugin file template static void load_lto_plugin(Context &ctx) { static std::once_flag flag; std::call_once(flag, [&] { assert(phase == 0); phase = 1; gctx = &ctx; void *handle = dlopen(ctx.arg.plugin.c_str(), RTLD_NOW | RTLD_LOCAL); if (!handle) Fatal(ctx) << "could not open plugin file: " << dlerror(); OnloadFn *onload = (OnloadFn *)dlsym(handle, "onload"); if (!onload) Fatal(ctx) << "failed to load plugin " << ctx.arg.plugin << ": " << dlerror(); auto save = [&](std::string_view str) { return save_string(ctx, std::string(str).c_str()).data(); }; std::vector tv; tv.emplace_back(LDPT_MESSAGE, message); if (ctx.arg.shared) tv.emplace_back(LDPT_LINKER_OUTPUT, LDPO_DYN); else if (ctx.arg.pie) tv.emplace_back(LDPT_LINKER_OUTPUT, LDPO_PIE); else tv.emplace_back(LDPT_LINKER_OUTPUT, LDPO_EXEC); for (std::string_view opt : ctx.arg.plugin_opt) tv.emplace_back(LDPT_OPTION, save(opt)); tv.emplace_back(LDPT_REGISTER_CLAIM_FILE_HOOK, register_claim_file_hook); tv.emplace_back(LDPT_REGISTER_ALL_SYMBOLS_READ_HOOK, register_all_symbols_read_hook); tv.emplace_back(LDPT_REGISTER_CLEANUP_HOOK, register_cleanup_hook); tv.emplace_back(LDPT_ADD_SYMBOLS, add_symbols); tv.emplace_back(LDPT_GET_SYMBOLS, get_symbols_v1); tv.emplace_back(LDPT_ADD_INPUT_FILE, add_input_file); tv.emplace_back(LDPT_GET_INPUT_FILE, get_input_file); tv.emplace_back(LDPT_RELEASE_INPUT_FILE, release_input_file); tv.emplace_back(LDPT_ADD_INPUT_LIBRARY, add_input_library); tv.emplace_back(LDPT_OUTPUT_NAME, save(ctx.arg.output)); tv.emplace_back(LDPT_SET_EXTRA_LIBRARY_PATH, set_extra_library_path); 
tv.emplace_back(LDPT_GET_VIEW, get_view); tv.emplace_back(LDPT_GET_INPUT_SECTION_COUNT, get_input_section_count); tv.emplace_back(LDPT_GET_INPUT_SECTION_TYPE, get_input_section_type); tv.emplace_back(LDPT_GET_INPUT_SECTION_NAME, get_input_section_name); tv.emplace_back(LDPT_GET_INPUT_SECTION_CONTENTS, get_input_section_contents); tv.emplace_back(LDPT_UPDATE_SECTION_ORDER, update_section_order); tv.emplace_back(LDPT_ALLOW_SECTION_ORDERING, allow_section_ordering); tv.emplace_back(LDPT_ADD_SYMBOLS_V2, add_symbols); tv.emplace_back(LDPT_GET_SYMBOLS_V2, get_symbols_v2); tv.emplace_back(LDPT_ALLOW_UNIQUE_SEGMENT_FOR_SECTIONS, allow_unique_segment_for_sections); tv.emplace_back(LDPT_UNIQUE_SEGMENT_FOR_SECTIONS, unique_segment_for_sections); tv.emplace_back(LDPT_GET_SYMBOLS_V3, get_symbols_v3); tv.emplace_back(LDPT_GET_INPUT_SECTION_ALIGNMENT, get_input_section_alignment); tv.emplace_back(LDPT_GET_INPUT_SECTION_SIZE, get_input_section_size); tv.emplace_back(LDPT_REGISTER_NEW_INPUT_HOOK, register_new_input_hook); tv.emplace_back(LDPT_GET_WRAP_SYMBOLS, get_wrap_symbols); tv.emplace_back(LDPT_GET_API_VERSION, get_api_version); tv.emplace_back(LDPT_NULL, 0); [[maybe_unused]] PluginStatus status = onload(tv.data()); assert(status == LDPS_OK); }); } template static ElfSym to_elf_sym(PluginSymbol &psym) { ElfSym esym; memset(&esym, 0, sizeof(esym)); switch (psym.def) { case LDPK_DEF: esym.st_shndx = SHN_ABS; break; case LDPK_WEAKDEF: esym.st_shndx = SHN_ABS; esym.st_bind = STB_WEAK; break; case LDPK_UNDEF: esym.st_shndx = SHN_UNDEF; break; case LDPK_WEAKUNDEF: esym.st_shndx = SHN_UNDEF; esym.st_bind = STB_WEAK; break; case LDPK_COMMON: esym.st_shndx = SHN_COMMON; break; } switch (psym.symbol_type) { case LDST_FUNCTION: esym.st_type = STT_FUNC; break; case LDST_VARIABLE: esym.st_type = STT_OBJECT; break; }; switch (psym.visibility) { case LDPV_PROTECTED: esym.st_visibility = STV_PROTECTED; break; case LDPV_INTERNAL: esym.st_visibility = STV_INTERNAL; break; case LDPV_HIDDEN: 
esym.st_visibility = STV_HIDDEN; break; } esym.st_size = psym.size; return esym; } // Returns true if a given linker plugin looks like LLVM's one. // Returns false if it's GCC. template static bool is_llvm(Context &ctx) { return ctx.arg.plugin.find("LLVMgold.") != ctx.arg.plugin.npos; } // Returns true if a given linker plugin supports the get_symbols_v3 API. // Any version of LLVM and GCC 12 or newer support it. template static bool supports_v3_api(Context &ctx) { return is_gcc_linker_api_v1 || is_llvm(ctx); } template static PluginInputFile create_plugin_input_file(Context &ctx, MappedFile *mf) { PluginInputFile file; MappedFile *mf2 = mf->parent ? mf->parent : mf; file.name = save_string(ctx, mf2->name).data(); file.offset = mf->get_offset(); file.filesize = mf->size; mf2->reopen_fd(file.name); file.fd = mf2->fd; if (!file.fd) Fatal(ctx) << "cannot open " << file.name << ": " << errno_string(); return file; } template ObjectFile *read_lto_object(Context &ctx, MappedFile *mf) { if (ctx.arg.plugin.empty()) Fatal(ctx) << mf->name << ": unable to handle this LTO object file because " << "the -plugin option was not provided. Please make sure you " << "added -flto not only when creating object files but also " << "when linking the final executable."; load_lto_plugin(ctx); // V0 API's claim_file is not thread-safe. static std::mutex mu; std::unique_lock lock(mu, std::defer_lock); if (!is_gcc_linker_api_v1) lock.lock(); // Create mold's object instance ObjectFile *obj = new ObjectFile; ctx.obj_pool.emplace_back(obj); obj->filename = mf->name; obj->symbols.push_back(new Symbol); obj->first_global = 1; obj->is_lto_obj = true; obj->mf = mf; obj->archive_name = mf->parent ? 
mf->parent->name : ""; // Create plugin's object instance PluginInputFile file = create_plugin_input_file(ctx, mf); file.handle = (void *)obj; LOG << "read_lto_symbols: "<< mf->name << "\n"; // claim_file_hook() calls add_symbols() which initializes `plugin_symbols` int claimed = false; claim_file_hook(&file, &claimed); if (!claimed) Fatal(ctx) << mf->name << ": not claimed by the LTO plugin;" << " please make sure you are using the same compiler of the" << " same version for all object files"; if (mf->parent) mf->parent->close_fd(); else mf->close_fd(); // Create a symbol strtab i64 strtab_size = 1; for (PluginSymbol &psym : plugin_symbols) strtab_size += strlen(psym.name) + 1; std::string strtab(strtab_size, '\0'); // Initialize esyms obj->lto_elf_syms.resize(plugin_symbols.size() + 1); obj->lto_comdat_groups.resize(plugin_symbols.size() + 1); i64 strtab_offset = 1; for (i64 i = 0; i < plugin_symbols.size(); i++) { PluginSymbol &psym = plugin_symbols[i]; obj->lto_elf_syms[i + 1] = to_elf_sym(psym); obj->lto_elf_syms[i + 1].st_name = strtab_offset; i64 len = strlen(psym.name); memcpy(strtab.data() + strtab_offset, psym.name, len); strtab_offset += len + 1; // comdat_key is non-null if the symbol is defined in a comdat member // section. We handle such symbols differently than comdat symbols in // a regular file because, unlike regular object files, IR files don't // have input sections. if (psym.comdat_key) { std::string_view key = save_string(ctx, psym.comdat_key); obj->lto_comdat_groups[i + 1] = insert_comdat_group(ctx, key); } } obj->symbol_strtab = save_string(ctx, strtab); obj->elf_syms = obj->lto_elf_syms; obj->initialize_symbols(ctx); plugin_symbols.clear(); return obj; } // Entry point template std::vector *> run_lto_plugin(Context &ctx) { Timer t(ctx, "run_lto_plugin"); load_lto_plugin(ctx); if (!ctx.arg.lto_pass2 && !supports_v3_api(ctx)) restart_process(ctx); assert(phase == 1); phase = 2; // Set `referenced_by_regular_obj` bit. 
tbb::parallel_for_each(ctx.objs, [](ObjectFile *file) { if (!file->is_lto_obj) { for (Symbol *sym : file->get_global_syms()) { if (sym->file && !sym->file->is_dso && ((ObjectFile *)sym->file)->is_lto_obj) { std::scoped_lock lock(sym->mu); sym->referenced_by_regular_obj = true; } } } }); // Symbols specified by the --wrap option needs to be visible from // regular object files. for (std::string_view name : ctx.arg.wrap) { get_symbol(ctx, name)->referenced_by_regular_obj = true; std::string_view x = save_string(ctx, "__wrap_" + std::string(name)); std::string_view y = save_string(ctx, "__real_" + std::string(name)); get_symbol(ctx, x)->referenced_by_regular_obj = true; get_symbol(ctx, y)->referenced_by_regular_obj = true; } // Keep some symbols for (Symbol *sym : ctx.arg.undefined) sym->referenced_by_regular_obj = true; // Object files containing .gnu.offload_lto_.* sections need to be // given to the LTO backend. Such sections contains code and data for // peripherails (typically GPUs). for (ObjectFile *file : ctx.objs) { if (file->is_reachable && !file->is_lto_obj && file->is_gcc_offload_obj) { PluginInputFile pfile = create_plugin_input_file(ctx, file->mf); int claimed = false; claim_file_hook(&pfile, &claimed); assert(!claimed); } } // all_symbols_read_hook() calls add_input_file() and add_input_library() LOG << "all symbols read\n"; if (PluginStatus st = all_symbols_read_hook(); st != LDPS_OK) Fatal(ctx) << "LTO: all_symbols_read_hook returns " << st; return lto_objects; } template void lto_cleanup(Context &ctx) { Timer t(ctx, "lto_cleanup"); if (cleanup_hook) cleanup_hook(); } using E = MOLD_TARGET; template ObjectFile *read_lto_object(Context &, MappedFile *); template std::vector *> run_lto_plugin(Context &); template void lto_cleanup(Context &); } // namespace mold ================================================ FILE: src/lto-win32.cc ================================================ #include "mold.h" #include "lto.h" namespace mold { template ObjectFile 
*read_lto_object(Context &ctx, MappedFile *mf) { Fatal(ctx) << "LTO is not supported on Windows"; } template std::vector *> run_lto_plugin(Context &ctx) { return {}; } template void lto_cleanup(Context &ctx) {} using E = MOLD_TARGET; template ObjectFile *read_lto_object(Context &, MappedFile *); template std::vector *> run_lto_plugin(Context &); template void lto_cleanup(Context &); } // namespace mold ================================================ FILE: src/lto.h ================================================ #pragma once #include "../lib/integers.h" namespace mold { enum PluginStatus { LDPS_OK, LDPS_NO_SYMS, LDPS_BAD_HANDLE, LDPS_ERR, }; enum PluginTag { LDPT_NULL, LDPT_API_VERSION, LDPT_GOLD_VERSION, LDPT_LINKER_OUTPUT, LDPT_OPTION, LDPT_REGISTER_CLAIM_FILE_HOOK, LDPT_REGISTER_ALL_SYMBOLS_READ_HOOK, LDPT_REGISTER_CLEANUP_HOOK, LDPT_ADD_SYMBOLS, LDPT_GET_SYMBOLS, LDPT_ADD_INPUT_FILE, LDPT_MESSAGE, LDPT_GET_INPUT_FILE, LDPT_RELEASE_INPUT_FILE, LDPT_ADD_INPUT_LIBRARY, LDPT_OUTPUT_NAME, LDPT_SET_EXTRA_LIBRARY_PATH, LDPT_GNU_LD_VERSION, LDPT_GET_VIEW, LDPT_GET_INPUT_SECTION_COUNT, LDPT_GET_INPUT_SECTION_TYPE, LDPT_GET_INPUT_SECTION_NAME, LDPT_GET_INPUT_SECTION_CONTENTS, LDPT_UPDATE_SECTION_ORDER, LDPT_ALLOW_SECTION_ORDERING, LDPT_GET_SYMBOLS_V2, LDPT_ALLOW_UNIQUE_SEGMENT_FOR_SECTIONS, LDPT_UNIQUE_SEGMENT_FOR_SECTIONS, LDPT_GET_SYMBOLS_V3, LDPT_GET_INPUT_SECTION_ALIGNMENT, LDPT_GET_INPUT_SECTION_SIZE, LDPT_REGISTER_NEW_INPUT_HOOK, LDPT_GET_WRAP_SYMBOLS, LDPT_ADD_SYMBOLS_V2, LDPT_GET_API_VERSION, }; enum PluginApiVersion { LD_PLUGIN_API_VERSION = 1, }; struct PluginTagValue { PluginTagValue(PluginTag tag, int val) : tag(tag), val(val) {} PluginTagValue(PluginTag tag, auto *ptr) : tag(tag), ptr((void *)ptr) {} PluginTag tag; union { int val; void *ptr; }; }; enum PluginOutputFileType { LDPO_REL, LDPO_EXEC, LDPO_DYN, LDPO_PIE, }; struct PluginInputFile { const char *name; #if __MINGW32__ HANDLE fd; #else int fd; #endif u64 offset; u64 filesize; void *handle; }; 
struct PluginSection { const void *handle; u32 shndx; }; struct PluginSymbol { char *name; char *version; #ifdef __LITTLE_ENDIAN__ u8 def; u8 symbol_type; u8 section_kind; u8 padding; #else u8 padding; u8 section_kind; u8 symbol_type; u8 def; #endif i32 visibility; u64 size; char *comdat_key; i32 resolution; }; enum PluginSymbolKind { LDPK_DEF, LDPK_WEAKDEF, LDPK_UNDEF, LDPK_WEAKUNDEF, LDPK_COMMON, }; enum PluginSymbolVisibility { LDPV_DEFAULT, LDPV_PROTECTED, LDPV_INTERNAL, LDPV_HIDDEN, }; enum PluginSymbolType { LDST_UNKNOWN, LDST_FUNCTION, LDST_VARIABLE, }; enum PluginSymbolSectionKind { LDSSK_DEFAULT, LDSSK_BSS, }; enum PluginSymbolResolution { LDPR_UNKNOWN, LDPR_UNDEF, LDPR_PREVAILING_DEF, LDPR_PREVAILING_DEF_IRONLY, LDPR_PREEMPTED_REG, LDPR_PREEMPTED_IR, LDPR_RESOLVED_IR, LDPR_RESOLVED_EXEC, LDPR_RESOLVED_DYN, LDPR_PREVAILING_DEF_IRONLY_EXP, }; enum PluginLevel { LDPL_INFO, LDPL_WARNING, LDPL_ERROR, LDPL_FATAL, }; enum PluginLinkerAPIVersion { LAPI_V0, LAPI_V1, }; typedef PluginStatus OnloadFn(PluginTagValue *tv); typedef PluginStatus ClaimFileHandler(const PluginInputFile *, int *); typedef PluginStatus AllSymbolsReadHandler(); typedef PluginStatus CleanupHandler(); typedef PluginStatus NewInputHandler(const PluginInputFile *); } // namespace mold ================================================ FILE: src/main.cc ================================================ #include "mold.h" #include "config.h" #include #include #include #include #include namespace mold { template static void check_file_compatibility(Context &ctx, ReaderContext &rctx, MappedFile *mf) { std::string_view target = get_machine_type(ctx, rctx, mf); if (target != ctx.arg.emulation) Fatal(ctx) << mf->name << ": incompatible file type: " << ctx.arg.emulation << " is expected but got " << target; } template static ObjectFile *new_object_file(Context &ctx, ReaderContext &rctx, MappedFile *mf, std::string archive_name) { static Counter count("parsed_objs"); count++; check_file_compatibility(ctx, 
rctx, mf); ObjectFile *file = new ObjectFile(ctx, mf, archive_name); ctx.obj_pool.emplace_back(file); file->priority = ctx.file_priority++; file->as_needed = rctx.in_lib || (!archive_name.empty() && !rctx.whole_archive); rctx.tg->run([file, &ctx] { file->parse(ctx); }); if (ctx.arg.trace) Out(ctx) << "trace: " << *file; return file; } template static ObjectFile *new_lto_obj(Context &ctx, ReaderContext &rctx, MappedFile *mf, std::string archive_name) { static Counter count("parsed_lto_objs"); count++; if (ctx.arg.ignore_ir_file.count(mf->get_identifier())) return nullptr; ObjectFile *file = read_lto_object(ctx, mf); file->priority = ctx.file_priority++; file->archive_name = archive_name; file->as_needed = rctx.in_lib || (!archive_name.empty() && !rctx.whole_archive); if (ctx.arg.trace) Out(ctx) << "trace: " << *file; return file; } template static SharedFile * new_shared_file(Context &ctx, ReaderContext &rctx, MappedFile *mf) { check_file_compatibility(ctx, rctx, mf); SharedFile *file = new SharedFile(ctx, mf); ctx.dso_pool.emplace_back(file); file->priority = ctx.file_priority++; file->as_needed = rctx.as_needed; rctx.tg->run([file, &ctx] { file->parse(ctx); }); if (ctx.arg.trace) Out(ctx) << "trace: " << *file; return file; } template void read_file(Context &ctx, ReaderContext &rctx, MappedFile *mf) { switch (get_file_type(ctx, mf)) { case FileType::ELF_OBJ: ctx.objs.push_back(new_object_file(ctx, rctx, mf, "")); return; case FileType::ELF_DSO: ctx.dsos.push_back(new_shared_file(ctx, rctx, mf)); return; case FileType::AR: case FileType::THIN_AR: for (MappedFile *child : read_archive_members(ctx, mf)) { switch (get_file_type(ctx, child)) { case FileType::ELF_OBJ: ctx.objs.push_back(new_object_file(ctx, rctx, child, mf->name)); break; case FileType::GCC_LTO_OBJ: case FileType::LLVM_BITCODE: if (ObjectFile *file = new_lto_obj(ctx, rctx, child, mf->name)) ctx.objs.push_back(file); break; case FileType::ELF_DSO: Warn(ctx) << mf->name << "(" << child->name << "): shared 
object file in an archive is ignored"; break; default: break; } } return; case FileType::TEXT: Script(ctx, rctx, mf).parse_linker_script(); return; case FileType::GCC_LTO_OBJ: case FileType::LLVM_BITCODE: if (ObjectFile *file = new_lto_obj(ctx, rctx, mf, "")) ctx.objs.push_back(file); return; default: Fatal(ctx) << mf->name << ": unknown file type"; } } template static std::string_view detect_machine_type(Context &ctx, std::vector args) { for (ReaderContext rctx; const std::string &arg : args) { if (arg == "--Bstatic") { rctx.static_ = true; } else if (arg == "--Bdynamic") { rctx.static_ = false; } else if (!arg.starts_with('-')) { if (MappedFile *mf = open_file(ctx, arg)) if (get_file_type(ctx, mf) != FileType::TEXT) if (std::string_view target = get_machine_type(ctx, rctx, mf); !target.empty()) return target; } } for (ReaderContext rctx; const std::string &arg : args) { if (arg == "--Bstatic") { rctx.static_ = true; } else if (arg == "--Bdynamic") { rctx.static_ = false; } else if (!arg.starts_with('-')) { if (MappedFile *mf = open_file(ctx, arg)) if (get_file_type(ctx, mf) == FileType::TEXT) if (std::string_view target = Script(ctx, rctx, mf).get_script_output_type(); !target.empty()) return target; } } Fatal(ctx) << "-m option is missing"; } template MappedFile *open_library(Context &ctx, ReaderContext &rctx, std::string path) { MappedFile *mf = open_file(ctx, path); if (!mf) return nullptr; std::string_view target = get_machine_type(ctx, rctx, mf); if (!target.empty() && target != E::name) { Warn(ctx) << path << ": skipping incompatible file: " << target << " (e_machine " << (int)E::e_machine << ")"; return nullptr; } return mf; } template MappedFile *find_library(Context &ctx, ReaderContext &rctx, std::string name) { if (name.starts_with(':')) { for (std::string_view dir : ctx.arg.library_paths) { std::string path = std::string(dir) + "/" + name.substr(1); if (MappedFile *mf = open_library(ctx, rctx, path)) return mf; } Fatal(ctx) << "library not found: " << 
name; } for (std::string_view dir : ctx.arg.library_paths) { std::string stem = std::string(dir) + "/lib" + name; if (!rctx.static_) if (MappedFile *mf = open_library(ctx, rctx, stem + ".so")) return mf; if (MappedFile *mf = open_library(ctx, rctx, stem + ".a")) return mf; } Fatal(ctx) << "library not found: " << name; } template static void read_input_files(Context &ctx, std::span args) { Timer t(ctx, "read_input_files"); ReaderContext rctx; std::vector stack; std::unordered_set visited; tbb::task_group tg; rctx.tg = &tg; while (!args.empty()) { std::string_view arg = args[0]; args = args.subspan(1); if (arg == "--as-needed") { rctx.as_needed = true; } else if (arg == "--no-as-needed") { rctx.as_needed = false; } else if (arg == "--whole-archive") { rctx.whole_archive = true; } else if (arg == "--no-whole-archive") { rctx.whole_archive = false; } else if (arg == "--Bstatic") { rctx.static_ = true; } else if (arg == "--Bdynamic") { rctx.static_ = false; } else if (arg == "--start-lib") { rctx.in_lib = true; } else if (arg == "--end-lib") { rctx.in_lib = false; } else if (arg == "--push-state") { stack.push_back(rctx); } else if (arg == "--pop-state") { if (stack.empty()) Fatal(ctx) << "no state pushed before popping"; rctx = stack.back(); stack.pop_back(); } else if (arg.starts_with("-l")) { arg = arg.substr(2); if (visited.contains(arg)) continue; visited.insert(arg); MappedFile *mf = find_library(ctx, rctx, std::string(arg)); mf->given_fullpath = false; read_file(ctx, rctx, mf); } else { read_file(ctx, rctx, must_open_file(ctx, std::string(arg))); } } if (ctx.objs.empty() && ctx.dsos.empty()) Fatal(ctx) << "no input files"; if (rctx.static_ || ctx.arg.relocatable) { ctx.arg.static_ = true; ctx.arg.dynamic_linker = ""; } tg.wait(); } template static bool has_lto_obj(Context &ctx) { for (ObjectFile *file : ctx.objs) if (file->is_reachable && (file->is_lto_obj || file->is_gcc_offload_obj)) return true; return false; } template static i64 get_thread_count(Context 
&ctx) {
  // An explicitly configured thread count always wins.
  if (ctx.arg.thread_count.has_value())
    return *ctx.arg.thread_count;

  // mold doesn't scale well with too many threads, so limit it to 32.
  int n = tbb::global_control::active_value(
    tbb::global_control::max_allowed_parallelism);
  return std::min(n, 32);
}

// The linker's main driver. It parses the command line, reads the input
// files and then runs the linker passes one after another in the order
// written below. Returns 0 on the normal paths, or whatever redo_main()
// returns when the deduced emulation does not match this instantiation.
template
int mold_main(int argc, char **argv) {
  Context ctx;

  // Process -run option first. process_run_subcommand() does not return.
  if (argc >= 2 && (argv[1] == "-run"sv || argv[1] == "--run"sv))
    process_run_subcommand(ctx, argc, argv);

  // Parse non-positional command line options
  ctx.cmdline_args = expand_response_files(ctx, argv);
  std::vector file_args = parse_nonpositional_args(ctx);

  // If no -m option is given, deduce it from input files.
  if (ctx.arg.emulation.empty())
    ctx.arg.emulation = detect_machine_type(ctx, file_args);

  // Redo if -m does not match with our speculation.
  if (ctx.arg.emulation != E::name)
    return redo_main(ctx.arg.emulation, argc, argv);

  Timer t_all(ctx, "all");

  install_signal_handler();

  // Fork a subprocess unless --no-fork is given.
  if (ctx.arg.fork)
    fork_child();

  acquire_global_lock();

  // Cap TBB's parallelism at the chosen thread count.
  ctx.global_limit.emplace(tbb::global_control::max_allowed_parallelism,
                           get_thread_count(ctx));

  // Handle --wrap options if any.
  for (std::string_view name : ctx.arg.wrap)
    get_symbol(ctx, name)->is_wrapped = true;

  // Handle --retain-symbols-file options if any.
  if (ctx.arg.retain_symbols_file)
    for (Symbol *sym : *ctx.arg.retain_symbols_file)
      sym->write_to_symtab = true;

  // Mark the symbols listed in ctx.arg.trace_symbol as traced.
  for (std::string_view arg : ctx.arg.trace_symbol)
    get_symbol(ctx, arg)->is_traced = true;

  // Parse input files
  read_input_files(ctx, file_args);

  // Uniquify shared object files by soname
  {
    std::unordered_set seen;
    std::erase_if(ctx.dsos, [&](SharedFile *file) {
      return !seen.insert(file->soname).second;
    });
  }

  // Handle -repro
  if (ctx.arg.repro)
    write_repro_file(ctx);

  Timer t_before_copy(ctx, "before_copy");

  // Apply -exclude-libs
  apply_exclude_libs(ctx);

  // Create a dummy file containing linker-synthesized symbols.
  if (!ctx.arg.relocatable)
    create_internal_file(ctx);

  // Resolve symbols by choosing the most appropriate file for each
  // symbol. This pass also removes redundant comdat sections (e.g.
  // duplicate inline functions).
  resolve_symbols(ctx);

  // If there's an object file compiled with -flto, do link-time
  // optimization.
  if (has_lto_obj(ctx))
    do_lto(ctx);

  // Now that we know which object files are to be included to the
  // final output, we can remove unnecessary files.
  std::erase_if(ctx.objs, [](InputFile *file) { return !file->is_reachable; });
  std::erase_if(ctx.dsos, [](InputFile *file) { return !file->is_reachable; });

  // Parse .eh_frame section contents.
  parse_eh_frame_sections(ctx);

  // Split mergeable section contents into section pieces.
  create_merged_sections(ctx);

  // Handle --relocatable. Since the linker's behavior is quite different
  // from the normal one when the option is given, the logic is implemented
  // to a separate file.
  if (ctx.arg.relocatable) {
    combine_objects(ctx);
    return 0;
  }

  // Create .bss sections for common symbols.
  convert_common_symbols(ctx);

  // Apply version scripts.
  apply_version_script(ctx);

  // Parse symbol version suffixes (e.g. "foo@ver1").
  parse_symbol_version(ctx);

  // Set is_imported and is_exported bits for each symbol.
  compute_import_export(ctx);

  // Make sure that there's no duplicate symbol
  if (!ctx.arg.allow_multiple_definition)
    check_duplicate_symbols(ctx);

  // Handle --zero-to-bss, which converts data sections containing only
  // zeros into BSS.
  if (ctx.arg.zero_to_bss)
    convert_zero_to_bss(ctx);

  // Set "address-taken" bits for input sections.
  if (ctx.arg.icf)
    compute_address_significance(ctx);

  // Handle PPC64-specific .opd sections.
  if constexpr (is_ppc64v1)
    ppc64v1_rewrite_opd(ctx);

  // Garbage-collect unreachable sections.
  if (ctx.arg.gc_sections)
    gc_sections(ctx);

  // Merge identical read-only sections.
  if (ctx.arg.icf)
    icf_sections(ctx);

  // Create linker-synthesized sections such as .got or .plt.
  create_synthetic_sections(ctx);

  // Handle --no-allow-shlib-undefined
  if (!ctx.arg.allow_shlib_undefined)
    check_shlib_undefined(ctx);

  // Warn if symbols with different types are defined under the same name.
  check_symbol_types(ctx);

  // Bin input sections into output sections.
  create_output_sections(ctx);

  // Convert an .ARM.exidx to a synthetic section.
  if constexpr (is_arm32)
    create_arm_exidx_section(ctx);

  // Handle --section-align options.
  if (!ctx.arg.section_align.empty())
    apply_section_align(ctx);

  // Add synthetic symbols such as __ehdr_start or __end.
  add_synthetic_symbols(ctx);

  // Beyond this point, no new files will be added to ctx.objs
  // or ctx.dsos.

  // Handle `-z cet-report`.
  if (ctx.arg.z_cet_report != CET_REPORT_NONE)
    check_cet_errors(ctx);

  // Handle `-z execstack-if-needed`.
  if (ctx.arg.z_execstack_if_needed)
    for (ObjectFile *file : ctx.objs)
      if (file->needs_executable_stack)
        ctx.arg.z_execstack = true;

  // If we are linking a .so file, remaining undefined symbols do
  // not cause a linker error. Instead, they are treated as if they
  // were imported symbols.
  //
  // If we are linking an executable, weak undefs are converted to
  // weakly imported symbols so that they'll have another chance to be
  // resolved.
  claim_unresolved_symbols(ctx);

  // Beyond this point, no new symbols will be added to the result.

  // Handle --print-dependencies
  if (ctx.arg.print_dependencies)
    print_dependencies(ctx);

  // Handle --require-defined
  for (Symbol *sym : ctx.arg.require_defined)
    if (!sym->file)
      Error(ctx) << "--require-defined: undefined symbol: " << *sym;

  // .init_array and .fini_array contents have to be sorted by
  // a special rule. Sort them.
  sort_init_fini(ctx);

  // Likewise, .ctors and .dtors have to be sorted. They are rare
  // because they are superseded by .init_array/.fini_array, though.
  sort_ctor_dtor(ctx);

  // If .ctors/.dtors are to be placed to .init_array/.fini_array,
  // we need to reverse their contents.
  fixup_ctors_in_init_array(ctx);

  // Handle --shuffle-sections
  if (ctx.arg.shuffle_sections != SHUFFLE_SECTIONS_NONE)
    shuffle_sections(ctx);

  // Copy strings referred to by .dynamic to .dynstr.
  add_dynamic_strings(ctx);

  if constexpr (is_ppc64v1)
    ppc64v1_scan_symbols(ctx);

  // Scan relocations to find symbols that need entries in .got, .plt,
  // .got.plt, .dynsym, .dynstr, etc.
  scan_relocations(ctx);

  // Compute the is_weak bit for each imported symbol.
  compute_imported_symbol_weakness(ctx);

  // Sort sections by section attributes so that we'll have to
  // create as few segments as possible.
  sort_output_sections(ctx);

  // Handle --separate-debug-file.
  if (ctx.gnu_debuglink)
    separate_debug_sections(ctx);

  // Compute sizes of output sections while assigning offsets
  // within an output section to input sections.
  compute_section_sizes(ctx);

  // If --pack-dyn-relocs=relr was given, base relocations are stored
  // to a .relr.dyn section in a compressed form. Construct a compressed
  // relocations now so that we can fix section sizes and file layout.
  if (ctx.arg.pack_dyn_relocs_relr)
    construct_relr(ctx);

  // Reserve a space for dynamic symbol strings in .dynstr and sort
  // .dynsym contents if necessary. Beyond this point, no symbol will
  // be added to .dynsym.
  sort_dynsyms(ctx);

  // Sort .debug_info contents so that DWARF32 debug info precedes that of
  // DWARF64. This is to mitigate the possibility of a relocation overflow.
  sort_debug_info_sections(ctx);

  // Print reports about undefined symbols, if needed.
  if (ctx.arg.unresolved_symbols == UNRESOLVED_ERROR)
    report_undef_errors(ctx);

  // Fill .gnu.version_d section contents.
  if (ctx.verdef)
    ctx.verdef->construct(ctx);

  // Fill .gnu.version_r section contents.
  ctx.verneed->construct(ctx);

  // .eh_frame is a special section from the linker's point of view,
  // as its contents are parsed and reconstructed by the linker,
  // unlike other sections that are regarded as opaque bytes.
  // Here, we construct output .eh_frame contents.
  ctx.eh_frame->construct(ctx);

  // If --emit-relocs is given, we'll copy relocation sections from input
  // files to an output file.
  if (ctx.arg.emit_relocs)
    create_reloc_sections(ctx);

  // Compute .symtab and .strtab sizes for each file.
  if (!ctx.arg.strip_all)
    create_output_symtab(ctx);

  // Compute the section header values for all sections.
  compute_section_headers(ctx);

  // Assign offsets to output sections
  i64 filesize = set_osec_offsets(ctx);

  // On RISC-V, branches are encoded using multiple instructions so
  // that they can jump to anywhere in ±2 GiB by default. They may
  // be replaced with shorter instruction sequences if destinations
  // are close enough. Do this optimization. The same pass is run for
  // LoongArch.
  if constexpr (is_riscv || is_loongarch) {
    shrink_sections(ctx);
    filesize = set_osec_offsets(ctx);
  }

  // We've created range extension thunks with a pessimistic assumption
  // that all out-of-section references are out of range. Now that we know
  // the addresses of all sections, we can eliminate excessive thunks.
  if constexpr (needs_thunk) {
    remove_redundant_thunks(ctx);
    filesize = set_osec_offsets(ctx);
  }

  // On ARM32, drop duplicate exidx entries and lay out sections again.
  if constexpr (is_arm32) {
    if (ctx.extra.exidx) {
      ctx.extra.exidx->remove_duplicate_entries(ctx);
      filesize = set_osec_offsets(ctx);
    }
  }

  // At this point, memory layout is fixed.

  // Set actual addresses to linker-synthesized symbols.
  fix_synthetic_symbols(ctx);

  // Beyond this, you can assume that symbol addresses including their
  // GOT or PLT addresses have a correct final value.

  // If --compress-debug-sections is given, compress .debug_* sections
  // using zlib or zstd.
  if (ctx.arg.compress_debug_sections != ELFCOMPRESS_NONE) {
    compress_debug_sections(ctx);
    filesize = set_osec_offsets(ctx);
  }

  // At this point, both memory and file layouts are fixed.

  // Gather thunk symbols and attach them to themselves.
  if constexpr (needs_thunk)
    gather_thunk_addresses(ctx);

  t_before_copy.stop();

  // Create an output file
  ctx.output_file =
    OutputFile::open(ctx, ctx.arg.output, filesize, 0777);
  ctx.buf = ctx.output_file->buf;

  Timer t_copy(ctx, "copy");

  // Copy input sections to the output file and apply relocations.
  copy_chunks(ctx);

  if constexpr (is_arm32be)
    arm32be_swap_bytes(ctx);

  if constexpr (is_x86_64)
    if (ctx.arg.z_rewrite_endbr)
      rewrite_endbr(ctx);

  // Dynamic linker works better with sorted .rela.dyn section,
  // so we sort them.
  sort_reldyn(ctx);

  // .gdb_index's contents cannot be constructed before applying
  // relocations to other debug sections. We have relocated debug
  // sections now, so write the .gdb_index section.
  if (ctx.gdb_index && !ctx.gnu_debuglink)
    write_gdb_index(ctx);

  // .note.gnu.build-id section contains a cryptographic hash of the
  // entire output file. Now that we wrote everything except build-id,
  // we can compute it.
  if (ctx.buildid)
    write_build_id(ctx);

  if (ctx.gnu_debuglink)
    write_gnu_debuglink(ctx);

  t_copy.stop();
  ctx.checkpoint();

  // Close the output file. This is the end of the linker's main job.
  ctx.output_file->close(ctx);

  // Handle --dependency-file
  if (!ctx.arg.dependency_file.empty())
    write_dependency_file(ctx);

  if (!ctx.arg.plugin.empty())
    lto_cleanup(ctx);

  t_all.stop();

  if (ctx.arg.print_map)
    print_map(ctx);

  if (ctx.gnu_debuglink)
    write_separate_debug_file(ctx);

  // Show stats numbers
  if (ctx.arg.stats)
    show_stats(ctx);

  if (ctx.arg.perf)
    print_timer_records(ctx.timer_records);

  std::cout << std::flush;
  std::cerr << std::flush;

  notify_parent();
  release_global_lock();

  // With quick_exit set, leave immediately and skip the on_exit
  // callbacks below.
  if (ctx.arg.quick_exit)
    _exit(0);

  for (std::function &fn : ctx.on_exit)
    fn();
  ctx.checkpoint();
  return 0;
}

using E = MOLD_TARGET;

template int mold_main(int, char **);

} // namespace mold

================================================
FILE: src/mapfile.cc
================================================
#include "mold.h"

#include
#include
#include
#include
#include
#include

namespace mold {

// Concurrent map from an input section to the symbols defined in it.
template
using Map =
  tbb::concurrent_hash_map *, std::vector *>>;

// Builds the section-to-symbols map for every object file in ctx.objs.
// A symbol is recorded if it is owned by the file being visited, is not
// an STT_SECTION symbol and has an input section. Each symbol vector is
// then sorted by symbol value, except when the map has at most one entry,
// in which case it is returned as is.
template
static Map get_map(Context &ctx) {
  Map map;

  tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) {
    for (Symbol *sym : file->symbols) {
      if (sym->file == file && sym->get_type() != STT_SECTION) {
        if (InputSection *isec = sym->get_input_section()) {
          // Find or create the entry for this section, then append to it
          // through the accessor.
          typename Map::accessor acc;
          map.insert(acc, {isec, {}});
          acc->second.push_back(sym);
        }
      }
    }
  });

  if (map.size() <= 1)
    return map;

  tbb::parallel_for(map.range(), [](const typename Map::range_type &range) {
    for (auto &[k, v] : range)
      ranges::stable_sort(v, {}, &Symbol::value);
  });
  return map;
}

// Prints a link map. The output goes to std::cout unless ctx.arg.Map
// names a file (any value other than "" and "-").
template
void print_map(Context &ctx) {
  Timer t(ctx, "print_map");

  std::ostream *out = &std::cout;
  std::ofstream file;

  if (!ctx.arg.Map.empty() && ctx.arg.Map != "-") {
    file.open(ctx.arg.Map);
    if (file.fail())
      Fatal(ctx) << "--print-map: cannot open " << ctx.arg.Map
                 << ": " << errno_string();
    out = &file;
  }

  // Construct a section-to-symbol map.
  Map map = get_map(ctx);

  // Print a mapfile.
*out << " VMA Size Align Out In Symbol\n"; for (Chunk *chunk : ctx.chunks) { *out << std::showbase << std::setw(18) << std::hex << (u64)chunk->shdr.sh_addr << std::dec << std::setw(11) << (u64)chunk->shdr.sh_size << std::setw(6) << (u64)chunk->shdr.sh_addralign << " " << chunk->name << "\n"; OutputSection *osec = chunk->to_osec(); if (!osec) continue; std::span *> members = osec->members; std::vector bufs(members.size()); tbb::parallel_for((i64)0, (i64)members.size(), [&](i64 i) { InputSection *mem = members[i]; std::ostringstream ss; u64 addr = 0; if (osec->shdr.sh_flags & SHF_ALLOC) addr = osec->shdr.sh_addr + mem->offset; ss << std::showbase << std::setw(18) << std::hex << addr << std::dec << std::setw(11) << (u64)mem->sh_size << std::setw(6) << (1 << mem->p2align) << " " << *mem << "\n"; typename Map::const_accessor acc; if (map.find(acc, mem)) for (Symbol *sym : acc->second) ss << std::showbase << std::setw(18) << std::hex << sym->get_addr(ctx) << std::dec << " 0 0 " << *sym << "\n"; bufs[i] = ss.str(); }); for (std::string &str : bufs) *out << str; } } using E = MOLD_TARGET; template void print_map(Context &ctx); } // namespace mold ================================================ FILE: src/mapped-file-unix.cc ================================================ #include "mold.h" namespace mold { MappedFile *open_file_impl(const std::string &path, std::string &error) { i64 fd = ::open(path.c_str(), O_RDONLY); if (fd == -1) { if (errno != ENOENT) error = "opening " + path + " failed: " + errno_string(); return nullptr; } struct stat st; if (fstat(fd, &st) == -1) error = path + ": fstat failed: " + errno_string(); MappedFile *mf = new MappedFile; mf->name = path; mf->size = st.st_size; if (st.st_size > 0) { mf->data = (u8 *)mmap(nullptr, st.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); if (mf->data == MAP_FAILED) error = path + ": mmap failed: " + errno_string(); } close(fd); return mf; } void MappedFile::unmap() { if (size == 0 || parent || !data) return; 
munmap(data, size); data = nullptr; } void MappedFile::close_fd() { if (fd == -1) return; close(fd); fd = -1; } void MappedFile::reopen_fd(const std::string &path) { if (fd == -1) fd = open(path.c_str(), O_RDONLY); } } // namespace mold ================================================ FILE: src/mapped-file-win32.cc ================================================ #include "mold.h" namespace mold { MappedFile *open_file_impl(const std::string &path, std::string &error) { HANDLE fd = CreateFileA(path.c_str(), GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, nullptr, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr); if (fd == INVALID_HANDLE_VALUE) { auto err = GetLastError(); if (err != ERROR_FILE_NOT_FOUND) error = "opening " + path + " failed: " + errno_string(); return nullptr; } if (GetFileType(fd) != FILE_TYPE_DISK) { CloseHandle(fd); return nullptr; } DWORD size_hi; DWORD size_lo = GetFileSize(fd, &size_hi); if (size_lo == INVALID_FILE_SIZE) { error = path + ": GetFileSize failed: " + errno_string(); return nullptr; } u64 size = ((u64)size_hi << 32) + size_lo; MappedFile *mf = new MappedFile; mf->name = path; mf->size = size; mf->fd = fd; if (size > 0) { HANDLE h = CreateFileMapping(fd, nullptr, PAGE_READONLY, 0, size, nullptr); if (!h) { error = path + ": CreateFileMapping failed: " + errno_string(); return nullptr; } mf->data = (u8 *)MapViewOfFile(h, FILE_MAP_COPY, 0, 0, size); CloseHandle(h); if (!mf->data) { error = path + ": MapViewOfFile failed: " + errno_string(); return nullptr; } } return mf; } void MappedFile::unmap() { if (size == 0 || parent || !data) return; UnmapViewOfFile(data); if (fd != INVALID_HANDLE_VALUE) CloseHandle(fd); data = nullptr; } void MappedFile::close_fd() { if (fd == INVALID_HANDLE_VALUE) return; CloseHandle(fd); fd = INVALID_HANDLE_VALUE; } void MappedFile::reopen_fd(const std::string &path) { if (fd == INVALID_HANDLE_VALUE) fd = CreateFileA(path.c_str(), GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE | 
FILE_SHARE_DELETE, nullptr, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr); } } // namespace mold ================================================ FILE: src/mold-wrapper.c ================================================ #define _GNU_SOURCE 1 #include #include #include #include #include #include #include #include #if __has_include() # include #endif extern char **environ; static char *get_mold_path() { char *path = getenv("MOLD_PATH"); if (path) return path; fprintf(stderr, "MOLD_PATH is not set\n"); exit(1); } static void debug_print(const char *fmt, ...) { if (!getenv("MOLD_WRAPPER_DEBUG")) return; va_list ap; va_start(ap, fmt); fprintf(stderr, "mold-wrapper.so: "); vfprintf(stderr, fmt, ap); fflush(stderr); va_end(ap); } static int count_args(va_list *ap) { va_list aq; va_copy(aq, *ap); int i = 0; while (va_arg(aq, char *)) i++; va_end(aq); return i; } static void copy_args(char **argv, const char *arg0, va_list *ap) { int i = 1; char *arg; while ((arg = va_arg(*ap, char *))) argv[i++] = arg; ((const char **)argv)[0] = arg0; ((const char **)argv)[i] = NULL; } static bool is_ld(const char *path) { const char *ptr = path + strlen(path); while (path < ptr && ptr[-1] != '/') ptr--; return !strcmp(ptr, "ld") || !strcmp(ptr, "ld.lld") || !strcmp(ptr, "ld.gold") || !strcmp(ptr, "ld.bfd") || !strcmp(ptr, "ld.mold"); } int execvpe(const char *file, char *const *argv, char *const *envp) { debug_print("execvpe %s\n", file); if (!strcmp(file, "ld") || is_ld(file)) file = get_mold_path(); for (int i = 0; envp[i]; i++) putenv(envp[i]); typeof(execvpe) *real = dlsym(RTLD_NEXT, "execvp"); return real(file, argv, environ); } int execve(const char *path, char *const *argv, char *const *envp) { debug_print("execve %s\n", path); if (is_ld(path)) path = get_mold_path(); typeof(execve) *real = dlsym(RTLD_NEXT, "execve"); return real(path, argv, envp); } int execl(const char *path, const char *arg0, ...) 
{ va_list ap; va_start(ap, arg0); char **argv = alloca((count_args(&ap) + 2) * sizeof(char *)); copy_args(argv, arg0, &ap); va_end(ap); return execve(path, argv, environ); } int execlp(const char *file, const char *arg0, ...) { va_list ap; va_start(ap, arg0); char **argv = alloca((count_args(&ap) + 2) * sizeof(char *)); copy_args(argv, arg0, &ap); va_end(ap); return execvpe(file, argv, environ); } int execle(const char *path, const char *arg0, ...) { va_list ap; va_start(ap, arg0); char **argv = alloca((count_args(&ap) + 2) * sizeof(char *)); copy_args(argv, arg0, &ap); char **env = va_arg(ap, char **); va_end(ap); return execve(path, argv, env); } int execv(const char *path, char *const *argv) { return execve(path, argv, environ); } int execvp(const char *file, char *const *argv) { return execvpe(file, argv, environ); } int posix_spawn(pid_t *pid, const char *path, const posix_spawn_file_actions_t *file_actions, const posix_spawnattr_t *attrp, char *const *argv, char *const *envp) { debug_print("posix_spawn %s\n", path); if (is_ld(path)) path = get_mold_path(); typeof(posix_spawn) *real = dlsym(RTLD_NEXT, "posix_spawn"); return real(pid, path, file_actions, attrp, argv, envp); } int posix_spawnp(pid_t *pid, const char *file, const posix_spawn_file_actions_t *file_actions, const posix_spawnattr_t *attrp, char *const *argv, char *const *envp) { debug_print("posix_spawnp %s\n", file); if (is_ld(file)) file = get_mold_path(); typeof(posix_spawnp) *real = dlsym(RTLD_NEXT, "posix_spawnp"); return real(pid, file, file_actions, attrp, argv, envp); } ================================================ FILE: src/mold.h ================================================ #pragma once #include "../lib/lib.h" #include "elf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef _WIN32 # include #endif #if ENABLE_MSAN_UNPOISON extern "C" void 
__msan_unpoison(void *, size_t);
# define msan_unpoison(addr, sz) __msan_unpoison(addr, sz)
#else
// Without ENABLE_MSAN_UNPOISON the macro expands to nothing.
# define msan_unpoison(addr, sz)
#endif

namespace mold {

// Forward declarations.
template class InputFile;
template class InputSection;
template class MergedSection;
template class ObjectFile;
template class Chunk;
template class OutputSection;
template class SharedFile;
template class Symbol;
template struct CieRecord;
template struct Context;
template struct FdeRecord;
template class MergeableSection;
template class RelocSection;
struct ReaderContext;

template
std::ostream &operator<<(std::ostream &out, const Symbol &sym);

extern std::string mold_version;

//
// error.cc
//

// Some C++ stdlibs don't support std::osyncstream even though
// it is in the C++20 standard. So we implement it ourselves.
//
// Text streamed into a SyncStream is collected in a private buffer. It
// is written to the wrapped stream, followed by a newline, on the first
// call to emit() or on destruction, while holding a mutex shared by all
// SyncStream objects.
class SyncStream {
public:
  SyncStream(std::ostream &out) : out(out) {}
  ~SyncStream() { emit(); }

  // Writes the buffered text once; later calls do nothing.
  void emit() {
    if (!emitted) {
      std::scoped_lock lock(mu);
      out << ss.str() << '\n';
      emitted = true;
    }
  }

  template SyncStream &operator<<(T &&val) {
    ss << std::forward(val);
    return *this;
  }

private:
  std::ostream &out;
  std::stringstream ss;
  bool emitted = false;
  static inline std::mutex mu;
};

// Message sink that writes to std::cout.
template
class Out {
public:
  Out(Context &ctx) {}

  template Out &operator<<(T &&val) {
    out << std::forward(val);
    return *this;
  }

private:
  SyncStream out{std::cout};
};

// Message sink that writes to std::cerr. Its destructor is declared
// [[noreturn]].
template
class Fatal {
public:
  Fatal(Context &ctx);
  [[noreturn]] ~Fatal();

  template Fatal &operator<<(T &&val) {
    out << std::forward(val);
    return *this;
  }

private:
  SyncStream out{std::cerr};
};

// Message sink that writes to std::cerr.
template
class Error {
public:
  Error(Context &ctx);

  template Error &operator<<(T &&val) {
    out << std::forward(val);
    return *this;
  }

private:
  SyncStream out{std::cerr};
};

// Message sink whose stream is optional: text streamed into it is
// dropped while `out` is empty.
template
class Warn {
public:
  Warn(Context &ctx);

  template Warn &operator<<(T &&val) {
    if (out)
      *out << std::forward(val);
    return *this;
  }

private:
  std::optional out;
};

//
// signal-unix.cc
//

inline char *output_tmpfile = nullptr;
inline u8 *output_buffer_start = nullptr;
inline u8 *output_buffer_end = nullptr;

std::string errno_string();
void cleanup();
void install_signal_handler();

//
// mapped-file-unix.cc
//

// MappedFile represents an mmap'ed input file.
// mold uses mmap-IO only.
class MappedFile {
public:
  ~MappedFile() { unmap(); }
  void unmap();
  void close_fd();
  void reopen_fd(const std::string &path);

  // Creates a MappedFile that refers to `size` bytes of this file
  // starting at `start`. The slice records this object as its parent and
  // is added to ctx.mf_pool.
  template
  MappedFile *slice(Context &ctx, std::string name, u64 start, u64 size) {
    MappedFile *mf = new MappedFile;
    mf->name = name;
    mf->data = data + start;
    mf->size = size;
    mf->parent = this;

    ctx.mf_pool.emplace_back(mf);
    return mf;
  }

  // Returns a view of the file contents.
  std::string_view get_contents() {
    return std::string_view((char *)data, size);
  }

  // Returns the offset of this file's data from the start of its
  // outermost parent, or 0 if it has no parent.
  i64 get_offset() const {
    return parent ? (data - parent->data + parent->get_offset()) : 0;
  }

  // Returns a string that uniquely identifies a file that is possibly
  // in an archive.
  std::string get_identifier() const {
    if (parent) {
      // We use the file offset within an archive as an identifier
      // because archive members may have the same name.
      return parent->name + ":" + std::to_string(get_offset());
    }

    if (thin_parent) {
      // If this is a thin archive member, the filename part is
      // guaranteed to be unique.
      return thin_parent->name + ":" + name;
    }
    return name;
  }

  std::string name;
  u8 *data = nullptr;
  i64 size = 0;
  bool given_fullpath = true;
  MappedFile *parent = nullptr;
  MappedFile *thin_parent = nullptr;

  // For --dependency-file
  bool is_dependency = true;

#ifdef _WIN32
  HANDLE fd = INVALID_HANDLE_VALUE;
#else
  int fd = -1;
#endif
};

MappedFile *open_file_impl(const std::string &path, std::string &error);

// Opens `path` with open_file_impl(). An absolute path is first
// re-rooted under ctx.arg.chroot if that is set. A non-empty error
// message is fatal. A file that was opened is added to ctx.mf_pool; the
// result may be null.
template
MappedFile *open_file(Context &ctx, std::string path) {
  if (path.starts_with('/') && !ctx.arg.chroot.empty())
    path = ctx.arg.chroot + "/" + path_clean(path);

  std::string error;
  MappedFile *mf = open_file_impl(path, error);
  if (!error.empty())
    Fatal(ctx) << error;

  if (mf)
    ctx.mf_pool.emplace_back(mf);
  return mf;
}

// Same as open_file(), but a null result is fatal as well.
template
MappedFile *must_open_file(Context &ctx, std::string path) {
  MappedFile *mf = open_file(ctx, path);
  if (!mf)
    Fatal(ctx) << "cannot open " << path << ": " << errno_string();
  return mf;
}

//
// jobs-unix.cc
//

void acquire_global_lock();
void release_global_lock();

//
// Mergeable section fragments
//

template
struct __attribute__((aligned(4))) SectionFragment {
  SectionFragment(MergedSection *sec, bool is_alive)
    : output_section(*sec), is_alive(is_alive) {}

  // Address of the fragment: the output section's address plus `offset`.
  u64 get_addr(Context &ctx) const {
    return output_section.shdr.sh_addr + offset;
  }

  MergedSection &output_section;
  i64 offset = -1;
  Atomic p2align = 0;
  Atomic is_alive = false;

  // True if this fragment must be placed within 2^32 bytes from the
  // start of the output section.
  Atomic is_32bit = false;
};

// Additional class members for dynamic symbols. Because most symbols
// don't need them and we allocate tens of millions of symbol objects
// for large programs, we separate them from `Symbol` class to save
// memory.
template
struct SymbolAux {
  // Every index below starts out as -1.
  i32 got_idx = -1;
  i32 gottp_idx = -1;
  i32 tlsgd_idx = -1;
  i32 tlsdesc_idx = -1;
  i32 plt_idx = -1;
  i32 pltgot_idx = -1;
  i32 dynsym_idx = -1;
  i32 opd_idx = -1;
  u32 djb_hash = 0;

  // For range extension thunks
  std::vector thunk_addrs;
};

//
// thunks.cc
//

template
class Thunk {};

// A thunk placed at `offset` within an output section.
template
class Thunk {
public:
  Thunk(OutputSection &osec, i64 offset)
    : output_section(osec), offset(offset) {}

  // Address of the thunk: the output section's address plus `offset`.
  u64 get_addr() const { return output_section.shdr.sh_addr + offset; }

  // Size of the thunk: the last element of `offsets`, which must
  // therefore not be empty.
  i64 size() { return offsets.back(); }

  void compute_size();

  // The generic version simply recomputes the size.
  void shrink_size(Context &ctx) { compute_size(); }
  void copy_buf(Context &ctx);

  OutputSection &output_section;
  i64 offset;
  std::vector *> symbols;
  std::vector offsets;
  std::string name;
};

template <> void Thunk::shrink_size(Context &);
template <> void Thunk::shrink_size(Context &);

// Returns the reach of a branch instruction on the target, in bytes.
template
static consteval i64 get_branch_distance() {
  // ARM64's branch has 26 bits immediate. The immediate is padded with
  // implicit two-bit zeros because all instructions are 4 bytes aligned
  // and therefore the least two bits are always zero. So the branch
  // operand is effectively 28 bits long. That means the branch range is
  // [-2^27, 2^27) or PC ± 128 MiB.
  if (is_arm64)
    return 1 << 27;

  // ARM32's Thumb branch has 24 bits immediate, and the instructions are
  // aligned to 2, so it's effectively 25 bits. It's [-2^24, 2^24) or PC ±
  // 16 MiB.
  //
  // ARM32's non-Thumb branches have twice longer range than its Thumb
  // counterparts, but we conservatively use the Thumb's limitation.
  if (is_arm32)
    return 1 << 24;

  // PPC's branch has 24 bits immediate, and the instructions are aligned
  // to 4, therefore the reach is [-2^25, 2^25) or PC ± 32 MiB.
  assert(is_ppc);
  return 1 << 25;
}

// The maximum distance of branch instructions used for function calls.
//
// The exact origin for computing a destination varies slightly depending
// on the target architecture. For example, ARM32's B instruction jumps to
// the branch's address + immediate + 4 (i.e., B with offset 0 jumps to
// the next instruction), while RISC-V has no such implicit bias. Here, we
// subtract 32 as a safety margin that is large enough for all targets.
template
static constexpr i64 branch_distance = get_branch_distance() - 32;

template
void remove_redundant_thunks(Context &ctx);

template
void gather_thunk_addresses(Context &ctx);

//
// input-sections.cc
//

// .eh_frame section contains CIE and FDE records to teach the runtime
// how to handle exceptions. Usually, a .eh_frame contains one CIE
// followed by as many FDEs as the number of functions defined by the
// file. CIE contains common information for FDEs (it is actually
// short for Common Information Entry). FDE contains the start address
// of a function and its length as well as how to handle exceptions
// for that function.
//
// Unlike other sections, the linker has to parse .eh_frame for optimal
// output for the following reasons:
//
// - Compilers tend to emit the same CIE as long as the programming
//   language is the same, so CIEs in input object files are almost
//   always identical. We want to merge them to make a resulting
//   .eh_frame smaller.
//
// - If we eliminate a function (e.g. when we see two object files
//   containing the duplicate definition of an inlined function), we
//   want to also eliminate a corresponding FDE so that a resulting
//   .eh_frame doesn't contain a dead FDE entry.
//
// - If we need to compare two function definitions for equality for
//   ICF, we need to compare not only the function body but also its
//   exception handlers.
//
// Note that we assume that the first relocation entry for an FDE
// always points to the function that the FDE is associated to.
template
struct CieRecord {
  CieRecord(Context &ctx, ObjectFile &file, InputSection &isec,
            u32 input_offset, std::span> rels, u32 rel_idx)
    : file(file), input_section(isec), input_offset(input_offset),
      rel_idx(rel_idx), rels(rels), contents(file.get_string(ctx, isec.shdr())) {}

  // Size of the record: the 32-bit value stored at `input_offset` plus 4.
  i64 size() const {
    return *(U32 *)(contents.data() + input_offset) + 4;
  }

  // Bytes of this record within the section contents.
  std::string_view get_contents() const {
    return contents.substr(input_offset, size());
  }

  // Relocations starting at `rel_idx` whose offsets lie before the end
  // of this record.
  std::span> get_rels() const {
    i64 end = input_offset + size();
    i64 i = rel_idx;
    while (i < rels.size() && rels[i].r_offset < end)
      i++;
    return rels.subspan(rel_idx, i - rel_idx);
  }

  ObjectFile &file;
  InputSection &input_section;
  u32 input_offset = -1;
  u32 output_offset = -1;
  u32 rel_idx = -1;
  u32 icf_idx = -1;
  bool is_leader = false;
  std::span> rels;
  std::string_view contents;
};

template
bool cie_equals(const CieRecord &a, const CieRecord &b);

template
struct FdeRecord {
  FdeRecord(u32 input_offset, u32 rel_idx)
    : input_offset(input_offset), rel_idx(rel_idx) {}

  // Size of the record: the 32-bit value stored at `input_offset` plus 4.
  // The section contents are reached through the CIE at `cie_idx` in
  // `file`.
  i64 size(ObjectFile &file) const {
    return *(U32 *)(file.cies[cie_idx].contents.data() + input_offset) + 4;
  }

  // Bytes of this record within the section contents.
  std::string_view get_contents(ObjectFile &file) const {
    return file.cies[cie_idx].contents.substr(input_offset, size(file));
  }

  // Relocations starting at `rel_idx` whose offsets lie before the end
  // of this record.
  std::span> get_rels(ObjectFile &file) const {
    std::span> rels = file.cies[cie_idx].rels;
    i64 end = input_offset + size(file);
    i64 i = rel_idx;
    while (i < rels.size() && rels[i].r_offset < end)
      i++;
    return rels.subspan(rel_idx, i - rel_idx);
  }

  u32 input_offset = -1;
  u32 output_offset = -1;
  u32 rel_idx = -1;
  u16 cie_idx = -1;
  Atomic is_alive = true;
};

// A struct to hold target-dependent input section members.
template
struct InputSectionExtras {};

template
struct InputSectionExtras {
  InputSection *exidx = nullptr;
};

struct RelocDelta {
  u64 offset;
  i64 delta;
};

// RISC-V and LoongArch support code-shrinking linker relaxation.
//
// r_deltas is used to manage the locations where instructions are removed
// from a section. r_deltas is sorted by offset. Each RelocDelta indicates
// that the contents at and after `offset` and up to the next RelocDelta
// offset need to be shifted towards the beginning of the section by
// `delta` bytes when copying section contents to the output buffer.
//
// Since code-shrinking relaxation never bloats section contents, `delta`
// increases monotonically within the vector as well.
template requires is_riscv || is_loongarch
struct InputSectionExtras {
  std::vector r_deltas;
};

// InputSection represents a section in an input object file.
template
class __attribute__((aligned(4))) InputSection {
public:
  InputSection(Context &ctx, ObjectFile &file, i64 shndx);

  void uncompress(Context &ctx);
  void copy_contents_to(Context &ctx, u8 *buf, i64 sz);
  void scan_relocations(Context &ctx);
  void write_to(Context &ctx, u8 *buf);
  void apply_reloc_alloc(Context &ctx, u8 *base);
  void apply_reloc_nonalloc(Context &ctx, u8 *base);
  void kill();

  std::string_view name() const;
  i64 get_priority() const;
  u64 get_addr() const;
  ElfShdr &shdr() const;
  std::span> get_rels(Context &ctx) const;
  std::span> get_fdes() const;
  std::string_view get_func_name(Context &ctx, i64 offset) const;
  bool is_relr_reloc(Context &ctx, const ElfRel &rel) const;
  bool icf_removed() const;

  bool record_undef_error(Context &ctx, const ElfRel &rel);
  void check_range(Context &ctx, i64 i, i64 val, i64 lo, i64 hi);

  std::pair *, i64>
  get_fragment(Context &ctx, const ElfRel &rel);

  ObjectFile &file;
  OutputSection *output_section = nullptr;
  i64 sh_size = -1;

  std::string_view contents;

  i32 fde_begin = -1;
  i32 fde_end = -1;

  i64 offset = -1;
  i32 shndx = -1;
  i32 relsec_idx = -1;
  i32 reldyn_offset = 0;

  bool uncompressed = false;

  // For COMDAT de-duplication and garbage collection
  Atomic is_alive = true;
  u8 p2align = 0;

  // For ICF
  Atomic address_taken = false;

  // For garbage collection
  Atomic is_visited = false;

  // For ICF
  //
  // `leader` is the section that this section has been merged with.
  // Three kinds of values are possible:
  // - `leader == nullptr`: This section was not eligible for ICF.
  // - `leader == this`: This section was retained.
  // - `leader != this`: This section was merged with another identical section.
  InputSection *leader = nullptr;
  i32 icf_idx = -1;
  bool icf_eligible = false;
  bool icf_leaf = false;

  // Target-dependent members; see InputSectionExtras.
  [[no_unique_address]] InputSectionExtras extra;

private:
  void scan_pcrel(Context &ctx, Symbol &sym, const ElfRel &rel);
  void scan_absrel(Context &ctx, Symbol &sym, const ElfRel &rel);
  void scan_tlsdesc(Context &ctx, Symbol &sym);
  void check_tlsle(Context &ctx, Symbol &sym, const ElfRel &rel);

  void apply_dyn_absrel(Context &ctx, Symbol &sym, const ElfRel &rel,
                        u8 *loc, u64 S, i64 A, u64 P, ElfRel **dynrel);

  void apply_toc_rel(Context &ctx, Symbol &sym, const ElfRel &rel,
                     u8 *loc, u64 S, i64 A, u64 P, ElfRel **dynrel);

  std::optional get_tombstone(Symbol &sym, SectionFragment *frag);
};

//
// tls.cc
//

template u64 get_tp_addr(const ElfPhdr &);
template u64 get_dtp_addr(const ElfPhdr &);

//
// filetype.cc
//

// Kinds of input files distinguished by get_file_type().
enum class FileType {
  UNKNOWN,
  EMPTY,
  ELF_OBJ,
  ELF_DSO,
  AR,
  THIN_AR,
  TEXT,
  GCC_LTO_OBJ,
  LLVM_BITCODE,
};

template
FileType get_file_type(Context &ctx, MappedFile *mf);

template
std::string_view
get_machine_type(Context &ctx, ReaderContext &rctx, MappedFile *mf);

//
// output-chunks.cc
//

template
Chunk *find_chunk(Context &ctx, u32 sh_type);

template
Chunk *find_chunk(Context &ctx, std::string_view name);

// The generic version returns 0.
template
u64 get_eflags(Context &ctx) {
  return 0;
}

template
i64 to_phdr_flags(Context &ctx, Chunk *chunk);

template
void write_plt_header(Context &ctx, u8 *buf);

template
void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym);

template
void write_pltgot_entry(Context &ctx, u8 *buf, Symbol &sym);

// Chunk represents a contiguous region in an output file.
template class __attribute__((aligned(4))) Chunk { public: virtual ~Chunk() = default; virtual bool is_header() { return false; } virtual OutputSection *to_osec() { return nullptr; } virtual void compute_section_size(Context &ctx) {} virtual i64 get_reldyn_size(Context &ctx) const { return 0; } virtual void construct_relr(Context &ctx) {} virtual void copy_buf(Context &ctx) {} virtual void write_to(Context &ctx, u8 *buf) { unreachable(); } virtual void update_shdr(Context &ctx) {} std::string_view name; ElfShdr shdr = { .sh_addralign = 1 }; i64 shndx = 0; bool is_relro = false; // For --gdb-index bool is_compressed = false; // Some synethetic sections add local symbols to the output. // For example, range extension thunks adds function_name@thunk // symbol for each thunk entry. The following members are used // for such synthesizing symbols. virtual void compute_symtab_size(Context &ctx) {} virtual void populate_symtab(Context &ctx) {} i64 local_symtab_idx = 0; i64 num_local_symtab = 0; i64 strtab_size = 0; i64 strtab_offset = 0; // Offset in .rel.dyn i64 reldyn_offset = 0; // For --section-order i64 sect_order = 0; // For --pack-dyn-relocs=relr std::vector relr; }; // ELF header which is at the beginning of each ELF file. template class OutputEhdr : public Chunk { public: OutputEhdr(u32 sh_flags) { this->name = "EHDR"; this->shdr.sh_flags = sh_flags; this->shdr.sh_size = sizeof(ElfEhdr); this->shdr.sh_addralign = sizeof(Word); } bool is_header() override { return true; } void copy_buf(Context &ctx) override; }; // OutputShdr represents the section header. The section header is usually // located at the end of an ELF file and is optional for executables. // Executables work without it because the runtime only reads the program // header. 
Section header is significant only in object files and not // needed at runtime template class OutputShdr : public Chunk { public: OutputShdr() { this->name = "SHDR"; this->shdr.sh_size = 1; this->shdr.sh_addralign = sizeof(Word); } bool is_header() override { return true; } void copy_buf(Context &ctx) override; }; // Program header, a.k.a. segment header. Each entry in the program header // represents a contiguous region of memory and has attributes such as // page protection bits. On program startup, the kernel mmap's the file // contents to memory based on the program header. template class OutputPhdr : public Chunk { public: OutputPhdr(u32 sh_flags) { this->name = "PHDR"; this->shdr.sh_flags = sh_flags; this->shdr.sh_addralign = sizeof(Word); } bool is_header() override { return true; } void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; std::vector> phdrs; }; // .interp contains the pathname of a dynamic linker. Dynamically-linked // executables have the section. If exists, the kernel runs the program at // the specified path with the executable pathname as an argument, // allowing the dynamic linker to run the program. template class InterpSection : public Chunk { public: InterpSection() { this->name = ".interp"; this->shdr.sh_type = SHT_PROGBITS; this->shdr.sh_flags = SHF_ALLOC; } void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; }; enum AbsRelKind { ABS_REL_NONE, ABS_REL_BASEREL, ABS_REL_RELR, ABS_REL_IFUNC, ABS_REL_DYNREL, }; // Represents a word-size absolute relocation (e.g. R_X86_64_64) template struct AbsRel { InputSection *isec = nullptr; u64 offset = 0; Symbol *sym = nullptr; i64 addend = 0; AbsRelKind kind = ABS_REL_NONE; }; // OutputSection represents the usual output section that contains input // sections read from object files. 
template class OutputSection : public Chunk { public: OutputSection(std::string_view name, u32 type) { this->name = name; this->shdr.sh_type = type; } OutputSection *to_osec() override { return this; } void compute_section_size(Context &ctx) override; i64 get_reldyn_size(Context &ctx) const override; void construct_relr(Context &ctx) override; void copy_buf(Context &ctx) override; void write_to(Context &ctx, u8 *buf) override; void compute_symtab_size(Context &ctx) override; void populate_symtab(Context &ctx) override; void scan_abs_relocations(Context &ctx); void create_range_extension_thunks(Context &ctx); std::vector *> members; std::vector>> thunks; std::unique_ptr> reloc_sec; std::vector> abs_rels; Atomic sh_flags; // Used only by create_output_sections() std::vector *>> members_vec; }; // .got is a linker-synthesized constant pool whose entry size is the same // as the pointer size. It is used to store runtime addresses of global // variables and TP-relative offsets of thread-local variables. template class GotSection : public Chunk { public: GotSection() { this->name = ".got"; this->is_relro = true; this->shdr.sh_type = SHT_PROGBITS; this->shdr.sh_flags = SHF_ALLOC | SHF_WRITE; this->shdr.sh_addralign = sizeof(Word); // We always create a .got so that _GLOBAL_OFFSET_TABLE_ has // something to point to. s390x psABI define GOT[1] and GOT[2] // as reserved slots, so we allocate two more for them. this->shdr.sh_size = (is_s390x ? 
3 : 1) * sizeof(Word); } void add_got_symbol(Context &ctx, Symbol *sym); void add_gottp_symbol(Context &ctx, Symbol *sym); void add_tlsgd_symbol(Context &ctx, Symbol *sym); void add_tlsdesc_symbol(Context &ctx, Symbol *sym); void add_tlsld(Context &ctx); u64 get_tlsld_addr(Context &ctx) const; bool has_tlsld(Context &ctx) const { return tlsld_idx != -1; } i64 get_reldyn_size(Context &ctx) const override; void copy_buf(Context &ctx) override; void construct_relr(Context &ctx) override; void compute_symtab_size(Context &ctx) override; void populate_symtab(Context &ctx) override; std::vector *> got_syms; std::vector *> tlsgd_syms; std::vector *> tlsdesc_syms; std::vector *> gottp_syms; i64 tlsld_idx = -1; }; // .got.plt is similar to .got in the sense that it is a table containing // pointers. The contents in .got.plt are function pointers used by .plt. template class GotPltSection : public Chunk { public: GotPltSection(Context &ctx) { this->name = ".got.plt"; this->is_relro = ctx.arg.z_now; this->shdr.sh_type = is_ppc64 ? SHT_NOBITS : SHT_PROGBITS; this->shdr.sh_flags = SHF_ALLOC | SHF_WRITE; this->shdr.sh_addralign = sizeof(Word); this->shdr.sh_size = HDR_SIZE; } void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; static constexpr i64 HDR_SIZE = (is_ppc64v2 ? 2 : 3) * sizeof(Word); static constexpr i64 ENTRY_SIZE = (is_ppc64v1 ? 3 : 1) * sizeof(Word); }; // .plt contains linker-synthesized stub code that acts as if they are // functions. They are in fact immediately branches to real function entry // points. .plt is used as a stub for runtime lazy symbol resolution. 
template class PltSection : public Chunk { public: PltSection() { this->name = ".plt"; this->shdr.sh_type = SHT_PROGBITS; if constexpr (is_sparc) { this->shdr.sh_flags = SHF_ALLOC | SHF_EXECINSTR | SHF_WRITE; this->shdr.sh_addralign = 256; } else { this->shdr.sh_flags = SHF_ALLOC | SHF_EXECINSTR; this->shdr.sh_addralign = 16; } } void add_symbol(Context &ctx, Symbol *sym); void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; void compute_symtab_size(Context &ctx) override; void populate_symtab(Context &ctx) override; std::vector *> symbols; }; // .plt.got is similar to .plt but doesn't support lazy symbol resolution. // If we have the same symbol already in .got, resolving the same symbol // lazily for .plt is just waste of time. Therefore, in such case, we use // .plt.got for that symbol instead. template class PltGotSection : public Chunk { public: PltGotSection() { this->name = ".plt.got"; this->shdr.sh_type = SHT_PROGBITS; this->shdr.sh_flags = SHF_ALLOC | SHF_EXECINSTR; this->shdr.sh_addralign = 16; } void add_symbol(Context &ctx, Symbol *sym); void copy_buf(Context &ctx) override; void compute_symtab_size(Context &ctx) override; void populate_symtab(Context &ctx) override; std::vector *> symbols; }; // .rel.plt contains relocation information for .plt. template class RelPltSection : public Chunk { public: RelPltSection() { this->name = E::is_rela ? ".rela.plt" : ".rel.plt"; this->shdr.sh_type = E::is_rela ? SHT_RELA : SHT_REL; this->shdr.sh_flags = SHF_ALLOC; this->shdr.sh_entsize = sizeof(ElfRel); this->shdr.sh_addralign = sizeof(Word); } void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; }; // .rel.dyn contains relocation infromation for other sections. template class RelDynSection : public Chunk { public: RelDynSection() { this->name = E::is_rela ? ".rela.dyn" : ".rel.dyn"; this->shdr.sh_type = E::is_rela ? 
SHT_RELA : SHT_REL; this->shdr.sh_flags = SHF_ALLOC; this->shdr.sh_entsize = sizeof(ElfRel); this->shdr.sh_addralign = sizeof(Word); } void update_shdr(Context &ctx) override; }; // .relr.dyn is a relatively new section to contain base relocation // information. // // A relocatable executable/DSO contains a lot of certain type of // relocation entries, called "base relocations", to specify the locations // of pointers in the FILE that need to be adjusted according to the // desired load address and the actual load address. As an example, // consider the following C code. // // extern int foo; // int *bar = &foo; // // If an executable containing the above code is built as relocatable // executable, meaning that the executable can be loaded not to a specific // address in memory but anywhere in the virtual address space, then the // pointer `bar`'s address is not known at link-time. // // The linker temporarily links the executable to a base address, record // that information to the ELF header, and emits dynamic relocations to // refer to the location of `bar`. At runtime, the loader adds the // difference of the expected load address and the actual one to the // pointer value to fix the pointer value. // // Relocatable executables/DSOs usually contain a fairly large number of // base relocations. In particular, C++ virtual function table is an array // of statically-initialized pointers which need base relocations. // // Notice that base relocations don't contain symbol information. They // need only pointer locations in the ELF file that need fixing at // load-time. Therefore, storing that information to the usual ELF // relocation table is waste of space. // // .relr.dyn is designed to store base relocations in a space-efficient way. 
template class RelrDynSection : public Chunk { public: RelrDynSection() { this->name = ".relr.dyn"; this->shdr.sh_type = SHT_RELR; this->shdr.sh_flags = SHF_ALLOC; this->shdr.sh_entsize = sizeof(Word); this->shdr.sh_addralign = sizeof(Word); } void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; }; // .strtab is referenced by .strtab and contains symbol names. Note that // .strtab is not needed at runtime; one can remove the section from an // ELF file without breaking it. Strings that runtime accesses are stored // in .dynstr. template class StrtabSection : public Chunk { public: StrtabSection() { this->name = ".strtab"; this->shdr.sh_type = SHT_STRTAB; } void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; // Offsets in .strtab for ARM32 mapping symbols static constexpr i64 ARM = 1; static constexpr i64 THUMB = 4; static constexpr i64 DATA = 7; }; // .shstrtab contains section names, such as ".text" or ".data". Just like // .strtab, .shstrtab is not needed at runtime. One can remove .shstrtab // and section table from an executable without breaking it. template class ShstrtabSection : public Chunk { public: ShstrtabSection() { this->name = ".shstrtab"; this->shdr.sh_type = SHT_STRTAB; } void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; }; // .dynstr contains strings that the runtime uses. template class DynstrSection : public Chunk { public: DynstrSection() { this->name = ".dynstr"; this->shdr.sh_type = SHT_STRTAB; this->shdr.sh_flags = SHF_ALLOC; } i64 add_string(std::string_view str); i64 find_string(std::string_view str); void copy_buf(Context &ctx) override; private: std::unordered_map strings; }; // .dynamic contains various information for dynamically-linked ELF files. // At runtime, the dynamic linker reads the information to work // appropriately. 
template class DynamicSection : public Chunk { public: DynamicSection(Context &ctx) { this->name = ".dynamic"; this->shdr.sh_type = SHT_DYNAMIC; this->shdr.sh_addralign = sizeof(Word); this->shdr.sh_entsize = sizeof(ElfDyn); if (ctx.arg.z_rodynamic) { this->shdr.sh_flags = SHF_ALLOC; this->is_relro = false; } else { this->shdr.sh_flags = SHF_ALLOC | SHF_WRITE; this->is_relro = true; } } void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; }; template std::optional> to_output_esym(Context &ctx, Symbol &sym, u32 st_name, U32 *shndx); // .symtab contains non-dynamic symbols. The section is not needed at // runtime and can be stripped from an ELF file without affecting the // behavior of the program. Symbols in .symtab are mainly for debugging. template class SymtabSection : public Chunk { public: SymtabSection() { this->name = ".symtab"; this->shdr.sh_type = SHT_SYMTAB; this->shdr.sh_entsize = sizeof(ElfSym); this->shdr.sh_addralign = sizeof(Word); } void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; }; // .symtab_shndx is a parallel table for .symtab to contain section // indices for symbols. // // Symbol table entry contains a field for section index, but that's only // 16 bit in size, so it cannot refer to a section whose section index is // greater than 65535. We use .symtab_shndx for ELF files containing a lot // of sections. // // Use of this section is exceptional. Most ELF files don't contain one. template class SymtabShndxSection : public Chunk { public: SymtabShndxSection() { this->name = ".symtab_shndx"; this->shdr.sh_type = SHT_SYMTAB_SHNDX; this->shdr.sh_entsize = 4; this->shdr.sh_addralign = 4; } }; // .dynsym contains symbols for dynamic linking. This is similar to // .symtab, but .dynsym contains data that the runtime uses. 
template class DynsymSection : public Chunk { public: DynsymSection() { this->name = ".dynsym"; this->shdr.sh_type = SHT_DYNSYM; this->shdr.sh_flags = SHF_ALLOC; this->shdr.sh_entsize = sizeof(ElfSym); this->shdr.sh_addralign = sizeof(Word); } void add_symbol(Context &ctx, Symbol *sym); void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; std::vector *> symbols; i64 dynstr_offset = -1; }; // .hash contains an on-disk hash table for .dynsym so that the runtime // can look up a symbol name quickly without scannin all entries in // .dynsym. // // Quickly identifying whether or not a .dynsym contains a given symbol is // especially important for ELF because of the dynamic symbol lookup rule // for ELF. In ELF, each dynamic symbol is not searched from a specific // library but from all the ELF files loaded to memory. Therefore, // minimizing the cost of each dynamic symbol lookup is important. template class HashSection : public Chunk { public: HashSection() { this->name = ".hash"; this->shdr.sh_type = SHT_HASH; this->shdr.sh_flags = SHF_ALLOC; this->shdr.sh_entsize = sizeof(Entry); this->shdr.sh_addralign = sizeof(Entry); } void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; private: // Even though u32 should suffice as an etnry size for all targets, // s390x uses u64. It looks like a spec bug, but we need to follow // suit for the sake of binary compatibility. using Entry = std::conditional_t, U64, U32>; }; // .gnu.hash is an alternative format for .hash. It contains not only an // on-disk hash table but also contains a bloom filter to quickly identify // whether or not a given symbol name exists in .dynsym. 
template class GnuHashSection : public Chunk { public: GnuHashSection() { this->name = ".gnu.hash"; this->shdr.sh_type = SHT_GNU_HASH; this->shdr.sh_flags = SHF_ALLOC; this->shdr.sh_addralign = sizeof(Word); } void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; static constexpr i64 LOAD_FACTOR = 8; static constexpr i64 HEADER_SIZE = 16; static constexpr i64 BLOOM_SHIFT = 26; i64 num_buckets = -1; i64 num_bloom = 1; i64 num_exported = -1; }; // MergedSection represents a section containing a constant pool such as // string literals or floating-point constants. It is created from // MergeableSection. template class MergedSection : public Chunk { public: static MergedSection * get_instance(Context &ctx, std::string_view name, const ElfShdr &shdr); SectionFragment *insert(Context &ctx, std::string_view data, u64 hash, i64 p2align); void resolve(Context &ctx); void compute_section_size(Context &ctx) override; void copy_buf(Context &ctx) override; void write_to(Context &ctx, u8 *buf) override; void print_stats(Context &ctx); std::vector *> members; std::mutex mu; ConcurrentMap> map; HyperLogLog estimator; bool resolved = false; private: MergedSection(std::string_view name, i64 flags, i64 type, i64 entsize); std::vector shard_offsets; }; // .eh_frame contains runtime information as to how to handle exceptions // for each function. Each input object file contains one .eh_frame section. // We parse input .eh_frame sections, merge their contents and emit the // merged information to .eh_frame. template class EhFrameSection : public Chunk { public: EhFrameSection() { this->name = ".eh_frame"; this->shdr.sh_type = SHT_PROGBITS; this->shdr.sh_flags = SHF_ALLOC; this->shdr.sh_addralign = sizeof(Word); } void construct(Context &ctx); void apply_eh_reloc(Context &ctx, const ElfRel &rel, u64 offset, u64 val); void copy_buf(Context &ctx) override; }; // .eh_frame_hdr is a lookup table for .eh_frame. 
Entries in .eh_frame_hdr // are sorted by their dcorresponding function addresses, so tha the // runtime can quickly find an exception-handling record for the current // function by binary search. Without .eh_frame_hdr, the runtime would // have had to do linear search in .eh_frame. template class EhFrameHdrSection : public Chunk { public: EhFrameHdrSection() { this->name = ".eh_frame_hdr"; this->shdr.sh_type = SHT_PROGBITS; this->shdr.sh_flags = SHF_ALLOC; this->shdr.sh_addralign = 4; this->shdr.sh_size = HEADER_SIZE; } static constexpr i64 HEADER_SIZE = 12; void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; i64 num_fdes = 0; }; // EhFrameRelocSection contains relcoation records for .eh_frame. We use // this class only for relocatable outputs (i.e. the output is an .o file // as opposed to an executable or a .so file.) template class EhFrameRelocSection : public Chunk { public: EhFrameRelocSection() { this->name = E::is_rela ? ".rela.eh_frame" : ".rel.eh_frame"; this->shdr.sh_type = E::is_rela ? SHT_RELA : SHT_REL; this->shdr.sh_flags = SHF_INFO_LINK; this->shdr.sh_addralign = sizeof(Word); this->shdr.sh_entsize = sizeof(ElfRel); } void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; }; // .copyrel and .copyrel.rel.ro represent memory regions to which the // runtime copies symbols from other ELF files for copy relocations. template class CopyrelSection : public Chunk { public: CopyrelSection(bool is_relro) { this->name = is_relro ? ".copyrel.rel.ro" : ".copyrel"; this->is_relro = is_relro; this->shdr.sh_type = SHT_NOBITS; this->shdr.sh_flags = SHF_ALLOC | SHF_WRITE; } void add_symbol(Context &ctx, Symbol *sym); i64 get_reldyn_size(Context &ctx) const override { return symbols.size(); } void copy_buf(Context &ctx) override; std::vector *> symbols; }; // .gnu.version contains a parallel table for .dynsym to specify symbol // versions of undefined symbols. 
A symbol having an entry in .gnu.version // must be resolved to a symbol with the exact same version string at // runtime. template class VersymSection : public Chunk { public: VersymSection() { this->name = ".gnu.version"; this->shdr.sh_type = SHT_GNU_VERSYM; this->shdr.sh_flags = SHF_ALLOC; this->shdr.sh_entsize = 2; this->shdr.sh_addralign = 2; } void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; std::vector> contents; }; // .gnu.version_r contains information to refer to shared libraries and // their symbol versions. template class VerneedSection : public Chunk { public: VerneedSection() { this->name = ".gnu.version_r"; this->shdr.sh_type = SHT_GNU_VERNEED; this->shdr.sh_flags = SHF_ALLOC; this->shdr.sh_addralign = 4; } void construct(Context &ctx); void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; std::vector contents; }; // .gnu.version contains a parallel table for .dynsym to specify symbol // versions of defined symbols. This section appears only in .so files, // and it specifies the symbol version for each defined dynamic symbol. template class VerdefSection : public Chunk { public: VerdefSection() { this->name = ".gnu.version_d"; this->shdr.sh_type = SHT_GNU_VERDEF; this->shdr.sh_flags = SHF_ALLOC; this->shdr.sh_addralign = 4; } void construct(Context &ctx); void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; std::vector contents; }; // .note.gnu.build-id contains an identifier for an output ELF file. The // contents of the section is usually a cryptogrpahic hash of the output // file itself to guarantee uniqueness of build-id. 
template class BuildIdSection : public Chunk { public: BuildIdSection() { this->name = ".note.gnu.build-id"; this->shdr.sh_type = SHT_NOTE; this->shdr.sh_flags = SHF_ALLOC; this->shdr.sh_addralign = 4; this->shdr.sh_size = 1; } void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; std::vector contents; }; // .note.package is an optional hint section that can contain arbitrary // string. Package managers, such as dpkg or rpm, uses the section to // embed package metadata into each ELF file so that it is easy to find // the origin of an ELF file without any additional information. template class NotePackageSection : public Chunk { public: NotePackageSection() { this->name = ".note.package"; this->shdr.sh_type = SHT_NOTE; this->shdr.sh_flags = SHF_ALLOC; this->shdr.sh_addralign = 4; } void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; }; // .note.gnu.property section contains an additional runtime information // about ISA variant. template class NotePropertySection : public Chunk { public: NotePropertySection() { this->name = ".note.gnu.property"; this->shdr.sh_type = SHT_NOTE; this->shdr.sh_flags = SHF_ALLOC; this->shdr.sh_addralign = sizeof(Word); } void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; private: struct Entry64 { U32 type; U32 size; U32 flags; u8 padding[4]; }; struct Entry32 { U32 type; U32 size; U32 flags; }; using Entry = std::conditional_t; std::vector contents; }; // .gnu_debuglink section contains a pathname and its CRC32 checksum for a // separate debug info file. gdb can read the section to read debug info // from an external file. 
template class GnuDebuglinkSection : public Chunk { public: GnuDebuglinkSection() { this->name = ".gnu_debuglink"; this->shdr.sh_type = SHT_PROGBITS; this->shdr.sh_addralign = 4; } void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; std::string filename; u32 crc32 = 0; }; // .gdb_index contains several tables to speed up gdb start-up. template class GdbIndexSection : public Chunk { public: GdbIndexSection() { this->name = ".gdb_index"; this->shdr.sh_type = SHT_PROGBITS; this->shdr.sh_addralign = 4; } }; // Debug sections can be compressed with zlib or zstd to reduce the // overall size of an ELF file. CompressedSection represents a compressed // section. template class CompressedSection : public Chunk { public: CompressedSection(Context &ctx, Chunk &chunk); void copy_buf(Context &ctx) override; std::unique_ptr uncompressed_data; ElfChdr chdr = {}; private: std::unique_ptr compressor; }; // RelocSection represents a relocation table for an output file. // This is used only for the reproducible output (i.e. the `-r` output). template class RelocSection : public Chunk { public: RelocSection(Context &ctx, OutputSection &osec); void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; private: OutputSection &output_section; std::vector offsets; }; // PT_GNU_RELRO works on page granularity. We want to align its end to // a page boundary. We append this section at end of a segment so that // the segment always ends at a page boundary. template class RelroPaddingSection : public Chunk { public: RelroPaddingSection() { this->name = ".relro_padding"; this->is_relro = true; this->shdr.sh_type = SHT_NOBITS; this->shdr.sh_flags = SHF_ALLOC | SHF_WRITE; this->shdr.sh_addralign = 1; this->shdr.sh_size = 1; } }; // ComdatGroupSection represents a comdat group for an output file. // This is used only for the reproducible output (i.e. the `-r` output). 
template class ComdatGroupSection : public Chunk { public: ComdatGroupSection(Symbol &sym, std::vector *> members) : sym(sym), members(std::move(members)) { this->name = ".group"; this->shdr.sh_type = SHT_GROUP; this->shdr.sh_entsize = 4; this->shdr.sh_addralign = 4; this->shdr.sh_size = this->members.size() * 4 + 4; } void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; private: Symbol &sym; std::vector *> members; }; // // output-file.cc // template class OutputFile { public: static std::unique_ptr> open(Context &ctx, std::string path, i64 filesize, int perm); virtual void close(Context &ctx) = 0; virtual ~OutputFile() { free(buf2); } u8 *buf = nullptr; u8 *buf2 = nullptr; i64 buf2_size = 0; std::string path; int fd = -1; i64 filesize = 0; bool is_mmapped = false; bool is_unmapped = false; protected: OutputFile(std::string path, i64 filesize, bool is_mmapped) : path(path), filesize(filesize), is_mmapped(is_mmapped) {} }; template class MallocOutputFile : public OutputFile { public: MallocOutputFile(Context &ctx, std::string path, i64 filesize, int perm) : OutputFile(path, filesize, false), ptr(new u8[filesize]), perm(perm) { this->buf = ptr.get(); } void close(Context &ctx) override { Timer t(ctx, "close_file"); FILE *fp; if (this->path == "-") { fp = stdout; } else { #ifdef _WIN32 int pmode = (perm & 0200) ? 
(_S_IREAD | _S_IWRITE) : _S_IREAD; i64 fd = _open(this->path.c_str(), _O_RDWR | _O_CREAT | _O_BINARY, pmode); #else i64 fd = ::open(this->path.c_str(), O_RDWR | O_CREAT, perm); #endif if (fd == -1) Fatal(ctx) << "cannot open " << this->path << ": " << errno_string(); #ifdef _WIN32 fp = _fdopen(fd, "wb"); #else fp = fdopen(fd, "w"); #endif } fwrite(this->buf, this->filesize, 1, fp); if (this->buf2) fwrite(this->buf2, this->buf2_size, 1, fp); fclose(fp); } private: std::unique_ptr ptr; int perm; }; template class LockingOutputFile : public OutputFile { public: LockingOutputFile(Context &ctx, std::string path, int perm); void resize(Context &ctx, i64 filesize); void close(Context &ctx) override; }; // // gdb-index.cc // template void write_gdb_index(Context &ctx); // // input-files.cc // // A comdat section typically represents an inline function, // which are de-duplicated by the linker. // // For each inline function, there's one comdat section, which // contains section indices of the function code and its data such as // string literals, if any. // // Comdat sections are identified by its signature. If two comdat // sections have the same signature, the linker picks up one and // discards the other by eliminating all sections that the other // comdat section refers to. struct ComdatGroup { // The file priority of the owner file of this comdat section. Atomic owner = -1; }; template struct ComdatGroupRef { ComdatGroup *group; i32 sect_idx; std::span> members; }; template class MergeableSection { public: MergeableSection(Context &ctx, MergedSection &parent, std::unique_ptr> &isec); void split_contents(Context &ctx); void resolve_contents(Context &ctx); std::pair *, i64> get_fragment(i64 offset); std::string_view get_contents(i64 idx); MergedSection &parent; u8 p2align = 0; std::unique_ptr> input_section; std::vector *> fragments; private: std::vector frag_offsets; std::vector hashes; }; // InputFile is the base class of ObjectFile and SharedFile. 
template class InputFile { public: InputFile(Context &ctx, MappedFile *mf); InputFile() : filename("") {} virtual ~InputFile() = default; template std::span get_data(Context &ctx, const ElfShdr &shdr); template std::span get_data(Context &ctx, i64 idx); std::string_view get_string(Context &ctx, const ElfShdr &shdr); std::string_view get_string(Context &ctx, i64 idx); u32 get_eflags() { return ((ElfEhdr *)mf->data)->e_flags; } ElfShdr *find_section(i64 type); virtual void resolve_symbols(Context &ctx) = 0; virtual void mark_live_objects(Context &ctx, std::function *)> feeder) = 0; std::span *> get_local_syms(); std::span *> get_global_syms(); std::string_view get_source_name() const; MappedFile *mf = nullptr; std::span> elf_sections; std::span> elf_syms; std::vector *> symbols; i64 first_global = 0; std::string filename; bool is_dso = false; i64 priority; Atomic is_reachable = false; std::string_view shstrtab; std::string_view symbol_strtab; bool as_needed = false; bool has_init_array = false; bool has_ctors = false; // To create an output .symtab u64 local_symtab_idx = 0; u64 global_symtab_idx = 0; u64 num_local_symtab = 0; u64 num_global_symtab = 0; u64 strtab_offset = 0; u64 strtab_size = 0; // For --emit-relocs std::vector output_sym_indices; protected: std::vector> local_syms; std::vector> frag_syms; }; template struct ObjectFileExtras {}; template struct ObjectFileExtras { std::optional stack_align; std::optional arch; bool unaligned_access = false; }; template <> struct ObjectFileExtras { InputSection *got2 = nullptr; }; // ObjectFile represents an input .o file. 
template class ObjectFile : public InputFile { public: ObjectFile() = default; ObjectFile(Context &ctx, MappedFile *mf, std::string archive_name) : InputFile(ctx, mf), archive_name(archive_name) {} void parse(Context &ctx); void initialize_symbols(Context &ctx); void parse_ehframe(Context &ctx); void convert_mergeable_sections(Context &ctx); void reattach_section_pieces(Context &ctx); void resolve_symbols(Context &ctx) override; void mark_live_objects(Context &ctx, std::function *)> feeder) override; void convert_undefined_weak_symbols(Context &ctx); void scan_relocations(Context &ctx); void convert_common_symbols(Context &ctx); void compute_symtab_size(Context &ctx); void populate_symtab(Context &ctx); i64 get_shndx(const ElfSym &esym); InputSection *get_section(const ElfSym &esym); std::string archive_name; std::vector>> sections; std::vector>> mergeable_sections; std::vector> elf_sections2; std::vector> cies; std::vector> fdes; std::vector has_symver; std::vector> comdat_groups; std::vector *> eh_frame_sections; std::vector>> decoded_crel; bool exclude_libs = false; std::map gnu_properties; bool needs_executable_stack = false; bool is_lto_obj = false; bool is_gcc_offload_obj = false; bool is_rust_obj = false; bool is_dwarf32 = false; i64 fde_idx = 0; i64 fde_offset = 0; i64 fde_size = 0; // For ICF std::unique_ptr> llvm_addrsig; // For .gdb_index InputSection *debug_info = nullptr; InputSection *debug_pubnames = nullptr; InputSection *debug_pubtypes = nullptr; // For LTO std::vector> lto_elf_syms; std::vector lto_comdat_groups; private: void initialize_sections(Context &ctx); void sort_relocations(Context &ctx); void initialize_ehframe_sections(Context &ctx); void parse_note_gnu_property(Context &ctx, const ElfShdr &shdr); void override_symbol(Context &ctx, Symbol &sym, const ElfSym &esym, i64 symidx); void merge_visibility(Context &ctx, Symbol &sym, u8 visibility); bool has_common_symbol = false; const ElfShdr *symtab_sec; std::span> symtab_shndx_sec; public: 
// Target-specific member [[no_unique_address]] ObjectFileExtras extra; }; // SharedFile represents an input .so file. template class SharedFile : public InputFile { public: SharedFile(Context &ctx, MappedFile *mf) : InputFile(ctx, mf) {} void parse(Context &ctx); void resolve_symbols(Context &ctx) override; std::span *> get_symbols_at(Symbol *sym); i64 get_alignment(Symbol *sym); std::vector get_dt_needed(Context &ctx); std::string_view get_dt_audit(Context &ctx); bool is_readonly(Symbol *sym); void mark_live_objects(Context &ctx, std::function *)> feeder) override; void compute_symtab_size(Context &ctx); void populate_symtab(Context &ctx); std::string soname; std::vector version_strings; std::vector *> symbols2; std::vector> elf_syms2; private: std::string get_soname(Context &ctx); void maybe_override_symbol(Symbol &sym, const ElfSym &esym); std::vector read_dt_needed(Context &ctx); std::vector read_verdef(Context &ctx); std::vector versyms; const ElfShdr *symtab_sec; // Used by get_symbols_at() std::once_flag init_sorted_syms; std::vector *> sorted_syms; }; // // linker-script.cc // struct ReaderContext { bool as_needed = false; bool in_lib = false; bool static_ = false; bool whole_archive = false; tbb::task_group *tg = nullptr; }; struct DynamicPattern { std::string_view pattern; std::string_view source; bool is_cpp = false; }; template class Script { public: Script(Context &ctx, ReaderContext &rctx, MappedFile *mf) : ctx(ctx), rctx(rctx), mf(mf) {} std::string_view get_script_output_type(); void parse_linker_script(); void parse_version_script(); std::vector parse_dynamic_list(); private: [[noreturn]] void error(std::string_view pos, std::string msg); void tokenize(); std::span skip(std::span tok, std::string_view str); std::span read_output_format(std::span tok); std::span read_group(std::span tok); std::span read_version_script_commands(std::span tok, std::string_view ver_str, u16 ver_idx, bool is_global, bool is_cpp); std::span read_version_script(std::span 
tok); MappedFile *resolve_path(std::string_view tok, bool check_target); std::span read_dynamic_list_commands(std::span tok, std::vector &result, bool is_cpp); Context &ctx; ReaderContext &rctx; MappedFile *mf = mf; std::once_flag once; std::vector tokens; }; template std::vector parse_dynamic_list(Context &ctx, std::string_view path); // // archive-file.cc // template std::vector read_thin_archive_members(Context &ctx, MappedFile *mf); template std::vector read_fat_archive_members(Context &ctx, MappedFile *mf); template std::vector read_archive_members(Context &ctx, MappedFile *mf); // // lto.cc // template ObjectFile *read_lto_object(Context &ctx, MappedFile *mb); template std::vector *> run_lto_plugin(Context &ctx); template void lto_cleanup(Context &ctx); // // shrink-sections.cc // inline i64 get_removed_bytes(std::span deltas, i64 i) { if (i == 0) return deltas[i].delta; return deltas[i].delta - deltas[i - 1].delta; } template void shrink_sections(Context &ctx); template void shrink_section(Context &ctx, InputSection &isec); template i64 get_r_delta(InputSection &isec, u64 offset); template i64 compute_distance(Context &ctx, Symbol &sym, InputSection &isec, const ElfRel &rel); // // gc-sections.cc // template void gc_sections(Context &ctx); // // icf.cc // template void icf_sections(Context &ctx); // // relocatable.cc // template void combine_objects(Context &ctx); // // mapfile.cc // template void print_map(Context &ctx); // // subprocess.cc // template void fork_child(); template void notify_parent(); template [[noreturn]] void process_run_subcommand(Context &ctx, int argc, char **argv); // // cmdline.cc // template std::vector expand_response_files(Context &ctx, char **argv); template std::vector parse_nonpositional_args(Context &ctx); // // passes.cc // template int redo_main(std::string_view, int argc, char **argv); template void create_internal_file(Context &); template void apply_exclude_libs(Context &); template void create_synthetic_sections(Context 
&); template void set_file_priority(Context &); template void resolve_symbols(Context &); template void do_lto(Context &); template void parse_eh_frame_sections(Context &); template void create_merged_sections(Context &); template void convert_common_symbols(Context &); template void create_output_sections(Context &); template void add_synthetic_symbols(Context &); template void apply_section_align(Context &); template void check_cet_errors(Context &); template void print_dependencies(Context &); template void write_repro_file(Context &); template void check_duplicate_symbols(Context &); template void convert_zero_to_bss(Context &); template void check_shlib_undefined(Context &); template void check_symbol_types(Context &); template void sort_init_fini(Context &); template void sort_ctor_dtor(Context &); template void fixup_ctors_in_init_array(Context &); template void shuffle_sections(Context &); template void add_dynamic_strings(Context &); template void compute_section_sizes(Context &); template void sort_output_sections(Context &); template void claim_unresolved_symbols(Context &); template void scan_relocations(Context &); template void compute_imported_symbol_weakness(Context &); template void construct_relr(Context &); template void sort_dynsyms(Context &); template void sort_debug_info_sections(Context &); template void create_output_symtab(Context &); template void report_undef_errors(Context &); template void create_reloc_sections(Context &); template void copy_chunks(Context &); template void apply_version_script(Context &); template void parse_symbol_version(Context &); template void compute_import_export(Context &); template void compute_address_significance(Context &); template void separate_debug_sections(Context &); template void compute_section_headers(Context &); template i64 set_osec_offsets(Context &); template void fix_synthetic_symbols(Context &); template void compress_debug_sections(Context &); template void sort_reldyn(Context &); template 
void write_build_id(Context &); template void write_gnu_debuglink(Context &); template void write_separate_debug_file(Context &ctx); template void write_dependency_file(Context &); template void show_stats(Context &); // // arch-x86-64.cc // void rewrite_endbr(Context &ctx); // // arch-arm32.cc // template class Arm32ExidxSection : public Chunk { public: Arm32ExidxSection(OutputSection &osec) : output_section(osec) { this->name = ".ARM.exidx"; this->shdr.sh_type = SHT_ARM_EXIDX; this->shdr.sh_flags = SHF_ALLOC; this->shdr.sh_addralign = 4; } void compute_section_size(Context &ctx) override; void update_shdr(Context &ctx) override; void remove_duplicate_entries(Context &ctx); void copy_buf(Context &ctx) override; private: std::vector get_contents(Context &ctx); OutputSection &output_section; }; template u64 get_eflags(Context &ctx); template void create_arm_exidx_section(Context &ctx); void arm32be_swap_bytes(Context &ctx); // // arch-riscv.cc // template class RiscvAttributesSection : public Chunk { public: RiscvAttributesSection() { this->name = ".riscv.attributes"; this->shdr.sh_type = SHT_RISCV_ATTRIBUTES; } void update_shdr(Context &ctx) override; void copy_buf(Context &ctx) override; std::vector contents; }; template u64 get_eflags(Context &ctx); // // arch-ppc64v1.cc // void ppc64v1_rewrite_opd(Context &ctx); void ppc64v1_scan_symbols(Context &ctx); class PPC64OpdSection : public Chunk { public: PPC64OpdSection() { this->name = ".opd"; this->shdr.sh_type = SHT_PROGBITS; this->shdr.sh_flags = SHF_ALLOC | SHF_WRITE; this->shdr.sh_addralign = 8; } void add_symbol(Context &ctx, Symbol *sym); i64 get_reldyn_size(Context &ctx) const override; void copy_buf(Context &ctx) override; static constexpr i64 ENTRY_SIZE = sizeof(Word) * 3; std::vector *> symbols; }; // // arch-ppc64v2.cc // extern const std::vector> ppc64_save_restore_insns; class PPC64SaveRestoreSection : public Chunk { public: PPC64SaveRestoreSection() { this->name = ".save_restore_gprs"; 
this->shdr.sh_type = SHT_PROGBITS; this->shdr.sh_flags = SHF_ALLOC | SHF_EXECINSTR; this->shdr.sh_addralign = 16; this->shdr.sh_size = ppc64_save_restore_insns.size() * 4; } void copy_buf(Context &ctx) override; }; template <> u64 get_eflags(Context &ctx); // // main.cc // struct BuildId { i64 size() const { switch (kind) { case HEX: return value.size(); case HASH: return hash_size; case UUID: return 16; default: unreachable(); } } enum { NONE, HEX, HASH, UUID } kind = NONE; std::vector value; i64 hash_size = 0; }; typedef enum { UNRESOLVED_ERROR, UNRESOLVED_WARN, UNRESOLVED_IGNORE, } UnresolvedKind; typedef enum { BSYMBOLIC_NONE, BSYMBOLIC_ALL, BSYMBOLIC_FUNCTIONS, BSYMBOLIC_NON_WEAK, BSYMBOLIC_NON_WEAK_FUNCTIONS, } BsymbolicKind; typedef enum { SEPARATE_LOADABLE_SEGMENTS, SEPARATE_CODE, NOSEPARATE_CODE, } SeparateCodeKind; typedef enum { CET_REPORT_NONE, CET_REPORT_WARNING, CET_REPORT_ERROR, } CetReportKind; typedef enum { SHUFFLE_SECTIONS_NONE, SHUFFLE_SECTIONS_SHUFFLE, SHUFFLE_SECTIONS_REVERSE, } ShuffleSectionsKind; struct VersionPattern { std::string_view pattern; std::string_view source; std::string_view ver_str; i64 ver_idx = -1; bool is_cpp = false; }; struct SectionOrder { enum { NONE, SECTION, GROUP, ADDR, ALIGN, SYMBOL } type = NONE; std::string name; u64 value = 0; std::string_view token; // for error reporting }; // Target-specific context members template struct ContextExtras {}; template struct ContextExtras { NotePropertySection *note_property = nullptr; }; template struct ContextExtras { Arm32ExidxSection *exidx = nullptr; }; template struct ContextExtras { RiscvAttributesSection *riscv_attributes = nullptr; }; template <> struct ContextExtras { Symbol *_SDA_BASE_ = nullptr; }; template <> struct ContextExtras { PPC64OpdSection *opd = nullptr; Symbol *TOC = nullptr; }; template <> struct ContextExtras { PPC64SaveRestoreSection *save_restore = nullptr; Symbol *TOC = nullptr; Atomic is_power10 = false; }; template <> struct ContextExtras { Symbol 
*tls_get_addr = nullptr; }; // Context represents a context object for each invocation of the linker. // It contains command line flags, pointers to singleton objects // (such as linker-synthesized output sections), unique_ptrs for // resource management, and other miscellaneous objects. template struct Context { Context() { arg.entry = get_symbol(*this, "_start"); arg.fini = get_symbol(*this, "_fini"); arg.init = get_symbol(*this, "_init"); if constexpr (is_sparc) extra.tls_get_addr = get_symbol(*this, "__tls_get_addr"); } Context(const Context &) = delete; void checkpoint() { if (has_error) { cleanup(); _exit(1); } } // Command-line arguments struct { BsymbolicKind Bsymbolic = BSYMBOLIC_NONE; BuildId build_id; CetReportKind z_cet_report = CET_REPORT_NONE; Glob undefined_glob; Glob unique; SeparateCodeKind z_separate_code = NOSEPARATE_CODE; ShuffleSectionsKind shuffle_sections = SHUFFLE_SECTIONS_NONE; Symbol *entry = nullptr; Symbol *fini = nullptr; Symbol *init = nullptr; UnresolvedKind unresolved_symbols = UNRESOLVED_IGNORE; bool allow_multiple_definition = false; bool allow_shlib_undefined = true; bool apply_dynamic_relocs = true; bool be8 = false; bool color_diagnostics = false; bool default_symver = false; bool demangle = true; bool detach = true; bool discard_all = false; bool discard_locals = false; bool dynamic_list_data = false; bool eh_frame_hdr = true; bool emit_relocs = false; bool enable_new_dtags = true; bool execute_only = false; bool export_dynamic = false; bool fatal_warnings = false; bool fork = true; bool gc_sections = false; bool gdb_index = false; bool hash_style_gnu = true; bool hash_style_sysv = true; bool icf = false; bool icf_all = false; bool ignore_data_address_equality = false; bool lto_pass2 = false; bool nmagic = false; bool noinhibit_exec = false; bool oformat_binary = false; bool omagic = false; bool pack_dyn_relocs_relr = false; bool perf = false; bool pic = false; bool pie = false; bool print_dependencies = false; bool print_map = 
false; bool quick_exit = true; bool relax = true; bool relocatable = false; bool relocatable_merge_sections = false; bool repro = false; bool rosegment = true; bool shared = false; bool start_stop = false; bool static_ = false; bool stats = false; bool strip_all = false; bool strip_debug = false; bool suppress_warnings = false; bool trace = false; bool undefined_version = false; bool warn_common = false; bool warn_once = false; bool warn_textrel = false; bool z_copyreloc = true; bool z_delete = true; bool z_dlopen = true; bool z_dump = true; bool z_dynamic_undefined_weak = true; bool z_execstack = false; bool z_execstack_if_needed = false; bool z_ibt = false; bool z_initfirst = false; bool z_interpose = false; bool z_keep_text_section_prefix = false; bool z_nodefaultlib = false; bool z_now = false; bool z_origin = false; bool z_relro = true; bool z_rewrite_endbr = false; bool z_rodynamic = false; bool z_sectionheader = true; bool z_shstk = false; bool z_start_stop_visibility_protected = false; bool z_text = false; bool zero_to_bss = false; i64 compress_debug_sections = ELFCOMPRESS_NONE; i64 filler = -1; i64 spare_dynamic_tags = 5; i64 spare_program_headers = 0; i64 z_stack_size = 0; std::optional thread_count; std::optional *>> retain_symbols_file; std::optional physical_image_base; std::string Map; std::string audit; std::string chroot; std::string depaudit; std::string dependency_file; std::string directory; std::string dynamic_linker; std::string output = "a.out"; std::string package_metadata; std::string plugin; std::string print_gc_sections; std::string print_gc_sections_file; std::string print_icf_sections; std::string print_icf_sections_file; std::string rpaths; std::string separate_debug_file; std::string soname; std::string sysroot; std::string_view emulation; std::unordered_map section_align; std::unordered_map section_start; std::unordered_set discard_section; std::unordered_set exclude_libs; std::unordered_set ignore_ir_file; std::unordered_set wrap; 
std::vector section_order; std::vector *> require_defined; std::vector *> undefined; std::vector *, std::variant *, u64>>> defsyms; std::vector library_paths; std::vector plugin_opt; std::vector version_definitions; std::vector auxiliary; std::vector filter; std::vector trace_symbol; u32 z_x86_64_isa_level = 0; u64 image_base = 0x200000; u64 shuffle_sections_seed = 0; } arg; std::optional global_limit; std::vector version_patterns; std::vector dynamic_list_patterns; i64 default_version = VER_NDX_UNSPECIFIED; i64 page_size = E::page_size; bool has_error = false; // Reader context i64 file_priority = 10000; // Symbol table tbb::concurrent_hash_map, HashCmp> symbol_map; tbb::concurrent_hash_map comdat_groups; tbb::concurrent_vector>> merged_sections; tbb::concurrent_vector> timer_records; tbb::concurrent_vector> on_exit; tbb::concurrent_vector>> obj_pool; tbb::concurrent_vector>> dso_pool; tbb::concurrent_vector> string_pool; tbb::concurrent_vector> mf_pool; tbb::concurrent_vector>> chunk_pool; tbb::concurrent_vector>> osec_pool; // Symbol auxiliary data std::vector> symbol_aux; // Fully-expanded command line args std::vector cmdline_args; // Input files std::vector *> objs; std::vector *> dsos; ObjectFile *internal_obj = nullptr; std::vector> internal_esyms; // Output buffer std::unique_ptr> output_file; u8 *buf = nullptr; bool overwrite_output_file = false; std::vector *> chunks; Atomic needs_tlsld = false; Atomic has_textrel = false; Atomic num_ifunc_dynrels = 0; tbb::concurrent_hash_map *, std::vector> undef_errors; // For --separate-debug-file std::vector *> debug_chunks; // Output chunks OutputEhdr *ehdr = nullptr; OutputShdr *shdr = nullptr; OutputPhdr *phdr = nullptr; InterpSection *interp = nullptr; GotSection *got = nullptr; GotPltSection *gotplt = nullptr; RelPltSection *relplt = nullptr; RelDynSection *reldyn = nullptr; RelrDynSection *relrdyn = nullptr; DynamicSection *dynamic = nullptr; StrtabSection *strtab = nullptr; DynstrSection *dynstr = nullptr; 
HashSection *hash = nullptr; GnuHashSection *gnu_hash = nullptr; GnuDebuglinkSection *gnu_debuglink = nullptr; ShstrtabSection *shstrtab = nullptr; PltSection *plt = nullptr; PltGotSection *pltgot = nullptr; SymtabSection *symtab = nullptr; SymtabShndxSection *symtab_shndx = nullptr; DynsymSection *dynsym = nullptr; EhFrameSection *eh_frame = nullptr; EhFrameHdrSection *eh_frame_hdr = nullptr; EhFrameRelocSection *eh_frame_reloc = nullptr; CopyrelSection *copyrel = nullptr; CopyrelSection *copyrel_relro = nullptr; VersymSection *versym = nullptr; VerneedSection *verneed = nullptr; VerdefSection *verdef = nullptr; BuildIdSection *buildid = nullptr; NotePackageSection *note_package = nullptr; GdbIndexSection *gdb_index = nullptr; RelroPaddingSection *relro_padding = nullptr; MergedSection *comment = nullptr; // For --gdb-index std::span debug_info; std::span debug_abbrev; std::span debug_ranges; std::span debug_addr; std::span debug_rnglists; // For thread-local variables u64 tls_begin = 0; u64 tp_addr = 0; u64 dtp_addr = 0; // Linker-synthesized symbols Symbol *_DYNAMIC = nullptr; Symbol *_GLOBAL_OFFSET_TABLE_ = nullptr; Symbol *_PROCEDURE_LINKAGE_TABLE_ = nullptr; Symbol *_TLS_MODULE_BASE_ = nullptr; Symbol *__GNU_EH_FRAME_HDR = nullptr; Symbol *__bss_start = nullptr; Symbol *__dso_handle = nullptr; Symbol *__ehdr_start = nullptr; Symbol *__executable_start = nullptr; Symbol *__exidx_end = nullptr; Symbol *__exidx_start = nullptr; Symbol *__fini_array_end = nullptr; Symbol *__fini_array_start = nullptr; Symbol *__global_pointer = nullptr; Symbol *__init_array_end = nullptr; Symbol *__init_array_start = nullptr; Symbol *__preinit_array_end = nullptr; Symbol *__preinit_array_start = nullptr; Symbol *__rel_iplt_end = nullptr; Symbol *__rel_iplt_start = nullptr; Symbol *_edata = nullptr; Symbol *_end = nullptr; Symbol *_etext = nullptr; Symbol *edata = nullptr; Symbol *end = nullptr; Symbol *etext = nullptr; [[no_unique_address]] ContextExtras extra; }; template 
std::string_view get_machine_type(Context &ctx, ReaderContext &rctx, MappedFile *mf); template MappedFile *open_library(Context &ctx, ReaderContext &rctx, std::string path); template MappedFile *find_library(Context &ctx, ReaderContext &rctx, std::string path); template void read_file(Context &ctx, ReaderContext &rctx, MappedFile *mf); template int mold_main(int argc, char **argv); template std::ostream &operator<<(std::ostream &out, const InputFile &file); // // Symbol // enum { NEEDS_GOT = 1 << 0, NEEDS_PLT = 1 << 1, NEEDS_CPLT = 1 << 2, NEEDS_GOTTP = 1 << 3, NEEDS_TLSGD = 1 << 4, NEEDS_COPYREL = 1 << 5, NEEDS_TLSDESC = 1 << 6, NEEDS_PPC_OPD = 1 << 7, // for PPCv1 }; // Flags for Symbol::get_addr() enum { NO_PLT = 1 << 0, // Request an address other than .plt NO_OPD = 1 << 1, // Request an address other than .opd (PPC64V1 only) }; // Symbol class represents a symbol. For each unique symbol name, we // create one instance of Symbol. // // A symbol has not only one but several different addresses if it // has PLT or GOT entries. This class provides various functions to // compute different addresses. 
template class Symbol { public: Symbol() = default; Symbol(std::string_view name, bool demangle) : nameptr(name.data()), namelen(name.size()), demangle(demangle) {} Symbol(const Symbol &other) : Symbol(other.name(), other.demangle) {} u64 get_addr(Context &ctx, i64 flags = 0) const; u64 get_got_addr(Context &ctx) const; u64 get_gotplt_addr(Context &ctx) const; u64 get_gottp_addr(Context &ctx) const; u64 get_tlsgd_addr(Context &ctx) const; u64 get_tlsdesc_addr(Context &ctx) const; u64 get_plt_addr(Context &ctx) const; u64 get_opd_addr(Context &ctx) const; u64 get_got_pltgot_addr(Context &ctx) const; void set_got_idx(Context &ctx, i32 idx); void set_gottp_idx(Context &ctx, i32 idx); void set_tlsgd_idx(Context &ctx, i32 idx); void set_tlsdesc_idx(Context &ctx, i32 idx); void set_plt_idx(Context &ctx, i32 idx); void set_pltgot_idx(Context &ctx, i32 idx); void set_opd_idx(Context &ctx, i32 idx); void set_dynsym_idx(Context &ctx, i32 idx); i32 get_got_idx(Context &ctx) const; i32 get_gottp_idx(Context &ctx) const; i32 get_tlsgd_idx(Context &ctx) const; i32 get_tlsdesc_idx(Context &ctx) const; i32 get_plt_idx(Context &ctx) const; i32 get_pltgot_idx(Context &ctx) const; i32 get_opd_idx(Context &ctx) const; i32 get_dynsym_idx(Context &ctx) const; bool has_plt(Context &ctx) const; bool has_got(Context &ctx) const { return get_got_idx(ctx) != -1; } bool has_gottp(Context &ctx) const { return get_gottp_idx(ctx) != -1; } bool has_tlsgd(Context &ctx) const { return get_tlsgd_idx(ctx) != -1; } bool has_tlsdesc(Context &ctx) const { return get_tlsdesc_idx(ctx) != -1; } bool has_opd(Context &ctx) const { return get_opd_idx(ctx) != -1; } u32 get_djb_hash(Context &ctx) const; void set_djb_hash(Context &ctx, u32 hash); void add_thunk_addr(Context &ctx, u64 addr) requires needs_thunk; u64 get_thunk_addr(Context &ctx, u64 P) const requires needs_thunk; bool is_absolute() const; bool is_relative() const { return !is_absolute(); } bool is_local(Context &ctx) const; bool is_ifunc() const { 
return get_type() == STT_GNU_IFUNC; } bool is_pde_ifunc(Context &ctx) const; bool is_remaining_undef_weak() const; bool is_pcrel_linktime_const(Context &ctx) const; bool is_tprel_linktime_const(Context &ctx) const; bool is_tprel_runtime_const(Context &ctx) const; InputSection *get_input_section() const; Chunk *get_output_section() const; SectionFragment *get_frag() const; void set_input_section(InputSection *); void set_output_section(Chunk *); void set_frag(SectionFragment *); void set_name(std::string_view); std::string_view name() const; u32 get_type() const; std::string_view get_version() const; i64 get_output_sym_idx(Context &ctx) const; const ElfSym &esym() const; void add_aux(Context &ctx); // A symbol is owned by a file. If two or more files define the // same symbol, the one with the strongest definition owns the symbol. // If `file` is null, the symbol is not defined by any input file. InputFile *file = nullptr; // A symbol usually belongs to an input section, but it can belong // to a section fragment, an output section or nothing // (i.e. absolute symbol). `origin` holds one of them. We use the // least significant two bits to distinguish type. enum : uintptr_t { TAG_ABS = 0b00, TAG_ISEC = 0b01, TAG_OSEC = 0b10, TAG_FRAG = 0b11, TAG_MASK = 0b11, }; // We want to make sure there are enough number of unused bits in // pointers referring to these structures. In particular, we need // __attribute__((aligned(4))) for m68k on which int, long, float // and double are aligned only to two byte boundaries. static_assert(alignof(InputSection) >= 4); static_assert(alignof(Chunk) >= 4); static_assert(alignof(SectionFragment) >= 4); uintptr_t origin = 0; // `value` contains symbol value. If it's an absolute symbol, it is // equivalent to its address. If it belongs to an input section or a // section fragment, value is added to the base of the input section // to yield an address. 
u64 value = 0; const char *nameptr = nullptr; i32 namelen = 0; // Index into the symbol table of the owner file. i32 sym_idx = -1; i32 aux_idx = -1; u16 ver_idx = VER_NDX_UNSPECIFIED; // `flags` has NEEDS_ flags. Atomic flags = 0; tbb::spin_mutex mu; Atomic visibility = STV_DEFAULT; bool is_weak : 1 = false; bool write_to_symtab : 1 = false; // for --strip-all and the like bool is_traced : 1 = false; // for --trace-symbol bool is_wrapped : 1 = false; // for --wrap // For symbols with default symbol version, e.g. foo@@VERSION. bool is_versioned_default : 1 = false; // If a symbol can be resolved to a symbol in a different ELF file at // runtime, `is_imported` is true. If a symbol is a dynamic symbol and // can be used by other ELF file at runtime, `is_exported` is true. // // Note that both can be true at the same time. Such symbol represents // a function or data exported from this ELF file which can be // imported by other definition at runtime. That is actually a usual // exported symbol when creating a DSO. In other words, a dynamic // symbol exported by a DSO is usually imported by itself. // // If is_imported is true and is_exported is false, it is a dynamic // symbol just imported from other DSO. // // If is_imported is false and is_exported is true, there are two // possible cases. If we are creating an executable, we know that // exported symbols cannot be intercepted by any DSO (because the // dynamic loader searches a dynamic symbol from an executable before // examining any DSOs), so any exported symbol is export-only in an // executable. If we are creating a DSO, export-only symbols // represent a protected symbol (i.e. a symbol whose visibility is // STV_PROTECTED). bool is_imported : 1 = false; bool is_exported : 1 = false; // `is_canonical` is true if this symbol represents a "canonical" PLT. // Here is the explanation as to what the canonical PLT is. // // In C/C++, the process-wide function pointer equality is guaranteed. 
// That is, if you take an address of a function `foo`, it's always // evaluated to the same address wherever you do that. // // For the sake of explanation, assume that `libx.so` exports a // function symbol `foo`, and there's a program that uses `libx.so`. // Both `libx.so` and the main executable take the address of `foo`, // which must be evaluated to the same address because of the above // guarantee. // // If the main executable is position-independent code (PIC), `foo` is // evaluated to the beginning of the function code, as you would have // expected. The address of `foo` is stored to GOTs, and the machine // code that takes the address of `foo` reads the GOT entries at // runtime. // // However, if it's not PIC, the main executable's code was compiled // to not use GOT (note that shared objects are always PIC, only // executables can be non-PIC). It instead assumes that `foo` (and any // other global variables/functions) has an address that is fixed at // link-time. This assumption is correct if `foo` is in the same // position-dependent executable, but it's not if `foo` is imported // from some other DSO at runtime. // // In this case, we use the address of the `foo`'s PLT entry in the // main executable (whose address is fixed at link-time) as its // address. In order to guarantee pointer equality, we also need to // fill foo's GOT entries in DSOs with the addres of the foo's PLT // entry instead of `foo`'s real address. We can do that by setting a // symbol value to `foo`'s dynamic symbol. If a symbol value is set, // the dynamic loader initialize `foo`'s GOT entries with that value // instead of the symbol's real address. // // We call such PLT entry in the main executable as "canonical". // If `foo` has a canonical PLT, its address is evaluated to its // canonical PLT's address. Otherwise, it's evaluated to `foo`'s // address. // // Only non-PIC main executables may have canonical PLTs. 
PIC // executables and shared objects never have a canonical PLT. // // This bit manages if we need to make this symbol's PLT canonical. // This bit is meaningful only when the symbol has a PLT entry. bool is_canonical : 1 = false; // If an input object file is not compiled with -fPIC (or with // -fno-PIC), the file not position independent. That means the // machine code included in the object file does not use GOT to access // global variables. Instead, it assumes that addresses of global // variables are known at link-time. // // Let's say `libx.so` exports a global variable `foo`, and a main // executable uses the variable. If the executable is not compiled // with -fPIC, we can't simply apply a relocation that refers `foo` // because `foo`'s address is not known at link-time. // // In this case, we could print out the "recompile with -fPIC" error // message, but there's a way to workaround. // // The loader supports a feature so-called "copy relocations". // A copy relocation instructs the loader to copy data from a DSO to a // specified location in the main executable. By using this feature, // we can copy `foo`'s data to a BSS region at runtime. With that, // we can apply relocations agianst `foo` as if `foo` existed in the // main executable's BSS area, whose address is known at link-time. // // Copy relocations are used only by position-dependent executables. // Position-independent executables and DSOs don't need them because // they use GOT to access global variables. // // `has_copyrel` is true if we need to emit a copy relocation for this // symbol. If the original symbol in a DSO is in a read-only memory // region, `is_copyrel_readonly` is set to true so that the copied data // will become read-only at run-time. bool has_copyrel : 1 = false; bool is_copyrel_readonly : 1 = false; // For symbol resolution. This flag is used rarely. See a comment in // resolve_symbols(). 
bool skip_dso : 1 = false;

// For --gc-sections
bool gc_root : 1 = false;

// For LTO. True if the symbol is referenced by a regular object (as
// opposed to IR object).
bool referenced_by_regular_obj : 1 = false;

// If true, we try to demangle the symbol when printing.
bool demangle : 1 = false;
};

// The following are declarations only; their definitions are not in
// this part of the file.
template
Symbol *get_symbol(Context &ctx, std::string_view key,
                   std::string_view name);

template
Symbol *get_symbol(Context &ctx, std::string_view name);

template
ComdatGroup *insert_comdat_group(Context &ctx, std::string_view name);

template
std::string_view demangle(const Symbol &sym);

template
std::ostream &operator<<(std::ostream &out, const Symbol &sym);

//
// Inline objects and functions
//

// Prints an input section as `<file>:(<section name>)`.
template
inline std::ostream &
operator<<(std::ostream &out, const InputSection &isec) {
  out << isec.file << ":(" << isec.name() << ")";
  return out;
}

// Marks this section as dead. exchange() returns the previous value of
// is_alive, so the FDEs returned by get_fdes() are marked dead only on
// the call that actually flips the flag from true to false.
template
inline void InputSection::kill() {
  if (is_alive.exchange(false))
    for (FdeRecord &fde : get_fdes())
      fde.is_alive = false;
}

// Returns the address of this section: the address of its output
// section plus this section's offset within it.
template
inline u64 InputSection::get_addr() const {
  return output_section->shdr.sh_addr + offset;
}

// Returns the section name. If shndx is not a valid index into
// file.elf_sections, a fixed name is returned instead: ".tls_common"
// if the section header has SHF_TLS, ".common" otherwise. Otherwise
// the name is read from the file's section header string table.
template
inline std::string_view InputSection::name() const {
  if (file.elf_sections.size() <= shndx)
    return (shdr().sh_flags & SHF_TLS) ? ".tls_common" : ".common";
  return file.shstrtab.data() + file.elf_sections[shndx].sh_name;
}

// Returns a sort key that has the file's priority in the upper 32 bits
// and the section index in the lower bits.
template
inline i64 InputSection::get_priority() const {
  return ((i64)file.priority << 32) | shndx;
}

// Returns the addend of a relocation applied at `loc`. This is only a
// declaration.
template
i64 get_addend(u8 *loc, const ElfRel &rel);

// The variant selected by the requires-clause below simply returns the
// addend stored in the relocation record; `loc` is not read.
template requires (E::is_rela && !is_sh4)
inline i64 get_addend(u8 *loc, const ElfRel &rel) {
  return rel.r_addend;
}

// Convenience wrapper that computes `loc` as the relocated place
// (r_offset) within the section contents.
template
i64 get_addend(InputSection &isec, const ElfRel &rel) {
  return get_addend((u8 *)isec.contents.data() + rel.r_offset, rel);
}

// Writes an addend to `loc`. This is only a declaration.
template
void write_addend(u8 *loc, i64 val, const ElfRel &rel);

// The variant selected by the requires-clause below does nothing.
template requires (E::is_rela && !is_sh4)
void write_addend(u8 *loc, i64 val, const ElfRel &rel) {}

// Returns the section header for this section. Indices below
// file.elf_sections.size() refer to file.elf_sections; larger indices
// refer to file.elf_sections2, rebased to start at zero.
template
inline ElfShdr &InputSection::shdr() const {
  if (shndx < file.elf_sections.size())
    return file.elf_sections[shndx];
  return file.elf_sections2[shndx - file.elf_sections.size()];
}

// Returns the relocations for this section. The result is empty if
// there is no relocation section (relsec_idx == -1). For SHT_CREL the
// already-decoded records in file.decoded_crel are returned; otherwise
// the relocation section contents are viewed as an array.
template
inline std::span> InputSection::get_rels(Context &ctx) const {
  if (relsec_idx == -1)
    return {};

  ElfShdr &shdr = file.elf_sections[relsec_idx];
  if (shdr.sh_type == SHT_CREL)
    return file.decoded_crel[relsec_idx];
  return file.template get_data>(ctx, shdr);
}

// Returns the [fde_begin, fde_end) slice of file.fdes, or an empty
// span if fde_begin is -1.
template
inline std::span> InputSection::get_fdes() const {
  if (fde_begin == -1)
    return {};
  std::span> span(file.fdes);
  return span.subspan(fde_begin, fde_end - fde_begin);
}

// Resolves a relocation to a (section fragment, offset) pair.
// Returns {nullptr, 0} if the referenced symbol is absolute, common
// or undefined, or if its section has no entry in
// file.mergeable_sections. This function asserts that the section
// containing the relocation is not SHF_ALLOC.
template
std::pair *, i64>
InputSection::get_fragment(Context &ctx, const ElfRel &rel) {
  assert(!(shdr().sh_flags & SHF_ALLOC));

  const ElfSym &esym = file.elf_syms[rel.r_sym];
  if (esym.is_abs() || esym.is_common() || esym.is_undef())
    return {nullptr, 0};

  i64 shndx = file.get_shndx(esym);
  std::unique_ptr> &m = file.mergeable_sections[shndx];
  if (!m)
    return {nullptr, 0};

  // For a section symbol, the addend is part of the offset used to
  // look up the fragment. For other symbols, the fragment is found
  // from the symbol value alone and the addend is added afterwards.
  if (esym.st_type == STT_SECTION)
    return m->get_fragment(esym.st_value + get_addend(*this, rel));

  std::pair *, i64> p = m->get_fragment(esym.st_value);
  return {p.first, p.second + get_addend(*this, rel)};
}

// Input object files may contain duplicate code for inline functions
// and such. Linkers de-duplicate them at link-time.
However, linkers // generaly don't remove debug info for de-duplicated functions because // doing that requires parsing the entire debug section. // // Instead, linkers write "tombstone" values to dead debug info records // instead of bogus values so that debuggers can skip them. // // This function returns a tombstone value for the symbol if the symbol // refers a dead debug info section. template inline std::optional InputSection::get_tombstone(Symbol &sym, SectionFragment *frag) { if (frag) return {}; InputSection *isec = sym.get_input_section(); // Setting a tombstone is a special feature for a dead debug section. if (!isec || isec->is_alive) return {}; std::string_view str = name(); if (!str.starts_with(".debug_")) return {}; // If the section was dead due to ICF, we don't want to emit debug // info for that section but want to set real values to .debug_line so // that users can set a breakpoint inside a merged section. if (isec->icf_removed() && str == ".debug_line") return {}; // 0 is an invalid value in most debug info sections, so we use it // as a tombstone value. .debug_loc and .debug_ranges reserve 0 as // the terminator marker, so we use 1 if that'str the case. return (str == ".debug_loc" || str == ".debug_ranges") ? 
1 : 0; } template inline bool InputSection::icf_removed() const { return this->leader && this->leader != this; } template inline void InputSection::check_range(Context &ctx, i64 i, i64 val, i64 lo, i64 hi) { if (val < lo || hi <= val) { const ElfRel &rel = get_rels(ctx)[i]; Symbol &sym = *file.symbols[rel.r_sym]; Error(ctx) << *this << ": relocation " << rel << " against " << sym << " out of range: " << val << " is not in [" << lo << ", " << hi << ")"; } } template std::pair *, i64> MergeableSection::get_fragment(i64 offset) { auto it = ranges::upper_bound(frag_offsets, offset); i64 idx = it - 1 - frag_offsets.begin(); return {fragments[idx], offset - frag_offsets[idx]}; } template std::string_view MergeableSection::get_contents(i64 i) { i64 cur = frag_offsets[i]; if (i == frag_offsets.size() - 1) return input_section->contents.substr(cur); return input_section->contents.substr(cur, frag_offsets[i + 1] - cur); } template template inline std::span InputFile::get_data(Context &ctx, const ElfShdr &shdr) { std::string_view view = this->get_string(ctx, shdr); if (view.size() % sizeof(T)) Fatal(ctx) << *this << ": corrupted section"; return {(T *)view.data(), view.size() / sizeof(T)}; } template template inline std::span InputFile::get_data(Context &ctx, i64 idx) { if (elf_sections.size() <= idx) Fatal(ctx) << *this << ": invalid section index"; return this->template get_data(elf_sections[idx]); } template inline std::string_view InputFile::get_string(Context &ctx, const ElfShdr &shdr) { u8 *begin = mf->data + shdr.sh_offset; u8 *end = begin + shdr.sh_size; if (mf->data + mf->size < end) Fatal(ctx) << *this << ": section header is out of range: " << shdr.sh_offset; return {(char *)begin, (size_t)(end - begin)}; } template inline std::string_view InputFile::get_string(Context &ctx, i64 idx) { if (elf_sections.size() <= idx) Fatal(ctx) << *this << ": invalid section index: " << idx; return this->get_string(ctx, elf_sections[idx]); } template inline std::span *> 
InputFile::get_local_syms() {
  // The first `first_global` entries of `symbols` are returned here;
  // the remaining ones are returned by get_global_syms().
  return std::span *>(this->symbols).subspan(0, this->first_global);
}

template
inline std::span *> InputFile::get_global_syms() {
  return std::span *>(this->symbols).subspan(this->first_global);
}

// Returns the section index of a given symbol. `esym` must be an
// element of this file's elf_syms (asserted below).
template
inline i64 ObjectFile::get_shndx(const ElfSym &esym) {
  assert(&this->elf_syms[0] <= &esym);
  assert(&esym <= &this->elf_syms[this->elf_syms.size() - 1]);

  // If st_shndx is SHN_XINDEX, the real index is in symtab_shndx_sec
  // at the same position as the symbol in the symbol table.
  if (esym.st_shndx == SHN_XINDEX)
    return symtab_shndx_sec[&esym - &this->elf_syms[0]];

  // Other reserved values do not refer to a section; return 0.
  if (esym.st_shndx >= SHN_LORESERVE)
    return 0;
  return esym.st_shndx;
}

// Returns the input section a symbol belongs to. The result is
// whatever `sections` holds at the symbol's section index.
template
inline InputSection *ObjectFile::get_section(const ElfSym &esym) {
  return sections[get_shndx(esym)].get();
}

// Returns the address of this symbol in the output file. `flags` may
// contain NO_OPD and/or NO_PLT to skip the corresponding indirection
// and get the address derived from the symbol's own origin instead.
//
// The checks are made in this order: section fragment, copy
// relocation, .opd entry (PPC64 ELFv1 only), PLT entry, and finally
// the input section (or the raw value if there is none).
template
u64 Symbol::get_addr(Context &ctx, i64 flags) const {
  if (SectionFragment *frag = get_frag()) {
    if (!frag->is_alive) {
      // This condition is met if a non-alloc section refers an
      // alloc section and if the referenced piece of data is
      // garbage-collected. Typically, this condition occurs if a
      // debug info section refers a string constant in .rodata.
      return 0;
    }

    return frag->get_addr(ctx) + value;
  }

  // A copied symbol lives in one of the two copyrel sections, and
  // `value` is its offset within that section.
  if (has_copyrel) {
    return is_copyrel_readonly
      ? ctx.copyrel_relro->shdr.sh_addr + value
      : ctx.copyrel->shdr.sh_addr + value;
  }

  if constexpr (is_ppc64v1)
    if (!(flags & NO_OPD) && has_opd(ctx))
      return get_opd_addr(ctx);

  if (!(flags & NO_PLT) && has_plt(ctx)) {
    assert(is_imported || is_ifunc());
    return get_plt_addr(ctx);
  }

  InputSection *isec = get_input_section();
  if (!isec)
    return value; // absolute symbol

  if (!isec->is_alive) {
    // If the section was folded into another one, use the address of
    // the section that represents it.
    if (isec->icf_removed())
      return isec->leader->get_addr() + value;

    if (isec->name() == ".eh_frame") {
      // .eh_frame contents are parsed and reconstructed by the linker,
      // so pointing to a specific location in a source .eh_frame
      // section doesn't make much sense. However, CRT files contain
      // symbols pointing to the very beginning and ending of the section.
      //
      // If LTO is enabled, GCC may add `.lto_priv.` as a symbol
      // suffix. That's why we use starts_with() instead of `==` here.
      if (name().starts_with("__EH_FRAME_BEGIN__") ||
          name().starts_with("__EH_FRAME_LIST__") ||
          name().starts_with(".eh_frame_seg") ||
          esym().st_type == STT_SECTION)
        return ctx.eh_frame->shdr.sh_addr;

      if (name().starts_with("__FRAME_END__") ||
          name().starts_with("__EH_FRAME_LIST_END__"))
        return ctx.eh_frame->shdr.sh_addr + ctx.eh_frame->shdr.sh_size;

      // ARM object files contain "$d" local symbol at the beginning
      // of data sections. Their values are not significant for .eh_frame,
      // so we just treat them as offset 0.
      if (name() == "$d" || name().starts_with("$d."))
        return ctx.eh_frame->shdr.sh_addr;

      Fatal(ctx) << "symbol referring to .eh_frame is not supported: "
                 << *this << " " << *file;
    }

    // The control can reach here if there's a relocation that refers
    // a local symbol belonging to a comdat group section. This is a
    // violation of the spec, as all relocations should use only global
    // symbols of comdat members. However, .eh_frame tends to have such
    // relocations.
    return 0;
  }

  return isec->get_addr() + value;
}

// The getters below compute the address of a table entry from the
// address of the output section and the index stored for this symbol.
// Entries in .got are addressed in units of sizeof(Word).
template
inline u64 Symbol::get_got_addr(Context &ctx) const {
  return ctx.got->shdr.sh_addr + get_got_idx(ctx) * sizeof(Word);
}

// The .got.plt entry is located after the section's header and is
// indexed by the PLT index.
template
inline u64 Symbol::get_gotplt_addr(Context &ctx) const {
  assert(get_plt_idx(ctx) != -1);
  return ctx.gotplt->shdr.sh_addr + GotPltSection::HDR_SIZE +
         get_plt_idx(ctx) * GotPltSection::ENTRY_SIZE;
}

template
inline u64 Symbol::get_gottp_addr(Context &ctx) const {
  assert(get_gottp_idx(ctx) != -1);
  return ctx.got->shdr.sh_addr + get_gottp_idx(ctx) * sizeof(Word);
}

template
inline u64 Symbol::get_tlsgd_addr(Context &ctx) const {
  assert(get_tlsgd_idx(ctx) != -1);
  return ctx.got->shdr.sh_addr + get_tlsgd_idx(ctx) * sizeof(Word);
}

template
inline u64 Symbol::get_tlsdesc_addr(Context &ctx) const {
  assert(get_tlsdesc_idx(ctx) != -1);
  return ctx.got->shdr.sh_addr + get_tlsdesc_idx(ctx) * sizeof(Word);
}

// Converts a PLT index to a byte offset from the beginning of the PLT
// section, including the PLT header.
template
inline u64 to_plt_offset(i32 pltidx) {
  if constexpr (is_ppc64v1) {
    // The PPC64 ELFv1 ABI requires PLT entries to vary in size
    // depending on their indices. For entries whose PLT index is
    // less than 32768, the entry size is 8 bytes. Other entries are
    // 12 bytes long.
    if (pltidx < 0x8000)
      return E::plt_hdr_size + pltidx * 8;
    return E::plt_hdr_size + 0x8000 * 8 + (pltidx - 0x8000) * 12;
  } else {
    return E::plt_hdr_size + pltidx * E::plt_size;
  }
}

// Returns the address of this symbol's PLT entry. If the symbol has no
// index in ctx.plt, the entry is taken from ctx.pltgot instead.
template
inline u64 Symbol::get_plt_addr(Context &ctx) const {
  if (i32 idx = get_plt_idx(ctx); idx != -1)
    return ctx.plt->shdr.sh_addr + to_plt_offset(idx);
  return ctx.pltgot->shdr.sh_addr + get_pltgot_idx(ctx) * E::pltgot_size;
}

// Returns the address of this symbol's entry in ctx.extra.opd.
template
inline u64 Symbol::get_opd_addr(Context &ctx) const {
  assert(get_opd_idx(ctx) != -1);
  return ctx.extra.opd->shdr.sh_addr +
         get_opd_idx(ctx) * PPC64OpdSection::ENTRY_SIZE;
}

template
inline u64 Symbol::get_got_pltgot_addr(Context &ctx) const {
  // An ifunc symbol occupies two consecutive GOT slots in a
  // position-dependent executable (PDE). The first slot contains the
  // symbol's PLT address, and the second slot holds the resolved
  // address. A PDE uses the ifunc symbol's PLT entry as the address
  // for the symbol, akin to a canonical PLT.
  //
  // This function returns the address that the PLT entry should use
  // to jump to the resolved address.
  //
  // Note that we don't use this function for PPC64. In PPC64, symbols
  // are always accessed through the TOC table regardless of the
  // -fno-PIE setting. We don't need canonical PLTs on the psABIs too.
if (is_pde_ifunc(ctx))
    return get_got_addr(ctx) + sizeof(Word);
  return get_got_addr(ctx);
}

// The setters below store an index into this symbol's entry in
// ctx.symbol_aux. The symbol must already have such an entry
// (aux_idx != -1). Except for set_dynsym_idx, they also assert that
// the stored value is still negative before overwriting it.
template
inline void Symbol::set_got_idx(Context &ctx, i32 idx) {
  assert(aux_idx != -1);
  assert(ctx.symbol_aux[aux_idx].got_idx < 0);
  ctx.symbol_aux[aux_idx].got_idx = idx;
}

template
inline void Symbol::set_gottp_idx(Context &ctx, i32 idx) {
  assert(aux_idx != -1);
  assert(ctx.symbol_aux[aux_idx].gottp_idx < 0);
  ctx.symbol_aux[aux_idx].gottp_idx = idx;
}

template
inline void Symbol::set_tlsgd_idx(Context &ctx, i32 idx) {
  assert(aux_idx != -1);
  assert(ctx.symbol_aux[aux_idx].tlsgd_idx < 0);
  ctx.symbol_aux[aux_idx].tlsgd_idx = idx;
}

template
inline void Symbol::set_tlsdesc_idx(Context &ctx, i32 idx) {
  assert(aux_idx != -1);
  assert(ctx.symbol_aux[aux_idx].tlsdesc_idx < 0);
  ctx.symbol_aux[aux_idx].tlsdesc_idx = idx;
}

template
inline void Symbol::set_plt_idx(Context &ctx, i32 idx) {
  assert(aux_idx != -1);
  assert(ctx.symbol_aux[aux_idx].plt_idx < 0);
  ctx.symbol_aux[aux_idx].plt_idx = idx;
}

template
inline void Symbol::set_pltgot_idx(Context &ctx, i32 idx) {
  assert(aux_idx != -1);
  assert(ctx.symbol_aux[aux_idx].pltgot_idx < 0);
  ctx.symbol_aux[aux_idx].pltgot_idx = idx;
}

template
inline void Symbol::set_opd_idx(Context &ctx, i32 idx) {
  assert(aux_idx != -1);
  assert(ctx.symbol_aux[aux_idx].opd_idx < 0);
  ctx.symbol_aux[aux_idx].opd_idx = idx;
}

// Unlike the other setters, this one does not check the previous
// value, so the index may be assigned more than once.
template
inline void Symbol::set_dynsym_idx(Context &ctx, i32 idx) {
  assert(aux_idx != -1);
  ctx.symbol_aux[aux_idx].dynsym_idx = idx;
}

// The getters below return -1 if this symbol has no entry in
// ctx.symbol_aux. Otherwise, they return the stored index as is.
template
inline i32 Symbol::get_got_idx(Context &ctx) const {
  return (aux_idx == -1) ? -1 : ctx.symbol_aux[aux_idx].got_idx;
}

template
inline i32 Symbol::get_gottp_idx(Context &ctx) const {
  return (aux_idx == -1) ? -1 : ctx.symbol_aux[aux_idx].gottp_idx;
}

template
inline i32 Symbol::get_tlsgd_idx(Context &ctx) const {
  return (aux_idx == -1) ? -1 : ctx.symbol_aux[aux_idx].tlsgd_idx;
}

template
inline i32 Symbol::get_tlsdesc_idx(Context &ctx) const {
  return (aux_idx == -1) ? -1 : ctx.symbol_aux[aux_idx].tlsdesc_idx;
}

template
inline i32 Symbol::get_plt_idx(Context &ctx) const {
  return (aux_idx == -1) ? -1 : ctx.symbol_aux[aux_idx].plt_idx;
}

template
inline i32 Symbol::get_pltgot_idx(Context &ctx) const {
  return (aux_idx == -1) ? -1 : ctx.symbol_aux[aux_idx].pltgot_idx;
}

template
inline i32 Symbol::get_opd_idx(Context &ctx) const {
  return (aux_idx == -1) ? -1 : ctx.symbol_aux[aux_idx].opd_idx;
}

template
inline i32 Symbol::get_dynsym_idx(Context &ctx) const {
  return (aux_idx == -1) ? -1 : ctx.symbol_aux[aux_idx].dynsym_idx;
}

// Accessors for the hash value stored in this symbol's aux entry.
// Both require the symbol to have an aux entry.
template
inline u32 Symbol::get_djb_hash(Context &ctx) const {
  assert(aux_idx != -1);
  return ctx.symbol_aux[aux_idx].djb_hash;
}

template
inline void Symbol::set_djb_hash(Context &ctx, u32 hash) {
  assert(aux_idx != -1);
  ctx.symbol_aux[aux_idx].djb_hash = hash;
}

// Records the address of a thunk for this symbol, creating an aux
// entry if needed. Addresses must be added in strictly ascending
// order (asserted), which keeps the vector sorted for the binary
// search in get_thunk_addr().
template
void Symbol::add_thunk_addr(Context &ctx, u64 addr) requires needs_thunk {
  add_aux(ctx);
  std::vector &vec = ctx.symbol_aux[aux_idx].thunk_addrs;
  assert(vec.empty() || vec.back() < addr);
  vec.push_back(addr);
}

// Returns the address of a thunk for this symbol that is reachable
// from the place P. It is a fatal error if there is no such thunk.
template
u64 Symbol::get_thunk_addr(Context &ctx, u64 P) const requires needs_thunk {
  std::span vec = ctx.symbol_aux[aux_idx].thunk_addrs;

  // `lo` is the lowest address that can be in range, clamped at zero
  // so that the subtraction doesn't wrap around.
  u64 lo = (P < branch_distance) ? 0 : P - branch_distance;

  // Take the first recorded thunk at or above `lo` and verify that
  // its displacement from P is within the branch range.
  if (auto it = ranges::lower_bound(vec, lo); it != vec.end())
    if (i64 disp = *it - P;
        -branch_distance <= disp && disp < branch_distance)
      return *it;
  Fatal(ctx) << "range extension thunk out of range: " << *this;
}

// Returns true if this symbol has either a PLT index or a PLTGOT index.
template
inline bool Symbol::has_plt(Context &ctx) const {
  return get_plt_idx(ctx) != -1 || get_pltgot_idx(ctx) != -1;
}

template
inline bool Symbol::is_absolute() const {
  // An unresolved weak symbol acts as if it were an absolute address
  // at address 0
  if (is_remaining_undef_weak())
    return true;

  // Otherwise, a symbol is absolute if it is not imported and is not
  // attached to a fragment, an input section or an output section.
  return !is_imported && !get_frag() && !get_input_section() &&
         !get_output_section();
}

// For a relocatable output, locality is decided by the symbol binding
// in the ELF symbol. Otherwise, a symbol is local if it is neither
// imported nor exported.
template
inline bool Symbol::is_local(Context &ctx) const {
  if (ctx.arg.relocatable)
    return esym().st_bind == STB_LOCAL;
  return !is_imported && !is_exported;
}

template
inline bool Symbol::is_pde_ifunc(Context &ctx) const {
  // Returns true if this is an ifunc that uses two GOT slots
  return is_ifunc() && !ctx.arg.pic && !is_ppc64;
}

// A remaining weak undefined symbol is promoted to a dynamic symbol
// in DSO and resolved to 0 in an executable. This function returns
// true if it's the latter.
template
inline bool Symbol::is_remaining_undef_weak() const {
  return !is_imported && esym().is_undef_weak();
}

// Returns true if the symbol's PC-relative address is known at link-time.
template
inline bool Symbol::is_pcrel_linktime_const(Context &ctx) const {
  return !is_imported && !is_ifunc() && (is_relative() || !ctx.arg.pic);
}

// Returns true if the symbol's Thread Pointer-relative address is
// known at link-time.
template
inline bool Symbol::is_tprel_linktime_const(Context &ctx) const {
  assert(get_type() == STT_TLS);
  return !ctx.arg.shared && !is_imported;
}

// Returns true if the symbol's Thread Pointer-relative address is
// known at load-time.
template
inline bool Symbol::is_tprel_runtime_const(Context &ctx) const {
  // Returns true unless we are creating a dlopen'able DSO.
assert(get_type() == STT_TLS);
  return !(ctx.arg.shared && ctx.arg.z_dlopen);
}

// `origin` holds a pointer and a tag in a single integer. The bits
// selected by TAG_MASK tell which kind of object the remaining bits
// point to. Each getter below returns the pointer if the tag matches
// its kind and nullptr otherwise.
template
inline InputSection *Symbol::get_input_section() const {
  if ((origin & TAG_MASK) == TAG_ISEC)
    return (InputSection *)(origin & ~TAG_MASK);
  return nullptr;
}

template
inline Chunk *Symbol::get_output_section() const {
  if ((origin & TAG_MASK) == TAG_OSEC)
    return (Chunk *)(origin & ~TAG_MASK);
  return nullptr;
}

template
inline SectionFragment *Symbol::get_frag() const {
  if ((origin & TAG_MASK) == TAG_FRAG)
    return (SectionFragment *)(origin & ~TAG_MASK);
  return nullptr;
}

// The setters below store a pointer with the corresponding tag. The
// pointer's tag bits must be zero (asserted) so that the tag can be
// stored there.
template
inline void Symbol::set_input_section(InputSection *isec) {
  uintptr_t addr = (uintptr_t)isec;
  assert((addr & TAG_MASK) == 0);
  origin = addr | TAG_ISEC;
}

template
inline void Symbol::set_output_section(Chunk *osec) {
  uintptr_t addr = (uintptr_t)osec;
  assert((addr & TAG_MASK) == 0);
  origin = addr | TAG_OSEC;
}

template
inline void Symbol::set_frag(SectionFragment *frag) {
  uintptr_t addr = (uintptr_t)frag;
  assert((addr & TAG_MASK) == 0);
  origin = addr | TAG_FRAG;
}

// Returns the symbol type. An STT_GNU_IFUNC symbol defined by a DSO is
// reported as STT_FUNC.
template
inline u32 Symbol::get_type() const {
  if (esym().st_type == STT_GNU_IFUNC && file->is_dso)
    return STT_FUNC;
  return esym().st_type;
}

// Returns the version string of this symbol. The result is empty
// unless the symbol's file is a DSO with a non-empty version string
// table.
template
inline std::string_view Symbol::get_version() const {
  if (file->is_dso) {
    std::span vers = ((SharedFile *)file)->version_strings;
    if (!vers.empty())
      return vers[ver_idx];
  }
  return "";
}

// Returns the index of this symbol in the output symbol table. The
// file records a per-symbol index, which is relative to the beginning
// of either the file's local symbols or its global symbols.
template
inline i64 Symbol::get_output_sym_idx(Context &ctx) const {
  i64 i = file->output_sym_indices[sym_idx];
  assert(i != -1);
  if (is_local(ctx))
    return file->local_symtab_idx + i;
  return file->global_symtab_idx + i;
}

// Returns the ELF symbol of this symbol in its file.
template
inline const ElfSym &Symbol::esym() const {
  return file->elf_syms[sym_idx];
}

// Stores only the pointer and the length of `name`; the characters are
// not copied.
template
inline void Symbol::set_name(std::string_view name) {
  nameptr = name.data();
  namelen = name.size();
}

template
inline std::string_view Symbol::name() const {
  return {nameptr, (size_t)namelen};
}

// Appends a new entry to ctx.symbol_aux for this symbol if it doesn't
// have one yet. Does nothing otherwise.
template
inline void Symbol::add_aux(Context &ctx) {
  if (aux_idx == -1) {
    aux_idx = ctx.symbol_aux.size();
    ctx.symbol_aux.resize(aux_idx + 1);
  }
}

// Returns true if `s` is non-empty, starts with an ASCII letter or an
// underscore, and consists only of ASCII letters, digits and
// underscores.
inline bool is_c_identifier(std::string_view s) {
  if (s.empty())
    return false;

  auto is_alpha = [](char c) {
    return c == '_' || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
  };

  auto is_alnum = [&](char c) {
    return is_alpha(c) || ('0' <= c && c <= '9');
  };

  if (!is_alpha(s[0]))
    return false;
  for (i64 i = 1; i < s.size(); i++)
    if (!is_alnum(s[i]))
      return false;
  return true;
}

// Copies `str` to a newly allocated, NUL-terminated buffer, hands the
// buffer to ctx.string_pool and returns a view of the copy. The view
// does not include the terminating NUL.
// NOTE(review): string_pool presumably owns and frees the buffer --
// confirm against its declaration.
template
std::string_view save_string(Context &ctx, const std::string &str) {
  u8 *buf = new u8[str.size() + 1];
  memcpy(buf, str.data(), str.size());
  buf[str.size()] = '\0';
  ctx.string_pool.emplace_back(buf);
  return {(char *)buf, str.size()};
}

} // namespace mold

================================================
FILE: src/output-chunks.cc
================================================
#include "mold.h"

#include
#include
#include
#include
#include
#include
#include

namespace mold {

// The hash function for .hash.
//
// Each byte shifts the hash left by four bits and is added to it. If
// any of the top four bits become set, they are folded back into the
// lower bits and then cleared, so the result always fits in 28 bits.
static u32 elf_hash(std::string_view name) {
  u32 h = 0;
  for (u8 c : name) {
    h = (h << 4) + c;
    u32 g = h & 0xf0000000;
    if (g != 0)
      h ^= g >> 24;
    h &= ~g;
  }
  return h;
}

// Returns the first chunk in ctx.chunks with a given section type, or
// nullptr if there is none.
template
Chunk *find_chunk(Context &ctx, u32 sh_type) {
  for (Chunk *chunk : ctx.chunks)
    if (chunk->shdr.sh_type == sh_type)
      return chunk;
  return nullptr;
}

// Returns the first chunk in ctx.chunks with a given name, or nullptr
// if there is none.
template
Chunk *find_chunk(Context &ctx, std::string_view name) {
  for (Chunk *chunk : ctx.chunks)
    if (chunk->name == name)
      return chunk;
  return nullptr;
}

// Returns the value for e_entry. It is 0 for a relocatable output. If
// the entry symbol is defined by a file that is not a DSO, its address
// is used. Otherwise 0 is returned, with a warning unless we are
// creating a shared object.
template
static u64 get_entry_addr(Context &ctx) {
  if (ctx.arg.relocatable)
    return 0;

  if (InputFile *file = ctx.arg.entry->file)
    if (!file->is_dso)
      return ctx.arg.entry->get_addr(ctx);

  if (!ctx.arg.shared)
    Warn(ctx) << "entry symbol is not defined: " << *ctx.arg.entry;
  return 0;
}

// Writes the ELF header to the output buffer.
template
void OutputEhdr::copy_buf(Context &ctx) {
  ElfEhdr &hdr = *(ElfEhdr *)(ctx.buf + this->shdr.sh_offset);
  memset(&hdr, 0, sizeof(hdr));

  memcpy(&hdr.e_ident, "\177ELF", 4);
  hdr.e_ident[EI_CLASS] = E::is_64 ? ELFCLASS64 : ELFCLASS32;
  hdr.e_ident[EI_DATA] = E::is_le ? ELFDATA2LSB : ELFDATA2MSB;
  hdr.e_ident[EI_VERSION] = EV_CURRENT;
  hdr.e_machine = E::e_machine;
  hdr.e_version = EV_CURRENT;
  hdr.e_entry = get_entry_addr(ctx);
  hdr.e_flags = get_eflags(ctx);
  hdr.e_ehsize = sizeof(ElfEhdr);

  // If e_shstrndx is too large, a dummy value is set to e_shstrndx.
  // The real value is stored to the zero'th section's sh_link field.
  if (ctx.shstrtab) {
    if (ctx.shstrtab->shndx < SHN_LORESERVE)
      hdr.e_shstrndx = ctx.shstrtab->shndx;
    else
      hdr.e_shstrndx = SHN_XINDEX;
  }

  // The file type is ET_REL for a relocatable output, ET_DYN for
  // position-independent output and ET_EXEC otherwise.
  if (ctx.arg.relocatable)
    hdr.e_type = ET_REL;
  else if (ctx.arg.pic)
    hdr.e_type = ET_DYN;
  else
    hdr.e_type = ET_EXEC;

  if (ctx.phdr) {
    hdr.e_phoff = ctx.phdr->shdr.sh_offset;
    hdr.e_phentsize = sizeof(ElfPhdr);
    hdr.e_phnum = ctx.phdr->shdr.sh_size / sizeof(ElfPhdr);
  }

  if (ctx.shdr) {
    hdr.e_shoff = ctx.shdr->shdr.sh_offset;
    hdr.e_shentsize = sizeof(ElfShdr);

    // Since e_shnum is a 16-bit integer field, we can't store a very
    // large value there. If it is >65535, the real value is stored to
    // the zero'th section's sh_size field.
    i64 shnum = ctx.shdr->shdr.sh_size / sizeof(ElfShdr);
    hdr.e_shnum = (shnum <= UINT16_MAX) ?
shnum : 0; } } template void OutputShdr::copy_buf(Context &ctx) { ElfShdr *hdr = (ElfShdr *)(ctx.buf + this->shdr.sh_offset); memset(hdr, 0, this->shdr.sh_size); if (ctx.shstrtab && SHN_LORESERVE <= ctx.shstrtab->shndx) hdr[0].sh_link = ctx.shstrtab->shndx; i64 shnum = ctx.shdr->shdr.sh_size / sizeof(ElfShdr); if (UINT16_MAX < shnum) hdr[0].sh_size = shnum; for (Chunk *chunk : ctx.chunks) if (chunk->shndx) hdr[chunk->shndx] = chunk->shdr; } template i64 to_phdr_flags(Context &ctx, Chunk *chunk) { // All sections are put into a single RWX segment if --omagic if (ctx.arg.omagic) return PF_R | PF_W | PF_X; bool write = (chunk->shdr.sh_flags & SHF_WRITE); bool exec = (chunk->shdr.sh_flags & SHF_EXECINSTR); // .text is not readable if --execute-only if (exec && ctx.arg.execute_only) { if (write) Error(ctx) << "--execute-only is not compatible with writable section: " << chunk->name; return PF_X; } // .rodata is merged with .text if --no-rosegment if (!write && !ctx.arg.rosegment) exec = true; return PF_R | (write ? PF_W : PF_NONE) | (exec ? PF_X : PF_NONE); } template static std::vector> create_phdr(Context &ctx) { std::vector> vec; auto define = [&](u64 type, u64 flags, Chunk *chunk) { ElfPhdr phdr = {}; phdr.p_type = type; phdr.p_flags = flags; phdr.p_align = chunk->shdr.sh_addralign; if (chunk->shdr.sh_type == SHT_NOBITS) { // p_offset indicates the in-file start offset and is not // significant for segments with zero on-file size. We still want to // keep it congruent with the virtual address modulo page size // because some loaders (at least FreeBSD's) are picky about it. 
phdr.p_offset = chunk->shdr.sh_addr % ctx.page_size; } else { phdr.p_offset = chunk->shdr.sh_offset; phdr.p_filesz = chunk->shdr.sh_size; } phdr.p_vaddr = chunk->shdr.sh_addr; phdr.p_paddr = chunk->shdr.sh_addr; if (chunk->shdr.sh_flags & SHF_ALLOC) phdr.p_memsz = chunk->shdr.sh_size; vec.push_back(phdr); }; auto append = [&](Chunk *chunk) { ElfPhdr &phdr = vec.back(); phdr.p_align = std::max(phdr.p_align, chunk->shdr.sh_addralign); phdr.p_memsz = chunk->shdr.sh_addr + chunk->shdr.sh_size - phdr.p_vaddr; if (chunk->shdr.sh_type != SHT_NOBITS) phdr.p_filesz = phdr.p_memsz; }; auto is_bss = [](Chunk *chunk) { return chunk->shdr.sh_type == SHT_NOBITS; }; auto is_tbss = [](Chunk *chunk) { return chunk->shdr.sh_type == SHT_NOBITS && (chunk->shdr.sh_flags & SHF_TLS); }; auto is_note = [](Chunk *chunk) { return chunk->shdr.sh_type == SHT_NOTE; }; // When we are creating PT_LOAD segments, we consider only // the following chunks. std::vector *> chunks; for (Chunk *chunk : ctx.chunks) if ((chunk->shdr.sh_flags & SHF_ALLOC) && !is_tbss(chunk)) chunks.push_back(chunk); // The ELF spec says that "loadable segment entries in the program // header table appear in ascending order, sorted on the p_vaddr // member". ranges::stable_sort(chunks, {}, [](Chunk *x) { return x->shdr.sh_addr; }); // Create a PT_PHDR for the program header itself. if (ctx.phdr && (ctx.phdr->shdr.sh_flags & SHF_ALLOC)) define(PT_PHDR, PF_R, ctx.phdr); // Create a PT_INTERP. if (ctx.interp) define(PT_INTERP, PF_R, ctx.interp); // Create a PT_NOTE for SHF_NOTE sections. for (i64 i = 0; i < chunks.size();) { Chunk *first = chunks[i++]; if (is_note(first)) { i64 flags = to_phdr_flags(ctx, first); define(PT_NOTE, flags, first); while (i < chunks.size() && is_note(ctx.chunks[i]) && to_phdr_flags(ctx, ctx.chunks[i]) == flags) append(ctx.chunks[i++]); } } // Create PT_LOAD segments. 
for (i64 i = 0; i < chunks.size();) { Chunk *first = chunks[i++]; i64 flags = to_phdr_flags(ctx, first); define(PT_LOAD, flags, first); vec.back().p_align = std::max(ctx.page_size, vec.back().p_align); // Add contiguous ALLOC sections as long as they have the same // section flags and there's no on-disk gap in between. if (!is_bss(first)) while (i < chunks.size() && !is_bss(chunks[i]) && to_phdr_flags(ctx, chunks[i]) == flags && chunks[i]->shdr.sh_offset - first->shdr.sh_offset == chunks[i]->shdr.sh_addr - first->shdr.sh_addr) append(chunks[i++]); while (i < chunks.size() && is_bss(chunks[i]) && to_phdr_flags(ctx, chunks[i]) == flags) append(chunks[i++]); } // Create a PT_TLS. for (i64 i = 0; i < ctx.chunks.size();) { Chunk *first = ctx.chunks[i++]; if (first->shdr.sh_flags & SHF_TLS) { define(PT_TLS, PF_R, first); while (i < ctx.chunks.size() && (ctx.chunks[i]->shdr.sh_flags & SHF_TLS)) append(ctx.chunks[i++]); } } // Add PT_DYNAMIC if (ctx.dynamic && ctx.dynamic->shdr.sh_size) define(PT_DYNAMIC, to_phdr_flags(ctx, ctx.dynamic), ctx.dynamic); // Add PT_GNU_EH_FRAME if (ctx.eh_frame_hdr) define(PT_GNU_EH_FRAME, PF_R, ctx.eh_frame_hdr); // Add PT_GNU_PROPERTY if (Chunk *chunk = find_chunk(ctx, ".note.gnu.property")) define(PT_GNU_PROPERTY, PF_R, chunk); // Add PT_GNU_STACK, which is a marker segment that doesn't really // contain any segments. It controls executable bit of stack area. { ElfPhdr phdr = {}; phdr.p_type = PT_GNU_STACK; phdr.p_flags = ctx.arg.z_execstack ? (PF_R | PF_W | PF_X) : (PF_R | PF_W); phdr.p_memsz = ctx.arg.z_stack_size; phdr.p_align = 1; vec.push_back(phdr); } // Create a PT_GNU_RELRO. 
if (ctx.arg.z_relro) { for (i64 i = 0; i < chunks.size();) { Chunk *first = chunks[i++]; if (first->is_relro) { define(PT_GNU_RELRO, PF_R, first); while (i < chunks.size() && chunks[i]->is_relro) append(chunks[i++]); vec.back().p_align = 1; } } } // Create a PT_ARM_EDXIDX if constexpr (is_arm32) if (ctx.extra.exidx) define(PT_ARM_EXIDX, PF_R, ctx.extra.exidx); // Create a PT_RISCV_ATTRIBUTES if constexpr (is_riscv) if (ctx.extra.riscv_attributes->shdr.sh_size) define(PT_RISCV_ATTRIBUTES, PF_R, ctx.extra.riscv_attributes); // Create a PT_OPENBSD_RANDOMIZE for (Chunk *chunk : ctx.chunks) if (chunk->name == ".openbsd.randomdata") define(PT_OPENBSD_RANDOMIZE, PF_R | PF_W, chunk); // Set p_paddr if --physical-image-base was given. --physical-image-base // is typically used in embedded programming to specify the base address // of a memory-mapped ROM area. In that environment, paddr refers to a // segment's initial location in ROM and vaddr refers the its run-time // address. // // When a device is turned on, it start executing code at a fixed // location in the ROM area. At that location is a startup routine that // copies data or code from ROM to RAM before using them. // // .data must have different paddr and vaddr because ROM is not writable. // paddr of .rodata and .text may or may be equal to vaddr. They can be // directly read or executed from ROM, but oftentimes they are copied // from ROM to RAM because Flash or EEPROM are usually much slower than // DRAM. // // We want to keep vaddr == pvaddr for as many segments as possible so // that they can be directly read/executed from ROM. If a gap between // two segments is two page size or larger, we give up and pack segments // tightly so that we don't waste too much ROM area. 
if (ctx.arg.physical_image_base) { for (i64 i = 0; i < vec.size(); i++) { if (vec[i].p_type != PT_LOAD) continue; u64 addr = *ctx.arg.physical_image_base; bool in_sync = (vec[i].p_vaddr == addr); vec[i].p_paddr = addr; addr += vec[i].p_memsz; for (i++; i < vec.size() && vec[i].p_type == PT_LOAD; i++) { ElfPhdr &p = vec[i]; if (in_sync && addr <= p.p_vaddr && p.p_vaddr < addr + ctx.page_size * 2) { p.p_paddr = p.p_vaddr; addr = p.p_vaddr + p.p_memsz; } else { in_sync = false; p.p_paddr = addr; addr += p.p_memsz; } } break; } } vec.resize(vec.size() + ctx.arg.spare_program_headers); return vec; } template void OutputPhdr::update_shdr(Context &ctx) { phdrs = create_phdr(ctx); this->shdr.sh_size = phdrs.size() * sizeof(ElfPhdr); for (ElfPhdr &phdr : phdrs) { if (phdr.p_type == PT_TLS) { ctx.tls_begin = phdr.p_vaddr; ctx.tp_addr = get_tp_addr(phdr); ctx.dtp_addr = get_dtp_addr(phdr); break; } } } template void OutputPhdr::copy_buf(Context &ctx) { write_vector(ctx.buf + this->shdr.sh_offset, phdrs); } template void InterpSection::update_shdr(Context &ctx) { this->shdr.sh_size = ctx.arg.dynamic_linker.size() + 1; } template void InterpSection::copy_buf(Context &ctx) { write_string(ctx.buf + this->shdr.sh_offset, ctx.arg.dynamic_linker); } template void RelDynSection::update_shdr(Context &ctx) { i64 offset = 0; for (Chunk *chunk : ctx.chunks) { chunk->reldyn_offset = offset; offset += chunk->get_reldyn_size(ctx) * sizeof(ElfRel); } this->shdr.sh_size = offset; this->shdr.sh_link = ctx.dynsym->shndx; } template void RelrDynSection::update_shdr(Context &ctx) { i64 n = 0; for (Chunk *chunk : ctx.chunks) n += chunk->relr.size(); this->shdr.sh_size = n * sizeof(Word); } template void RelrDynSection::copy_buf(Context &ctx) { Word *buf = (Word *)(ctx.buf + this->shdr.sh_offset); for (Chunk *chunk : ctx.chunks) for (u64 val : chunk->relr) *buf++ = (val & 1) ? 
val : (chunk->shdr.sh_addr + val);
}

// Assigns string table offsets to chunks, object files and shared
// files in that order, and sets the section size.
template
void StrtabSection::update_shdr(Context &ctx) {
  // Offset 0 is occupied by a NUL byte; see copy_buf().
  i64 offset = 1;

  // ARM32 uses $a, $t and $d mapping symbols to mark the beginning of
  // ARM, Thumb and data in text, respectively. These symbols don't
  // affect correctness of the program but helps disassembler to
  // disassemble machine code appropriately.
  if constexpr (is_arm32)
    if (!ctx.arg.strip_all)
      offset += sizeof("$a\0$t\0$d");

  for (Chunk *chunk : ctx.chunks) {
    chunk->strtab_offset = offset;
    offset += chunk->strtab_size;
  }

  for (ObjectFile *file : ctx.objs) {
    file->strtab_offset = offset;
    offset += file->strtab_size;
  }

  for (SharedFile *file : ctx.dsos) {
    file->strtab_offset = offset;
    offset += file->strtab_size;
  }

  // If nothing was added, the section size is set to zero.
  this->shdr.sh_size = (offset == 1) ? 0 : offset;
}

// Writes the leading NUL byte and, for ARM32, the names of the mapping
// symbols. The 9 bytes copied below are the 8 characters of the
// literal and its terminating NUL, which is the same number that
// update_shdr() reserved with sizeof().
template
void StrtabSection::copy_buf(Context &ctx) {
  u8 *buf = ctx.buf + this->shdr.sh_offset;
  buf[0] = '\0';

  if constexpr (is_arm32)
    if (!ctx.arg.strip_all)
      memcpy(buf + 1, "$a\0$t\0$d", 9);
}

// Assigns sh_name to every chunk that is not a header and has a
// non-empty name. Chunks with the same name share a single string.
template
void ShstrtabSection::update_shdr(Context &ctx) {
  std::unordered_map map;
  i64 offset = 1;

  for (Chunk *chunk : ctx.chunks) {
    if (!chunk->is_header() && !chunk->name.empty()) {
      // insert() doesn't overwrite an existing entry, so it->second is
      // the offset of the first chunk with this name.
      auto [it, inserted] = map.insert({chunk->name, offset});
      chunk->shdr.sh_name = it->second;
      if (inserted)
        offset += chunk->name.size() + 1;
    }
  }

  this->shdr.sh_size = offset;
}

// Writes the name of every chunk that has a non-zero sh_name. Chunks
// sharing a name write the same bytes to the same location.
template
void ShstrtabSection::copy_buf(Context &ctx) {
  u8 *base = ctx.buf + this->shdr.sh_offset;
  base[0] = '\0';

  for (Chunk *chunk : ctx.chunks)
    if (chunk->shdr.sh_name)
      write_string(base + chunk->shdr.sh_name, chunk->name);
}

// Adds a string to the section if it has not been added yet and
// returns its offset. On first use, the empty string is registered at
// offset 0.
template
i64 DynstrSection::add_string(std::string_view str) {
  if (this->shdr.sh_size == 0) {
    strings.insert({"", 0});
    this->shdr.sh_size = 1;
  }

  auto [it, inserted] = strings.insert({str, this->shdr.sh_size});
  if (inserted)
    this->shdr.sh_size += str.size() + 1;
  return it->second;
}

// Returns the offset of a string that was previously added with
// add_string(). The string must exist (asserted).
template
i64 DynstrSection::find_string(std::string_view str) {
  auto it = strings.find(str);
  assert(it != strings.end());
  return it->second;
}

// Writes the strings registered with add_string() at their recorded
// offsets, then the names of the dynamic symbols, one after another,
// starting at ctx.dynsym->dynstr_offset. Null entries in the dynamic
// symbol list are skipped.
template
void DynstrSection::copy_buf(Context &ctx) {
  u8 *base = ctx.buf + this->shdr.sh_offset;

  for (std::pair p : strings)
    write_string(base + p.second, p.first);

  i64 off = ctx.dynsym->dynstr_offset;
  for (Symbol *sym : ctx.dynsym->symbols)
    if (sym)
      off += write_string(base + off, sym->name());
}

// Assigns symbol table indices to chunks and files, and sets the
// section size.
template
void SymtabSection::update_shdr(Context &ctx) {
  // Index 0 is reserved; copy_buf() fills it with zeros.
  i64 nsyms = 1;

  // Section symbols
  for (Chunk *chunk : ctx.chunks)
    if (chunk->shndx)
      nsyms++;

  // Linker-synthesized symbols
  for (Chunk *chunk : ctx.chunks) {
    chunk->local_symtab_idx = nsyms;
    nsyms += chunk->num_local_symtab;
  }

  // File local symbols
  for (ObjectFile *file : ctx.objs) {
    file->local_symtab_idx = nsyms;
    nsyms += file->num_local_symtab;
  }

  // File global symbols
  for (ObjectFile *file : ctx.objs) {
    file->global_symtab_idx = nsyms;
    nsyms += file->num_global_symtab;
  }

  for (SharedFile *file : ctx.dsos) {
    file->global_symtab_idx = nsyms;
    nsyms += file->num_global_symtab;
  }

  // sh_info is the index of the first global symbol, which is where
  // the first object file's global symbols begin.
  // NOTE(review): this reads ctx.objs[0] unconditionally -- presumably
  // ctx.objs is never empty here; confirm against the caller.
  this->shdr.sh_info = ctx.objs[0]->global_symtab_idx;
  this->shdr.sh_link = ctx.strtab->shndx;
  this->shdr.sh_size = (nsyms == 1) ? 0 : nsyms * sizeof(ElfSym);
}

// Writes the symbol table to the output buffer.
template
void SymtabSection::copy_buf(Context &ctx) {
  // Clear the reserved entry at index 0.
  ElfSym *buf = (ElfSym *)(ctx.buf + this->shdr.sh_offset);
  memset(buf, 0, sizeof(ElfSym));

  if (ctx.symtab_shndx) {
    ElfShdr &shdr = ctx.symtab_shndx->shdr;
    memset(ctx.buf + shdr.sh_offset, 0, shdr.sh_size);
  }

  // Create section symbols
  for (Chunk *chunk : ctx.chunks) {
    if (chunk->shndx) {
      ElfSym &sym = buf[chunk->shndx];
      memset(&sym, 0, sizeof(sym));
      sym.st_type = STT_SECTION;
      sym.st_value = chunk->shdr.sh_addr;

      // If the extended section index table exists, the real section
      // index is stored there and st_shndx is set to SHN_XINDEX.
      if (ctx.symtab_shndx) {
        U32 *xindex = (U32 *)(ctx.buf + ctx.symtab_shndx->shdr.sh_offset);
        xindex[chunk->shndx] = chunk->shndx;
        sym.st_shndx = SHN_XINDEX;
      } else {
        sym.st_shndx = chunk->shndx;
      }
    }
  }

  // Populate linker-synthesized symbols
  tbb::parallel_for_each(ctx.chunks, [&](Chunk *chunk) {
    chunk->populate_symtab(ctx);
  });

  // Copy symbols from input files
  tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) {
    file->populate_symtab(ctx);
  });

  tbb::parallel_for_each(ctx.dsos, [&](SharedFile *file) {
    file->populate_symtab(ctx);
  });
}

// An ARM64 function with a non-standard calling convention is marked with
// STO_AARCH64_VARIANT_PCS bit in the symbol table.
//
// A function with that bit is not safe to be called through a lazy PLT
// stub because the PLT resolver may clobber registers that should be
// preserved in a non-standard calling convention.
//
// To solve the problem, the dynamic linker scans the dynamic symbol table
// at process startup time and resolves symbols with STO_AARCH64_VARIANT_PCS
// bit eagerly, so that the PLT resolver won't be called for that symbol
// lazily. As an optimization, it does so only when DT_AARCH64_VARIANT_PCS
// is set in the dynamic section.
//
// This function returns true if DT_AARCH64_VARIANT_PCS needs to be set.
template
static bool contains_variant_pcs(Context &ctx) {
  for (Symbol *sym : ctx.plt->symbols)
    if (sym->esym().arm64_variant_pcs)
      return true;
  return false;
}

// RISC-V has the same feature but with a different name.
template static bool contains_variant_cc(Context &ctx) { for (Symbol *sym : ctx.plt->symbols) if (sym->esym().riscv_variant_cc) return true; return false; } template static std::vector> create_dynamic_section(Context &ctx) { std::vector> vec; auto define = [&](u64 tag, u64 val) { vec.push_back(tag); vec.push_back(val); }; for (SharedFile *file : ctx.dsos) define(DT_NEEDED, ctx.dynstr->find_string(file->soname)); if (!ctx.arg.rpaths.empty()) define(ctx.arg.enable_new_dtags ? DT_RUNPATH : DT_RPATH, ctx.dynstr->find_string(ctx.arg.rpaths)); if (!ctx.arg.soname.empty()) define(DT_SONAME, ctx.dynstr->find_string(ctx.arg.soname)); for (std::string_view str : ctx.arg.auxiliary) define(DT_AUXILIARY, ctx.dynstr->find_string(str)); if (!ctx.arg.audit.empty()) define(DT_AUDIT, ctx.dynstr->find_string(ctx.arg.audit)); if (!ctx.arg.depaudit.empty()) define(DT_DEPAUDIT, ctx.dynstr->find_string(ctx.arg.depaudit)); for (std::string_view str : ctx.arg.filter) define(DT_FILTER, ctx.dynstr->find_string(str)); if (ctx.reldyn->shdr.sh_size) { define(E::is_rela ? DT_RELA : DT_REL, ctx.reldyn->shdr.sh_addr); define(E::is_rela ? DT_RELASZ : DT_RELSZ, ctx.reldyn->shdr.sh_size); define(E::is_rela ? DT_RELAENT : DT_RELENT, sizeof(ElfRel)); } if (ctx.relrdyn) { define(DT_RELR, ctx.relrdyn->shdr.sh_addr); define(DT_RELRSZ, ctx.relrdyn->shdr.sh_size); define(DT_RELRENT, ctx.relrdyn->shdr.sh_entsize); } if (ctx.relplt->shdr.sh_size) { define(DT_JMPREL, ctx.relplt->shdr.sh_addr); define(DT_PLTRELSZ, ctx.relplt->shdr.sh_size); define(DT_PLTREL, E::is_rela ? 
DT_RELA : DT_REL); } if constexpr (is_sparc) { if (ctx.plt->shdr.sh_size) define(DT_PLTGOT, ctx.plt->shdr.sh_addr); } else if constexpr (is_ppc32) { if (ctx.gotplt->shdr.sh_size) define(DT_PLTGOT, ctx.gotplt->shdr.sh_addr + GotPltSection::HDR_SIZE); } else { if (ctx.gotplt->shdr.sh_size) define(DT_PLTGOT, ctx.gotplt->shdr.sh_addr); } if (ctx.dynsym->shdr.sh_size) { define(DT_SYMTAB, ctx.dynsym->shdr.sh_addr); define(DT_SYMENT, sizeof(ElfSym)); } if (ctx.dynstr->shdr.sh_size) { define(DT_STRTAB, ctx.dynstr->shdr.sh_addr); define(DT_STRSZ, ctx.dynstr->shdr.sh_size); } if (find_chunk(ctx, SHT_INIT_ARRAY)) { define(DT_INIT_ARRAY, ctx.__init_array_start->value); define(DT_INIT_ARRAYSZ, ctx.__init_array_end->value - ctx.__init_array_start->value); } if (find_chunk(ctx, SHT_PREINIT_ARRAY)) { define(DT_PREINIT_ARRAY, ctx.__preinit_array_start->value); define(DT_PREINIT_ARRAYSZ, ctx.__preinit_array_end->value - ctx.__preinit_array_start->value); } if (find_chunk(ctx, SHT_FINI_ARRAY)) { define(DT_FINI_ARRAY, ctx.__fini_array_start->value); define(DT_FINI_ARRAYSZ, ctx.__fini_array_end->value - ctx.__fini_array_start->value); } if (ctx.versym->shdr.sh_size) define(DT_VERSYM, ctx.versym->shdr.sh_addr); if (ctx.verneed->shdr.sh_size) { define(DT_VERNEED, ctx.verneed->shdr.sh_addr); define(DT_VERNEEDNUM, ctx.verneed->shdr.sh_info); } if (ctx.verdef) { define(DT_VERDEF, ctx.verdef->shdr.sh_addr); define(DT_VERDEFNUM, ctx.verdef->shdr.sh_info); } if (Symbol &sym = *ctx.arg.init; sym.file && !sym.file->is_dso) define(DT_INIT, sym.get_addr(ctx)); if (Symbol &sym = *ctx.arg.fini; sym.file && !sym.file->is_dso) define(DT_FINI, sym.get_addr(ctx)); if (ctx.hash) define(DT_HASH, ctx.hash->shdr.sh_addr); if (ctx.gnu_hash) define(DT_GNU_HASH, ctx.gnu_hash->shdr.sh_addr); if (ctx.has_textrel) define(DT_TEXTREL, 0); i64 flags = 0; i64 flags1 = 0; if (ctx.arg.pie) flags1 |= DF_1_PIE; if (ctx.arg.z_now) { flags |= DF_BIND_NOW; flags1 |= DF_1_NOW; } if (ctx.arg.z_origin) { flags |= DF_ORIGIN; 
flags1 |= DF_1_ORIGIN; } if (!ctx.arg.z_dlopen) flags1 |= DF_1_NOOPEN; if (ctx.arg.z_nodefaultlib) flags1 |= DF_1_NODEFLIB; if (!ctx.arg.z_delete) flags1 |= DF_1_NODELETE; if (!ctx.arg.z_dump) flags1 |= DF_1_NODUMP; if (ctx.arg.z_initfirst) flags1 |= DF_1_INITFIRST; if (ctx.arg.z_interpose) flags1 |= DF_1_INTERPOSE; if (!ctx.got->gottp_syms.empty()) flags |= DF_STATIC_TLS; if (ctx.has_textrel) flags |= DF_TEXTREL; if (flags) define(DT_FLAGS, flags); if (flags1) define(DT_FLAGS_1, flags1); if constexpr (is_arm64) if (contains_variant_pcs(ctx)) define(DT_AARCH64_VARIANT_PCS, 0); if constexpr (is_riscv) if (contains_variant_cc(ctx)) define(DT_RISCV_VARIANT_CC, 0); if constexpr (is_ppc32) define(DT_PPC_GOT, ctx.gotplt->shdr.sh_addr); if constexpr (is_ppc64) { // PPC64_GLINK is defined by the psABI to refer to 32 bytes before // the first PLT entry. I don't know why it's 32 bytes off, but // it's what it is. define(DT_PPC64_GLINK, ctx.plt->shdr.sh_addr + to_plt_offset(0) - 32); } // GDB needs a DT_DEBUG entry in an executable to store a word-size // data for its own purpose. Its content is not important. if (!ctx.arg.shared && !ctx.arg.z_rodynamic) define(DT_DEBUG, 0); define(DT_NULL, 0); for (i64 i = 0; i < ctx.arg.spare_dynamic_tags; i++) define(DT_NULL, 0); return vec; } template void DynamicSection::update_shdr(Context &ctx) { if (ctx.arg.static_ && !ctx.arg.pie) return; this->shdr.sh_size = create_dynamic_section(ctx).size() * sizeof(Word); this->shdr.sh_link = ctx.dynstr->shndx; } template void DynamicSection::copy_buf(Context &ctx) { std::vector> contents = create_dynamic_section(ctx); assert(this->shdr.sh_size == contents.size() * sizeof(Word)); write_vector(ctx.buf + this->shdr.sh_offset, contents); } // Assign offsets to OutputSection members template void OutputSection::compute_section_size(Context &ctx) { ElfShdr &shdr = this->shdr; // Text sections must to be handled by create_range_extension_thunks() // if they may need range extension thunks. 
assert(!needs_thunk || !(shdr.sh_flags & SHF_EXECINSTR) || ctx.arg.relocatable); // Since one output section may contain millions of input sections, // we first split input sections into groups and assign offsets to // groups. struct Group { std::span *> members; i64 size = 0; i64 offset = 0; i64 align = 1; }; std::vector groups; constexpr i64 group_size = 10000; for (std::span *> m = members; !m.empty();) { i64 sz = std::min(group_size, m.size()); groups.push_back({m.subspan(0, sz)}); m = m.subspan(sz); } tbb::parallel_for_each(groups, [](Group &g) { i64 off = 0; for (InputSection *isec : g.members) { off = align_to(off, 1 << isec->p2align) + isec->sh_size; g.align = std::max(g.align, 1 << isec->p2align); } g.size = off; }); i64 off = 0; for (Group &g : groups) { off = align_to(off, g.align); g.offset = off; off += g.size; } shdr.sh_size = off; // Assign offsets to input sections. tbb::parallel_for_each(groups, [](Group &g) { i64 off = g.offset; for (InputSection *isec : g.members) { off = align_to(off, 1 << isec->p2align); isec->offset = off; off += isec->sh_size; } }); } template void OutputSection::copy_buf(Context &ctx) { if (this->shdr.sh_type == SHT_NOBITS) return; // Copy section contents u8 *buf = ctx.buf + this->shdr.sh_offset; write_to(ctx, buf); // Emit dynamic relocations if (!ctx.reldyn) return; ElfRel *rel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + this->reldyn_offset); for (AbsRel &r : abs_rels) { Symbol &sym = *r.sym; u8 *loc = buf + r.isec->offset + r.offset; u64 S = sym.get_addr(ctx); u64 A = r.addend; u64 P = this->shdr.sh_addr + r.isec->offset + r.offset; if constexpr (is_riscv || is_loongarch) { i64 delta = get_r_delta(*r.isec, r.offset); loc -= delta; P -= delta; } auto write = [&](i64 ty, i64 idx, u64 val) { *rel++ = ElfRel(P, ty, idx, val); if (ctx.arg.apply_dynamic_relocs) *(Word *)loc = val; }; switch (r.kind) { case ABS_REL_NONE: case ABS_REL_RELR: *(Word *)loc = S + A; break; case ABS_REL_BASEREL: write(E::R_RELATIVE, 0, S + 
A); break; case ABS_REL_IFUNC: if constexpr (supports_ifunc) write(E::R_IRELATIVE, 0, sym.get_addr(ctx, NO_PLT) + A); break; case ABS_REL_DYNREL: write(E::R_ABS, sym.get_dynsym_idx(ctx), A); break; } } } template void OutputSection::write_to(Context &ctx, u8 *buf) { // Copy section contents to an output file. tbb::parallel_for((i64)0, (i64)members.size(), [&](i64 i) { InputSection &isec = *members[i]; isec.write_to(ctx, buf + isec.offset); // Clear trailing padding. We write trap instructions for an // executable segment so that a disassembler wouldn't try to // disassemble garbage as instructions. u64 this_end = isec.offset + isec.sh_size; u64 next_start; if (i + 1 < members.size()) next_start = members[i + 1]->offset; else next_start = this->shdr.sh_size; u8 *loc = buf + this_end; i64 size = next_start - this_end; auto fill = [&](const u8 (&filler)[N]) { for (i64 i = 0; i + N <= size; i += N) memcpy(loc + i, filler, N); }; if (this->shdr.sh_flags & SHF_EXECINSTR) { // s390x's old CRT files use NOP slides in .init and .fini. // https://sourceware.org/bugzilla/show_bug.cgi?id=31042 if (is_s390x && (this->name == ".init" || this->name == ".fini")) fill({ 0x07, 0x00 }); // nopr else fill(E::trap); } else { memset(loc, 0, size); } }); // Emit range extension thunks. if constexpr (needs_thunk) { tbb::parallel_for_each(thunks, [&](std::unique_ptr> &thunk) { thunk->copy_buf(ctx); }); } } // .relr.dyn contains base relocations encoded in a space-efficient form. // The contents of the section is essentially just a list of addresses // that have to be fixed up at runtime. // // Here is the encoding scheme (we assume 64-bit ELF in this description // for the sake of simplicity): .relr.dyn contains zero or more address // groups. Each address group consists of a 64-bit start address followed // by zero or more 63-bit bitmaps. Let A be the address of a start // address. Then, the loader fixes address A. 
If Nth bit in the following // bitmap is on, the loader also fixes address A + N * 8. In this scheme, // one address and one bitmap can represent up to 64 base relocations in a // 512 bytes range. // // A start address and a bitmap is distinguished by the lowest significant // bit. An address must be even and thus its LSB is 0 (odd address is not // representable in this encoding and such relocation must be stored to // the .rel.dyn section). A bitmap has LSB 1. template static std::vector encode_relr(std::span pos) { assert(ranges::all_of(pos, [](u64 x) { return x % sizeof(Word) == 0; })); assert(ranges::is_sorted(pos)); std::vector vec; i64 num_bits = E::is_64 ? 63 : 31; i64 max_delta = sizeof(Word) * num_bits; for (i64 i = 0; i < pos.size();) { vec.push_back(pos[i]); u64 base = pos[i] + sizeof(Word); i++; for (;;) { u64 bits = 0; for (; i < pos.size() && pos[i] - base < max_delta; i++) bits |= (u64)1 << ((pos[i] - base) / sizeof(Word)); if (!bits) break; vec.push_back((bits << 1) | 1); base += max_delta; } } return vec; } template static AbsRelKind get_abs_rel_kind(Context &ctx, Symbol &sym) { if (sym.is_ifunc()) return sym.is_pde_ifunc(ctx) ? ABS_REL_NONE : ABS_REL_IFUNC; if (sym.is_absolute()) return ABS_REL_NONE; // True if the symbol's address is in the output file. if (!sym.is_imported || (sym.flags & NEEDS_CPLT) || (sym.flags & NEEDS_COPYREL)) return ctx.arg.pic ? ABS_REL_BASEREL : ABS_REL_NONE; return ABS_REL_DYNREL; } template static bool is_absrel(const ElfRel &r) { // On ARM32, R_ARM_TARGET1 is typically used for entries in .init_array // and is interpreted as either ABS32 or REL32 depending on the target. // All targets we support handle it as if it were a ABS32. if constexpr (is_arm32) return r.r_type == R_ARM_ABS32 || r.r_type == R_ARM_TARGET1; // SPARC64 defines two separate relocations for aligned and unaligned words. 
if constexpr (is_sparc) return r.r_type == R_SPARC_64 || r.r_type == R_SPARC_UA64; return r.r_type == E::R_ABS; } // Scan word-size absolute relocations (e.g. R_X86_64_64). This is // separated from scan_relocations() because only such relocations can // be promoted to dynamic relocations. template void OutputSection::scan_abs_relocations(Context &ctx) { std::vector>> shards(members.size()); // Collect all word-size absolute relocations tbb::parallel_for((i64)0, (i64)members.size(), [&](i64 i) { InputSection *isec = members[i]; for (const ElfRel &r : isec->get_rels(ctx)) if (is_absrel(r)) shards[i].push_back(AbsRel{isec, r.r_offset, isec->file.symbols[r.r_sym], get_addend(*isec, r)}); }); abs_rels = flatten(shards); // We can sometimes avoid creating dynamic relocations in read-only // sections by promoting symbols to canonical PLT or copy relocations. if (!ctx.arg.pic && !(this->shdr.sh_flags & SHF_WRITE)) for (AbsRel &r : abs_rels) if (Symbol &sym = *r.sym; sym.is_imported && !sym.is_absolute()) sym.flags |= (sym.get_type() == STT_FUNC) ? NEEDS_CPLT : NEEDS_COPYREL; // Now we can compute whether they need to be promoted to dynamic // relocations or not. for (AbsRel &r : abs_rels) r.kind = get_abs_rel_kind(ctx, *r.sym); // If we have a relocation against a read-only section, we need to // set the DT_TEXTREL flag for the loader. for (AbsRel &r : abs_rels) { if (r.kind != ABS_REL_NONE && !(r.isec->shdr().sh_flags & SHF_WRITE)) { if (ctx.arg.z_text) { Error(ctx) << *r.isec << ": relocation at offset 0x" << std::hex << r.offset << " against symbol `" << *r.sym << "' can not be used; recompile with -fPIC"; } else if (ctx.arg.warn_textrel) { Warn(ctx) << *r.isec << ": relocation against symbol `" << *r.sym << "' in read-only section"; } ctx.has_textrel = true; } } // If --pack-dyn-relocs=relr is enabled, base relocations are put into // .relr.dyn. 
if (ctx.arg.pack_dyn_relocs_relr && !(this->shdr.sh_flags & SHF_EXECINSTR)) for (AbsRel &r : abs_rels) if (r.kind == ABS_REL_BASEREL && r.isec->shdr().sh_addralign % sizeof(Word) == 0 && r.offset % sizeof(Word) == 0) r.kind = ABS_REL_RELR; } template i64 OutputSection::get_reldyn_size(Context &ctx) const { i64 n = 0; for (const AbsRel &r : abs_rels) if (r.kind != ABS_REL_NONE && r.kind != ABS_REL_RELR) n++; return n; } template void OutputSection::construct_relr(Context &ctx) { std::vector pos; for (const AbsRel &r : abs_rels) if (r.kind == ABS_REL_RELR) pos.push_back(r.isec->offset + r.offset); this->relr = encode_relr(pos); } // Compute spaces needed for thunk symbols template void OutputSection::compute_symtab_size(Context &ctx) { if constexpr (needs_thunk) { this->strtab_size = 0; this->num_local_symtab = 0; for (std::unique_ptr> &thunk : thunks) { // For ARM32, we emit additional symbol "$t", "$a" and "$d" for // each thunk to mark the beginning of Thumb code, ARM code and // data, respectively. if constexpr (is_arm32) this->num_local_symtab += thunk->symbols.size() * 4; else this->num_local_symtab += thunk->symbols.size(); for (Symbol *sym : thunk->symbols) this->strtab_size += sym->name().size() + thunk->name.size() + 2; } } } // If we create range extension thunks, we also synthesize symbols to mark // the locations of thunks. Creating such symbols is optional, but it helps // disassembling and/or debugging our output. 
template void OutputSection::populate_symtab(Context &ctx) { if constexpr (needs_thunk) { ElfSym *esym = (ElfSym *)(ctx.buf + ctx.symtab->shdr.sh_offset) + this->local_symtab_idx; u8 *strtab_base = ctx.buf + ctx.strtab->shdr.sh_offset; u8 *strtab = strtab_base + this->strtab_offset; memset(esym, 0, this->num_local_symtab * sizeof(ElfSym)); memset(strtab, 0, this->strtab_size); auto write_esym = [&](u64 addr, i64 st_name) { memset(esym, 0, sizeof(*esym)); esym->st_name = st_name; esym->st_type = STT_FUNC; esym->st_shndx = this->shndx; esym->st_value = addr; esym++; }; for (std::unique_ptr> &thunk : thunks) { for (i64 i = 0; i < thunk->symbols.size(); i++) { Symbol &sym = *thunk->symbols[i]; u64 addr = thunk->get_addr() + thunk->offsets[i]; write_esym(addr, strtab - strtab_base); strtab += write_string(strtab, sym.name()) - 1; *strtab++ = '$'; strtab += write_string(strtab, thunk->name); // Emit "$t", "$a" and "$d" if ARM32. if constexpr (is_arm32) { write_esym(addr, ctx.strtab->THUMB); write_esym(addr + 4, ctx.strtab->ARM); write_esym(addr + 12, ctx.strtab->DATA); } } } } } template void GotSection::add_got_symbol(Context &ctx, Symbol *sym) { sym->set_got_idx(ctx, this->shdr.sh_size / sizeof(Word)); // An IFUNC symbol uses two GOT slots in a position-dependent // executable. 
if (sym->is_pde_ifunc(ctx)) this->shdr.sh_size += sizeof(Word) * 2; else this->shdr.sh_size += sizeof(Word); got_syms.push_back(sym); } template void GotSection::add_gottp_symbol(Context &ctx, Symbol *sym) { sym->set_gottp_idx(ctx, this->shdr.sh_size / sizeof(Word)); this->shdr.sh_size += sizeof(Word); gottp_syms.push_back(sym); } template void GotSection::add_tlsgd_symbol(Context &ctx, Symbol *sym) { sym->set_tlsgd_idx(ctx, this->shdr.sh_size / sizeof(Word)); this->shdr.sh_size += sizeof(Word) * 2; tlsgd_syms.push_back(sym); } template void GotSection::add_tlsdesc_symbol(Context &ctx, Symbol *sym) { // TLSDESC's GOT slot values may vary depending on libc, so we // always emit a dynamic relocation for each TLSDESC entry. // // If dynamic relocation is not available (i.e. if we are creating a // statically-linked executable), we always relax TLSDESC relocations // so that no TLSDESC relocation exist at runtime. assert(supports_tlsdesc); assert(!ctx.arg.static_); sym->set_tlsdesc_idx(ctx, this->shdr.sh_size / sizeof(Word)); this->shdr.sh_size += sizeof(Word) * 2; tlsdesc_syms.push_back(sym); } template void GotSection::add_tlsld(Context &ctx) { assert(tlsld_idx == -1); tlsld_idx = this->shdr.sh_size / sizeof(Word); this->shdr.sh_size += sizeof(Word) * 2; } template u64 GotSection::get_tlsld_addr(Context &ctx) const { assert(tlsld_idx != -1); return this->shdr.sh_addr + tlsld_idx * sizeof(Word); } namespace { template struct GotEntry { bool is_relr(Context &ctx) const { return r_type == E::R_RELATIVE && ctx.arg.pack_dyn_relocs_relr; } i64 idx = 0; u64 val = 0; i64 r_type = R_NONE; Symbol *sym = nullptr; }; } // Get .got and .rel.dyn contents. // // .got is a linker-synthesized constant pool whose entry is of pointer // size. If we know a correct value for an entry, we'll just set that value // to the entry. Otherwise, we'll create a dynamic relocation and let the // dynamic linker to fill the entry at load-time. 
// // Most GOT entries contain addresses of global variable. If a global // variable is an imported symbol, we don't know its address until runtime. // GOT contains the addresses of such variables at runtime so that we can // access imported global variables via GOT. // // Thread-local variables (TLVs) also use GOT entries. We need them because // TLVs are accessed in a different way than the ordinary global variables. // Their addresses are not unique; each thread has its own copy of TLVs. template static std::vector> get_got_entries(Context &ctx) { std::vector> entries; auto add = [&](GotEntry ent) { entries.push_back(ent); }; // Create GOT entries for ordinary symbols for (Symbol *sym : ctx.got->got_syms) { i64 idx = sym->get_got_idx(ctx); // IFUNC always needs to be fixed up by the dynamic linker. if constexpr (supports_ifunc) { if (sym->is_ifunc()) { if (sym->is_pde_ifunc(ctx)) { add({idx, sym->get_plt_addr(ctx)}); add({idx + 1, sym->get_addr(ctx, NO_PLT), E::R_IRELATIVE}); } else { add({idx, sym->get_addr(ctx, NO_PLT), E::R_IRELATIVE}); } continue; } } if (sym->is_imported) { // If a symbol is imported, let the dynamic linker to resolve it. add({idx, 0, E::R_GLOB_DAT, sym}); } else if (ctx.arg.pic && sym->is_relative()) { // We know the symbol's address, but it needs a base relocation. add({idx, sym->get_addr(ctx, NO_PLT), E::R_RELATIVE}); } else { // We know the symbol's exact run-time address at link-time. add({idx, sym->get_addr(ctx, NO_PLT)}); } } // Create GOT entries for TLVs. for (Symbol *sym : ctx.got->tlsgd_syms) { i64 idx = sym->get_tlsgd_idx(ctx); if (sym->is_imported) { // If a symbol is imported, let the dynamic linker to resolve it. add({idx, 0, E::R_DTPMOD, sym}); add({idx + 1, 0, E::R_DTPOFF, sym}); } else if (ctx.arg.shared) { // If we are creating a shared library, we know the TLV's offset // within the current TLS block. We don't know the module ID though. 
add({idx, 0, E::R_DTPMOD}); add({idx + 1, sym->get_addr(ctx) - ctx.dtp_addr}); } else { // If we are creating an executable, we know both the module ID and // the offset. Module ID 1 indicates the main executable. add({idx, 1}); add({idx + 1, sym->get_addr(ctx) - ctx.dtp_addr}); } } if constexpr (supports_tlsdesc) { for (Symbol *sym : ctx.got->tlsdesc_syms) { i64 idx = sym->get_tlsdesc_idx(ctx); // TLSDESC uses two consecutive GOT slots, and a single TLSDESC // dynamic relocation fills both. The actual values of the slots // vary depending on libc, so we can't precompute their values. // We always emit a dynamic relocation for each incoming TLSDESC // reloc. if (sym->is_imported) add({idx, 0, E::R_TLSDESC, sym}); else add({idx, sym->get_addr(ctx) - ctx.tls_begin, E::R_TLSDESC}); } } for (Symbol *sym : ctx.got->gottp_syms) { i64 idx = sym->get_gottp_idx(ctx); if (sym->is_imported) { // If we know nothing about the symbol, let the dynamic linker // to fill the GOT entry. add({idx, 0, E::R_TPOFF, sym}); } else if (ctx.arg.shared) { // If we know the offset within the current thread vector, // let the dynamic linker to adjust it. add({idx, sym->get_addr(ctx) - ctx.tls_begin, E::R_TPOFF}); } else { // Otherwise, we know the offset from the thread pointer (TP) at // link-time, so we can fill the GOT entry directly. add({idx, sym->get_addr(ctx) - ctx.tp_addr}); } } if (ctx.got->tlsld_idx != -1) { if (ctx.arg.shared) add({ctx.got->tlsld_idx, 0, E::R_DTPMOD}); else add({ctx.got->tlsld_idx, 1}); // 1 means the main executable } return entries; } template i64 GotSection::get_reldyn_size(Context &ctx) const { i64 n = 0; for (GotEntry &ent : get_got_entries(ctx)) if (!ent.is_relr(ctx) && ent.r_type != R_NONE) n++; return n; } // Fill .got and .rel.dyn. template void GotSection::copy_buf(Context &ctx) { Word *buf = (Word *)(ctx.buf + this->shdr.sh_offset); memset(buf, 0, this->shdr.sh_size); // s390x psABI requires GOT[0] to be set to the link-time value of _DYNAMIC. 
if constexpr (is_s390x) if (ctx.dynamic) buf[0] = ctx.dynamic->shdr.sh_addr; // ARM64 psABI doesn't say anything about GOT[0], but glibc/arm64's code // path for -static-pie wrongly assumed that GOT[0] refers to _DYNAMIC. // // https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=43d06ed218fc8be5 if constexpr (is_arm64) if (ctx.dynamic && ctx.arg.static_ && ctx.arg.pie) buf[0] = ctx.dynamic->shdr.sh_addr; ElfRel *rel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset + this->reldyn_offset); for (GotEntry &ent : get_got_entries(ctx)) { if (ent.is_relr(ctx) || ent.r_type == R_NONE) { buf[ent.idx] = ent.val; continue; } *rel++ = ElfRel(this->shdr.sh_addr + ent.idx * sizeof(Word), ent.r_type, ent.sym ? ent.sym->get_dynsym_idx(ctx) : 0, ent.val); if (ctx.arg.apply_dynamic_relocs) { // A single TLSDESC relocation fixes two consecutive GOT slots // where one slot holds a function pointer and the other an // argument to the function. An addend should be applied not to // the function pointer but to the function argument, which is // usually stored to the second slot. // // ARM32 employs the inverted layout for some reason, so an // addend is applied to the first slot. 
i64 i = ent.idx; if constexpr (supports_tlsdesc && !is_arm32) if (ent.r_type == E::R_TLSDESC) i = ent.idx + 1; buf[i] = ent.val; } } } template void GotSection::construct_relr(Context &ctx) { std::vector pos; for (GotEntry &ent : get_got_entries(ctx)) if (ent.is_relr(ctx)) pos.push_back(ent.idx * sizeof(Word)); this->relr = encode_relr(pos); } template void GotSection::compute_symtab_size(Context &ctx) { this->strtab_size = 0; this->num_local_symtab = 0; for (Symbol *sym : got_syms) { this->strtab_size += sym->name().size() + sizeof("$got"); this->num_local_symtab++; } for (Symbol *sym : gottp_syms) { this->strtab_size += sym->name().size() + sizeof("$gottp"); this->num_local_symtab++; } for (Symbol *sym : tlsgd_syms) { this->strtab_size += sym->name().size() + sizeof("$tlsgd"); this->num_local_symtab++; } for (Symbol *sym : tlsdesc_syms) { this->strtab_size += sym->name().size() + sizeof("$tlsdesc"); this->num_local_symtab++; } if (tlsld_idx != -1) { this->strtab_size += sizeof("$tlsld"); this->num_local_symtab++; } } template void GotSection::populate_symtab(Context &ctx) { if (this->num_local_symtab == 0) return; ElfSym *esym = (ElfSym *)(ctx.buf + ctx.symtab->shdr.sh_offset) + this->local_symtab_idx; u8 *strtab_base = ctx.buf + ctx.strtab->shdr.sh_offset; u8 *strtab = strtab_base + this->strtab_offset; auto write = [&](std::string_view name, std::string_view suffix, i64 value) { memset(esym, 0, sizeof(*esym)); esym->st_name = strtab - strtab_base; esym->st_type = STT_OBJECT; esym->st_shndx = this->shndx; esym->st_value = value; esym++; strtab += write_string(strtab, name) - 1; strtab += write_string(strtab, suffix); }; for (Symbol *sym : got_syms) write(sym->name(), "$got", sym->get_got_addr(ctx)); for (Symbol *sym : gottp_syms) write(sym->name(), "$gottp", sym->get_gottp_addr(ctx)); for (Symbol *sym : tlsgd_syms) write(sym->name(), "$tlsgd", sym->get_tlsgd_addr(ctx)); for (Symbol *sym : tlsdesc_syms) write(sym->name(), "$tlsdesc", sym->get_tlsdesc_addr(ctx)); 
if (tlsld_idx != -1) write("", "$tlsld", get_tlsld_addr(ctx)); } template void GotPltSection::update_shdr(Context &ctx) { this->shdr.sh_size = HDR_SIZE + ctx.plt->symbols.size() * ENTRY_SIZE; } template void GotPltSection::copy_buf(Context &ctx) { // On PPC64, it's dynamic loader responsibility to fill the .got.plt // section. Dynamic loader finds the address of the first PLT entry by // DT_PPC64_GLINK and assumes that each PLT entry is 4 bytes long. if constexpr (!is_ppc64) { Word *buf = (Word *)(ctx.buf + this->shdr.sh_offset); // The first slot of .got.plt points to _DYNAMIC, as requested by // the psABI. The second and the third slots are reserved by the psABI. static_assert(HDR_SIZE / sizeof(Word) == 3); buf[0] = ctx.dynamic ? (u64)ctx.dynamic->shdr.sh_addr : 0; buf[1] = 0; buf[2] = 0; for (i64 i = 0; i < ctx.plt->symbols.size(); i++) buf[i + 3] = ctx.plt->shdr.sh_addr; } } template void PltSection::add_symbol(Context &ctx, Symbol *sym) { assert(!sym->has_plt(ctx)); sym->set_plt_idx(ctx, symbols.size()); symbols.push_back(sym); ctx.dynsym->add_symbol(ctx, sym); } template void PltSection::update_shdr(Context &ctx) { if (symbols.empty()) this->shdr.sh_size = 0; else this->shdr.sh_size = to_plt_offset(symbols.size()); } template void PltSection::copy_buf(Context &ctx) { u8 *buf = ctx.buf + ctx.plt->shdr.sh_offset; write_plt_header(ctx, buf); for (i64 i = 0; i < symbols.size(); i++) write_plt_entry(ctx, buf + to_plt_offset(i), *symbols[i]); } template void PltSection::compute_symtab_size(Context &ctx) { this->num_local_symtab = symbols.size(); this->strtab_size = 0; for (Symbol *sym : symbols) this->strtab_size += sym->name().size() + sizeof("$plt"); if constexpr (is_arm32) this->num_local_symtab += symbols.size() * 2 + 2; } template void PltSection::populate_symtab(Context &ctx) { if (this->num_local_symtab == 0) return; ElfSym *esym = (ElfSym *)(ctx.buf + ctx.symtab->shdr.sh_offset) + this->local_symtab_idx; u8 *strtab_base = ctx.buf + 
ctx.strtab->shdr.sh_offset; u8 *strtab = strtab_base + this->strtab_offset; auto write_esym = [&](u64 addr, i64 st_name) { memset(esym, 0, sizeof(*esym)); esym->st_name = st_name; esym->st_type = STT_FUNC; esym->st_bind = STB_LOCAL; esym->st_shndx = this->shndx; esym->st_value = addr; esym++; }; if constexpr (is_arm32) { write_esym(this->shdr.sh_addr, ctx.strtab->ARM); write_esym(this->shdr.sh_addr + 16, ctx.strtab->DATA); } for (Symbol *sym : symbols) { u64 addr = sym->get_plt_addr(ctx); write_esym(addr, strtab - strtab_base); strtab += write_string(strtab, sym->name()) - 1; strtab += write_string(strtab, "$plt"); if constexpr (is_arm32) { write_esym(addr, ctx.strtab->ARM); write_esym(addr + 12, ctx.strtab->DATA); } } } template void PltGotSection::add_symbol(Context &ctx, Symbol *sym) { assert(!sym->has_plt(ctx)); assert(sym->has_got(ctx)); sym->set_pltgot_idx(ctx, symbols.size()); symbols.push_back(sym); this->shdr.sh_size = symbols.size() * E::pltgot_size; } template void PltGotSection::copy_buf(Context &ctx) { u8 *buf = ctx.buf + ctx.pltgot->shdr.sh_offset; for (i64 i = 0; i < symbols.size(); i++) write_pltgot_entry(ctx, buf + i * E::pltgot_size, *symbols[i]); } template void PltGotSection::compute_symtab_size(Context &ctx) { this->num_local_symtab = symbols.size(); this->strtab_size = 0; for (Symbol *sym : symbols) this->strtab_size += sym->name().size() + sizeof("$pltgot"); if constexpr (is_arm32) this->num_local_symtab += symbols.size() * 2; } template void PltGotSection::populate_symtab(Context &ctx) { if (this->num_local_symtab == 0) return; ElfSym *esym = (ElfSym *)(ctx.buf + ctx.symtab->shdr.sh_offset) + this->local_symtab_idx; u8 *strtab_base = ctx.buf + ctx.strtab->shdr.sh_offset; u8 *strtab = strtab_base + this->strtab_offset; auto write_esym = [&](u64 addr, i64 st_name) { memset(esym, 0, sizeof(*esym)); esym->st_name = st_name; esym->st_type = STT_FUNC; esym->st_shndx = this->shndx; esym->st_value = addr; esym++; }; for (Symbol *sym : symbols) { u64 
addr = sym->get_plt_addr(ctx); write_esym(addr, strtab - strtab_base); strtab += write_string(strtab, sym->name()) - 1; strtab += write_string(strtab, "$pltgot"); if constexpr (is_arm32) { write_esym(addr, ctx.strtab->ARM); write_esym(addr + 12, ctx.strtab->DATA); } } } template void RelPltSection::update_shdr(Context &ctx) { this->shdr.sh_size = ctx.plt->symbols.size() * sizeof(ElfRel); this->shdr.sh_link = ctx.dynsym->shndx; if (!is_sparc) this->shdr.sh_info = ctx.gotplt->shndx; } template void RelPltSection::copy_buf(Context &ctx) { ElfRel *buf = (ElfRel *)(ctx.buf + this->shdr.sh_offset); for (Symbol *sym : ctx.plt->symbols) { // SPARC doesn't have a .got.plt because its role is merged to .plt. // On SPARC, .plt is writable (!) and the dynamic linker directly // modifies .plt's machine instructions as it resolves dynamic symbols. // Therefore, it doesn't need a separate section to store the symbol // resolution results. That is of course horrible from the security // point of view, though. u64 addr = is_sparc ? sym->get_plt_addr(ctx) : sym->get_gotplt_addr(ctx); *buf++ = ElfRel(addr, E::R_JUMP_SLOT, sym->get_dynsym_idx(ctx), 0); } } // RISC-V and LoongArch have code-shrinking linker relaxation. If we // have removed instructions from a function, we need to update its // size as well. 
template static u64 get_symbol_size(Symbol &sym) { const ElfSym &esym = sym.esym(); if constexpr (is_riscv || is_loongarch) if (esym.st_size) if (InputSection *isec = sym.get_input_section()) if (isec->shdr().sh_flags & SHF_EXECINSTR) return esym.st_size + esym.st_value - sym.value - get_r_delta(*isec, esym.st_value + esym.st_size); return esym.st_size; } template std::optional> to_output_esym(Context &ctx, Symbol &sym, u32 st_name, U32 *shn_xindex) { ElfSym esym; memset(&esym, 0, sizeof(esym)); esym.st_name = st_name; esym.st_type = sym.get_type(); esym.st_size = get_symbol_size(sym); if (sym.is_local(ctx)) esym.st_bind = STB_LOCAL; else if (sym.is_weak) esym.st_bind = STB_WEAK; else if (sym.file->is_dso) esym.st_bind = STB_GLOBAL; else esym.st_bind = sym.esym().st_bind; if constexpr (is_arm64) esym.arm64_variant_pcs = sym.esym().arm64_variant_pcs; if constexpr (is_riscv) esym.riscv_variant_cc = sym.esym().riscv_variant_cc; if constexpr (is_ppc64v2) esym.ppc64_local_entry = sym.esym().ppc64_local_entry; auto get_st_shndx = [&](Symbol &sym) -> u32 { if (SectionFragment *frag = sym.get_frag()) if (frag->is_alive) return frag->output_section.shndx; if constexpr (is_ppc64v1) if (sym.has_opd(ctx)) return ctx.extra.opd->shndx; if (InputSection *isec = sym.get_input_section()) { if (isec->is_alive) return isec->output_section->shndx; if (isec->icf_removed()) return isec->leader->output_section->shndx; } return SHN_UNDEF; }; i64 shndx = -1; InputSection *isec = sym.get_input_section(); if (sym.has_copyrel) { // Symbol in .copyrel shndx = sym.is_copyrel_readonly ? 
ctx.copyrel_relro->shndx : ctx.copyrel->shndx; esym.st_value = sym.get_addr(ctx); } else if (sym.file->is_dso || sym.esym().is_undef()) { // Undefined symbol in a DSO esym.st_shndx = SHN_UNDEF; esym.st_size = 0; if (sym.is_canonical) esym.st_value = sym.get_plt_addr(ctx); } else if (Chunk *osec = sym.get_output_section()) { // Linker-synthesized symbols shndx = osec->shndx; esym.st_value = sym.get_addr(ctx); } else if (SectionFragment *frag = sym.get_frag()) { // Section fragment shndx = frag->output_section.shndx; esym.st_value = sym.get_addr(ctx); } else if (!isec) { // Absolute symbol esym.st_shndx = SHN_ABS; esym.st_value = sym.get_addr(ctx); } else if (sym.get_type() == STT_TLS) { // TLS symbol shndx = get_st_shndx(sym); esym.st_value = sym.get_addr(ctx) - ctx.tls_begin; } else if (sym.is_pde_ifunc(ctx) && sym.has_plt(ctx)) { // IFUNC symbol in PDE that uses two GOT slots shndx = get_st_shndx(sym); esym.st_type = STT_FUNC; esym.st_visibility = sym.visibility; esym.st_value = sym.get_plt_addr(ctx); } else if ((isec->shdr().sh_flags & SHF_MERGE) && !(isec->shdr().sh_flags & SHF_ALLOC)) { // Symbol in a mergeable non-SHF_ALLOC section, such as .debug_str ObjectFile *file = (ObjectFile *)sym.file; MergeableSection &m = *file->mergeable_sections[file->get_shndx(sym.esym())]; SectionFragment *frag; i64 frag_addend; std::tie(frag, frag_addend) = m.get_fragment(sym.esym().st_value); shndx = m.parent.shndx; esym.st_visibility = sym.visibility; esym.st_value = frag->get_addr(ctx) + frag_addend; } else { // Symbol in a regular section shndx = get_st_shndx(sym); esym.st_visibility = sym.visibility; esym.st_value = sym.get_addr(ctx, NO_PLT); } // Symbol's st_shndx is only 16 bits wide, so we can't store a large // section index there. If the total number of sections is equal to // or greater than SHN_LORESERVE (= 65280), the real index is stored // to a SHT_SYMTAB_SHNDX section which contains a parallel array of // the symbol table. 
if (0 <= shndx && shndx < SHN_LORESERVE) { esym.st_shndx = shndx; } else if (SHN_LORESERVE <= shndx) { if (!shn_xindex) return {}; esym.st_shndx = SHN_XINDEX; *shn_xindex = shndx; } return esym; } template void DynsymSection::add_symbol(Context &ctx, Symbol *sym) { if (symbols.empty()) symbols.resize(1); if (sym->get_dynsym_idx(ctx) == -1) { sym->set_dynsym_idx(ctx, -2); symbols.push_back(sym); } } template void DynsymSection::update_shdr(Context &ctx) { this->shdr.sh_link = ctx.dynstr->shndx; this->shdr.sh_size = sizeof(ElfSym) * symbols.size(); } template void DynsymSection::copy_buf(Context &ctx) { ElfSym *buf = (ElfSym *)(ctx.buf + this->shdr.sh_offset); i64 offset = dynstr_offset; memset(buf, 0, sizeof(ElfSym)); for (i64 i = 1; i < symbols.size(); i++) { Symbol &sym = *symbols[i]; std::optional> esym = to_output_esym(ctx, sym, offset, nullptr); if (!esym) { Error(ctx) << ctx.arg.output << ": .dynsym: too many output sections: " << (ctx.shdr->shdr.sh_size / sizeof(ElfShdr)) << " requested, but ELF allows at most 65279"; return; } buf[sym.get_dynsym_idx(ctx)] = *esym; offset += sym.name().size() + 1; } } template void HashSection::update_shdr(Context &ctx) { if (ctx.dynsym->symbols.empty()) return; i64 header_size = sizeof(Entry) * 2; i64 num_slots = ctx.dynsym->symbols.size(); this->shdr.sh_size = header_size + num_slots * sizeof(Entry) * 2; this->shdr.sh_link = ctx.dynsym->shndx; } template void HashSection::copy_buf(Context &ctx) { u8 *base = ctx.buf + this->shdr.sh_offset; memset(base, 0, this->shdr.sh_size); std::span *> syms = ctx.dynsym->symbols; Entry *hdr = (Entry *)base; Entry *buckets = hdr + 2; Entry *chains = buckets + syms.size(); hdr[0] = syms.size(); hdr[1] = syms.size(); for (Symbol *sym : syms.subspan(1)) { i64 i = sym->get_dynsym_idx(ctx); i64 h = elf_hash(sym->name()) % syms.size(); chains[i] = buckets[h]; buckets[h] = i; } } template void GnuHashSection::update_shdr(Context &ctx) { if (ctx.dynsym->symbols.empty()) return; // We allocate 12 
bits for each symbol in the bloom filter. num_bloom = bit_ceil((num_exported * 12) / (sizeof(Word) * 8)); this->shdr.sh_size = HEADER_SIZE; // Header this->shdr.sh_size += num_bloom * sizeof(Word); // Bloom filter this->shdr.sh_size += num_buckets * 4; // Hash buckets this->shdr.sh_size += num_exported * 4; // Hash values this->shdr.sh_link = ctx.dynsym->shndx; } template void GnuHashSection::copy_buf(Context &ctx) { u8 *base = ctx.buf + this->shdr.sh_offset; memset(base, 0, this->shdr.sh_size); i64 first_exported = ctx.dynsym->symbols.size() - num_exported; *(U32 *)base = num_buckets; *(U32 *)(base + 4) = first_exported; *(U32 *)(base + 8) = num_bloom; *(U32 *)(base + 12) = BLOOM_SHIFT; std::span *> syms = ctx.dynsym->symbols; syms = syms.subspan(first_exported); if (syms.empty()) return; // Write a bloom filter Word *bloom = (Word *)(base + HEADER_SIZE); std::vector indices(num_exported); for (i64 i = 0; i < syms.size(); i++) { constexpr i64 word_bits = sizeof(Word) * 8; u32 h = syms[i]->get_djb_hash(ctx); indices[i] = h % num_buckets; i64 idx = (h / word_bits) % num_bloom; bloom[idx] |= 1LL << (h % word_bits); bloom[idx] |= 1LL << ((h >> BLOOM_SHIFT) % word_bits); } // Write hash bucket indices U32 *buckets = (U32 *)(bloom + num_bloom); for (i64 i = syms.size() - 1; i >= 0; i--) buckets[indices[i]] = first_exported + i; // Write a hash table U32 *table = buckets + num_buckets; for (i64 i = 0; i < syms.size(); i++) { // The last entry in a chain must be terminated with an entry with // least-significant bit 1. 
u32 h = syms[i]->get_djb_hash(ctx); if (i == syms.size() - 1 || indices[i] != indices[i + 1]) table[i] = h | 1; else table[i] = h & ~1; } } template std::string_view get_merged_output_name(Context &ctx, std::string_view name, u64 flags, i64 entsize, i64 addralign) { if (ctx.arg.relocatable && !ctx.arg.relocatable_merge_sections) return name; if (!ctx.arg.unique.empty() && ctx.arg.unique.find(name) != -1) return name; // GCC seems to create sections named ".rodata.strN..M" // or ".rodata.cst. MergedSection::MergedSection(std::string_view name, i64 flags, i64 type, i64 entsize) { this->name = name; this->shdr.sh_flags = flags; this->shdr.sh_type = type; this->shdr.sh_entsize = entsize; } template MergedSection * MergedSection::get_instance(Context &ctx, std::string_view name, const ElfShdr &shdr) { if (!(shdr.sh_flags & SHF_MERGE)) return nullptr; i64 addralign = std::max(1, shdr.sh_addralign); i64 flags = shdr.sh_flags & ~(u64)SHF_GROUP & ~(u64)SHF_COMPRESSED; i64 entsize = shdr.sh_entsize; if (entsize == 0) entsize = (shdr.sh_flags & SHF_STRINGS) ? 1 : (i64)shdr.sh_addralign; if (entsize == 0) return nullptr; name = get_merged_output_name(ctx, name, flags, entsize, addralign); auto find = [&]() -> MergedSection * { for (std::unique_ptr> &osec : ctx.merged_sections) if (name == osec->name && flags == osec->shdr.sh_flags && shdr.sh_type == osec->shdr.sh_type && entsize == osec->shdr.sh_entsize) return osec.get(); return nullptr; }; // Search for an exiting output section. static std::shared_mutex mu; { std::shared_lock lock(mu); if (MergedSection *osec = find()) return osec; } // Create a new output section. 
std::unique_lock lock(mu); if (MergedSection *osec = find()) return osec; MergedSection *osec = new MergedSection(name, flags, shdr.sh_type, entsize); ctx.merged_sections.emplace_back(osec); return osec; } template SectionFragment * MergedSection::insert(Context &ctx, std::string_view data, u64 hash, i64 p2align) { // Even if GC is enabled, we garbage-collect only memory-mapped strings. // Non-memory-allocated strings are typically identifiers used by debug info. // To remove such strings, use the `strip` command. bool is_alive = !ctx.arg.gc_sections || !(this->shdr.sh_flags & SHF_ALLOC); SectionFragment *frag = map.insert(data, hash, SectionFragment(this, is_alive)).first; update_maximum(frag->p2align, p2align); return frag; } template static std::string get_cmdline_args(Context &ctx) { std::stringstream ss; ss << ctx.cmdline_args[1]; for (i64 i = 2; i < ctx.cmdline_args.size(); i++) ss << " " << ctx.cmdline_args[i]; return ss.str(); } // Add strings to .comment template static void add_comment_strings(Context &ctx) { auto add = [&](std::string str) { std::string_view buf = save_string(ctx, str); std::string_view data(buf.data(), buf.size() + 1); ctx.comment->insert(ctx, data, hash_string(data), 0); }; // Add an identification string to .comment. add(mold_version); // Embed command line arguments for debugging. 
char *env = getenv("MOLD_DEBUG"); if (env && env[0]) add("mold command line: " + get_cmdline_args(ctx)); } template void MergedSection::resolve(Context &ctx) { tbb::parallel_for_each(members, [&](MergeableSection *sec) { sec->split_contents(ctx); }); // We aim 2/3 occupation ratio map.resize(estimator.get_cardinality() * 3 / 2); tbb::parallel_for_each(members, [&](MergeableSection *sec) { sec->resolve_contents(ctx); }); if (this == ctx.comment) add_comment_strings(ctx); // Compute section alignment u32 p2align = 0; for (MergeableSection *sec : members) p2align = std::max(p2align, sec->p2align); this->shdr.sh_addralign = 1 << p2align; resolved = true; } template void MergedSection::compute_section_size(Context &ctx) { if (!resolved) resolve(ctx); std::vector sizes(map.NUM_SHARDS * 2); tbb::parallel_for((i64)0, map.NUM_SHARDS, [&](i64 i) { using Entry = typename decltype(map)::Entry; std::vector entries = map.get_sorted_entries(i); i64 off1 = 0; i64 off2 = 0; for (Entry *ent : entries) { SectionFragment &frag = ent->value; if (frag.is_alive) { if (frag.is_32bit) { off1 = align_to(off1, 1 << frag.p2align); frag.offset = off1; off1 += ent->keylen; } else { off2 = align_to(off2, 1 << frag.p2align); frag.offset = off2; off2 += ent->keylen; } } } sizes[i] = off1; sizes[i + map.NUM_SHARDS] = off2; }); i64 shard_size = map.nbuckets / map.NUM_SHARDS; shard_offsets.resize(sizes.size() + 1); for (i64 i = 1; i < sizes.size() + 1; i++) shard_offsets[i] = align_to(shard_offsets[i - 1] + sizes[i - 1], this->shdr.sh_addralign); this->shdr.sh_size = shard_offsets.back(); tbb::parallel_for((i64)1, map.NUM_SHARDS, [&](i64 i) { for (i64 j = shard_size * i; j < shard_size * (i + 1); j++) { SectionFragment &frag = map.entries[j].value; if (frag.is_alive) { if (frag.is_32bit) frag.offset += shard_offsets[i]; else frag.offset += shard_offsets[i + map.NUM_SHARDS]; } } }); } template void MergedSection::copy_buf(Context &ctx) { write_to(ctx, ctx.buf + this->shdr.sh_offset); } template void 
MergedSection::write_to(Context &ctx, u8 *buf) {
  i64 shard_size = map.nbuckets / map.NUM_SHARDS;

  // Each task handles one map shard: it clears the shard's two output
  // areas if necessary and then copies the shard's live fragments.
  tbb::parallel_for((i64)0, map.NUM_SHARDS, [&](i64 i) {
    // There might be gaps between strings to satisfy alignment requirements.
    // If that's the case, we need to zero-clear them.
    if (this->shdr.sh_addralign > 1 &&
        this->shdr.sh_addralign != this->shdr.sh_entsize) {
      memset(buf + shard_offsets[i], 0, shard_offsets[i + 1] - shard_offsets[i]);

      // The shard's second area starts NUM_SHARDS slots further on.
      i64 j = map.NUM_SHARDS + i;
      memset(buf + shard_offsets[j], 0, shard_offsets[j + 1] - shard_offsets[j]);
    }

    // Copy strings
    // Only buckets that hold a key and whose fragment is alive are written.
    for (i64 j = shard_size * i; j < shard_size * (i + 1); j++)
      if (const char *key = map.entries[j].key)
        if (SectionFragment &frag = map.entries[j].value; frag.is_alive)
          memcpy(buf + frag.offset, key, map.entries[j].keylen);
  });
}

// Prints the estimated and the actual number of occupied map buckets.
template void MergedSection::print_stats(Context &ctx) {
  i64 used = 0;
  for (i64 i = 0; i < map.nbuckets; i++)
    if (map.entries[i].key)
      used++;

  Out(ctx) << this->name
           << " estimation=" << estimator.get_cardinality()
           << " actual=" << used;
}

// Lays out .eh_frame: unique CIEs first, then each file's FDEs, then a
// 4-byte terminator.
template void EhFrameSection::construct(Context &ctx) {
  Timer t(ctx, "eh_frame");

  // Remove dead FDEs and assign them offsets within their corresponding
  // CIE group.
  tbb::parallel_for_each(ctx.objs, [](ObjectFile *file) {
    std::erase_if(file->fdes, [](FdeRecord &fde) { return !fde.is_alive; });

    i64 offset = 0;
    for (FdeRecord &fde : file->fdes) {
      fde.output_offset = offset;
      offset += fde.size(*file);
    }
    file->fde_size = offset;
  });

  // Uniquify CIEs and assign offsets to them.
  std::vector *> leaders;

  // Returns a previously seen CIE equal to `cie`, or nullptr if none.
  auto find_leader = [&](CieRecord &cie) -> CieRecord * {
    for (CieRecord *leader : leaders)
      if (cie_equals(*leader, cie))
        return leader;
    return nullptr;
  };

  i64 offset = 0;
  for (ObjectFile *file : ctx.objs) {
    for (CieRecord &cie : file->cies) {
      if (CieRecord *leader = find_leader(cie)) {
        // Duplicate: share the leader's location and occupy no space.
        cie.output_offset = leader->output_offset;
      } else {
        cie.output_offset = offset;
        cie.is_leader = true;
        offset += cie.size();
        leaders.push_back(&cie);
      }
    }
  }

  // Assign FDE offsets to files.
  // fde_idx is the running count of FDEs in the preceding files.
  i64 idx = 0;
  for (ObjectFile *file : ctx.objs) {
    file->fde_idx = idx;
    idx += file->fdes.size();

    file->fde_offset = offset;
    offset += file->fde_size;
  }

  // .eh_frame must end with a null word.
  this->shdr.sh_size = offset + 4;
}

// Write to .eh_frame and .eh_frame_hdr.
template void EhFrameSection::copy_buf(Context &ctx) {
  u8 *base = ctx.buf + this->shdr.sh_offset;

  // One entry of the table that follows the .eh_frame_hdr header.
  struct HdrEntry {
    I32 init_addr;
    I32 fde_addr;
  };

  // Stays null if there is no .eh_frame_hdr section.
  HdrEntry *eh_hdr = nullptr;
  if (ctx.eh_frame_hdr)
    eh_hdr = (HdrEntry *)(ctx.buf + ctx.eh_frame_hdr->shdr.sh_offset +
                          EhFrameHdrSection::HEADER_SIZE);

  tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) {
    // Copy CIEs.
    for (CieRecord &cie : file->cies) {
      // Only the leader of a group of equal CIEs is written.
      if (!cie.is_leader)
        continue;

      std::string_view contents = cie.get_contents();
      memcpy(base + cie.output_offset, contents.data(), contents.size());

      // For relocatable output, relocations are not applied here.
      if (ctx.arg.relocatable)
        continue;

      for (const ElfRel &rel : cie.get_rels()) {
        assert(rel.r_offset - cie.input_offset < contents.size());

        Symbol &sym = *file->symbols[rel.r_sym];
        u64 loc = cie.output_offset + rel.r_offset - cie.input_offset;
        u64 val = sym.get_addr(ctx) + get_addend(cie.input_section, rel);
        apply_eh_reloc(ctx, rel, loc, val);
      }
    }

    // Copy FDEs.
    for (i64 i = 0; i < file->fdes.size(); i++) {
      FdeRecord &fde = file->fdes[i];
      std::span> rels = fde.get_rels(*file);
      i64 offset = file->fde_offset + fde.output_offset;

      std::string_view contents = fde.get_contents(*file);
      memcpy(base + offset, contents.data(), contents.size());

      // The word at offset 4 of an FDE is rewritten to the distance from
      // that word back to the FDE's CIE in the output.
      CieRecord &cie = file->cies[fde.cie_idx];
      *(U32 *)(base + offset + 4) = offset + 4 - cie.output_offset;

      if (ctx.arg.relocatable)
        continue;

      for (i64 j = 0; j < rels.size(); j++) {
        const ElfRel &rel = rels[j];
        assert(rel.r_offset - fde.input_offset < contents.size());

        Symbol &sym = *file->symbols[rel.r_sym];
        u64 loc = offset + rel.r_offset - fde.input_offset;
        u64 val = sym.get_addr(ctx) + get_addend(cie.input_section, rel);
        apply_eh_reloc(ctx, rel, loc, val);

        // The value of an FDE's first relocation is used as the key of
        // its .eh_frame_hdr entry.
        if (j == 0 && eh_hdr) {
          // Write to .eh_frame_hdr
          HdrEntry &ent = eh_hdr[file->fde_idx + i];
          u64 origin = ctx.eh_frame_hdr->shdr.sh_addr;
          ent.init_addr = val - origin;
          ent.fde_addr = this->shdr.sh_addr + offset - origin;
        }
      }
    }
  });

  // Write a terminator.
  *(U32 *)(base + this->shdr.sh_size - 4) = 0;

  // Sort .eh_frame_hdr contents.
  if (eh_hdr) {
    tbb::parallel_sort(eh_hdr, eh_hdr + ctx.eh_frame_hdr->num_fdes,
                       [](const HdrEntry &a, const HdrEntry &b) {
      return a.init_addr < b.init_addr;
    });
  }
}

// Counts the FDEs of all object files; each takes 8 bytes in the table.
template void EhFrameHdrSection::update_shdr(Context &ctx) {
  num_fdes = 0;
  for (ObjectFile *file : ctx.objs)
    num_fdes += file->fdes.size();
  this->shdr.sh_size = HEADER_SIZE + num_fdes * 8;
}

template void EhFrameHdrSection::copy_buf(Context &ctx) {
  u8 *base = ctx.buf + this->shdr.sh_offset;

  // Write a header. The actual table is written by EhFrameSection::copy_buf.
  base[0] = 1;
  base[1] = DW_EH_PE_pcrel | DW_EH_PE_sdata4;
  base[2] = DW_EH_PE_udata4;
  base[3] = DW_EH_PE_datarel | DW_EH_PE_sdata4;

  // Address of .eh_frame relative to this field, which is at offset 4.
  *(U32 *)(base + 4) = ctx.eh_frame->shdr.sh_addr - this->shdr.sh_addr - 4;
  *(U32 *)(base + 8) = num_fdes;
}

// Sizes the relocation section for .eh_frame: one output relocation per
// relocation of every leader CIE and of every FDE.
template void EhFrameRelocSection::update_shdr(Context &ctx) {
  tbb::enumerable_thread_specific count;

  tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) {
    for (CieRecord &cie : file->cies)
      if (cie.is_leader)
        count.local() += cie.get_rels().size();

    for (FdeRecord &fde : file->fdes)
      count.local() += fde.get_rels(*file).size();
  });

  this->shdr.sh_size = count.combine(std::plus()) * sizeof(ElfRel);
  this->shdr.sh_link = ctx.symtab->shndx;
  this->shdr.sh_info = ctx.eh_frame->shndx;
}

// Writes the relocations counted by update_shdr(), in the same order.
template void EhFrameRelocSection::copy_buf(Context &ctx) {
  ElfRel *buf = (ElfRel *)(ctx.buf + this->shdr.sh_offset);

  // Emits the output form of relocation `r`, which applies at `offset`
  // within the output .eh_frame, and advances the output cursor.
  auto copy = [&](ObjectFile &file, InputSection &isec,
                  const ElfRel &r, u64 offset) {
    Symbol &sym = *file.symbols[r.r_sym];
    memset(buf, 0, sizeof(*buf));

    if (sym.esym().st_type == STT_SECTION) {
      // We discard section symbols in input files and re-create new
      // ones for each output section. So we need to adjust relocations'
      // addends if they refer a section symbol.
      InputSection *target = sym.get_input_section();
      buf->r_sym = target->output_section->shndx;

      // Without an addend field in the record, the adjusted addend is
      // written into the .eh_frame contents instead.
      if constexpr (E::is_rela)
        buf->r_addend = get_addend(isec, r) + target->offset;
      else if (ctx.arg.relocatable)
        write_addend(ctx.buf + ctx.eh_frame->shdr.sh_offset + offset,
                     get_addend(isec, r) + target->offset, r);
    } else {
      buf->r_sym = sym.get_output_sym_idx(ctx);
      if constexpr (E::is_rela)
        buf->r_addend = get_addend(isec, r);
    }

    buf->r_offset = ctx.eh_frame->shdr.sh_addr + offset;
    buf->r_type = r.r_type;
    buf++;
  };

  for (ObjectFile *file : ctx.objs) {
    for (CieRecord &cie : file->cies)
      if (cie.is_leader)
        for (const ElfRel &rel : cie.get_rels())
          copy(*file, cie.input_section, rel,
               cie.output_offset + rel.r_offset - cie.input_offset);

    for (FdeRecord &fde : file->fdes) {
      i64 offset = file->fde_offset + fde.output_offset;
      for (const ElfRel &rel : fde.get_rels(*file))
        copy(*file, file->cies[fde.cie_idx].input_section, rel,
             offset + rel.r_offset - fde.input_offset);
    }
  }
}

// Reserves space in this section for a copy of the DSO symbol `sym`.
// Reports an error and adds nothing if a copy relocation cannot be
// created for the symbol.
template void CopyrelSection::add_symbol(Context &ctx, Symbol *sym) {
  assert(!ctx.arg.shared);

  if (sym->has_copyrel)
    return;

  if (!sym->file->is_dso) {
    assert(sym->esym().is_undef_weak());
    Error(ctx) << *sym->file << ": cannot create a copy relocation for "
               << *sym <<"; recompile with -fPIE or -fPIC";
    return;
  }

  if (sym->esym().st_visibility == STV_PROTECTED) {
    Error(ctx) << *sym->file
               << ": cannot create a copy relocation for protected symbol '"
               << *sym << "'; recompile with -fPIC";
    return;
  }

  if (!ctx.arg.z_copyreloc) {
    Error(ctx) << "-z nocopyreloc: " << *sym->file
               << ": cannot create a copy relocation for symbol '" << *sym
               << "'; recompile with -fPIC";
    return;
  }

  symbols.push_back(sym);

  // Append the symbol at the next suitably aligned offset.
  SharedFile &file = *(SharedFile *)sym->file;
  i64 alignment = file.get_alignment(sym);
  u64 offset = align_to(this->shdr.sh_size, alignment);

  this->shdr.sh_size = offset + sym->esym().st_size;
  this->shdr.sh_addralign = std::max(alignment, this->shdr.sh_addralign);

  // We need to create dynamic symbols not only for this particular symbol
  // but also for its aliases (i.e. other symbols at the same address)
  // because otherwise the aliases are broken apart at runtime.
  // For example, `environ`, `_environ` and `__environ` in libc.so are
  // aliases. If one of the symbols is copied by a copy relocation, other
  // symbols have to refer to the copied place as well.
  for (Symbol *sym2 : file.get_symbols_at(sym)) {
    sym2->add_aux(ctx);
    sym2->is_imported = true;
    sym2->is_exported = true;
    sym2->has_copyrel = true;
    sym2->is_copyrel_readonly = this->is_relro;
    sym2->value = offset;
    ctx.dynsym->add_symbol(ctx, sym2);
  }
}

// Writes one R_COPY relocation per copied symbol into this section's
// slice of the dynamic relocation section.
template void CopyrelSection::copy_buf(Context &ctx) {
  ElfRel *rel = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
                           this->reldyn_offset);

  for (Symbol *sym : symbols)
    *rel++ = ElfRel(sym->get_addr(ctx), E::R_COPY, sym->get_dynsym_idx(ctx), 0);
}

// .gnu.version section contains version indices as a parallel array for
// .dynsym. If a dynamic symbol is a defined one, its version information
// is in .gnu.version_d. Otherwise, it's in .gnu.version_r.
template void VersymSection::update_shdr(Context &ctx) {
  this->shdr.sh_size = contents.size() * sizeof(contents[0]);
  this->shdr.sh_link = ctx.dynsym->shndx;
}

template void VersymSection::copy_buf(Context &ctx) {
  write_vector(ctx.buf + this->shdr.sh_offset, contents);
}

// If `-z pack-relative-relocs` is specified, we'll create a .relr.dyn
// section and store base relocation records to that section instead of
// to the usual .rela.dyn section.
//
// .relr.dyn is relatively new feature and not supported by glibc until
// 2.38 which was released in 2022. If we don't do anything, executables
// built with `-z pack-relative-relocs` would just crash immediately on
// startup with an older version of glibc.
//
// As a workaround, we'll add a dependency to a dummy version name
// "GLIBC_ABI_DT_RELR" if `-z pack-relative-relocs` is given so that
// executables built with the option fail with a more friendly "version
// `GLIBC_ABI_DT_RELR' not found" error message. glibc 2.38 or later knows
// about this dummy version name and simply ignores it.
template static bool is_glibc2(SharedFile &file) {
  // True if the soname begins with "libc.so." and the file has at least
  // one version string beginning with "GLIBC_2.".
  if (file.soname.starts_with("libc.so."))
    for (std::string_view str : file.version_strings)
      if (str.starts_with("GLIBC_2."))
        return true;
  return false;
}

// Builds the contents of .gnu.version_r and fills in the corresponding
// entries of .gnu.version.
template void VerneedSection::construct(Context &ctx) {
  Timer t(ctx, "fill_verneed");

  // Create a list of versioned symbols and sort by file and version.
  std::vector *> syms;
  for (i64 i = 1; i < ctx.dynsym->symbols.size(); i++) {
    Symbol &sym = *ctx.dynsym->symbols[i];
    if (sym.file->is_dso && VER_NDX_LAST_RESERVED < sym.ver_idx)
      syms.push_back(&sym);
  }

  if (syms.empty())
    return;

  ranges::stable_sort(syms, {}, [](Symbol *x) {
    return std::tuple{((SharedFile *)x->file)->soname, x->ver_idx};
  });

  // Resize .gnu.version
  ctx.versym->contents.resize(ctx.dynsym->symbols.size(), VER_NDX_GLOBAL);
  ctx.versym->contents[0] = VER_NDX_LOCAL;

  // Allocate a large enough buffer for .gnu.version_r.
  contents.resize((sizeof(ElfVerneed) + sizeof(ElfVernaux)) *
                  (syms.size() + 1));

  // Fill .gnu.version_r.
  // `ptr` is the write cursor; `verneed` and `aux` point at the most
  // recently written records so that they can be linked to the next one.
  u8 *buf = (u8 *)&contents[0];
  u8 *ptr = buf;
  ElfVerneed *verneed = nullptr;
  ElfVernaux *aux = nullptr;

  // New version indices are numbered after the reserved ones and after
  // the indices of this output's own version definitions.
  i64 veridx = VER_NDX_LAST_RESERVED + ctx.arg.version_definitions.size();

  // Appends a version record for `verstr` to the current file group and
  // assigns it the next version index.
  auto add_entry = [&](std::string_view verstr) {
    verneed->vn_cnt++;

    if (aux)
      aux->vna_next = sizeof(ElfVernaux);
    aux = (ElfVernaux *)ptr;
    ptr += sizeof(ElfVernaux);

    aux->vna_hash = elf_hash(verstr);
    aux->vna_other = ++veridx;
    aux->vna_name = ctx.dynstr->add_string(verstr);
  };

  // Starts the group of version records for shared library `file`.
  auto start_group = [&](SharedFile &file) {
    this->shdr.sh_info++;

    // Link the previous group to this one.
    if (verneed)
      verneed->vn_next = ptr - (u8 *)verneed;

    verneed = (ElfVerneed *)ptr;
    ptr += sizeof(ElfVerneed);
    verneed->vn_version = 1;
    verneed->vn_file = ctx.dynstr->find_string(file.soname);
    verneed->vn_aux = sizeof(ElfVerneed);
    aux = nullptr;

    if (ctx.arg.pack_dyn_relocs_relr && is_glibc2(file))
      add_entry("GLIBC_ABI_DT_RELR");
  };

  // Create version entries.
  // Because `syms` is sorted, a new group or entry is needed only when
  // the file or the version differs from the previous symbol's.
  for (i64 i = 0; i < syms.size(); i++) {
    if (i == 0 || syms[i - 1]->file != syms[i]->file) {
      start_group(*(SharedFile *)syms[i]->file);
      add_entry(syms[i]->get_version());
    } else if (syms[i - 1]->ver_idx != syms[i]->ver_idx) {
      add_entry(syms[i]->get_version());
    }

    ctx.versym->contents[syms[i]->get_dynsym_idx(ctx)] = veridx;
  }

  // Resize .gnu.version_r to fit to its contents.
  contents.resize(ptr - buf);
}

template void VerneedSection::update_shdr(Context &ctx) {
  this->shdr.sh_size = contents.size();
  this->shdr.sh_link = ctx.dynstr->shndx;
}

template void VerneedSection::copy_buf(Context &ctx) {
  write_vector(ctx.buf + this->shdr.sh_offset, contents);
}

// Builds the contents of .gnu.version_d and fills in the corresponding
// entries of .gnu.version. Does nothing if no version is defined.
template void VerdefSection::construct(Context &ctx) {
  Timer t(ctx, "fill_verdef");

  if (ctx.arg.version_definitions.empty())
    return;

  // Handle --default-symver
  // Symbols defined by the output that have no specific version get the
  // first index after the reserved ones.
  if (ctx.arg.default_symver)
    for (Symbol *sym : ctx.dynsym->symbols)
      if (sym && !sym->file->is_dso)
        if (u16 ver = sym->ver_idx;
            ver == VER_NDX_GLOBAL || ver == VER_NDX_UNSPECIFIED)
          sym->ver_idx = VER_NDX_LAST_RESERVED + 1;

  // Resize .gnu.version and write to it
  ctx.versym->contents.resize(ctx.dynsym->symbols.size(), VER_NDX_GLOBAL);
  ctx.versym->contents[0] = VER_NDX_LOCAL;

  for (Symbol *sym : ctx.dynsym->symbols)
    if (sym && !sym->file->is_dso && sym->ver_idx != VER_NDX_UNSPECIFIED)
      ctx.versym->contents[sym->get_dynsym_idx(ctx)] = sym->ver_idx;

  // Allocate a buffer for .gnu.version_d and write to it
  // One record pair per defined version plus one for the base entry.
  contents.resize((sizeof(ElfVerdef) + sizeof(ElfVerdaux)) *
                  (ctx.arg.version_definitions.size() + 1));

  u8 *ptr = (u8 *)contents.data();
  ElfVerdef *verdef = nullptr;

  // Appends a definition record for `verstr` followed by its single
  // auxiliary record, and links the previous definition to it.
  auto write = [&](std::string_view verstr, i64 idx, i64 flags) {
    this->shdr.sh_info++;
    if (verdef)
      verdef->vd_next = ptr - (u8 *)verdef;

    verdef = (ElfVerdef *)ptr;
    ptr += sizeof(ElfVerdef);

    verdef->vd_version = 1;
    verdef->vd_flags = flags;
    verdef->vd_ndx = idx;
    verdef->vd_cnt = 1;
    verdef->vd_hash = elf_hash(verstr);
    verdef->vd_aux = sizeof(ElfVerdef);

    ElfVerdaux *aux = (ElfVerdaux *)ptr;
    ptr += sizeof(ElfVerdaux);
    aux->vda_name = ctx.dynstr->add_string(verstr);
  };

  // The base entry is named after the soname, or after the output file
  // name if no soname is given.
  std::string_view soname = ctx.arg.soname;
  if (soname.empty())
    soname = save_string(ctx, path_filename(ctx.arg.output));
  write(soname, 1, VER_FLG_BASE);

  i64 idx = VER_NDX_LAST_RESERVED + 1;
  for (std::string_view verstr : ctx.arg.version_definitions)
    write(verstr, idx++, 0);
}

template void
VerdefSection::update_shdr(Context &ctx) { this->shdr.sh_size = contents.size(); this->shdr.sh_link = ctx.dynstr->shndx; } template void VerdefSection::copy_buf(Context &ctx) { write_vector(ctx.buf + this->shdr.sh_offset, contents); } template void BuildIdSection::update_shdr(Context &ctx) { this->shdr.sh_size = ctx.arg.build_id.size() + 16; // +16 for the header } template void BuildIdSection::copy_buf(Context &ctx) { U32 *base = (U32 *)(ctx.buf + this->shdr.sh_offset); memset(base, 0, this->shdr.sh_size); base[0] = 4; // Name size base[1] = ctx.arg.build_id.size(); // Hash size base[2] = NT_GNU_BUILD_ID; // Type memcpy(base + 3, "GNU", 4); // Name string write_vector(base + 4, contents); // Build ID } template void NotePackageSection::update_shdr(Context &ctx) { if (!ctx.arg.package_metadata.empty()) { // +17 is for the header and the NUL terminator this->shdr.sh_size = align_to(ctx.arg.package_metadata.size() + 17, 4); } } template void NotePackageSection::copy_buf(Context &ctx) { U32 *buf = (U32 *)(ctx.buf + this->shdr.sh_offset); memset(buf, 0, this->shdr.sh_size); buf[0] = 4; // Name size buf[1] = this->shdr.sh_size - 16; // Content size buf[2] = NT_FDO_PACKAGING_METADATA; // Type memcpy(buf + 3, "FDO", 4); // Name write_string(buf + 4, ctx.arg.package_metadata); // Content } // Merges input files' .note.gnu.property values. 
template void NotePropertySection::update_shdr(Context &ctx) {
  // Overview: collect every property key that appears in an input object
  // file (ctx.internal_obj is excluded), merge the per-file values
  // according to the range the key falls in, OR in the bits requested via
  // ctx.arg.z_ibt / z_shstk / z_x86_64_isa_level, and store the non-zero
  // results in `contents`. sh_size is derived from the number of entries.

  // Obtain the list of keys
  std::vector *> files = ctx.objs;
  std::erase(files, ctx.internal_obj);
  std::set keys;

  for (ObjectFile *file : files)
    for (std::pair kv : file->gnu_properties)
      keys.insert(kv.first);

  // Returns the value a file has for `key`, or 0 if the file does not
  // have that property at all.
  auto get_value = [](ObjectFile *file, u32 key) -> u32 {
    auto it = file->gnu_properties.find(key);
    if (it != file->gnu_properties.end())
      return it->second;
    return 0;
  };

  // Merge values for each key
  std::map map;

  for (u32 key : keys) {
    if (GNU_PROPERTY_X86_UINT32_AND_LO <= key &&
        key <= GNU_PROPERTY_X86_UINT32_AND_HI) {
      // An AND feature is set if all input objects have the property and
      // the feature.
      map[key] = 0xffff'ffff;
      for (ObjectFile *file : files)
        map[key] &= get_value(file, key);
    } else if (GNU_PROPERTY_X86_UINT32_OR_LO <= key &&
               key <= GNU_PROPERTY_X86_UINT32_OR_HI) {
      // An OR feature is set if some input object has the feature.
      for (ObjectFile *file : files)
        map[key] |= get_value(file, key);
    } else if (GNU_PROPERTY_X86_UINT32_OR_AND_LO <= key &&
               key <= GNU_PROPERTY_X86_UINT32_OR_AND_HI) {
      // An OR-AND feature is set if all input object files have the property
      // and some of them have the feature.
      auto has_key = [&](ObjectFile *file) {
        return file->gnu_properties.contains(key);
      };

      if (ranges::all_of(files, has_key))
        for (ObjectFile *file : files)
          map[key] |= get_value(file, key);
    }
  }

  // Bits requested on the command line are added on top of whatever the
  // inputs provided.
  if (ctx.arg.z_ibt)
    map[GNU_PROPERTY_X86_FEATURE_1_AND] |= GNU_PROPERTY_X86_FEATURE_1_IBT;
  if (ctx.arg.z_shstk)
    map[GNU_PROPERTY_X86_FEATURE_1_AND] |= GNU_PROPERTY_X86_FEATURE_1_SHSTK;
  map[GNU_PROPERTY_X86_ISA_1_NEEDED] |= ctx.arg.z_x86_64_isa_level;

  // Serialize the map. Entries whose merged value is zero are dropped;
  // each kept entry is {key, 4, value}.
  contents.clear();

  for (std::pair kv : map)
    if (kv.second)
      contents.push_back({kv.first, 4, kv.second});

  // An empty result yields a zero-sized section; otherwise the size is a
  // 16-byte header plus the entries.
  if (contents.empty())
    this->shdr.sh_size = 0;
  else
    this->shdr.sh_size = 16 + contents.size() * sizeof(contents[0]);
}

// Writes the property note: a zeroed section, the four leading U32 slots
// (name size, content size, note type and the NUL-terminated name "GNU"),
// then the serialized entries from `contents`.
template void NotePropertySection::copy_buf(Context &ctx) {
  U32 *buf = (U32 *)(ctx.buf + this->shdr.sh_offset);
  memset(buf, 0, this->shdr.sh_size);
  buf[0] = 4;                              // Name size
  buf[1] = this->shdr.sh_size - 16;        // Content size
  buf[2] = NT_GNU_PROPERTY_TYPE_0;         // Type
  memcpy(buf + 3, "GNU", 4);               // Name
  write_vector(buf + 4, contents);         // Content
}

// Builds a compressed stand-in for `chunk`. The chunk is rendered into a
// scratch buffer and handed to a Zlib or Zstd compressor (chosen by
// ctx.arg.compress_debug_sections); the compression header and this
// section's header are then derived from the chunk's header.
template CompressedSection::CompressedSection(Context &ctx, Chunk &chunk) {
  // Allocate a temporary buffer to write uncompressed contents. Note
  // that we use u8[] instead of std::vector to avoid the cost of
  // zero-initialization, as sh_size can be very large.
  std::unique_ptr buf(new u8[chunk.shdr.sh_size]);

  // Write uncompressed contents and then compress them
  chunk.write_to(ctx, buf.get());

  if (ctx.arg.compress_debug_sections == ELFCOMPRESS_ZLIB)
    compressor.reset(new ZlibCompressor(buf.get(), chunk.shdr.sh_size));
  else
    compressor.reset(new ZstdCompressor(buf.get(), chunk.shdr.sh_size));

  // Compute header field values
  chdr.ch_type = ctx.arg.compress_debug_sections;
  chdr.ch_size = chunk.shdr.sh_size;
  chdr.ch_addralign = chunk.shdr.sh_addralign;

  // Take over the original chunk's identity, then adjust the fields that
  // differ for the compressed form (flag, alignment, size).
  this->name = chunk.name;
  this->shndx = chunk.shndx;
  this->is_compressed = true;
  this->shdr = chunk.shdr;
  this->shdr.sh_flags |= SHF_COMPRESSED;
  this->shdr.sh_addralign = 1;
  this->shdr.sh_size = sizeof(chdr) + compressor->compressed_size;

  // We can discard the uncompressed contents unless --gdb-index is given
  if (ctx.arg.gdb_index)
    this->uncompressed_data = std::move(buf);
}

// Emits the compression header followed by the compressed payload.
template void CompressedSection::copy_buf(Context &ctx) {
  u8 *base = ctx.buf + this->shdr.sh_offset;
  memcpy(base, &chdr, sizeof(chdr));
  compressor->write_to(base + sizeof(chdr));
}

// Creates the relocation section that accompanies `osec`. The name is
// ".rela" or ".rel" plus the output section's name depending on
// E::is_rela. `offsets[i]` receives the number of relocation entries that
// precede member i, and sh_size covers all entries of all members.
template RelocSection::RelocSection(Context &ctx, OutputSection &osec)
  : output_section(osec) {
  if constexpr (E::is_rela) {
    this->name = save_string(ctx, ".rela" + std::string(osec.name));
    this->shdr.sh_type = SHT_RELA;
  } else {
    this->name = save_string(ctx, ".rel" + std::string(osec.name));
    this->shdr.sh_type = SHT_REL;
  }

  this->shdr.sh_flags = SHF_INFO_LINK;
  this->shdr.sh_addralign = sizeof(Word);
  this->shdr.sh_entsize = sizeof(ElfRel);

  // Compute an offset for each input section
  offsets.resize(osec.members.size());

  // Body for tbb::parallel_scan: accumulates relocation counts over a
  // sub-range, and records the running total *before* each member only on
  // the final pass (i.e. an exclusive prefix sum).
  auto scan = [&](const tbb::blocked_range &r, i64 sum, bool is_final) {
    for (i64 i = r.begin(); i < r.end(); i++) {
      InputSection &isec = *osec.members[i];
      if (is_final)
        offsets[i] = sum;
      sum += isec.get_rels(ctx).size();
    }
    return sum;
  };

  i64 num_entries = tbb::parallel_scan(
    tbb::blocked_range(0, osec.members.size()), 0, scan, std::plus());

  this->shdr.sh_size = num_entries * sizeof(ElfRel);
}

// Links the relocation section to the symbol table (sh_link) and to the
// output section it applies to (sh_info).
template void
RelocSection::update_shdr(Context &ctx) {
  this->shdr.sh_link = ctx.symtab->shndx;
  this->shdr.sh_info = output_section.shndx;
}

// Writes one output relocation for every input relocation of every member
// of the output section. Members are processed in parallel; each writes
// to its own slice of the section starting at offsets[i].
template void RelocSection::copy_buf(Context &ctx) {
  // Maps an input relocation to the {symbol index, addend} pair that is
  // stored in the output relocation. {0, 0} is returned when there is
  // nothing to refer to.
  auto get_symidx_addend = [&](InputSection &isec, const ElfRel &rel)
      -> std::pair {
    Symbol &sym = *isec.file.symbols[rel.r_sym];

    // For sections without SHF_ALLOC, a relocation that resolves to a
    // section fragment is expressed relative to the fragment's output
    // section.
    if (!(isec.shdr().sh_flags & SHF_ALLOC)) {
      SectionFragment *frag;
      i64 frag_addend;
      std::tie(frag, frag_addend) = isec.get_fragment(ctx, rel);
      if (frag)
        return {frag->output_section.shndx, frag->offset + frag_addend};
    }

    // Section symbols are rewritten to refer to the output section, with
    // the offset within it folded into the addend.
    if (sym.esym().st_type == STT_SECTION) {
      if (SectionFragment *frag = sym.get_frag())
        return {frag->output_section.shndx,
                frag->offset + sym.value + get_addend(isec, rel)};

      InputSection *isec2 = sym.get_input_section();
      if (OutputSection *osec = isec2->output_section)
        return {osec->shndx, get_addend(isec, rel) + isec2->offset};

      // This is usually a dead debug section referring to a
      // COMDAT-eliminated section.
      return {0, 0};
    }

    // Other symbols are referenced by their output symbol index, but only
    // if they are written to the symbol table.
    if (sym.write_to_symtab)
      return {sym.get_output_sym_idx(ctx), get_addend(isec, rel)};
    return {0, 0};
  };

  // Fills in `out` for a single input relocation. r_offset is rebased to
  // the output section's address plus the input section's offset in it.
  auto write = [&](ElfRel &out, InputSection &isec, const ElfRel &rel) {
    i64 symidx;
    i64 addend;
    std::tie(symidx, addend) = get_symidx_addend(isec, rel);

    i64 r_offset = isec.output_section->shdr.sh_addr + isec.offset + rel.r_offset;
    out = ElfRel(r_offset, rel.r_type, symidx, addend);

    // With ctx.arg.relocatable, the addend is also written into the
    // relocated location in the output buffer.
    if (ctx.arg.relocatable) {
      u8 *base = ctx.buf + isec.output_section->shdr.sh_offset + isec.offset;
      write_addend(base + rel.r_offset, addend, rel);
    }
  };

  tbb::parallel_for((i64)0, (i64)output_section.members.size(), [&](i64 i) {
    ElfRel *buf = (ElfRel *)(ctx.buf + this->shdr.sh_offset) + offsets[i];
    InputSection &isec = *output_section.members[i];
    std::span> rels = isec.get_rels(ctx);

    for (i64 j = 0; j < rels.size(); j++)
      write(buf[j], isec, rels[j]);
  });
}

// Sets sh_link to the symbol table and sh_info to the group's signature:
// the output section index for a section symbol, otherwise the output
// symbol index. Asserts that ctx.arg.relocatable is set.
template void ComdatGroupSection::update_shdr(Context &ctx) {
  assert(ctx.arg.relocatable);
  this->shdr.sh_link = ctx.symtab->shndx;
  if (sym.esym().st_type == STT_SECTION)
    this->shdr.sh_info =
      sym.get_input_section()->output_section->shndx;
  else
    this->shdr.sh_info = sym.get_output_sym_idx(ctx);
}

// Writes the GRP_COMDAT flag word followed by the section index of each
// member chunk.
template void ComdatGroupSection::copy_buf(Context &ctx) {
  U32 *buf = (U32 *)(ctx.buf + this->shdr.sh_offset);
  *buf++ = GRP_COMDAT;
  for (Chunk *chunk : members)
    *buf++ = chunk->shndx;
}

// Records the file-name component of ctx.arg.separate_debug_file and
// sizes the section: the name plus its NUL rounded up to a multiple of 4,
// plus 4 trailing bytes.
template void GnuDebuglinkSection::update_shdr(Context &ctx) {
  filename = path_filename(ctx.arg.separate_debug_file);
  this->shdr.sh_size = align_to(filename.size() + 1, 4) + 4;
}

// Writes the file name at the start of the zeroed section and `crc32`
// into its last 4 bytes.
template void GnuDebuglinkSection::copy_buf(Context &ctx) {
  u8 *buf = ctx.buf + this->shdr.sh_offset;
  memset(buf, 0, this->shdr.sh_size);
  write_string(buf, filename);
  *(U32 *)(buf + this->shdr.sh_size - 4) = crc32;
}

// Explicit instantiations for the target selected by MOLD_TARGET.
using E = MOLD_TARGET;

template class Chunk;
template class OutputEhdr;
template class OutputShdr;
template class OutputPhdr;
template class InterpSection;
template class OutputSection;
template class GotSection;
template class GotPltSection;
template class PltSection;
template class PltGotSection;
template class RelPltSection;
template class RelDynSection;
template class RelrDynSection;
template class StrtabSection;
template class ShstrtabSection;
template class DynstrSection;
template class DynamicSection;
template class SymtabSection;
template class DynsymSection;
template class HashSection;
template class GnuHashSection;
template class MergedSection;
template class EhFrameSection;
template class EhFrameHdrSection;
template class EhFrameRelocSection;
template class CopyrelSection;
template class VersymSection;
template class VerneedSection;
template class VerdefSection;
template class BuildIdSection;
template class NotePackageSection;
template class NotePropertySection;
template class GdbIndexSection;
template class CompressedSection;
template class RelocSection;
template class ComdatGroupSection;
template class GnuDebuglinkSection;

template Chunk *find_chunk(Context &, u32);
template Chunk *find_chunk(Context &, std::string_view);
template i64 to_phdr_flags(Context &ctx, Chunk *chunk);
template std::optional> to_output_esym(Context &, Symbol &, u32, U32 *); } // namespace mold ================================================ FILE: src/output-file-unix.cc ================================================ #include "mold.h" #include #include #include #include #include #ifdef __linux__ # include # include #endif namespace mold { static u32 get_umask() { u32 orig_umask = umask(0); umask(orig_umask); return orig_umask; } template static int open_or_create_file(Context &ctx, std::string path, std::string tmpfile, int perm) { // Reuse an existing file if exists and writable because on Linux, // writing to an existing file is much faster than creating a fresh // file and writing to it. if (ctx.overwrite_output_file && rename(path.c_str(), tmpfile.c_str()) == 0) { i64 fd = ::open(tmpfile.c_str(), O_RDWR | O_CREAT, perm); if (fd != -1) return fd; unlink(tmpfile.c_str()); } i64 fd = ::open(tmpfile.c_str(), O_RDWR | O_CREAT, perm); if (fd == -1) Fatal(ctx) << "cannot open " << tmpfile << ": " << errno_string(); return fd; } template class MemoryMappedOutputFile : public OutputFile { public: MemoryMappedOutputFile(Context &ctx, std::string path, i64 filesize, int perm) : OutputFile(path, filesize, true) { std::string pid = std::to_string(getpid()); std::string tmpfile = path_dirname(path) / ("." + path_filename(path) + "." + pid); this->fd = open_or_create_file(ctx, path, tmpfile, perm); if (fchmod(this->fd, perm & ~get_umask()) == -1) Fatal(ctx) << "fchmod failed: " << errno_string(); if (ftruncate(this->fd, filesize) == -1) Fatal(ctx) << "ftruncate failed: " << errno_string(); output_tmpfile = (char *)save_string(ctx, tmpfile).data(); #ifdef __linux__ // Calling falllocate speeds up later linking passes on ext4, // while it just takes time with not benefits on tmpfs. 
if (struct statfs fs; fstatfs(this->fd, &fs) || fs.f_type != TMPFS_MAGIC) fallocate(this->fd, 0, 0, filesize); #endif this->buf = (u8 *)mmap(nullptr, filesize, PROT_READ | PROT_WRITE, MAP_SHARED, this->fd, 0); if (this->buf == MAP_FAILED) Fatal(ctx) << path << ": mmap failed: " << errno_string(); mold::output_buffer_start = this->buf; mold::output_buffer_end = this->buf + filesize; } ~MemoryMappedOutputFile() { if (fd2 != -1) ::close(fd2); } void close(Context &ctx) override { Timer t(ctx, "close_file"); if (!this->is_unmapped) munmap(this->buf, this->filesize); if (!this->buf2) { ::close(this->fd); } else { FILE *out = fdopen(this->fd, "w"); fseek(out, 0, SEEK_END); fwrite(this->buf2, this->buf2_size, 1, out); fclose(out); } // If an output file already exists, open a file and then remove it. // This is the fastest way to unlink a file, as it does not make the // system to immediately release disk blocks occupied by the file. fd2 = ::open(this->path.c_str(), O_RDONLY); if (fd2 != -1) unlink(this->path.c_str()); if (rename(output_tmpfile, this->path.c_str()) == -1) Fatal(ctx) << this->path << ": rename failed: " << errno_string(); output_tmpfile = nullptr; } private: int fd2 = -1; }; template std::unique_ptr> OutputFile::open(Context &ctx, std::string path, i64 filesize, int perm) { Timer t(ctx, "open_file"); if (path.starts_with('/') && !ctx.arg.chroot.empty()) path = ctx.arg.chroot + "/" + path_clean(path); std::error_code error; bool is_special = path == "-" || (!std::filesystem::is_regular_file(path, error) && !error); OutputFile *file; if (is_special) file = new MallocOutputFile(ctx, path, filesize, perm); else file = new MemoryMappedOutputFile(ctx, path, filesize, perm); #ifdef MADV_HUGEPAGE // Enable transparent huge page for an output memory-mapped file. // On Linux, it has an effect only on tmpfs mounted with `huge=advise`, // but it can make the linker ~10% faster. 
You can try it by creating // a tmpfs with the following commands // // $ mkdir tmp // $ sudo mount -t tmpfs -o size=2G,huge=advise none tmp // // and then specifying a path under the directory as an output file. madvise(file->buf, filesize, MADV_HUGEPAGE); #endif if (ctx.arg.filler != -1) memset(file->buf, ctx.arg.filler, filesize); return std::unique_ptr(file); } // LockingOutputFile is similar to MemoryMappedOutputFile, but it doesn't // rename output files and instead acquires file lock using flock(). template LockingOutputFile::LockingOutputFile(Context &ctx, std::string path, int perm) : OutputFile(path, 0, true) { this->fd = ::open(path.c_str(), O_RDWR | O_CREAT, perm); if (this->fd == -1) Fatal(ctx) << "cannot open " << path << ": " << errno_string(); flock(this->fd, LOCK_EX); // We may be overwriting to an existing debug info file. We want to // make the file unusable so that gdb won't use it by accident until // it's ready. u8 buf[256] = {}; (void)!!write(this->fd, buf, sizeof(buf)); } template void LockingOutputFile::resize(Context &ctx, i64 filesize) { if (ftruncate(this->fd, filesize) == -1) Fatal(ctx) << "ftruncate failed: " << errno_string(); this->buf = (u8 *)mmap(nullptr, filesize, PROT_READ | PROT_WRITE, MAP_SHARED, this->fd, 0); if (this->buf == MAP_FAILED) Fatal(ctx) << this->path << ": mmap failed: " << errno_string(); this->filesize = filesize; mold::output_buffer_start = this->buf; mold::output_buffer_end = this->buf + filesize; } template void LockingOutputFile::close(Context &ctx) { if (!this->is_unmapped) munmap(this->buf, this->filesize); if (this->buf2) { FILE *out = fdopen(this->fd, "w"); fseek(out, 0, SEEK_END); fwrite(this->buf2, this->buf2_size, 1, out); fclose(out); } ::close(this->fd); } using E = MOLD_TARGET; template class OutputFile; template class LockingOutputFile; } // namespace mold ================================================ FILE: src/output-file-win32.cc ================================================ #include 
"mold.h" #include #include namespace mold { template class MemoryMappedOutputFile : public OutputFile { public: MemoryMappedOutputFile(Context &ctx, std::string path, i64 filesize, int perm) : OutputFile(path, filesize, true) { // TODO: use intermediate temporary file for output. DWORD attrs = (perm & 0200) ? FILE_ATTRIBUTE_NORMAL : FILE_ATTRIBUTE_READONLY; handle = CreateFileA(path.c_str(), GENERIC_READ | GENERIC_WRITE, FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, nullptr, CREATE_ALWAYS, attrs, nullptr); if (handle == INVALID_HANDLE_VALUE) Fatal(ctx) << "cannot open " << path << ": " << GetLastError(); HANDLE map = CreateFileMapping(handle, nullptr, PAGE_READWRITE, 0, filesize, nullptr); if (!map) Fatal(ctx) << path << ": CreateFileMapping failed: " << GetLastError(); this->buf = (u8 *)MapViewOfFile(map, FILE_MAP_WRITE, 0, 0, filesize); if (!this->buf) Fatal(ctx) << path << ": MapViewOfFile failed: " << GetLastError(); CloseHandle(map); mold::output_buffer_start = this->buf; mold::output_buffer_end = this->buf + filesize; } ~MemoryMappedOutputFile() { if (handle != INVALID_HANDLE_VALUE) CloseHandle(handle); } void close(Context &ctx) override { Timer t(ctx, "close_file"); UnmapViewOfFile(this->buf); if (this->buf2) { if (SetFilePointer(handle, 0, nullptr, FILE_END) == INVALID_SET_FILE_POINTER) Fatal(ctx) << this->path << ": SetFilePointer failed: " << GetLastError(); DWORD written; if (!WriteFile(handle, this->buf2, this->buf2_size, &written, nullptr)) Fatal(ctx) << this->path << ": WriteFile failed: " << GetLastError(); } CloseHandle(handle); handle = INVALID_HANDLE_VALUE; } private: HANDLE handle; }; template std::unique_ptr> OutputFile::open(Context &ctx, std::string path, i64 filesize, int perm) { Timer t(ctx, "open_file"); if (path.starts_with('/') && !ctx.arg.chroot.empty()) path = ctx.arg.chroot + "/" + path_clean(path); bool is_special = false; if (path == "-") { is_special = true; } else { HANDLE h = CreateFileA(path.c_str(), GENERIC_READ, 
FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, nullptr, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr); if (h != INVALID_HANDLE_VALUE) { if (GetFileType(h) != FILE_TYPE_DISK) is_special = true; CloseHandle(h); } } OutputFile *file; if (is_special) file = new MallocOutputFile(ctx, path, filesize, perm); else file = new MemoryMappedOutputFile(ctx, path, filesize, perm); if (ctx.arg.filler != -1) memset(file->buf, ctx.arg.filler, filesize); return std::unique_ptr>(file); } template LockingOutputFile::LockingOutputFile(Context &ctx, std::string path, int perm) : OutputFile(path, 0, true) { Fatal(ctx) << "LockingOutputFile is not supported on Windows"; } template void LockingOutputFile::resize(Context &ctx, i64 filesize) {} template void LockingOutputFile::close(Context &ctx) {} using E = MOLD_TARGET; template class OutputFile; template class LockingOutputFile; } // namespace mold ================================================ FILE: src/passes.cc ================================================ #include "config.h" #include "mold.h" #include #include #include #include #include #include #include #include #include #include namespace mold { // Since mold_main is a template, we can't run it without a type parameter. // We speculatively run mold_main with X86_64, and if the speculation was // wrong, re-run it with an actual machine type. 
template int redo_main(std::string_view target, int argc, char **argv) {
  // Dispatches to mold_main for the target whose name equals `target`.
  // Each candidate is guarded by its HAVE_TARGET_* constant so that
  // targets not built into this binary are skipped at compile time.
  // Aborts if no compiled-in target matches.
  if constexpr (HAVE_TARGET_X86_64)
    if (target == X86_64::name)
      return mold_main(argc, argv);

  if constexpr (HAVE_TARGET_I386)
    if (target == I386::name)
      return mold_main(argc, argv);

  if constexpr (HAVE_TARGET_ARM64LE)
    if (target == ARM64LE::name)
      return mold_main(argc, argv);

  if constexpr (HAVE_TARGET_ARM64BE)
    if (target == ARM64BE::name)
      return mold_main(argc, argv);

  if constexpr (HAVE_TARGET_ARM32LE)
    if (target == ARM32LE::name)
      return mold_main(argc, argv);

  if constexpr (HAVE_TARGET_ARM32BE)
    if (target == ARM32BE::name)
      return mold_main(argc, argv);

  if constexpr (HAVE_TARGET_RV64LE)
    if (target == RV64LE::name)
      return mold_main(argc, argv);

  if constexpr (HAVE_TARGET_RV64BE)
    if (target == RV64BE::name)
      return mold_main(argc, argv);

  if constexpr (HAVE_TARGET_RV32LE)
    if (target == RV32LE::name)
      return mold_main(argc, argv);

  if constexpr (HAVE_TARGET_RV32BE)
    if (target == RV32BE::name)
      return mold_main(argc, argv);

  if constexpr (HAVE_TARGET_PPC32)
    if (target == PPC32::name)
      return mold_main(argc, argv);

  if constexpr (HAVE_TARGET_PPC64V1)
    if (target == PPC64V1::name)
      return mold_main(argc, argv);

  if constexpr (HAVE_TARGET_PPC64V2)
    if (target == PPC64V2::name)
      return mold_main(argc, argv);

  if constexpr (HAVE_TARGET_S390X)
    if (target == S390X::name)
      return mold_main(argc, argv);

  if constexpr (HAVE_TARGET_SPARC64)
    if (target == SPARC64::name)
      return mold_main(argc, argv);

  if constexpr (HAVE_TARGET_M68K)
    if (target == M68K::name)
      return mold_main(argc, argv);

  if constexpr (HAVE_TARGET_SH4LE)
    if (target == SH4LE::name)
      return mold_main(argc, argv);

  if constexpr (HAVE_TARGET_SH4BE)
    if (target == SH4BE::name)
      return mold_main(argc, argv);

  if constexpr (HAVE_TARGET_LOONGARCH32)
    if (target == LOONGARCH32::name)
      return mold_main(argc, argv);

  if constexpr (HAVE_TARGET_LOONGARCH64)
    if (target == LOONGARCH64::name)
      return mold_main(argc, argv);

  abort();
}

// Sets `exclude_libs` on every object file that came from an archive
// whose file name is listed in ctx.arg.exclude_libs, or on every archive
// member if the list contains "ALL".
template void apply_exclude_libs(Context &ctx) {
  Timer t(ctx, "apply_exclude_libs");

  std::unordered_set &set = ctx.arg.exclude_libs;

  if (!set.empty())
    for (ObjectFile *file : ctx.objs)
      if (!file->archive_name.empty())
        if (set.contains(path_filename(file->archive_name)) ||
            set.contains("ALL"))
          file->exclude_libs = true;
}

// Returns true if at least one object file has its `debug_info` member
// set.
template static bool has_debug_info_section(Context &ctx) {
  for (ObjectFile *file : ctx.objs)
    if (file->debug_info)
      return true;
  return false;
}

// Creates the linker-synthesized chunks and registers them with the
// context. Which chunks are created depends on the command-line options
// in ctx.arg and, for a few of them, on the target.
template void create_synthetic_sections(Context &ctx) {
  // Appends a newly allocated chunk to ctx.chunks, stores it in
  // ctx.chunk_pool as well, and returns the pointer so it can be assigned
  // to the matching ctx member.
  auto push = [&](auto *x) {
    ctx.chunks.push_back(x);
    ctx.chunk_pool.emplace_back(x);
    return x;
  };

  if (!ctx.arg.oformat_binary) {
    // Returns true if ctx.arg.section_order has a SECTION entry with the
    // given name.
    auto find = [&](std::string_view name) {
      for (SectionOrder &ord : ctx.arg.section_order)
        if (ord.type == SectionOrder::SECTION && ord.name == name)
          return true;
      return false;
    };

    // The ELF and program headers are created with SHF_ALLOC unless a
    // section order is given that does not mention them.
    if (ctx.arg.section_order.empty() || find("EHDR"))
      ctx.ehdr = push(new OutputEhdr(SHF_ALLOC));
    else
      ctx.ehdr = push(new OutputEhdr(0));

    if (ctx.arg.section_order.empty() || find("PHDR"))
      ctx.phdr = push(new OutputPhdr(SHF_ALLOC));
    else
      ctx.phdr = push(new OutputPhdr(0));

    if (ctx.arg.z_sectionheader)
      ctx.shdr = push(new OutputShdr);
  }

  ctx.got = push(new GotSection);

  if constexpr (!is_sparc)
    ctx.gotplt = push(new GotPltSection(ctx));

  ctx.reldyn = push(new RelDynSection);
  ctx.relplt = push(new RelPltSection);

  if (ctx.arg.pack_dyn_relocs_relr)
    ctx.relrdyn = push(new RelrDynSection);

  ctx.strtab = push(new StrtabSection);
  ctx.plt = push(new PltSection);
  ctx.pltgot = push(new PltGotSection);
  ctx.symtab = push(new SymtabSection);
  ctx.dynsym = push(new DynsymSection);
  ctx.dynstr = push(new DynstrSection);
  ctx.eh_frame = push(new EhFrameSection);
  ctx.copyrel = push(new CopyrelSection(false));
  ctx.copyrel_relro = push(new CopyrelSection(true));

  // Chunks that exist only when the corresponding option is enabled.
  if (ctx.shdr)
    ctx.shstrtab = push(new ShstrtabSection);
  if (!ctx.arg.dynamic_linker.empty())
    ctx.interp = push(new InterpSection);
  if (ctx.arg.build_id.kind != BuildId::NONE)
    ctx.buildid = push(new BuildIdSection);
  if (ctx.arg.eh_frame_hdr)
    ctx.eh_frame_hdr = push(new EhFrameHdrSection);
  if (ctx.arg.gdb_index && has_debug_info_section(ctx))
    ctx.gdb_index = push(new GdbIndexSection);
  if (ctx.arg.z_relro && ctx.arg.section_order.empty())
    ctx.relro_padding = push(new RelroPaddingSection);
  if (ctx.arg.hash_style_sysv)
    ctx.hash = push(new HashSection);
  if (ctx.arg.hash_style_gnu)
    ctx.gnu_hash = push(new GnuHashSection);
  if (!ctx.arg.version_definitions.empty())
    ctx.verdef = push(new VerdefSection);
  if (ctx.arg.emit_relocs)
    ctx.eh_frame_reloc = push(new EhFrameRelocSection);
  if (!ctx.arg.separate_debug_file.empty())
    ctx.gnu_debuglink = push(new GnuDebuglinkSection);

  if (ctx.arg.shared || !ctx.dsos.empty() || ctx.arg.pie) {
    ctx.dynamic = push(new DynamicSection(ctx));

    // If .dynamic exists, .dynsym and .dynstr must exist as well
    // since .dynamic refers to them.
    ctx.dynstr->add_string("");
    ctx.dynsym->symbols.resize(1);
  }

  ctx.versym = push(new VersymSection);
  ctx.verneed = push(new VerneedSection);
  ctx.note_package = push(new NotePackageSection);

  // .comment is obtained as a merged string section rather than through
  // push().
  if (!ctx.arg.oformat_binary) {
    ElfShdr shdr = {};
    shdr.sh_type = SHT_PROGBITS;
    shdr.sh_flags = SHF_MERGE | SHF_STRINGS;
    ctx.comment = MergedSection::get_instance(ctx, ".comment", shdr);
  }

  // Target-specific chunks.
  if constexpr (is_x86)
    ctx.extra.note_property = push(new NotePropertySection);
  if constexpr (is_riscv)
    ctx.extra.riscv_attributes = push(new RiscvAttributesSection);
  if constexpr (is_ppc64v1)
    ctx.extra.opd = push(new PPC64OpdSection);
  if constexpr (is_ppc64v2)
    ctx.extra.save_restore = push(new PPC64SaveRestoreSection);
}

// Marks the input files that must be part of the output as reachable and
// then lets each reachable file mark further files through
// InputFile::mark_live_objects().
template static void mark_live_objects(Context &ctx) {
  // Files that define a symbol listed in ctx.arg.undefined or
  // ctx.arg.require_defined are reachable.
  for (Symbol *sym : ctx.arg.undefined)
    if (sym->file)
      sym->file->is_reachable = true;

  for (Symbol *sym : ctx.arg.require_defined)
    if (sym->file)
      sym->file->is_reachable = true;

  // A not-yet-reachable object becomes reachable if one of the global
  // symbols it owns matches ctx.arg.undefined_glob. The first matching
  // symbol is flagged as a GC root.
  if (!ctx.arg.undefined_glob.empty()) {
    tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) {
      if (!file->is_reachable) {
        for (Symbol *sym : file->get_global_syms()) {
          if (sym->file == file &&
              ctx.arg.undefined_glob.find(sym->name()) != -1) {
            file->is_reachable = true;
            sym->gc_root = true;
            break;
          }
        }
      }
    });
  }

  std::vector *> roots;
  append(roots, ctx.objs);
  append(roots, ctx.dsos);

  // Files without the as_needed flag are always reachable.
  for (InputFile *file : roots)
    if (!file->as_needed)
      file->is_reachable = true;

  std::erase_if(roots, [](InputFile *file) { return !file->is_reachable; });

  // Start from the reachable files; files reported through the callback
  // are fed back into the same parallel loop.
  tbb::parallel_for_each(roots, [&](InputFile *file,
                                    tbb::feeder *> &feeder) {
    file->mark_live_objects(ctx, [&](InputFile *obj) { feeder.add(obj); });
  });
}

// Symbol resolution involving a default symbol version is tricky because
// a symbol that provides the default version has two names by which it
// can be referred. Specifically, a symbol `foo` with the default version
// `VER1` can be referred to either as `foo` or `foo@VER1`. No other
// symbols have two names like that.
//
// By default, we insert symbols with a default version without an at-sign
// (i.e. `foo` instead of `foo@VER1`) into our internal symbol table.
// Therefore, if the symbol is referenced with an at-sign (i.e.
// `foo@VER1`), the reference fails to resolve. This function corrects
// that error.
//
// In this function, we check all unresolved versioned symbols of the form
// `foo@VER1` by removing the version part and see if `foo` has version
// `VER1`. If it does, that's the symbol we are looking for.
template static void resolve_default_symver(Context &ctx) { std::vector *> files; append(files, ctx.objs); append(files, ctx.dsos); tbb::parallel_for_each(files, [](InputFile *file) { std::span *> syms = file->get_global_syms(); for (i64 i = 0; i < syms.size(); i++) if (syms[i]->is_versioned_default) syms[i] = (Symbol *)syms[i]->origin; }); } template static void clear_symbols(Context &ctx) { std::vector *> files; append(files, ctx.objs); append(files, ctx.dsos); tbb::parallel_for_each(files, [](InputFile *file) { for (Symbol *sym : file->get_global_syms()) { if (__atomic_load_n(&sym->file, __ATOMIC_ACQUIRE) == file) { sym->origin = 0; sym->value = -1; sym->sym_idx = -1; sym->ver_idx = VER_NDX_UNSPECIFIED; sym->is_weak = false; sym->is_imported = false; sym->is_exported = false; __atomic_store_n(&sym->file, nullptr, __ATOMIC_RELEASE); } } }); } template void resolve_symbols(Context &ctx) { Timer t(ctx, "resolve_symbols"); std::vector *> files; append(files, ctx.objs); append(files, ctx.dsos); for (;;) { // Call resolve_symbols() to find the most appropriate file for each // symbol. And then mark reachable objects to decide which files to // include into an output. tbb::parallel_for_each(files, [&](InputFile *file) { file->resolve_symbols(ctx); }); resolve_default_symver(ctx); mark_live_objects(ctx); // Symbols with hidden visibility need to be resolved within the // output file. If a hidden symbol was resolved to a DSO, we'll redo // symbol resolution from scratch with the flag to skip that symbol // next time. This should be rare. std::atomic_bool redo = false; tbb::parallel_for_each(ctx.dsos, [&](SharedFile *file) { if (file->is_reachable) { for (Symbol *sym : file->symbols) { if (sym->file == file && sym->visibility == STV_HIDDEN) { sym->skip_dso = true; redo = true; } } } }); if (!redo) break; clear_symbols(ctx); } // Now that we know the exact set of input files that are to be // included in the output file, we want to redo symbol resolution. 
// This is because symbols defined by object files in archive files // may have risen as a result of mark_live_objects(). // // To redo symbol resolution, we want to clear the state first. clear_symbols(ctx); // COMDAT elimination needs to happen exactly here. // // It needs to be after archive extraction, otherwise we might // assign COMDAT leader to an archive member that is not supposed to // be extracted. // // It needs to happen before the final symbol resolution, otherwise // we could eliminate a symbol that is already resolved to and cause // dangling references. tbb::parallel_for_each(ctx.objs, [](ObjectFile *file) { if (file->is_reachable) { for (ComdatGroupRef &ref : file->comdat_groups) update_minimum(ref.group->owner, file->priority); for (ComdatGroup *g : file->lto_comdat_groups) if (g) update_minimum(g->owner, file->priority); } }); tbb::parallel_for_each(ctx.objs, [](ObjectFile *file) { if (file->is_reachable) for (ComdatGroupRef &ref : file->comdat_groups) if (ref.group->owner != file->priority) for (u32 i : ref.members) if (InputSection *isec = file->sections[i].get()) isec->is_alive = false; }); // Redo symbol resolution tbb::parallel_for_each(files, [&](InputFile *file) { if (file->is_reachable) file->resolve_symbols(ctx); }); } // Do link-time optimization. We pass all IR object files to the compiler // backend to compile them into a few ELF object files. template void do_lto(Context &ctx) { Timer t(ctx, "do_lto"); // The compiler backend needs to know how symbols are resolved, so // compute symbol visibility, import/export bits, etc early. apply_version_script(ctx); parse_symbol_version(ctx); compute_import_export(ctx); // If multiple IR object files define the same symbol, the LTO backend // would choose one of them randomly instead of reporting an error. // So we need to check for symbol duplication error before doing an LTO. if (!ctx.arg.allow_multiple_definition) check_duplicate_symbols(ctx); // Invoke the LTO plugin. 
This step compiles IR object files into a few // big ELF files. std::vector *> lto_objs = run_lto_plugin(ctx); append(ctx.objs, lto_objs); // Redo name resolution. clear_symbols(ctx); // Remove IR object files. for (ObjectFile *file : ctx.objs) if (file->is_lto_obj) file->is_reachable = false; std::erase_if(ctx.objs, [](ObjectFile *file) { return file->is_lto_obj; }); resolve_symbols(ctx); } template void parse_eh_frame_sections(Context &ctx) { Timer t(ctx, "parse_eh_frame_sections"); tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { file->parse_ehframe(ctx); }); } template void create_merged_sections(Context &ctx) { Timer t(ctx, "create_merged_sections"); // Convert InputSections to MergeableSections. tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { file->convert_mergeable_sections(ctx); }); tbb::parallel_for_each(ctx.merged_sections, [&](std::unique_ptr> &sec) { if (sec->shdr.sh_flags & SHF_ALLOC) sec->resolve(ctx); }); tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { file->reattach_section_pieces(ctx); }); } template void convert_common_symbols(Context &ctx) { Timer t(ctx, "convert_common_symbols"); tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { file->convert_common_symbols(ctx); }); } template static bool has_ctors_and_init_array(Context &ctx) { return ranges::any_of(ctx.objs, [](ObjectFile *x) { return x->has_ctors; }) && ranges::any_of(ctx.objs, [](ObjectFile *x) { return x->has_init_array; }); } template static u64 canonicalize_type(std::string_view name, u64 type) { // Some old assemblers don't recognize these section names and // create them as SHT_PROGBITS. if (type == SHT_PROGBITS) { if (name == ".init_array" || name.starts_with(".init_array.")) return SHT_INIT_ARRAY; if (name == ".fini_array" || name.starts_with(".fini_array.")) return SHT_FINI_ARRAY; } // The x86-64 psABI defines SHT_X86_64_UNWIND for .eh_frame, allowing // the linker to recognize the section not by name but by section type. 
// However, that spec change was generally considered a mistake; it has // just complicated the situation. As a result, .eh_frame on x86-64 may // be either SHT_PROGBITS or SHT_X86_64_UNWIND. We use SHT_PROGBITS // consistently. if constexpr (is_x86_64) if (type == SHT_X86_64_UNWIND) return SHT_PROGBITS; return type; } struct OutputSectionKey { bool operator==(const OutputSectionKey &) const = default; std::string_view name; u64 type; struct Hash { size_t operator()(const OutputSectionKey &k) const { return combine_hash(hash_string(k.name), std::hash{}(k.type)); } }; }; template static std::string_view get_output_name(Context &ctx, std::string_view name, u64 flags) { if (ctx.arg.relocatable && !ctx.arg.relocatable_merge_sections) return name; if (!ctx.arg.unique.empty() && ctx.arg.unique.find(name) != -1) return name; if (flags & SHF_MERGE) return name; if constexpr (is_arm32) { if (name.starts_with(".ARM.exidx")) return ".ARM.exidx"; if (name.starts_with(".ARM.extab")) return ".ARM.extab"; } if (ctx.arg.z_keep_text_section_prefix) { static std::string_view prefixes[] = { ".text.hot.", ".text.unknown.", ".text.unlikely.", ".text.startup.", ".text.exit." 
}; for (std::string_view prefix : prefixes) { std::string_view stem = prefix.substr(0, prefix.size() - 1); if (name == stem || name.starts_with(prefix)) return stem; } } static std::string_view prefixes[] = { ".text.", ".data.rel.ro.", ".data.", ".rodata.", ".bss.rel.ro.", ".bss.", ".init_array.", ".fini_array.", ".tbss.", ".tdata.", ".gcc_except_table.", ".ctors.", ".dtors.", ".gnu.warning.", ".openbsd.randomdata.", ".sdata.", ".sbss.", ".srodata", ".gnu.build.attributes.", }; for (std::string_view prefix : prefixes) { std::string_view stem = prefix.substr(0, prefix.size() - 1); if (name == stem || name.starts_with(prefix)) return stem; } return name; } template static OutputSectionKey get_output_section_key(Context &ctx, InputSection &isec, bool ctors_in_init_array) { // If .init_array/.fini_array exist, .ctors/.dtors must be merged // with them. // // CRT object files contain .ctors/.dtors sections without any // relocations. They contain sentinel values, 0 and -1, to mark the // beginning and the end of the initializer/finalizer pointer arrays. // We do not place them into .init_array/.fini_array because such // invalid pointer values would simply make the program to crash. if (ctors_in_init_array && !isec.get_rels(ctx).empty()) { std::string_view name = isec.name(); if (name == ".ctors" || name.starts_with(".ctors.")) return {".init_array", SHT_INIT_ARRAY}; if (name == ".dtors" || name.starts_with(".dtors.")) return {".fini_array", SHT_FINI_ARRAY}; } const ElfShdr &shdr = isec.shdr(); std::string_view name = get_output_name(ctx, isec.name(), shdr.sh_flags); u64 type = canonicalize_type(name, shdr.sh_type); return {name, type}; } template static bool is_relro(OutputSection &osec) { // PT_GNU_RELRO segment is a security mechanism to make more pages // read-only than we could have done without it. // // Traditionally, sections are either read-only or read-write. 
If a // section contains dynamic relocations, it must have been put into a // read-write segment so that the program loader can mutate its // contents in memory, even if no one will write to it at runtime. // // RELRO segment allows us to make such pages writable only when a // program is being loaded. After that, the page becomes read-only. // // Some sections, such as .init, .fini, .got, .dynamic, contain // dynamic relocations but doesn't have to be writable at runtime, // so they are put into a RELRO segment. std::string_view name = osec.name; u32 type = osec.shdr.sh_type; u32 flags = osec.shdr.sh_flags; return name == ".toc" || name.ends_with(".rel.ro") || name.ends_with(".rel.ro.hot") || name.ends_with(".rel.ro.unlikely") || type == SHT_INIT_ARRAY || type == SHT_FINI_ARRAY || type == SHT_PREINIT_ARRAY || (flags & SHF_TLS); } // Create output sections for input sections. // // Since one output section could contain millions of input sections, // we need to do it efficiently. template void create_output_sections(Context &ctx) { Timer t(ctx, "create_output_sections"); using MapType = std::unordered_map *, OutputSectionKey::Hash>; MapType map; std::shared_mutex mu; bool ctors_in_init_array = has_ctors_and_init_array(ctx); tbb::enumerable_thread_specific caches; // Instantiate output sections tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { // Make a per-thread cache of the main map to avoid lock contention. // It makes a noticeable difference if we have millions of input sections. 
MapType &cache = caches.local(); for (std::unique_ptr> &isec : file->sections) { if (!isec || !isec->is_alive) continue; const ElfShdr &shdr = isec->shdr(); u32 sh_flags = shdr.sh_flags & ~SHF_MERGE & ~SHF_STRINGS & ~SHF_COMPRESSED & ~SHF_GNU_RETAIN; if (ctx.arg.relocatable && (sh_flags & SHF_GROUP)) { OutputSection *osec = new OutputSection(isec->name(), shdr.sh_type); osec->sh_flags = sh_flags; isec->output_section = osec; ctx.osec_pool.emplace_back(osec); continue; } auto get_or_insert = [&] { OutputSectionKey key = get_output_section_key(ctx, *isec, ctors_in_init_array); if (auto it = cache.find(key); it != cache.end()) return it->second; { std::shared_lock lock(mu); if (auto it = map.find(key); it != map.end()) { cache.insert({key, it->second}); return it->second; } } std::unique_ptr> osec = std::make_unique>(key.name, key.type); std::unique_lock lock(mu); auto [it, inserted] = map.insert({key, osec.get()}); if (inserted) ctx.osec_pool.emplace_back(std::move(osec)); cache.insert({key, it->second}); return it->second; }; OutputSection *osec = get_or_insert(); sh_flags &= ~SHF_GROUP; if ((osec->sh_flags & sh_flags) != sh_flags) osec->sh_flags |= sh_flags; isec->output_section = osec; } }); // Add input sections to output sections for (std::unique_ptr> &osec : ctx.osec_pool) osec->members_vec.resize(ctx.objs.size()); tbb::parallel_for((i64)0, (i64)ctx.objs.size(), [&](i64 i) { for (std::unique_ptr> &isec : ctx.objs[i]->sections) if (isec && isec->output_section) isec->output_section->members_vec[i].push_back(isec.get()); }); // Compute section alignment for (std::unique_ptr> &osec : ctx.osec_pool) { Atomic p2align; tbb::parallel_for((i64)0, (i64)ctx.objs.size(), [&](i64 i) { u32 x = 0; for (InputSection *isec : osec->members_vec[i]) x = std::max(x, isec->p2align); update_maximum(p2align, x); }); osec->shdr.sh_addralign = 1 << p2align; } for (std::unique_ptr> &osec : ctx.osec_pool) { osec->shdr.sh_flags = osec->sh_flags; osec->is_relro = is_relro(*osec); 
osec->members = flatten(osec->members_vec); osec->members_vec.clear(); osec->members_vec.shrink_to_fit(); } // Add output sections and mergeable sections to ctx.chunks std::vector *> chunks; for (std::unique_ptr> &osec : ctx.osec_pool) chunks.push_back(osec.get()); for (std::unique_ptr> &osec : ctx.merged_sections) chunks.push_back(osec.get()); // Sections are added to the section lists in an arbitrary order // because they are created in parallel. Sort them to to make the // output deterministic. tbb::parallel_sort(chunks.begin(), chunks.end(), [](Chunk *x, Chunk *y) { return std::tuple(x->name, x->shdr.sh_type, x->shdr.sh_flags) < std::tuple(y->name, y->shdr.sh_type, y->shdr.sh_flags); }); append(ctx.chunks, chunks); } // Create a dummy object file containing linker-synthesized // symbols. template void create_internal_file(Context &ctx) { ObjectFile *obj = new ObjectFile; ctx.obj_pool.emplace_back(obj); ctx.internal_obj = obj; ctx.objs.push_back(obj); // Create linker-synthesized symbols. ctx.internal_esyms.resize(1); obj->symbols.push_back(new Symbol); obj->first_global = 1; obj->is_reachable = true; obj->priority = 1; auto add = [&](Symbol *sym) { obj->symbols.push_back(sym); // An actual value will be set to a linker-synthesized symbol by // fix_synthetic_symbols(). Until then, `value` doesn't have a valid // value. 0xdeadbeef is a unique dummy value to make debugging easier // if the field is accidentally used before it gets a valid one. 
sym->value = 0xdeadbeef; ElfSym esym; memset(&esym, 0, sizeof(esym)); esym.st_type = STT_NOTYPE; esym.st_shndx = SHN_ABS; esym.st_bind = STB_GLOBAL; esym.st_visibility = STV_DEFAULT; ctx.internal_esyms.push_back(esym); }; // Add --defsym'd symbols for (i64 i = 0; i < ctx.arg.defsyms.size(); i++) add(ctx.arg.defsyms[i].first); // Add --section-order symbols for (SectionOrder &ord : ctx.arg.section_order) if (ord.type == SectionOrder::SYMBOL) add(get_symbol(ctx, ord.name)); obj->elf_syms = ctx.internal_esyms; } template static std::optional get_start_stop_name(Context &ctx, Chunk &chunk) { if ((chunk.shdr.sh_flags & SHF_ALLOC) && !chunk.name.empty()) { if (is_c_identifier(chunk.name)) return std::string(chunk.name); if (ctx.arg.start_stop) { auto isalnum = [](char c) { return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9'); }; std::string s{chunk.name}; if (s.starts_with('.')) s = s.substr(1); for (i64 i = 0; i < s.size(); i++) if (!isalnum(s[i])) s[i] = '_'; return s; } } return {}; } template void add_synthetic_symbols(Context &ctx) { ObjectFile &obj = *ctx.internal_obj; auto add = [&](std::string_view name, u32 type = STT_NOTYPE) { ElfSym esym; memset(&esym, 0, sizeof(esym)); esym.st_type = type; esym.st_shndx = SHN_ABS; esym.st_bind = STB_GLOBAL; esym.st_visibility = STV_HIDDEN; ctx.internal_esyms.push_back(esym); Symbol *sym = get_symbol(ctx, name); sym->value = 0xdeadbeef; // unique dummy value obj.symbols.push_back(sym); return sym; }; ctx.__ehdr_start = add("__ehdr_start"); ctx.__init_array_start = add("__init_array_start"); ctx.__init_array_end = add("__init_array_end"); ctx.__fini_array_start = add("__fini_array_start"); ctx.__fini_array_end = add("__fini_array_end"); ctx.__preinit_array_start = add("__preinit_array_start"); ctx.__preinit_array_end = add("__preinit_array_end"); ctx._DYNAMIC = add("_DYNAMIC"); ctx._GLOBAL_OFFSET_TABLE_ = add("_GLOBAL_OFFSET_TABLE_"); ctx._PROCEDURE_LINKAGE_TABLE_ = add("_PROCEDURE_LINKAGE_TABLE_"); 
ctx.__bss_start = add("__bss_start"); ctx._end = add("_end"); ctx._etext = add("_etext"); ctx._edata = add("_edata"); ctx.__executable_start = add("__executable_start"); ctx.__rel_iplt_start = add(E::is_rela ? "__rela_iplt_start" : "__rel_iplt_start"); ctx.__rel_iplt_end = add(E::is_rela ? "__rela_iplt_end" : "__rel_iplt_end"); if (ctx.arg.eh_frame_hdr) ctx.__GNU_EH_FRAME_HDR = add("__GNU_EH_FRAME_HDR"); if (!get_symbol(ctx, "end")->file) ctx.end = add("end"); if (!get_symbol(ctx, "etext")->file) ctx.etext = add("etext"); if (!get_symbol(ctx, "edata")->file) ctx.edata = add("edata"); if (!get_symbol(ctx, "__dso_handle")->file) ctx.__dso_handle = add("__dso_handle"); if constexpr (supports_tlsdesc) ctx._TLS_MODULE_BASE_ = add("_TLS_MODULE_BASE_", STT_TLS); if constexpr (is_riscv) { ctx.__global_pointer = add("__global_pointer$"); if (ctx.dynamic && !ctx.arg.shared) ctx.__global_pointer->is_exported = true; } if constexpr (is_arm32) { ctx.__exidx_start = add("__exidx_start"); ctx.__exidx_end = add("__exidx_end"); } if constexpr (is_ppc64) ctx.extra.TOC = add(".TOC."); if constexpr (is_ppc32) ctx.extra._SDA_BASE_ = add("_SDA_BASE_"); auto add_start_stop = [&](std::string s) { add(save_string(ctx, s)); if (ctx.arg.z_start_stop_visibility_protected) get_symbol(ctx, save_string(ctx, s))->is_exported = true; }; for (Chunk *chunk : ctx.chunks) { if (std::optional name = get_start_stop_name(ctx, *chunk)) { add_start_stop("__start_" + *name); add_start_stop("__stop_" + *name); if (ctx.arg.physical_image_base) { add_start_stop("__phys_start_" + *name); add_start_stop("__phys_stop_" + *name); } } } if constexpr (is_ppc64v2) for (std::pair p : ppc64_save_restore_insns) if (std::string_view label = p.first; !label.empty()) add(label); obj.elf_syms = ctx.internal_esyms; obj.resolve_symbols(ctx); // Make all synthetic symbols relative ones by associating them to // a dummy output section. 
for (Symbol *sym : obj.symbols) { if (sym->file == &obj) { sym->set_output_section(ctx.symtab); sym->is_imported = false; } } // Handle --defsym symbols. for (i64 i = 0; i < ctx.arg.defsyms.size(); i++) { Symbol *sym1 = ctx.arg.defsyms[i].first; std::variant *, u64> val = ctx.arg.defsyms[i].second; if (Symbol **ref = std::get_if *>(&val)) { Symbol *sym2 = *ref; if (!sym2->file) { Error(ctx) << "--defsym: undefined symbol: " << *sym2; continue; } ElfSym &esym = obj.elf_syms[i + 1]; esym.st_type = sym2->esym().st_type; if constexpr (is_ppc64v2) esym.ppc64_local_entry = sym2->esym().ppc64_local_entry; if (sym2->is_absolute()) sym1->origin = 0; } else { sym1->origin = 0; } } } template void apply_section_align(Context &ctx) { std::unordered_map &map = ctx.arg.section_align; if (!map.empty()) for (Chunk *chunk : ctx.chunks) if (OutputSection *osec = chunk->to_osec()) if (auto it = map.find(osec->name); it != map.end()) osec->shdr.sh_addralign = it->second; } template void check_cet_errors(Context &ctx) { bool warning = (ctx.arg.z_cet_report == CET_REPORT_WARNING); assert(warning || ctx.arg.z_cet_report == CET_REPORT_ERROR); auto has_feature = [](ObjectFile *file, u32 feature) { return ranges::any_of(file->gnu_properties, [&](std::pair kv) { return kv.first == GNU_PROPERTY_X86_FEATURE_1_AND && (kv.second & feature); }); }; for (ObjectFile *file : ctx.objs) { if (file == ctx.internal_obj) continue; if (!has_feature(file, GNU_PROPERTY_X86_FEATURE_1_IBT)) { if (warning) Warn(ctx) << *file << ": -cet-report=warning: " << "missing GNU_PROPERTY_X86_FEATURE_1_IBT"; else Error(ctx) << *file << ": -cet-report=error: " << "missing GNU_PROPERTY_X86_FEATURE_1_IBT"; } if (!has_feature(file, GNU_PROPERTY_X86_FEATURE_1_SHSTK)) { if (warning) Warn(ctx) << *file << ": -cet-report=warning: " << "missing GNU_PROPERTY_X86_FEATURE_1_SHSTK"; else Error(ctx) << *file << ": -cet-report=error: " << "missing GNU_PROPERTY_X86_FEATURE_1_SHSTK"; } } } template void print_dependencies(Context &ctx) { 
Out(ctx) << R"(# This is an output of the mold linker's --print-dependencies option. # # Each line consists of 4 fields, , , and # , separated by tab characters. It indicates that depends # on to use . is either "u" or "w" for # regular undefined or weak undefined, respectively. # # If you want to obtain dependency information per function granularity, # compile source files with the -ffunction-sections compiler flag. )"; auto println = [&](auto &src, Symbol &sym, ElfSym &esym) { if (InputSection *isec = sym.get_input_section()) Out(ctx) << src << "\t" << *isec << "\t" << (esym.is_weak() ? 'w' : 'u') << "\t" << sym; else Out(ctx) << src << "\t" << *sym.file << "\t" << (esym.is_weak() ? 'w' : 'u') << "\t" << sym; }; for (ObjectFile *file : ctx.objs) { for (std::unique_ptr> &isec : file->sections) { if (!isec) continue; std::unordered_set visited; for (const ElfRel &r : isec->get_rels(ctx)) { if (r.r_type == R_NONE || file->elf_syms.size() <= r.r_sym) continue; ElfSym &esym = file->elf_syms[r.r_sym]; Symbol &sym = *file->symbols[r.r_sym]; if (esym.is_undef() && sym.file && sym.file != file && visited.insert((void *)&sym).second) println(*isec, sym, esym); } } } for (SharedFile *file : ctx.dsos) { for (i64 i = file->first_global; i < file->symbols.size(); i++) { ElfSym &esym = file->elf_syms[i]; Symbol &sym = *file->symbols[i]; if (esym.is_undef() && sym.file && sym.file != file) println(*file, sym, esym); } } } template static std::string create_response_file(Context &ctx) { std::string buf; std::stringstream out; std::string cwd = std::filesystem::current_path().string(); out << "-C " << cwd.substr(1) << "\n"; if (cwd != "/") { out << "--chroot .."; i64 depth = ranges::count(cwd, '/'); for (i64 i = 1; i < depth; i++) out << "/.."; out << "\n"; } for (i64 i = 1; i < ctx.cmdline_args.size(); i++) { std::string_view arg = ctx.cmdline_args[i]; if (arg != "-repro" && arg != "--repro") out << arg << "\n"; } return out.str(); } template void write_repro_file(Context &ctx) 
{ Timer t(ctx, "write_repro_file"); std::string path = ctx.arg.output + ".repro.tar"; std::unique_ptr tar = TarWriter::open(path, path_filename(ctx.arg.output) + ".repro"); if (!tar) Fatal(ctx) << "cannot open " << path << ": " << errno_string(); tar->append("response.txt", create_response_file(ctx)); tar->append("version.txt", mold_version + "\n"); std::unordered_set seen; for (std::unique_ptr &mf : ctx.mf_pool) { if (!mf->parent && seen.insert(mf->name).second) { // We reopen a file because we may have modified the contents of mf // in memory, which is mapped with PROT_WRITE and MAP_PRIVATE. MappedFile *mf2 = must_open_file(ctx, mf->name); tar->append(std::filesystem::absolute(mf->name).string(), mf2->get_contents()); mf2->unmap(); } } } template void check_duplicate_symbols(Context &ctx) { Timer t(ctx, "check_duplicate_symbols"); tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { if (!file->is_reachable) return; for (i64 i = file->first_global; i < file->elf_syms.size(); i++) { const ElfSym &esym = file->elf_syms[i]; Symbol &sym = *file->symbols[i]; // Skip if our symbol is undef or weak if (!sym.file || sym.file == file || sym.file == ctx.internal_obj || esym.is_undef() || esym.is_common() || (esym.st_bind == STB_WEAK)) continue; // Skip if our symbol is in a dead section. In most cases, the // section has been eliminated due to comdat deduplication. if (!esym.is_abs()) { InputSection *isec = file->get_section(esym); if (!isec || !isec->is_alive) continue; } // Skip if the symbol is a deduplicated comdat symbol that is in // an IR file. if (file->is_lto_obj) if (ComdatGroup *g = file->lto_comdat_groups[i]) if (g->owner != file->priority) continue; Error(ctx) << "duplicate symbol: " << *file << ": " << *sym.file << ": " << sym; } }); ctx.checkpoint(); } // GCC and Clang set the SHT_NOBITS flag for an output section only if the // section name is .bss or similar. 
Sections with nonstandard names, such // as those defined with __attribute__((section(".sectname"))), are always // emitted as non-BSS sections even if they contain only uninitialized // variables. // // This function finds such allocated but all-zero sections and converts // them into BSS, reducing the output file size. template void convert_zero_to_bss(Context &ctx) { Timer t(ctx, "convert_zero_to_bss"); tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { if (!file->is_reachable) return; for (std::unique_ptr> &isec : file->sections) { if (isec && isec->is_alive && isec->shdr().sh_type == SHT_PROGBITS && (isec->shdr().sh_flags & SHF_ALLOC) && (isec->shdr().sh_flags & SHF_WRITE) && !(isec->shdr().sh_flags & SHF_EXECINSTR) && isec->get_rels(ctx).empty() && !isec->contents.empty() && isec->contents.find_first_not_of('\0') == isec->contents.npos) { isec->shdr().sh_type = SHT_NOBITS; isec->contents = {}; } } }); } // If --no-allow-shlib-undefined is specified, we report errors on // unresolved symbols in shared libraries. This is useful when you are // creating a final executable and want to make sure that all symbols // including ones in shared libraries have been resolved. // // If you do not pass --no-allow-shlib-undefined, undefined symbols in // shared libraries will be reported as run-time error by the dynamic // linker. template void check_shlib_undefined(Context &ctx) { Timer t(ctx, "check_shlib_undefined"); auto is_sparc_register = [](const ElfSym &esym) { // Dynamic symbol table for SPARC contains bogus entries which // we need to ignore if constexpr (is_sparc) return esym.st_type == STT_SPARC_REGISTER; return false; }; // Skip test if we don't have a complete set of shared object files // for the program, because if there's a missing .so, an undefined // symbol might be defined by that library. 
auto do_test = [&] { std::unordered_set sonames; for (std::unique_ptr> &file : ctx.dso_pool) sonames.insert(file->soname); for (SharedFile *file : ctx.dsos) for (std::string_view soname : file->get_dt_needed(ctx)) if (!sonames.contains(soname)) return false; return true; }; if (do_test()) { tbb::parallel_for_each(ctx.dsos, [&](SharedFile *file) { // Check if all undefined symbols have been resolved. for (i64 i = 0; i < file->elf_syms.size(); i++) { const ElfSym &esym = file->elf_syms[i]; Symbol &sym = *file->symbols[i]; if (esym.is_undef() && !esym.is_weak() && !sym.file && !is_sparc_register(esym)) Error(ctx) << *file << ": --no-allow-shlib-undefined: undefined symbol: " << sym; } }); } // Beyond this point, DSOs that are not referenced directly by any // object file are not needed. They were kept by // SharedFile::mark_live_objects just for this pass. Therefore, // remove unneeded DSOs from the list now. for (SharedFile *file : ctx.dsos) file->is_reachable = !file->as_needed; tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { for (Symbol *sym : file->get_global_syms()) if (InputFile *file = sym->file) if (file->is_dso) file->is_reachable.test_and_set(); }); std::erase_if(ctx.dsos, [](SharedFile *file) { return !file->is_reachable; }); } template void check_symbol_types(Context &ctx) { Timer t(ctx, "check_symbol_types"); auto canonicalize = [](u32 ty) -> u32 { if (ty == STT_GNU_IFUNC) return STT_FUNC; if (ty == STT_COMMON) return STT_OBJECT; return ty; }; auto check = [&](InputFile &file, Symbol &sym, const ElfSym &esym1, const ElfSym &esym2) { if (sym.file && sym.file != &file && esym1.st_type != STT_NOTYPE && esym2.st_type != STT_NOTYPE && canonicalize(esym1.st_type) != canonicalize(esym2.st_type)) { Warn(ctx) << "symbol type mismatch: " << sym << '\n' << ">>> defined in " << *sym.file << " as " << stt_to_string(esym1.st_type) << '\n' << ">>> defined in " << file << " as " << stt_to_string(esym2.st_type); } }; tbb::parallel_for_each(ctx.objs, 
[&](ObjectFile *file) { for (i64 i = file->first_global; i < file->elf_syms.size(); i++) if (Symbol *sym = file->symbols[i]; sym->file && sym->file != file) check(*file, *sym, sym->esym(), file->elf_syms[i]); }); tbb::parallel_for_each(ctx.dsos, [&](SharedFile *file) { for (i64 i = file->first_global; i < file->elf_syms.size(); i++) { if (Symbol *sym = file->symbols[i]; sym->file && sym->file != file) check(*file, *sym, sym->esym(), file->elf_syms[i]); if (Symbol *sym = file->symbols2[i]) if (sym->file && sym->file != file) check(*file, *sym, sym->esym(), file->elf_syms[i]); } }); } template static i64 get_init_fini_priority(InputSection *isec) { static std::regex re(R"(\.(\d+)$)", std::regex_constants::optimize); std::string_view name = isec->name(); std::cmatch m; if (std::regex_search(name.data(), name.data() + name.size(), m, re)) return std::stoi(m[1]); return 65536; } template static i64 get_ctor_dtor_priority(InputSection *isec) { auto opts = std::regex_constants::optimize | std::regex_constants::ECMAScript; static std::regex re1(R"((?:clang_rt\.)?crtbegin)", opts); static std::regex re2(R"((?:clang_rt\.)?crtend)", opts); static std::regex re3(R"(\.(\d+)$)", opts); // crtbegin.o and crtend.o contain marker symbols such as // __CTOR_LIST__ or __DTOR_LIST__. So they have to be at the // beginning or end of the section. 
std::smatch m; if (std::regex_search(isec->file.filename, m, re1)) return -2; if (std::regex_search(isec->file.filename, m, re2)) return 65536; std::string name(isec->name()); if (std::regex_search(name, m, re3)) return std::stoi(m[1]); return -1; } template void sort_init_fini(Context &ctx) { Timer t(ctx, "sort_init_fini"); struct Entry { InputSection *sect; i64 prio; }; for (Chunk *chunk : ctx.chunks) { if (OutputSection *osec = chunk->to_osec()) { if (osec->name == ".init_array" || osec->name == ".preinit_array" || osec->name == ".fini_array") { if (ctx.arg.shuffle_sections == SHUFFLE_SECTIONS_REVERSE) ranges::reverse(osec->members); std::vector vec; for (InputSection *isec : osec->members) { std::string_view name = isec->name(); if (name.starts_with(".ctors") || name.starts_with(".dtors")) vec.push_back({isec, 65535 - get_ctor_dtor_priority(isec)}); else vec.push_back({isec, get_init_fini_priority(isec)}); } ranges::stable_sort(vec, {}, &Entry::prio); for (i64 i = 0; i < vec.size(); i++) osec->members[i] = vec[i].sect; } } } } template void sort_ctor_dtor(Context &ctx) { Timer t(ctx, "sort_ctor_dtor"); struct Entry { InputSection *sect; i64 prio; }; for (Chunk *chunk : ctx.chunks) { if (OutputSection *osec = chunk->to_osec()) { if (osec->name == ".ctors" || osec->name == ".dtors") { if (ctx.arg.shuffle_sections != SHUFFLE_SECTIONS_REVERSE) ranges::reverse(osec->members); std::vector vec; for (InputSection *isec : osec->members) vec.push_back({isec, get_ctor_dtor_priority(isec)}); ranges::stable_sort(vec, {}, &Entry::prio); for (i64 i = 0; i < vec.size(); i++) osec->members[i] = vec[i].sect; } } } } // Returns true if a given section contains a DWARF32 debug record. // `isec` must be a .debug_info section. template static bool is_dwarf32(Context &ctx, InputSection *isec) { if (!isec) return true; if (isec->sh_size < 12) { // The section is too short. 
This is a user error, but instead of // being nitpicky about it, we simply handle it on a garbage-in, // garbage-out basis. return true; } // A .debug_info section contains compilation units (CUs). A 32-bit CU // starts with a 32-bit size field, while a 64-bit CU starts with a // magic number 0xffff'ffff followed by a 64-bit size field. // // Note that size doesn't take the size field itself into account, so // the actual size of a 64-bit CU including the size field is 12 bytes // larger than the value in the size field. u8 buf[12]; isec->copy_contents_to(ctx, buf, 12); if (*(U32 *)buf != 0xffff'ffff) return true; if (*(U64 *)(buf + 4) + 12 == isec->sh_size) return false; // An input .debug_info section usually contains a single CU. However, // if the linker combines multiple object files using `-r`, the // resulting object file may have a .debug_info section with as many CUs // as there are in the input files. Therefore, if the first CU doesn't // cover the entire .debug_info section, we need to keep reading until // the end of the section. // // An input .debug_info section may be compressed using zlib or zstd, so // we need to uncompress it before accessing `isec->contents`. isec->uncompress(ctx); u8 *p = (u8 *)isec->contents.data() + *(U64 *)(buf + 4) + 12; u8 *end = (u8 *)isec->contents.data() + isec->sh_size; while (end - p >= 12) { if (*(U32 *)p != 0xffff'ffff) return true; p += *(U64 *)(p + 4) + 12; } return false; } // Debug sections in an input object file refer to other debug sections in // the same file using section offsets. The offsets are 64 bits in DWARF64 // and 32 bits in DWARF32. // // GCC and Clang emit DWARF32 debug info by default even for 64-bit code. // That is, 64-bit values are used for addresses, and 32-bit values for // references between debug sections. This makes sense for ordinary programs // because it reduces the size of the debug info sections. // // You can change the format to DWARF64 by passing `-gdwarf64`. 
Therefore, // the "right" approach to build an extremely large program in debug mode is // to recompile everything with `-gdwarf64`. However, that’s often not // feasiable for various reasons. // // If we don't do anything about it, a relocation overflow could occur if // any output debug section exceeds 4 GiB in size, making it almost // impossible for users to link an object file compiled without `-gdwarf64` // to an extremely large program. // // This function works around the issue by sorting output debug section // contents so that DWARF32 input sections are at the start of the output // section followed by DWARF64 input sections. By doing this, we can avoid // relocation overflow until the total size of DWARF32 input sections alone // exceeds 4 GiB. template void sort_debug_info_sections(Context &ctx) { Timer t(ctx, "sort_debug_info_sections"); // True if mold is running under ctest bool is_in_test = false; if (char *env = getenv("MOLD_DEBUG"); env && env[0]) is_in_test = true; // Get lists of output debug sections that need sorting std::vector *> vec1; std::vector *> vec2; for (Chunk *chunk : ctx.chunks) if (OutputSection *osec = chunk->to_osec()) if (!(osec->shdr.sh_flags & SHF_ALLOC) && osec->name.starts_with(".debug_")) if (osec->shdr.sh_size >= UINT32_MAX || is_in_test) vec1.push_back(osec); for (std::unique_ptr> &osec : ctx.merged_sections) if (!(osec->shdr.sh_flags & SHF_ALLOC) && osec->name.starts_with(".debug_")) if (osec->shdr.sh_size >= UINT32_MAX || is_in_test) vec2.push_back(osec.get()); if (vec1.empty() && vec2.empty()) return; // Read each input file's .debug_info to record whether the file contains // DWARF32 or DWARF64 tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { file->is_dwarf32 = is_dwarf32(ctx, file->debug_info); }); // Unless we have a mix of DWARF32 and DWARF64, it doesn't make sense to // sort sections. 
if (ranges::all_of(ctx.objs, &ObjectFile::is_dwarf32) || ranges::none_of(ctx.objs, &ObjectFile::is_dwarf32)) return; // Reorder input sections in the output section so that DWARF32 // precededs DWARF64 tbb::parallel_for_each(vec1, [&](OutputSection *osec) { ranges::stable_partition(osec->members, [](InputSection *isec) { return isec->file.is_dwarf32; }); osec->compute_section_size(ctx); }); // Reorder strings in .debug_str and the like tbb::parallel_for_each(vec2, [&](MergedSection *osec) { tbb::parallel_for_each(osec->members, [&](MergeableSection *m) { if (m->input_section->file.is_dwarf32) for (SectionFragment *frag : m->fragments) frag->is_32bit = true; }); }); tbb::parallel_for_each(vec2, [&](MergedSection *osec) { osec->compute_section_size(ctx); }); } // .ctors/.dtors serves the same purpose as .init_array/.fini_array, // albeit with very subtly differences. Both contain pointers to // initializer/finalizer functions. The runtime executes them one by one // but in the exact opposite order to one another. Therefore, if we are to // place the contents of .ctors/.dtors into .init_array/.fini_array, we // need to reverse them. // // It's unfortunate that we have both .ctors/.dtors and // .init_array/.fini_array in ELF for historical reasons, but that's // the reality we need to deal with. 
template void fixup_ctors_in_init_array(Context &ctx) { Timer t(ctx, "fixup_ctors_in_init_array"); auto reverse_contents = [&](InputSection &isec) { if (isec.sh_size % sizeof(Word)) Fatal(ctx) << isec << ": section corrupted"; u8 *buf = (u8 *)isec.contents.data(); std::reverse((Word *)buf, (Word *)(buf + isec.sh_size)); std::span> rels = isec.get_rels(ctx); for (ElfRel &r : rels) r.r_offset = isec.sh_size - r.r_offset - sizeof(Word); ranges::stable_sort(rels, {}, &ElfRel::r_offset); }; if (Chunk *chunk = find_chunk(ctx, ".init_array")) if (OutputSection *osec = chunk->to_osec()) for (InputSection *isec : osec->members) if (isec->name().starts_with(".ctors")) reverse_contents(*isec); if (Chunk *chunk = find_chunk(ctx, ".fini_array")) if (OutputSection *osec = chunk->to_osec()) for (InputSection *isec : osec->members) if (isec->name().starts_with(".dtors")) reverse_contents(*isec); } template static void shuffle(std::vector *> &vec, u64 seed) { if (vec.empty()) return; // Xorshift random number generator. We use this RNG because it is // measurably faster than MT19937. auto rand = [&] { seed ^= seed << 13; seed ^= seed >> 7; seed ^= seed << 17; return seed; }; // The Fisher-Yates shuffling algorithm. // // We don't want to use std::shuffle for build reproducibility. That is, // std::shuffle's implementation is not guaranteed to be the same across // platform, so even though the result is guaranteed to be randomly // shuffled, the exact order may be different across implementations. // // We are not using std::uniform_int_distribution for the same reason. 
for (i64 i = 0; i < vec.size() - 1; i++) std::swap(vec[i], vec[i + rand() % (vec.size() - i)]); } template void shuffle_sections(Context &ctx) { Timer t(ctx, "shuffle_sections"); auto is_eligible = [](OutputSection *osec) { if (osec) { std::string_view name = osec->name; return (osec->shdr.sh_flags & SHF_ALLOC) && name != ".init" && name != ".fini" && name != ".ctors" && name != ".dtors" && name != ".init_array" && name != ".preinit_array" && name != ".fini_array"; } return false; }; switch (ctx.arg.shuffle_sections) { case SHUFFLE_SECTIONS_SHUFFLE: { tbb::parallel_for_each(ctx.chunks, [&](Chunk *chunk) { if (OutputSection *osec = chunk->to_osec(); is_eligible(osec)) { u64 seed = ctx.arg.shuffle_sections_seed + hash_string(osec->name); shuffle(osec->members, seed); } }); break; } case SHUFFLE_SECTIONS_REVERSE: tbb::parallel_for_each(ctx.chunks, [&](Chunk *chunk) { if (OutputSection *osec = chunk->to_osec(); is_eligible(osec)) ranges::reverse(osec->members); }); break; default: unreachable(); } } template void add_dynamic_strings(Context &ctx) { for (SharedFile *file : ctx.dsos) { std::string_view s = file->get_dt_audit(ctx); if (!s.empty()) { if (!ctx.arg.depaudit.empty()) ctx.arg.depaudit += ':'; ctx.arg.depaudit += std::string(s); } } auto add = [&](std::string_view s) { if (!s.empty()) ctx.dynstr->add_string(s); }; for (SharedFile *file : ctx.dsos) add(file->soname); for (std::string_view str : ctx.arg.auxiliary) add(str); for (std::string_view str : ctx.arg.filter) add(str); add(ctx.arg.audit); add(ctx.arg.depaudit); add(ctx.arg.rpaths); add(ctx.arg.soname); } template void compute_section_sizes(Context &ctx) { Timer t(ctx, "compute_section_sizes"); if constexpr (needs_thunk) { std::vector *> vec = ctx.chunks; auto tail = ranges::partition(vec, [&](Chunk *chunk) { return chunk->to_osec() && (chunk->shdr.sh_flags & SHF_EXECINSTR) && !ctx.arg.relocatable; }); // create_range_extension_thunks is not thread-safe for (Chunk *chunk : std::span(vec.begin(), 
tail.begin())) chunk->to_osec()->create_range_extension_thunks(ctx); tbb::parallel_for_each(tail, [&](Chunk *chunk) { chunk->compute_section_size(ctx); }); } else { tbb::parallel_for_each(ctx.chunks, [&](Chunk *chunk) { chunk->compute_section_size(ctx); }); } } // Find all unresolved symbols and attach them to the most appropriate files. // // Note that even a symbol that will be reported as an undefined symbol // will get an owner file in this function. Such symbol will be reported // by ObjectFile::scan_relocations(). This is because we want to report // errors only on symbols that are actually referenced. template void claim_unresolved_symbols(Context &ctx) { Timer t(ctx, "claim_unresolved_symbols"); tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { if (file == ctx.internal_obj) return; for (i64 i = file->first_global; i < file->elf_syms.size(); i++) { const ElfSym &esym = file->elf_syms[i]; Symbol &sym = *file->symbols[i]; if (!esym.is_undef()) continue; std::scoped_lock lock(sym.mu); if (sym.file) if (!sym.esym().is_undef() || sym.file->priority <= file->priority) continue; auto claim = [&](bool is_imported) { if (sym.is_traced) Out(ctx) << "trace-symbol: " << *file << ": unresolved" << (esym.is_weak() ? " weak" : "") << " symbol " << sym; sym.file = file; sym.origin = 0; sym.value = 0; sym.sym_idx = i; sym.is_weak = false; sym.is_imported = is_imported; sym.is_exported = false; sym.ver_idx = is_imported ? 0 : ctx.default_version; }; if (esym.is_undef_weak()) { if (ctx.arg.z_dynamic_undefined_weak && sym.visibility != STV_HIDDEN) { // Global weak undefined symbols are promoted to dynamic symbols // by default only when linking a DSO. We generally cannot do that // for executables because we may need to create a copy relocation // for a data symbol, but the symbol size is not available for an // unclaimed weak symbol. 
// // In contrast, GNU ld promotes weak symbols to dynamic ones even // for an executable as long as they don't need copy relocations // (i.e. they need only PLT entries.) That may result in an // inconsistent behavior of a linked program depending on whether // whether its object files were compiled with -fPIC or not. I think // that's bad semantics, so we don't do that. claim(true); } else { // Otherwise, weak undefs are converted to absolute symbols with value 0. claim(false); } continue; } // Traditionally, remaining undefined symbols cause a link failure // only when we are creating an executable. Undefined symbols in // shared objects are promoted to dynamic symbols, so that they'll // get another chance to be resolved at run-time. You can change the // behavior by passing `-z defs` to the linker. // // Even if `-z defs` is given, weak undefined symbols are still // promoted to dynamic symbols for compatibility with other linkers. // Some major programs, notably Firefox, depend on the behavior // (they use this loophole to export symbols from libxul.so). if (ctx.arg.shared && sym.visibility != STV_HIDDEN && ctx.arg.unresolved_symbols != UNRESOLVED_ERROR) { claim(true); continue; } // Convert remaining undefined symbols to absolute symbols with value 0. claim(false); } }); } template void scan_relocations(Context &ctx) { Timer t(ctx, "scan_relocations"); // Scan relocations to find dynamic symbols. tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { file->scan_relocations(ctx); }); // Word-size absolute relocations (e.g. R_X86_64_64) are handled // separately because they can be promoted to dynamic relocations. tbb::parallel_for_each(ctx.chunks, [&](Chunk *chunk) { if (OutputSection *osec = chunk->to_osec()) if (osec->shdr.sh_flags & SHF_ALLOC) osec->scan_abs_relocations(ctx); }); // Exit if there was a relocation that refers an undefined symbol. ctx.checkpoint(); // Aggregate dynamic symbols to a single vector. 
std::vector *> files; append(files, ctx.objs); append(files, ctx.dsos); std::vector *>> vec(files.size()); tbb::parallel_for((i64)0, (i64)files.size(), [&](i64 i) { for (Symbol *sym : files[i]->symbols) if (sym->file == files[i]) if (sym->flags || sym->is_imported || sym->is_exported) vec[i].push_back(sym); }); std::vector *> syms = flatten(vec); ctx.symbol_aux.reserve(syms.size()); if (ctx.needs_tlsld) ctx.got->add_tlsld(ctx); // Assign offsets in additional tables for each dynamic symbol. for (Symbol *sym : syms) { sym->add_aux(ctx); if (sym->is_imported || sym->is_exported) ctx.dynsym->add_symbol(ctx, sym); if (sym->flags & NEEDS_GOT) ctx.got->add_got_symbol(ctx, sym); if (sym->flags & NEEDS_CPLT) { sym->is_canonical = true; // A canonical PLT needs to be visible from DSOs. sym->is_exported = true; // We can't use .plt.got for a canonical PLT because otherwise // .plt.got and .got would refer to each other, resulting in an // infinite loop at runtime. ctx.plt->add_symbol(ctx, sym); } else if (sym->flags & NEEDS_PLT) { if (sym->flags & NEEDS_GOT) ctx.pltgot->add_symbol(ctx, sym); else ctx.plt->add_symbol(ctx, sym); } if (sym->flags & NEEDS_GOTTP) ctx.got->add_gottp_symbol(ctx, sym); if (sym->flags & NEEDS_TLSGD) ctx.got->add_tlsgd_symbol(ctx, sym); if (sym->flags & NEEDS_TLSDESC) ctx.got->add_tlsdesc_symbol(ctx, sym); if (sym->flags & NEEDS_COPYREL) { if (ctx.arg.z_relro && sym->file->is_dso && ((SharedFile *)sym->file)->is_readonly(sym)) ctx.copyrel_relro->add_symbol(ctx, sym); else ctx.copyrel->add_symbol(ctx, sym); } if constexpr (is_ppc64v1) if (sym->flags & NEEDS_PPC_OPD) ctx.extra.opd->add_symbol(ctx, sym); sym->flags = 0; } if (ctx.has_textrel && ctx.arg.warn_textrel) Warn(ctx) << "creating a DT_TEXTREL in an output file"; } // Compute the is_weak bit for each imported symbol. // // If all references to a shared symbol is weak, the symbol is marked // as weak in .dynsym. 
template void compute_imported_symbol_weakness(Context &ctx) { Timer t(ctx, "compute_imported_symbol_weakness"); tbb::parallel_for_each(ctx.objs, [](ObjectFile *file) { for (i64 i = file->first_global; i < file->elf_syms.size(); i++) { const ElfSym &esym = file->elf_syms[i]; Symbol &sym = *file->symbols[i]; if (esym.is_undef() && !esym.is_weak() && sym.file && sym.file->is_dso) { std::scoped_lock lock(sym.mu); sym.is_weak = false; } } }); } // Report all undefined symbols, grouped by symbol. template void report_undef_errors(Context &ctx) { constexpr i64 MAX_ERRORS = 3; if (ctx.arg.unresolved_symbols == UNRESOLVED_IGNORE) return; for (auto &pair : ctx.undef_errors) { Symbol *sym = pair.first; std::span errors = pair.second; std::stringstream ss; ss << "undefined symbol: " << (ctx.arg.demangle ? demangle(*sym) : sym->name()) << "\n"; for (i64 i = 0; i < errors.size() && i < MAX_ERRORS; i++) ss << errors[i]; if (MAX_ERRORS < errors.size()) ss << ">>> referenced " << (errors.size() - MAX_ERRORS) << " more times\n"; // Remove the trailing '\n' because Error/Warn adds it automatically std::string msg = ss.str(); msg.pop_back(); if (ctx.arg.unresolved_symbols == UNRESOLVED_ERROR) Error(ctx) << msg; else Warn(ctx) << msg; } ctx.checkpoint(); } template void create_reloc_sections(Context &ctx) { Timer t(ctx, "create_reloc_sections"); // Create .rela.* sections tbb::parallel_for((i64)0, (i64)ctx.chunks.size(), [&](i64 i) { if (OutputSection *osec = ctx.chunks[i]->to_osec()) osec->reloc_sec.reset(new RelocSection(ctx, *osec)); }); for (i64 i = 0, end = ctx.chunks.size(); i < end; i++) if (OutputSection *osec = ctx.chunks[i]->to_osec()) if (RelocSection *x = osec->reloc_sec.get()) ctx.chunks.push_back(x); } // Copy chunks to an output file template void copy_chunks(Context &ctx) { Timer t(ctx, "copy_chunks"); auto copy = [&](Chunk &chunk) { std::string name = chunk.name.empty() ? 
"(header)" : std::string(chunk.name); Timer t2(ctx, name, &t); chunk.copy_buf(ctx); }; // For --relocatable and --emit-relocs, we want to copy non-relocation // sections first. This is because REL-type relocation sections (as // opposed to RELA-type) stores relocation addends to target sections. // // We also does that for SH4 because despite being RELA, we always need // to write addends to relocated places for SH4. auto is_rel = [](Chunk &chunk) { return chunk.shdr.sh_type == SHT_REL || (is_sh4 && chunk.shdr.sh_type == SHT_RELA); }; tbb::parallel_for_each(ctx.chunks, [&](Chunk *chunk) { if (!is_rel(*chunk)) copy(*chunk); }); tbb::parallel_for_each(ctx.chunks, [&](Chunk *chunk) { if (is_rel(*chunk)) copy(*chunk); }); // Undefined symbols in SHF_ALLOC sections are found by scan_relocations(), // but those in non-SHF_ALLOC sections cannot be found until we copy section // contents. So we need to call this function again to report possible // undefined errors. report_undef_errors(ctx); // Zero-clear paddings between chunks auto zero = [&](Chunk *chunk, i64 next_start) { i64 pos = chunk->shdr.sh_offset + chunk->shdr.sh_size; memset(ctx.buf + pos, 0, next_start - pos); }; std::vector *> chunks = ctx.chunks; std::erase_if(chunks, [](Chunk *chunk) { return chunk->shdr.sh_type == SHT_NOBITS; }); for (i64 i = 1; i < chunks.size(); i++) zero(chunks[i - 1], chunks[i]->shdr.sh_offset); zero(chunks.back(), ctx.output_file->filesize); } template void construct_relr(Context &ctx) { Timer t(ctx, "construct_relr"); tbb::parallel_for_each(ctx.chunks, [&](Chunk *chunk) { chunk->construct_relr(ctx); }); } // The hash function for .gnu.hash. static u32 djb_hash(std::string_view name) { u32 h = 5381; for (u8 c : name) h = (h << 5) + h + c; return h; } template void sort_dynsyms(Context &ctx) { Timer t(ctx, "sort_dynsyms"); std::span *> syms = ctx.dynsym->symbols; if (syms.empty()) return; // In any symtab, local symbols must precede global symbols. 
auto globals = ranges::stable_partition(syms.subspan(1), [&](Symbol *sym) { return sym->is_local(ctx); }); // .gnu.hash imposes more restrictions on the order of the symbols in // .dynsym. if (ctx.gnu_hash) { auto exported_syms = ranges::stable_partition(globals, [](Symbol *sym) { return !sym->is_exported; }); // Count the number of exported symbols to compute the size of .gnu.hash. i64 num_exported = exported_syms.size(); u32 num_buckets = num_exported / ctx.gnu_hash->LOAD_FACTOR + 1; tbb::parallel_for_each(exported_syms, [&](Symbol *sym) { sym->set_djb_hash(ctx, djb_hash(sym->name())); }); tbb::parallel_sort(exported_syms, [&](Symbol *a, Symbol *b) { return std::tuple(a->get_djb_hash(ctx) % num_buckets, a->name()) < std::tuple(b->get_djb_hash(ctx) % num_buckets, b->name()); }); ctx.gnu_hash->num_buckets = num_buckets; ctx.gnu_hash->num_exported = num_exported; } // Compute .dynstr size ctx.dynsym->dynstr_offset = ctx.dynstr->shdr.sh_size; tbb::enumerable_thread_specific size; tbb::parallel_for((i64)1, (i64)syms.size(), [&](i64 i) { syms[i]->set_dynsym_idx(ctx, i); size.local() += syms[i]->name().size() + 1; }); ctx.dynstr->shdr.sh_size += size.combine(std::plus()); // ELF's symbol table sh_info holds the offset of the first global symbol. 
ctx.dynsym->shdr.sh_info = globals.begin() - syms.begin(); } template void create_output_symtab(Context &ctx) { Timer t(ctx, "compute_symtab_size"); if constexpr (needs_thunk) { i64 n = 0; for (Chunk *chunk : ctx.chunks) if (OutputSection *osec = chunk->to_osec()) for (std::unique_ptr> &thunk : osec->thunks) thunk->name = "thunk" + std::to_string(n++); } tbb::parallel_for_each(ctx.chunks, [&](Chunk *chunk) { chunk->compute_symtab_size(ctx); }); tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { file->compute_symtab_size(ctx); }); tbb::parallel_for_each(ctx.dsos, [&](SharedFile *file) { file->compute_symtab_size(ctx); }); } template void apply_version_script(Context &ctx) { Timer t(ctx, "apply_version_script"); // Assign versions to symbols specified with `extern "C++"` or // wildcard patterns first. Glob matcher; Glob cpp_matcher; // The "local:" label has a special meaning in the version script. // It can appear in any VERSION clause, and it hides matched symbols // unless other non-local patterns match to them. In other words, // "local:" has lower precedence than other version definitions. // // If two or more non-local patterns match to the same symbol, the // last one takes precedence. 
std::vector patterns = ctx.version_patterns; ranges::stable_partition(patterns, [](const VersionPattern &pat) { return pat.ver_idx == VER_NDX_LOCAL; }); auto has_wildcard = [](std::string_view str) { return str.find_first_of("*?[") != str.npos; }; for (i64 i = 0; i < patterns.size(); i++) { VersionPattern &v = patterns[i]; if (v.is_cpp) { if (!cpp_matcher.add(v.pattern, i)) Fatal(ctx) << "invalid version pattern: " << v.pattern; } else if (has_wildcard(v.pattern)) { if (!matcher.add(v.pattern, i)) Fatal(ctx) << "invalid version pattern: " << v.pattern; } } if (!matcher.empty() || !cpp_matcher.empty()) { tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { for (Symbol *sym : file->get_global_syms()) { if (sym->file != file) continue; std::string_view name = sym->name(); i64 match = matcher.find(name); // Match non-mangled symbols against the C++ pattern as well. // Weird, but required to match other linkers' behavior. if (!cpp_matcher.empty()) { if (std::optional s = demangle_cpp(name)) name = *s; match = std::max(match, cpp_matcher.find(name)); } if (match != -1) sym->ver_idx = patterns[match].ver_idx; } }); } // Next, assign versions to symbols specified by exact name. // In other words, exact matches have higher precedence over // wildcard or `extern "C++"` patterns. 
for (VersionPattern &v : patterns) { if (!v.is_cpp && !has_wildcard(v.pattern)) { Symbol *sym = get_symbol(ctx, v.pattern); if (!sym->file && !ctx.arg.undefined_version) Warn(ctx) << v.source << ": cannot assign version `" << v.ver_str << "` to symbol `" << *sym << "`: symbol not found"; if (sym->file && !sym->file->is_dso) sym->ver_idx = v.ver_idx; } } } template void parse_symbol_version(Context &ctx) { if (!ctx.arg.shared) return; Timer t(ctx, "parse_symbol_version"); std::unordered_map verdefs; for (i64 i = 0; i < ctx.arg.version_definitions.size(); i++) verdefs[ctx.arg.version_definitions[i]] = i + VER_NDX_LAST_RESERVED + 1; tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { if (file == ctx.internal_obj) return; for (i64 i = file->first_global; i < file->elf_syms.size(); i++) { // Match VERSION part of symbol foo@VERSION with version definitions. if (!file->has_symver[i - file->first_global]) continue; Symbol *sym = file->symbols[i]; if (sym->file != file) continue; const char *name = file->symbol_strtab.data() + file->elf_syms[i].st_name; std::string_view ver = strchr(name, '@') + 1; bool is_default = false; if (ver.starts_with('@')) { is_default = true; ver = ver.substr(1); } auto it = verdefs.find(ver); if (it == verdefs.end()) { Error(ctx) << *file << ": symbol " << *sym << " has undefined version " << ver; continue; } sym->ver_idx = it->second; if (!is_default) sym->ver_idx |= VERSYM_HIDDEN; // If both symbol `foo` and `foo@VERSION` are defined, `foo@VERSION` // hides `foo` so that all references to `foo` are resolved to a // versioned symbol. Likewise, if `foo@VERSION` and `foo@@VERSION` are // defined, the default one takes precedence. 
Symbol *sym2 = get_symbol(ctx, sym->name()); if (sym2 != sym && sym2->file == file && !file->has_symver[sym2->sym_idx - file->first_global]) if (sym2->ver_idx == ctx.default_version || (sym2->ver_idx & ~VERSYM_HIDDEN) == (sym->ver_idx & ~VERSYM_HIDDEN)) sym2->ver_idx = VER_NDX_LOCAL; } }); } template static bool should_export(Context &ctx, Symbol &sym) { if (sym.visibility == STV_HIDDEN) return false; switch (sym.ver_idx) { case VER_NDX_UNSPECIFIED: if (ctx.arg.dynamic_list_data) if (u32 ty = sym.get_type(); ty != STT_FUNC && ty != STT_GNU_IFUNC) return true; if (ctx.arg.shared) return !((ObjectFile *)sym.file)->exclude_libs; return ctx.arg.export_dynamic; case VER_NDX_LOCAL: return false; default: return true; } }; template static bool is_protected(Context &ctx, Symbol &sym) { if (sym.visibility == STV_PROTECTED) return true; switch (ctx.arg.Bsymbolic) { case BSYMBOLIC_ALL: return true; case BSYMBOLIC_NONE: return false; case BSYMBOLIC_FUNCTIONS: return sym.get_type() == STT_FUNC; case BSYMBOLIC_NON_WEAK: return !sym.is_weak; case BSYMBOLIC_NON_WEAK_FUNCTIONS: return !sym.is_weak && sym.get_type() == STT_FUNC; default: unreachable(); } } template void compute_import_export(Context &ctx) { Timer t(ctx, "compute_import_export"); // If we are creating an executable, we want to export symbols referenced // by DSOs unless they are explicitly marked as local by a version script. if (!ctx.arg.shared) { tbb::parallel_for_each(ctx.dsos, [](SharedFile *file) { for (Symbol *sym : file->symbols) { if (sym->file && !sym->file->is_dso && sym->visibility != STV_HIDDEN && sym->ver_idx != VER_NDX_LOCAL) { std::scoped_lock lock(sym->mu); sym->is_exported = true; } } }); } // Export symbols that are not hidden or marked as local. // We also want to mark imported symbols as such. tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { for (Symbol *sym : file->get_global_syms()) { // If we are using a symbol in a DSO, we need to import it. 
if (sym->file && sym->file->is_dso) { std::scoped_lock lock(sym->mu); sym->is_imported = true; continue; } // If we have a definition of a symbol, we may want to export it. if (sym->file == file && should_export(ctx, *sym)) { sym->is_exported = true; // Exported symbols are marked as imported as well by default // for DSOs. if (ctx.arg.shared && !is_protected(ctx, *sym)) sym->is_imported = true; } } }); // Apply --dynamic-list, --export-dynamic-symbol and // --export-dynamic-symbol-list options. // // The semantics of these options vary depending on whether we are // creating an executalbe or a shared object. // // For executable, matched symbols are exported. // // For shared objects, matched symbols are imported if it is already // exported so that they are interposable. In other words, symbols // that did not match will be bound locally within the output file, // effectively turning them into protected symbols. Glob matcher; Glob cpp_matcher; auto handle_match = [&](Symbol *sym) { if (ctx.arg.shared) { if (sym->is_exported) sym->is_imported = true; } else { if (sym->file && !sym->file->is_dso && sym->visibility != STV_HIDDEN) sym->is_exported = true; } }; for (DynamicPattern &p : ctx.dynamic_list_patterns) { if (p.is_cpp) { if (!cpp_matcher.add(p.pattern, 1)) Fatal(ctx) << p.source << ": invalid dynamic list entry: " << p.pattern; continue; } if (p.pattern.find_first_of("*?[") != p.pattern.npos) { if (!matcher.add(p.pattern, 1)) Fatal(ctx) << p.source << ": invalid dynamic list entry: " << p.pattern; continue; } handle_match(get_symbol(ctx, p.pattern)); } if (!matcher.empty() || !cpp_matcher.empty()) { tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { for (Symbol *sym : file->get_global_syms()) { if (sym->file != file) continue; if (ctx.arg.shared && !sym->is_exported) continue; std::string_view name = sym->name(); if (matcher.find(name) != -1) { handle_match(sym); } else if (!cpp_matcher.empty()) { if (std::optional s = demangle_cpp(name)) name = *s; if 
(cpp_matcher.find(name) != -1) handle_match(sym); } } }); } } // Compute the "address-taken" bit for each input section. // // As a space-saving optimization, we want to merge two read-only objects // into a single object if their contents are equivalent. That // optimization is called the Identical Code Folding or ICF. // // A catch is that comparing object contents is not enough to determine if // two objects can be merged safely; we need to take care of pointer // equivalence. // // In C/C++, two pointers are equivalent if and only if they are taken for // the same object. Merging two objects into a single object can break // this assumption because two distinctive pointers would become // equivalent as a result of merging. We can still merge one object with // another if no pointer to the object was taken in code, because without // a pointer, comparing its address becomes moot. // // In mold, each input section has an "address-taken" bit. If there is a // pointer-taking reference to the object, it's set to true. At the ICF // stage, we merge only objects whose addresses were not taken. // // For functions, address-taking relocations are separated from // non-address-taking ones. For example, x86-64 uses R_X86_64_PLT32 for // direct function calls (e.g., "call foo" to call the function foo) while // R_X86_64_PC32 or R_X86_64_GOT32 are used for pointer-taking operations. // // Unfortunately, for data, we can't distinguish between address-taking // relocations and non-address-taking ones. LLVM generates an "address // significance" table in the ".llvm_addrsig" section to mark symbols // whose addresses are taken in code. If that table is available, we use // that information in this function. Otherwise, we conservatively assume // that all data items are address-taken. 
template void compute_address_significance(Context &ctx) { Timer t(ctx, "compute_address_significance"); tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { // If .llvm_addrsig is available, use it. if (InputSection *sec = file->llvm_addrsig.get()) { u8 *p = (u8 *)sec->contents.data(); u8 *end = p + sec->contents.size(); while (p != end) { Symbol *sym = file->symbols[read_uleb(&p)]; if (InputSection *isec = sym->get_input_section()) isec->address_taken = true; } return; } // Otherwise, infer address significance. for (std::unique_ptr> &isec : file->sections) { if (!isec || !isec->is_alive || !(isec->shdr().sh_flags & SHF_ALLOC)) continue; if (!(isec->shdr().sh_flags & SHF_EXECINSTR)) isec->address_taken = true; for (const ElfRel &r : isec->get_rels(ctx)) if (!is_func_call_rel(r)) if (Symbol *sym = file->symbols[r.r_sym]; InputSection *dst = sym->get_input_section()) if (dst->shdr().sh_flags & SHF_EXECINSTR) dst->address_taken = true; } }); auto mark = [](Symbol *sym) { if (sym) if (InputSection *isec = sym->get_input_section()) isec->address_taken = true; }; // Some symbols' pointer values are leaked to the dynamic section. mark(ctx.arg.entry); mark(ctx.arg.init); mark(ctx.arg.fini); // Exported symbols are conservatively considered address-taken. if (ctx.dynsym) for (Symbol *sym : ctx.dynsym->symbols) if (sym && sym->is_exported) mark(sym); } // We want to sort output chunks in the following order. // // // // .interp // .note // .hash // .gnu.hash // .dynsym // .dynstr // .gnu.version // .gnu.version_r // .rela.dyn // .rela.plt // // // // // // .got // .toc // // .relro_padding // // // //
//   .gdb_index
//
// .interp and some other linker-synthesized sections are placed at the
// beginning of a file because they are needed by the loader. Especially
// on a hard drive with spinning disks, it is important to read these
// sections in a single seek.
//
// .note sections are also placed at the beginning so that they are
// included in a core crash dump even if it's truncated by ulimit. In
// particular, if .note.gnu.build-id is in a truncated core file, you
// can at least identify which executable has crashed.
//
// .gdb_index cannot be constructed before applying relocations to
// other debug sections, so we create it after completing the other parts
// of the output file and append it to the very end of the file.
//
// A PT_NOTE segment will contain multiple .note sections if they exist,
// but there's no way to represent a gap between .note sections.
// Therefore, we sort .note sections by decreasing alignment
// requirement. I believe each .note section size is a multiple of its
// alignment, so by sorting them by alignment, we should be able to
// avoid a gap between .note sections.
//
// .toc is placed right after .got for PPC64. PPC-specific .toc section
// contains data that may be accessed with a 16-bit offset relative to
// %r2. %r2 is set to .got + 32 KiB. Therefore, .toc needs to be within
// [.got, .got + 64 KiB).
//
// Other file layouts are possible, but this layout is chosen to keep
// the number of segments as small as possible.
template void sort_output_sections_regular(Context &ctx) { auto get_rank1 = [&](Chunk *chunk) { u64 type = chunk->shdr.sh_type; u64 flags = chunk->shdr.sh_flags; if (chunk == ctx.ehdr) return 0; if (chunk == ctx.phdr) return 1; if (chunk == ctx.interp) return 2; if (type == SHT_NOTE && (flags & SHF_ALLOC)) return 3; if (chunk == ctx.hash) return 4; if (chunk == ctx.gnu_hash) return 5; if (chunk == ctx.dynsym) return 6; if (chunk == ctx.dynstr) return 7; if (chunk == ctx.versym) return 8; if (chunk == ctx.verneed) return 9; if (chunk == ctx.reldyn) return 10; if (chunk == ctx.relplt) return 11; if (chunk == ctx.shdr) return INT32_MAX - 1; if (chunk == ctx.gdb_index) return INT32_MAX; bool alloc = (flags & SHF_ALLOC); bool writable = (flags & SHF_WRITE); bool exec = (flags & SHF_EXECINSTR); bool tls = (flags & SHF_TLS); bool relro = chunk->is_relro; bool is_bss = (type == SHT_NOBITS); return (1 << 10) | (!alloc << 9) | (writable << 8) | (exec << 7) | (!tls << 6) | (!relro << 5) | (is_bss << 4); }; // Ties are broken by additional rules auto get_rank2 = [&](Chunk *chunk) -> i64 { ElfShdr &shdr = chunk->shdr; if (shdr.sh_type == SHT_NOTE) return -shdr.sh_addralign; if (chunk == ctx.got) return 2; if (chunk->name == ".toc") return 3; if (chunk == ctx.relro_padding) return INT64_MAX; return 0; }; ranges::stable_sort(ctx.chunks, {}, [&](Chunk *x) { return std::tuple{get_rank1(x), get_rank2(x), x->name}; }); } template static std::string_view get_section_order_group(Chunk &chunk) { if (chunk.shdr.sh_type == SHT_NOBITS) return "BSS"; if (chunk.shdr.sh_flags & SHF_EXECINSTR) return "TEXT"; if (chunk.shdr.sh_flags & SHF_WRITE) return "DATA"; return "RODATA"; }; // Sort sections according to a --section-order argument. 
template void sort_output_sections_by_order(Context &ctx) { auto get_rank = [&](Chunk *chunk) -> i64 { u64 flags = chunk->shdr.sh_flags; if (chunk == ctx.ehdr && !(chunk->shdr.sh_flags & SHF_ALLOC)) return -2; if (chunk == ctx.phdr && !(chunk->shdr.sh_flags & SHF_ALLOC)) return -1; if (chunk == ctx.shdr) return INT32_MAX; if (!(flags & SHF_ALLOC)) return INT32_MAX - 1; for (i64 i = 0; i < ctx.arg.section_order.size(); i++) if (SectionOrder &arg = ctx.arg.section_order[i]; arg.type == SectionOrder::SECTION && arg.name == chunk->name) return i; std::string_view group = get_section_order_group(*chunk); for (i64 i = 0; i < ctx.arg.section_order.size(); i++) if (SectionOrder &arg = ctx.arg.section_order[i]; arg.type == SectionOrder::GROUP && arg.name == group) return i; Error(ctx) << "--section-order: missing section specification for " << chunk->name; return 0; }; // It is an error if a section order cannot be determined by a given // section order list. for (Chunk *chunk : ctx.chunks) chunk->sect_order = get_rank(chunk); // Sort output sections by --section-order ranges::stable_sort(ctx.chunks, {}, &Chunk::sect_order); } template void sort_output_sections(Context &ctx) { if (ctx.arg.section_order.empty()) sort_output_sections_regular(ctx); else sort_output_sections_by_order(ctx); } template static i64 get_tls_segment_alignment(Context &ctx) { i64 val = 1; for (Chunk *chunk : ctx.chunks) if (chunk->shdr.sh_flags & SHF_TLS) val = std::max(val, chunk->shdr.sh_addralign); return val; } // This function assigns virtual addresses to output sections. Assigning // addresses is a bit tricky because we want to pack sections as tightly // as possible while not violating the constraints imposed by the hardware // and the OS kernel. Specifically, we need to satisfy the following // constraints: // // - Memory protection (readable, writable and executable) works at page // granularity. 
Therefore, if we want to set different memory attributes // to two sections, we need to place them into separate pages. // // - The ELF spec requires that a section's file offset is congruent to // its virtual address modulo the page size. For example, a section at // virtual address 0x401234 on x86-64 (4 KiB, or 0x1000 byte page // system) can be at file offset 0x3234 or 0x50234 but not at 0x1000. // // We need to insert paddings between sections if we can't satisfy the // above constraints without them. // // We don't want to waste too much memory and disk space for paddings. // There are a few tricks we can use to minimize paddings as below: // // - We want to place sections with the same memory attributes // contiguous as possible. // // - We can map the same file region to memory more than once. For // example, we can write code (with R and X bits) and read-only data // (with R bit) adjacent on file and map it twice as the last page of // the executable segment and the first page of the read-only data // segment. This doesn't save memory but saves disk space. template static void set_virtual_addresses_regular(Context &ctx) { constexpr i64 RELRO = 1LL << 32; auto get_flags = [&](Chunk *chunk) { i64 flags = to_phdr_flags(ctx, chunk); if (chunk->is_relro) return flags | RELRO; return flags; }; // Assign virtual addresses std::vector *> &chunks = ctx.chunks; u64 addr = ctx.arg.image_base; auto is_tls = [](Chunk *chunk) { return chunk->shdr.sh_flags & SHF_TLS; }; auto is_tbss = [](Chunk *chunk) { return (chunk->shdr.sh_flags & SHF_TLS) && (chunk->shdr.sh_type == SHT_NOBITS); }; for (i64 i = 0; i < chunks.size(); i++) { if (!(chunks[i]->shdr.sh_flags & SHF_ALLOC)) continue; // .relro_padding is a padding section to extend a PT_GNU_RELRO // segment to cover an entire page. Technically, we don't need a // .relro_padding section because we can leave a trailing part of a // segment an unused space. 
However, the `strip` command would delete // such an unused trailing part and make an executable invalid. // So we add a dummy section. if (chunks[i] == ctx.relro_padding) { chunks[i]->shdr.sh_addr = addr; chunks[i]->shdr.sh_size = align_to(addr, ctx.page_size) - addr; addr += ctx.page_size; continue; } // Handle --section-start first if (auto it = ctx.arg.section_start.find(chunks[i]->name); it != ctx.arg.section_start.end()) { addr = it->second; chunks[i]->shdr.sh_addr = addr; addr += chunks[i]->shdr.sh_size; continue; } // Memory protection works at page size granularity. We need to // put sections with different memory attributes into different // pages. We do it by inserting paddings here. if (i > 0 && chunks[i - 1] != ctx.relro_padding) { i64 flags1 = get_flags(chunks[i - 1]); i64 flags2 = get_flags(chunks[i]); if (!ctx.arg.nmagic && flags1 != flags2) { switch (ctx.arg.z_separate_code) { case SEPARATE_LOADABLE_SEGMENTS: addr = align_to(addr, ctx.page_size); break; case SEPARATE_CODE: if ((flags1 & PF_X) != (flags2 & PF_X)) { addr = align_to(addr, ctx.page_size); break; } [[fallthrough]]; case NOSEPARATE_CODE: if (addr % ctx.page_size != 0) addr += ctx.page_size; break; default: unreachable(); } } } // TLS sections are included only in PT_LOAD but also in PT_TLS. // We align the first TLS section so that the PT_TLS segment starts // at an address that meets the segment's alignment requirement. if (is_tls(chunks[i]) && (i == 0 || !is_tls(chunks[i - 1]))) addr = align_to(addr, get_tls_segment_alignment(ctx)); // TLS BSS sections are laid out so that they overlap with the // subsequent non-tbss sections. Overlapping is fine because a STT_TLS // segment contains an initialization image for newly-created threads, // and no one except the runtime reads its contents. Even the runtime // doesn't need a BSS part of a TLS initialization image; it just // leaves zero-initialized bytes as-is instead of copying zeros. // So no one really read tbss at runtime. 
// // We can instead allocate a dedicated virtual address space to tbss, // but that would be just a waste of the address and disk space. if (is_tbss(chunks[i])) { u64 addr2 = addr; for (;;) { addr2 = align_to(addr2, chunks[i]->shdr.sh_addralign); chunks[i]->shdr.sh_addr = addr2; addr2 += chunks[i]->shdr.sh_size; if (i + 2 == chunks.size() || !is_tbss(chunks[i + 1])) break; i++; } continue; } addr = align_to(addr, chunks[i]->shdr.sh_addralign); chunks[i]->shdr.sh_addr = addr; addr += chunks[i]->shdr.sh_size; } } template static void set_virtual_addresses_by_order(Context &ctx) { std::vector *> vec; for (Chunk *c : ctx.chunks) if (c->shdr.sh_flags & SHF_ALLOC) vec.push_back(c); u64 addr = ctx.arg.image_base; i64 i = 0; for (i64 j = 0; j < ctx.arg.section_order.size(); j++) { SectionOrder &ord = ctx.arg.section_order[j]; switch (ord.type) { case SectionOrder::SECTION: case SectionOrder::GROUP: for (; i < vec.size() && vec[i]->sect_order == j; i++) { // Memory protection works on page size granularity. We need to // put sections with different memory attributes into different // pages. We do it by inserting a padding. 
if (i != 0) { i64 flags1 = to_phdr_flags(ctx, vec[i - 1]); i64 flags2 = to_phdr_flags(ctx, vec[i]); if (flags1 != flags2) { switch (ctx.arg.z_separate_code) { case SEPARATE_LOADABLE_SEGMENTS: addr = align_to(addr, ctx.page_size); break; case SEPARATE_CODE: if ((flags1 & PF_X) != (flags2 & PF_X)) addr = align_to(addr, ctx.page_size); break; default: break; } } } addr = align_to(addr, vec[i]->shdr.sh_addralign); vec[i]->shdr.sh_addr = addr; addr += vec[i]->shdr.sh_size; } break; case SectionOrder::ADDR: if (addr != ctx.arg.image_base && ord.value < addr) Error(ctx) << "--section-order: address goes backward: requested " << std::hex << std::showbase << ord.value << " < current " << addr << " (at token '" << ord.token << "')"; addr = ord.value; break; case SectionOrder::ALIGN: addr = align_to(addr, ord.value); break; case SectionOrder::SYMBOL: get_symbol(ctx, ord.name)->value = addr; break; default: unreachable(); } } } // Returns the smallest integer N that satisfies N >= val and // N % align == skew % align. // // Section's file offset must be congruent to its virtual address modulo // the page size. We use this function to satisfy that requirement. static u64 align_with_skew(u64 val, u64 align, u64 skew) { return val + ((skew - val) & (align - 1)); } // Assign file offsets to output sections. 
//
// Walks ctx.chunks in order and fills in sh_offset for every chunk:
//
//  - A non-ALLOC chunk is placed at the next offset aligned to its own
//    sh_addralign.
//  - A NOBITS chunk takes no file space and just records the current
//    offset.
//  - Any other ALLOC chunk starts a "run". Members of a run keep the same
//    relative distances in the file as they have in memory.
//
// Returns the file offset just past the last byte assigned.
template static i64 set_file_offsets(Context &ctx) {
  auto &chunks = ctx.chunks;
  u64 fileoff = 0;

  auto is_alloc = [&](i64 idx) {
    return static_cast<bool>(chunks[idx]->shdr.sh_flags & SHF_ALLOC);
  };

  for (i64 pos = 0; pos < chunks.size();) {
    auto &head = *chunks[pos];

    if (!is_alloc(pos)) {
      fileoff = align_to(fileoff, head.shdr.sh_addralign);
      head.shdr.sh_offset = fileoff;
      fileoff += head.shdr.sh_size;
      pos++;
      continue;
    }

    if (head.shdr.sh_type == SHT_NOBITS) {
      head.shdr.sh_offset = fileoff;
      pos++;
      continue;
    }

    // Pick the file offset of the first chunk of the run. Unless the
    // chunk asks for more than page alignment, the offset is made
    // congruent to the chunk's address modulo the page size.
    if (head.shdr.sh_addralign <= ctx.page_size)
      fileoff = align_with_skew(fileoff, ctx.page_size, head.shdr.sh_addr);
    else
      fileoff = align_to(fileoff, head.shdr.sh_addralign);

    // Tells whether chunks[idx] can be appended to the run that started
    // at `head`.
    auto extends_run = [&](i64 idx) {
      if (idx >= chunks.size())
        return false;

      auto &cur = chunks[idx]->shdr;
      if (!(cur.sh_flags & SHF_ALLOC) || cur.sh_type == SHT_NOBITS)
        return false;

      // If --start-section is given, addresses may not increase
      // monotonically.
      if (cur.sh_addr < head.shdr.sh_addr)
        return false;

      // If --start-section is given, there may be a large gap between
      // sections. We don't want to spend disk space on such a gap.
      auto &prev = chunks[idx - 1]->shdr;
      i64 gap_size = cur.sh_addr - prev.sh_addr - prev.sh_size;
      return gap_size < ctx.page_size;
    };

    // Give ALLOC sections contiguous file offsets for as long as they
    // are contiguous in memory.
    do {
      chunks[pos]->shdr.sh_offset =
        fileoff + chunks[pos]->shdr.sh_addr - head.shdr.sh_addr;
      pos++;
    } while (extends_run(pos));

    auto &last = chunks[pos - 1]->shdr;
    fileoff = last.sh_offset + last.sh_size;

    // NOBITS chunks following the run all share the end-of-run offset.
    for (; pos < chunks.size() && is_alloc(pos) &&
           chunks[pos]->shdr.sh_type == SHT_NOBITS; pos++)
      chunks[pos]->shdr.sh_offset = fileoff;
  }

  return fileoff;
}

// Remove debug sections from ctx.chunks and save them to ctx.debug_chunks.
// This is for --separate-debug-file.
template void separate_debug_sections(Context &ctx) {
  // A chunk is moved out only if it is not loaded at run time and is
  // either one of .gdb_index/.symtab/.strtab or named ".debug_*".
  auto is_debug_section = [&](Chunk *chunk) {
    if (chunk->shdr.sh_flags & SHF_ALLOC)
      return false;
    return chunk == ctx.gdb_index || chunk == ctx.symtab ||
           chunk == ctx.strtab || chunk->name.starts_with(".debug_");
  };

  // Stable partition keeps the relative order within both halves.
  auto tail = ranges::stable_partition(ctx.chunks, std::not_fn(is_debug_section));
  ctx.debug_chunks = {tail.begin(), tail.end()};
  ctx.chunks.erase(tail.begin(), tail.end());
}

// Finalize section header contents: refresh every chunk's header, drop
// empty chunks, assign section indices, and create .symtab_shndx if the
// number of sections reaches SHN_LORESERVE.
template void compute_section_headers(Context &ctx) {
  // Update sh_size for each chunk.
  for (Chunk *chunk : ctx.chunks)
    chunk->update_shdr(ctx);

  // Remove empty chunks. Regular output sections and .gdb_index are kept
  // even if their size is zero at this point.
  std::erase_if(ctx.chunks, [&](Chunk *chunk) {
    return !chunk->to_osec() && chunk != ctx.gdb_index &&
           chunk->shdr.sh_size == 0;
  });

  // Set section indices. Index 0 is not handed out, and header chunks do
  // not get an index.
  i64 shndx = 1;
  for (Chunk *chunk : ctx.chunks)
    if (!chunk->is_header())
      chunk->shndx = shndx++;

  if (ctx.symtab && SHN_LORESERVE <= shndx) {
    SymtabShndxSection *sec = new SymtabShndxSection;
    sec->shndx = shndx++;
    sec->shdr.sh_link = ctx.symtab->shndx;
    ctx.symtab_shndx = sec;
    ctx.chunks.push_back(sec);
    ctx.chunk_pool.emplace_back(sec);
  }

  // `shndx` is now the total number of section header entries.
  if (ctx.shdr)
    ctx.shdr->shdr.sh_size = shndx * sizeof(ElfShdr);

  // Some types of section header refer to other section by index.
  // Recompute all section headers to fill such fields with correct values.
  for (Chunk *chunk : ctx.chunks)
    chunk->update_shdr(ctx);

  // .symtab_shndx holds one 4-byte entry per symbol table entry.
  if (ctx.symtab_shndx) {
    i64 symtab_size = ctx.symtab->shdr.sh_size / sizeof(ElfSym);
    ctx.symtab_shndx->shdr.sh_size = symtab_size * 4;
  }
}

// Assign virtual addresses and file offsets to output sections.
// Returns the value returned by set_file_offsets() in the final
// iteration.
template i64 set_osec_offsets(Context &ctx) {
  Timer t(ctx, "set_osec_offsets");

  for (;;) {
    if (ctx.arg.section_order.empty())
      set_virtual_addresses_regular(ctx);
    else
      set_virtual_addresses_by_order(ctx);

    ctx.checkpoint();

    // Assigning new offsets may change the contents and the length
    // of the program header, so repeat it until converge.
    i64 fileoff = set_file_offsets(ctx);

    if (ctx.phdr) {
      i64 sz = ctx.phdr->shdr.sh_size;
      ctx.phdr->update_shdr(ctx);
      // Only a program header that grew triggers another round.
      if (sz < ctx.phdr->shdr.sh_size)
        continue;
    }

    return fileoff;
  }
}

// Returns ctx.num_ifunc_dynrels plus the number of ifunc symbols in
// ctx.got->got_syms.
template static i64 get_num_irelative_relocs(Context &ctx) {
  i64 n = ctx.num_ifunc_dynrels;
  for (Symbol *sym : ctx.got->got_syms)
    if (sym->is_ifunc())
      n++;
  return n;
}

// Translate a virtual address to a physical address using the first
// PT_LOAD program header that covers it. Returns 0 if no PT_LOAD segment
// contains `vaddr`.
template static u64 to_paddr(Context &ctx, u64 vaddr) {
  for (ElfPhdr &phdr : ctx.phdr->phdrs)
    if (phdr.p_type == PT_LOAD)
      if (phdr.p_vaddr <= vaddr && vaddr < phdr.p_vaddr + phdr.p_memsz)
        return phdr.p_paddr + (vaddr - phdr.p_vaddr);
  return 0;
}

// Set the final values of linker-defined symbols now that section
// addresses are known.
//
// NOTE(review): several branches below index sections[0] without
// checking that `sections` is non-empty; presumably at least one ALLOC
// chunk always exists when those branches are taken -- confirm.
template void fix_synthetic_symbols(Context &ctx) {
  // start() points `sym` at the beginning of `chunk` (plus bias) and
  // stop() at its end (plus bias). Both do nothing if either is null.
  auto start = [](Symbol *sym, auto &chunk, i64 bias = 0) {
    if (sym && chunk) {
      sym->set_output_section(chunk);
      sym->value = chunk->shdr.sh_addr + bias;
    }
  };

  auto stop = [](Symbol *sym, auto &chunk, i64 bias = 0) {
    if (sym && chunk) {
      sym->set_output_section(chunk);
      sym->value = chunk->shdr.sh_addr + chunk->shdr.sh_size + bias;
    }
  };

  // Non-header ALLOC chunks, in output order.
  std::vector *> sections;
  for (Chunk *chunk : ctx.chunks)
    if (!chunk->is_header() && (chunk->shdr.sh_flags & SHF_ALLOC))
      sections.push_back(chunk);

  // Returns the first chunk in `sections` with the given name, or null.
  auto find = [&](std::string name) -> Chunk * {
    for (Chunk *chunk : sections)
      if (chunk->name == name)
        return chunk;
    return nullptr;
  };

  // __bss_start
  if (Chunk *chunk = find(".bss"))
    start(ctx.__bss_start, chunk);

  // __ehdr_start and __executable_start
  if (ctx.ehdr && (ctx.ehdr->shdr.sh_flags & SHF_ALLOC)) {
    ctx.__ehdr_start->set_output_section(sections[0]);
    ctx.__ehdr_start->value = ctx.ehdr->shdr.sh_addr;
    ctx.__executable_start->set_output_section(sections[0]);
    ctx.__executable_start->value = ctx.ehdr->shdr.sh_addr;
  }

  // __dso_handle
  if (ctx.__dso_handle) {
    ctx.__dso_handle->set_output_section(sections[0]);
    ctx.__dso_handle->value = sections[0]->shdr.sh_addr;
  }

  // __rel_iplt_start and __rel_iplt_end. These symbols need to be
  // defined in a statically-linked non-relocatable executable because
  // such executable lacks the .dynamic section and thus there's no way
  // to find ifunc relocations other than these symbols.
  if (ctx.reldyn && ctx.arg.static_ && !ctx.arg.pie) {
    // The start symbol is placed the size of all IRELATIVE relocations
    // before the end of .rel.dyn.
    stop(ctx.__rel_iplt_start, ctx.reldyn,
         -get_num_irelative_relocs(ctx) * sizeof(ElfRel));
    stop(ctx.__rel_iplt_end, ctx.reldyn);
  } else {
    // If the symbols are not necessary, we turn them to absolute
    // symbols at address 0.
    ctx.__rel_iplt_start->origin = 0;
    ctx.__rel_iplt_end->origin = 0;
  }

  // __{init,fini}_array_{start,end}
  for (Chunk *chunk : sections) {
    switch (chunk->shdr.sh_type) {
    case SHT_INIT_ARRAY:
      start(ctx.__init_array_start, chunk);
      stop(ctx.__init_array_end, chunk);
      break;
    case SHT_PREINIT_ARRAY:
      start(ctx.__preinit_array_start, chunk);
      stop(ctx.__preinit_array_end, chunk);
      break;
    case SHT_FINI_ARRAY:
      start(ctx.__fini_array_start, chunk);
      stop(ctx.__fini_array_end, chunk);
      break;
    }
  }

  // _end, _etext, _edata and the like. Each matching chunk overwrites
  // the value set by the previous one, so the last matching chunk in
  // `sections` wins.
  for (Chunk *chunk : sections) {
    if (chunk->shdr.sh_flags & SHF_ALLOC) {
      stop(ctx._end, chunk);
      stop(ctx.end, chunk);
    }

    if (chunk->shdr.sh_flags & SHF_EXECINSTR) {
      stop(ctx._etext, chunk);
      stop(ctx.etext, chunk);
    }

    if (chunk->shdr.sh_type != SHT_NOBITS &&
        (chunk->shdr.sh_flags & SHF_ALLOC)) {
      stop(ctx._edata, chunk);
      stop(ctx.edata, chunk);
    }
  }

  // _DYNAMIC
  start(ctx._DYNAMIC, ctx.dynamic);

  // _GLOBAL_OFFSET_TABLE_. I don't know why, but for the sake of
  // compatibility with existing code, it must be set to the beginning of
  // .got.plt instead of .got only on i386 and x86-64.
  if constexpr (is_x86)
    start(ctx._GLOBAL_OFFSET_TABLE_, ctx.gotplt);
  else
    start(ctx._GLOBAL_OFFSET_TABLE_, ctx.got);

  // _PROCEDURE_LINKAGE_TABLE_. We need this on SPARC.
  start(ctx._PROCEDURE_LINKAGE_TABLE_, ctx.plt);

  // _TLS_MODULE_BASE_. This symbol is used to obtain the address of
  // the TLS block in the TLSDESC model. I believe GCC and Clang don't
  // create a reference to it, but Intel compiler seems to be using
  // this symbol.
  if (ctx._TLS_MODULE_BASE_) {
    ctx._TLS_MODULE_BASE_->set_output_section(sections[0]);
    ctx._TLS_MODULE_BASE_->value = ctx.dtp_addr;
  }

  // __GNU_EH_FRAME_HDR
  start(ctx.__GNU_EH_FRAME_HDR, ctx.eh_frame_hdr);

  // RISC-V's __global_pointer$
  if (ctx.__global_pointer) {
    if (Chunk *chunk = find(".sdata")) {
      start(ctx.__global_pointer, chunk, 0x800);
    } else {
      ctx.__global_pointer->set_output_section(sections[0]);
      ctx.__global_pointer->value = 0;
    }
  }

  // ARM32's __exidx_{start,end}
  if (ctx.__exidx_start) {
    if (Chunk *chunk = find(".ARM.exidx")) {
      start(ctx.__exidx_start, chunk);
      stop(ctx.__exidx_end, chunk);
    }
  }

  // PPC64's ".TOC." symbol. It is placed 0x8000 bytes past the start of
  // .got if that exists, or else of .toc.
  if constexpr (is_ppc64) {
    if (Chunk *chunk = find(".got")) {
      start(ctx.extra.TOC, chunk, 0x8000);
    } else if (Chunk *chunk = find(".toc")) {
      start(ctx.extra.TOC, chunk, 0x8000);
    } else {
      ctx.extra.TOC->set_output_section(sections[0]);
      ctx.extra.TOC->value = 0;
    }
  }

  // PPC64's _{save,rest}gpr{0,1}_{14,15,16,...,31} symbols. Each labelled
  // entry that is owned by the internal object is placed at offset i * 4
  // in ctx.extra.save_restore.
  if constexpr (is_ppc64v2)
    for (i64 i = 0; i < ppc64_save_restore_insns.size(); i++)
      if (std::string_view label = ppc64_save_restore_insns[i].first;
          !label.empty())
        if (Symbol *sym = get_symbol(ctx, label);
            sym->file == ctx.internal_obj)
          start(sym, ctx.extra.save_restore, i * 4);

  // __start_ and __stop_ symbols
  for (Chunk *chunk : sections) {
    if (std::optional name = get_start_stop_name(ctx, *chunk)) {
      start(get_symbol(ctx, save_string(ctx, "__start_" + *name)), chunk);
      stop(get_symbol(ctx, save_string(ctx, "__stop_" + *name)), chunk);

      // With a physical image base, also define __phys_start_* and
      // __phys_stop_* holding the corresponding physical addresses.
      if (ctx.arg.physical_image_base) {
        u64 paddr = to_paddr(ctx, chunk->shdr.sh_addr);

        Symbol *x = get_symbol(ctx, save_string(ctx, "__phys_start_" + *name));
        x->set_output_section(chunk);
        x->value = paddr;

        Symbol *y = get_symbol(ctx, save_string(ctx, "__phys_stop_" + *name));
        y->set_output_section(chunk);
        y->value = paddr + chunk->shdr.sh_size;
      }
    }
  }

  // --defsym=sym=value symbols. The right-hand side is either an
  // absolute address or another symbol whose value, origin and
  // visibility are copied.
  for (i64 i = 0; i < ctx.arg.defsyms.size(); i++) {
    Symbol *sym = ctx.arg.defsyms[i].first;
    std::variant *, u64> val = ctx.arg.defsyms[i].second;

    if (u64 *addr = std::get_if(&val)) {
      sym->origin = 0;
      sym->value = *addr;
    } else {
      Symbol *sym2 = std::get *>(val);
      sym->value = sym2->value;
      sym->origin = sym2->origin;
      sym->visibility = sym2->visibility.load();
    }
  }

  // --section-order symbols
  for (SectionOrder &ord : ctx.arg.section_order)
    if (ord.type == SectionOrder::SYMBOL)
      get_symbol(ctx, ord.name)->set_output_section(sections[0]);
}

// Replace each non-empty, non-ALLOC ".debug_*" chunk in ctx.chunks with
// a CompressedSection built from it.
template void compress_debug_sections(Context &ctx) {
  Timer t(ctx, "compress_debug_sections");

  // Since this pass is embarrassingly parallel, we want to use all
  // available cores by default. If the user did not give a thread count,
  // the current TBB parallelism limit is remembered and lifted here, and
  // put back at the end of this function.
  i64 thread_count = 0;
  if (!ctx.arg.thread_count.has_value()) {
    thread_count =
      ctx.global_limit->active_value(tbb::global_control::max_allowed_parallelism);
    ctx.global_limit.reset();
  }

  tbb::parallel_for((i64)0, (i64)ctx.chunks.size(), [&](i64 i) {
    Chunk &chunk = *ctx.chunks[i];
    if (!(chunk.shdr.sh_flags & SHF_ALLOC) && chunk.shdr.sh_size &&
        chunk.name.starts_with(".debug_")) {
      Chunk *comp = new CompressedSection(ctx, chunk);
      // NOTE(review): emplace_back is called concurrently here, so
      // chunk_pool presumably is a concurrency-safe container -- confirm.
      ctx.chunk_pool.emplace_back(comp);
      ctx.chunks[i] = comp;
    }
  });

  if (thread_count > 0)
    ctx.global_limit.emplace(tbb::global_control::max_allowed_parallelism,
                             thread_count);
}

// BLAKE3 is a cryptographic hash function just like SHA256.
// We use it instead of SHA256 because it's faster.
//
// Hashes `size` bytes at `buf` and writes BLAKE3_OUT_LEN bytes to `out`.
static void blake3_hash(u8 *buf, i64 size, u8 *out) {
  blake3_hasher hasher;
  blake3_hasher_init(&hasher);
  blake3_hasher_update(&hasher, buf, size);
  blake3_hasher_finalize(&hasher, out, BLAKE3_OUT_LEN);
}

// Split the output file buffer into consecutive pieces of at most 4 MiB
// each. The returned spans point into ctx.buf; nothing is copied.
template std::vector> get_shards(Context &ctx) {
  constexpr i64 shard_size = 4 * 1024 * 1024; // 4 MiB
  std::span buf = {ctx.buf, (size_t)ctx.output_file->filesize};
  std::vector> vec;

  while (!buf.empty()) {
    i64 sz = std::min(shard_size, buf.size());
    vec.push_back(buf.subspan(0, sz));
    buf = buf.subspan(sz);
  }
  return vec;
}

// Sort dynamic relocations.
This is the reason why we do it. // Quote from https://www.airs.com/blog/archives/186 // // The dynamic linker in glibc uses a one element cache when processing // relocs: if a relocation refers to the same symbol as the previous // relocation, then the dynamic linker reuses the value rather than // looking up the symbol again. Thus the dynamic linker gets the best // results if the dynamic relocations are sorted so that all dynamic // relocations for a given dynamic symbol are adjacent. // // Other than that, the linker sorts together all relative relocations, // which don't have symbols. Two relative relocations, or two relocations // against the same symbol, are sorted by the address in the output // file. This tends to optimize paging and caching when there are two // references from the same page. template void sort_reldyn(Context &ctx) { Timer t(ctx, "sort_reldyn"); ElfRel *begin = (ElfRel *)(ctx.buf + ctx.reldyn->shdr.sh_offset); ElfRel *end = begin + ctx.reldyn->shdr.sh_size / sizeof(ElfRel); // We group IFUNC relocations at the end of .rel.dyn because we want to // apply all the other relocations before running user-supplied IFUNC // resolvers. 
auto get_rank = [](u32 r_type) { if (r_type == E::R_RELATIVE) return 0; if constexpr (supports_ifunc) if (r_type == E::R_IRELATIVE) return 2; return 1; }; tbb::parallel_sort(begin, end, [&](const ElfRel &a, const ElfRel &b) { return std::tuple(get_rank(a.r_type), a.r_sym, a.r_offset) < std::tuple(get_rank(b.r_type), b.r_sym, b.r_offset); }); } template void write_build_id(Context &ctx) { Timer t(ctx, "write_build_id"); switch (ctx.arg.build_id.kind) { case BuildId::HEX: ctx.buildid->contents = ctx.arg.build_id.value; break; case BuildId::HASH: { std::vector> shards = get_shards(ctx); std::vector hashes(shards.size() * BLAKE3_OUT_LEN); tbb::parallel_for((i64)0, (i64)shards.size(), [&](i64 i) { blake3_hash(shards[i].data(), shards[i].size(), hashes.data() + i * BLAKE3_OUT_LEN); #if HAVE_MADVISE // Make the kernel page out the file contents we've just written // so that subsequent close(2) call will become quicker. if (i > 0 && ctx.output_file->is_mmapped) madvise(shards[i].data(), shards[i].size(), MADV_DONTNEED); #endif }); u8 buf[BLAKE3_OUT_LEN]; blake3_hash(hashes.data(), hashes.size(), buf); msan_unpoison(buf, BLAKE3_OUT_LEN); assert(ctx.arg.build_id.size() <= BLAKE3_OUT_LEN); ctx.buildid->contents = {buf, buf + ctx.arg.build_id.size()}; break; } case BuildId::UUID: { u8 buf[16]; get_random_bytes(buf, 16); // Indicate that this is UUIDv4 as defined by RFC4122 buf[6] = (buf[6] & 0b0000'1111) | 0b0100'0000; buf[8] = (buf[8] & 0b0011'1111) | 0b1000'0000; ctx.buildid->contents = {buf, buf + 16}; break; } default: unreachable(); } ctx.buildid->copy_buf(ctx); } // A .gnu_debuglink section contains a filename and a CRC32 checksum of a // debug info file. When we are writing a .gnu_debuglink, we don't know // its CRC32 checksum because we haven't created a debug info file. So we // write a dummy value instead. // // We can't choose a random value as a dummy value for build // reproducibility. 
// We also don't want to write a fixed value for all
// files because the CRC checksum is in this section to prevent using
// wrong file on debugging. gdb rejects a debug info file if its CRC
// doesn't match with the one in .gdb_debuglink.
//
// Therefore, we'll try to make our CRC checksum as unique as possible.
// We'll remember that checksum, and after creating a debug info file, add
// a few bytes of garbage at the end of it so that the debug info file's
// CRC checksum becomes the one that we have precomputed.
template void write_gnu_debuglink(Context &ctx) {
  Timer t(ctx, "write_gnu_debuglink");
  u32 crc32;

  if (ctx.buildid) {
    // If there is a build ID, derive the checksum from its contents.
    crc32 = compute_crc32(0, ctx.buildid->contents.data(),
                          ctx.buildid->contents.size());
  } else {
    // Otherwise, hash each shard of the output file in parallel and take
    // the CRC32 of the resulting hash values.
    std::vector> shards = get_shards(ctx);
    std::vector> hashes(shards.size());

    tbb::parallel_for((i64)0, (i64)shards.size(), [&](i64 i) {
      hashes[i] = hash_string({(char *)shards[i].data(), shards[i].size()});
    });
    // 8 bytes per element of `hashes`.
    crc32 = compute_crc32(0, (u8 *)hashes.data(), hashes.size() * 8);
  }

  ctx.gnu_debuglink->crc32 = crc32;
  ctx.gnu_debuglink->copy_buf(ctx);
}

// Write a separate debug file. This function is called after we finish
// writing to the usual output file.
//
// Note that this replaces ctx.output_file and ctx.buf with those of the
// debug file and rewrites entries of ctx.chunks.
template void write_separate_debug_file(Context &ctx) {
  Timer t(ctx, "write_separate_debug_file");

  // Open an output file early
  LockingOutputFile *file =
    new LockingOutputFile(ctx, ctx.arg.separate_debug_file, 0666);

  // We want to write to the debug info file in background so that the
  // user doesn't have to wait for it to complete.
  if (ctx.arg.detach)
    notify_parent();

  // Restore debug info sections that had been set aside while we were
  // creating the main file.
  i64 num_chunks = ctx.chunks.size();
  append(ctx.chunks, ctx.debug_chunks);

  // Only the chunks that have just been appended need their sizes
  // computed.
  tbb::parallel_for(num_chunks, (i64)ctx.chunks.size(), [&](i64 i) {
    ctx.chunks[i]->compute_section_size(ctx);
  });

  sort_debug_info_sections(ctx);

  // Handle --compress-debug-info
  if (ctx.arg.compress_debug_sections != ELFCOMPRESS_NONE)
    compress_debug_sections(ctx);

  // Recompute section header contents since we have added debug sections
  compute_section_headers(ctx);

  // A debug info file contains all sections as the original file, though
  // most of them can be empty as if they were bss sections. We convert
  // real sections into dummy sections here. Header chunks, .shstrtab and
  // SHT_NOTE sections are kept as they are.
  for (i64 i = 0; i < num_chunks; i++) {
    Chunk *chunk = ctx.chunks[i];
    if (!chunk->is_header() && chunk != ctx.shstrtab &&
        chunk->shdr.sh_type != SHT_NOTE) {
      Chunk *sec = new OutputSection(chunk->name, SHT_NULL);
      sec->shdr = chunk->shdr;
      sec->shdr.sh_type = SHT_NOBITS;
      sec->shndx = chunk->shndx;
      ctx.chunks[i] = sec;
      ctx.chunk_pool.emplace_back(sec);
    }
  }

  // Assign file offsets to sections
  i64 fileoff = 0;
  for (Chunk *chunk : ctx.chunks) {
    ElfShdr &shdr = chunk->shdr;
    if (shdr.sh_type == SHT_NOBITS) {
      // Takes no space in the file.
      shdr.sh_offset = fileoff;
    } else if (shdr.sh_flags & SHF_ALLOC) {
      // Keep the offset congruent to the address modulo the page size.
      fileoff = align_with_skew(fileoff, ctx.page_size, shdr.sh_addr);
      shdr.sh_offset = fileoff;
      fileoff += shdr.sh_size;
    } else {
      fileoff = align_to(fileoff, shdr.sh_addralign);
      shdr.sh_offset = fileoff;
      fileoff += shdr.sh_size;
    }
  }

  // Refresh the program header but keep its original number of entries.
  if (ctx.phdr) {
    i64 sz = ctx.phdr->phdrs.size();
    ctx.phdr->update_shdr(ctx);
    assert(ctx.phdr->phdrs.size() <= sz);
    ctx.phdr->phdrs.resize(sz);
  }

  // Write to a separate debug file
  file->resize(ctx, fileoff);
  ctx.output_file.reset(file);
  ctx.buf = ctx.output_file->buf;

  copy_chunks(ctx);

  if (ctx.gdb_index)
    write_gdb_index(ctx);

  // Reverse-compute a CRC32 value so that the CRC32 checksum embedded to
  // the .gnu_debuglink section in the main executable matches with the
  // debug info file's CRC32 checksum.
  u32 crc = compute_crc32(0, ctx.buf, ctx.output_file->filesize);

  u8 *&buf2 = ctx.output_file->buf2;
  i64 &buf2_size = ctx.output_file->buf2_size;
  if (buf2)
    crc = compute_crc32(crc, buf2, buf2_size);

  // Append the bytes returned by crc32_solve() to buf2.
  // NOTE(review): the result of realloc() is not checked for null.
  std::vector trailer = crc32_solve(crc, ctx.gnu_debuglink->crc32);
  buf2 = (u8 *)realloc(buf2, buf2_size + trailer.size());
  memcpy(buf2 + buf2_size, trailer.data(), trailer.size());
  buf2_size += trailer.size();

  ctx.output_file->close(ctx);
}

// Write Makefile-style dependency rules to a file specified by
// --dependency-file. This is analogous to the compiler's -M flag.
//
// If the path is "-", the rules are written to the standard output.
template void write_dependency_file(Context &ctx) {
  std::vector deps;
  std::unordered_set seen;

  // Collect cleaned-up paths of dependency files that have no parent,
  // skipping duplicates while keeping first-seen order.
  for (std::unique_ptr &mf : ctx.mf_pool)
    if (mf->is_dependency && !mf->parent)
      if (std::string path = path_clean(mf->name); seen.insert(path).second)
        deps.push_back(path);

  std::ostream *out = &std::cout;
  std::ofstream file;
  std::string &path = ctx.arg.dependency_file;

  if (path != "-") {
    file.open(path);
    if (file.fail())
      Fatal(ctx) << "--dependency-file: cannot open " << path
                 << ": " << errno_string();
    out = &file;
  }

  // "<output>: <dep> <dep> ..."
  *out << ctx.arg.output << ":";
  for (std::string &s : deps)
    *out << " " << s;
  *out << "\n";

  // An empty rule for each dependency.
  for (std::string &s : deps)
    *out << "\n" << s << ":\n";
}

// Accumulate various statistics into Counter objects and print them,
// followed by per-section statistics of merged sections.
template void show_stats(Context &ctx) {
  for (ObjectFile *obj : ctx.objs) {
    static Counter defined("defined_syms");
    defined += obj->first_global - 1;

    static Counter undefined("undefined_syms");
    undefined += obj->symbols.size() - obj->first_global;

    // Relocation counts of live input sections, split by SHF_ALLOC.
    for (std::unique_ptr> &sec : obj->sections) {
      if (!sec || !sec->is_alive)
        continue;

      static Counter alloc("reloc_alloc");
      static Counter nonalloc("reloc_nonalloc");

      if (sec->shdr().sh_flags & SHF_ALLOC)
        alloc += sec->get_rels(ctx).size();
      else
        nonalloc += sec->get_rels(ctx).size();
    }

    static Counter comdats("comdats");
    comdats += obj->comdat_groups.size();

    // Members of comdat groups that are owned by some other file.
    static Counter removed_comdats("removed_comdat_mem");
    for (ComdatGroupRef &ref : obj->comdat_groups)
      if (ref.group->owner != obj->priority)
        removed_comdats += ref.members.size();

    static Counter num_cies("num_cies");
    num_cies += obj->cies.size();

    static Counter num_unique_cies("num_unique_cies");
    for (CieRecord &cie : obj->cies)
      if (cie.is_leader)
        num_unique_cies++;

    static Counter num_fdes("num_fdes");
    num_fdes += obj->fdes.size();
  }

  static Counter num_bytes("total_input_bytes");
  for (std::unique_ptr &mf : ctx.mf_pool)
    num_bytes += mf->size;

  static Counter num_input_sections("input_sections");
  for (ObjectFile *file : ctx.objs)
    num_input_sections += file->sections.size();

  static Counter num_output_chunks("output_chunks", ctx.chunks.size());
  static Counter num_objs("num_objs", ctx.objs.size());
  static Counter num_dsos("num_dsos", ctx.dsos.size());

  using Entry = typename ConcurrentMap>::Entry;

  // Count the occupied buckets of each merged section's map.
  static Counter merged_strings("merged_strings");
  for (std::unique_ptr> &sec : ctx.merged_sections)
    for (Entry &ent : std::span(sec->map.entries, sec->map.nbuckets))
      if (ent.key)
        merged_strings++;

  if constexpr (needs_thunk) {
    static Counter thunk_bytes("thunk_bytes");
    for (Chunk *chunk : ctx.chunks)
      if (OutputSection *osec = chunk->to_osec())
        for (std::unique_ptr> &thunk : osec->thunks)
          thunk_bytes += thunk->size();
  }

  if constexpr (is_riscv || is_loongarch) {
    static Counter num_rels("shrunk_relocs");
    for (Chunk *chunk : ctx.chunks)
      if (OutputSection *osec = chunk->to_osec())
        if (osec->shdr.sh_flags & SHF_EXECINSTR)
          for (InputSection *isec : osec->members)
            num_rels += isec->extra.r_deltas.size();
  }

  Counter::print();

  for (std::unique_ptr> &sec : ctx.merged_sections)
    sec->print_stats(ctx);
}

// Explicit instantiations for the target this translation unit is
// compiled for.
using E = MOLD_TARGET;

template int redo_main(std::string_view, int, char **);
template void create_internal_file(Context &);
template void apply_exclude_libs(Context &);
template void create_synthetic_sections(Context &);
template void resolve_symbols(Context &);
template void do_lto(Context &);
template void parse_eh_frame_sections(Context &);
template void create_merged_sections(Context &);
template void convert_common_symbols(Context &);
template void create_output_sections(Context &);
template void add_synthetic_symbols(Context &);
template void check_cet_errors(Context &);
template void apply_section_align(Context &);
template void print_dependencies(Context &);
template void write_repro_file(Context &);
template void check_duplicate_symbols(Context &);
template void convert_zero_to_bss(Context &);
template void check_shlib_undefined(Context &);
template void check_symbol_types(Context &);
template void sort_init_fini(Context &);
template void sort_ctor_dtor(Context &);
template void fixup_ctors_in_init_array(Context &);
template void shuffle_sections(Context &);
template void add_dynamic_strings(Context &);
template void compute_section_sizes(Context &);
template void sort_output_sections(Context &);
template void claim_unresolved_symbols(Context &);
template void compute_imported_symbol_weakness(Context &);
template void scan_relocations(Context &);
template void report_undef_errors(Context &);
template void create_reloc_sections(Context &);
template void copy_chunks(Context &);
template void construct_relr(Context &);
template void sort_dynsyms(Context &);
template void sort_debug_info_sections(Context &);
template void create_output_symtab(Context &);
template void apply_version_script(Context &);
template void parse_symbol_version(Context &);
template void compute_import_export(Context &);
template void compute_address_significance(Context &);
template void separate_debug_sections(Context &);
template void compute_section_headers(Context &);
template i64 set_osec_offsets(Context &);
template void fix_synthetic_symbols(Context &);
template void compress_debug_sections(Context &);
template void sort_reldyn(Context &);
template void write_build_id(Context &);
template void write_gnu_debuglink(Context &);
template void write_separate_debug_file(Context &);
template void write_dependency_file(Context &);
template void show_stats(Context &);

} // namespace mold
================================================
FILE: src/relocatable.cc
================================================
// This file implements -r or --relocatable. That option forces the linker
// to combine input object files into another single large object file.
// Since the behavior of the linker when the option is given is quite
// different from that of the normal execution mode, we separate code for
// the feature into this separate file.
//
// The --relocatable option isn't used very often. After all, if you want
// to combine object files into a single file, you could use `ar`.
// However, some programs use it in a creative manner that is hard to
// replace with static archives, so we need to support this option in
// the same way as GNU ld does. A notable example is GHC (Glasgow Haskell
// Compiler). GHC has its own dynamic linker which can load a .o file (as
// opposed to a .so) into memory. GHC's module is not a shared object file
// but a combined object file.
//
// There are many different ways to combine object files into a single file.
// The simplest approach would be to just copy all sections from input files
// to an output file as-is with a few exceptions for singleton sections such
// as the symbol table or the string table. That works, but that's not
// compatible with GNU ld.
//
// To be compatible with GNU ld, we need to do the following:
//
//  - Regular sections containing opaque data (e.g. ".text" or ".data")
//    are just copied as-is. Two sections with the same name are merged.
//
//  - .symtab, .strtab and .shstrtab are merged.
//
//  - COMDAT groups are uniquified.
//
//  - Relocations are copied, but we need to fix symbol indices.
#include "mold.h"

#include
#include

namespace mold {

// Create the linker-synthesized sections needed for -r output and
// register them with the context. The order of creation determines the
// order in ctx.chunks.
template static void r_create_synthetic_sections(Context &ctx) {
  // Appends a freshly allocated chunk to ctx.chunks, hands it over to
  // ctx.chunk_pool, and returns the same pointer.
  auto add = [&](auto *chunk) {
    ctx.chunks.push_back(chunk);
    ctx.chunk_pool.emplace_back(chunk);
    return chunk;
  };

  ctx.ehdr = add(new OutputEhdr(0));
  ctx.shdr = add(new OutputShdr);
  ctx.eh_frame = add(new EhFrameSection);
  ctx.eh_frame_reloc = add(new EhFrameRelocSection);
  ctx.strtab = add(new StrtabSection);
  ctx.symtab = add(new SymtabSection);
  ctx.shstrtab = add(new ShstrtabSection);

  if constexpr (is_x86) {
    ctx.extra.note_property = add(new NotePropertySection);
  }

  if constexpr (is_riscv) {
    ctx.extra.riscv_attributes = add(new RiscvAttributesSection);
  }
}

// Create SHT_GROUP (i.e. comdat group) sections. Comdat groups are
// uniquified by signature, and a group that is still owned by its input
// file after uniquification is carried over to the output as a comdat
// group.
template static void create_comdat_group_sections(Context &ctx) {
  Timer t(ctx, "create_comdat_group_sections");

  // One result list per input file, so that the parallel loop below
  // never writes to a list shared with another iteration.
  std::vector *>> vec{ctx.objs.size()};

  tbb::parallel_for((i64)0, (i64)ctx.objs.size(), [&](i64 i) {
    auto &file = *ctx.objs[i];

    for (auto &ref : file.comdat_groups) {
      // Skip groups that some other file won.
      if (ref.group->owner != file.priority)
        continue;

      auto *sym = file.symbols[file.elf_sections[ref.sect_idx].sh_info];
      assert(sym);

      std::vector *> members;
      for (u32 j : ref.members) {
        const auto &shdr = file.elf_sections[j];

        // A relocation section member maps to the relocation section of
        // the output section it applies to; any other member maps to its
        // own output section.
        if (shdr.sh_type == (E::is_rela ? SHT_RELA : SHT_REL))
          members.push_back(
            file.sections[shdr.sh_info]->output_section->reloc_sec.get());
        else
          members.push_back(file.sections[j]->output_section);
      }

      vec[i].push_back(new ComdatGroupSection(*sym, std::move(members)));
    }
  });

  // Register the new sections in input file order.
  for (auto &groups : vec) {
    for (auto *chunk : groups) {
      ctx.chunks.push_back(chunk);
      ctx.chunk_pool.emplace_back(chunk);
    }
  }
}

// Unresolved undefined symbols in the -r mode are simply propagated to an
// output file as undefined symbols. This function makes sure that every
// such symbol belongs to some input file.
template static void r_claim_unresolved_symbols(Context &ctx) {
  Timer t(ctx, "r_claim_unresolved_symbols");

  tbb::parallel_for_each(ctx.objs, [](ObjectFile *file) {
    for (i64 i = file->first_global; i < file->elf_syms.size(); i++) {
      if (!file->elf_syms[i].is_undef())
        continue;

      auto &sym = *file->symbols[i];
      std::scoped_lock lock(sym.mu);

      // Leave the symbol alone if it already has a definition, or if its
      // current owner's priority value is not greater than this file's.
      bool keep_owner =
        sym.file && (!sym.esym().is_undef() ||
                     sym.file->priority <= file->priority);
      if (keep_owner)
        continue;

      sym.file = file;
      sym.origin = 0;
      sym.value = 0;
      sym.sym_idx = i;
    }
  });
}

// Set output section in-file offsets. Output section memory addresses
// are left as zero.
template static u64 r_set_osec_offsets(Context &ctx) { u64 offset = 0; for (Chunk *chunk : ctx.chunks) { offset = align_to(offset, chunk->shdr.sh_addralign); chunk->shdr.sh_offset = offset; offset += chunk->shdr.sh_size; } return offset; } /* Runs the passes below in order: creates output and synthetic sections, builds the symbol table, relocation and comdat group sections, assigns file offsets, writes all chunks to ctx.arg.output, and finally prints the optional map/stats/perf reports. */ template void combine_objects(Context &ctx) { create_output_sections(ctx); r_create_synthetic_sections(ctx); r_claim_unresolved_symbols(ctx); compute_section_sizes(ctx); sort_output_sections(ctx); create_output_symtab(ctx); ctx.eh_frame->construct(ctx); create_reloc_sections(ctx); create_comdat_group_sections(ctx); compute_section_headers(ctx); i64 filesize = r_set_osec_offsets(ctx); ctx.output_file = OutputFile::open(ctx, ctx.arg.output, filesize, 0666); ctx.buf = ctx.output_file->buf; copy_chunks(ctx); ctx.output_file->close(ctx); ctx.checkpoint(); if (ctx.arg.print_map) print_map(ctx); if (ctx.arg.stats) show_stats(ctx); if (ctx.arg.perf) print_timer_records(ctx.timer_records); if (ctx.arg.quick_exit) _exit(0); } using E = MOLD_TARGET; template void combine_objects(Context &); } // namespace mold ================================================ FILE: src/shrink-sections.cc ================================================ // Since RISC instructions are generally up to 32 bits long, there's no // way to embed very large immediates into their branch instructions. For // example, RISC-V's JAL (jump and link) instruction can only jump to // within PC ± 1 MiB because its immediate is 21 bits long. If the // destination is further than that, we need to use two instructions // instead; the first instruction being AUIPC, which sets the upper 20 // bits of a displacement to a register, and the second being JALR, which // specifies the lower 12 bits and the register. Combined, they specify a // 32-bit displacement, which is sufficient to support the medium code // model. // // However, always using two or more instructions for function calls is a // waste of time and space if the branch target is within a single // instruction's reach. 
There are two approaches to address this problem // as follows: // // 1. The compiler optimistically emits a single branch instruction for // all function calls. The linker then checks if the branch target is // reachable, and if not, redirects the branch to a linker-synthesized // code sequence that uses two or more instructions to branch further. // That linker-synthesized code is called a "thunk". All RISC psABIs // except RISC-V and LoongArch take this approach. // // 2. The compiler pessimistically emits two instructions to branch // anywhere in PC ± 2 GiB, and the linker rewrites them with a single // instruction if the branch target is close enough. RISC-V and // LoongArch take this approach. // // This file contains functions to support (2). For (1), see thunks.cc. // // With the presence of this code-shrinking relaxation, sections can no // longer be considered as atomic units. If we delete an instruction from // the middle of a section, the section contents after that point need to // be shifted by the size of the instruction. Symbol values and relocation // offsets have to be shifted too if they refer to bytes past the deleted // ones. // // In mold, we use `r_deltas` to memorize how many bytes have been shifted // for relocations. For symbols, we directly mutate their `value` member. // // RISC-V and LoongArch object files tend to have way more relocations // than those for other targets. This is because all branches, including // those that jump within the same section, are explicitly expressed with // relocations. Here is why we need them: all control-flow statements, // such as `if` or `for`, are implemented using branch instructions. For // other targets, the compiler doesn't emit relocations for such branches // because it knows at compile-time exactly how many bytes have to be // skipped. That's not true in RISC-V and LoongArch because the linker may // delete bytes between a branch and its target. 
Therefore, all branches, // including in-section ones, have to be explicitly expressed with // relocations. // // Note that this mechanism only shrinks sections and never enlarges them, // as the compiler always emits the longest instruction sequence. This // makes the linker implementation a bit simpler because we don't need to // worry about oscillation. #if MOLD_RV64LE || MOLD_RV64BE || MOLD_RV32LE || MOLD_RV32BE || \ MOLD_LOONGARCH64 || MOLD_LOONGARCH32 #include "mold.h" #include namespace mold { using E = MOLD_TARGET; // Returns the delta of the last entry in isec's r_deltas whose offset is // strictly less than `offset`, or 0 if there is no such entry. r_deltas // is assumed to be sorted by offset, as required by the binary search. template <> i64 get_r_delta(InputSection &isec, u64 offset) { std::span deltas = isec.extra.r_deltas; // lower_bound finds the first entry whose offset is >= `offset`; the // entry just before it is the one we want. This is equivalent to the // former upper_bound call with std::less_equal, but uses a comparator // that is a valid strict weak order. auto it = ranges::lower_bound(deltas, offset, {}, &RelocDelta::offset); return (it == deltas.begin()) ? 0 : (it - 1)->delta; } // Shrinks all live executable input sections, adjusts the symbol values // that point into them, and recomputes the sizes of the executable // output sections. template <> void shrink_sections(Context &ctx) { Timer t(ctx, "shrink_sections"); // Find all relaxable relocations and record how many bytes we can save // into r_deltas. // // Technically speaking, relaxing relocations may allow more relocations // to be relaxed because the distance between a branch instruction and // its target may decrease as a result of relaxation. That said, the // number of such relocations is negligible (I tried to self-host mold // on RISC-V as an experiment and found that the mold-built .text is // only ~0.04% larger than that of GNU ld), so we don't bother to handle // them. We scan relocations only once here. tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { for (std::unique_ptr> &isec : file->sections) if (isec && isec->is_alive && (isec->shdr().sh_flags & SHF_EXECINSTR)) shrink_section(ctx, *isec); }); // Fix symbol values. Only symbols owned by the file being visited that // belong to an executable input section are adjusted. 
tbb::parallel_for_each(ctx.objs, [&](ObjectFile *file) { for (Symbol *sym : file->symbols) if (sym->file == file) if (InputSection *isec = sym->get_input_section()) if (isec->shdr().sh_flags & SHF_EXECINSTR) if (i64 delta = get_r_delta(*isec, sym->value)) sym->value -= delta; }); // Recompute sizes of executable sections tbb::parallel_for_each(ctx.chunks, [&](Chunk *chunk) { if (chunk->to_osec() && (chunk->shdr.sh_flags & SHF_EXECINSTR)) chunk->compute_section_size(ctx); }); } // Returns the distance between a relocated place and a symbol. template <> i64 compute_distance(Context &ctx, Symbol &sym, InputSection &isec, const ElfRel &rel) { // We handle absolute symbols as if they were infinitely far away // because `shrink_section` may increase a distance between a branch // instruction and an absolute symbol. Branching to an absolute // location is extremely rare in real code, though. if (sym.is_absolute()) return INT64_MAX; // Compute a distance between the relocated place and the symbol. i64 S = sym.get_addr(ctx); i64 A = rel.r_addend; i64 P = isec.get_addr() + rel.r_offset; return S + A - P; } } // namespace mold #endif ================================================ FILE: src/signal-unix.cc ================================================ #include "mold.h" #include #include #ifdef __FreeBSD__ # include # include #endif namespace mold { // Returns the message string for the current value of errno. std::string errno_string() { // strerror is not thread-safe, so guard it with a lock. static std::mutex mu; std::scoped_lock lock(mu); return strerror(errno); } // Removes the temporary output file if output_tmpfile is set. void cleanup() { if (output_tmpfile) unlink(output_tmpfile); } // mold mmap's an output file, and the mmap succeeds even if there's // not enough space left on the filesystem. The actual disk blocks are // not allocated on the mmap call but when the program writes to it // for the first time. // // If a disk becomes full as a result of a write to an mmap'ed memory // region, the failure of the write is reported as a SIGBUS. 
This // signal handler catches that signal and prints out a user-friendly // error message. Without this, it is very hard to realize that the // disk might be full. static std::string sigabrt_msg; static void sighandler(int signo, siginfo_t *info, void *ucontext) { static std::mutex mu; std::scoped_lock lock{mu}; // Handle disk full error switch (signo) { case SIGSEGV: case SIGBUS: if (output_buffer_start <= info->si_addr && info->si_addr < output_buffer_end) { const char msg[] = "mold: failed to write to an output file. Disk full?\n"; (void)!write(STDERR_FILENO, msg, sizeof(msg) - 1); } break; case SIGABRT: { (void)!write(STDERR_FILENO, &sigabrt_msg[0], sigabrt_msg.size()); break; } } // Re-throw the signal signal(SIGSEGV, SIG_DFL); signal(SIGBUS, SIG_DFL); signal(SIGABRT, SIG_DFL); cleanup(); raise(signo); } void install_signal_handler() { struct sigaction action; action.sa_sigaction = sighandler; sigemptyset(&action.sa_mask); action.sa_flags = SA_SIGINFO; sigaction(SIGSEGV, &action, NULL); sigaction(SIGBUS, &action, NULL); // OneTBB 2021.9.0 has the interface version 12090. if (TBB_runtime_interface_version() <= 12090) { sigabrt_msg = "mold: aborted\n" "mold: mold with libtbb version 2021.9.0 or older is known to be unstable " "under heavy load. Your libtbb version is " + std::string(TBB_runtime_version()) + ". 
Please upgrade your libtbb library and try again.\n"; sigaction(SIGABRT, &action, NULL); } } } // namespace mold ================================================ FILE: src/signal-win32.cc ================================================ #include "mold.h" #include namespace mold { void cleanup() { if (output_tmpfile) _unlink(output_tmpfile); } std::string errno_string() { LPVOID buf; DWORD dw = GetLastError(); FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, nullptr, dw, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPTSTR)&buf, 0, nullptr); std::string ret = (char *)buf; LocalFree(buf); return ret; } static LONG WINAPI vectored_handler(_EXCEPTION_POINTERS *exception_info) { static std::mutex mu; std::scoped_lock lock{mu}; PEXCEPTION_RECORD rec = exception_info->ExceptionRecord; ULONG_PTR *p = rec->ExceptionInformation; if (rec->ExceptionCode == EXCEPTION_IN_PAGE_ERROR && (ULONG_PTR)output_buffer_start <= p[1] && p[1] < (ULONG_PTR)output_buffer_end) { static const char msg[] = "mold: failed to write to an output file. Disk full?\n"; (void)!_write(_fileno(stderr), msg, sizeof(msg) - 1); } else if (rec->ExceptionCode == EXCEPTION_STACK_OVERFLOW) { static const char msg[] = "mold: stack overflow\n"; (void)!_write(_fileno(stderr), msg, sizeof(msg) - 1); } cleanup(); _exit(1); } void install_signal_handler() { AddVectoredExceptionHandler(0, vectored_handler); } } // namespace mold ================================================ FILE: src/subprocess-unix.cc ================================================ #include "config.h" #include "mold.h" #include #include #include #include #include #include #include namespace mold { static int pipe_write_fd = -1; // Exiting from a program with large memory usage is slow -- // it may take a few hundred milliseconds. To hide the latency, // we fork a child and let it do the actual linking work. 
template void fork_child() { int pipefd[2]; if (pipe(pipefd) == -1) { perror("pipe"); exit(1); } pid_t pid = fork(); if (pid == -1) { perror("fork"); exit(1); } if (pid > 0) { // Parent: exit successfully as soon as the child writes a byte to the // pipe. Otherwise wait for the child and mirror its exit status or // terminating signal. close(pipefd[1]); char buf[1]; if (read(pipefd[0], buf, 1) == 1) _exit(0); int status; waitpid(pid, &status, 0); if (WIFEXITED(status)) _exit(WEXITSTATUS(status)); if (WIFSIGNALED(status)) raise(WTERMSIG(status)); _exit(1); } // Child: keep the write end of the pipe for notify_parent(). close(pipefd[0]); pipe_write_fd = pipefd[1]; } // Writes one byte to the pipe set up by fork_child() so that the waiting // parent exits with status 0. Does nothing if no pipe is set; the pipe // is used at most once. template void notify_parent() { if (pipe_write_fd == -1) return; char buf[] = {1}; [[maybe_unused]] int n = write(pipe_write_fd, buf, 1); assert(n == 1); pipe_write_fd = -1; } // Returns the path to mold-wrapper.so, trying three locations in order. // Reports a fatal error if none of them is a regular file. template static std::string find_dso(Context &ctx, std::filesystem::path self) { // Look for mold-wrapper.so from the same directory as the executable is. std::filesystem::path path = self.parent_path() / "mold-wrapper.so"; std::error_code ec; if (std::filesystem::is_regular_file(path, ec) && !ec) return path; // If not found, search $(MOLD_LIBDIR)/mold, which is /usr/local/lib/mold // by default. 
path = MOLD_LIBDIR "/mold/mold-wrapper.so"; if (std::filesystem::is_regular_file(path, ec) && !ec) return path; // Look for ../lib/mold/mold-wrapper.so path = self.parent_path() / "../lib/mold/mold-wrapper.so"; if (std::filesystem::is_regular_file(path, ec) && !ec) return path; Fatal(ctx) << "mold-wrapper.so is missing"; } // Implements `mold -run COMMAND ARGS...`. Sets LD_PRELOAD to the path of // mold-wrapper.so and MOLD_PATH to the path of this executable, then // executes the command. If the command itself is ld, ld.lld or ld.gold, // this executable is run directly with the remaining arguments instead. // Never returns; a failed exec is reported as a fatal error. template [[noreturn]] void process_run_subcommand(Context &ctx, int argc, char **argv) { #ifdef __APPLE__ Fatal(ctx) << "-run is not supported on macOS"; #else assert(argv[1] == "-run"s || argv[1] == "--run"s); if (!argv[2]) Fatal(ctx) << "-run: argument missing"; // Get the mold-wrapper.so path std::string self = get_self_path(); std::string dso_path = find_dso(ctx, self); // Set environment variables setenv("LD_PRELOAD", dso_path.c_str(), 1); setenv("MOLD_PATH", self.c_str(), 1); // If ld, ld.lld or ld.gold is specified, run mold instead if (std::string cmd = path_filename(argv[2]); cmd == "ld" || cmd == "ld.lld" || cmd == "ld.gold") { std::vector args; args.push_back(argv[0]); args.insert(args.end(), argv + 3, argv + argc); args.push_back(nullptr); execv(self.c_str(), args.data()); Fatal(ctx) << "mold -run failed: " << self << ": " << errno_string(); } // Execute a given command execvp(argv[2], argv + 2); Fatal(ctx) << "mold -run failed: " << argv[2] << ": " << errno_string(); #endif } using E = MOLD_TARGET; template void fork_child(); template void notify_parent(); template void process_run_subcommand(Context &, int, char **); } // namespace mold ================================================ FILE: src/subprocess-win32.cc ================================================ #include "mold.h" namespace mold { // In this file fork_child() and notify_parent() do nothing, and the // -run subcommand is rejected with a fatal error. template void fork_child() {} template void notify_parent() {} template [[noreturn]] void process_run_subcommand(Context &ctx, int argc, char **argv) { Fatal(ctx) << "-run is supported only on Unix"; } using E = MOLD_TARGET; template void fork_child(); template void notify_parent(); template void process_run_subcommand(Context &, int, char 
**); } // namespace mold ================================================ FILE: src/thunks.cc ================================================ // RISC instructions are usually up to 4 bytes long, so the immediates of // their branch instructions are naturally smaller than 32 bits. This is // contrary to x86-64 on which branch instructions take 4-byte immediates // and can jump to anywhere within PC ± 2 GiB. // // In fact, ARM32's branch instructions can jump only within ±16 MiB and // ARM64's ±128 MiB, for example. If a branch target is further than that, // we need to let it branch to a linker-synthesized code sequence that // constructs a full 32 bit address in a register and jumps there. That // linker-synthesized code is called a "thunk". // // The functions in this file create thunks. // // Note that although thunks play an important role in an executable, they // don't take up too much space in it. For example, in clang-16's // text segment whose size is ~300 MiB on ARM64, thunks in total occupy // only ~30 KiB or 0.01%. Of course the number depends on an ISA; we would // need more thunks on ARM32 whose branch range is shorter than ARM64. // That said, the total size of thunks still isn't that much. Therefore, // we don't need to try too hard to reduce thunk size to the absolute // minimum. #if MOLD_ARM32LE || MOLD_ARM32BE || MOLD_ARM64LE || MOLD_ARM64BE || \ MOLD_PPC32 || MOLD_PPC64V1 || MOLD_PPC64V2 #include "mold.h" #include #include namespace mold { using E = MOLD_TARGET; // We create thunks for each 25.6/3.2/6.4 MiB code block for // ARM64/ARM32/PPC, respectively. static constexpr i64 batch_size = branch_distance / 5; // We assume that a single thunk group is smaller than 1 MiB. static constexpr i64 max_thunk_size = 1024 * 1024; // We align thunks to 16 byte boundaries because many processor vendors // recommend we align branch targets to 16 byte boundaries for performance // reasons. 
static constexpr i64 thunk_align = 16; /* Returns true if the relocation `rel` in `isec` must branch through a range extension thunk. Relocations that are not function calls never need one. With `first_pass` set, targets outside isec's output section, targets that have no offset yet, and symbols with a PLT are all treated as needing a thunk. */ template static bool requires_thunk(Context &ctx, InputSection &isec, const ElfRel &rel, bool first_pass) { if (!is_func_call_rel(rel)) return false; Symbol &sym = *isec.file.symbols[rel.r_sym]; if (first_pass) { // On the first pass, we pessimistically assume that all out-of-section // relocations are out of range. InputSection *isec2 = sym.get_input_section(); if (!isec2 || isec.output_section != isec2->output_section) return true; // If the target section is in the same output section but // hasn't got any address yet, that's unreachable. if (isec2->offset == -1) return true; // Even if the target is the same section, we branch to its PLT // if it has one. So a symbol with a PLT is also considered an // out-of-section reference. if (sym.has_plt(ctx)) return true; } // Thumb and ARM B instructions cannot be converted to BX, so we // always have to make them jump to a thunk to switch processor mode // even if their destinations are reachable. if constexpr (is_arm32) if (bool is_thumb = sym.get_addr(ctx) & 1; (rel.r_type == R_ARM_JUMP24 && is_thumb) || (rel.r_type == R_ARM_PLT32 && is_thumb) || (rel.r_type == R_ARM_THM_JUMP24 && !is_thumb)) return true; // On PowerPC, all PLT calls go through range extension thunks. if constexpr (is_ppc) if (sym.has_plt(ctx)) return true; // PowerPC before Power10 lacks PC-relative load/store instructions. // Functions compiled for Power9 or earlier assume that r2 points to // GOT+0x8000, while those for Power10 use r2 as a scratch register. // We need a thunk to recompute r2 for interworking. if constexpr (is_ppc64v2) if ((rel.r_type == R_PPC64_REL24 && !sym.esym().ppc64_preserves_r2()) || (rel.r_type == R_PPC64_REL24_NOTOC && sym.esym().ppc64_uses_toc())) return true; // Compute a distance between the relocated place and the symbol // and check if they are within reach. 
i64 S = sym.get_addr(ctx, NO_OPD); i64 A = get_addend(isec, rel); i64 P = isec.get_addr() + rel.r_offset; i64 val = S + A - P; return val < -branch_distance || branch_distance <= val; } /* Rebuilds `offsets` with symbols.size() + 1 entries: one per symbol slot following the thunk header, plus a final entry just past the last slot. */ template <> void Thunk::compute_size() { offsets.clear(); for (i64 i = 0; i <= symbols.size(); i++) offsets.push_back(E::thunk_hdr_size + i * E::thunk_size); } /* Assigns offsets to the member input sections of this output section, inserting thunks between them, and stores the resulting size in shdr.sh_size. Does nothing if the section has no members. */ template <> void OutputSection::create_range_extension_thunks(Context &ctx) { std::span *> m = members; if (m.empty()) return; // Initialize input sections with a dummy offset so that we can // distinguish sections that have got an address from those that // haven't. for (InputSection *isec : m) isec->offset = -1; thunks.clear(); // We create thunks from the beginning of the section to the end. // We manage progress using four offsets which increase monotonically. // The locations they point to are always A <= B <= C <= D. // // Input sections between B and C are the current batch. // // A is the input section with the smallest address that is reachable // from the current batch. // // D is the input section with the largest address such that the thunk // is reachable from the current batch if it's inserted at D. // // ................................ ............ // A B C D // ^ We insert a thunk for the current batch just before D // <---> The current batch, which is smaller than BATCH_SIZE // <--------> Smaller than BRANCH_DISTANCE // <--------> Smaller than BRANCH_DISTANCE // <-------------> Reachable from the current batch i64 a = 0; i64 b = 0; i64 c = 0; i64 d = 0; i64 offset = 0; // The smallest thunk index that is reachable from the current batch. i64 t = 0; while (b < m.size()) { // Move D forward as far as we can jump from B to a thunk at D. 
auto d_thunk_end = [&] { u64 d_end = align_to(offset, 1 << m[d]->p2align) + m[d]->sh_size; return align_to(d_end, thunk_align) + max_thunk_size; }; while (d < m.size() && (b == d || d_thunk_end() <= m[b]->offset + branch_distance)) { offset = align_to(offset, 1 << m[d]->p2align); m[d]->offset = offset; offset += m[d]->sh_size; d++; } // Move C forward so that C is apart from B by BATCH_SIZE. We want // to make sure that there's at least one section between B and C // to ensure progress. c = b + 1; while (c < d && m[c]->offset + m[c]->sh_size < m[b]->offset + batch_size) c++; // Move A forward so that A is reachable from C. i64 c_offset = (c == d) ? offset : m[c]->offset; while (a < b && m[a]->offset + branch_distance < c_offset) a++; // Erase references to out-of-range thunks. for (; t < thunks.size() && thunks[t]->offset < m[a]->offset; t++) for (Symbol *sym : thunks[t]->symbols) sym->flags = 0; // Create a new thunk and place it at D. offset = align_to(offset, thunk_align); thunks.emplace_back(std::make_unique>(*this, offset)); Thunk &thunk = *thunks.back(); std::mutex mu; // Scan relocations between B and C to collect symbols that need // entries in the new thunk. A symbol is added only if its flag was // not already set, so it is added at most once while the flag stays // set. tbb::parallel_for(b, c, [&](i64 i) { InputSection &isec = *m[i]; for (const ElfRel &rel : isec.get_rels(ctx)) { if (requires_thunk(ctx, isec, rel, true)) { if (Symbol &sym = *isec.file.symbols[rel.r_sym]; !sym.flags.test_and_set()) { std::scoped_lock lock(mu); thunk.symbols.push_back(&sym); } } } }); // Sort symbols added to the thunk to make the output deterministic. ranges::sort(thunk.symbols, {}, [](Symbol *x) { return std::tuple{x->file->priority, x->sym_idx}; }); // Now that we know the number of symbols in the thunk, we can compute // the thunk's size. thunk.compute_size(); assert(thunk.size() < max_thunk_size); offset += thunk.size(); // Move B forward to point to the beginning of the next batch. 
b = c; } // Reset flags for future use for (; t < thunks.size(); t++) for (Symbol *sym : thunks[t]->symbols) sym->flags = 0; this->shdr.sh_size = offset; } // create_range_extension_thunks() creates thunks with a pessimistic // assumption that all out-of-section references are out of range. // After computing output section addresses, we revisit all thunks to // remove unneeded entries from them. // // We create more thunks than necessary and then eliminate some of // them later, instead of just creating thunks at this stage. This is // because we can safely shrink sections after assigning addresses to // them without worrying about making existing references to thunks go // out of range. On the other hand, if we insert thunks after // assigning addresses to sections, references to thunks could become // out of range due to the new extra gaps for thunks. Thus, the // creation of thunks is a two-pass process. template <> void remove_redundant_thunks(Context &ctx) { Timer t(ctx, "remove_redundant_thunks"); // Gather output executable sections std::vector *> sections; for (Chunk *chunk : ctx.chunks) if (OutputSection *osec = chunk->to_osec()) if (osec->shdr.sh_flags & SHF_EXECINSTR) sections.push_back(osec); // Mark all symbols that actually need range extension thunks for (OutputSection *osec : sections) { tbb::parallel_for_each(osec->members, [&](InputSection *isec) { for (const ElfRel &rel : isec->get_rels(ctx)) if (requires_thunk(ctx, *isec, rel, false)) isec->file.symbols[rel.r_sym]->flags.test_and_set(); }); } // Remove symbols from thunks if they don't actually need range // extension thunks for (OutputSection *osec : sections) { for (std::unique_ptr> &thunk : osec->thunks) { std::erase_if(thunk->symbols, [&](Symbol *sym) { return !sym->flags; }); thunk->shrink_size(ctx); } } // Recompute section sizes by laying out members and thunks again in // their existing offset order. tbb::parallel_for_each(sections, [&](OutputSection *osec) { std::span *> m = osec->members; std::span>> t = osec->thunks; i64 offset = 0; while (!m.empty() || 
!t.empty()) { if (!m.empty() && (t.empty() || m[0]->offset < t[0]->offset)) { offset = align_to(offset, 1 << m[0]->p2align); m[0]->offset = offset; offset += m[0]->sh_size; m = m.subspan(1); } else { offset = align_to(offset, thunk_align); t[0]->offset = offset; offset += t[0]->size(); t = t.subspan(1); } } osec->shdr.sh_size = offset; }); // Reset flags for future use for (OutputSection *osec : sections) for (std::unique_ptr> &thunk : osec->thunks) for (Symbol *sym : thunk->symbols) sym->flags = 0; } // When applying relocations, we want to know the address in a reachable // range extension thunk for a given symbol. Doing it by scanning all // reachable range extension thunks is too expensive. // // In this function, we create a list of all addresses in range extension // thunks for each symbol, so that it is easy to find one. // // Note that thunk_addrs must be sorted for binary search. Executable // output sections are therefore visited in ascending sh_addr order. template <> void gather_thunk_addresses(Context &ctx) { Timer t(ctx, "gather_thunk_addresses"); std::vector *> sections; for (Chunk *chunk : ctx.chunks) if (OutputSection *osec = chunk->to_osec()) if (osec->shdr.sh_flags & SHF_EXECINSTR) sections.push_back(osec); ranges::stable_sort(sections, {}, [](OutputSection *x) { return x->shdr.sh_addr; }); for (OutputSection *osec : sections) { for (std::unique_ptr> &thunk : osec->thunks) { for (i64 i = 0; i < thunk->symbols.size(); i++) { Symbol &sym = *thunk->symbols[i]; u64 addr = thunk->get_addr() + thunk->offsets[i]; sym.add_thunk_addr(ctx, addr); } } } } } // namespace mold #endif ================================================ FILE: src/tls.cc ================================================ // This file contains helper functions for thread-local storage (TLS). // TLS is probably the most obscure feature the linker has to support, // so I'll explain it in detail in this comment. // // TLS is per-thread storage. Thread-local variables (TLVs) live in TLS // so that each thread has its own set of thread-local variables. 
Taking // the address of a TLV returns a unique value for each thread. For example, // `&foo` for the following code returns different pointer values for // different threads. // // thread_local int foo; // // TLV is a relatively new feature. C for example didn't provide // official support for it through the keyword `thread_local` until C11. // TLV needs coordination between the compiler, the linker and the // runtime to work correctly. // // An ELF executable or a shared library using TLV contains a "TLS template // image" in the PT_TLS segment. For each newly created thread including the // initial one, the runtime allocates a contiguous memory block for an executable // and the shared libraries it depends on and copies template images there. That // per-thread memory is called the "initial TLS block". After allocating and // initializing the initial TLS block, the runtime sets a register to refer // to the initial TLS block, so that the thread-local variables are // accessible relative to the register. // // The register referring to the per-thread storage is called the Thread // Pointer (TP). TP is part of the thread's context. When the kernel // scheduler switches threads, TP is saved and restored automatically just // like other registers are. // // The TLS template image is read-only. It contains TLVs' initial values // for new threads, and no one writes to it at runtime. // // Now, let's think about how to access a TLV. We need to know the TLV's // address to access it, which can be done in several different ways as // follows: // // 1. If we are creating an executable, we know the exact size of the TLS // template image we are creating, and we know where the TP will be set // to after the template is copied to the initial TLS block. Therefore, // the TP-relative address of a TLV in the main executable is known at // link-time. That means computing a TLV's address can be as easy as // `add %dst, %tp, <link-time constant>`. // // 2. 
If we are creating a shared library, we don't exactly know where // its TLS template image will be copied to in terms of the // TP-relative address, because we don't know how large the main // executable's and other libraries' TLS template images are. Only the // runtime knows the exact TP-relative address. // // We can solve the problem with an indirection. Specifically, for // each TLV whose TP-relative address is known only at process startup // time, we create a GOT entry to store its TP-relative address. We // then emit a dynamic relocation to let the runtime fill the GOT // entry with a TP-relative address. // // Computing a TLV address in this scheme needs at least two machine // instructions in most ISAs; the first instruction loads a value from // the GOT entry, and the second one adds the loaded value to TP. // // 3. Now, think about libraries that are dynamically loaded with dlopen. // The TLS block for such a library may not be allocated next to the // initial TLS block, so we can have two or more discontiguous TLS // blocks. There's no easy formula to compute the address of a TLV in a // separate TLS block. // // The address of a TLV in a separate TLS block can be obtained by // calling the libc-provided function, __tls_get_addr(). The function // takes two arguments: a module ID to identify the ELF file and the // TLV's offset within the ELF file's TLS template image. Accessing a // TLV is sometimes compiled to a function call! The module ID and the // offset are usually stored in the GOT as two consecutive words. // // 1) is called the Local Exec access model. 2) is Initial Exec, and 3) is // General Dynamic. // // There's another little trick that the compiler can use if it knows two // TLVs are in the same ELF file (usually in the same file as the code is). // In this case, we can call __tls_get_addr() only once with the module ID // and the offset 0 to obtain the base address of the ELF file's TLS block. 
// The base address obtained this way is sometimes called Dynamic Thread // Pointer or DTP. We can then compute TLVs' addresses by adding their // DTP-relative addresses to DTP. This access model is called the Local // Dynamic. // // The compiler tries to emit the most efficient code for a TLV access based // on compiler command line options and TLV properties as follows: // // 1. If -fno-PIC or -fPIE is given, and a TLV is defined as a file-local // variable, the compiler knows that it is compiling code for an // executable (i.e. not for a shared library) and the TLV is defined // within the executable. In this case, Local Exec is used to access the // variable. // // 2. If -fno-PIC or -fPIE is given (i.e. the compiler is compiling code for // a main executable), but a TLV is defined in another translation unit, // then the variable may be defined not in the main executable but in a // shared library. In this case, Initial Exec is used to access the // variable. // // 3. If -fPIC is given, it may be compiling code for a dlopen'able shared // library. In this case, Local Dynamic or General Dynamic is used to // access TLVs. // // You can also manually control how the compiler emits TLV access code // globally with `-ftls-model=MODEL` or on a per-variable basis with // `__attribute__((tls_model("MODEL")))`. For example, if you are // building a shared library that you have no plan to use with dlopen(), you // may want to compile the source files with `-ftls-model=initial-exec` to // avoid the cost associated with the General Dynamic access model. // // The linker may rewrite instructions with a code sequence for a cheaper // access model at link-time. // // === TLS Descriptor access model === // // As described above, there are arguably too many different TLS access // models from the most generic one you can use in any ELF file to the most // efficient one you can use only when building a main executable. Compiling // source code with an appropriate TLS access model is bothersome.
To solve // the problem, a new TLS access model was proposed. That is called the TLS // Descriptor (TLSDESC) model. // // For a TLV compiled with TLSDESC, we allocate two consecutive GOT slots // and create a TLSDESC dynamic relocation for them. The dynamic linker // sets a function pointer to the first GOT slot and its argument to the // second slot. // // To access the TLV, we call the function pointer with the argument we // read from the second GOT slot. The function returns the TLV's // TP-relative address. // // The runtime chooses the best access method depending on the situation // and sets a pointer to the most efficient code to the first GOT slot. // For example, if a TLV's TP-relative address is known at process startup // time, the runtime sets that address to the second GOT slot and sets a // function that just returns its argument to the first GOT slot. // // With TLSDESC, the compiler can always emit the same code for TLVs // without sacrificing runtime performance. // // TLSDESC is better than the traditional, non-TLSDESC TLS access models. // It's the default on ARM64, but on other targets, TLSDESC is // unfortunately either optional or even not supported at all. So we still // need to support both the traditional TLS models and the TLSDESC model. #include "mold.h" namespace mold { // Returns the TP address which can be used for efficient TLV accesses in // the main executable. TP at runtime refers to a per-thread TLS block // whose address is not known at link-time. So the address returned from // this function is the TP if the TLS template image were a TLS block. template u64 get_tp_addr(const ElfPhdr &phdr) { assert(phdr.p_type == PT_TLS); if constexpr (is_x86 || is_sparc || is_s390x) { // On x86, SPARC and s390x, TP (%gs on i386, %fs on x86-64, %g7 on SPARC // and %a0/%a1 on s390x) refers to past the end of the TLS block for // historical reasons. TLVs are accessed with negative offsets from TP.
return align_to(phdr.p_vaddr + phdr.p_memsz, phdr.p_align); } else if constexpr (is_arm || is_sh4) { // On ARM and SH4, the runtime inserts two words at the beginning // of the TLV template image when copying TLVs to the TLS block, so we need // to offset it. return align_down(phdr.p_vaddr - sizeof(Word) * 2, phdr.p_align); } else if constexpr (is_ppc || is_m68k) { // On PowerPC and m68k, TP is 0x7000 (28 KiB) past the beginning // of the TLV block to maximize the addressable range of load/store // instructions with 16-bit signed immediates. It's not exactly 0x8000 // (32 KiB) off because there's a small implementation-defined piece of // data before the initial TLV block, and the runtime wants to access // them efficiently too. return phdr.p_vaddr + 0x7000; } else { // RISC-V and LoongArch just use the beginning of the main executable's // TLV block as TP. Their load/store instructions usually take 12-bit // signed immediates, so the beginning of the TLS block ± 2 KiB is // accessible with a single load/store instruction. static_assert(is_riscv || is_loongarch); return phdr.p_vaddr; } } // Returns the address __tls_get_addr() would return if it's called // with offset 0. template u64 get_dtp_addr(const ElfPhdr &phdr) { assert(phdr.p_type == PT_TLS); if constexpr (is_ppc || is_m68k) { // On PowerPC and m68k, R_DTPOFF is resolved to the address 0x8000 // (32 KiB) past the start of the TLS block. The bias maximizes the // accessible range of load/store instructions with 16-bit signed // immediates. That is, if the offset were right at the beginning of the // start of the TLS block, half of the addressable space (negative // immediates) would have been wasted. return phdr.p_vaddr + 0x8000; } else if constexpr (is_riscv) { // On RISC-V, the bias is 0x800 as the load/store instructions in the // ISA usually have a 12-bit immediate. return phdr.p_vaddr + 0x800; } else { // On other targets, DTP simply refers to the beginning of the TLS block.
return phdr.p_vaddr; } } using E = MOLD_TARGET; template u64 get_tp_addr(const ElfPhdr &); template u64 get_dtp_addr(const ElfPhdr &); } // namespace mold ================================================ FILE: test/CMakeLists.txt ================================================ option(MOLD_ENABLE_QEMU_TESTS "Enable tests on non-native targets" ON) set(MACHINE ${CMAKE_HOST_SYSTEM_PROCESSOR}) if(EXISTS "/proc/cpuinfo") file(READ "/proc/cpuinfo" CPUINFO) endif() # CMAKE_HOST_SYSTEM_PROCESSOR returns "aarch64" on ARM32 userland # with ARM64 kernel. Here, we run `cc -dumpmachine` to determine the # compiler's default target. execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine RESULT_VARIABLE EXIT_CODE OUTPUT_VARIABLE ARCH ERROR_QUIET) if(NOT EXIT_CODE AND ARCH MATCHES "([^-]+).*") set(MACHINE ${CMAKE_MATCH_1}) endif() if(${MACHINE} MATCHES "amd64") set(MACHINE x86_64) elseif(${MACHINE} MATCHES "i.86") set(MACHINE i686) elseif(${MACHINE} MATCHES "arm.*") set(MACHINE arm) elseif(${MACHINE} STREQUAL "powerpc64") set(MACHINE ppc64) elseif(${MACHINE} STREQUAL "powerpc64le") set(MACHINE ppc64le) endif() if(MOLD_ENABLE_QEMU_TESTS) list(APPEND QEMU_ARCHS x86_64 i386 arm armeb aarch64 aarch64_be ppc ppc64 ppc64le sparc64 sh4 sh4eb s390x riscv64 riscv32 m68k loongarch64) LIST(APPEND TRIPLES x86_64-linux-gnu i686-linux-gnu aarch64-linux-gnu aarch64_be-linux-gnu arm-linux-gnueabihf armeb-linux-gnueabihf riscv64-linux-gnu powerpc-linux-gnu powerpc64-linux-gnu powerpc64le-linux-gnu sparc64-linux-gnu s390x-linux-gnu sh4-linux-gnu sh4aeb-linux-gnu riscv32-linux-gnu m68k-linux-gnu loongarch64-linux-gnu) foreach(ARCH IN LISTS QEMU_ARCHS) find_program(HAS_qemu-${ARCH} qemu-${ARCH}) endforeach() foreach(TRIPLE IN LISTS TRIPLES) find_program(HAS_${TRIPLE}-gcc ${TRIPLE}-gcc) endforeach() endif() function(add_target ARCH TRIPLE) set(CPU ${ARGV2}) if(${ARCH} STREQUAL ${MACHINE}) set(IS_NATIVE 1) endif() file(GLOB ALL_TESTS RELATIVE ${CMAKE_CURRENT_LIST_DIR} CONFIGURE_DEPENDS "*.sh")
list(FILTER ALL_TESTS EXCLUDE REGEX "^arch-") file(GLOB TESTS RELATIVE ${CMAKE_CURRENT_LIST_DIR} CONFIGURE_DEPENDS "arch-${ARCH}-*.sh") list(APPEND TESTS ${ALL_TESTS}) foreach(TEST IN LISTS TESTS) if(CPU) string(REGEX REPLACE "\\.sh$" "" TESTNAME "${ARCH}_${CPU}-${TEST}") else() string(REGEX REPLACE "\\.sh$" "" TESTNAME "${ARCH}-${TEST}") endif() add_test(NAME ${TESTNAME} COMMAND bash -x ${CMAKE_CURRENT_LIST_DIR}/${TEST} WORKING_DIRECTORY ${mold_BINARY_DIR}) set_property(TEST ${TESTNAME} APPEND PROPERTY ENVIRONMENT "MACHINE=${MACHINE};CPU=${CPU}") if(IS_NATIVE) set_property(TEST ${TESTNAME} APPEND PROPERTY SKIP_REGULAR_EXPRESSION "skipped") else() set_property(TEST ${TESTNAME} APPEND PROPERTY ENVIRONMENT "TRIPLE=${TRIPLE}") endif() endforeach() endfunction() if(${MACHINE} STREQUAL "x86_64" OR (HAS_qemu-x86_64 AND HAS_x86_64-linux-gnu-gcc)) add_target(x86_64 x86_64-linux-gnu) endif() if(${MACHINE} STREQUAL "i686" OR (HAS_qemu-i386 AND HAS_i686-linux-gnu-gcc)) add_target(i686 i686-linux-gnu) endif() if(${MACHINE} STREQUAL "aarch64" OR (HAS_qemu-aarch64 AND HAS_aarch64-linux-gnu-gcc)) add_target(aarch64 aarch64-linux-gnu) endif() if(${MACHINE} STREQUAL "aarch64_be" OR (HAS_qemu-aarch64_be AND HAS_aarch64_be-linux-gnu-gcc)) add_target(aarch64_be aarch64_be-linux-gnu) endif() if(${MACHINE} STREQUAL "arm" OR (HAS_qemu-arm AND HAS_arm-linux-gnueabihf-gcc)) add_target(arm arm-linux-gnueabihf) endif() if(${MACHINE} STREQUAL "armeb" OR (HAS_qemu-armeb AND HAS_armeb-linux-gnueabihf-gcc)) add_target(armeb armeb-linux-gnueabihf) endif() if(${MACHINE} STREQUAL "riscv64" OR (HAS_qemu-riscv64 AND HAS_riscv64-linux-gnu-gcc)) add_target(riscv64 riscv64-linux-gnu) endif() if(${MACHINE} STREQUAL "riscv32" OR (HAS_qemu-riscv32 AND HAS_riscv32-linux-gnu-gcc)) add_target(riscv32 riscv32-linux-gnu) endif() if(${MACHINE} STREQUAL "ppc" OR (HAS_qemu-ppc AND HAS_powerpc-linux-gnu-gcc)) add_target(ppc powerpc-linux-gnu) endif() if(${MACHINE} STREQUAL "ppc64" OR (HAS_qemu-ppc64 AND 
HAS_powerpc64-linux-gnu-gcc)) add_target(ppc64 powerpc64-linux-gnu) endif() if(${MACHINE} STREQUAL "ppc64le" OR (HAS_qemu-ppc64le AND HAS_powerpc64le-linux-gnu-gcc)) add_target(ppc64le powerpc64le-linux-gnu) endif() if(${MACHINE} STREQUAL "ppc64le" AND "${CPUINFO}" MATCHES "POWER10") add_target(ppc64le powerpc64le-linux-gnu power10) elseif(HAS_qemu-ppc64le AND HAS_powerpc64le-linux-gnu-gcc) file(WRITE "${CMAKE_BINARY_DIR}/empty.c" "") execute_process( COMMAND powerpc64le-linux-gnu-gcc -mcpu=power10 -E "${CMAKE_BINARY_DIR}/empty.c" RESULT_VARIABLE GCC_EXIT_CODE OUTPUT_QUIET ERROR_QUIET) execute_process( COMMAND qemu-ppc64le -cpu help OUTPUT_VARIABLE QEMU_CPUS) if("${GCC_EXIT_CODE}" EQUAL "0" AND "${QEMU_CPUS}" MATCHES power10_v2.0) add_target(ppc64le powerpc64le-linux-gnu power10) endif() endif() if(${MACHINE} STREQUAL "sparc64" OR (HAS_qemu-sparc64 AND HAS_sparc64-linux-gnu-gcc)) add_target(sparc64 sparc64-linux-gnu) endif() if(${MACHINE} STREQUAL "s390x" OR (HAS_qemu-s390x AND HAS_s390x-linux-gnu-gcc)) add_target(s390x s390x-linux-gnu) endif() if(${MACHINE} STREQUAL "sh4" OR (HAS_qemu-sh4 AND HAS_sh4-linux-gnu-gcc)) add_target(sh4 sh4-linux-gnu) endif() if(${MACHINE} STREQUAL "sh4aeb" OR (HAS_qemu-sh4eb AND HAS_sh4aeb-linux-gnu-gcc)) add_target(sh4aeb sh4aeb-linux-gnu) endif() if(${MACHINE} STREQUAL "m68k" OR (HAS_qemu-m68k AND HAS_m68k-linux-gnu-gcc)) add_target(m68k m68k-linux-gnu) endif() if(${MACHINE} STREQUAL "loongarch64" OR (HAS_qemu-loongarch64 AND HAS_loongarch64-linux-gnu-gcc)) add_target(loongarch64 loongarch64-linux-gnu) endif() ================================================ FILE: test/abs-error.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc [ $MACHINE = aarch64 ] && skip [ $MACHINE = ppc64 ] && skip [ $MACHINE = ppc64le ] && skip [ $MACHINE = s390x ] && skip [[ $MACHINE = loongarch* ]] && skip cat < extern char foo; int main() { printf("foo=%p\n", &foo); } EOF not $CC -B. 
-o $t/exe -pie $t/a.o $t/b.o -Wl,-z,text |& grep 'recompile with -fPIC' ================================================ FILE: test/absolute-symbols.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # This test crashes only on qemu-sparc64 running on GitHub Actions, # even though it works on a local x86-64 machine and on an actual # SPARC machine. [ $MACHINE = sparc64 ] && skip cat < #include #include #include void handler(int signum, siginfo_t *info, void *ptr) { printf("ip=%p\n", info->si_addr); exit(0); } extern volatile int foo; int main() { struct sigaction act; act.sa_flags = SA_SIGINFO | SA_RESETHAND; act.sa_sigaction = handler; sigemptyset(&act.sa_mask); sigaction(SIGSEGV, &act, 0); foo = 5; } EOF $CC -B. -o $t/exe -no-pie $t/a.o $t/b.o $QEMU $t/exe | grep '^ip=0xa0000.$' ================================================ FILE: test/allow-multiple-definition.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc echo 'int main() { return 0; }' | $CC -c -o $t/a.o -xc - echo 'int main() { return 1; }' | $CC -c -o $t/b.o -xc - not $CC -B. -o $t/exe $t/a.o $t/b.o 2> /dev/null $CC -B. -o $t/exe $t/a.o $t/b.o -Wl,-allow-multiple-definition $CC -B. -o $t/exe $t/a.o $t/b.o -Wl,-z,muldefs ================================================ FILE: test/ar-alignment.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <> $t/a.o cat < int two(); int three(); int main() { printf("%d\n", two() + three()); } EOF rm -f $t/d.a ar rcs $t/d.a $t/a.o $t/b.o $CC -B. -o $t/exe $t/c.o $t/d.a ================================================ FILE: test/arch-aarch64-long-thunk.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat < $t/a.c #include void fn3(); void fn4(); __attribute__((section(".low"))) void fn1() { printf(" fn1"); fn3(); } __attribute__((section(".low"))) void fn2() { printf(" fn2"); fn4(); } int main() { printf(" main"); fn1(); printf("\n"); } EOF cat < $t/b.c #include void fn1(); void fn2(); __attribute__((section(".high"))) void fn3() { printf(" fn3"); fn2(); } __attribute__((section(".high"))) void fn4() { printf(" fn4"); } EOF $CC -c -o $t/c.o $t/a.c $CC -c -o $t/d.o $t/b.c $CC -B. -o $t/exe1 $t/c.o $t/d.o \ -Wl,--section-start=.low=0x10000000,--section-start=.high=0x100000000 $QEMU $t/exe1 | grep 'main fn1 fn3 fn2 fn4' $CC -B. -o $t/exe2 $t/c.o $t/d.o \ -Wl,--section-start=.high=0x10000000,--section-start=.low=0x100000000 $QEMU $t/exe2 | grep 'main fn1 fn3 fn2 fn4' $CC -c -o $t/e.o $t/a.c -fno-PIC -mcmodel=large $CC -c -o $t/f.o $t/b.c -fno-PIC -mcmodel=large $CC -B. -o $t/exe3 $t/e.o $t/f.o -pie \ -Wl,--section-start=.low=0x10000000,--section-start=.high=0x400000000 $QEMU $t/exe3 | grep 'main fn1 fn3 fn2 fn4' $CC -B. -o $t/exe4 $t/e.o $t/f.o -pie \ -Wl,--section-start=.high=0x10000000,--section-start=.low=0x400000000 $QEMU $t/exe4 | grep 'main fn1 fn3 fn2 fn4' ================================================ FILE: test/arch-aarch64-range-extension-thunk-disassembly.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < void fn1(); void fn2(); __attribute__((section(".low"))) void fn1() { fn2(); } __attribute__((section(".high"))) void fn2() { fn1(); } int main() { fn1(); } EOF $CC -B. -o $t/exe $t/a.o \ -Wl,--section-start=.low=0x10000000,--section-start=.high=0x20000000 $OBJDUMP -dr $t/exe | grep -E ':' ================================================ FILE: test/arch-aarch64-variant-pcs.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < /dev/null || skip .global foo .type foo, %function .variant_pcs foo foo: ret EOF $CC -B. 
-shared -o $t/b.so $t/a.o readelf -W --dyn-syms $t/b.so | grep foo | grep -F '[VARIANT_PCS]' cat < /dev/null || skip #include extern char foo; int main() { printf("foo=%p\n", &foo); } EOF $CC -o $t/exe -pie $t/a.o $t/b.o >& /dev/null && skip not $CC -B. -o $t/exe -pie $t/a.o $t/b.o |& grep 'recompile with -fPIC' ================================================ FILE: test/arch-arm-exidx-sentinel.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < extern char _etext[]; int main() { printf("%p\n", _etext); } EOF $CXX -B. -o $t/exe $t/a.o -no-pie readelf --unwind $t/exe | grep "$($QEMU $t/exe) .*cantunwind" ================================================ FILE: test/arch-arm-range-extension-thunk-disassembly.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < void fn1(); void fn2(); __attribute__((section(".low"))) void fn1() { fn2(); } __attribute__((section(".high"))) void fn2() { fn1(); } int main() { fn1(); } EOF $CC -B. -o $t/exe $t/a.o \ -Wl,--section-start=.low=0x10000000,--section-start=.high=0x20000000 $OBJDUMP -dr $t/exe | grep -E -A7 ':' > $t/log grep -E 'bx\s+pc' $t/log grep -E 'add\s+pc, ip, pc' $t/log ================================================ FILE: test/arch-arm-range-extension-thunk.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc test_cflags -mthumb || skip cat < $t/a.c #include void fn3(); void fn4(); __attribute__((section(".low"))) void fn1() { printf(" fn1"); fn3(); } __attribute__((section(".low"))) void fn2() { printf(" fn2"); fn4(); } int main() { printf(" main"); fn1(); printf("\n"); } EOF cat < $t/b.c #include void fn1(); void fn2(); __attribute__((section(".high"))) void fn3() { printf(" fn3"); fn2(); } __attribute__((section(".high"))) void fn4() { printf(" fn4"); } EOF $CC -c -o $t/c.o $t/a.c -O0 -mthumb $CC -c -o $t/d.o $t/b.c -O0 -marm $CC -B. 
-o $t/exe $t/c.o $t/d.o \ -Wl,--section-start=.low=0x10000000,--section-start=.high=0x20000000 $QEMU $t/exe | grep 'main fn1 fn3 fn2 fn4' $CC -c -o $t/e.o $t/a.c -O2 -mthumb $CC -c -o $t/f.o $t/b.c -O2 -marm $CC -B. -o $t/exe $t/e.o $t/f.o \ -Wl,--section-start=.low=0x10000000,--section-start=.high=0x20000000 $QEMU $t/exe | grep 'main fn1 fn3 fn2 fn4' ================================================ FILE: test/arch-arm-target1.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < extern char *foo; char bar[] = "Hello world"; int main() { printf("%s\n", foo); } EOF $CC -B. -o $t/exe -pie $t/a.o $t/b.o $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/arch-arm-thm-jump19.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <' ================================================ FILE: test/arch-arm-thm-jump8.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <' ================================================ FILE: test/arch-arm-thumb-interwork.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc test_cflags -mthumb || skip cat < int bar(); int foo() { printf(" foo"); bar(); } EOF cat < int bar() { printf(" bar\n"); } int foo(); int main() { printf("main"); foo(); } EOF $CC -B. -o $t/exe $t/a.o $t/b.o $QEMU $t/exe | grep 'main foo bar' ================================================ FILE: test/arch-arm-tlsdesc.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc is_musl && skip test_cflags -mthumb || skip cat < $t/a.c extern _Thread_local int foo; __attribute__((section(".low"))) int get_foo() { int y = foo; return y; } static _Thread_local int bar = 5; __attribute__((section(".high"))) int get_bar() { return bar; } EOF cat < $t/b.c #include _Thread_local int foo; int get_foo(); int get_bar(); int main() { foo = 42; printf("%d %d\n", get_foo(), get_bar()); return 0; } EOF $GCC -fPIC -mtls-dialect=gnu2 -c -o $t/c.o $t/a.c -marm $GCC -fPIC -mtls-dialect=gnu2 -c -o $t/d.o $t/b.c -marm $CC -B. -o $t/exe1 $t/c.o $t/d.o $QEMU $t/exe1 | grep '42 5' $CC -B. -o $t/exe2 $t/c.o $t/d.o -Wl,-no-relax $QEMU $t/exe2 | grep '42 5' $CC -B. -o $t/exe3 $t/c.o $t/d.o -Wl,-no-relax \ -Wl,--section-start=.low=0x10000000,--section-start=.high=0x20000000 $QEMU $t/exe3 | grep '42 5' $GCC -B. -shared -o $t/c.so $t/c.o -Wl,-z,nodlopen $CC -B. -o $t/exe4 $t/c.so $t/d.o $QEMU $t/exe4 | grep '42 5' $GCC -fPIC -mtls-dialect=gnu2 -c -o $t/e.o $t/a.c -mthumb $GCC -fPIC -mtls-dialect=gnu2 -c -o $t/f.o $t/b.c -mthumb $CC -B. -o $t/exe5 $t/e.o $t/f.o $QEMU $t/exe5 | grep '42 5' $CC -B. -o $t/exe6 $t/e.o $t/f.o -Wl,-no-relax $QEMU $t/exe6 | grep '42 5' $CC -B. -o $t/exe7 $t/e.o $t/f.o -Wl,-no-relax \ -Wl,--section-start=.low=0x10000000,--section-start=.high=0x20000000 $QEMU $t/exe7 | grep '42 5' $GCC -B. -shared -o $t/e.so $t/e.o -Wl,-z,nodlopen $CC -B. -o $t/exe8 $t/e.so $t/f.o $QEMU $t/exe8 | grep '42 5' ================================================ FILE: test/arch-armeb-be32.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat < int get_foo(); extern _Thread_local int bar; int main() { printf("%d %d\n", get_foo(), bar); } EOF $CC -o $t/exe1 $t/a.o $t/b.o $t/c.o -pie $QEMU $t/exe1 | grep '^20 3$' $CC -o $t/exe2 $t/a.o $t/b.o $t/c.o -Wl,-no-relax -pie $QEMU $t/exe2 | grep '^20 3$' $CC -o $t/d.so $t/a.o -shared $CC -o $t/exe3 $t/b.o $t/c.o $t/d.so -pie $QEMU $t/exe3 | grep '^20 3$' ================================================ FILE: test/arch-i686-tlsdesc.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc supports_tlsdesc || skip cat <<'EOF' | $GCC -c -o $t/a.o -xassembler - .globl get_foo .type get_foo, @function get_foo: pushl %ebx call __x86.get_pc_thunk.bx addl $_GLOBAL_OFFSET_TABLE_, %ebx subl $8, %esp leal foo@TLSDESC(%ebx), %ebx movl %ebx, %eax call *foo@TLSCALL(%eax) movl %gs:(%eax), %eax addl $8, %esp popl %ebx ret EOF cat < _Thread_local int foo; int get_foo(); int main() { foo = 42; printf("%d\n", get_foo()); } EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o $QEMU $t/exe1 | grep 42 $CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,-no-relax $QEMU $t/exe2 | grep 42 $CC -B. -shared -o $t/c.so $t/a.o $CC -B. -o $t/exe3 $t/b.o $t/c.so $QEMU $t/exe3 | grep 42 $CC -B. -shared -o $t/c.so $t/a.o -Wl,-no-relax $CC -B. -o $t/exe4 $t/b.o $t/c.so -Wl,-no-relax $QEMU $t/exe4 | grep 42 ================================================ FILE: test/arch-loongarch64-mcmodel-extreme.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < char msg[] = "Hello world\n"; int main() { printf(msg); } EOF $CC -B. -o $t/exe1 $t/a.o $QEMU $t/exe1 | grep 'Hello world' ================================================ FILE: test/arch-loongarch64-relax-call36.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat <<'EOF' | $CC -o $t/a.o -c -xassembler - .globl foo, bar .space 0x100000 foo: move $s0, $ra .reloc ., R_LARCH_CALL36, print .reloc ., R_LARCH_RELAX pcaddu18i $t0, 0 jirl $ra, $t0, 0 move $ra, $s0 ret bar: .reloc ., R_LARCH_CALL36, print .reloc ., R_LARCH_RELAX pcaddu18i $t0, 0 jirl $zero, $t0, 0 .space 0x100000 EOF cat < void foo(); void bar(); void print() { printf("foo"); } int main() { foo(); bar(); printf("\n"); } EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o -Wl,--no-relax $QEMU $t/exe1 | grep foofoo $OBJDUMP -d $t/exe1 > $t/exe1.objdump grep -A2 ':' $t/exe1.objdump | grep -w pcaddu18i grep -A2 ':' $t/exe1.objdump | grep -w pcaddu18i $CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,--relax $QEMU $t/exe2 | grep foofoo $OBJDUMP -d $t/exe2 > $t/exe2.objdump grep -A2 ':' $t/exe2.objdump | grep -w bl grep -A2 ':' $t/exe2.objdump | grep -w b ================================================ FILE: test/arch-loongarch64-relax-got-load.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int get_foo(); int main() { printf("%d\n", get_foo()); } EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o -pie -Wl,--no-relax $QEMU $t/exe1 | grep '^3$' $OBJDUMP -d $t/exe1 | grep -A2 ':' | grep -Fw pcalau12i $OBJDUMP -d $t/exe1 | grep -A2 ':' | grep -Fw ld.d $CC -B. -o $t/exe2 $t/a.o $t/b.o $t/c.o -pie -Wl,--relax $QEMU $t/exe2 | grep '^3$' $OBJDUMP -d $t/exe2 | grep -A1 ':' | grep -Fw pcaddi $CC -B. -o $t/exe3 $t/a.o $t/b.o $t/c.o -pie -Wl,--relax \ -Wl,-Ttext=0x1000000,-Tdata=0x2000000 $QEMU $t/exe3 | grep '^3$' $OBJDUMP -d $t/exe3 | grep -A2 ':' | grep -Fw pcalau12i $OBJDUMP -d $t/exe3 | grep -A2 ':' | grep -Fw addi.d ================================================ FILE: test/arch-loongarch64-relax-pcala-addi.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat <<'EOF' | $CC -o $t/a.o -c -xassembler - .globl get_sym1, get_sym2, get_sym3 get_sym1: la.pcrel $a0, sym1 ret get_sym2: la.pcrel $a0, sym2 ret get_sym3: la.pcrel $a0, sym3 ret EOF cat <<'EOF' | $CC -o $t/b.o -c -xassembler - .globl sym1, sym2, sym3 sym1: li.d $a0, 1 ret .space 1024 * 1024 sym2: li.d $a0, 2 ret .space 1024 * 1024 sym3: li.d $a0, 3 ret EOF cat < int (*get_sym1())(); int (*get_sym2())(); int (*get_sym3())(); int main() { printf("%d %d %d\n", get_sym1()(), get_sym2()(), get_sym3()()); } EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o -Wl,--no-relax $QEMU $t/exe1 | grep '^1 2 3$' $OBJDUMP -d $t/exe1 > $t/exe1.objdump grep -A1 ':' $t/exe1.objdump | grep pcalau12i grep -A1 ':' $t/exe1.objdump | grep pcalau12i grep -A1 ':' $t/exe1.objdump | grep pcalau12i $CC -B. -o $t/exe2 $t/a.o $t/b.o $t/c.o -Wl,--relax $QEMU $t/exe2 | grep '^1 2 3$' $OBJDUMP -d $t/exe2 > $t/exe2.objdump grep -A1 ':' $t/exe2.objdump | grep pcaddi grep -A1 ':' $t/exe2.objdump | grep pcaddi grep -A1 ':' $t/exe2.objdump | grep pcalau12i ================================================ FILE: test/arch-loongarch64-relax-tlsdesc.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <<'EOF' | $CC -o $t/a.o -c -xc - -fPIC _Thread_local char foo[4] = "foo"; _Thread_local char padding[100000] = "padding"; EOF cat <<'EOF' | $CC -o $t/b.o -c -xc - -fPIC _Thread_local char bar[4] = "bar"; EOF cat <<'EOF' | $CC -o $t/c.o -c -xc - -fPIC -mtls-dialect=desc -O2 extern _Thread_local char foo[4]; extern _Thread_local char bar[4]; char *get_foo() { return foo; } char *get_bar() { return bar; } EOF cat < char *get_foo(); char *get_bar(); int main() { printf("%s %s\n", get_foo(), get_bar()); } EOF $CC -B. 
-o $t/exe1 $t/a.o $t/b.o $t/c.o $t/d.o -Wl,--no-relax { $QEMU $t/exe1; true; } |& grep 'unexpected reloc type' && skip $QEMU $t/exe1 | grep 'foo bar' $OBJDUMP -d $t/exe1 > $t/exe1.objdump grep -A6 ':' $t/exe1.objdump | grep -F pcalau12i grep -A6 ':' $t/exe1.objdump | grep -F pcalau12i $CC -B. -o $t/exe2 $t/a.o $t/b.o $t/c.o $t/d.o -Wl,--relax $QEMU $t/exe2 | grep 'foo bar' $OBJDUMP -d $t/exe2 > $t/exe2.objdump grep -A6 ':' $t/exe2.objdump | grep -F li.w grep -A6 ':' $t/exe2.objdump | grep -F lu12i.w ================================================ FILE: test/arch-ppc64le-save-restore-gprs.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe $t/a.o $OBJDUMP -d $t/exe | grep '<_savegpr0_14>' ================================================ FILE: test/arch-riscv64-attributes.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/log not grep -F .riscv.attributes $t/log not grep -F RISCV_ATTR $t/log ================================================ FILE: test/arch-riscv64-global-pointer-dso.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe $t/b.so $t/c.o $t/d.o $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/arch-riscv64-global-pointer.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe1 $t/a.o -fno-PIE readelf -W --dyn-syms $t/exe1 | grep -F '__global_pointer$' $CC -B. -o $t/exe2 $t/a.o -fPIE readelf -W --dyn-syms $t/exe2 | grep -F '__global_pointer$' cat < int hello() { printf("Hello world\n"); } EOF $CC -B. 
-o $t/c.so $t/b.o -shared readelf -W --dyn-syms $t/c.so | not grep -F '__global_pointer$' ================================================ FILE: test/arch-riscv64-obj-compatible.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc set -x cat < #include extern char x1, x2; int main() { printf("%lu %lu %lu\n", (uintptr_t)&x1 % 32, (uintptr_t)&x2 % 32, &x2 - &x1); } EOF $CC -B. -o $t/exe $t/a.o $t/b.o $t/c.o $QEMU $t/exe | grep '0 0 32' ================================================ FILE: test/arch-riscv64-relax-got.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int get_sym1(); int get_sym2(); int get_sym3(); int get_sym4(); int get_sym5(); int main() { printf("%x %x %x %x %x\n", get_sym1(), get_sym2(), get_sym3(), get_sym4(), get_sym5()); } EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o -Wl,--no-relax $QEMU $t/exe1 | grep -E '^0 ba beef 11beef deadbeef$' $CC -B. -o $t/exe2 $t/a.o $t/b.o $t/c.o $QEMU $t/exe2 | grep -E '^0 ba beef 11beef deadbeef$' $OBJDUMP -d $t/exe2 | grep -A2 ':' | grep -E $'li[ \t]+a0,186$' ================================================ FILE: test/arch-riscv64-relax-hi20.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int get_foo(); int get_foo2(); int get_bar(); int get_bar2(); int get_baz(); int main() { printf("%x %x %x %x %x\n", get_foo(), get_foo2(), get_bar(), get_bar2(), get_baz()); } EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o -Wl,--no-relax $QEMU $t/exe1 | grep 'f00 10000f00 0 1ffff 11beef' $CC -B. 
-o $t/exe2 $t/a.o $t/b.o $t/c.o $QEMU $t/exe2 | grep 'f00 10000f00 0 1ffff 11beef' readelf --syms $t/exe1 > $t/log1 grep -E ' 10 NOTYPE .* get_foo$' $t/log1 grep -E ' 10 NOTYPE .* get_foo2$' $t/log1 grep -E ' 10 NOTYPE .* get_bar$' $t/log1 grep -E ' 10 NOTYPE .* get_bar2$' $t/log1 grep -E ' 10 NOTYPE .* get_baz$' $t/log1 readelf --syms $t/exe2 > $t/log2 grep -E ' 8 NOTYPE .* get_foo$' $t/log2 grep -E ' 10 NOTYPE .* get_foo2$' $t/log2 grep -E ' 6 NOTYPE .* get_bar$' $t/log2 grep -E ' 10 NOTYPE .* get_bar2$' $t/log2 grep -E ' 10 NOTYPE .* get_baz$' $t/log2 ================================================ FILE: test/arch-riscv64-relax-j.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <' cat <' ================================================ FILE: test/arch-riscv64-reloc-overflow.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int get_foo(); int get_bar(); int main() { printf("%x %x\n", get_foo(), get_bar()); } EOF $CC -B. -o $t/exe $t/a.o $t/b.o $t/c.o readelf --syms $t/a.o | grep -E ' 10 FUNC .* get_foo$' readelf --syms $t/a.o | grep -E ' 10 FUNC .* get_bar$' readelf --syms $t/exe | grep -E ' 8 FUNC .* get_foo$' readelf --syms $t/exe | grep -E ' 8 FUNC .* get_bar$' ================================================ FILE: test/arch-riscv64-variant-cc.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < /dev/null || skip .global foo .type foo, %function .variant_cc foo foo: ret EOF $CC -B. -shared -o $t/b.so $t/a.o readelf -W --dyn-syms $t/b.so | grep foo | grep -F '[VARIANT_CC]' cat < long foo(); int main() { printf("%ld\n", foo()); } EOF $CC -B. -static -o $t/exe $t/a.o $t/b.o $QEMU $t/exe | grep '^0$' ================================================ FILE: test/arch-s390x-got.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc # GOT[0] must be set to the link-time address of .dynamic on s390x. cat < extern char _DYNAMIC; extern void *got[]; int main() { printf("%d %p %p\n", &_DYNAMIC == got[0], &_DYNAMIC, got[0]); } EOF $CC -B. -o $t/exe $t/a.o -Wl,-defsym=got=_GLOBAL_OFFSET_TABLE_ -no-pie $QEMU $t/exe | grep -E '^1' ================================================ FILE: test/arch-x86_64-address-equality.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < void *foo(); void bar(); int main() { printf("%d %p %p\n", foo() == bar, foo(), bar); } EOF cat <& /dev/null || skip { sde64 -help; true; } | grep 'Diamond Rapids' || skip cat < void foo(); void *get_foo_addr(); int main() { printf("%d %p %p\n", foo == get_foo_addr(), foo, get_foo_addr()); } EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o $t/d.o sde64 -dmr -- $t/exe1 | grep -E '^1 ' $OBJDUMP -d $t/exe1 | grep -A1 ':' | grep -w lea $CC -B. -o $t/exe2 $t/a.o $t/c.so $t/d.o sde64 -dmr -- $t/exe2 | grep -E '^1 ' $OBJDUMP -d $t/exe2 | grep -A1 ':' | grep -w mov ================================================ FILE: test/arch-x86_64-apx-gottpoff.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # We use Intel SDE to run programs compiled for APX command -v sde64 >& /dev/null || skip { sde64 -help; true; } | grep 'Diamond Rapids' || skip cat <<'EOF' | $CC -o $t/a.o -c -xassembler - || skip .globl get_foo get_foo: mov foo@gottpoff(%rip), %r16 mov %fs:(%r16), %eax ret EOF $OBJDUMP -r $t/a.o | grep -w R_X86_64_CODE_4_GOTTPOFF cat < _Thread_local int foo = 3; int get_foo(); int main() { printf("%d %d\n", foo, get_foo()); } EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o sde64 -dmr -- $t/exe1 | grep -E '^3 3$' $CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,-no-relax sde64 -dmr -- $t/exe2 | grep -E '^3 3$' ================================================ FILE: test/arch-x86_64-apx-gottpoff2.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc # We use Intel SDE to run programs compiled for APX command -v sde64 >& /dev/null || skip { sde64 -help; true; } | grep 'Diamond Rapids' || skip cat < _Thread_local int foo = 3; int get_foo(); int main() { printf("%d %d\n", foo, get_foo()); } EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o sde64 -dmr -- $t/exe1 | grep -E '^3 3$' ================================================ FILE: test/arch-x86_64-apx-tlsdesc.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc supports_tlsdesc || skip # We use Intel SDE to run programs compiled for APX command -v sde64 >& /dev/null || skip { sde64 -help; true; } | grep 'Diamond Rapids' || skip cat < _Thread_local int foo; int get_foo(); int main() { foo = 42; printf("%d\n", get_foo()); } EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o sde64 -dmr -- $t/exe1 | grep 42 $CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,-no-relax sde64 -dmr -- $t/exe2 | grep 42 $CC -shared -o $t/c.so $t/a.o $CC -B. -o $t/exe3 $t/b.o $t/c.so sde64 -dmr -- $t/exe3 | grep 42 $CC -shared -o $t/c.so $t/a.o -Wl,-no-relax $CC -B. -o $t/exe4 $t/b.o $t/c.so -Wl,-no-relax sde64 -dmr -- $t/exe4 | grep 42 ================================================ FILE: test/arch-x86_64-empty-arg.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc nm -D mold | grep __msan_init && skip not ./mold -m elf_x86_64 '' |& grep 'cannot open :' ================================================ FILE: test/arch-x86_64-empty-mergeable-section.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/log grep '.rodata.str1.1 .* AMS ' $t/log not grep '.rodata.str1.1 .* AM ' $t/log ================================================ FILE: test/arch-x86_64-emulation-deduction.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <& /dev/null readelf --segments -W $t/exe | grep 'GNU_STACK.* RW ' $CC -B. 
-o $t/exe $t/a.o -Wl,-z,execstack-if-needed readelf --segments -W $t/exe | grep 'GNU_STACK.* RWE ' ================================================ FILE: test/arch-x86_64-function-multiversion.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc supports_ifunc || skip [ "$(uname)" = FreeBSD ] && skip cat < class Hello { public: __attribute__((target("default"))) void say() { std::cout << "Hello\n"; } __attribute__((target("popcnt"))) void say() { std::cout << "Howdy\n"; } }; void hello() { Hello().say(); } EOF $CXX -B. -shared -o $t/b.so $t/a.o cat <& /dev/null || skip $OBJDUMP -d $t/exe | grep -A1 '<__x86.get_pc_thunk.bx>:' | grep -F 'puts$plt' ================================================ FILE: test/arch-x86_64-gnu-retain.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc echo '.section foo,"R"' | $CC -o /dev/null -c -xassembler - 2> /dev/null || skip cat < extern char foo[5000L * 1000 * 1000]; extern char bar[1000 * 1000]; int main() { printf("%d %d\n", foo[0], bar[0]); } EOF $CC -B. -o $t/exe $t/a.o $t/b.o $QEMU $t/exe ================================================ FILE: test/arch-x86_64-ifunc-alias.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc supports_ifunc || skip test_cflags -static || skip cat < #include __attribute__((target("default"))) int foo() { return 0; } __attribute__((target("ssse3,avx2"))) int foo() { return 1; } int (*p)() = foo; int main() { int val = foo(); assert(val == p()); } EOF $CXX -B. -o $t/exe $t/a.o -static $QEMU $t/exe ================================================ FILE: test/arch-x86_64-ifunc-export.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc supports_ifunc || skip cat < struct MyClass { MyClass(T data) { myfunc(data); } }; template struct MyClass; EOF $CXX -shared -o $t/b.so $t/a.o cat < struct MyClass { MyClass(T data) { myfunc(data); } }; extern template struct MyClass; int main() { MyClass obj(3); } EOF $CXX -B. -o $t/exe $t/c.o $t/b.so -fno-PIE $QEMU $t/exe |& grep 'circular dependency' || skip $QEMU $t/exe ================================================ FILE: test/arch-x86_64-incompatible-libs-linker-script.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc test_cflags -m32 || skip mkdir -p $t/foo echo 'char hello[] = "Hello world";' | $CC -shared -o $t/libbar.so -m32 -xc - echo 'char hello[] = "Hello world";' | $CC -shared -o $t/foo/libbar.so -xc - cat < extern char hello[]; int main() { printf("%s\n", hello); } EOF cat < $t/b.script INPUT(libbar.so) EOF cd $t $CC -B$OLDPWD -o exe1 -Lfoo a.o b.script LD_LIBRARY_PATH=. $QEMU ./exe1 | grep 'Hello world' $CC -B$OLDPWD -o exe2 -Lfoo b.script a.o LD_LIBRARY_PATH=. $QEMU ./exe2 | grep 'Hello world' ================================================ FILE: test/arch-x86_64-incompatible-libs-linker-script2.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc nm mold | grep '__tsan_init' && skip test_cflags -m32 || skip mkdir -p $t/foo cat < extern char hello[]; int main() { printf("%s\n", hello); } EOF cat < $t/d.script INPUT(a.o) EOF cd $t $OLDPWD/ld -o e.o -r -Lfoo d.script c.o $OLDPWD/ld -o f.o -r -Lfoo c.o d.script ================================================ FILE: test/arch-x86_64-incompatible-libs.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc test_cflags -m32 || skip cat < extern char hello[]; int main() { printf("%s\n", hello); } EOF mkdir -p $t/script echo 'OUTPUT_FORMAT(elf32-i386)' > $t/script/libfoo.so $CC -B. 
-o $t/exe -L$t/script -L$t/lib32 -L$t/lib64 \ $t/e.o -lfoo -Wl,-rpath $t/lib64 >& $t/log grep 'script/libfoo.so: skipping incompatible file' $t/log grep 'lib32/libfoo.so: skipping incompatible file' $t/log grep 'lib32/libfoo.a: skipping incompatible file' $t/log $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/arch-x86_64-incompatible-libs2.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc test_cflags -m32 || skip nm mold | grep '__tsan_init' && skip cat < extern char hello[]; int main() { printf("%s\n", hello); } EOF mkdir -p $t/script echo 'GROUP(libfoo.so)' > $t/script/libfoo.so $CC -B. -o $t/exe -L$t/lib32 -L$t/lib64 -lfoo $t/e.o -Wl,-rpath $t/lib64 |& grep 'lib32/libfoo.so: skipping incompatible file' $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/arch-x86_64-incompatible-obj.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc test_cflags -m32 || skip cat <& $t/log grep "$t/b.o: incompatible file type: x86_64 is expected but got i386" $t/log ================================================ FILE: test/arch-x86_64-init-array-readonly.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < void init1() { printf("init1 "); } void init2() { printf("init2 "); } int main() { return 0; } EOF $CC -B. -o $t/exe $t/a.o $t/b.o $t/c.o $QEMU $t/exe | grep 'init1 init2' ================================================ FILE: test/arch-x86_64-init-array.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < void init1() { printf("init1 "); } void init2() { printf("init2 "); } void fini1() { printf("fini1\n"); } void fini2() { printf("fini2 "); } int main() { return 0; } EOF $CC -B. 
-o $t/exe $t/a.o $t/b.o $QEMU $t/exe | grep 'init1 init2 fini2 fini1' ================================================ FILE: test/arch-x86_64-isa-level.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < char arr1[0xc0000000]; extern char arr2[0xc0000000]; int main() { printf("%d %lx\n", (void *)arr1 < (void *)arr2, arr2 - arr1); } EOF cat < /dev/null || skip test_cflags -fcf-protection=branch || skip cat < $t/log1 grep -E '.note.bar\s+NOTE.+000008 00 A 0 0 4' $t/log1 grep -E '.note.baz\s+NOTE.+000008 00 A 0 0 8' $t/log1 grep -E '.note.nonalloc\s+NOTE.+000008 00 0 0 1' $t/log1 readelf --segments $t/exe > $t/log2 grep -F '01 .note.baz .note.foo .note.bar' $t/log2 not grep 'NOTE.*0x0000000000000000 0x0000000000000000' $t/log2 ================================================ FILE: test/arch-x86_64-note2.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # Binutils 2.32 injects their own .note.gnu.property section interfering with the tests test_cflags -Xassembler -mx86-used-note=no && CFLAGS="-Xassembler -mx86-used-note=no" || CFLAGS="" cat < #include void preinit() { write(STDOUT_FILENO, "preinit ", 8); } void init() { write(STDOUT_FILENO, "init ", 5); } void fini() { write(STDOUT_FILENO, "fini\n", 5); } int main() {} EOF $CC -B. -o $t/exe $t/a.o $t/b.o $QEMU $t/exe | grep 'preinit init fini' ================================================ FILE: test/arch-x86_64-relax.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc # Skip if target is not x86-64 echo ret | cc -c -o /dev/null -xassembler -Wa,-mrelax-relocations=yes - 2> /dev/null || skip cat <:/,/<.*>:/p' > $t/log grep -E 'lea \s*0x.+\(%rip\),%rax .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%rcx .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%rdx .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%rbx .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%rbp .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%rsi .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%rdi .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r8 .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r9 .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r10 .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r11 .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r12 .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r13 .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r14 .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r15 .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r16 .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r17 .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r18 .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r19 .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r20 .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r21 .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r22 .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r23 .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r24 .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r25 .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r26 .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r27 .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r28 .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r29 .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r30 .*' $t/log grep -E 'lea \s*0x.+\(%rip\),%r31 .*' $t/log grep -E 'call.*' $t/log grep -E 'jmp.*' $t/log ================================================ FILE: test/arch-x86_64-reloc-overflow.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat < int print(int x) { printf("%d\n", x); return 0; } int print64(long x) { printf("%ld\n", x); return 0; } EOF $CC -shared -o $t/c.so $t/a.o $t/b.o -Wl,-z,noexecstack # Absolute symbol cat <<'EOF' > $t/d.s .globl abs_sym .set abs_sym, 42 .globl main main: sub $8, %rsp lea abs_sym, %edi call print add $8, %rsp ret EOF $CC -B. -o $t/exe $t/c.so $t/d.s -no-pie $QEMU $t/exe | grep '^42$' $CC -B. -o $t/exe $t/c.so $t/d.s -pie $QEMU $t/exe | grep '^42$' # GOT cat <<'EOF' > $t/d.s .globl main main: sub $8, %rsp mov ext_var@GOTPCREL(%rip), %rdi mov (%rdi), %edi call print add $8, %rsp ret EOF $CC -B. -o $t/exe $t/c.so $t/d.s -no-pie $QEMU $t/exe | grep '^56$' $CC -B. -o $t/exe $t/c.so $t/d.s -pie $QEMU $t/exe | grep '^56$' # Copyrel cat <<'EOF' > $t/d.s .globl main main: sub $8, %rsp mov ext_var(%rip), %edi call print add $8, %rsp ret EOF $CC -c -o $t/d.o $t/d.s $CC -B. -o $t/exe $t/c.so $t/d.o -no-pie $QEMU $t/exe | grep '^56$' $CC -B. -o $t/exe $t/c.so $t/d.s -pie $QEMU $t/exe | grep '^56$' # Copyrel cat <<'EOF' > $t/d.s .globl main main: sub $8, %rsp mov foo(%rip), %rdi mov (%rdi), %edi call print add $8, %rsp ret .data foo: .quad ext_var EOF $CC -B. -o $t/exe $t/c.so $t/d.s -no-pie $QEMU $t/exe | grep '^56$' $CC -B. -o $t/exe $t/c.so $t/d.s -pie $QEMU $t/exe | grep '^56$' # PLT cat <<'EOF' > $t/d.s .globl main main: sub $8, %rsp mov $76, %edi call print@PLT add $8, %rsp ret EOF $CC -B. -o $t/exe $t/c.so $t/d.s -no-pie $QEMU $t/exe | grep '^76$' $CC -B. -o $t/exe $t/c.so $t/d.s -pie $QEMU $t/exe | grep '^76$' # PLT cat <<'EOF' > $t/d.s .globl main main: sub $8, %rsp mov $76, %edi lea print(%rip), %rax call *%rax add $8, %rsp ret EOF $CC -B. -o $t/exe $t/c.so $t/d.s -no-pie $QEMU $t/exe | grep '^76$' $CC -B. -o $t/exe $t/c.so $t/d.s -pie $QEMU $t/exe | grep '^76$' # SIZE32 cat <<'EOF' > $t/d.s .globl main main: sub $8, %rsp mov $foo+2@SIZE, %edi call print@PLT add $8, %rsp ret .data .globl foo .type foo, %object .size foo, 24 foo: EOF $CC -B. 
-o $t/exe $t/c.so $t/d.s $QEMU $t/exe | grep '^26$' # SIZE64 cat <<'EOF' > $t/d.s .globl main main: sub $8, %rsp movabs $foo+5@SIZE, %rdi call print64@PLT add $8, %rsp ret .data .globl foo .type foo, %object .size foo, 56 foo: EOF $CC -B. -o $t/exe $t/c.so $t/d.s $QEMU $t/exe | grep '^61$' # GOTPCREL64 cat <<'EOF' > $t/e.c extern long ext_var; static long arr[50000] = {1, 2, 3}; void print64(long); int main() { print64(ext_var * 1000000 + arr[2]); } EOF $CC -c -o $t/e.o $t/e.c -mcmodel=large -fPIC $CC -B. -o $t/exe $t/c.so $t/e.o $QEMU $t/exe | grep '^56000003$' # R_X86_64_32 against non-alloc section cat <<'EOF' > $t/f.s .globl main main: sub $8, %rsp add $8, %rsp ret .section .foo, "", @progbits .zero 16 foo: .quad bar .section .bar, "", @progbits .zero 24 bar: .quad foo EOF $CC -c -o $t/f.o $t/f.s $CC -B. -o $t/exe $t/f.o readelf -x .foo -x .bar $t/exe > $t/log grep -F '0x00000010 00000000 00000000 10000000 00000000' $t/log grep -F '0x00000010 18000000 00000000' $t/log ================================================ FILE: test/arch-x86_64-section-alignment.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <<'EOF' | $CC -c -o $t/a.o -xc - #include #include __attribute__((aligned(8192))) int foo = 1; typedef struct { uint8_t e_ident[16]; uint16_t e_type; uint16_t e_machine; uint32_t e_version; uint64_t e_entry; uint64_t e_phoff; uint64_t e_shoff; uint32_t e_flags; uint16_t e_ehsize; uint16_t e_phentsize; uint16_t e_phnum; uint16_t e_shentsize; uint16_t e_shnum; uint16_t e_shstrndx; } Ehdr; char __ehdr_start; int main() { Ehdr *e = (Ehdr *)&__ehdr_start; printf("%lu %lu %lu\n", e->e_phoff % 8, e->e_shoff % 8, (uint64_t)&foo % 8192); } EOF $CC -B. -o $t/exe $t/a.o $QEMU $t/exe | grep '^0 0 0$' ================================================ FILE: test/arch-x86_64-section-name.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat <<'EOF' | $CC -o $t/a.o -c -x assembler - .globl _start .text _start: ret .section .text.hot .ascii ".text.hot " .section .text.hot.foo .ascii ".text.hot.foo " .section .text.unknown .ascii ".text.unknown " .section .text.unknown.foo .ascii ".text.unknown.foo " .section .text.unlikely .ascii ".text.unlikely " .section .text.unlikely.foo .ascii ".text.unlikely.foo " .section .text.startup .ascii ".text.startup " .section .text.startup.foo .ascii ".text.startup.foo " .section .text.exit .ascii ".text.exit " .section .text.exit.foo .ascii ".text.exit.foo " .section .text .ascii ".text " .section .text.foo .ascii ".text.foo " .section .data.rel.ro .ascii ".data.rel.ro " .section .data.rel.ro.foo .ascii ".data.rel.ro.foo " .section .data .ascii ".data " .section .data.foo .ascii ".data.foo " .section .rodata .ascii ".rodata " .section .rodata.foo .ascii ".rodata.foo " EOF ./mold -o $t/exe $t/a.o -z keep-text-section-prefix readelf -p .text.hot $t/exe | grep -F '.text.hot .text.hot.foo' readelf -p .text.unknown $t/exe | grep -F '.text.unknown .text.unknown.foo' readelf -p .text.unlikely $t/exe | grep -F '.text.unlikely .text.unlikely.foo' readelf -p .text.startup $t/exe | grep -F '.text.startup .text.startup.foo' readelf -p .text.exit $t/exe | grep -F '.text.exit .text.exit.foo' readelf -p .text $t/exe | grep -F '.text .text.foo' readelf -p .data.rel.ro $t/exe | grep -F '.data.rel.ro .data.rel.ro.foo' readelf -p .data $t/exe | grep -F '.data .data.foo' readelf -p .rodata $t/exe | grep -F '.rodata .rodata.foo' ./mold -o $t/exe $t/a.o readelf --sections $t/exe | not grep -F .text.hot ./mold -o $t/exe $t/a.o -z nokeep-text-section-prefix readelf --sections $t/exe | not grep -F .text.hot ================================================ FILE: test/arch-x86_64-tbss-only.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc # Test if grep supports backreferences echo abab | grep -E '(ab)\1' || skip cat < static _Thread_local int x1 = 1; static _Thread_local int x2; extern _Thread_local int x3; extern _Thread_local int x4; int get_x5(); int get_x6(); int main() { x2 = 2; printf("%d %d %d %d %d %d\n", x1, x2, x3, x4, get_x5(), get_x6()); return 0; } EOF cat < __attribute__((tls_model("global-dynamic"))) static _Thread_local int x1 = 1; __attribute__((tls_model("global-dynamic"))) _Thread_local int x2 = 2; __attribute__((tls_model("global-dynamic"))) _Thread_local int x3; int foo() { x3 = 3; printf("%d %d %d\n", x1, x2, x3); return 0; } EOF cat < extern _Thread_local char x[1024000]; extern _Thread_local char y[1024000]; int main() { x[0] = 3; x[1023] = 5; printf("%d %d %d %d %d %d\n", x[0], x[1], x[1023], y[0], y[1], y[1023]); } EOF $CC -B. -o $t/exe $t/a.o $t/b.o $QEMU $t/exe | grep '^3 0 5 0 0 0$' ================================================ FILE: test/arch-x86_64-tls-ld-mcmodel-large.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < extern _Thread_local int foo; static _Thread_local int bar; int *get_foo_addr() { return &foo; } int *get_bar_addr() { return &bar; } int main() { bar = 5; printf("%d %d %d %d\n", *get_foo_addr(), *get_bar_addr(), foo, bar); return 0; } EOF cat < int get_foo(); extern _Thread_local int bar; int main() { printf("%d %d\n", get_foo(), bar); } EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o $QEMU $t/exe1 | grep '^20 3$' $CC -B. -o $t/exe2 $t/a.o $t/b.o $t/c.o -Wl,-no-relax $QEMU $t/exe2 | grep '^20 3$' $CC -B. -o $t/d.so $t/a.o -shared $CC -B. -o $t/exe3 $t/b.o $t/c.o $t/d.so $QEMU $t/exe3 | grep '^20 3$' ================================================ FILE: test/arch-x86_64-tlsdesc.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc supports_tlsdesc || skip cat < _Thread_local int foo; int get_foo(); int main() { foo = 42; printf("%d\n", get_foo()); } EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o $QEMU $t/exe1 | grep 42 $CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,-no-relax $QEMU $t/exe2 | grep 42 $CC -B. -shared -o $t/c.so $t/a.o $CC -B. -o $t/exe3 $t/b.o $t/c.so $QEMU $t/exe3 | grep 42 $CC -B. -shared -o $t/c.so $t/a.o -Wl,-no-relax $CC -B. -o $t/exe4 $t/b.o $t/c.so -Wl,-no-relax $QEMU $t/exe4 | grep 42 ================================================ FILE: test/arch-x86_64-unique.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <& $t/log grep 'relocation against symbol `main'\'' in read-only section' $t/log grep 'creating a DT_TEXTREL in an output file' $t/log ================================================ FILE: test/arch-x86_64-warn-textrel.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # Skip if libc is musl is_musl && skip cat <<'EOF' | $CC -c -o $t/a.o -x assembler - .globl fn fn: movabs main, %rax ret EOF cat <& $t/log grep 'relocation against symbol `main'\'' in read-only section' $t/log grep 'creating a DT_TEXTREL in an output file' $t/log ================================================ FILE: test/arch-x86_64-z-dynamic-undefined-weak.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc [ "$(uname)" = FreeBSD ] && skip cat < __attribute__((weak)) extern int foo; int main() { printf("%p\n", &foo); } EOF not $CC -B. -o $t/exe3 $t/b.o -no-pie -Wl,-z,dynamic-undefined-weak |& grep 'cannot create a copy relocation for foo' ================================================ FILE: test/arch-x86_64-z-ibt.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc echo endbr64 | $CC -o /dev/null -c -xassembler - 2> /dev/null || skip cat < void hello() { printf("Hello"); } void world() { printf("world"); } EOF $CC -B. 
-o $t/b.so -shared $t/a.o -Wl,-z,ibtplt cat < void hello(); void world(); int main() { hello(); printf(" "); world(); printf("\n"); } EOF $CC -B. -o $t/exe $t/c.o $t/b.so -Wl,-z,ibtplt $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/arch-x86_64-z-rewrite-endbr.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc test_cflags -fcf-protection || skip cat < $t/log1 grep -A1 ':' $t/log1 | grep endbr64 grep -A1 ':' $t/log1 | grep endbr64 grep -A1 '
:' $t/log1 | grep endbr64 $CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,-z,rewrite-endbr $OBJDUMP -dr $t/exe2 > $t/log2 grep -A1 ':' $t/log2 | grep nop grep -A1 ':' $t/log2 | grep nop grep -A1 '
:' $t/log2 | grep endbr64 ================================================ FILE: test/arch-x86_64-z-rewrite-endbr2.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc test_cflags -fcf-protection || skip cat < $t/log1 grep -A1 ':' $t/log1 | grep endbr64 grep -A1 ':' $t/log1 | grep endbr64 grep -A1 '
:' $t/log1 | grep endbr64 $CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,-z,rewrite-endbr $OBJDUMP -dr $t/exe2 > $t/log2 grep -A1 ':' $t/log2 | grep nop grep -A1 ':' $t/log2 | grep nop grep -A1 '
:' $t/log2 | grep endbr64 ================================================ FILE: test/arch-x86_64-z-rewrite-endbr3.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc test_cflags -fcf-protection || skip [ "$QEMU" == '' ] || skip # Check if Intel SDE CPU emulator is available command -v sde64 >& /dev/null || skip cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe $t/a.o -Wl,-z,rewrite-endbr sde64 -cet 1 -- $t/exe | grep 'Hello world' ================================================ FILE: test/arch-x86_64-z-shstk.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc echo endbr64 | $CC -o /dev/null -c -xassembler - 2> /dev/null || skip cat < int fn1(); int fn2() { return 3; } void *ptr = fn2; int main() { printf("%d\n", fn1()); } EOF $CC -B. -pie -o $t/exe $t/a.o $t/b.o $QEMU $t/exe | grep 3 readelf --dynamic $t/exe | grep -F '(TEXTREL)' readelf --dynamic $t/exe | grep '\(FLAGS\).*TEXTREL' ================================================ FILE: test/as-needed-dso.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/log2 grep libbar $t/log2 not grep libfoo $t/log2 ================================================ FILE: test/as-needed-dso2.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/log2 not grep libfoo $t/log2 not grep libbar $t/log2 grep libbaz $t/log2 ================================================ FILE: test/as-needed-weak.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/log1 grep -F 'Shared library: [libfoo.so]' $t/log1 grep -F 'Shared library: [libbar.so]' $t/log1 $CC -B. 
-o $t/exe2 $t/a.o -Wl,-as-needed -L$t -lbar -lfoo readelf --dynamic $t/exe2 > $t/log2 grep -F 'Shared library: [libfoo.so]' $t/log2 not grep -F 'Shared library: [libbar.so]' $t/log2 ================================================ FILE: test/as-needed.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/log grep -F 'Shared library: [libfoo.so]' $t/log grep -F 'Shared library: [libbar.so]' $t/log $CC -B. -o $t/exe $t/a.o -Wl,--as-needed $t/b.so $t/c.so readelf --dynamic $t/exe > $t/log grep -F 'Shared library: [libfoo.so]' $t/log not grep -F 'Shared library: [libbar.so]' $t/log ================================================ FILE: test/audit.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/log grep -F 'Auxiliary library: [foo]' $t/log grep -F 'Auxiliary library: [bar]' $t/log not ./mold -o exe $t/a.o -f bar |& grep 'auxiliary may not be used without -shared' # -fuse-ld is ignored ./mold -o exe $t/a.o -fuse-ld=mold ================================================ FILE: test/bno-symbolic.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # GCC produces buggy code for this test case on s390x. # https://sourceware.org/bugzilla/show_bug.cgi?id=29655 [ $MACHINE = s390x ] && $CC -v |& grep -E '^gcc version 1[0-5]\.' && skip cat < extern int foo; int get_foo(); void *bar(); int main() { foo = 3; printf("%d %d %d\n", foo, get_foo(), bar == bar()); } EOF $CC -B. -no-pie -o $t/exe $t/c.o $t/b.so $QEMU $t/exe | grep '3 3 1' ================================================ FILE: test/bsymbolic-functions.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int foo = 3; int x = 5; int get_foo(); void *bar() { return &x; } int main() { printf("%d %d %d\n", foo, get_foo(), bar == bar()); } EOF $CC -B. 
-no-pie -o $t/exe $t/c.o $t/b.so $QEMU $t/exe | grep '3 3 0' ================================================ FILE: test/bsymbolic-non-weak-functions.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int foo = 3; int bar = 3; int baz = 3; int get_foo1() { return 7; } int get_bar1() { return 7; } int get_baz1() { return 7; } int get_foo2(); int get_bar2(); int get_baz2(); int main() { printf("%d %d %d %d %d %d\n", foo, bar, baz, get_foo2(), get_bar2(), get_baz2()); } EOF $CC -B. -o $t/exe $t/c.o $t/b.so $QEMU $t/exe | grep '^3 3 3 3 3 7$' ================================================ FILE: test/bsymbolic-non-weak.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int foo = 3; int bar = 3; int baz = 3; int get_foo1() { return 7; } int get_bar1() { return 7; } int get_baz1() { return 7; } int get_foo2(); int get_bar2(); int get_baz2(); int main() { printf("%d %d %d %d %d %d\n", foo, bar, baz, get_foo2(), get_bar2(), get_baz2()); } EOF $CC -B. -o $t/exe $t/c.o $t/b.so $QEMU $t/exe | grep '^3 3 3 3 4 7$' ================================================ FILE: test/bsymbolic.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int foo = 3; int get_foo(); int main() { printf("%d %d\n", foo, get_foo()); } EOF $CC -B. -no-pie -o $t/exe $t/c.o $t/b.so $QEMU $t/exe | grep '3 4' ================================================ FILE: test/build-id.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc echo 'int main() { return 0; }' | $CC -c -o $t/a.o -xc - $CC -B. -o $t/exe1 $t/a.o -Wl,-build-id readelf -n $t/exe1 | grep 'GNU.*0x00000014.*NT_GNU_BUILD_ID' $CC -B. -o $t/exe2 $t/a.o -Wl,-build-id=uuid readelf -nW $t/exe2 | grep -E 'Build ID: ............4...[89abcdef]' $CC -B. -o $t/exe3 $t/a.o -Wl,-build-id=md5 readelf -n $t/exe3 | grep 'GNU.*0x00000010.*NT_GNU_BUILD_ID' $CC -B. 
-o $t/exe4 $t/a.o -Wl,-build-id=sha1 readelf -n $t/exe4 | grep 'GNU.*0x00000014.*NT_GNU_BUILD_ID' $CC -B. -o $t/exe5 $t/a.o -Wl,-build-id=sha256 readelf -n $t/exe5 | grep 'GNU.*0x00000020.*NT_GNU_BUILD_ID' $CC -B. -o $t/exe6 $t/a.o -Wl,-build-id=fast readelf -n $t/exe6 | grep 'GNU.*0x00000020.*NT_GNU_BUILD_ID' $CC -B. -o $t/exe7 $t/a.o -Wl,-build-id=0xdeadbeefdeadbeef readelf -n $t/exe7 | grep 'Build ID: deadbeefdeadbeef' ================================================ FILE: test/canonical-plt.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # GCC produces buggy code for this test case on s390x. # https://sourceware.org/bugzilla/show_bug.cgi?id=29655 [ $MACHINE = s390x ] && $CC -v |& grep -E '^gcc version 1[0-5]\.' && skip cat < void *foo(); void *bar(); void *baz(); int main() { printf("%d %d %d\n", foo == foo(), bar == bar(), bar == baz()); } EOF $CC -B. -no-pie -o $t/exe $t/a.so $t/b.o $t/c.o $QEMU $t/exe | grep '^1 1 1$' ================================================ FILE: test/cmdline.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc not ./mold -zfoo |& grep 'unknown command line option: -zfoo' not ./mold -z foo |& grep 'unknown command line option: -z foo' not ./mold -abcdefg |& grep 'unknown command line option: -abcdefg' not ./mold --abcdefg |& grep 'unknown command line option: --abcdefg' ================================================ FILE: test/color-diagnostics.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int foo; int bar; extern int baz; __attribute__((weak)) int two(); int main() { printf("%d %d %d %d\n", foo, bar, baz, two ? two() : -1); } EOF cat < int bar; int main() { printf("%d\n", bar); } EOF cat < int foo; int bar = 5; int baz; int main() { printf("%d %d %d\n", foo, bar, baz); } EOF $CC -B. 
-o $t/exe $t/a.o $t/b.o $QEMU $t/exe | grep '0 5 42' readelf --sections $t/exe > $t/log grep '.common .*NOBITS' $t/log ================================================ FILE: test/common.inc ================================================
# -*- mode: sh -*-

# Shared prologue sourced by every test script (`. $(dirname $0)/common.inc`).
# It selects the toolchain and emulator, creates the per-test scratch
# directory $t, defines helper predicates, and installs the ERR/EXIT traps
# that print the test result.

# Make sure all commands print out messages in English
export LC_ALL=C

# Print the machine name used for $MACHINE, given a name as reported by
# `uname -m` or the first component of a target triple. Names without a
# special case are printed unchanged.
canonical_name() {
  case $1 in
  i?86) echo i686 ;;
  amd64) echo x86_64 ;;
  armeb*) echo armeb ;;
  arm*) echo arm ;;
  powerpc) echo ppc ;;
  powerpc64) echo ppc64 ;;
  powerpc64le) echo ppc64le ;;
  *) echo "$1" ;;
  esac
}

# MACHINE may be preset by the caller; otherwise derive it from the host.
if [ -z "$MACHINE" ]; then
  MACHINE=$(canonical_name $(uname -m))
fi

# Set tool names
if [ -z "$TRIPLE" ]; then
  # Native testing: use the host tools and no emulator.
  TESTDIR=out/test/$MACHINE
  CC="${TEST_CC:-cc}"
  CXX="${TEST_CXX:-c++}"
  GCC="${TEST_GCC:-gcc}"
  GXX="${TEST_GXX:-g++}"
  OBJDUMP=objdump
  OBJCOPY=objcopy
  STRIP=strip
  QEMU=
elif [ "$TRIPLE" = powerpc64le-linux-gnu -a "$CPU" = power10 ]; then
  # POWER10 variant of ppc64le: same triple, but with -mcpu=power10 added to
  # the compilers and -cpu power10 to qemu, and a separate output directory.
  MACHINE=ppc64le
  TESTDIR=out/test/ppc64le-power10
  CC="${TEST_CC:-$TRIPLE-gcc} -mcpu=power10"
  CXX="${TEST_CXX:-$TRIPLE-g++} -mcpu=power10"
  GCC="${TEST_GCC:-$TRIPLE-gcc} -mcpu=power10"
  GXX="${TEST_GXX:-$TRIPLE-g++} -mcpu=power10"
  OBJDUMP="$TRIPLE-objdump"
  OBJCOPY="$TRIPLE-objcopy"
  STRIP="$TRIPLE-strip"
  QEMU="qemu-ppc64le -L /usr/$TRIPLE -cpu power10"
else
  # Cross testing: MACHINE is the canonicalized first component of $TRIPLE,
  # and the tools are the $TRIPLE-prefixed ones.
  MACHINE=$(canonical_name $(echo $TRIPLE | sed 's/-.*//'))
  TESTDIR=out/test/$MACHINE
  CC="${TEST_CC:-$TRIPLE-gcc}"
  CXX="${TEST_CXX:-$TRIPLE-g++}"
  GCC="${TEST_GCC:-$TRIPLE-gcc}"
  GXX="${TEST_GXX:-$TRIPLE-g++}"
  OBJDUMP="$TRIPLE-objdump"
  OBJCOPY="$TRIPLE-objcopy"
  STRIP="$TRIPLE-strip"

  # The qemu user-mode binary name differs from $MACHINE for these targets.
  case $MACHINE in
  i686) QEMU="qemu-i386 -L /usr/$TRIPLE" ;;
  sh4aeb) QEMU="qemu-sh4eb -L /usr/$TRIPLE" ;;
  *) QEMU="qemu-$MACHINE -L /usr/$TRIPLE" ;;
  esac
fi

# Each test gets its own scratch directory named after the script.
testname=$(basename "$0" .sh)
t=$TESTDIR/$testname
mkdir -p $t

# Compiler option that selects TLSDESC for this target. It is left unset for
# other targets, which makes supports_tlsdesc fail.
case $MACHINE in
x86_64 | i686 | arm | armeb)
  tlsdesc_opt=-mtls-dialect=gnu2 ;;
aarch64 | riscv* | loongarch*)
  tlsdesc_opt=-mtls-dialect=desc ;;
esac

# We want to use GNU's binutils even on BSDs. `pkg install binutils`
# installs GNU binutils under /usr/local/bin.
if [ "$(uname)" = FreeBSD ]; then
  export PATH="/usr/local/bin:$PATH"
fi

# Common functions

# Run the given command and invert its exit status.
not() {
  if "$@"; then return 1; else return 0; fi
}

# Succeed if $CC (with -B.) can build a trivial C program with the given
# extra flags. All compiler output is discarded.
test_cflags() {
  echo 'int main() {}' | $CC -B. "$@" -o /dev/null -xc - >& /dev/null
}

# Same as test_cflags, but for $CXX and a C++ translation unit.
test_cxxflags() {
  echo 'int main() {}' | $CXX -B. "$@" -o /dev/null -xc++ - >& /dev/null
}

# Succeed if the output of `ldd --version` mentions musl. The `true` keeps
# the group's status zero even if ldd itself exits non-zero.
is_musl() {
  { ldd --version; true; } |& grep musl > /dev/null
}

# Succeed if libc is not musl and $CC accepts the ifunc attribute.
supports_ifunc() {
  ! is_musl && echo 'void x() __attribute__((ifunc("y"))); void *y() { return 0; }' | $CC -c -o /dev/null -xc - >& /dev/null
}

# Succeed if a program using a TLSDESC-compiled DSO can be built and run.
supports_tlsdesc() {
  # musl's tlsdesc on arm32 seems to be broken
  [[ $MACHINE = arm* ]] && is_musl && return 1

  # FreeBSD's loader doesn't seem to support TLSDESC relocs in an executable
  [ "$(uname)" = FreeBSD ] && return 1

  [ "$tlsdesc_opt" = '' ] && return 1

  # TLSDESC may not be supported on old systems. Compile a DSO with
  # it to see if it is actually supported.
  echo '_Thread_local int x; int y() { return x; }' |
    $CC -shared -fPIC -xc -o $t/tlsdesc.so $tlsdesc_opt - 2> /dev/null ||
    return 1

  # Bail out if the probe program cannot be compiled or linked; otherwise a
  # stale $t/tlsdesc left over from a previous run could be executed below
  # and report a false positive.
  echo 'int y(); int main() { y(); }' | $CC -xc -c -o $t/tlsdesc.o - ||
    return 1
  $CC -o $t/tlsdesc $t/tlsdesc.o $t/tlsdesc.so || return 1

  # The function's status is whether the probe program runs successfully.
  $QEMU $t/tlsdesc 2> /dev/null
}

# Succeed if an emulator command is configured in $QEMU, or if
# /proc/cpuinfo contains the word "qemu".
on_qemu() {
  [ "$QEMU" != '' ] || grep -w qemu /proc/cpuinfo >& /dev/null
}

# Report the test as skipped and exit successfully. The EXIT trap is
# cleared first so that on_exit does not also print "OK".
skip() {
  echo skipped
  trap - EXIT
  exit 0
}

# ERR trap handler. $1 is the line number passed by the trap. Prints the
# failing command and exits with that command's status; the EXIT trap is
# cleared so that on_exit does not print "OK".
on_error() {
  code=$?
  echo "command failed: $1: $BASH_COMMAND"
  trap - EXIT
  exit $code
}

# EXIT trap handler: reached only when neither skip nor on_error ran.
on_exit() {
  echo OK
  exit 0
}

trap 'on_error $LINENO' ERR
trap on_exit EXIT

# Print out the startup message
echo -n "Testing $testname ... "

# From here on, a failure anywhere in a pipeline counts as a failure, and
# every command is traced.
set -o pipefail
set -x
================================================ FILE: test/compress-debug-sections-zstd.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # arm-linux-gnueabihf-objcopy crashes on x86-64 [[ $MACHINE = arm* ]] && skip [ $MACHINE = riscv32 ] && skip command -v zstdcat >& /dev/null || skip cat < int main() { printf("Hello world\n"); return 0; } EOF $CC -B. 
-o $t/exe $t/a.o -Wl,--compress-debug-sections=zstd $OBJCOPY --dump-section .debug_info=$t/debug_info $t/exe dd if=$t/debug_info of=$t/debug_info.zstd bs=24 skip=1 status=none zstdcat $t/debug_info.zstd > /dev/null ================================================ FILE: test/compress-debug-sections.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe $t/a.o -Wl,--compress-debug-sections=zlib readelf -WS $t/exe > $t/log grep '\.debug_info .* [Cx] ' $t/log grep '\.debug_str .* MS[Cx] ' $t/log ================================================ FILE: test/compressed-debug-info.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc command -v dwarfdump >& /dev/null || skip cat < /dev/null readelf --sections $t/exe | grep -F .debug_info ================================================ FILE: test/copyrel-alignment.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc [ $MACHINE = ppc64 ] && skip [ $MACHINE = ppc64le ] && skip [[ $MACHINE = loongarch* ]] && skip cat < extern int foo; int main() { printf("%d %p\n", foo, &foo); } EOF $CC -B. -o $t/exe1 $t/d.o $t/a.so -no-pie $QEMU $t/exe1 > /dev/null readelf -W --sections $t/exe1 | grep '\.copyrel.* 32$' $CC -B. -o $t/exe2 $t/d.o $t/b.so -no-pie $QEMU $t/exe2 > /dev/null readelf -W --sections $t/exe2 | grep '\.copyrel.* 8$' $CC -B. -o $t/exe3 $t/d.o $t/c.so -no-pie $QEMU $t/exe3 > /dev/null readelf -W --sections $t/exe3 | grep '\.copyrel.* 256$' ================================================ FILE: test/copyrel-norelro.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc [[ $MACHINE = ppc64* ]] && skip [[ $MACHINE = loongarch* ]] && skip cat < extern char msg[100]; int main() { printf("%s\n", msg); } EOF cat < #include #include extern const char readonly[100]; extern char readwrite[100]; static int segv = 0; static jmp_buf buf; void handler(int sig) { segv = 1; longjmp(buf, 1); } int main() { signal(SIGSEGV, handler); readwrite[0] = 5; int x = segv; if (setjmp(buf) == 0) *(char *)readonly = 5; int y = segv; printf("sigsegv %d %d\n", x, y); } EOF cat < #include #include extern char readonly[100]; extern char readwrite[100]; static int segv = 0; static jmp_buf buf; void handler(int sig) { segv = 1; longjmp(buf, 1); } int main() { signal(SIGSEGV, handler); readwrite[0] = 5; int x = segv; if (setjmp(buf) == 0) *(char *)readonly = 5; int y = segv; printf("sigsegv %d %d\n", x, y); } EOF cat < extern int foo; extern int *get_bar(); int main() { printf("%d %d %d\n", foo, *get_bar(), &foo == get_bar()); return 0; } EOF cat < int main() { printf("Hello world\n"); } EOF clang -B. -o $t/exe $t/a.o $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/ctors-in-init-array.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat < static void ctor1() { printf("ctor1 "); } static void ctor2() { printf("ctor2 "); } static void ctor3() { printf("ctor3 "); } static void ctor4() { printf("ctor4 "); } static void dtor1() { printf("dtor1 "); } static void dtor2() { printf("dtor2 "); } static void dtor3() { printf("dtor3 "); } static void dtor4() { printf("dtor4 "); } __attribute__((aligned(sizeof(void *)), section(".ctors.65435"))) void (*ctors65435[])() = { ctor1 }; __attribute__((aligned(sizeof(void *)), section(".ctors.65433"))) void (*ctors65433[])() = { ctor2 }; __attribute__((aligned(sizeof(void *)), section(".ctors"))) void (*ctors[])() = { ctor4, ctor3 }; __attribute__((aligned(sizeof(void *)), section(".dtors"))) void (*dtors[])() = { dtor1, dtor2 }; __attribute__((aligned(sizeof(void *)), section(".dtors.65433"))) void (*dtors65433[])() = { dtor3 }; __attribute__((aligned(sizeof(void *)), section(".dtors.65435"))) void (*dtors65435[])() = { dtor4 }; EOF cat < __attribute__((constructor(101))) static void init1() { printf("init1 "); } __attribute__((constructor)) static void init2() { printf("init2 "); } __attribute__((destructor(101))) static void fini1() { printf("fini1 "); } __attribute__((destructor)) static void fini2() { printf("fini2 "); } int main() {} EOF $CC -B. -o $t/exe $t/a.o $t/b.o $QEMU $t/exe | grep 'ctor1 init1 ctor2 ctor3 ctor4 init2 fini2 dtor1 dtor2 dtor3 fini1 dtor4' ================================================ FILE: test/dead-debug-sections.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc command -v dwarfdump >& /dev/null || skip cat < const char *msg; int main() { printf("%s\n", msg); } EOF $CXX -o $t/exe $t/a.o $t/b.o $t/c.o -g $QEMU $t/exe | grep 'Hello world' dwarfdump $t/exe > /dev/null ================================================ FILE: test/debug-macro-section.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat < $t/a.h #define A 23 #define B 99 EOF cat < $t/b.ver $CC -B. -o $t/c.so -shared $t/a.o -Wl,--default-symver -Wl,--version-script=$t/b.ver readelf --dyn-syms $t/c.so | grep -F ' foo@@c.so' ================================================ FILE: test/default-symver.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < extern void live_func(); void dead_func() { printf("OK\n"); } int main() { live_func(); } EOF $CC -B. -flto -o $t/exe $t/a.o -Wl,-defsym,live_func=dead_func $QEMU $t/exe | grep "^OK$" ================================================ FILE: test/defsym-missing-symbol.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < extern char foo; extern char bar; void baz(); void print() { printf("Hello %p %p\n", &foo, &bar); } int main() { baz(); } EOF $CC -B. -o $t/exe $t/a.o -pie -Wl,-defsym=foo=16 \ -Wl,-defsym=bar=0x2000 -Wl,-defsym=baz=print $QEMU $t/exe | grep '^Hello 0x10 0x2000$' ================================================ FILE: test/defsym2.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < as core::iter::iterator::Iterator>::rposition::::{closure#0}' ================================================ FILE: test/demangle.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < unsigned la_version(unsigned v) { fprintf(stderr, "version=%d\n", v); return 0; } void foo() {} EOF cat <<'EOF' | $CC -B. -shared -o $t/b.so -xc - -fPIC -Wl,--audit=$t/a.so void foo(); void bar() { foo(); } EOF cat <<'EOF' | $CC -c -o $t/c.o -xc - void bar(); int main() { bar(); } EOF $CC -B. -o $t/exe1 $t/c.o $t/b.so -Wl,--allow-shlib-undefined readelf --dynamic $t/exe1 | grep 'Dependency audit library:..*/a.so' $CC -B. 
-o $t/exe2 $t/c.o $t/b.so -Wl,--depaudit=foo -Wl,--allow-shlib-undefined readelf --dynamic $t/exe2 | grep 'Dependency audit library:..*foo:.*/a.so' ================================================ FILE: test/dependency-file-lto.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc test_cflags -flto -fno-fat-lto-objects || skip cat < int main() { printf("Hello world\n"); } EOF $CC -B. -flto -o $t/exe $t/a.o -Wl,-dependency-file=$t/dep grep '/exe:.*/a.o ' $t/dep grep '/a.o:$' $t/dep not grep '^/tmp' $t/dep ================================================ FILE: test/dependency-file-response-file.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF echo "$t/a.o -Wl,-dependency-file=$t/dep" > $t/rsp $CC -B. -o $t/exe @$t/rsp grep '/exe:.*/a.o ' $t/dep grep '/a.o:$' $t/dep not grep '^/tmp' $t/dep ================================================ FILE: test/dependency-file.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe $t/a.o -Wl,-dependency-file=$t/dep grep "dependency-file/exe:.*/a.o " $t/dep grep ".*/a.o:$" $t/dep ================================================ FILE: test/disable-new-dtags.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat < $t/log grep -F _start $t/log grep -F foo $t/log grep -F .Lbar $t/log ./mold -o $t/exe $t/a.o --discard-locals readelf --symbols $t/exe > $t/log grep -F _start $t/log grep -F foo $t/log not grep -F .Lbar $t/log ./mold -o $t/exe $t/a.o --discard-all readelf --symbols $t/exe > $t/log grep -F _start $t/log not grep -F foo $t/log not grep -F .Lbar $t/log ./mold -o $t/exe $t/a.o --strip-all readelf --symbols $t/exe > $t/log not grep -F _start $t/log not grep -F foo $t/log not grep -F .Lbar $t/log ================================================ FILE: test/dso-undef.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < void init() { printf("init\n"); } void fini() { printf("fini\n"); } void keep() {} EOF $CC -B. -o $t/c.so -shared $t/b.o $CC -B. -o $t/d.so -shared $t/b.o -Wl,-init,init -Wl,-fini,fini $CC -B. -o $t/exe1 $t/a.o $t/c.so $CC -B. -o $t/exe2 $t/a.o $t/d.so $QEMU $t/exe1 > $t/log1 $QEMU $t/exe2 > $t/log2 not grep init $t/log1 not grep fini $t/log1 grep init $t/log2 grep fini $t/log2 ================================================ FILE: test/dt-needed.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/log grep 'duplicate symbol:.* foo' $t/log ================================================ FILE: test/duplicate-error-gc-sections.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc nm mold | grep '__tsan_init' && skip cat < $t/log grep -F '(DEBUG)' $t/log cat < $t/log grep -w foo $t/log not grep -w bar $t/log ================================================ FILE: test/dynamic-list.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/log not grep ' foo' $t/log not grep ' bar' $t/log cat < $t/dyn { foo; bar; }; EOF $CC -B. -o $t/exe1 $t/a.o -Wl,-dynamic-list=$t/dyn readelf --dyn-syms $t/exe1 > $t/log1 grep ' foo' $t/log1 grep ' bar' $t/log1 $CC -B. 
-o $t/exe2 $t/a.o -Wl,--export-dynamic-symbol-list=$t/dyn readelf --dyn-syms $t/exe2 > $t/log2 grep ' foo' $t/log2 grep ' bar' $t/log2 $CC -B. -o $t/exe3 $t/a.o -Wl,--export-dynamic-symbol=foo,--export-dynamic-symbol=bar readelf --dyn-syms $t/exe3 > $t/log3 grep ' foo' $t/log3 grep ' bar' $t/log3 ================================================ FILE: test/dynamic-list2.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/log not grep ' foo' $t/log not grep ' bar' $t/log cat < $t/dyn { foo; extern "C++" { "baz(int)"; }; }; EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o -Wl,-dynamic-list=$t/dyn readelf --dyn-syms $t/exe1 > $t/log1 grep ' foo' $t/log1 not grep ' bar' $t/log1 grep ' _Z3bazi' $t/log1 $CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,--export-dynamic-symbol-list=$t/dyn readelf --dyn-syms $t/exe2 > $t/log2 grep ' foo' $t/log2 not grep ' bar' $t/log2 grep ' _Z3bazi' $t/log2 ================================================ FILE: test/dynamic-list3.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <<'EOF' > $t/dyn { xyz; foo*bar*[abc]x; }; EOF cat < $t/log1 grep ' xyz' $t/log1 not grep ' foobarzx' $t/log1 grep ' foobarcx' $t/log1 grep ' foo123bar456bx' $t/log1 not grep ' foo123bar456c' $t/log1 not grep ' foo123bar456x' $t/log1 $CC -B. -Wl,--export-dynamic-symbol-list=$t/dyn -o $t/exe2 $t/b.o readelf --dyn-syms $t/exe2 > $t/log2 grep ' xyz' $t/log2 not grep ' foobarzx' $t/log2 grep ' foobarcx' $t/log2 grep ' foo123bar456bx' $t/log2 not grep ' foo123bar456c' $t/log2 not grep ' foo123bar456x' $t/log2 ================================================ FILE: test/dynamic-list4.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < void foo() { printf("foo1 "); } void bar() { printf("bar1 "); } void baz() { printf("baz1 "); } void print() { foo(); bar(); baz(); printf("\n"); } EOF cat < $t/dyn { foo; bar; }; EOF $CC -B. 
-shared -o $t/b.so $t/a.o -Wl,--dynamic-list=$t/dyn cat < void foo() { printf("foo2 "); } void bar() { printf("bar2 "); } void baz() { printf("baz2 "); } EOF $CC -B. -shared -o $t/d.so $t/c.o cat < void print(); int main() { print(); } EOF $CC -B. -o $t/exe1 $t/e.o -Wl,-push-state,-no-as-needed $t/b.so -Wl,-pop-state $QEMU $t/exe1 | grep 'foo1 bar1 baz1' $CC -B. -o $t/exe2 $t/e.o -Wl,-push-state,-no-as-needed $t/d.so $t/b.so -Wl,-pop-state $QEMU $t/exe2 | grep 'foo2 bar2 baz1' ================================================ FILE: test/dynamic.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc echo '.globl main; main:' | $CC -o $t/a.o -c -x assembler - $CC -B. -o $t/exe $t/a.o readelf --dynamic $t/exe | grep -E 'Shared library:.*\blibc\b' readelf -W --dyn-syms --use-dynamic $t/exe | grep -E 'FUNC\s+GLOBAL\s+DEFAULT.*UND\s+__libc_start' ================================================ FILE: test/emit-relocs-cpp.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc [[ $MACHINE = arm* ]] && skip cat < int main() { printf("Hello world\n"); } EOF $CXX -B. -o $t/exe $t/a.o -Wl,-emit-relocs $QEMU $t/exe | grep 'Hello world' readelf -SW $t/exe | grep -E 'rela?\.text' readelf -SW $t/exe | grep -E 'rela?\.eh_frame' ================================================ FILE: test/emit-relocs-dead-sections.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc [ $MACHINE = m68k ] && skip [ $MACHINE = sh4 ] && skip cat < struct Foo { Foo() { std::cout << "Hello world\n"; } }; Foo x; EOF cat < struct Foo { Foo() { std::cout << "Hello world\n"; } }; Foo y; int main() {} EOF $CXX -B. -o $t/exe $t/a.o $t/b.o -Wl,-emit-relocs $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/emit-relocs.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { puts("Hello world"); } EOF $CC -B. 
-o $t/exe $t/a.o -Wl,-emit-relocs $QEMU $t/exe | grep 'Hello world' readelf -S $t/exe | grep -E 'rela?\.text' ================================================ FILE: test/empty-dso.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF rm -f $t/b.script touch $t/b.script $CC -B. -o $t/exe $t/a.o -Wl,--version-script,$t/b.script $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/empty-input.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc rm -f $t/a.o touch $t/a.o not $CC -B. -o $t/exe $t/a.o &> $t/log grep 'unknown file type' $t/log ================================================ FILE: test/empty-version.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/log grep "Entry point address:.*0x1000$" $t/log $CC -B. -o $t/exe2 -Wl,-e,bar $t/a.o $t/b.o readelf -e $t/exe2 > $t/log grep "Entry point address:.*0x2000$" $t/log ================================================ FILE: test/exception-multiple-ehframe.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc [ $MACHINE = sh4aeb ] && skip nm mold | grep '__tsan_init' && skip command -v perl > /dev/null || skip [ $MACHINE = sh4 ] && skip cat < int foo(); int bar(); int main() { printf("%d %d\n", foo(), bar()); } EOF $CXX -B. -o $t/exe1 $t/d.o $t/c.o $QEMU $t/exe1 | grep '^1 3$' ================================================ FILE: test/exception.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc static= test_cxxflags -static && static=-static # I don't know why, but we need -pthread on m68k static="$static -pthread" cat < $t/a.cc int main() { try { throw 0; } catch (int x) { return x; } return 1; } EOF $CXX -c -o $t/b.o $t/a.cc -fPIC $CXX -c -o $t/c.o $t/a.cc -fno-PIC $CXX -B. -o $t/exe1 $t/b.o $static $QEMU $t/exe1 $CXX -B. 
-o $t/exe2 $t/c.o -no-pie $static $QEMU $t/exe2 $CXX -B. -o $t/exe3 $t/b.o -pie $QEMU $t/exe3 $CXX -B. -o $t/exe4 $t/c.o -no-pie $QEMU $t/exe4 $CXX -B. -o $t/exe5 $t/b.o -pie -Wl,--gc-sections $QEMU $t/exe5 $CXX -B. -o $t/exe6 $t/c.o -no-pie $static -Wl,--gc-sections $QEMU $t/exe6 if [ $MACHINE = x86_64 ]; then $CXX -c -o $t/d.o $t/a.cc -mcmodel=large -fPIC $CXX -B. -o $t/exe7 $t/d.o $static $QEMU $t/exe7 $CXX -B. -o $t/exe8 $t/d.o -pie $QEMU $t/exe8 fi if [ $MACHINE = x86_64 -o $MACHINE = aarch64 ]; then $CXX -c -o $t/e.o $t/a.cc -mcmodel=large -fno-PIC $CXX -B. -o $t/exe9 $t/e.o -no-pie $static $QEMU $t/exe9 $CXX -B. -o $t/exe10 $t/e.o -no-pie $QEMU $t/exe10 fi # riscv64-linux-gnu-strip crashes for some reason if [ $MACHINE != riscv32 ]; then $CXX -B. -o $t/exe11 $t/b.o -pie $STRIP $t/exe11 $QEMU $t/exe11 $CXX -B. -o $t/exe12 $t/c.o -no-pie $STRIP $t/exe12 $QEMU $t/exe12 fi ================================================ FILE: test/exclude-libs.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/log grep -F foo $t/log grep -F bar $t/log grep -F baz $t/log $CC -B. -shared -o $t/f.so $t/e.o $t/c.a $t/d.a -Wl,-exclude-libs=c.a readelf --dyn-syms $t/f.so > $t/log not grep -F foo $t/log grep -F bar $t/log grep -F baz $t/log $CC -B. -shared -o $t/f.so $t/e.o $t/c.a $t/d.a -Wl,-exclude-libs=c.a -Wl,-exclude-libs=d.a readelf --dyn-syms $t/f.so > $t/log not grep -F foo $t/log not grep -F bar $t/log grep -F baz $t/log $CC -B. -shared -o $t/f.so $t/e.o $t/c.a $t/d.a -Wl,-exclude-libs=c.a:d.a readelf --dyn-syms $t/f.so > $t/log not grep -F foo $t/log not grep -F bar $t/log grep -F baz $t/log $CC -B. -shared -o $t/f.so $t/e.o $t/c.a $t/d.a -Wl,-exclude-libs=ALL readelf --dyn-syms $t/f.so > $t/log not grep -F foo $t/log not grep -F bar $t/log grep -F baz $t/log ================================================ FILE: test/exclude-libs2.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe $t/a.o -Wl,--execute-only $QEMU $t/exe | grep 'Hello world' readelf -W --segments $t/exe | grep -E 'LOAD\s.*[0-9a-f] E 0x' ================================================ FILE: test/export-dynamic.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/log grep -E 'NOTYPE\s+GLOBAL DEFAULT\s+[0-9]+ bar' $t/log grep -E 'NOTYPE\s+GLOBAL DEFAULT\s+[0-9]+ _start' $t/log ================================================ FILE: test/export-from-exe.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < /dev/null not $CC -B. -o $t/exe $t/a.o $t/b.o -Wl,-warn-common -Wl,-fatal-warnings 2> /dev/null ================================================ FILE: test/filler.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc test_cflags -static || skip cat < __attribute__((aligned(512))) char hello[] = "Hello"; __attribute__((aligned(512))) char world[] = "world"; int main() { printf("%s %s\n", hello, world); } EOF $CC -B. -static -Wl,--filler,0xfe -o $t/exe1 $t/a.o sed -i -e 's/--filler 0xfe/--filler 0x00/' $t/exe1 od -x $t/exe1 > $t/txt1 $CC -B. -static -Wl,--filler,0x00 -o $t/exe2 $t/a.o od -x $t/exe2 > $t/txt2 diff -q $t/txt1 $t/txt2 ================================================ FILE: test/filter.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/log grep -F 'Filter library: [foo]' $t/log grep -F 'Filter library: [bar]' $t/log ================================================ FILE: test/func-addr.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < typedef void Func(); void fn(); Func *const ptr = fn; int main() { printf("%d\n", fn == ptr); } EOF $CC -B. 
-o $t/exe -no-pie $t/b.o $t/a.so $QEMU $t/exe | grep 1 ================================================ FILE: test/gc-sections.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int two() { return 2; } int live_var1 = 1; int live_var2 = two(); int dead_var1 = 3; int dead_var2 = 4; void live_fn1() {} void live_fn2() { live_fn1(); } void dead_fn1() {} void dead_fn2() { dead_fn1(); } int main() { printf("%d %d\n", live_var1, live_var2); live_fn2(); } EOF $CXX -B. -o $t/exe1 $t/a.o readelf --symbols $t/exe1 > $t/log1 $QEMU $t/exe1 | grep '1 2' grep live_fn1 $t/log1 grep live_fn2 $t/log1 grep dead_fn1 $t/log1 grep dead_fn2 $t/log1 grep live_var1 $t/log1 grep live_var2 $t/log1 grep dead_var1 $t/log1 grep dead_var2 $t/log1 $CXX -B. -o $t/exe2 $t/a.o -Wl,-gc-sections readelf --symbols $t/exe2 > $t/log2 $QEMU $t/exe2 | grep '1 2' grep live_fn1 $t/log2 grep live_fn2 $t/log2 not grep dead_fn1 $t/log2 not grep dead_fn2 $t/log2 grep live_var1 $t/log2 grep live_var2 $t/log2 not grep dead_var1 $t/log2 not grep dead_var2 $t/log2 ================================================ FILE: test/gdb-index-compress-output.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc on_qemu && skip [ $MACHINE = riscv64 -o $MACHINE = riscv32 -o $MACHINE = sparc64 ] && skip command -v gdb >& /dev/null || skip cat < void trap() {} static void hello() { printf("Hello world\n"); trap(); } void greet() { hello(); } EOF $CC -B. 
-shared -o $t/b.so $t/a.o -Wl,--gdb-index -Wl,--compress-debug-sections=zlib-gabi readelf -WS $t/b.so 2> /dev/null | grep -F .gdb_index cat < /dev/null | grep -F .gdb_index $QEMU $t/exe | grep 'Hello world' DEBUGINFOD_URLS= gdb $t/exe -nx -batch -ex 'b main' -ex r -ex 'b trap' \ -ex c -ex bt -ex quit >& $t/log grep 'hello () at .*:7' $t/log grep 'greet () at .*:11' $t/log grep 'main () at .*:4' $t/log ================================================ FILE: test/gdb-index-dwarf2.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc on_qemu && skip [ $MACHINE = riscv64 -o $MACHINE = riscv32 -o $MACHINE = sparc64 ] && skip command -v gdb >& /dev/null || skip test_cflags -gdwarf-2 -g || skip cat < void trap() {} void hello2() { printf("Hello world\n"); trap(); } EOF $CC -B. -shared -o $t/c.so $t/a.o $t/b.o -Wl,--gdb-index readelf -WS $t/c.so 2> /dev/null | grep -F .gdb_index cat < /dev/null | grep -F .gdb_index $QEMU $t/exe | grep 'Hello world' DEBUGINFOD_URLS= gdb $t/exe -nx -batch -ex 'b main' -ex r -ex 'b trap' \ -ex c -ex bt -ex quit >& $t/log grep 'hello2 () at .*:7' $t/log grep 'hello () at .*:4' $t/log grep 'greet () at .*:8' $t/log grep 'main () at .*:4' $t/log ================================================ FILE: test/gdb-index-dwarf3.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc on_qemu && skip [ $MACHINE = riscv64 -o $MACHINE = riscv32 -o $MACHINE = sparc64 ] && skip command -v gdb >& /dev/null || skip test_cflags -gdwarf-3 || skip cat < void trap() {} void hello2() { printf("Hello world\n"); trap(); } EOF $CC -B. 
-shared -o $t/c.so $t/a.o $t/b.o -Wl,--gdb-index readelf -WS $t/c.so 2> /dev/null | grep -F .gdb_index cat < /dev/null | grep -F .gdb_index $QEMU $t/exe | grep 'Hello world' DEBUGINFOD_URLS= gdb $t/exe -nx -batch -ex 'b main' -ex r -ex 'b trap' \ -ex c -ex bt -ex quit >& $t/log grep 'hello2 () at .*:7' $t/log grep 'hello () at .*:4' $t/log grep 'greet () at .*:8' $t/log grep 'main () at .*:4' $t/log ================================================ FILE: test/gdb-index-dwarf4.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc on_qemu && skip [ $MACHINE = riscv64 -o $MACHINE = riscv32 -o $MACHINE = sparc64 ] && skip command -v gdb >& /dev/null || skip test_cflags -gdwarf-4 -g || skip cat < void trap() {} void hello2() { printf("Hello world\n"); trap(); } EOF $CC -B. -shared -o $t/c.so $t/a.o $t/b.o -Wl,--gdb-index readelf -WS $t/c.so 2> /dev/null | grep -F .gdb_index cat < /dev/null | grep -F .gdb_index $QEMU $t/exe | grep 'Hello world' DEBUGINFOD_URLS= gdb $t/exe -nx -batch -ex 'b main' -ex r -ex 'b trap' \ -ex c -ex bt -ex quit >& $t/log grep 'hello2 () at .*:7' $t/log grep 'hello () at .*:4' $t/log grep 'greet () at .*:8' $t/log grep 'main () at .*:4' $t/log ================================================ FILE: test/gdb-index-dwarf5.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc on_qemu && skip [ $MACHINE = riscv64 -o $MACHINE = riscv32 -o $MACHINE = sparc64 ] && skip command -v gdb >& /dev/null || skip test_cflags -gdwarf-5 -g || skip cat < $t/a.c void fn3(); static void fn2() { fn3(); } void fn1() { fn2(); } EOF cat < $t/b.c void fn5(); static void fn4() { fn5(); } void fn3() { fn4(); } EOF cat < $t/c.c void fn7(); static void fn6() { fn7(); } void fn5() { fn6(); } EOF cat < $t/d.c #include void trap() {} static void fn8() { printf("Hello world\n"); trap(); } void fn7() { fn8(); } EOF $CC -c -o $t/a.o $t/a.c -fPIC -g -ggnu-pubnames -gdwarf-5 -ffunction-sections $CC -c -o $t/b.o $t/b.c -fPIC -g -ggnu-pubnames -gdwarf-4 -ffunction-sections $CC -c -o $t/c.o $t/c.c -fPIC -g -ggnu-pubnames -gdwarf-5 $CC -c -o $t/d.o $t/d.c -fPIC -g -ggnu-pubnames -gdwarf-5 -ffunction-sections $CC -B. -shared -o $t/e.so $t/a.o $t/b.o $t/c.o $t/d.o -Wl,--gdb-index readelf -WS $t/e.so 2> /dev/null | grep -F .gdb_index readelf --debug=gdb_index $t/e.so 2> /dev/null | grep 'fn1: .* \[global, function\]' readelf --debug=gdb_index $t/e.so 2> /dev/null | grep 'char: .* \[static, type\]' cat < /dev/null | grep -F .gdb_index readelf --debug=gdb_index $t/exe 2> /dev/null | grep 'main: .* \[global, function\]' $QEMU $t/exe | grep 'Hello world' DEBUGINFOD_URLS= gdb $t/exe -nx -batch -ex 'b main' -ex r -ex 'b trap' \ -ex c -ex bt -ex quit >& $t/log grep 'fn8 () at .*/d.c:6' $t/log grep 'fn7 () at .*/d.c:10' $t/log grep 'fn6 () at .*/c.c:4' $t/log grep 'fn5 () at .*/c.c:8' $t/log grep 'fn4 () at .*/b.c:4' $t/log grep 'fn3 () at .*/b.c:8' $t/log grep 'fn2 () at .*/a.c:4' $t/log grep 'fn1 () at .*/a.c:8' $t/log ================================================ FILE: test/gdb-index-dwarf64.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc on_qemu && skip [ $MACHINE = riscv64 -o $MACHINE = riscv32 -o $MACHINE = sparc64 ] && skip command -v gdb >& /dev/null || skip test_cflags -gdwarf-5 -g -gdwarf64 || skip cat < $t/a.c void fn3(); static void fn2() { fn3(); } void fn1() { fn2(); } EOF cat < $t/b.c void fn5(); static void fn4() { fn5(); } void fn3() { fn4(); } EOF cat < $t/c.c void fn7(); static void fn6() { fn7(); } void fn5() { fn6(); } EOF cat < $t/d.c #include void trap() {} static void fn8() { printf("Hello world\n"); trap(); } void fn7() { fn8(); } EOF $CC -c -o $t/a.o $t/a.c -fPIC -g -ggnu-pubnames -gdwarf-5 -gdwarf64 -ffunction-sections $CC -c -o $t/b.o $t/b.c -fPIC -g -ggnu-pubnames -gdwarf-4 -gdwarf64 -ffunction-sections $CC -c -o $t/c.o $t/c.c -fPIC -g -ggnu-pubnames -gdwarf-5 -gdwarf64 $CC -c -o $t/d.o $t/d.c -fPIC -g -ggnu-pubnames -gdwarf-5 -gdwarf64 -ffunction-sections $CC -B. -shared -o $t/e.so $t/a.o $t/b.o $t/c.o $t/d.o $CC -B. -shared -o $t/f.so $t/a.o $t/b.o $t/c.o $t/d.o -Wl,--gdb-index readelf -WS $t/f.so 2> /dev/null | grep -F .gdb_index cat < /dev/null | grep -F .gdb_index $QEMU $t/exe2 | grep 'Hello world' DEBUGINFOD_URLS= gdb $t/exe2 -nx -batch -ex 'b main' -ex r -ex 'b trap' \ -ex c -ex bt -ex quit >& $t/log2 grep 'fn8 () at .*/d.c:6' $t/log2 grep 'fn7 () at .*/d.c:10' $t/log2 grep 'fn6 () at .*/c.c:4' $t/log2 grep 'fn5 () at .*/c.c:8' $t/log2 grep 'fn4 () at .*/b.c:4' $t/log2 grep 'fn3 () at .*/b.c:8' $t/log2 grep 'fn2 () at .*/a.c:4' $t/log2 grep 'fn1 () at .*/a.c:8' $t/log2 ================================================ FILE: test/gdb-index-empty.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc echo 'void _start() {}' | $CC -c -o $t/a.o -xc - ./mold -o $t/exe $t/a.o -gdb-index readelf -WS $t/exe | not grep -F .gdb_index ================================================ FILE: test/gdb-index-split-dwarf.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc on_qemu && skip [ $MACHINE = riscv64 -o $MACHINE = riscv32 -o $MACHINE = sparc64 ] && skip command -v gdb >& /dev/null || skip test_cflags -gdwarf-5 -g || skip cat < $t/a.c void fn3(); static void fn2() { fn3(); } void fn1() { fn2(); } EOF cat < $t/b.c void fn5(); static void fn4() { fn5(); } void fn3() { fn4(); } EOF cat < $t/c.c void fn7(); static void fn6() { fn7(); } void fn5() { fn6(); } EOF cat < $t/d.c #include void trap() {} static void fn8() { printf("Hello world\n"); trap(); } void fn7() { fn8(); } EOF $CC -c -o $t/a.o $t/a.c -fPIC -g -ggnu-pubnames -gdwarf-5 -gsplit-dwarf $CC -c -o $t/b.o $t/b.c -fPIC -g -ggnu-pubnames -gdwarf-4 -gsplit-dwarf $CC -c -o $t/c.o $t/c.c -fPIC -g -ggnu-pubnames -gdwarf-5 $CC -c -o $t/d.o $t/d.c -fPIC -g -ggnu-pubnames -gdwarf-5 -gsplit-dwarf $CC -B. -shared -o $t/e.so $t/a.o $t/b.o $t/c.o $t/d.o -Wl,--gdb-index readelf -WS $t/e.so 2> /dev/null | grep -F .gdb_index cat < /dev/null | grep -F .gdb_index $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/glibc-2.22-bug.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # glibc 2.22 or prior have a bug that ld-linux.so.2 crashes on dlopen() # if .rela.dyn and .rela.plt are not contiguous in a given DSO. # This test verifies that these sections are contiguous in mold's output. cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/b.so -shared $t/a.o readelf -W --sections $t/b.so | grep -E -A1 '\.rela?\.dyn' | grep -E '\.rela?\.plt' ================================================ FILE: test/global-offset-table.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc [ $MACHINE = ppc64le ] && skip cat < extern char foo; int main() { printf("%lx\n", (unsigned long)&foo); } EOF $CC -B. 
-no-pie -o $t/exe $t/a.o -Wl,-defsym=foo=_GLOBAL_OFFSET_TABLE_ $QEMU $t/exe > /dev/null GOT_ADDR=$($QEMU $t/exe) # _GLOBAL_OFFSET_TABLE_ refers the end of .got only on x86. # We assume .got is followed by .gotplt. if [ $MACHINE = x86_64 -o $MACHINE = i686 ]; then readelf -WS $t/exe | grep "\.got\.plt .*$GOT_ADDR " else readelf -WS $t/exe | grep "\.got .*$GOT_ADDR " fi ================================================ FILE: test/gnu-hash.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < /dev/null __attribute__((retain)) int foo() {} int bar() {} int main() {} EOF # Older versions of GCC does not support __attribute__((retain)) readelf -WS $t/a.o | grep '\.text\.foo.*AXR' || skip $CC -B. -o $t/exe $t/a.o -Wl,-gc-sections nm $t/exe > $t/log grep foo $t/log not grep bar $t/log ================================================ FILE: test/gnu-unique.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc command -v $GXX >& /dev/null || skip cat < inline int foo = 5; int main() { printf("foo=%d\n", foo); } EOF $CC -B. -o $t/exe $t/a.o $t/b.o -no-pie $QEMU $t/exe | grep 'foo=5' ================================================ FILE: test/gnu-warning.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < __attribute__((section(".gnu.warning.foo"))) static const char foo[] = "foo is deprecated"; __attribute__((section(".gnu.warning.bar"))) const char bar[] = "bar is deprecated"; int main() { printf("Hello world\n"); } EOF # Make sure that we do not copy .gnu.warning.* sections. $CC -B. -o $t/exe $t/a.o -no-pie $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/hash-style-sysv.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < void hello() { printf("Hello world\n"); } EOF $CC -B. 
-shared -o $t/b.so $t/a.o -Wl,--hash-style=sysv cat < $t/log not grep -F ' .hash' $t/log not grep -F ' .gnu.hash' $t/log ================================================ FILE: test/hello-dynamic.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe $t/a.o -no-pie $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/hello-static.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc test_cflags -static || skip cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe $t/a.o -static $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/help.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc ./mold --help | grep Usage ================================================ FILE: test/hidden-archive.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/log not grep -w foo $t/log grep -w bar $t/log ================================================ FILE: test/icf-gcc-except-table.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc [ $MACHINE = arm ] && skip [ $MACHINE = armeb ] && skip [[ $MACHINE = riscv* ]] && skip [[ $MACHINE = loongarch* ]] && skip cat < template struct X { static void raise() { throw std::logic_error("foo"); } }; int main() { X().raise(); X().raise(); } EOF $CXX -B. -o $t/exe $t/a.o -Wl,-icf=safe,--print-icf-sections |& grep -E 'removing .*/a.o:\(.gcc_except_table' ================================================ FILE: test/icf-safe.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc # GCC 11 or older uses R_390_PLT32DBL to take an address of a function # instead of R_390_PC32DBL if [ $MACHINE = s390x ]; then echo 'void *foo() { return foo; }' | $CC -c -o $t/a.o -xc - readelf -r $t/a.o | grep R_390_PLT32DBL && skip fi cat < int foo1(); int foo2(); int foo3(); int main() { printf("%d %d\n", foo1 == foo2, foo2 == foo3); } EOF $CC -B. -o $t/exe1 -Wl,-icf=safe $t/a.o $t/b.o $QEMU $t/exe1 | grep '^0 0$' cat < int bar() { return 5; } int foo1(int x) { return bar() + x; } int foo2(int x) { return bar() + x; } int foo3() { bar(); return 5; } int main() { printf("%d %d\n", (long)foo1 == (long)foo2, (long)foo1 == (long)foo3); return 0; } EOF $CC -B. -o $t/exe $t/a.o -Wl,-icf=all $QEMU $t/exe | grep '1 0' ================================================ FILE: test/ifunc-address-equality-exported.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # Clang miscompiles the test code, so skip it if Clang. # https://github.com/llvm/llvm-project/issues/111338 $CC --version | grep clang && skip supports_ifunc || skip cat < typedef void Func(); __attribute__((ifunc("resolve_foo"))) void foo(void); void real_foo(void) { printf("foo "); } Func *resolve_foo() { return real_foo; } Func *get_foo(); int main() { printf("%p %p\n", foo, get_foo()); foo(); printf("\n"); } EOF $CC -B. -o $t/exe $t/c.o $t/b.so -no-pie $QEMU $t/exe | grep -E '^(\S+) \1' ================================================ FILE: test/ifunc-address-equality.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc supports_ifunc || skip cat < typedef void Func(); __attribute__((ifunc("resolve_foo"))) void foo(void); void real_foo(void) { printf("foo "); } Func *resolve_foo() { return real_foo; } __attribute__((ifunc("resolve_bar"))) void bar(void); void real_bar(void) { printf("bar "); } Func *resolve_bar() { return real_bar; } EOF cat < typedef void Func(); void foo(); void bar(); Func *get_foo(); Func *get_bar(); int main() { printf("%p %p %p %p\n", foo, get_foo(), bar, get_bar()); foo(); bar(); printf("\n"); } EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o -no-pie $QEMU $t/exe1 | grep -E '^(\S+) \1 (\S+) \2' readelf --dynamic $t/exe1 | not grep TEXTREL ================================================ FILE: test/ifunc-alias.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc supports_ifunc || skip cat < void foo() {} int bar() __attribute__((ifunc("resolve_bar"))); void *resolve_bar() { return foo; } void *bar2 = bar; int main() { printf("%p %p\n", bar, bar2); } EOF $CC -B. -o $t/exe1 $t/a.o -pie $QEMU $t/exe1 | grep -E '^(\S+) \1$' $CC -B. -o $t/exe2 $t/a.o -no-pie $QEMU $t/exe2 | grep -E '^(\S+) \1$' ================================================ FILE: test/ifunc-dlopen.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc supports_ifunc || skip cat < #include typedef void Func(); void foo(void); int main() { void *handle = dlopen(NULL, RTLD_NOW); Func *p = dlsym(handle, "foo"); foo(); p(); printf("%p %p\n", foo, p); } EOF cat < __attribute__((ifunc("resolve_foo"))) void foo(void); static void real_foo(void) { printf("foo "); } typedef void Func(); static Func *resolve_foo(void) { return real_foo; } EOF $CC -B. -o $t/c.so $t/b.o -shared $CC -B. -o $t/exe $t/a.o $t/c.so -no-pie -ldl $QEMU $t/exe | grep 'foo foo' ================================================ FILE: test/ifunc-dso.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc supports_ifunc || skip cat < __attribute__((ifunc("resolve_foobar"))) void foobar(void); static void real_foobar(void) { printf("Hello world\n"); } typedef void Func(); static Func *resolve_foobar(void) { return real_foobar; } EOF $CC -B. -o $t/c.so $t/b.o -shared readelf -W --dyn-syms $t/c.so | grep -E '(IFUNC|: 10).*foobar' $CC -B. -o $t/exe $t/a.o $t/c.so $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/ifunc-dynamic.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc supports_ifunc || skip cat < __attribute__((ifunc("resolve_foobar"))) static void foobar(void); static void real_foobar(void) { printf("Hello world\n"); } typedef void Func(); static Func *resolve_foobar(void) { return real_foobar; } int main() { foobar(); } EOF $CC -B. -o $t/exe1 $t/a.o -Wl,-z,lazy $QEMU $t/exe1 | grep 'Hello world' $CC -B. -o $t/exe2 $t/a.o -Wl,-z,now $QEMU $t/exe2 | grep 'Hello world' ================================================ FILE: test/ifunc-export.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc supports_ifunc || skip cat < __attribute__((ifunc("resolve_foobar"))) void foobar(void); void real_foobar(void) { printf("Hello world\n"); } typedef void Func(); Func *resolve_foobar(void) { return real_foobar; } EOF $CC -B. -shared -o $t/b.so $t/a.o readelf --dyn-syms $t/b.so | grep -E '(IFUNC|: 10)\s+GLOBAL DEFAULT.* foobar' ================================================ FILE: test/ifunc-funcptr.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc supports_ifunc || skip cat < typedef int Fn(); Fn *get_foo(); int main() { Fn *f = get_foo(); printf("%d\n", f()); } EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o -pie $QEMU $t/exe1 | grep '^3$' $CC -B. 
-o $t/exe2 $t/a.o $t/b.o $t/c.o -no-pie $QEMU $t/exe2 | grep '^3$' ================================================ FILE: test/ifunc-noplt.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc supports_ifunc || skip cat < __attribute__((ifunc("resolve_foo"))) void foo(void); void hello(void) { printf("Hello world\n"); } typedef void Fn(); Fn *resolve_foo(void) { return hello; } int main() { foo(); } EOF $CC -B. -o $t/exe1 $t/a.o -pie $QEMU $t/exe1 | grep 'Hello world' $CC -B. -o $t/exe2 $t/a.o -no-pie $QEMU $t/exe2 | grep 'Hello world' ================================================ FILE: test/ifunc-static-pie.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc test_cflags -static-pie || skip supports_ifunc || skip cat < void foo() __attribute__((ifunc("resolve_foo"))); void hello() { printf("Hello world\n"); } void *resolve_foo() { return hello; } int main() { foo(); return 0; } EOF $CC -B. -o $t/exe2 $t/a.o -static-pie $QEMU $t/exe2 | grep 'Hello world' ================================================ FILE: test/ifunc-static.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc test_cflags -static || skip supports_ifunc || skip cat < void foo() __attribute__((ifunc("resolve_foo"))); void hello() { printf("Hello world\n"); } void *resolve_foo() { return hello; } int main() { foo(); return 0; } EOF $CC -B. -o $t/exe $t/a.o -static $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/image-base.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); return 0; } EOF $CC -B. 
-no-pie -o $t/exe1 $t/a.o -Wl,--image-base=0x8000000 $QEMU $t/exe1 | grep 'Hello world' readelf -W --sections $t/exe1 | grep -E '.interp\s+PROGBITS\s+0*8000...\b' cat < __attribute__((constructor(10000))) void init4() { printf("1"); } EOF cat <<'EOF' | $CC -c -o $t/b.o -xc - #include __attribute__((constructor(1000))) void init3() { printf("2"); } EOF cat <<'EOF' | $CC -c -o $t/c.o -xc - #include __attribute__((constructor)) void init1() { printf("3"); } EOF cat <<'EOF' | $CC -c -o $t/d.o -xc - #include __attribute__((constructor)) void init2() { printf("4"); } EOF cat <<'EOF' | $CC -c -o $t/e.o -xc - #include __attribute__((destructor(10000))) void fini4() { printf("5"); } EOF cat <<'EOF' | $CC -c -o $t/f.o -xc - #include __attribute__((destructor(1000))) void fini3() { printf("6"); } EOF cat <<'EOF' | $CC -c -o $t/g.o -xc - #include __attribute__((destructor)) void fini1() { printf("7"); } EOF cat <<'EOF' | $CC -c -o $t/h.o -xc - #include __attribute__((destructor)) void fini2() { printf("8"); } EOF cat < void foo() { printf("Hello world\n"); } EOF $CC -B. -shared -o $t/b.so $t/a.o -Wl,-z,initfirst readelf --dynamic $t/b.so | grep 'Flags:.*INITFIRST' ================================================ FILE: test/interpose.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < void foo() { printf("Hello world\n"); } EOF $CC -B. -shared -o $t/b.so $t/a.o -Wl,-z,interpose readelf --dynamic $t/b.so | grep 'Flags:.*INTERPOSE' ================================================ FILE: test/invalid-version-script.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc echo 'int main() {}' | $CC -c -o $t/a.o -xc - echo 'VER1 { foo[12; };' > $t/b.ver not $CC -B. -shared -o $t/c.so -Wl,-version-script,$t/b.ver $t/a.o |& grep 'invalid version pattern' ================================================ FILE: test/issue646.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc [ $MACHINE = sh4aeb ] && skip cat < #include class Foo : public std::runtime_error { public: using std::runtime_error::runtime_error; }; static void do_throw() { throw Foo("exception"); } int main() { try { do_throw(); } catch (const Foo &e) { std::cout << "error: " << e.what() << std::endl; } } EOF $CXX -B. -o $t/exe $t/a.o $QEMU $t/exe | grep 'error: exception' ================================================ FILE: test/large-alignment-dso.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc [ $MACHINE = i686 ] && skip cat < #include void hello() __attribute__((aligned(32768), section(".hello"))); void world() __attribute__((aligned(32768), section(".world"))); void hello() { printf("Hello"); } void world() { printf(" world"); } void greet() { hello(); world(); } EOF $CC -B. -o $t/b.so $t/a.o -shared cat < #include void hello() __attribute__((aligned(32768), section(".hello"))); void world() __attribute__((aligned(32768), section(".world"))); void hello() { printf("Hello"); } void world() { printf(" world"); } int main() { hello(); world(); // Linux kernel may ignore a ridiculously large alignment requirement, // but we still want to verify that an executable with a large // alignment requirement can still run. // // printf(" %lu %lu\n", // (unsigned long)((uintptr_t)hello % 32768), // (unsigned long)((uintptr_t)world % 32768)); } EOF $CC -B. -o $t/exe $t/a.o $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/large-max-page-size-strip.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # ARM32's strip command crashes on the output of this test for some reason. [[ $MACHINE = arm* ]] && skip strip=$STRIP command -v $strip >& /dev/null || skip cat < int main() { printf("Hello world\n"); } EOF $CC -B.
-o $t/exe $t/a.o -pie -Wl,-zmax-page-size=0x200000 $strip $t/exe $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/large-max-page-size.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe $t/a.o -pie -Wl,-zmax-page-size=0x200000 $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/large-text.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe $t/a.o $t/b.o $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/library.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < void hello() { printf("Hello world\n"); } EOF $CC -B. -shared -o $t/libfoobar.so $t/a.o cat < $t/script bar = foo; EOF $CC -B. -o $t/b.so -shared $t/script $t/a.o readelf -sW $t/b.so | grep 'FUNC .* bar' cat < int bar(); int main() { printf("%d\n", bar()); return 0; } EOF $CC -B. -o $t/exe $t/c.o $t/b.so $QEMU $t/exe | grep 42 ================================================ FILE: test/linker-script-error.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/b.script not $CC -B. -o $t/exe $t/a.o $t/b.script |& grep 'unclosed comment' ================================================ FILE: test/linker-script-relocatable.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # OneTBB isn't tsan-clean nm mold | grep '__tsan_init' && skip cat < void hello() { printf("Hello world\n"); } EOF cat < $t/c.script ./mold --relocatable -o $t/d.o $t/c.script $CC -B. -o $t/exe $t/d.o $QEMU $t/exe | grep Hello ================================================ FILE: test/linker-script.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF cat < $t/script GROUP("$t/a.o") EOF $CC -B. -o $t/exe $t/script $QEMU $t/exe | grep 'Hello world' $CC -B. -o $t/exe -Wl,-T,$t/script $QEMU $t/exe | grep 'Hello world' $CC -B. -o $t/exe -Wl,--script,$t/script $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/linker-script2.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/b.script INPUT(-lfoo) EOF $CC -B. -o $t/exe -L$t/foo/bar $t/b.script ================================================ FILE: test/linker-script3.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc mkdir -p $t/foo cat < $t/b.script INPUT(a.o) EOF $CC -B. -o $t/exe -L$t/foo $t/b.script ================================================ FILE: test/linker-script4.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc echo 'VERSION { ver_x { global: *; }; };' > $t/a.script cat < $t/b.s .globl foo, bar, baz foo: nop bar: nop baz: nop EOF $CC -B. -shared -o $t/c.so $t/a.script $t/b.s readelf --version-info $t/c.so > $t/log grep -F 'Rev: 1 Flags: none Index: 2 Cnt: 1 Name: ver_x' $t/log ================================================ FILE: test/linker-script5.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc mkdir -p $t/foo cat < $t/foo/b.script INPUT(a.o) EOF $CC -B. -o $t/exe $t/foo/b.script ================================================ FILE: test/linker-script6.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc mkdir -p $t/foo cat < $t/foo/b.script INPUT(a.o) EOF $CC -B. -o $t/exe $t/foo/b.script ================================================ FILE: test/lto-archive.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc [ "$CC" = cc ] || skip test_cflags -flto || skip cat < void hello() { printf("Hello world\n"); } EOF cat < void howdy() { printf("Hello world\n"); } EOF rm -f $t/c.a ar rc $t/c.a $t/a.o $t/b.o cat < $t/log grep hello $t/log not grep howdy $t/log ================================================ FILE: test/lto-archive2.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc test_cflags -flto=auto || skip echo | $CC -o $t/a.o -c -flto=auto -xc - rm -f $t/b.a ar rc $t/b.a $t/a.o cat <> $t/a.ll $foo = comdat any @foo = global i32 42, comdat($foo) EOF cp $t/a.ll $t/b.ll clang -S -emit-llvm -flto -o $t/a.bc $t/a.ll clang -S -emit-llvm -flto -o $t/b.bc $t/b.ll cat <<'EOF' | clang -o $t/c.o -c -flto -xc - #include extern int foo; int main() { printf("%d\n", foo); } EOF clang -B. -o $t/exe -flto $t/a.bc $t/b.bc $t/c.o $QEMU $t/exe | grep 42 ================================================ FILE: test/lto-dso.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc test_cflags -flto || skip cat <& /dev/null || skip cat < int main() { printf("Hello world\n"); } EOF $GCC -B. -o $t/exe1 -flto $t/a.o $QEMU $t/exe1 | grep 'Hello world' # Test that LTO is used for FAT LTO objects cat < int main() { printf("Hello world\n"); } EOF $GCC -B. -o $t/exe2 $t/b.o --verbose |& grep -- -fwpa # Test FAT objects if -fno-use-linker-plugin is used cat < int main() { printf("Hello world\n"); } EOF $GCC -B. -o $t/exe3 -flto -fno-use-linker-plugin $t/c.o $QEMU $t/exe3 | grep 'Hello world' ================================================ FILE: test/lto-llvm.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc [ $MACHINE = $(uname -m) ] || skip echo 'int main() {}' | clang -B. -flto -o /dev/null -xc - >& /dev/null || skip cat < int main() { printf("Hello world\n"); } EOF clang -B. 
-o $t/exe -flto $t/a.o $t/exe | grep 'Hello world' ================================================ FILE: test/lto-llvm2.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc [ $MACHINE = $(uname -m) ] || skip echo 'int main() {}' | clang -B. -flto -o /dev/null -xc - >& /dev/null || skip cat <& /dev/null || skip echo 'int main() {}' | clang -c -o $t/a.o -xc - echo 'void foo() {}' | clang -c -o $t/b.o -xc - -flto not ./mold -o /dev/null $t/a.o $t/b.o |& grep "b.o: unable to handle this LTO object file" ================================================ FILE: test/lto-nostdlib.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc test_cflags -flto || skip cat < $t/b.script { global: foo; local: *; }; EOF $CC -B. -shared -o $t/c.so -flto $t/a.o -Wl,-version-script=$t/b.script if [ $MACHINE = ppc64 ]; then # On PPC64V1, a function symbol refers to a function descriptor in .opd nm -D $t/c.so | grep 'D foo' nm -D $t/c.so | not grep 'D bar' else nm -D $t/c.so | grep 'T foo' nm -D $t/c.so | not grep 'T bar' fi ================================================ FILE: test/main-in-dso.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc [ $MACHINE = ppc64 ] && skip cat < int main() { printf("Hello world\n"); } EOF $CC -B. -shared -o $t/b.so $t/a.o $CC -o $t/c.o -c -xc /dev/null -fPIC $CC -B. -o $t/exe1 $t/c.o $t/b.so -pie $QEMU $t/exe1 | grep 'Hello world' $CC -o $t/c.o -c -xc /dev/null -fno-PIC $CC -B. -o $t/exe2 $t/c.o $t/b.so -no-pie $QEMU $t/exe2 | grep 'Hello world' ================================================ FILE: test/many-input-sections.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc seq 1 100000 | sed 's/.*/.section .data.&,"aw"\n.word 0\n/g' | $CC -c -xassembler -o $t/a.o - cat <<'EOF' | $CC -c -xc -o $t/b.o - #include int main() { printf("Hello\n"); return 0; } EOF $CC -B.
-o $t/exe $t/a.o $t/b.o $QEMU $t/exe | grep Hello ================================================ FILE: test/many-input-sections2.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # OneTBB isn't tsan-clean nm mold | grep '__tsan_init' && skip echo 'foo = 0x1000' > $t/a.s seq 1 100000 | sed 's/.*/.section .data.&,"aw"\n.globl x&\nx&: .word 0\n/g' >> $t/a.s $CC -c -xassembler -o $t/a.o $t/a.s ./mold --relocatable -o $t/b.o $t/a.o readelf -WS $t/b.o > $t/log1 grep -F .data.100000 $t/log1 readelf -Ws $t/b.o > $t/log2 grep -F 'GLOBAL DEFAULT 100000' $t/log2 grep -F 'ABS foo' $t/log2 not grep -F 'ABS x68966' $t/log2 ================================================ FILE: test/many-output-sections.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc seq 1 100000 | sed 's/.*/.globl foo&\n.section .foo&,"aw"\nfoo&:.word 0\n/g' | $CC -c -xassembler -o $t/a.o - cat <<'EOF' | $CC -c -xc -o $t/b.o - -fPIC extern int foo100000; int bar() { return foo100000; } EOF not $CC -B. -shared -o $t/c.so $t/a.o $t/b.o |& grep -F 'too many output sections' ================================================ FILE: test/mcmodel-large.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc test_cflags -static -mcmodel=large || skip cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe $t/a.o -static -no-pie $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/mergeable-strings.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat < #include char *cstr1 = "foo"; wchar_t *wide1 = L"foo"; char16_t *utf16_1 = u"foo"; char32_t *utf32_1 = U"foo"; EOF cat < #include #include extern char *cstr1; extern wchar_t *wide1; extern char16_t *utf16_1; extern char32_t *utf32_1; char *cstr2 = "foo"; wchar_t *wide2 = L"foo"; char16_t *utf16_2 = u"foo"; char32_t *utf32_2 = U"foo"; int main() { printf("%p %p %p %p %p %p %p %p\n", cstr1, cstr2, wide1, wide2, utf16_1, utf16_2, utf32_1, utf32_2); } EOF # String merging is an optional feature, so test it with the default # linker first to verify that it does work on this system. $CC -o $t/exe1 $t/a.o $t/b.o -no-pie if $QEMU $t/exe1 | grep -E '^(\S+) \1 (\S+) \2 (\S+) \3 (\S+) \4$'; then $CC -B. -o $t/exe2 $t/a.o $t/b.o -no-pie $QEMU $t/exe2 | grep -E '^(\S+) \1 (\S+) \2 (\S+) \3 (\S+) \4$' fi ================================================ FILE: test/missing-but-ok.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/log grep 'undefined symbol: foo' $t/log grep '>>> .*a\.o' $t/log ================================================ FILE: test/mold-wrapper.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc [ "$CC" = cc ] || skip ldd mold-wrapper.so | grep libasan && skip nm mold | grep '__[at]san_init' && skip cat <<'EOF' > $t/a.sh #!/usr/bin/env bash echo "$0" "$@" $FOO EOF chmod 755 $t/a.sh cat <<'EOF' | $CC -xc -o $t/exe - #define _GNU_SOURCE 1 #include #include #include #include extern char **environ; int main(int argc, char **argv) { if (!strcmp(argv[1], "execl")) { execl("/usr/bin/ld", "/usr/bin/ld", "execl", (char *)0); perror("execl"); return 1; } if (!strcmp(argv[1], "execlp")) { execlp("/usr/bin/ld", "/usr/bin/ld", "execlp", (char *)0); perror("execl"); return 1; } if (!strcmp(argv[1], "execle")) { execle("/usr/bin/ld", "/usr/bin/ld", "execle", (char *)0, environ); perror("execl"); return 1; } if (!strcmp(argv[1], "execv")) { execv("/usr/bin/ld", (char *[]){"/usr/bin/ld", "execv", (char *)0}); perror("execl"); return 1; } if (!strcmp(argv[1], "execvp")) { execvp("/usr/bin/ld", (char *[]){"/usr/bin/ld", "execvp", (char *)0}); perror("execl"); return 1; } if (!strcmp(argv[1], "execvpe")) { char *env[] = {"FOO=bar", NULL}; execvpe("/usr/bin/ld", (char *[]){"/usr/bin/ld", "execvpe", (char *)0}, env); perror("execl"); return 1; } fprintf(stderr, "unreachable: %s\n", argv[1]); return 1; } EOF LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=$t/a.sh $t/exe execl | grep 'a.sh execl' LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=$t/a.sh $t/exe execlp | grep 'a.sh execlp' LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=$t/a.sh $t/exe execle | grep 'a.sh execle' LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=$t/a.sh $t/exe execv | grep 'a.sh execv' LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=$t/a.sh $t/exe execvp | grep 'a.sh execvp' LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=$t/a.sh $t/exe execvpe | grep 'a.sh execvpe bar' ================================================ FILE: test/mold-wrapper2.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc ldd mold-wrapper.so | grep libasan && skip nm mold | grep '__[at]san_init' && skip ./mold -run bash -c 'echo $LD_PRELOAD' | grep -F mold-wrapper.so ================================================ FILE: test/nmagic.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <& $t/log grep -F 'undefined symbol: foo' $t/log ================================================ FILE: test/no-allow-shlib-undefined2.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc [ "$(uname)" = FreeBSD ] && skip cat < int main() { printf("Hello world\n"); return 0; } EOF $CC -B. -o $t/exe $t/a.o -Wl,-no-quick-exit $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/no-undefined-version.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc echo 'ver_x { global: foo; };' > $t/a.ver cat < extern int foo; extern int bar; int main() { printf("%d %d\n", foo, bar); return 0; } EOF $CC -B. -no-pie -o $t/exe $t/a.so $t/b.o $QEMU $t/exe | grep '3 5' not $CC -B. -o $t/exe $t/a.so $t/b.o -no-pie -Wl,-z,nocopyreloc |& grep 'recompile with -fPIC' ================================================ FILE: test/noinhibit-exec.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < void *foo(); void *bar(); void *baz(); int main() { printf("%d %d %d\n", foo == foo(), bar == bar(), bar == baz()); } EOF $CC -B. -no-pie -o $t/exe $t/a.so $t/b.o $t/c.o $QEMU $t/exe | grep '^1 1 1$' readelf --dyn-syms $t/exe | grep '00000000 .* foo' readelf --dyn-syms $t/exe | grep '00000000 .* bar' ================================================ FILE: test/nostdlib.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat < $t/log not grep -F ' .dynsym ' $t/log not grep -F ' .dynstr ' $t/log ================================================ FILE: test/oformat-binary.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); return 0; } EOF $CC -B. $t/a.o -o $t/exe -static -no-pie -Wl,--omagic readelf -W --segments $t/exe | grep -w RWE ================================================ FILE: test/package-metadata.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe1 $t/a.o -Wl,-package-metadata='{"foo":"bar"}' readelf -x .note.package $t/exe1 | grep -F '{"foo":"bar"}' $CC -B. -o $t/exe2 $t/a.o -Wl,--package-metadata='%7B%22foo%22%3A%22bar%22%7D' readelf -x .note.package $t/exe2 | grep -F '{"foo":"bar"}' not $CC -B. -o $t/exe3 $t/a.o -Wl,--package-metadata='foo%x' |& grep 'invalid string: foo%x' ================================================ FILE: test/physical-image-base.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc [ $MACHINE = ppc64 ] && skip # Test if grep supports backreferences echo abab | grep -E '(ab)\1' || skip cat < __attribute__((section("foo"))) int bar; int main() { printf("Hello world\n"); return 0; } EOF $CC -B. -no-pie -o $t/exe1 $t/a.o -Wl,--image-base=0x200000 \ -Wl,--physical-image-base=0x800000 $QEMU $t/exe1 | grep 'Hello world' readelf -W --segments $t/exe1 | grep -E 'LOAD\s+0x000000 0x0*200000 0x0*800000' readelf -Ws $t/exe1 | grep __phys_start_foo $CC -B. 
-no-pie -o $t/exe2 $t/a.o -Wl,--physical-image-base=0x800000 \ -Wl,--section-order='=0x800000 TEXT RODATA =0x900000 DATA BSS' readelf -W --segments $t/exe2 | grep -E 'LOAD\s+\S+\s+(\S+)\s\1.*R E 0' readelf -W --segments $t/exe2 | grep -E 'LOAD\s+\S+\s+(\S+)\s\1.*R 0' ================================================ FILE: test/pie.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); return 0; } EOF $CC -B. -pie -o $t/exe $t/a.o readelf --file-header $t/exe | grep -E '(Shared object file|Position-Independent Executable file)' $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/plt-dso.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < void world() { printf("world\n"); } void real_hello() { printf("Hello "); world(); } void hello() { real_hello(); } EOF $CC -B. -shared -o $t/b.so $t/a.o cat < void world() { printf("WORLD\n"); } void hello(); int main() { hello(); } EOF $CC -B. -o $t/exe -Wl,-rpath=$t $t/c.o $t/b.so $QEMU $t/exe | grep 'Hello WORLD' ================================================ FILE: test/plt-symbols.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < void ignore(void *foo) {} void hello() { printf("Hello world\n"); } EOF $CC -B. -shared -o $t/b.so $t/a.o cat < int foo() { return 3; } int x = 5; int bar(); void *baz() { return &x; } int main() { printf("%d %d %d\n", foo(), bar(), baz == baz()); } EOF $CC -B. -no-pie -o $t/exe $t/c.o $t/b.so $QEMU $t/exe 2> /dev/null | grep '3 4 0' ================================================ FILE: test/push-pop-state.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/log1 grep -F a.so $t/log1 not grep -F b.so $t/log1 if test_cflags -static; then $CC -B. -o $t/exe2 $t/c.o -no-pie -static readelf --dynamic $t/exe2 | grep -F 'no dynamic section' $CC -B. 
-o $t/exe3 $t/c.o -no-pie -Wl,-push-state,-static,-pop-state readelf --dynamic $t/exe3 | grep -F libc fi ================================================ FILE: test/range-extension-thunk.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # Skip if 32 bits as we use very large addresses in this test. [ $MACHINE = i686 ] && skip [ $MACHINE = riscv32 ] && skip [ $MACHINE = m68k ] && skip # It looks like SPARC's runtime can't handle PLT if it's too far from GOT. [ $MACHINE = sparc64 ] && skip # Current LoongArch compilers emit BL for function calls, but I believe # they'll emit PCADDU18I + JIRL (which can address PC ± 128 GiB) in the # future. [[ $MACHINE = loongarch* ]] && skip # qemu aborts with the "Unknown exception 0x5" error, although this # test passes on a real POWER10 machine. on_qemu && [ "$CPU" = power10 ] && skip cat < $t/a.c #include void fn3(); void fn4(); __attribute__((section(".low"))) void fn1() { printf(" fn1"); fn3(); } __attribute__((section(".low"))) void fn2() { printf(" fn2"); fn4(); } int main() { printf(" main"); fn1(); printf("\n"); } EOF cat < $t/b.c #include void fn1(); void fn2(); __attribute__((section(".high"))) void fn3() { printf(" fn3"); fn2(); } __attribute__((section(".high"))) void fn4() { printf(" fn4"); } EOF $CC -c -o $t/c.o $t/a.c -O0 $CC -c -o $t/d.o $t/b.c -O0 $CC -B. -o $t/exe1 $t/c.o $t/d.o \ -Wl,--section-start=.low=0x10000000,--section-start=.high=0x20000000 $QEMU $t/exe1 | grep 'main fn1 fn3 fn2 fn4' $CC -c -o $t/e.o $t/a.c -O2 $CC -c -o $t/f.o $t/b.c -O2 $CC -B. -o $t/exe2 $t/e.o $t/f.o \ -Wl,--section-start=.low=0x10000000,--section-start=.high=0x20000000 $QEMU $t/exe2 | grep 'main fn1 fn3 fn2 fn4' ================================================ FILE: test/range-extension-thunk2.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat < void f0(int); void f1(int); void f2(int); void f3(int); void f4(int); void f5(int); void f6(int); void f7(int); void f8(int); void f9(int); void f10(int); void f11(int); void f12(int); void f13(int); void f14(int); void f15(int); void f16(int); void f17(int); void f18(int); void f19(int); void f0(int x) { printf("0 "); if (!x) f9(x); } void space0() { __asm__(".space 1024*1024"); } void f1(int x) { printf("1 "); f8(x); } void space1() { __asm__(".space 1024*1024"); } void f2(int x) { printf("2 "); f7(x); } void space2() { __asm__(".space 1024*1024"); } void f3(int x) { printf("3 "); f6(x); } void space3() { __asm__(".space 1024*1024"); } void f4(int x) { printf("4 "); f5(x); } void space4() { __asm__(".space 1024*1024"); } void f5(int x) { printf("5 "); f10(x); } void space5() { __asm__(".space 1024*1024"); } void f6(int x) { printf("6 "); f4(x); } void space6() { __asm__(".space 1024*1024"); } void f7(int x) { printf("7 "); f3(x); } void space7() { __asm__(".space 1024*1024"); } void f8(int x) { printf("8 "); f2(x); } void space8() { __asm__(".space 1024*1024"); } void f9(int x) { printf("9 "); f1(x); } void space9() { __asm__(".space 1024*1024"); } void f10(int x) { printf("10 "); f19(x); } void space10() { __asm__(".space 8*1024*1024"); } void f11(int x) { printf("11 "); f18(x); } void space11() { __asm__(".space 8*1024*1024"); } void f12(int x) { printf("12 "); f17(x); } void space12() { __asm__(".space 8*1024*1024"); } void f13(int x) { printf("13 "); f16(x); } void space13() { __asm__(".space 8*1024*1024"); } void f14(int x) { printf("14 "); f15(x); } void space14() { __asm__(".space 8*1024*1024"); } void f15(int x) { printf("15 "); f0(x + 1); } void space15() { __asm__(".space 8*1024*1024"); } void f16(int x) { printf("16 "); f14(x); } void space16() { __asm__(".space 8*1024*1024"); } void f17(int x) { printf("17 "); f13(x); } void space17() { __asm__(".space 8*1024*1024"); } void f18(int x) { printf("18 "); f12(x); } void 
space18() { __asm__(".space 8*1024*1024"); } void f19(int x) { printf("19 "); f11(x); } void space19() { __asm__(".space 8*1024*1024"); } int main() { f0(0); printf("\n"); } EOF $CC -B. -o $t/exe $t/a.o $QEMU $t/exe | grep -E '^0 9 1 8 2 7 3 6 4 5 10 19 11 18 12 17 13 16 14 15 0 $' ================================================ FILE: test/range-extension-thunk3.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc [ $MACHINE = sh4 ] && skip seq 1 10000 | sed 's/.*/void func&() {}/' > $t/a.c $CC -B. -o $t/b.so -shared $t/a.c seq 1 10000 | sed 's/.*/void func&();/' > $t/c.c echo 'int main() {' >> $t/c.c seq 1 10000 | sed 's/.*/func&();/' >> $t/c.c echo '}' >> $t/c.c $CC -c -o $t/d.o $t/c.c $CC -B. -o $t/exe $t/d.o $t/b.so $QEMU $t/exe ================================================ FILE: test/range-extension-thunk4.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc [[ $MACHINE = ppc* ]] && skip cat < void hello() { printf("Hello world\n"); } EOF $CC -B. -shared -o $t/b.so $t/a.o cat < void hello(); int main() { hello(); } EOF $CC -B. -o $t/exe $t/c.o $t/b.so readelf -W --syms $t/exe | not grep -F 'hello$thunk' ================================================ FILE: test/relax-got-load.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < extern char *msg; void hello() { printf("%s\n", msg); } EOF cat < $t/log1 $OBJDUMP -d $t/exe2 | grep -v exe2 > $t/log2 not diff $t/log1 $t/log2 > /dev/null ;; esac ================================================ FILE: test/reloc-rodata.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc exit [ $MACHINE = aarch64 ] && skip cat < int foo; int * const bar = &foo; int main() { printf("%d\n", *bar); } EOF not $CC -B. 
-o $t/exe $t/a.o -pie |& grep -E 'relocation against symbol .+ can not be used; recompile with -fPIC' ================================================ FILE: test/relocatable-archive.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # OneTBB isn't tsan-clean nm mold | grep '__tsan_init' && skip cat < $t/log grep 'foo\b' $t/log grep 'bar\b' $t/log not grep 'baz\b' $t/log ================================================ FILE: test/relocatable-c++.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # OneTBB isn't tsan-clean nm mold | grep '__tsan_init' && skip cat < struct Foo { Foo() { hello(); } }; template struct Bar { Bar() { world(); } }; void baz() { Foo foo; Bar bar; } EOF cat < void hello() { std::cout << "Hello "; } void world() { std::cout << "world\n"; } void baz(); int main() { baz(); } EOF ./mold --relocatable -o $t/c.o $t/a.o ./mold --relocatable -o $t/d.o $t/b.o $CXX -B. -o $t/exe $t/c.o $t/d.o $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/relocatable-compressed-debug-info.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # OneTBB isn't tsan-clean nm mold | grep '__tsan_init' && skip test_cflags -g3 -gz || skip cat < void hello() { printf("Hello world\n"); } EOF cat < void hello() { printf("Hello world\n"); } EOF cat < /dev/null |& not grep Warning ================================================ FILE: test/relocatable-exception.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc [ $MACHINE = m68k ] && skip [ $MACHINE = sh4 ] && skip [ $MACHINE = sh4aeb ] && skip # OneTBB isn't tsan-clean nm mold | grep '__tsan_init' && skip cat < int foo(); int main() { std::cout << foo() << "\n"; } EOF ./mold --relocatable -o $t/c.o $t/a.o $t/b.o $CXX -B. 
-o $t/exe $t/c.o $QEMU $t/exe ================================================ FILE: test/relocatable-many-sections.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # OneTBB isn't tsan-clean nm mold | grep '__tsan_init' && skip seq 1 80000 | sed 's/.*/.section .data.&,"aw"\n.word 0\n/g' | $CC -c -xassembler -o $t/a.o - cat <<'EOF' | $CC -c -xc -o $t/b.o - #include int main() { printf("Hello\n"); return 0; } EOF ./mold -r -o $t/c.o $t/a.o $t/b.o $CC -B. -o $t/exe $t/c.o $QEMU $t/exe | grep Hello ================================================ FILE: test/relocatable-merge-sections.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # OneTBB isn't tsan-clean nm mold | grep '__tsan_init' && skip cat < $t/log1 grep -F .text.foo $t/log1 grep -F .text.bar $t/log1 ./mold --relocatable -o $t/c.o $t/a.o --relocatable-merge-sections readelf -WS $t/c.o > $t/log2 not grep -F .text.foo $t/log2 not grep -F .text.bar $t/log2 ================================================ FILE: test/relocatable-mergeable-sections.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc # OneTBB isn't tsan-clean nm mold | grep '__tsan_init' && skip cat < void hello() { printf("Hello world\n"); } EOF cat < #include #include #define PT_GNU_RELRO 0x6474e552 typedef struct { char e_ident[16]; uint16_t e_type; uint16_t e_machine; uint32_t e_version; uint64_t e_entry; uint64_t e_phoff; uint64_t e_shoff; uint32_t e_flags; uint16_t e_ehsize; uint16_t e_phentsize; uint16_t e_phnum; uint16_t e_shentsize; uint16_t e_shnum; uint16_t e_shstrndx; } Ehdr; typedef struct { uint32_t p_type; uint32_t p_flags; uint64_t p_offset; uint64_t p_vaddr; uint64_t p_paddr; uint64_t p_filesz; uint64_t p_memsz; uint64_t p_align; } Phdr; extern char __ehdr_start[]; int main() { Ehdr *ehdr = (Ehdr *)__ehdr_start; Phdr *phdr = (Phdr *)(__ehdr_start + ehdr->e_phoff); int pagesz = sysconf(_SC_PAGESIZE); for (int i = 0; i < ehdr->e_phnum; i++) { if (phdr[i].p_type == PT_GNU_RELRO) { if ((phdr[i].p_vaddr + phdr[i].p_memsz) % pagesz == 0) { printf("Aligned: vaddr=%lx memsz=%lx pagesize=%x\n", phdr[i].p_vaddr, phdr[i].p_memsz, pagesz); } else { printf("Unaligned: vaddr=%lx memsz=%lx pagesize=%x\n", phdr[i].p_vaddr, phdr[i].p_memsz, pagesz); } return 0; } } printf("PT_GNU_RELRO missing\n"); } EOF $CC -B. -o $t/exe1 $t/a.o -Wl,-z,relro $QEMU $t/exe1 | grep Aligned $CC -B. -o $t/exe2 $t/a.o -Wl,-z,relro,-z,separate-loadable-segments $QEMU $t/exe2 | grep Aligned ================================================ FILE: test/relro.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe1 $t/a.o -Wl,-z,relro,-z,lazy $QEMU $t/exe1 | grep 'Hello world' readelf --segments -W $t/exe1 | grep -w GNU_RELRO $CC -B. -o $t/exe2 $t/a.o -Wl,-z,relro,-z,now $QEMU $t/exe2 | grep 'Hello world' readelf --segments -W $t/exe2 | grep -w GNU_RELRO $CC -B. 
-o $t/exe3 $t/a.o -Wl,-z,norelro $QEMU $t/exe3 | grep 'Hello world' readelf --segments -W $t/exe3 | not grep -w GNU_RELRO ================================================ FILE: test/repro.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); return 0; } EOF rm -rf $t/exe.repro $t/exe.repro.tar $CC -B. -o $t/exe $t/a.o not [ -f $t/exe.repro.tar ] $CC -B. -o $t/exe $t/a.o -Wl,-repro tar -C $t -xf $t/exe.repro.tar tar -C $t -tvf $t/exe.repro.tar | grep ' exe.repro/.*/a.o' grep /a.o $t/exe.repro/response.txt grep mold $t/exe.repro/version.txt rm -rf $t/exe.repro $t/exe.repro.tar MOLD_REPRO=1 $CC -B. -o $t/exe $t/a.o tar -C $t -tvf $t/exe.repro.tar | grep ' exe.repro/.*/a.o' tar -C $t -xf $t/exe.repro.tar grep /a.o $t/exe.repro/response.txt grep mold $t/exe.repro/version.txt ================================================ FILE: test/require-defined.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/rsp1 echo $t'"\/b."\o' >> $t/rsp1 ./mold -o $t/c.so -shared @$t/rsp1 echo $t/a.o > $t/rsp2 echo '\foo\bar' >> $t/rsp2 not ./mold -o $t/d.so -shared @$t/rsp2 |& grep 'cannot open foobar:' ================================================ FILE: test/response-file.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/rsp ./mold -o $t/d.so -shared $t/a.o @$t/rsp ================================================ FILE: test/response-file2.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/rsp1 echo "@$t/rsp1" > $t/rsp2 ./mold -o $t/c.so -shared $t/a.o @$t/rsp2 ================================================ FILE: test/retain-symbols-file.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/symbols foo baz EOF $CC -B. 
-o $t/exe $t/a.o -Wl,--retain-symbols-file=$t/symbols readelf -W --symbols $t/exe > $t/log not grep ' foo$' $t/log not grep ' bar$' $t/log not grep ' main$' $t/log grep ' baz$' $t/log ================================================ FILE: test/reverse-sections.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < static struct Foo1 { Foo1() { printf("foo1 "); } } x; static struct Foo2 { Foo2() { printf("foo2 "); } } y; EOF cat < static struct Foo3 { Foo3() { printf("foo3 "); } } x; static struct Foo4 { Foo4() { printf("foo4 "); } } y; EOF cat < static struct Foo5 { Foo5() { printf("foo5 "); } } x; static struct Foo6 { Foo6() { printf("foo6 "); } } y; int main() { printf("\n"); } EOF $CXX -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o $QEMU $t/exe1 | grep 'foo1 foo2 foo3 foo4 foo5 foo6' $CXX -B. -o $t/exe2 $t/a.o $t/b.o $t/c.o -Wl,--reverse-sections $QEMU $t/exe2 | grep 'foo5 foo6 foo3 foo4 foo1 foo2' ================================================ FILE: test/rodata-name.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # ARM assembler has a different grammar than the others. # Concretely speaking, ARM as uses "@" as a start of a comment. [[ $MACHINE = arm* ]] && skip # All data symbols need to be aligned to 2 byte boundaries on s390x, # so rodata.str1.1 in this file is invalid on s390x. 
[ $MACHINE = s390x ] && skip cat <<'EOF' | $CC -c -o $t/a.o -x assembler - .globl val1, val2, val3, val4, val5 .section .rodata.str1.1,"aMS",@progbits,1 val1: .ascii "Hello \0" .section .rodata.foo,"aMS",@progbits,4 .p2align 2 val2: .ascii "world \0\0\0\0" .section .rodata.x,"aMS",@progbits,1 val3: .ascii "foobar\0" .section .rodata.cst8,"aM",@progbits,8 .p2align 3 val4: .ascii "abcdefgh" .section .rodatabaz,"aMS",@progbits,1 val5: .ascii "baz\0" EOF cat <<'EOF' | $CC -c -o $t/b.o -xc - #include extern char val1, val2, val3, val4, val5; int main() { printf("%p %p %p %p %p\n", &val1, &val2, &val3, &val4, &val5); } EOF $CC -B. -o $t/exe $t/a.o $t/b.o readelf -p .rodata.str1.1 $t/exe | grep Hello readelf -p .rodata.str4.4 $t/exe | grep world readelf -p .rodata.str1.1 $t/exe | grep foobar readelf -p .rodata.cst8 $t/exe | grep abcdefgh readelf -p .rodatabaz $t/exe | grep baz ================================================ FILE: test/rosegment.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe1 $t/a.o readelf -W --segments $t/exe1 | not grep '\.interp .* \.text' $CC -B. -o $t/exe2 $t/a.o -Wl,--rosegment readelf -W --segments $t/exe2 | not grep '\.interp .* \.text' $CC -B. -o $t/exe3 $t/a.o -Wl,--no-rosegment readelf -W --segments $t/exe3 | grep '\.interp .* \.text' ================================================ FILE: test/rpath.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <& /dev/null || skip cat <<'EOF' | $CC -xc -c -o $t/a.o - #include int main() { printf("Hello\n"); return 0; } EOF LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=`pwd`/mold \ clang -no-pie -o $t/exe $t/a.o -fuse-ld=/usr/bin/ld readelf -p .comment $t/exe | grep mold ================================================ FILE: test/run.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc [ "$CC" = cc ] || skip # ASAN doesn't work with LD_PRELOAD nm mold | grep '__[at]san_init' && skip cat <<'EOF' | $CC -xc -c -o $t/a.o - #include int main() { printf("Hello\n"); return 0; } EOF LD_PRELOAD=`pwd`/mold-wrapper.so MOLD_PATH=`pwd`/mold \ $CC -o $t/exe $t/a.o -B/usr/bin readelf -p .comment $t/exe | grep mold ./mold -run env | grep '^MOLD_PATH=.*/mold$' ./mold -run /usr/bin/ld --version | grep mold ./mold -run /usr/bin/ld.lld --version | grep mold ./mold -run /usr/bin/ld.gold --version | grep mold rm -f $t/ld $t/ld.lld $t/ld.gold $t/foo.ld touch $t/ld $t/ld.lld $t/ld.gold echo "#!/bin/sh" >$t/foo.ld chmod 755 $t/ld $t/ld.lld $t/ld.gold $t/foo.ld ./mold -run $t/ld --version | grep mold ./mold -run $t/ld.lld --version | grep mold ./mold -run $t/ld.gold --version | grep mold ./mold -run $t/foo.ld --version | not grep mold cat <<'EOF' > $t/sh #!/bin/sh $1 --version EOF chmod 755 $t/sh ./mold -run $t/sh ld --version | grep mold ./mold -run $t/sh $t/foo.ld --version |& not grep mold ./mold -run $t/sh $t/ld --version | grep mold ./mold -run $t/sh $t/ld.lld --version | grep mold ./mold -run $t/sh $t/ld.gold --version | grep mold ./mold -run $t/sh $t/foo.ld --version | not grep mold ================================================ FILE: test/section-align.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < __attribute__((section(".fn1"))) void fn1() { printf(" fn1"); } __attribute__((section(".fn2"))) void fn2() { printf(" fn2"); } int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe1 $t/a.o -no-pie \ -Wl,--section-order='=0x100000 EHDR PHDR =0x200000 .fn2 TEXT =0x300000 .fn1 DATA BSS RODATA' $QEMU $t/exe1 | grep Hello readelf -SW $t/exe1 | grep '\.fn2 .*00200000' readelf -SW $t/exe1 | grep '\.fn1 .*00300000' $CC -B. 
-o $t/exe2 $t/a.o -no-pie \ -Wl,--section-order='=0x200000 EHDR RODATA =0x300000 PHDR =0x400000 .fn2 TEXT DATA BSS' readelf -SW $t/exe2 | grep '\.fn2 .*00400000' readelf -sW $t/exe2 | grep -E ': 0+200000\s.*\s__ehdr_start$' readelf -W --segments $t/exe2 | grep -E 'PHDR\s.*0x0+300000\s' $CC -B. -o $t/exe3 $t/a.o -no-pie \ -Wl,--section-order='=0x200000 !ehdr_start EHDR %0x20 !rodata_start RODATA =0x300000 !phdr_start PHDR %4096 !phdr_end =0x400000 !text_start TEXT DATA BSS' readelf -sW $t/exe3 > $t/log3 grep -E '\b0+200000 .* ehdr_start$' $t/log3 grep -E '\b0+200040 .* rodata_start$' $t/log3 grep -E '\b0+300000 .* phdr_start$' $t/log3 grep -E '\b0+301000 .* phdr_end$' $t/log3 grep -E '\b0+400000 .* text_start$' $t/log3 not $CC -B. -o $t/exe4 $t/a.o -no-pie \ -Wl,--section-order='=0x800000 EHDR PHDR =0x200000 .fn2 TEXT =0x300000 .fn1 DATA BSS RODATA' |& grep 'address goes backward' ================================================ FILE: test/section-start.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # On PPC64V1, function pointers do not refer to function entry addresses # but instead refer to "function descriptors" in .opd. [ $MACHINE = ppc64 ] && skip # The crt*.o compiled with B26 ends up too far from the GOT. [[ $MACHINE = loongarch* ]] && skip [ $MACHINE = arm ] && flags=-marm cat < __attribute__((section(".fn1"))) void fn1() { printf(" fn1"); } __attribute__((section(".fn2"))) void fn2() { printf(" fn2"); } int main() { printf("main"); fn1(); fn2(); printf(" %p %p\n", fn1, fn2); } EOF $CC -B. -o $t/exe1 $t/a.o -no-pie \ -Wl,--section-start=.fn1=0x10000000,--section-start=.fn2=0x20000000 $QEMU $t/exe1 | grep 'main fn1 fn2 0x10000000 0x20000000' # PT_LOAD must be sorted on p_vaddr readelf -W --segments $t/exe1 | grep ' LOAD ' | sed 's/0x[0-9a-f]*//' > $t/log1 diff $t/log1 <(sort $t/log1) $CC -B. 
-o $t/exe2 $t/a.o -no-pie \ -Wl,--section-start=.fn1=0x20000000,--section-start=.fn2=0x10000000 $QEMU $t/exe2 | grep 'main fn1 fn2 0x20000000 0x10000000' readelf -W --segments $t/exe2 | grep ' LOAD ' | sed 's/0x[0-9a-f]*//' > $t/log2 diff $t/log2 <(sort $t/log2) ================================================ FILE: test/separate-debug-file-sort.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc nm mold | grep '__tsan_init' && skip command -v flock >& /dev/null || skip cat < $t/a.c int x = 1; void foo() {} EOF $CC -o $t/a.o -c -g -gdwarf32 $t/a.c || skip cat < $t/b.c int y = 1; int main() {} EOF $CC -o $t/b.o -c -g -gdwarf64 $t/b.c MOLD_DEBUG=1 $CC -B. -o $t/exe $t/a.o $t/b.o -g -Wl,--separate-debug-file flock $t/exe.dbg true readelf -p .debug_line_str $t/exe.dbg > $t/str if grep -Fw a.c $t/str; then grep -A10 -Fw a.c $t/str | grep -Fw b.c fi ================================================ FILE: test/separate-debug-file.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc nm mold | grep '__tsan_init' && skip on_qemu && skip command -v gdb >& /dev/null || skip command -v flock >& /dev/null || skip cat < $t/a.c #include int main() { printf("Hello world\n"); } EOF $CC -c -o $t/a.o $t/a.c -g $CC -B. -o $t/exe1 $t/a.o -Wl,--separate-debug-file readelf -SW $t/exe1 | grep -F .gnu_debuglink flock $t/exe1.dbg true gdb $t/exe1 -ex 'list main' -ex 'quit' | grep -F printf readelf -W --sections $t/exe1.dbg | not grep -E '[EPS]HDR' $CC -c -o $t/a.o $t/a.c -g $CC -B. -o $t/exe2 $t/a.o -Wl,--separate-debug-file,--no-build-id readelf -SW $t/exe2 | grep -F .gnu_debuglink flock $t/exe2.dbg true gdb $t/exe2 -ex 'list main' -ex 'quit' | grep -F printf $CC -c -o $t/a.o $t/a.c -g $CC -B. 
-o $t/exe3 $t/a.o -Wl,--separate-debug-file,--compress-debug-sections=zlib readelf -SW $t/exe3 | grep -F .gnu_debuglink flock $t/exe3.dbg true readelf -W --sections $t/exe3.dbg | grep '\.debug_info .*C' gdb $t/exe3 -ex 'list main' -ex 'quit' | grep -F printf ================================================ FILE: test/shared-abs-sym.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # This test is flaky on FreeBSD [ "$(uname)" = FreeBSD ] && skip cat < extern char foo; int main() { printf("foo=%p\n", &foo); } EOF cp $t/a.so $t/c.so $CC -B. -o $t/exe1 $t/d.o $t/c.so -pie || skip $QEMU $t/exe1 | grep 'foo=0x3' || skip cp $t/b.so $t/c.so $QEMU $t/exe1 | grep 'foo=0x5' cp $t/a.so $t/c.so $CC -B. -o $t/exe2 $t/d.o $t/c.so -no-pie $QEMU $t/exe2 | grep 'foo=0x3' cp $t/b.so $t/c.so $QEMU $t/exe1 | grep 'foo=0x5' ================================================ FILE: test/shared.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <<'EOF' | $CC -fPIC -c -o $t/a.o -xc - void fn2(); void fn1() { fn2(); } void fn3() {} EOF $CC -B. -shared -o $t/b.so $t/a.o readelf --dyn-syms $t/b.so > $t/log grep '00000000 0 NOTYPE GLOBAL DEFAULT UND fn2' $t/log grep -E 'FUNC GLOBAL DEFAULT .* fn1' $t/log cat < int fn1(); void fn2() { printf("hello\n"); } int main() { fn1(); return 0; } EOF $CC -B. -o $t/exe $t/c.o $t/b.so $QEMU $t/exe | grep hello readelf --symbols $t/exe | not grep fn3 ================================================ FILE: test/shuffle-sections-seed.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF # Create a lot of sections to lower the probability that # we get the identical output as a result of shuffling. for i in `seq 1 1000`; do echo "void fn$i() {}"; done | $CC -o $t/b.o -ffunction-sections -c -xc - $CC -B. -o $t/exe1 $t/a.o $t/b.o $QEMU $t/exe1 | grep 'Hello world' $CC -B. 
-o $t/exe2 $t/a.o $t/b.o -Wl,-shuffle-sections=42 $QEMU $t/exe2 | grep 'Hello world' $CC -B. -o $t/exe3 $t/a.o $t/b.o -Wl,-shuffle-sections=42 $QEMU $t/exe3 | grep 'Hello world' $CC -B. -o $t/exe4 $t/a.o $t/b.o -Wl,-shuffle-sections=5 $QEMU $t/exe4 | grep 'Hello world' not diff $t/exe1 $t/exe2 >& /dev/null diff $t/exe2 $t/exe3 not diff $t/exe3 $t/exe4 >& /dev/null ================================================ FILE: test/shuffle-sections.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF # Create a lot of sections to lower the probability that # we get the identical output as a result of shuffling. for i in `seq 1 1000`; do echo "void fn$i() {}"; done | $CC -o $t/b.o -ffunction-sections -c -xc - $CC -B. -o $t/exe1 $t/a.o $t/b.o $QEMU $t/exe1 | grep 'Hello world' $CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,-shuffle-sections $QEMU $t/exe2 | grep 'Hello world' not diff $t/exe1 $t/exe2 >& /dev/null ================================================ FILE: test/soname.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/a.c int x = 1; void foo() {} EOF $CC -o $t/a.o -c -g -gdwarf32 $t/a.c || skip cat < $t/b.c int y = 1; int main() {} EOF $CC -o $t/b.o -c -g -gdwarf64 $t/b.c # Test if DWARF32 precedes DWARF64 in the output .debug_info MOLD_DEBUG=1 $CC -B. -o $t/exe1 $t/a.o $t/b.o -g -Wl,-Map=$t/map1 grep -A10 -F '/a.o:(.debug_info)' $t/map1 | grep -F '/b.o:(.debug_info)' readelf -p .debug_line_str $t/exe1 > $t/str1 if grep -Fw a.c $t/str1; then grep -A10 -Fw a.c $t/str1 | grep -Fw b.c fi MOLD_DEBUG=1 $CC -B. 
-o $t/exe2 $t/b.o $t/a.o -g -Wl,-Map=$t/map2 grep -A10 -F '/a.o:(.debug_info)' $t/map2 | grep -F '/b.o:(.debug_info)' readelf -p .debug_line_str $t/exe2 > $t/str2 if grep -Fw a.c $t/str2; then grep -A10 -Fw a.c $t/str2 | grep -Fw b.c fi if [ -z "$QEMU" ] && command -v gdb; then gdb $t/exe1 -ex 'list main' -ex 'quit' | grep 'int y = 1' gdb $t/exe2 -ex 'list main' -ex 'quit' | grep 'int y = 1' fi ================================================ FILE: test/spare-program-headers.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe1 $t/a.o $QEMU $t/exe1 | grep 'Hello world' [ "$(readelf -Wl $t/exe1 | grep NULL | wc -l)" -eq 0 ] $CC -B. -o $t/exe2 $t/a.o -Wl,--spare-program-headers=0 $QEMU $t/exe2 | grep 'Hello world' [ "$(readelf -Wl $t/exe2 | grep NULL | wc -l)" -eq 0 ] $CC -B. -o $t/exe3 $t/a.o -Wl,--spare-program-headers=1 $QEMU $t/exe3 | grep 'Hello world' [ "$(readelf -Wl $t/exe3 | grep NULL | wc -l)" -eq 1 ] $CC -B. -o $t/exe4 $t/a.o -Wl,--spare-program-headers=5 $QEMU $t/exe4 | grep 'Hello world' [ "$(readelf -Wl $t/exe4 | grep NULL | wc -l)" -eq 5 ] ================================================ FILE: test/start-lib.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/log not grep ' foo$' $t/log grep ' bar$' $t/log ================================================ FILE: test/start-stop-symbol.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <<'EOF' | $CC -c -o $t/a.o -xc - __attribute__((section("foo"))) char data[] = "section foo"; EOF ar rcs $t/b.a $t/a.o cat < extern char data[]; extern char __start_foo[]; extern char __stop_foo[]; int main() { printf("%.*s %s\n", (int)(__stop_foo - __start_foo), __start_foo, data); } EOF $CC -B. -o $t/exe $t/c.o $t/b.a $QEMU $t/exe | grep 'section foo section foo' $CC -B. 
-o $t/exe $t/c.o $t/b.a -Wl,-gc-sections $QEMU $t/exe | grep 'section foo section foo' ================================================ FILE: test/start-stop.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int three(); int five(); int main() { printf("%d\n", three() + five()); } EOF rm -f $t/d.a (cd $t; ar rcs d.a long-long-long-filename.o b.o) $CC -B. -Wl,--trace -o $t/exe $t/c.o $t/d.a > $t/log grep -F 'static-archive/d.a(long-long-long-filename.o)' $t/log grep -F 'static-archive/d.a(b.o)' $t/log grep -F static-archive/c.o $t/log $QEMU $t/exe | grep '8' ================================================ FILE: test/static-pie.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc test_cflags -static-pie || skip cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe1 $t/a.o -static-pie $QEMU $t/exe1 | grep 'Hello world' $CC -B. -o $t/exe2 $t/a.o -static-pie -Wl,--no-relax $QEMU $t/exe2 | grep 'Hello world' ================================================ FILE: test/stdout.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); return 0; } EOF $CC -B. -Wl,-build-id=sha1 $t/a.o -o - > $t/exe chmod 755 $t/exe $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/strip-debug.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat < $t/log grep -F _start $t/log grep -F foo $t/log grep -F bar $t/log if [[ $MACHINE != riscv* ]] && [[ $MACHINE != loongarch* ]]; then grep -F .L.baz $t/log fi ./mold -o $t/exe $t/a.o -strip-all readelf --symbols $t/exe > $t/log not grep -F _start $t/log not grep -F foo $t/log not grep -F bar $t/log if [[ $MACHINE != riscv* ]] && [[ $MACHINE != loongarch* ]]; then not grep -F .L.baz $t/log fi ================================================ FILE: test/stt-common.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < /dev/null || skip int foo; int bar; int baz = 42; EOF cat < int foo; int bar = 5; int baz; int main() { printf("%d %d %d\n", foo, bar, baz); } EOF $CC -B. -o $t/exe $t/a.o $t/b.o -Wl,--fatal-warnings $QEMU $t/exe | grep '0 5 42' readelf --sections $t/exe > $t/log grep '.common .*NOBITS' $t/log ================================================ FILE: test/symbol-rank.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < __attribute__((weak)) extern int a; __attribute__((weak)) extern int b; extern int c; __attribute__((weak)) extern int weak_undef; __attribute__((weak)) int weak_def = 5; int common; int main() { (void)common; (void)c; printf("%d %d %d %d\n", !!&a, !!&b, !!&weak_undef, weak_def); } EOF rm -f $t/x.a ar rcs $t/x.a $t/a.o $t/b.o $t/c.o $CC -B. -o $t/exe $t/d.o $t/x.a $QEMU $t/exe | grep '^0 0 0 5$' ================================================ FILE: test/symbol-version-as-needed.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < void foo1() { printf("foo1\n"); } void foo2() { printf("foo2\n"); } __asm__(".symver foo1, foo@ver1"); __asm__(".symver foo2, foo@@ver2"); EOF echo 'ver1 { local: *; }; ver2 { local: *; };' > $t/b.ver $CC -B. -shared -o $t/libfoo.so $t/a.o -Wl,--version-script=$t/b.ver cat < $t/b.ver $CC -B. 
-shared -o $t/c.so $t/a.o -Wl,--version-script=$t/b.ver -flto readelf --symbols $t/c.so > $t/log grep -F 'foo@@VER1' $t/log ================================================ FILE: test/symbol-version-multi.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/b.version TEST1 { local: *; }; TEST2 {}; TEST3 {}; EOF $CC -B. -o $t/c.so -shared $t/a.o -Wl,--version-script=$t/b.version readelf -W --dyn-syms $t/c.so | grep -F 'foo@TEST1' readelf -W --dyn-syms $t/c.so | grep -F 'foo@@TEST2' ================================================ FILE: test/symbol-version.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/b.ver $CC -B. -shared -o $t/c.so $t/a.o -Wl,--version-script=$t/b.ver readelf --symbols $t/c.so > $t/log grep -F 'foo@VER1' $t/log grep -F 'foo@VER2' $t/log grep -F 'foo@@VER3' $t/log ================================================ FILE: test/symbol-version2.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/b.version TEST { global: foo; }; EOF $CC -B. -o $t/c.so -shared $t/a.o -Wl,--version-script=$t/b.version readelf -W --dyn-syms $t/c.so > $t/log grep ' foo@TEST' $t/log grep ' bar@TEST' $t/log grep ' bar1' $t/log not grep ' foo@@TEST' $t/log ================================================ FILE: test/symbol-version3.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/b.version TEST1 { global: foo; }; TEST2 {}; TEST3 {}; EOF $CC -B. -o $t/c.so -shared $t/a.o -Wl,--version-script=$t/b.version readelf -W --dyn-syms $t/c.so > $t/log grep ' foo@@TEST1' $t/log grep ' foo@TEST2' $t/log grep ' foo@TEST3' $t/log not grep ' foo$' $t/log ================================================ FILE: test/symbol-version4.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat < void foo() { printf("foo "); } void foo2() {} void foo3() {} __asm__(".symver foo2, foo@TEST2"); __asm__(".symver foo3, foo@TEST3"); EOF cat < $t/b.version TEST1 { global: foo; }; TEST2 {}; TEST3 {}; EOF $CC -B. -o $t/c.so -shared $t/a.o -Wl,--version-script=$t/b.version cat < void foo(); void bar() { printf("bar "); } void bar2() { foo(); } void bar3() {} __asm__(".symver bar2, bar@TEST2"); __asm__(".symver bar3, bar@TEST3"); EOF cat < $t/e.version TEST1 { global: bar; }; TEST2 {}; TEST3 {}; EOF $CC -B. -o $t/f.so -shared $t/d.o $t/c.so -Wl,--version-script=$t/e.version cat < void foo(); void bar(); int main() { foo(); bar(); printf("\n"); } EOF $CC -B. -o $t/exe $t/g.o $t/f.so $t/c.so $QEMU $t/exe | grep 'foo bar' ================================================ FILE: test/symbol-version5.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/c.version VERSION { global: foo; local: *; }; EOF $CC -B. -shared -o $t/libfoo.so $t/a.o $CC -B. -shared -o $t/libbar.so $t/b.o -Wl,--version-script=$t/c.version cat < int main() { puts("Hello world"); } EOF $CC -B. -o $t/exe $t/a.o nm $t/exe | grep 'U puts$' ================================================ FILE: test/symtab-section-symbols.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe $t/a.o readelf -s $t/exe | grep 'SECTION LOCAL DEFAULT' ================================================ FILE: test/symtab.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat < $t/c.map ./mold -o $t/exe $t/a.o $t/b.o --version-script=$t/c.map readelf --symbols $t/exe > $t/log grep -E '0 NOTYPE LOCAL DEFAULT .* local1' $t/log grep -E '0 NOTYPE LOCAL DEFAULT .* local2' $t/log grep -E '0 NOTYPE LOCAL DEFAULT .* module_local' $t/log grep -E '0 NOTYPE GLOBAL DEFAULT .* foo' $t/log grep -E '0 NOTYPE GLOBAL DEFAULT .* bar' $t/log grep -E '0 NOTYPE GLOBAL DEFAULT .* this_is_global' $t/log ================================================ FILE: test/synthetic-symbols.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < #include #include extern char __ehdr_start[]; extern char __executable_start[]; extern char __dso_handle[]; extern char _end[]; extern char end[]; extern char _etext[]; extern char etext[]; extern char _edata[]; extern char edata[]; extern char __start_foo[]; extern char __stop_foo[]; int main() { assert(_end); assert(_end == end); assert(_etext); assert(_etext == etext); assert(_edata); assert(_edata == edata); printf("__ehdr_start=%p\n", &__ehdr_start); printf("__executable_start=%p\n", &__executable_start); printf("__dso_handle=%p\n", &__dso_handle); printf("%.*s\n", (int)(__stop_foo - __start_foo), __start_foo); } EOF $CC -B. 
-no-pie -Wl,--image-base=0x40000 -o $t/exe $t/a.o $t/b.o $QEMU $t/exe > $t/log grep '^__ehdr_start=0x40000$' $t/log grep '^__executable_start=0x40000$' $t/log grep '^__dso_handle=' $t/log grep '^section foo$' $t/log # Make sure that synthetic symbols overwrite existing ones cat < #include #include char __ehdr_start[] = "foo"; char __executable_start[] = "foo"; char _end[] = "foo"; char end[] = "foo"; char _etext[] = "foo"; char etext[] = "foo"; char _edata[] = "foo"; char edata[] = "foo"; char __start_foo[] = "foo"; char __stop_foo[] = "foo"; int main() { assert(_end); assert(_end != end); assert(_etext); assert(_etext != etext); assert(_edata); assert(_edata != edata); printf("end=%s\n", end); printf("etext=%s\n", etext); printf("edata=%s\n", edata); printf("__ehdr_start=%p\n", &__ehdr_start); printf("__executable_start=%p\n", &__executable_start); printf("%.*s\n", (int)(__stop_foo - __start_foo), __start_foo); } EOF $CC -B. -no-pie -Wl,--image-base=0x40000 -o $t/exe $t/a.o $t/c.o $QEMU $t/exe > $t/log grep '^end=foo$' $t/log grep '^etext=foo$' $t/log grep '^edata=foo$' $t/log grep '^__ehdr_start=0x40000$' $t/log grep '^__executable_start=0x40000$' $t/log grep '^section foo$' $t/log ================================================ FILE: test/sysroot-linker-script.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc [ $MACHINE = armeb ] && skip [[ $MACHINE = loongarch* ]] && skip cat < $t/foo/bar/b.script INPUT(/foo/bar/libfoo.a) EOF cat <& /dev/null not $CC -B. -o $t/exe $t/c.o -Wl,--sysroot=$t -Wl,-Lfoo/bar -lfoo >& /dev/null ================================================ FILE: test/sysroot2.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc [ $MACHINE = armeb ] && skip [[ $MACHINE = loongarch* ]] && skip mkdir -p $t/bin $t/sysroot/foo cat < $t/a.script INPUT(=/foo/x.o) EOF cat < $t/sysroot/b.script INPUT(/foo/y.o) EOF cat < void hello() { printf("Hello world\n"); } EOF cat < void hello2() { printf("Hello world\n"); } EOF cat < int add2(int n); int main() { printf("%d\n", add2(40)); return 0; } EOF $CC -B. -o $t/exe $t/a.o $t/b.o $t/c.o $QEMU $t/exe | grep '42' if [ $MACHINE = riscv32 -o $MACHINE = riscv64 ]; then $OBJDUMP -d $t/exe | grep bfed # c.j pc - 6 fi ================================================ FILE: test/tbss-only.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < __attribute__((section(".text"))) int (*fn)(const char *s) = puts; void hello() { puts("Hello world"); } EOF $CC -o $t/exe $t/a.o $t/b.o -no-pie $QEMU $t/exe || skip $CC -B. -o $t/exe $t/a.o $t/b.o -no-pie $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/textrel2.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < void hello() { puts("Hello world"); } __attribute__((section(".text"))) void (*p)() = hello; int main() { p(); } EOF $CC -o $t/exe1 $t/a.o -pie $QEMU $t/exe1 || skip $CC -B. -o $t/exe2 $t/a.o -pie $QEMU $t/exe2 | grep 'Hello world' $CC -o $t/exe3 $t/a.o -pie -Wl,-z,pack-relative-relocs 2> /dev/null || skip readelf -WS $t/exe3 | grep -F .relr.dyn || skip $QEMU $t/exe3 2> /dev/null | grep 'Hello world' || skip $CC -B. -o $t/exe4 $t/a.o -pie -Wl,-z,pack-relative-relocs $QEMU $t/exe4 | grep 'Hello world' ================================================ FILE: test/thin-archive.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int three(); int five(); int seven(); int main() { printf("%d\n", three() + five() + seven()); } EOF rm -f $t/d.a (cd $t; ar rcsT d.a long-long-long-filename.o b.o "`pwd`"/c.o) $CC -B. 
-Wl,--trace -o $t/exe $t/d.o $t/d.a > $t/log grep -E 'thin-archive/d.a\(.*long-long-long-filename.o\)' $t/log grep -E 'thin-archive/d.a\((.*/)?b.o\)' $t/log grep -F thin-archive/d.o $t/log $QEMU $t/exe | grep 15 ================================================ FILE: test/thread-count.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe $t/a.o -Wl,-no-threads $CC -B. -o $t/exe $t/a.o -Wl,-thread-count=1 $CC -B. -o $t/exe $t/a.o -Wl,-threads $CC -B. -o $t/exe $t/a.o -Wl,-threads=1 $CC -B. -o $t/exe $t/a.o -Wl,--threads=1 ================================================ FILE: test/tls-alignment-multi.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # Test a tricky case of TLS alignment requirement where not only the virtual # address of a symbol but also its offset against the TLS base address has to # be aligned. # # On glibc, this issue requires a TLS model equivalent to global-dynamic in # order to be triggered. cat < // .tdata _Thread_local int x = 42; // .tbss __attribute__((aligned(64))) _Thread_local int y; void *test(void *unused) { printf("%p %lu", &y, (unsigned long)&y % 64); return NULL; } EOF cat < #include #include int main() { void *handle = dlopen("c.so", RTLD_NOW); void *(*test)(void *) = dlsym(handle, "test"); pthread_t th; test(NULL); printf(" "); pthread_create(&th, NULL, test, NULL); pthread_join(th, NULL); printf("\n"); } EOF $CC -B. -shared -o $t/c.so $t/a.o $CC -B. -ldl -pthread -o $t/exe $t/b.o -Wl,-rpath,$t $QEMU $t/exe | grep -E '^0x[0-9a-f]+ 0 0x[0-9a-f]+ 0$' ================================================ FILE: test/tls-common.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < extern _Thread_local int foo; int main() { printf("foo=%d\n", foo); } EOF $CC -B. 
-o $t/exe $t/a.o $t/b.o readelf -WS $t/exe | grep -F .tls_common $QEMU $t/exe | grep '^foo=0$' ================================================ FILE: test/tls-df-static-tls.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < static _Thread_local int foo = 5; int bar() { return foo; } EOF $CC -B. -shared -o $t/b.so $t/a.o -Wl,--relax readelf --dynamic $t/b.so | grep STATIC_TLS $CC -B. -shared -o $t/c.so $t/a.o -Wl,--no-relax readelf --dynamic $t/c.so | grep STATIC_TLS ================================================ FILE: test/tls-dso.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < _Thread_local int foo; extern _Thread_local int bar; int get_foo1(); int get_bar1(); int get_foo2() { return foo; } int get_bar2() { return bar; } int main() { foo = 5; bar = 3; printf("%d %d %d %d %d %d\n", foo, bar, get_foo1(), get_bar1(), get_foo2(), get_bar2()); return 0; } EOF $CC -B. -o $t/exe $t/a.so $t/b.o $QEMU $t/exe | grep '5 3 5 3 5 3' ================================================ FILE: test/tls-gd-dlopen.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < #include #include #include int main(int argc, char **argv) { void *handle = dlopen(argv[1], RTLD_LAZY); if (!handle) { fprintf(stderr, "dlopen failed: %s: %s: \n", argv[1], dlerror()); exit(1); } int (*get)(int) = dlsym(handle, "get_foo"); assert(get); printf("%d %d %d\n", get(0), get(1), get(9999)); } EOF $CC -B. -o $t/exe $t/c.o -ldl $QEMU $t/exe $t/b.so | grep '3 0 5' ================================================ FILE: test/tls-gd-noplt.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat < __attribute__((tls_model("global-dynamic"))) static _Thread_local int x1 = 1; __attribute__((tls_model("global-dynamic"))) static _Thread_local int x2; __attribute__((tls_model("global-dynamic"))) extern _Thread_local int x3; __attribute__((tls_model("global-dynamic"))) extern _Thread_local int x4; int get_x5(); int get_x6(); int main() { x2 = 2; printf("%d %d %d %d %d %d\n", x1, x2, x3, x4, get_x5(), get_x6()); return 0; } EOF cat < __attribute__((tls_model("global-dynamic"))) static _Thread_local int x1 = 1; __attribute__((tls_model("global-dynamic"))) _Thread_local int x2 = 2; __attribute__((tls_model("global-dynamic"))) _Thread_local int x3; int foo() { x3 = 3; printf("%d %d %d\n", x1, x2, x3); return 0; } EOF cat < $t/a.c #include __attribute__((tls_model("global-dynamic"))) static _Thread_local int x1 = 1; __attribute__((tls_model("global-dynamic"))) static _Thread_local int x2; __attribute__((tls_model("global-dynamic"))) extern _Thread_local int x3; __attribute__((tls_model("global-dynamic"))) extern _Thread_local int x4; int get_x5(); int get_x6(); int main() { x2 = 2; printf("%d %d %d %d %d %d\n", x1, x2, x3, x4, get_x5(), get_x6()); return 0; } EOF $CC -fPIC -c -o $t/b.o $t/a.c $CC -fPIC -c -o $t/c.o $t/a.c -O2 cat < __attribute__((tls_model("initial-exec"))) static _Thread_local int foo; __attribute__((tls_model("initial-exec"))) static _Thread_local int bar; void set() { foo = 3; bar = 5; } void print() { printf("%d %d ", foo, bar); } EOF $CC -B. -shared -o $t/b.so $t/a.o cat < _Thread_local int baz; void set(); void print(); int main() { baz = 7; print(); set(); print(); printf("%d\n", baz); } EOF $CC -B. -o $t/exe $t/b.so $t/c.o $QEMU $t/exe | grep '^0 0 3 5 7$' $CC -B. -o $t/exe $t/b.so $t/c.o -Wl,-no-relax $QEMU $t/exe | grep '^0 0 3 5 7$' ================================================ FILE: test/tls-irregular-start-addr.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat < _Thread_local char x1 = 42; int main() { printf("%d\n", x1); } EOF $CC -B. -o $t/exe1 $t/a.o -pie -Wl,-section-start=.tdata=0x100001 -Wl,-relax $QEMU $t/exe1 | grep '^42$' $CC -B. -o $t/exe2 $t/a.o -pie -Wl,-section-start=.tdata=0x100001 -Wl,-no-relax $QEMU $t/exe2 | grep '^42$' $CC -B. -o $t/exe3 $t/a.o -pie -Wl,-section-start=.tdata=0x10000f -Wl,-relax $QEMU $t/exe3 | grep '^42$' $CC -B. -o $t/exe4 $t/a.o -pie -Wl,-section-start=.tdata=0x10000f -Wl,-no-relax $QEMU $t/exe4 | grep '^42$' ================================================ FILE: test/tls-large-alignment.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < extern _Thread_local int x; extern _Thread_local int y[]; int main() { printf("%lu %d %d %d %d\n", (unsigned long)&x % 256, x, y[0], y[1], y[2]); } EOF $CC -B. -shared -o $t/d.so $t/a.o $t/b.o $CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o $QEMU $t/exe1 | grep '^0 42 1 2 3$' $CC -B. -o $t/exe2 $t/c.o $t/d.so $QEMU $t/exe2 | grep '^0 42 1 2 3$' ================================================ FILE: test/tls-large-static-image.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < extern _Thread_local int x[]; int main() { printf("%d %d %d %d %d\n", x[0], x[1], x[2], x[3], x[10000]); } EOF $CC -B. -o $t/exe $t/a.o $t/b.o $QEMU $t/exe | grep '^1 2 3 0 5$' ================================================ FILE: test/tls-ld-noplt.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat < extern _Thread_local int foo; static _Thread_local int bar; int *get_foo_addr() { return &foo; } int *get_bar_addr() { return &bar; } int main() { bar = 5; printf("%d %d %d %d\n", *get_foo_addr(), *get_bar_addr(), foo, bar); return 0; } EOF cat < $t/a.c #include extern _Thread_local int foo; static _Thread_local int bar; int *get_foo_addr() { return &foo; } int *get_bar_addr() { return &bar; } int main() { bar = 5; printf("%d %d %d %d\n", *get_foo_addr(), *get_bar_addr(), foo, bar); return 0; } EOF $CC -fPIC -ftls-model=local-dynamic -c -o $t/b.o $t/a.c $CC -fPIC -ftls-model=local-dynamic -c -o $t/c.o $t/a.c -O2 cat < __attribute__((tls_model("local-exec"))) extern _Thread_local int foo; __attribute__((tls_model("local-exec"))) static _Thread_local int bar; int *get_foo_addr() { return &foo; } int *get_bar_addr() { return &bar; } int main() { bar = 5; printf("%d %d %d %d\n", *get_foo_addr(), *get_bar_addr(), foo, bar); return 0; } EOF cat < __attribute__((tls_model("global-dynamic"))) extern _Thread_local int foo; __attribute__((tls_model("global-dynamic"))) static _Thread_local int bar; int *get_foo_addr() { return &foo; } int *get_bar_addr() { return &bar; } int main() { foo = 3; bar = 5; printf("%d %d %d %d\n", *get_foo_addr(), *get_bar_addr(), foo, bar); return 0; } EOF cat < __attribute__((tls_model("global-dynamic"))) extern _Thread_local int foo; __attribute__((tls_model("global-dynamic"))) static _Thread_local int bar; int *get_foo_addr() { return &foo; } int *get_bar_addr() { return &bar; } int main() { bar = 5; printf("%d %d %d %d\n", *get_foo_addr(), *get_bar_addr(), foo, bar); return 0; } EOF cat < extern _Thread_local char x; int main() { printf("%d\n", x); } EOF $CC -B. -shared -o $t/d.so $t/a.o $t/b.o $CC -B. -o $t/exe1 $t/a.o $t/b.o $t/c.o $QEMU $t/exe1 | grep '^42$' $CC -B. 
-o $t/exe2 $t/c.o $t/d.so $QEMU $t/exe2 | grep '^42$' ================================================ FILE: test/tlsdesc-dlopen.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc supports_tlsdesc || skip cat < #include #include #include int main(int argc, char **argv) { void *handle = dlopen(argv[1], RTLD_LAZY); if (!handle) { fprintf(stderr, "dlopen failed: %s: %s: \n", argv[1], dlerror()); exit(1); } int (*get)(int) = dlsym(handle, "get_foo"); assert(get); printf("%d %d %d\n", get(0), get(1), get(9999)); } EOF $CC -B. -o $t/exe $t/c.o -ldl $QEMU $t/exe $t/b.so | grep '3 0 5' ================================================ FILE: test/tlsdesc-import.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc supports_tlsdesc || skip cat < extern _Thread_local int foo; extern _Thread_local int bar; int main() { bar = 7; printf("%d %d\n", foo, bar); } EOF cat < extern _Thread_local int foo; int get_foo1(); int get_foo2() { return foo; } int main() { printf("%d %d %d\n", foo, get_foo1(), get_foo2()); } EOF $CC -B. -o $t/exe1 $t/c.o $t/d.o $t/b.so $QEMU $t/exe1 | grep '^5 5 5$' $OBJDUMP --dynamic-reloc $t/exe1 | not grep -E 'TLS_?DESC' $CC -B. -o $t/exe2 $t/c.o $t/d.o $t/b.so -Wl,--no-relax $QEMU $t/exe2 | grep '^5 5 5$' $OBJDUMP --dynamic-reloc $t/exe2 | grep -E 'TLS_?DESC' ================================================ FILE: test/tlsdesc-local-dynamic.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc supports_tlsdesc || skip cat < _Thread_local int foo; int get_foo(); int get_bar(); int main() { foo = 42; printf("%d %d\n", get_foo(), get_bar()); return 0; } EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o $QEMU $t/exe1 | grep '42 5' $CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,--no-relax $QEMU $t/exe2 | grep '42 5' ================================================ FILE: test/tlsdesc-static.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc supports_tlsdesc || skip test_cflags -static || skip cat < extern _Thread_local int foo; int main() { foo = 42; printf("%d\n", foo); } EOF cat < _Thread_local int foo; extern _Thread_local int bar; int get_foo(); int get_baz(); int main() { foo = 42; printf("%d %d %d\n", get_foo(), bar, get_baz()); return 0; } EOF $CC -B. -o $t/exe1 $t/a.o $t/b.o $QEMU $t/exe1 | grep '42 3 5' $CC -B. -o $t/exe2 $t/a.o $t/b.o -Wl,-no-relax $QEMU $t/exe2 | grep '42 3 5' $CC -B. -shared -o $t/c.so $t/a.o $CC -B. -o $t/exe3 $t/b.o $t/c.so $QEMU $t/exe3 | grep '42 3 5' $CC -B. -shared -o $t/c.so $t/a.o -Wl,-no-relax $CC -B. -o $t/exe4 $t/b.o $t/c.so -Wl,-no-relax $QEMU $t/exe4 | grep '42 3 5' ================================================ FILE: test/trace-symbol-symver.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/b.version VER1 { local: *; }; VER2 { local: *; }; VER3 { local: *; }; EOF $CC -B. -o $t/c.so -shared $t/a.o -Wl,--version-script=$t/b.version \ -Wl,--trace-symbol='foo@VER1' > /dev/null cat < /dev/null $QEMU $t/exe ================================================ FILE: test/trace-symbol.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < void foo(); void bar() { foo(); printf("Hello world\n"); } EOF cat < $t/log grep 'trace-symbol: .*/a.o: reference to foo' $t/log grep 'trace-symbol: .*/b.o: definition of foo' $t/log grep 'trace-symbol: .*/c.so: definition of baz' $t/log ================================================ FILE: test/trace.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); return 0; } EOF $CC -B. -o $t/exe $t/a.o -Wl,-trace > $t/log grep '/a\.o$' $t/log ================================================ FILE: test/undefined-glob-gc-sections.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc test_cflags -static || skip cat < $t/log2 grep foo $t/log2 grep foobar $t/log2 not grep baz $t/log2 ================================================ FILE: test/undefined-glob.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc test_cflags -static || skip cat < $t/log1 not grep foo $t/log1 not grep foobar $t/log1 not grep baz $t/log1 $CC -B. -o $t/exe2 $t/d.a $t/e.o -Wl,--undefined-glob='foo*' readelf -W --symbols $t/exe2 > $t/log2 grep foo $t/log2 grep foobar $t/log2 not grep baz $t/log2 ================================================ FILE: test/undefined.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc test_cflags -static || skip cat < $t/log not grep foo $t/log not grep bar $t/log ./mold -static -o $t/exe $t/a.o $t/d.a -u foo readelf --symbols $t/exe > $t/log grep foo $t/log not grep bar $t/log ./mold -static -o $t/exe $t/a.o $t/d.a -u foo --undefined=bar readelf --symbols $t/exe > $t/log grep foo $t/log grep bar $t/log ================================================ FILE: test/undefined2.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc test_cflags -static || skip cat < $t/log grep foo $t/log not grep ndefined $t/log ================================================ FILE: test/unknown-section-type.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < /dev/null || skip .section .my_section,"a",@0x80000000 EOF not $CC -B. -o $t/exe $t/a.o |& grep 'unsupported section type: 0x80000000' ================================================ FILE: test/unresolved-symbols.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF $CC -B. 
-Wl,--verbose -o $t/exe $t/a.o > /dev/null ================================================ FILE: test/version-script-search-paths.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc mkdir -p $t/foo/bar echo 'ver_x { global: *; };' > $t/foo/bar/a.ver cat < $t/b.s .globl foo, bar, baz foo: nop bar: nop baz: nop EOF $CC -B. -shared -o $t/c.so -Wl,-L$t/foo/bar -Wl,-version-script,a.ver $t/b.s readelf --version-info $t/c.so > $t/log grep -F 'Rev: 1 Flags: none Index: 2 Cnt: 1 Name: ver_x' $t/log ================================================ FILE: test/version-script.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc echo 'ver_x { global: *; };' > $t/a.ver cat < $t/b.s .globl foo, bar, baz foo: nop bar: nop baz: nop EOF $CC -B. -shared -o $t/c.so -Wl,-version-script,$t/a.ver $t/b.s readelf --version-info $t/c.so > $t/log grep -F 'Rev: 1 Flags: BASE Index: 1 Cnt: 1 Name: c.so' $t/log grep -F 'Rev: 1 Flags: none Index: 2 Cnt: 1 Name: ver_x' $t/log ================================================ FILE: test/version-script10.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc echo 'VER1 { foo[12]; }; VER2 {};' > $t/a.ver cat < $t/b.s .globl foo1, foo2, foo3 foo1: nop foo2: nop foo3: nop EOF $CC -B. -shared -o $t/c.so -Wl,-version-script,$t/a.ver $t/b.s readelf --dyn-syms $t/c.so > $t/log grep ' foo1@@VER1$' $t/log grep ' foo2@@VER1$' $t/log not grep ' foo3@@VER1$' $t/log ================================================ FILE: test/version-script11.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <<'EOF' > $t/a.ver VER_X1 { global: *; local: b*; }; EOF cat < $t/log grep 'foo@@VER_X1' $t/log not grep ' bar' $t/log not grep ' baz' $t/log ================================================ FILE: test/version-script12.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat <<'EOF' > $t/a.ver { global: *; *foo_*; local: *foo*; }; EOF cat < $t/log grep ' xyz' $t/log grep ' foo_bar' $t/log not grep ' foo$' $t/log ================================================ FILE: test/version-script13.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <<'EOF' > $t/a.ver { global: *; local: foo; }; EOF cat < $t/log grep ' foobar' $t/log not grep ' foo$' $t/log ================================================ FILE: test/version-script14.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <<'EOF' > $t/a.ver { local: *; global: xyz; foo*bar*[abc]x; }; EOF cat < $t/log grep ' xyz' $t/log not grep ' foobarzx' $t/log grep ' foobarcx' $t/log grep ' foo123bar456bx' $t/log not grep ' foo123bar456c' $t/log not grep ' foo123bar456x' $t/log ================================================ FILE: test/version-script15.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <<'EOF' > $t/a.ver { local: *; global: [abc][^abc][^\]a-zABC]; }; EOF cat < $t/log grep ' azZ' $t/log grep ' czZ' $t/log not grep ' azC' $t/log not grep ' aaZ' $t/log ================================================ FILE: test/version-script16.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <<'EOF' > $t/a.ver { local: *; global: extern "C++" { *foo*; }; }; EOF cat < $t/c.ver { local: *; global: xyz; }; EOF $CC -B. -o $t/exe2 $t/a.o $t/b.so -Wl,--version-script=$t/c.ver -Wl,--undefined-version nm -g $t/exe2 | not grep foo cat <<'EOF' > $t/d.ver { local: *; }; EOF $CC -B. -o $t/exe3 $t/a.o $t/b.so -Wl,--version-script=$t/d.ver nm -g $t/exe3 | not grep foo ================================================ FILE: test/version-script18.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # Test version script precedence. 
cat <<'EOF' > $t/a.ver { global: extern "C++" { *libalpha::*; }; local: *libbeta*; }; EOF cat < void foo() {} template void foo(); } EOF $CC -B. -shared -Wl,--version-script=$t/a.ver -o $t/c.so $t/b.o readelf --wide --dyn-syms $t/c.so | grep libalpha | grep Bar ================================================ FILE: test/version-script19.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <<'EOF' > $t/a.ver { local: extern "C++" { foo*; }; }; EOF cat < $t/log not grep -E foobar $t/log grep -E 'GLOBAL.*baz' $t/log ================================================ FILE: test/version-script2.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/a.ver ver1 { global: foo; local: *; }; ver2 { global: bar; }; ver3 { global: baz; }; EOF cat < $t/log grep -F 'foo@ver1' $t/log grep -F 'bar@ver2' $t/log grep -F 'baz@ver3' $t/log ================================================ FILE: test/version-script20.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <<'EOF' > $t/a.ver VER1 { foo*; }; VER2 { foo_x; }; EOF cat < $t/log grep -F 'foo_x@@VER2' $t/log grep -F 'foo_y@@VER1' $t/log grep -F 'foo_z@@VER1' $t/log ================================================ FILE: test/version-script21.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <<'EOF' > $t/a.ver VER1 { foo_x; }; VER2 { foo*; }; EOF cat < $t/log grep -F 'foo_x@@VER1' $t/log grep -F 'foo_y@@VER2' $t/log grep -F 'foo_z@@VER2' $t/log ================================================ FILE: test/version-script22.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <<'EOF' > $t/a.ver VER1 { foo*; }; VER2 { foo*bar*; }; EOF cat < $t/log grep -F 'foo_bar@@VER2' $t/log ================================================ FILE: test/version-script23.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat <<'EOF' > $t/a.ver VER1 { foo\?; }; EOF cat <& /dev/null || skip .globl "foo?" "foo?": EOF $CC -B. -shared -Wl,--version-script=$t/a.ver -o $t/c.so $t/b.o readelf -W --dyn-syms $t/c.so > $t/log grep -F 'foo?@@VER1' $t/log ================================================ FILE: test/version-script3.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/a.ver ver1 { global: f*o; local: *; }; ver2 { global: b*; }; EOF cat < $t/log grep -F 'foo@ver1' $t/log grep -F 'bar@ver2' $t/log grep -F 'baz@ver2' $t/log ================================================ FILE: test/version-script4.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/a.ver { global: extern "C++" { foo::bar; }; local: *; }; EOF cat < $t/log grep -F _ZN3foo3barE $t/log not grep -F ' bar' $t/log ================================================ FILE: test/version-script5.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/a.ver { extern "C" { foo }; local: *; }; EOF cat < $t/log grep -F foo $t/log not grep -F ' main' $t/log ================================================ FILE: test/version-script6.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <<'EOF' > $t/a.ver VER_X1 { foo; }; VER_X2 { bar; }; EOF cat < $t/d.ver VER_Y1 { local; *; }; VER_Y2 { baz; }; VER_Y3 { foo; }; EOF cat < $t/log grep 'foo@VER_X1' $t/log grep 'bar@VER_X2' $t/log grep 'baz@@VER_Y2' $t/log ================================================ FILE: test/version-script7.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <<'EOF' > $t/a.ver VER_X1 { *; }; EOF cat < $t/log grep 'foo$' $t/log grep 'bar@@VER_X1' $t/log ================================================ FILE: test/version-script8.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat < $t/a.ver ver1 { global: ?oo; local: *; }; ver2 { global: b?r; }; EOF cat < $t/log grep -F 'foo@@ver1' $t/log grep -F 'bar@@ver2' $t/log not grep -F 'baz' $t/log ================================================ FILE: test/version-script9.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc echo 'VER1 { extern "C++" {}; foo; }; VER2 {};' > $t/a.ver cat < $t/b.s .globl foo, bar, baz foo: nop bar: nop baz: nop EOF $CC -B. -shared -o $t/c.so -Wl,-version-script,$t/a.ver $t/b.s readelf --dyn-syms $t/c.so > $t/log grep ' foo@@VER1$' $t/log ================================================ FILE: test/version.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # OneTBB isn't tsan-clean nm mold | grep '__tsan_init' && skip ./mold -v | grep 'mold .*compatible with GNU ld' ./mold --version | grep 'mold .*compatible with GNU ld' ./mold -V | grep 'mold .*compatible with GNU ld' ./mold -V | grep elf_x86_64 ./mold -V | grep elf_i386 cat < int main() { printf("Hello world\n"); } EOF rm -f $t/exe $CC -B. -Wl,--version -o $t/exe1 $t/a.o |& grep mold not [ -f $t/exe1 ] $CC -B. -Wl,-v -o $t/exe2 $t/a.o |& grep mold $QEMU $t/exe2 | grep 'Hello world' not ./mold --v |& grep 'unknown command line option:' ================================================ FILE: test/versioned-undef.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc # Skip if libc is musl because musl does not fully support GNU-style # symbol versioning. is_musl && skip cat < $t/b.ver $CC -B. -shared -o $t/c.so $t/a.o -Wl,--version-script=$t/b.ver cat < int foo1(); int foo2(); int foo3(); int foo(); int bar(); __asm__(".symver foo1, foo@VER1"); __asm__(".symver foo2, foo@VER2"); __asm__(".symver foo3, foo@VER3"); __asm__(".symver bar, bar@VER3"); int main() { printf("%d %d %d %d %d\n", foo1(), foo2(), foo3(), foo(), bar()); } EOF $CC -B. 
-o $t/exe $t/d.o $t/c.so $QEMU $t/exe | grep '^1 2 3 3 4$' ================================================ FILE: test/visibility.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <& $t/log [ $(grep 'undefined symbol:.* foo$' $t/log | wc -l) = 1 ] ================================================ FILE: test/warn-symbol-type.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int times = -1; // times collides with clock_t times(struct tms *buffer) int main() { printf ("times: %d\n", times); } EOF $CC -B. -shared -o $t/b.so $t/a.o |& grep 'warning: symbol type mismatch: times' ================================================ FILE: test/warn-unresolved-symbols.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < __attribute__((weak)) int foo(); int main() { printf("%d\n", foo ? foo() : 3); } EOF $CC -B. -o $t/b.so $t/a.o -shared $CC -B. -o $t/c.so $t/a.o -shared -Wl,-z,defs readelf --dyn-syms $t/b.so | grep 'WEAK DEFAULT UND foo' readelf --dyn-syms $t/c.so | grep 'WEAK DEFAULT UND foo' ================================================ FILE: test/weak-export-dso2.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < __attribute__((weak)) int foo(); int main() { printf("%d\n", foo ? foo() : 3); } EOF $CC -B. -o $t/d.so $t/c.o $t/b.so -shared readelf -W --dyn-syms $t/d.so | grep 'WEAK DEFAULT .* UND foo' ================================================ FILE: test/weak-export-exe.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < __attribute__((weak)) int foo(); int main() { printf("%d\n", foo ? foo() : 3); } EOF $CC -B. 
-o $t/exe $t/a.o readelf --dyn-syms $t/exe | not grep 'WEAK DEFAULT UND foo' $QEMU $t/exe | grep '^3$' ================================================ FILE: test/weak-undef-dso.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int bar(); int main() { printf("bar=%d\n", bar()); } EOF $CC -B. -o $t/exe1 $t/c.o $t/b.so $QEMU $t/exe1 | grep 'bar=-1' cat < int foo() { return 5; } int bar(); int main() { printf("bar=%d\n", bar()); } EOF $CC -B. -o $t/exe2 $t/d.o $t/b.so $QEMU $t/exe2 | grep 'bar=5' ================================================ FILE: test/weak-undef.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < __attribute__((weak)) int foo(); int main() { printf("%d\n", foo ? foo() : -1); } EOF cat < __attribute__((weak)) int foo(); int main() { printf("%d\n", foo ? foo() : -1); } EOF cat < __attribute__((weak)) int foo(); int main() { printf("%d\n", foo ? foo() : -1); } EOF cat < __attribute__((weak)) int foo(); int bar(); int main() { bar(); printf("%d\n", foo ? foo() : -1); } EOF cat < __attribute__((weak)) int foo(); int main() { printf("%d\n", foo ? foo() : -1); } EOF cat < __attribute__((weak)) int foo(); int main() { printf("%d\n", foo ? foo() : -1); } EOF cat < int foo() { return 2; } EOF $CC -B. -o $t/libfoobar.so $t/b.o -shared $CC -B. -o $t/exe $t/a.o -Wl,--as-needed -L$t -lfoobar -Wl,-rpath,$t readelf --dynamic $t/exe | grep 'NEEDED.*libfoobar' $QEMU $t/exe | grep '^2$' ================================================ FILE: test/whole-archive.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < $t/log not grep fn1 $t/log not grep fn2 $t/log $CC -B. -nostdlib -o $t/exe $t/a.o -Wl,--whole-archive $t/d.a readelf --symbols $t/exe > $t/log grep fn1 $t/log grep fn2 $t/log $CC -B. 
-nostdlib -o $t/exe $t/a.o -Wl,--whole-archive \ -Wl,--no-whole-archive $t/d.a readelf --symbols $t/exe > $t/log not grep fn1 $t/log not grep fn2 $t/log ================================================ FILE: test/wrap-lto.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc test_cflags -flto || skip cat < void foo() { printf("foo\n"); } EOF cat < void foo(); void __wrap_foo() { printf("wrap_foo\n"); } int main() { foo(); } EOF cat < void __real_foo(); int main() { __real_foo(); } EOF $CC -B. -o $t/exe $t/a.so $t/b.o -flto $QEMU $t/exe | grep '^foo$' $CC -B. -o $t/exe $t/a.so $t/b.o -Wl,-wrap,foo -flto $QEMU $t/exe | grep '^wrap_foo$' $CC -B. -o $t/exe $t/a.so $t/c.o -Wl,-wrap,foo -flto $QEMU $t/exe | grep '^foo$' ================================================ FILE: test/wrap.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < void foo() { printf("foo\n"); } EOF cat < void foo(); void __wrap_foo() { printf("wrap_foo\n"); } int main() { foo(); } EOF cat < void __real_foo(); int main() { __real_foo(); } EOF $CC -B. -o $t/exe $t/a.so $t/b.o $QEMU $t/exe | grep '^foo$' $CC -B. -o $t/exe $t/a.so $t/b.o -Wl,-wrap,foo $QEMU $t/exe | grep '^wrap_foo$' $CC -B. -o $t/exe $t/a.so $t/c.o -Wl,-wrap,foo $QEMU $t/exe | grep '^foo$' ================================================ FILE: test/z-cet-report.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat <& $t/log grep 'a.o: -cet-report=warning: missing GNU_PROPERTY_X86_FEATURE_1_IBT' $t/log grep 'a.o: -cet-report=warning: missing GNU_PROPERTY_X86_FEATURE_1_SHSTK' $t/log not $CC -B. 
-o $t/exe $t/a.o -Wl,-z,cet-report=error >& $t/log grep 'a.o: -cet-report=error: missing GNU_PROPERTY_X86_FEATURE_1_IBT' $t/log grep 'a.o: -cet-report=error: missing GNU_PROPERTY_X86_FEATURE_1_SHSTK' $t/log ================================================ FILE: test/z-defs.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int foo() { return 1; } int bar(); int main() { printf("%d\n", bar()); } EOF $CC -B. -o $t/exe1 $t/e.o $t/b.so $QEMU $t/exe1 | grep 1 $CC -B. -o $t/exe2 $t/e.o $t/c.so $QEMU $t/exe2 | grep 1 $CC -B. -o $t/exe3 $t/e.o $t/d.so $QEMU $t/exe3 | grep 0 ================================================ FILE: test/z-dynamic-undefined-weak2.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < __attribute__((weak)) extern int foo; int main() { printf("%p\n", &foo); } EOF if $CC -B. -o $t/exe $t/a.o -no-pie -Wl,-z,dynamic-undefined-weak >& $t/log; then $QEMU $t/exe | grep -F '(nil)' else grep 'recompile with -fPIE or -fPIC' $t/log fi ================================================ FILE: test/z-max-page-size.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe1 $t/a.o -Wl,-z,max-page-size=65536 \ -Wl,-z,separate-loadable-segments $QEMU $t/exe1 | grep 'Hello world' readelf -W --segments $t/exe1 | grep 'LOAD.*R 0x10000$' $CC -B. -o $t/exe2 $t/a.o -Wl,-zmax-page-size=$((1024*1024)) \ -Wl,-z,separate-loadable-segments $QEMU $t/exe2 | grep 'Hello world' readelf -W --segments $t/exe2 | grep 'LOAD.*R 0x100000$' $CC -B. -o $t/exe3 $t/a.o -Wl,-zmax-page-size=$((1024*1024)) $QEMU $t/exe3 | grep 'Hello world' readelf -W --segments $t/exe3 | grep 'LOAD.*R 0x100000$' ================================================ FILE: test/z-nodefaultlib.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe $t/a.o -Wl,-z,nodefaultlib readelf --dynamic $t/exe | grep 'Flags:.*NODEFLIB' ================================================ FILE: test/z-nodump.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello "); puts("world"); } EOF $CC -B. -o $t/exe $t/a.o -Wl,-z,now $QEMU $t/exe | grep 'Hello world' ================================================ FILE: test/z-origin.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe $t/a.o -Wl,-z,origin readelf --dynamic $t/exe | grep -E '\(FLAGS\)\s+ORIGIN' readelf --dynamic $t/exe | grep -E 'Flags:.*ORIGIN' ================================================ FILE: test/z-pack-relative-relocs.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello world\n"); } EOF $CC -o $t/exe1 $t/a.o -pie -Wl,-z,pack-relative-relocs 2> /dev/null || skip readelf -WS $t/exe1 | grep -F .relr.dyn || skip $QEMU $t/exe1 2> /dev/null | grep Hello || skip $CC -B. -o $t/exe2 $t/a.o -pie -Wl,-z,pack-relative-relocs $QEMU $t/exe2 | grep Hello readelf --dynamic $t/exe2 > $t/log2 grep -Ew 'RELR|: 24' $t/log2 grep -Ew 'RELRSZ|: 23' $t/log2 grep -Ew 'RELRENT|: 25' $t/log2 ================================================ FILE: test/z-rodynamic.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < int main() { printf("Hello "); puts("world"); } EOF $CC -B. -o $t/exe $t/a.o -Wl,-z,nosectionheader $QEMU $t/exe | grep 'Hello world' readelf -h $t/exe |& grep -E 'Size of section headers:\s+0 ' ================================================ FILE: test/z-separate-code.sh ================================================ #!/bin/bash . 
$(dirname $0)/common.inc # musl doesn't work with `-z noseparate-code` is_musl && skip cat < int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe1 $t/a.o -Wl,-z,separate-loadable-segments $QEMU $t/exe1 | grep 'Hello world' $CC -B. -o $t/exe2 $t/a.o -Wl,-z,separate-code -Wl,-z,norelro $QEMU $t/exe2 | grep 'Hello world' $CC -B. -o $t/exe3 $t/a.o -Wl,-z,noseparate-code -Wl,-z,norelro $QEMU $t/exe3 | grep 'Hello world' ================================================ FILE: test/z-stack-size.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc cat < __attribute__((section("hello"))) static char msg[] = "Hello world"; int main() { printf("%s\n", msg); } EOF $CC -B. -o $t/exe1 $t/a.o readelf -W --dyn-syms $t/exe1 > $t/log1 not grep __start_hello $t/log1 not grep __stop_hello $t/log1 $CC -B. -o $t/exe2 $t/a.o -Wl,-z,start-stop-visibility=hidden readelf -W --dyn-syms $t/exe2 > $t/log2 not grep __start_hello $t/log2 not grep __stop_hello $t/log2 $CC -B. -o $t/exe3 $t/a.o -Wl,-z,start-stop-visibility=protected readelf -W --dyn-syms $t/exe3 > $t/log3 grep __start_hello $t/log3 grep __stop_hello $t/log3 ================================================ FILE: test/zero-to-bss.sh ================================================ #!/bin/bash . $(dirname $0)/common.inc [ "$(uname)" = FreeBSD ] && skip cat < __attribute__((section(".zero"))) char zero[256]; __attribute__((section(".const"))) const char konst[256]; int main() { printf("Hello world\n"); } EOF $CC -B. -o $t/exe1 $t/a.o readelf -WS $t/exe1 | grep -E '.zero\s+PROGBITS' readelf -WS $t/exe1 | grep -E '.const\s+PROGBITS' $QEMU $t/exe1 | grep Hello $CC -B. 
-o $t/exe2 $t/a.o -Wl,--zero-to-bss readelf -WS $t/exe2 | grep -E '.zero\s+NOBITS' readelf -WS $t/exe2 | grep -E '.const\s+PROGBITS' $QEMU $t/exe2 | grep Hello ================================================ FILE: third-party/blake3/.cargo/config.toml ================================================ [target.wasm32-wasip1] runner = "wasmtime" ================================================ FILE: third-party/blake3/.git-blame-ignore-revs ================================================ # CMakeLists.txt whitespace fixups 3e14f865d30271c74fc68d417af488ea91b66d48 ================================================ FILE: third-party/blake3/.github/workflows/build_b3sum.py ================================================ #! /usr/bin/env python3 from pathlib import Path import platform import shutil import subprocess import sys ROOT = Path(__file__).parent.parent.parent RUST_TARGET = sys.argv[1] subprocess.run( ["cargo", "build", "--target", sys.argv[1], "--release"], cwd=ROOT / "b3sum" ) if platform.system() == "Windows": original_exe_name = "b3sum.exe" else: original_exe_name = "b3sum" if platform.system() == "Windows": new_exe_name = "b3sum_windows_x64_bin.exe" elif platform.system() == "Darwin": new_exe_name = "b3sum_macos_x64_bin" elif platform.system() == "Linux": new_exe_name = "b3sum_linux_x64_bin" else: raise RuntimeError("Unexpected platform: " + platform.system()) # Copy the built binary so that it has the upload name we want. out_dir = ROOT / "b3sum/target" / RUST_TARGET / "release" original_exe_path = str(out_dir / original_exe_name) new_exe_path = str(out_dir / new_exe_name) print("copying", repr(original_exe_path), "to", repr(new_exe_path)) shutil.copyfile(original_exe_path, new_exe_path) # This lets the subsequent upload step get the filepath. 
print("::set-output name=bin_path::" + new_exe_path) ================================================ FILE: third-party/blake3/.github/workflows/ci.yml ================================================ name: tests on: push: branches: - "*" # not on tags pull_request: env: BLAKE3_CI: "1" RUSTFLAGS: "-D warnings" RUST_BACKTRACE: "1" jobs: library_tests: name: ${{ matrix.target.name }} ${{ matrix.channel }} runs-on: ${{ matrix.target.os }} strategy: fail-fast: false matrix: target: [ { "os": "ubuntu-latest", "toolchain": "x86_64-unknown-linux-gnu", "name": "Linux GNU" }, { "os": "macOS-latest", "toolchain": "aarch64-apple-darwin", "name": "macOS" }, { "os": "windows-latest", "toolchain": "x86_64-pc-windows-msvc", "name": "Windows MSVC" }, { "os": "windows-latest", "toolchain": "x86_64-pc-windows-gnu", "name": "Windows GNU" } ] channel: ["stable", "beta", "nightly"] steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@master with: toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} # Print the compiler version, for debugging. - name: print compiler version run: cargo run --quiet working-directory: ./tools/compiler_version # Print out instruction set support, for debugging. - name: print instruction set support run: cargo run --quiet working-directory: ./tools/instruction_set_support # Default tests plus Rayon and trait implementations. - run: cargo test --features=rayon,traits-preview,serde,zeroize # Same but with only one thread in the Rayon pool. This can find deadlocks. - name: "again with RAYON_NUM_THREADS=1" run: cargo test --features=rayon,traits-preview,serde,zeroize env: RAYON_NUM_THREADS: 1 # The mmap feature by itself (update_mmap_rayon is omitted). - run: cargo test --features=mmap # All public features put together. - run: cargo test --features=mmap,rayon,traits-preview,serde,zeroize # no_std tests. 
- run: cargo test --no-default-features # A matrix of different test settings: # - debug vs release # - assembly vs Rust+C intrinsics vs pure Rust intrinsics # - different levels of SIMD support # # Full SIMD support. - run: cargo test --features= - run: cargo test --features=prefer_intrinsics - run: cargo test --features=pure - run: cargo test --features= --release - run: cargo test --features=prefer_intrinsics --release - run: cargo test --features=pure --release # No AVX-512. - run: cargo test --features=no_avx512 - run: cargo test --features=no_avx512,prefer_intrinsics - run: cargo test --features=no_avx512,pure - run: cargo test --features=no_avx512 --release - run: cargo test --features=no_avx512,prefer_intrinsics --release - run: cargo test --features=no_avx512,pure --release # No AVX2. - run: cargo test --features=no_avx512,no_avx2 - run: cargo test --features=no_avx512,no_avx2,prefer_intrinsics - run: cargo test --features=no_avx512,no_avx2,pure - run: cargo test --features=no_avx512,no_avx2 --release - run: cargo test --features=no_avx512,no_avx2,prefer_intrinsics --release - run: cargo test --features=no_avx512,no_avx2,pure --release # No SSE4.1 - run: cargo test --features=no_avx512,no_avx2,no_sse41 - run: cargo test --features=no_avx512,no_avx2,no_sse41,prefer_intrinsics - run: cargo test --features=no_avx512,no_avx2,no_sse41,pure - run: cargo test --features=no_avx512,no_avx2,no_sse41 --release - run: cargo test --features=no_avx512,no_avx2,no_sse41,prefer_intrinsics --release - run: cargo test --features=no_avx512,no_avx2,no_sse41,pure --release # No SSE2 - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2 - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,prefer_intrinsics - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,pure - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2 --release - run: cargo test --features=no_avx512,no_avx2,no_sse41,no_sse2,prefer_intrinsics --release - run: cargo test 
--features=no_avx512,no_avx2,no_sse41,no_sse2,pure --release # Test benchmarks. RUSTC_BOOTSTRAP=1 lets this run on non-nightly toolchains. - run: cargo test --benches --features=rayon env: RUSTC_BOOTSTRAP: 1 # Test vectors. - name: test vectors run: cargo test working-directory: ./test_vectors - name: test vectors intrinsics run: cargo test --features=prefer_intrinsics working-directory: ./test_vectors - name: test vectors pure run: cargo test --features=pure working-directory: ./test_vectors # Test C code. - name: cargo test C bindings assembly run: cargo test working-directory: ./c/blake3_c_rust_bindings - name: cargo test C bindings intrinsics run: cargo test --features=prefer_intrinsics working-directory: ./c/blake3_c_rust_bindings - name: cargo test C bindings no AVX-512 run: cargo test working-directory: ./c/blake3_c_rust_bindings env: CFLAGS: -DBLAKE3_NO_AVX512 - name: cargo test C bindings no AVX2 run: cargo test working-directory: ./c/blake3_c_rust_bindings env: CFLAGS: -DBLAKE3_NO_AVX512 -DBLAKE3_NO_AVX2 - name: cargo test C bindings no SSE41 run: cargo test working-directory: ./c/blake3_c_rust_bindings env: CFLAGS: -DBLAKE3_NO_AVX512 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_SSE41 - name: cargo test C bindings no SSE2 run: cargo test working-directory: ./c/blake3_c_rust_bindings env: CFLAGS: -DBLAKE3_NO_AVX512 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_SSE2 # Reference impl doc test. - name: reference impl doc test run: cargo test working-directory: ./reference_impl msrv_build: name: MSRV build ${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: os: ["ubuntu-latest", "macOS-latest", "windows-latest"] steps: - uses: actions/checkout@v4 # The current MSRV. This crate doesn't have an official MSRV policy, # but in practice we'll probably do what libc does: # https://github.com/rust-lang/libs-team/issues/72. # This test target is here so that we notice if we accidentally bump # the MSRV, but it's not a promise that we won't bump it. 
- uses: dtolnay/rust-toolchain@1.66.1 - run: cargo build --features=mmap,rayon,traits-preview,serde,zeroize b3sum_tests: name: b3sum ${{ matrix.target.name }} ${{ matrix.channel }} runs-on: ${{ matrix.target.os }} strategy: fail-fast: false matrix: target: [ { "os": "ubuntu-latest", "toolchain": "x86_64-unknown-linux-gnu", "name": "Linux GNU" }, { "os": "macOS-latest", "toolchain": "aarch64-apple-darwin", "name": "macOS" }, { "os": "windows-latest", "toolchain": "x86_64-pc-windows-msvc", "name": "Windows MSVC" }, { "os": "windows-latest", "toolchain": "x86_64-pc-windows-gnu", "name": "Windows GNU" } ] channel: ["stable", "beta", "nightly"] steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@master with: toolchain: ${{ format('{0}-{1}', matrix.channel, matrix.target.toolchain) }} # Test b3sum. - name: test b3sum run: cargo test working-directory: ./b3sum - name: test b3sum --no-default-features run: cargo test --no-default-features working-directory: ./b3sum cross_tests: name: cross ${{ matrix.arch }} runs-on: ubuntu-latest strategy: fail-fast: false matrix: arch: - i586-unknown-linux-musl - i686-unknown-linux-musl - armv7-unknown-linux-gnueabihf - aarch64-unknown-linux-gnu # Big-endian targets. See https://twitter.com/burntsushi5/status/1695483429997945092. - powerpc64-unknown-linux-gnu - s390x-unknown-linux-gnu steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable - run: cargo install cross # Test the portable implementation on everything. - run: cross test --target ${{ matrix.arch }} # Test building for ancient i386 processors without guaranteed SSE2 support. - run: cross rustc --target ${{ matrix.arch }} -- -C target-cpu=i386 if: startsWith(matrix.arch, 'i586-') || startsWith(matrix.arch, 'i686-') # Test the NEON implementation on ARM targets. 
- run: cross test --target ${{ matrix.arch }} --features=neon if: startsWith(matrix.arch, 'armv7-') || startsWith(matrix.arch, 'aarch64-') # NEON is enabled by default on aarch64, disabling it through the no_neon feature. - run: cross test --target ${{ matrix.arch }} --features=no_neon if: startsWith(matrix.arch, 'aarch64-') # Test vectors. Note that this uses a hacky script due to path dependency limitations. - run: ./test_vectors/cross_test.sh --target ${{ matrix.arch }} # C code. Same issue with the hacky script. - run: ./c/blake3_c_rust_bindings/cross_test.sh --target ${{ matrix.arch }} - run: ./c/blake3_c_rust_bindings/cross_test.sh --target ${{ matrix.arch }} --features=neon if: startsWith(matrix.arch, 'armv7-') || startsWith(matrix.arch, 'aarch64-') wasm_tests: name: WASM tests runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable with: targets: wasm32-wasip1 - name: install Wasmtime run: | curl https://wasmtime.dev/install.sh -sSf | bash echo PATH: $PATH mkdir -p ~/.local/bin ln -s ~/.wasmtime/bin/wasmtime ~/.local/bin/wasmtime - run: cargo test --target wasm32-wasip1 - run: cargo test --target wasm32-wasip1 --no-default-features - run: cargo test --target wasm32-wasip1 --features wasm32_simd - run: cargo test --target wasm32-wasip1 --no-default-features --features wasm32_simd - run: cargo test --target wasm32-wasip1 --benches --features=wasm32_simd env: RUSTC_BOOTSTRAP: 1 - name: test vectors w/o SIMD run: cargo test --target wasm32-wasip1 working-directory: ./test_vectors - name: test vectors w/ SIMD run: cargo test --target wasm32-wasip1 --features wasm32_simd working-directory: ./test_vectors cargo_xwin_test: name: cargo xwin test runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - run: docker run -v $(pwd):/io -w /io messense/cargo-xwin cargo xwin test --target x86_64-pc-windows-msvc --features=mmap,rayon,traits-preview,serde,zeroize # Currently only on x86. 
cmake_c_tests: name: CMake C tests SIMD=${{ matrix.simd }} TBB=${{ matrix.use_tbb }} runs-on: ubuntu-latest strategy: fail-fast: false matrix: use_tbb: ["OFF", "ON"] simd: ["x86-intrinsics", "amd64-asm"] steps: - uses: actions/checkout@v4 - run: | sudo apt-get update sudo apt-get install ninja-build libtbb-dev libtbb12 # Test the intrinsics-based and assembly-based implementations. - run: | cmake --fresh -S c -B c/build -G Ninja -DBLAKE3_TESTING=ON -DBLAKE3_TESTING_CI=ON "-DBLAKE3_SIMD_TYPE=${{ matrix.simd }}" cmake --build c/build --target test cat c/build/Testing/Temporary/LastTest.log - run: | cmake --fresh -S c -B c/build -G Ninja -DBLAKE3_TESTING=ON -DBLAKE3_TESTING_CI=ON "-DBLAKE3_SIMD_TYPE=${{ matrix.simd }}" -DBLAKE3_NO_SSE2=1 cmake --build c/build --target test cat c/build/Testing/Temporary/LastTest.log - run: | cmake --fresh -S c -B c/build -G Ninja -DBLAKE3_TESTING=ON -DBLAKE3_TESTING_CI=ON "-DBLAKE3_SIMD_TYPE=${{ matrix.simd }}" -DBLAKE3_NO_SSE2=1 -DBLAKE3_NO_SSE41=1 cmake --build c/build --target test cat c/build/Testing/Temporary/LastTest.log - run: | cmake --fresh -S c -B c/build -G Ninja -DBLAKE3_TESTING=ON -DBLAKE3_TESTING_CI=ON "-DBLAKE3_SIMD_TYPE=${{ matrix.simd }}" "-DBLAKE3_NO_SSE2=1" "-DBLAKE3_NO_SSE41=1" "-DBLAKE3_NO_AVX2=1" cmake --build c/build --target test cat c/build/Testing/Temporary/LastTest.log - run: | cmake --fresh -S c -B c/build -G Ninja -DBLAKE3_TESTING=ON -DBLAKE3_TESTING_CI=ON "-DBLAKE3_SIMD_TYPE=${{ matrix.simd }}" "-DBLAKE3_NO_SSE2=1" "-DBLAKE3_NO_SSE41=1" "-DBLAKE3_NO_AVX2=1" "-DBLAKE3_NO_AVX512=1" cmake --build c/build --target test cat c/build/Testing/Temporary/LastTest.log # Test with TBB disabled/enabled. - run: | cmake --fresh -S c -B c/build -G Ninja -DBLAKE3_TESTING=ON -DBLAKE3_TESTING_CI=ON "-DBLAKE3_USE_TBB=${{ matrix.use_tbb }}" cmake --build c/build --target test cat c/build/Testing/Temporary/LastTest.log # Build the example with TBB disabled/enabled. 
- run: | cmake --fresh -S c -B c/build -G Ninja -DBLAKE3_TESTING=ON -DBLAKE3_TESTING_CI=ON -DBLAKE3_EXAMPLES=ON "-DBLAKE3_USE_TBB=${{ matrix.use_tbb }}" cmake --build c/build --target blake3-example # Currently only on x86. pkg_config_c_tests: name: pkg-config C tests TBB=${{ matrix.use_tbb }} BUILD_SHARED_LIBS=${{ matrix.shared_libs }} STDLIB=${{ matrix.stdlib }} runs-on: ubuntu-latest strategy: fail-fast: false matrix: use_tbb: ["OFF", "ON"] shared_libs: ["OFF", "ON"] stdlib: ["libc++", "libstdc++"] steps: - uses: actions/checkout@v4 - name: update packages run: | sudo apt-get update sudo apt-get install ninja-build libtbb-dev libtbb12 ${{ matrix.stdlib != 'libc++' || 'sudo apt-get install libc++-dev libc++abi-dev' }} - name: configure cmake run: | export CXXFLAGS=${{ matrix.stdlib == 'libc++' && '-stdlib=libc++' || '' }} export CC=${{ matrix.stdlib == 'libc++' && 'clang' || 'gcc' }} export CXX=${{ matrix.stdlib == 'libc++' && 'clang++' || 'g++' }} cmake --fresh -S c -B c/build -G Ninja -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/target "-DBLAKE3_USE_TBB=${{ matrix.use_tbb }}" "-DBUILD_SHARED_LIBS=${{ matrix.shared_libs }}" -DCMAKE_VERBOSE_MAKEFILE=1 - run: cmake --build c/build --target install - run: mkdir -p ${{ github.workspace }}/target/bin - run: echo "PKG_CONFIG_PATH=${{ github.workspace }}/target/lib/pkgconfig" >> $GITHUB_ENV - run: gcc -O3 -o ${{ github.workspace }}/target/bin/blake3-example c/example.c $(pkg-config --cflags --libs libblake3) - if: matrix.use_tbb == 'ON' run: gcc -O3 -o ${{ github.workspace }}/target/bin/blake3-example-tbb c/example_tbb.c $(pkg-config --cflags --libs libblake3) # Note that this job builds AArch64 binaries from an x86_64 host.
build_apple_silicon: name: build for Apple Silicon runs-on: macOS-latest strategy: fail-fast: false steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable with: targets: aarch64-apple-darwin - name: build blake3 run: cargo build --target aarch64-apple-darwin - name: build b3sum run: cargo build --target aarch64-apple-darwin working-directory: ./b3sum build_tinycc: name: build with the Tiny C Compiler runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: install TCC run: sudo apt-get install -y tcc - name: compile run: > tcc -shared -O3 -o libblake3.so \ -DBLAKE3_NO_SSE2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 -DBLAKE3_NO_AVX512 \ blake3.c blake3_dispatch.c blake3_portable.c working-directory: ./c # See https://github.com/BLAKE3-team/BLAKE3/issues/271 for why we test this. # Note that this isn't guaranteed to execute on an AVX-512-supporting server, # but hopefully at least some of the time it will. gcc54: name: "compile and test with GCC 5.4" runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: addnab/docker-run-action@v3 with: image: gcc:5.4 options: -v ${{ github.workspace }}:/work run: | cat /proc/cpuinfo curl https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal cd /work ~/.cargo/bin/cargo test --features prefer_intrinsics # CMake build test (Library only). 
cmake_current_build: name: CMake ${{ matrix.os }} CC=${{ matrix.toolchain.cc }} CXX=${{ matrix.toolchain.cxx }} TBB=${{ matrix.use_tbb }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: cmakeVersion: [latest] ninjaVersion: [latest] os: [ubuntu-latest, macOS-latest, windows-latest] toolchain: [ { cc: cl, cxx: cl }, { cc: clang, cxx: clang++ }, { cc: clang-cl, cxx: clang-cl }, { cc: gcc, cxx: g++ }, ] use_tbb: [OFF, ON] exclude: - os: macOS-latest toolchain: { cc: cl, cxx: cl } - os: macOS-latest toolchain: { cc: clang-cl, cxx: clang-cl } - os: ubuntu-latest toolchain: { cc: cl, cxx: cl } - os: ubuntu-latest toolchain: { cc: clang-cl, cxx: clang-cl } - os: windows-latest toolchain: { cc: clang, cxx: clang++ } use_tbb: ON - os: windows-latest toolchain: { cc: gcc, cxx: g++ } use_tbb: ON steps: - uses: actions/checkout@v4 - uses: lukka/get-cmake@5f6e04f5267c8133f1273bf2103583fc72c46b17 with: cmakeVersion: ${{ matrix.cmakeVersion }} ninjaVersion: ${{ matrix.ninjaVersion }} - if: matrix.os == 'macOS-latest' name: Install dependencies on macOS run: | brew update brew install tbb - if: matrix.os == 'ubuntu-latest' name: Install dependencies on Linux run: | sudo apt-get update sudo apt-get install libtbb-dev libtbb12 - name: CMake generation, build, install run: | ${{ matrix.os != 'windows-latest' || '& "C:/Program Files/Microsoft Visual Studio/2022/Enterprise/Common7/Tools/Launch-VsDevShell.ps1" -Arch amd64 -SkipAutomaticLocation' }} cmake -S c -B c/build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/target -DCMAKE_C_COMPILER=${{ matrix.toolchain.cc }} -DCMAKE_CXX_COMPILER=${{ matrix.toolchain.cxx }} -DBLAKE3_USE_TBB=${{ matrix.use_tbb }} -DBLAKE3_FETCH_TBB=${{ matrix.os == 'windows-latest' && 'YES' || 'NO' }} -DBLAKE3_EXAMPLES=ON cmake --build c/build --target install cmake_3-9_build: name: CMake 3.9.6 ubuntu-latest runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: 
lukka/get-cmake@5f6e04f5267c8133f1273bf2103583fc72c46b17 with: cmakeVersion: 3.9.6 - name: Create build directory run: mkdir c/build - name: CMake generation run: cmake .. -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/target working-directory: c/build - name: CMake build / install run: make install working-directory: c/build miri_smoketest: name: Miri smoketest runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@nightly with: components: miri # Currently the test search "miri" only matches "test_miri_smoketest", but # we might add more. If this accidentally picks up anything incompatible or # slow, we can narrow it. - run: cargo miri test miri tbb_rust_bindings_tests: name: TBB test bindings ${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: os: ["ubuntu-latest", "macOS-latest"] steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable - name: install TBB if: matrix.os == 'ubuntu-latest' run: | sudo apt-get update sudo apt-get install libtbb-dev libtbb12 - name: install TBB if: matrix.os == 'macOS-latest' run: | brew install tbb echo "CXXFLAGS=-I$(brew --prefix)/include $CPPFLAGS" >> $GITHUB_ENV echo "RUSTFLAGS=-L$(brew --prefix)/lib $RUSTFLAGS" >> $GITHUB_ENV - name: cargo test C bindings with TBB run: cargo test --features=tbb working-directory: ./c/blake3_c_rust_bindings ================================================ FILE: third-party/blake3/.github/workflows/tag.yml ================================================ name: publish_b3sum_binaries on: push: tags: - "*" env: BLAKE3_CI: "1" RUSTFLAGS: "-D warnings" jobs: cargo_tests: name: ${{ matrix.target.name }} runs-on: ${{ matrix.target.os }} strategy: fail-fast: false matrix: target: [ { "os": "ubuntu-latest", "rust-target": "x86_64-unknown-linux-musl", "name": "Linux" }, { "os": "macOS-latest", "rust-target": "x86_64-apple-darwin", "name": "macOS" }, { "os": "windows-latest", "rust-target": "x86_64-pc-windows-msvc", "name": 
"Windows" }, ] steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: python-version: "3.x" - run: pip install PyGithub - run: sudo apt-get install musl-tools if: matrix.target.os == 'ubuntu-latest' - uses: dtolnay/rust-toolchain@stable with: targets: ${{ matrix.target.rust-target }} - name: build b3sum id: build_b3sum run: python -u .github/workflows/build_b3sum.py ${{ matrix.target.rust-target }} - name: upload release asset env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TAG: ${{ github.ref }} run: python -u .github/workflows/upload_github_release_asset.py ${{ steps.build_b3sum.outputs.bin_path }} ================================================ FILE: third-party/blake3/.github/workflows/upload_github_release_asset.py ================================================ #! /usr/bin/env python3 import github import os import sys import time RETRIES = 10 g = github.Github(os.environ["GITHUB_TOKEN"]) tag_name = os.environ["GITHUB_TAG"] tag_prefix = "refs/tags/" if tag_name.startswith(tag_prefix): tag_name = tag_name[len(tag_prefix) :] assert len(sys.argv) == 2 asset_path = sys.argv[1] asset_name = os.path.basename(asset_path) repo = g.get_repo(os.environ["GITHUB_REPOSITORY"]) tags = list(repo.get_tags()) for tag in tags: if tag.name == tag_name: break else: raise RuntimeError("no tag named " + repr(tag_name)) try: print("Creating GitHub release for tag " + repr(tag_name) + "...") repo.create_git_release(tag_name, tag_name, tag.commit.commit.message) except github.GithubException as github_error: if github_error.data["errors"][0]["code"] == "already_exists": print("Release for tag " + repr(tag_name) + " already exists.") else: raise def get_release(): for i in range(RETRIES): releases = list(repo.get_releases()) for release in releases: if release.tag_name == tag_name: return release print(f"Release for tag {repr(tag_name)} not found. 
Retrying...") time.sleep(1) raise RuntimeError("no release for tag " + repr(tag_name)) release = get_release() print("Uploading " + repr(asset_path) + "...") for i in range(RETRIES): try: print("Upload attempt #{} of {}...".format(i + 1, RETRIES)) release.upload_asset(asset_path) break except github.GithubException as github_error: # Unfortunately the asset upload API is flaky. Even worse, it often # partially succeeds, returning an error to the caller but leaving the # release in a state where subsequent uploads of the same asset will # fail with an "already_exists" error. (Though the asset is not visible # on github.com, so we can't just declare victory and move on.) If we # detect this case, explicitly delete the asset and continue retrying. print(github_error) for asset in release.get_assets(): if asset.name == asset_name: print("Found uploaded asset after failure. Deleting...") asset.delete_asset() else: raise RuntimeError("All upload attempts failed.") print("Success!") ================================================ FILE: third-party/blake3/.gitignore ================================================ Cargo.lock target ================================================ FILE: third-party/blake3/CONTRIBUTING.md ================================================ # Contributing We welcome and encourage third-party contributions to BLAKE3, be it reports of issues encountered while using the software or proposals of patches. ## Bug reports Bugs and other problems should be reported on [GitHub Issues](https://github.com/BLAKE3/BLAKE3/issues). If you report a bug, please: * Check that it's not already reported in the [GitHub Issues](https://github.com/BLAKE3/BLAKE3/issues). * Provide information to help us diagnose and ideally reproduce the bug. ## Patches We encourage you to fix a bug via a [GitHub Pull request](https://github.com/BLAKE3/BLAKE3/pulls), preferably after creating a related issue and referring it in the PR. 
If you contribute code and submit a patch, please note the following: * We use Rust's stable branch for developing BLAKE3. * Pull requests should target the `master` branch. * Try to follow the established Rust [style guidelines](https://doc.rust-lang.org/1.0.0/style/). Also please make sure to create new unit tests covering your code additions. You can execute the tests by running: ```bash cargo test ``` All third-party contributions will be recognized in the list of contributors. ================================================ FILE: third-party/blake3/Cargo.toml ================================================ [package] name = "blake3" version = "1.8.2" authors = ["Jack O'Connor ", "Samuel Neves"] description = "the BLAKE3 hash function" repository = "https://github.com/BLAKE3-team/BLAKE3" license = "CC0-1.0 OR Apache-2.0 OR Apache-2.0 WITH LLVM-exception" documentation = "https://docs.rs/blake3" readme = "README.md" edition = "2021" [features] default = ["std"] # The NEON implementation does not participate in dynamic feature detection, # which is currently x86-only. If "neon" is on, NEON support is assumed. Note # that AArch64 always supports NEON, but support on ARMv7 varies. The NEON # implementation uses C intrinsics and requires a C compiler. neon = [] # The Wasm SIMD implementation does not participate in dynamic feature detection, # which is currently x86-only. If "wasm32_simd" is on, Wasm SIMD support is assumed. # Note that not all Wasm implementations support the Wasm SIMD specification. # This may become the default in the future. wasm32_simd = [] # This crate uses libstd for std::io trait implementations, and also for # runtime CPU feature detection. This feature is enabled by default. If you use # --no-default-features, the only way to use the SIMD implementations in this # crate is to enable the corresponding instruction sets statically for the # entire build, with e.g. RUSTFLAGS="-C target-cpu=native".
std = [] # The `rayon` feature (disabled by default, but enabled for docs.rs) adds the # `update_rayon` and (in combination with `mmap` below) `update_mmap_rayon` # methods, for multithreaded hashing. However, even if this feature is enabled, # all other APIs remain single-threaded. # # Implementation detail: We take a dependency on rayon-core instead of rayon, # because it builds faster and still includes all the APIs we need. rayon = ["dep:rayon-core"] # The `mmap` feature (disabled by default, but enabled for docs.rs) adds the # `update_mmap` and (in combination with `rayon` above) `update_mmap_rayon` # helper methods for memory-mapped IO. mmap = ["std", "dep:memmap2"] # Implement the zeroize::Zeroize trait for types in this crate. zeroize = ["dep:zeroize", "arrayvec/zeroize"] # This crate implements traits from the RustCrypto project, exposed here as the # "traits-preview" feature. However, these traits aren't stable, and they're # expected to change in incompatible ways before they reach 1.0. For that # reason, this crate makes no SemVer guarantees for this feature, and callers # who use it should expect breaking changes between patch versions of this # crate. (The "*-preview" feature name follows the conventions of the RustCrypto # "signature" crate.) traits-preview = ["dep:digest"] # ---------- Features below this line are undocumented and unstable. ---------- # The following features are mainly intended for testing and benchmarking, and # they might change or disappear at any time without a major version bump. # It wasn't originally intended to expose "digest" as its own feature, but the # traits-preview feature above predated the "dep:" syntax in Cargo. Version # 1.5.2 of this crate started using "dep:" syntax, but that broke some callers # in the wild (https://solana.stackexchange.com/q/17787/29050). This feature # unbreaks those callers. When Cargo gains the ability to deprecate features, # this feature will be deprecated. 
Note that the relevant trait implementations # are still gated by "traits-preview". digest = ["dep:digest"] # By default on x86_64, this crate uses Samuel Neves' hand-written assembly # implementations for SSE4.1, AVX2, and AVX512. (These provide both the best # runtime performance, and the fastest build times.) And by default on 32-bit # x86, this crate uses Rust intrinsics implementations for SSE4.1 and AVX2, and # a C intrinsics implementation for AVX-512. In both cases, if a C compiler is # not detected, or if AVX-512 support is missing from the detected compiler, # build.rs automatically falls back to a pure Rust build. This feature forces # that fallback, for testing purposes. (Note that in CI testing, we set the # BLAKE3_CI environment variable, which instructs build.rs to error out rather # than doing an automatic fallback.) pure = [] # As described above, on x86_64 this crate uses assembly implementations by # default. Enabling the "prefer_intrinsics" feature makes this crate use # intrinsics implementations on both 32-bit and 64-bit x86, again for testing # purposes. prefer_intrinsics = [] # Disable individual instruction sets. CI testing uses these flags to simulate # different levels of hardware SIMD support. Note that code for the # corresponding instruction set is still compiled; only detection is disabled. # # As noted above, these flags are *for testing only* and are not stable. It's # possible that some users might find that their particular use case performs # better if e.g. AVX-512 is disabled, because of issues like CPU downclocking. # If that comes up, and if disabling the instruction set here at the feature # level turns out to be the right approach, then we can design a stable # feature. Until then, we reserve the right to break these features in a patch # release.
no_sse2 = [] no_sse41 = [] no_avx2 = [] no_avx512 = [] no_neon = [] [package.metadata.docs.rs] # Document the rayon/mmap methods and the Serialize/Deserialize/Zeroize impls on docs.rs. features = ["mmap", "rayon", "serde", "zeroize"] [dependencies] arrayref = "0.3.5" arrayvec = { version = "0.7.4", default-features = false } constant_time_eq = { version = "0.3.1", default-features = false } cfg-if = "1.0.0" digest = { version = "0.10.1", features = ["mac"], optional = true } memmap2 = { version = "0.9", optional = true } rayon-core = { version = "1.12.1", optional = true } serde = { version = "1.0", default-features = false, features = ["derive"], optional = true } zeroize = { version = "1", default-features = false, optional = true } [dev-dependencies] hmac = "0.12.0" hex = "0.4.2" page_size = "0.6.0" rand = "0.9.0" rand_chacha = "0.9.0" reference_impl = { path = "./reference_impl" } tempfile = "3.8.0" serde_json = "1.0.107" ciborium = "0.2.2" [build-dependencies] cc = "1.1.12" ================================================ FILE: third-party/blake3/LICENSE_A2 ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 
"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright 2019 Jack O'Connor and Samuel Neves Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: third-party/blake3/LICENSE_A2LLVM ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. 
Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright 2019 Jack O'Connor and Samuel Neves Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ---- LLVM Exceptions to the Apache 2.0 License ---- As an exception, if, as a result of your compiling your source code, portions of this Software are embedded into an Object form of such source code, you may redistribute such embedded portions in such Object form without complying with the conditions of Sections 4(a), 4(b) and 4(d) of the License. In addition, if you combine or link compiled forms of this Software with software that is licensed under the GPLv2 ("Combined Software") and if a court of competent jurisdiction determines that the patent provision (Section 3), the indemnity provision (Section 9) or other Section of the License conflicts with the conditions of the GPLv2, you may retroactively and prospectively choose to deem waived or otherwise exclude such Section(s) of the License, but only in their entirety and only with respect to the Combined Software. ================================================ FILE: third-party/blake3/LICENSE_CC0 ================================================ Creative Commons Legal Code CC0 1.0 Universal CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. 
CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED HEREUNDER. Statement of Purpose The laws of most jurisdictions throughout the world automatically confer exclusive Copyright and Related Rights (defined below) upon the creator and subsequent owner(s) (each and all, an "owner") of an original work of authorship and/or a database (each, a "Work"). Certain owners wish to permanently relinquish those rights to a Work for the purpose of contributing to a commons of creative, cultural and scientific works ("Commons") that the public can reliably and without fear of later claims of infringement build upon, modify, incorporate in other works, reuse and redistribute as freely as possible in any form whatsoever and for any purposes, including without limitation commercial purposes. These owners may contribute to the Commons to promote the ideal of a free culture and the further production of creative, cultural and scientific works, or to gain reputation or greater distribution for their Work in part through the use and efforts of others. For these and/or other purposes and motivations, and without any expectation of additional consideration or compensation, the person associating CC0 with a Work (the "Affirmer"), to the extent that he or she is an owner of Copyright and Related Rights in the Work, voluntarily elects to apply CC0 to the Work and publicly distribute the Work under its terms, with knowledge of his or her Copyright and Related Rights in the Work and the meaning and intended legal effect of CC0 on those rights. 1. Copyright and Related Rights. A Work made available under CC0 may be protected by copyright and related or neighboring rights ("Copyright and Related Rights"). Copyright and Related Rights include, but are not limited to, the following: i. 
the right to reproduce, adapt, distribute, perform, display, communicate, and translate a Work; ii. moral rights retained by the original author(s) and/or performer(s); iii. publicity and privacy rights pertaining to a person's image or likeness depicted in a Work; iv. rights protecting against unfair competition in regards to a Work, subject to the limitations in paragraph 4(a), below; v. rights protecting the extraction, dissemination, use and reuse of data in a Work; vi. database rights (such as those arising under Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, and under any national implementation thereof, including any amended or successor version of such directive); and vii. other similar, equivalent or corresponding rights throughout the world based on applicable law or treaty, and any national implementations thereof. 2. Waiver. To the greatest extent permitted by, but not in contravention of, applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and unconditionally waives, abandons, and surrenders all of Affirmer's Copyright and Related Rights and associated claims and causes of action, whether now known or unknown (including existing as well as future claims and causes of action), in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "Waiver"). 
Affirmer makes the Waiver for the benefit of each member of the public at large and to the detriment of Affirmer's heirs and successors, fully intending that such Waiver shall not be subject to revocation, rescission, cancellation, termination, or any other legal or equitable action to disrupt the quiet enjoyment of the Work by the public as contemplated by Affirmer's express Statement of Purpose. 3. Public License Fallback. Should any part of the Waiver for any reason be judged legally invalid or ineffective under applicable law, then the Waiver shall be preserved to the maximum extent permitted taking into account Affirmer's express Statement of Purpose. In addition, to the extent the Waiver is so judged Affirmer hereby grants to each affected person a royalty-free, non transferable, non sublicensable, non exclusive, irrevocable and unconditional license to exercise Affirmer's Copyright and Related Rights in the Work (i) in all territories worldwide, (ii) for the maximum duration provided by applicable law or treaty (including future time extensions), (iii) in any current or future medium and for any number of copies, and (iv) for any purpose whatsoever, including without limitation commercial, advertising or promotional purposes (the "License"). The License shall be deemed effective as of the date CC0 was applied by Affirmer to the Work. Should any part of the License for any reason be judged legally invalid or ineffective under applicable law, such partial invalidity or ineffectiveness shall not invalidate the remainder of the License, and in such case Affirmer hereby affirms that he or she will not (i) exercise any of his or her remaining Copyright and Related Rights in the Work or (ii) assert any associated claims and causes of action with respect to the Work, in either case contrary to Affirmer's express Statement of Purpose. 4. Limitations and Disclaimers. a. 
No trademark or patent rights held by Affirmer are waived, abandoned, surrendered, licensed or otherwise affected by this document. b. Affirmer offers the Work as-is and makes no representations or warranties of any kind concerning the Work, express, implied, statutory or otherwise, including without limitation warranties of title, merchantability, fitness for a particular purpose, non infringement, or the absence of latent or other defects, accuracy, or the present or absence of errors, whether or not discoverable, all to the greatest extent permissible under applicable law. c. Affirmer disclaims responsibility for clearing rights of other persons that may apply to the Work or any use thereof, including without limitation any person's Copyright and Related Rights in the Work. Further, Affirmer disclaims responsibility for obtaining any necessary consents, permissions or other rights required for any use of the Work. d. Affirmer understands and acknowledges that Creative Commons is not a party to this document and has no duty or obligation with respect to this CC0 or use of the Work. ================================================ FILE: third-party/blake3/README.md ================================================ # BLAKE3 BLAKE3 is a cryptographic hash function that is: - **Much faster** than MD5, SHA-1, SHA-2, SHA-3, and BLAKE2. - **Secure**, unlike MD5 and SHA-1. And secure against length extension, unlike SHA-2. - **Highly parallelizable** across any number of threads and SIMD lanes, because it's a Merkle tree on the inside. - Capable of **verified streaming** and **incremental updates**, again because it's a Merkle tree. - A **PRF**, **MAC**, **KDF**, and **XOF**, as well as a regular hash. - **One algorithm with no variants**, which is fast on x86-64 and also on smaller architectures. 
The [chart below](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/benchmarks/bar_chart.py) is an example benchmark of 16 KiB inputs on a Cascade Lake-SP 8275CL server CPU from 2019. For more detailed benchmarks, see the [BLAKE3 paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf).

performance graph

BLAKE3 is based on an optimized instance of the established hash function [BLAKE2](https://blake2.net) and on the [original Bao tree mode](https://github.com/oconnor663/bao/blob/master/docs/spec_0.9.1.md). The specifications and design rationale are available in the [BLAKE3 paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). The default output size is 256 bits. The current version of [Bao](https://github.com/oconnor663/bao) implements verified streaming with BLAKE3. This repository is the official implementation of BLAKE3. It includes: * The [`blake3`](https://crates.io/crates/blake3) Rust crate, which includes optimized implementations for SSE2, SSE4.1, AVX2, AVX-512, NEON, and WASM, with automatic runtime CPU feature detection on x86. The `rayon` feature provides multithreading. * The [`b3sum`](https://crates.io/crates/b3sum) Rust crate, which provides a command line interface. It uses multithreading by default, making it an order of magnitude faster than e.g. `sha256sum` on typical desktop hardware. * The [C implementation](c), which like the Rust implementation includes SIMD optimizations (all except WASM), CPU feature detection on x86, and optional multithreading. See [`c/README.md`](c/README.md). * The [Rust reference implementation](reference_impl/reference_impl.rs), which is discussed in Section 5.1 of the [BLAKE3 paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). This implementation is much smaller and simpler than the optimized ones above. If you want to see how BLAKE3 works, or you're writing a port that doesn't need multithreading or SIMD optimizations, start here. Ports of the reference implementation to other languages are hosted in separate repositories ([C](https://github.com/oconnor663/blake3_reference_impl_c), [Python](https://github.com/oconnor663/pure_python_blake3)). 
* A [set of test vectors](https://github.com/BLAKE3-team/BLAKE3/blob/master/test_vectors/test_vectors.json) that covers extended outputs, all three modes, and a variety of input lengths. * [![Actions Status](https://github.com/BLAKE3-team/BLAKE3/workflows/tests/badge.svg)](https://github.com/BLAKE3-team/BLAKE3/actions) BLAKE3 was designed by: * [@oconnor663] (Jack O'Connor) * [@sneves] (Samuel Neves) * [@veorq] (Jean-Philippe Aumasson) * [@zookozcash] (Zooko) The development of BLAKE3 was sponsored by [Electric Coin Company](https://electriccoin.co). BLAKE3 is also [specified](https://c2sp.org/BLAKE3) in the [Community Cryptography Specification Project (C2SP)](https://c2sp.org). *NOTE: BLAKE3 is not a password hashing algorithm, because it's designed to be fast, whereas password hashing should not be fast. If you hash passwords to store the hashes or if you derive keys from passwords, we recommend [Argon2](https://github.com/P-H-C/phc-winner-argon2).* ## Usage ### The `b3sum` utility The `b3sum` command line utility prints the BLAKE3 hashes of files or of standard input. Prebuilt binaries are available for Linux, Windows, and macOS (requiring the [unidentified developer workaround](https://support.apple.com/guide/mac-help/open-a-mac-app-from-an-unidentified-developer-mh40616/mac)) on the [releases page](https://github.com/BLAKE3-team/BLAKE3/releases). If you've [installed Rust and Cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html), you can also build `b3sum` yourself with: ```bash cargo install b3sum ``` If `rustup` didn't configure your `PATH` for you, you might need to go looking for the installed binary in e.g. `~/.cargo/bin`. You can test out how fast BLAKE3 is on your machine by creating a big file and hashing it, for example: ```bash # Create a 1 GB file. head -c 1000000000 /dev/zero > /tmp/bigfile # Hash it with SHA-256. time openssl sha256 /tmp/bigfile # Hash it with BLAKE3. 
time b3sum /tmp/bigfile ``` ### The `blake3` crate [![docs.rs](https://docs.rs/blake3/badge.svg)](https://docs.rs/blake3) To use BLAKE3 from Rust code, add a dependency on the `blake3` crate to your `Cargo.toml`. Here's an example of hashing some input bytes: ```rust // Hash an input all at once. let hash1 = blake3::hash(b"foobarbaz"); // Hash an input incrementally. let mut hasher = blake3::Hasher::new(); hasher.update(b"foo"); hasher.update(b"bar"); hasher.update(b"baz"); let hash2 = hasher.finalize(); assert_eq!(hash1, hash2); // Extended output. OutputReader also implements Read and Seek. let mut output = [0; 1000]; let mut output_reader = hasher.finalize_xof(); output_reader.fill(&mut output); assert_eq!(hash1, output[..32]); // Print a hash as hex. println!("{}", hash1); ``` Besides `hash`, BLAKE3 provides two other modes, `keyed_hash` and `derive_key`. The `keyed_hash` mode takes a 256-bit key: ```rust // MAC an input all at once. let example_key = [42u8; 32]; let mac1 = blake3::keyed_hash(&example_key, b"example input"); // MAC incrementally. let mut hasher = blake3::Hasher::new_keyed(&example_key); hasher.update(b"example input"); let mac2 = hasher.finalize(); assert_eq!(mac1, mac2); ``` The `derive_key` mode takes a context string and some key material (not a password). The context string should be hardcoded, globally unique, and application-specific. A good default format for the context string is `"[application] [commit timestamp] [purpose]"`: ```rust // Derive a couple of subkeys for different purposes. 
const EMAIL_CONTEXT: &str = "BLAKE3 example 2020-01-07 17:10:44 email key"; const API_CONTEXT: &str = "BLAKE3 example 2020-01-07 17:11:21 API key"; let input_key_material = b"usually at least 32 random bytes, not a password"; let email_key = blake3::derive_key(EMAIL_CONTEXT, input_key_material); let api_key = blake3::derive_key(API_CONTEXT, input_key_material); assert_ne!(email_key, api_key); ``` ### The C implementation See [`c/README.md`](c/README.md). ### Other implementations We post links to third-party bindings and implementations on the [@BLAKE3team Twitter account](https://twitter.com/BLAKE3team) whenever we hear about them. Some highlights include [an optimized Go implementation](https://github.com/zeebo/blake3), [Wasm bindings for Node.js and browsers](https://github.com/connor4312/blake3), [binary wheels for Python](https://github.com/oconnor663/blake3-py), [.NET bindings](https://github.com/xoofx/Blake3.NET), and [JNI bindings](https://github.com/sken77/BLAKE3jni). ## Contributing Please see [CONTRIBUTING.md](CONTRIBUTING.md). ## Licenses This work is released into the public domain with [CC0 1.0](./LICENSE_CC0). 
Alternatively, it is licensed under any of the following: * [Apache 2.0](./LICENSE_A2) * [Apache 2.0 with LLVM exceptions](./LICENSE_A2LLVM) ## Adoption & deployment * [Bazel](https://github.com/bazelbuild/bazel/releases/tag/6.4.0) * [Cargo](https://github.com/rust-lang/cargo/pull/14137) * [Ccache](https://github.com/ccache/ccache/pull/519) * [Chia](https://github.com/Chia-Network/chia-blockchain/blob/main/CHANGELOG.md#10beta8-aka-beta-18---2020-07-16) * [Clickhouse](https://github.com/ClickHouse/ClickHouse/blob/master/rust/chcache/Cargo.toml#L7) * [Farcaster](https://www.farcaster.xyz/) * [IPFS](https://github.com/ipfs/go-verifcid/issues/13) * [Iroh](https://www.iroh.computer/blog/blake3-hazmat-api) * [LLVM](https://reviews.llvm.org/D121510) * [Nix](https://github.com/NixOS/nix/pull/12379) * [Nym](https://github.com/nymtech/nym/blob/59056a22c5e6b01a38da2124662bd1fa3c8abef2/common/nymsphinx/params/src/lib.rs#L5) * [OpenZFS](https://github.com/openzfs/zfs/) * [Redox](https://www.redox-os.org/news/pkgar-introduction/) * [Solana](https://docs.rs/solana-program/1.9.5/solana_program/blake3/index.html) * [Tekken 8](https://x.com/rodarmor/status/1751567502050771189) * [Wasmer](https://github.com/wasmerio/wasmer/blob/4f935a8c162bf604df223003e434e4f7ca253688/lib/cache/src/hash.rs#L21) ## Miscellany - [@veorq] and [@oconnor663] did [an interview with Cryptography FM](https://www.cryptography.fm/3). - [@oconnor663] did [an interview with Saito](https://www.youtube.com/watch?v=cJkmIt7yN_E). 
[@oconnor663]: https://github.com/oconnor663 [@sneves]: https://github.com/sneves [@veorq]: https://github.com/veorq [@zookozcash]: https://github.com/zookozcash ================================================ FILE: third-party/blake3/b3sum/.gitignore ================================================ !Cargo.lock ================================================ FILE: third-party/blake3/b3sum/Cargo.lock ================================================ # This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 4 [[package]] name = "anstream" version = "0.6.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", "is_terminal_polyfill", "utf8parse", ] [[package]] name = "anstyle" version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" [[package]] name = "anstyle-parse" version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" dependencies = [ "windows-sys", ] [[package]] name = "anstyle-wincon" version = "3.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e" dependencies = [ "anstyle", "once_cell", "windows-sys", ] [[package]] name = "anyhow" version = "1.0.98" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" [[package]] name = 
"arrayref" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" [[package]] name = "arrayvec" version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "b3sum" version = "1.8.2" dependencies = [ "anyhow", "blake3", "clap", "duct", "hex", "rayon-core", "tempfile", "wild", ] [[package]] name = "bitflags" version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" [[package]] name = "blake3" version = "1.8.2" dependencies = [ "arrayref", "arrayvec", "cc", "cfg-if", "constant_time_eq", "memmap2", "rayon-core", ] [[package]] name = "cc" version = "1.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e3a13707ac958681c13b39b458c073d0d9bc8a22cb1b2f4c8e55eb72c13f362" dependencies = [ "shlex", ] [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" version = "4.5.37" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eccb054f56cbd38340b380d4a8e69ef1f02f1af43db2f0cc817a4774d80ae071" dependencies = [ "clap_builder", "clap_derive", ] [[package]] name = "clap_builder" version = "4.5.37" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "efd9466fac8543255d3b1fcad4762c5e116ffe808c8a3043d4263cd4fd4862a2" dependencies = [ "anstream", "anstyle", "clap_lex", "strsim", "terminal_size", ] [[package]] name = "clap_derive" version = "4.5.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09176aae279615badda0765c0c0b3f6ed53f4709118af73cf4655d85d1530cd7" dependencies = [ "heck", "proc-macro2", 
"quote", "syn", ] [[package]] name = "clap_lex" version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" [[package]] name = "colorchoice" version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" [[package]] name = "constant_time_eq" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" [[package]] name = "crossbeam-deque" version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" dependencies = [ "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ "crossbeam-utils", ] [[package]] name = "crossbeam-utils" version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "duct" version = "0.13.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4ab5718d1224b63252cd0c6f74f6480f9ffeb117438a2e0f5cf6d9a4798929c" dependencies = [ "libc", "once_cell", "os_pipe", "shared_child", ] [[package]] name = "errno" version = "0.3.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "976dd42dc7e85965fe702eb8164f21f450704bdde31faefd6471dba214cb594e" dependencies = [ "libc", "windows-sys", ] [[package]] name = "fastrand" version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = 
"getrandom" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0" dependencies = [ "cfg-if", "libc", "r-efi", "wasi", ] [[package]] name = "glob" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" [[package]] name = "hex" version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] name = "is_terminal_polyfill" version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" [[package]] name = "libc" version = "0.2.172" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" [[package]] name = "linux-raw-sys" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" [[package]] name = "memmap2" version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f" dependencies = [ "libc", ] [[package]] name = "once_cell" version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "os_pipe" version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ffd2b0a5634335b135d5728d84c5e0fd726954b87111f7506a61c502280d982" dependencies = [ 
"libc", "windows-sys", ] [[package]] name = "proc-macro2" version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" dependencies = [ "unicode-ident", ] [[package]] name = "quote" version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" dependencies = [ "proc-macro2", ] [[package]] name = "r-efi" version = "5.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" [[package]] name = "rayon-core" version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" dependencies = [ "crossbeam-deque", "crossbeam-utils", ] [[package]] name = "rustix" version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d97817398dd4bb2e6da002002db259209759911da105da92bec29ccb12cf58bf" dependencies = [ "bitflags", "errno", "libc", "linux-raw-sys", "windows-sys", ] [[package]] name = "shared_child" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09fa9338aed9a1df411814a5b2252f7cd206c55ae9bf2fa763f8de84603aa60c" dependencies = [ "libc", "windows-sys", ] [[package]] name = "shlex" version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "strsim" version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" version = "2.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" 
dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "tempfile" version = "3.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7437ac7763b9b123ccf33c338a5cc1bac6f69b45a136c19bdd8a65e3916435bf" dependencies = [ "fastrand", "getrandom", "once_cell", "rustix", "windows-sys", ] [[package]] name = "terminal_size" version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "45c6481c4829e4cc63825e62c49186a34538b7b2750b73b266581ffb612fb5ed" dependencies = [ "rustix", "windows-sys", ] [[package]] name = "unicode-ident" version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" [[package]] name = "utf8parse" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "wasi" version = "0.14.2+wasi-0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" dependencies = [ "wit-bindgen-rt", ] [[package]] name = "wild" version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3131afc8c575281e1e80f36ed6a092aa502c08b18ed7524e86fbbb12bb410e1" dependencies = [ "glob", ] [[package]] name = "windows-sys" version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ "windows-targets", ] [[package]] name = "windows-targets" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", "windows_i686_gnu", "windows_i686_gnullvm", "windows_i686_msvc", "windows_x86_64_gnu", 
"windows_x86_64_gnullvm", "windows_x86_64_msvc", ] [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_i686_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "wit-bindgen-rt" version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" dependencies = [ "bitflags", ] ================================================ FILE: third-party/blake3/b3sum/Cargo.toml 
================================================ [package] name = "b3sum" version = "1.8.2" authors = ["Jack O'Connor "] description = "a command line implementation of the BLAKE3 hash function" repository = "https://github.com/BLAKE3-team/BLAKE3" license = "CC0-1.0 OR Apache-2.0 OR Apache-2.0 WITH LLVM-exception" readme = "README.md" edition = "2021" [features] neon = ["blake3/neon"] prefer_intrinsics = ["blake3/prefer_intrinsics"] pure = ["blake3/pure"] [dependencies] anyhow = "1.0.25" blake3 = { version = "1.8", path = "..", features = ["mmap", "rayon"] } clap = { version = "4.0.8", features = ["derive", "wrap_help"] } hex = "0.4.0" rayon-core = "1.12.1" wild = "2.0.3" [dev-dependencies] duct = "0.13.3" tempfile = "3.1.0" ================================================ FILE: third-party/blake3/b3sum/README.md ================================================ # b3sum A command line utility for calculating [BLAKE3](https://github.com/BLAKE3-team/BLAKE3) hashes, similar to Coreutils tools like `b2sum` or `md5sum`. ``` Usage: b3sum [OPTIONS] [FILE]... Arguments: [FILE]... Files to hash, or checkfiles to check Options: --keyed Use the keyed mode, reading the 32-byte key from stdin --derive-key <CONTEXT> Use the key derivation mode, with the given context string -l, --length <LEN> The number of output bytes, before hex encoding [default: 32] --seek <SEEK> The starting output byte offset, before hex encoding [default: 0] --num-threads <NUM> The maximum number of threads to use --no-mmap Disable memory mapping --no-names Omit filenames in the output --raw Write raw output bytes to stdout, rather than hex --tag Output BSD-style checksums: BLAKE3 ([FILE]) = [HASH] -c, --check Read BLAKE3 sums from the [FILE]s and check them --quiet Skip printing OK for each checked file -h, --help Print help (see more with '--help') -V, --version Print version ``` See also [this document about how the `--check` flag works](https://github.com/BLAKE3-team/BLAKE3/blob/master/b3sum/what_does_check_do.md). 
# Example Hash the file `foo.txt`: ```bash b3sum foo.txt ``` Time hashing a gigabyte of data, to see how fast it is: ```bash # Create a 1 GB file. head -c 1000000000 /dev/zero > /tmp/bigfile # Hash it with SHA-256. time openssl sha256 /tmp/bigfile # Hash it with BLAKE3. time b3sum /tmp/bigfile ``` # Installation Prebuilt binaries are available for Linux, Windows, and macOS (requiring the [unidentified developer workaround](https://support.apple.com/guide/mac-help/open-a-mac-app-from-an-unidentified-developer-mh40616/mac)) on the [releases page](https://github.com/BLAKE3-team/BLAKE3/releases). If you've [installed Rust and Cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html), you can also build `b3sum` yourself with: ``` cargo install b3sum ``` On Linux for example, Cargo will put the compiled binary in `~/.cargo/bin`. You might want to add that directory to your `$PATH`, or `rustup` might have done it for you when you installed Cargo. If you want to install directly from this directory, you can run `cargo install --path .`. Or you can just build with `cargo build --release`, which puts the binary at `./target/release/b3sum`.
================================================
FILE: third-party/blake3/b3sum/src/main.rs
================================================
use anyhow::{bail, ensure};
use clap::Parser;
use std::cmp;
use std::fs::File;
use std::io;
use std::io::prelude::*;
use std::path::{Path, PathBuf};

#[cfg(test)]
mod unit_tests;

// Program name used as the prefix of diagnostics written to stderr.
const NAME: &str = "b3sum";

// Argument identifiers referenced by the `conflicts_with`/`requires`
// attributes on `Inner` below.
const DERIVE_KEY_ARG: &str = "derive_key";
const KEYED_ARG: &str = "keyed";
const LENGTH_ARG: &str = "length";
const NO_NAMES_ARG: &str = "no_names";
const RAW_ARG: &str = "raw";
const TAG_ARG: &str = "tag";
const CHECK_ARG: &str = "check";

// The raw command line, as parsed by clap. NOTE: the `///` doc comments on the
// fields below are turned into the --help text by clap's derive macro, so they
// are user-facing strings, not just documentation.
#[derive(Parser)]
#[command(version, max_term_width(100))]
struct Inner {
    /// Files to hash, or checkfiles to check
    ///
    /// When no file is given, or when - is given, read standard input.
    file: Vec<PathBuf>,

    /// Use the keyed mode, reading the 32-byte key from stdin
    #[arg(long, requires("file"))]
    keyed: bool,

    /// Use the key derivation mode, with the given context string
    ///
    /// Cannot be used with --keyed.
    #[arg(long, value_name("CONTEXT"), conflicts_with(KEYED_ARG))]
    derive_key: Option<String>,

    /// The number of output bytes, before hex encoding
    #[arg(
        short,
        long,
        default_value_t = blake3::OUT_LEN as u64,
        value_name("LEN")
    )]
    length: u64,

    /// The starting output byte offset, before hex encoding
    #[arg(long, default_value_t = 0, value_name("SEEK"))]
    seek: u64,

    /// The maximum number of threads to use
    ///
    /// By default, this is the number of logical cores. If this flag is
    /// omitted, or if its value is 0, RAYON_NUM_THREADS is also respected.
    #[arg(long, value_name("NUM"))]
    num_threads: Option<usize>,

    /// Disable memory mapping
    ///
    /// Currently this also disables multithreading.
    #[arg(long)]
    no_mmap: bool,

    /// Omit filenames in the output
    #[arg(long)]
    no_names: bool,

    /// Write raw output bytes to stdout, rather than hex
    ///
    /// --no-names is implied. In this case, only a single input is allowed.
    #[arg(long)]
    raw: bool,

    /// Output BSD-style checksums: BLAKE3 ([FILE]) = [HASH]
    #[arg(long)]
    tag: bool,

    /// Read BLAKE3 sums from the [FILE]s and check them
    #[arg(
        short,
        long,
        conflicts_with(DERIVE_KEY_ARG),
        conflicts_with(KEYED_ARG),
        conflicts_with(LENGTH_ARG),
        conflicts_with(RAW_ARG),
        conflicts_with(TAG_ARG),
        conflicts_with(NO_NAMES_ARG)
    )]
    check: bool,

    /// Skip printing OK for each checked file
    ///
    /// Must be used with --check.
    #[arg(long, requires(CHECK_ARG))]
    quiet: bool,
}

// The parsed command line plus the state derived from it: the effective list
// of inputs (defaulting to `-`, i.e. stdin) and a hasher already configured
// for the selected mode, which is cloned for every input.
struct Args {
    inner: Inner,
    file_args: Vec<PathBuf>,
    base_hasher: blake3::Hasher,
}

impl Args {
    // Parses the process arguments and builds the base hasher. In keyed mode
    // this reads the key from stdin. Fails if --raw is combined with more than
    // one input or if the key cannot be read.
    fn parse() -> anyhow::Result<Self> {
        // wild::args_os() is equivalent to std::env::args_os() on Unix,
        // but on Windows it adds support for globbing.
        let inner = Inner::parse_from(wild::args_os());
        let file_args = if !inner.file.is_empty() {
            inner.file.clone()
        } else {
            vec!["-".into()]
        };
        if inner.raw && file_args.len() > 1 {
            bail!("Only one filename can be provided when using --raw");
        }
        let base_hasher = if inner.keyed {
            // In keyed mode, since stdin is used for the key, we can't handle
            // `-` arguments. hash_path() rejects that case below.
            blake3::Hasher::new_keyed(&read_key_from_stdin()?)
        } else if let Some(ref context) = inner.derive_key {
            blake3::Hasher::new_derive_key(context)
        } else {
            blake3::Hasher::new()
        };
        Ok(Self {
            inner,
            file_args,
            base_hasher,
        })
    }

    fn num_threads(&self) -> Option<usize> {
        self.inner.num_threads
    }

    fn check(&self) -> bool {
        self.inner.check
    }

    fn raw(&self) -> bool {
        self.inner.raw
    }

    fn tag(&self) -> bool {
        self.inner.tag
    }

    fn no_mmap(&self) -> bool {
        self.inner.no_mmap
    }

    fn no_names(&self) -> bool {
        self.inner.no_names
    }

    fn len(&self) -> u64 {
        self.inner.length
    }

    fn seek(&self) -> u64 {
        self.inner.seek
    }

    fn keyed(&self) -> bool {
        self.inner.keyed
    }

    fn quiet(&self) -> bool {
        self.inner.quiet
    }
}

// Hashes one input (`-` means stdin) with a clone of the base hasher and
// returns an output reader positioned at the --seek offset. Fails on I/O
// errors, or when `-` is requested in keyed mode (stdin already supplied the
// key).
fn hash_path(args: &Args, path: &Path) -> anyhow::Result<blake3::OutputReader> {
    let mut hasher = args.base_hasher.clone();
    if path == Path::new("-") {
        if args.keyed() {
            bail!("Cannot open `-` in keyed mode");
        }
        hasher.update_reader(io::stdin().lock())?;
    } else if args.no_mmap() {
        hasher.update_reader(File::open(path)?)?;
    } else {
        // The fast path: Try to mmap the file and hash it with multiple threads.
        hasher.update_mmap_rayon(path)?;
    }
    let mut output_reader = hasher.finalize_xof();
    output_reader.set_position(args.seek());
    Ok(output_reader)
}

// Prints the first `args.len()` output bytes to stdout as lowercase hex, with
// no trailing newline.
fn write_hex_output(mut output: blake3::OutputReader, args: &Args) -> anyhow::Result<()> {
    // Encoding multiples of the 64 bytes is most efficient.
    // TODO: This computes each output block twice when the --seek argument isn't a multiple of 64.
    // We'll refactor all of this soon anyway, once SIMD optimizations are available for the XOF.
    let mut len = args.len();
    let mut block = [0; blake3::BLOCK_LEN];
    while len > 0 {
        output.fill(&mut block);
        let hex_str = hex::encode(&block[..]);
        // The final block may be only partially needed; each byte is two hex digits.
        let take_bytes = cmp::min(len, block.len() as u64);
        print!("{}", &hex_str[..2 * take_bytes as usize]);
        len -= take_bytes;
    }
    Ok(())
}

// Copies the first `args.len()` raw output bytes to stdout.
fn write_raw_output(output: blake3::OutputReader, args: &Args) -> anyhow::Result<()> {
    let mut output = output.take(args.len());
    let stdout = std::io::stdout();
    let mut handler = stdout.lock();
    std::io::copy(&mut output, &mut handler)?;
    Ok(())
}

// Reads exactly KEY_LEN bytes from stdin. One extra byte is requested so that
// over-long input can be detected and rejected rather than silently truncated.
fn read_key_from_stdin() -> anyhow::Result<[u8; blake3::KEY_LEN]> {
    let mut bytes = Vec::with_capacity(blake3::KEY_LEN + 1);
    let n = std::io::stdin()
        .lock()
        .take(blake3::KEY_LEN as u64 + 1)
        .read_to_end(&mut bytes)?;
    if n < blake3::KEY_LEN {
        bail!(
            "expected {} key bytes from stdin, found {}",
            blake3::KEY_LEN,
            n,
        )
    } else if n > blake3::KEY_LEN {
        bail!("read more than {} key bytes from stdin", blake3::KEY_LEN)
    } else {
        Ok(bytes[..blake3::KEY_LEN].try_into().unwrap())
    }
}

// A filepath rendered for output, together with a flag saying whether
// backslash escapes were applied to it.
struct FilepathString {
    filepath_string: String,
    is_escaped: bool,
}

// Converts a path to the string printed next to its hash. Invalid UTF-8 is
// replaced lossily, and backslash, newline and carriage return are escaped;
// `is_escaped` reports whether any escaping happened.
fn filepath_to_string(filepath: &Path) -> FilepathString {
    let unicode_cow = filepath.to_string_lossy();
    let mut filepath_string = unicode_cow.to_string();
    // If we're on Windows, normalize backslashes to forward slashes. This
    // avoids a lot of ugly escaping in the common case, and it makes
    // checkfiles created on Windows more likely to be portable to Unix. It
    // also allows us to set a blanket "no backslashes allowed in checkfiles on
    // Windows" rule, rather than allowing a Unix backslash to potentially get
    // interpreted as a directory separator on Windows.
    if cfg!(windows) {
        filepath_string = filepath_string.replace('\\', "/");
    }
    let mut is_escaped = false;
    if filepath_string.contains(['\\', '\n', '\r']) {
        // Backslashes must be escaped first so that the backslashes introduced
        // by the other two replacements aren't doubled.
        filepath_string = filepath_string
            .replace('\\', "\\\\")
            .replace('\n', "\\n")
            .replace('\r', "\\r");
        is_escaped = true;
    }
    FilepathString {
        filepath_string,
        is_escaped,
    }
}

// Decodes one lowercase hex digit into its value (0..=15).
fn hex_half_byte(c: char) -> anyhow::Result<u8> {
    // The hex characters in the hash must be lowercase for now, though we
    // could support uppercase too if we wanted to.
    if '0' <= c && c <= '9' {
        return Ok(c as u8 - '0' as u8);
    }
    if 'a' <= c && c <= 'f' {
        return Ok(c as u8 - 'a' as u8 + 10);
    }
    bail!("Invalid hex");
}

// The `check` command is a security tool. That means it's much better for a
// check to fail more often than it should (a false negative), than for a check
// to ever succeed when it shouldn't (a false positive). By forbidding certain
// characters in checked filepaths, we avoid a class of false positives where
// two different filepaths can get confused with each other.
fn check_for_invalid_characters(utf8_path: &str) -> anyhow::Result<()> {
    // Null characters in paths should never happen, but they can result in a
    // path getting silently truncated on Unix.
    if utf8_path.contains('\0') {
        bail!("Null character in path");
    }
    // Because we convert invalid UTF-8 sequences in paths to the Unicode
    // replacement character, multiple different invalid paths can map to the
    // same UTF-8 string.
    if utf8_path.contains('�') {
        bail!("Unicode replacement character in path");
    }
    // We normalize all Windows backslashes to forward slashes in our output,
    // so the only natural way to get a backslash in a checkfile on Windows is
    // to construct it on Unix and copy it over. (Or of course you could just
    // doctor it by hand.) To avoid confusing this with a directory separator,
    // we forbid backslashes entirely on Windows. Note that this check comes
    // after unescaping has been done.
    if cfg!(windows) && utf8_path.contains('\\') {
        bail!("Backslash in path");
    }
    Ok(())
}

// Reverses the escaping done by filepath_to_string(): `\n`, `\r` and `\\` are
// decoded, and any other or trailing backslash is an error.
fn unescape(mut path: &str) -> anyhow::Result<String> {
    let mut unescaped = String::with_capacity(2 * path.len());
    while let Some(i) = path.find('\\') {
        // A backslash must be followed by at least one more character.
        ensure!(i < path.len() - 1, "Invalid backslash escape");
        unescaped.push_str(&path[..i]);
        match path[i + 1..].chars().next().unwrap() {
            // Anything other than a recognized escape sequence is an error.
            'n' => unescaped.push_str("\n"),
            'r' => unescaped.push_str("\r"),
            '\\' => unescaped.push_str("\\"),
            _ => bail!("Invalid backslash escape"),
        }
        // Both the backslash and the (single-byte) escape character are consumed.
        path = &path[i + 2..];
    }
    unescaped.push_str(path);
    Ok(unescaped)
}

// One successfully parsed checkfile line. `file_string` is the filepath
// exactly as written in the checkfile (still escaped, if it was), while
// `file_path` is the unescaped path to open.
#[derive(Debug)]
struct ParsedCheckLine {
    file_string: String,
    is_escaped: bool,
    file_path: PathBuf,
    expected_hash: blake3::Hash,
}

// Splits an untagged line into (hash, file).
fn split_untagged_check_line(line_after_slash: &str) -> Option<(&str, &str)> {
    // Of the form "<hash>  <file>". The file might contain "  ", so we need to split from the
    // left.
    line_after_slash.split_once("  ")
}

// Splits a tagged (--tag) line into (file, hash). Note the order is the
// opposite of split_untagged_check_line().
fn split_tagged_check_line(line_after_slash: &str) -> Option<(&str, &str)> {
    // Of the form "BLAKE3 (<file>) = <hash>". The file might contain ") = ", so we need to split
    // from the *right*.
    let prefix = "BLAKE3 (";
    if !line_after_slash.starts_with(prefix) {
        return None;
    }
    line_after_slash[prefix.len()..].rsplit_once(") = ")
}

// Parses one checkfile line in either the untagged or the tagged format.
// Returns an error for empty lines, unrecognized formats, hashes that aren't
// exactly 64 lowercase hex digits, invalid escapes, empty paths, and paths
// containing forbidden characters.
fn parse_check_line(mut line: &str) -> anyhow::Result<ParsedCheckLine> {
    // Trim off the trailing newlines, if any.
    line = line.trim_end_matches(['\r', '\n']);
    // If there's a backslash at the front of the line, that means we need to
    // unescape the path below. This matches the behavior of e.g. md5sum.
    let Some(first) = line.chars().next() else {
        bail!("Empty line");
    };
    let line_after_slash;
    let is_escaped;
    if first == '\\' {
        is_escaped = true;
        line_after_slash = &line[1..];
    } else {
        is_escaped = false;
        line_after_slash = line;
    }

    // Split the line. It might be "<hash>  <file>" or "BLAKE3 (<file>) = <hash>". The latter comes
    // from the --tag flag.
    let hash_hex;
    let file_str;
    if let Some((left, right)) = split_untagged_check_line(line_after_slash) {
        hash_hex = left;
        file_str = right;
    } else if let Some((left, right)) = split_tagged_check_line(line_after_slash) {
        file_str = left;
        hash_hex = right;
    } else {
        bail!("Invalid check line format");
    }

    // Decode the hex hash.
    ensure!(hash_hex.len() == 2 * blake3::OUT_LEN, "Invalid hash length");
    let mut hex_chars = hash_hex.chars();
    let mut hash_bytes = [0; blake3::OUT_LEN];
    for byte in &mut hash_bytes {
        // The length check above counts bytes, not chars, so a multi-byte
        // character can leave fewer chars than we expect. Checkfiles are
        // untrusted input, so report that as invalid hex rather than panicking.
        let (Some(high_char), Some(low_char)) = (hex_chars.next(), hex_chars.next()) else {
            bail!("Invalid hex");
        };
        *byte = 16 * hex_half_byte(high_char)? + hex_half_byte(low_char)?;
    }
    let expected_hash: blake3::Hash = hash_bytes.into();

    // Unescape and validate the filepath.
    let file_path_string = if is_escaped {
        unescape(file_str)?
    } else {
        file_str.to_string()
    };
    ensure!(!file_path_string.is_empty(), "empty file path");
    check_for_invalid_characters(&file_path_string)?;

    Ok(ParsedCheckLine {
        file_string: file_str.to_string(),
        is_escaped,
        file_path: file_path_string.into(),
        expected_hash,
    })
}

// Hashes one input and prints the result to stdout in the format selected by
// --raw, --no-names or --tag, defaulting to "<hash>  <file>". A leading
// backslash marks lines whose filepath had to be escaped.
fn hash_one_input(path: &Path, args: &Args) -> anyhow::Result<()> {
    let output = hash_path(args, path)?;
    if args.raw() {
        write_raw_output(output, args)?;
        return Ok(());
    }
    if args.no_names() {
        write_hex_output(output, args)?;
        println!();
        return Ok(());
    }
    let FilepathString {
        filepath_string,
        is_escaped,
    } = filepath_to_string(path);
    if is_escaped {
        print!("\\");
    }
    if args.tag() {
        print!("BLAKE3 ({}) = ", filepath_string);
        write_hex_output(output, args)?;
        println!();
        return Ok(());
    }
    write_hex_output(output, args)?;
    println!("  {}", filepath_string);
    Ok(())
}

// Returns true for success. Having a boolean return value here, instead of
// passing down the files_failed reference, makes it less likely that we might
// forget to set it in some error condition.
fn check_one_line(line: &str, args: &Args) -> bool {
    // A line that cannot be parsed is reported on stderr and counts as a
    // failure.
    let ParsedCheckLine {
        file_string,
        is_escaped,
        file_path,
        expected_hash,
    } = match parse_check_line(line) {
        Ok(parsed) => parsed,
        Err(e) => {
            eprintln!("{}: {}", NAME, e);
            return false;
        }
    };

    // Report the path the way it appears in the checkfile, including the
    // leading backslash marker of escaped lines.
    let file_string = if is_escaped {
        "\\".to_string() + &file_string
    } else {
        file_string
    };

    let found_hash: blake3::Hash = match hash_path(args, &file_path) {
        Ok(mut output) => {
            let mut found_hash_bytes = [0; blake3::OUT_LEN];
            output.fill(&mut found_hash_bytes);
            found_hash_bytes.into()
        }
        Err(e) => {
            println!("{}: FAILED ({})", file_string, e);
            return false;
        }
    };

    // This is a constant-time comparison.
    if expected_hash == found_hash {
        if !args.quiet() {
            println!("{}: OK", file_string);
        }
        true
    } else {
        println!("{}: FAILED", file_string);
        false
    }
}

// Reads a checkfile (or stdin when `path` is "-") line by line, checks every
// line, and adds the number of failed lines to `files_failed`. Only I/O
// errors on the checkfile itself are returned as `Err`.
fn check_one_checkfile(path: &Path, args: &Args, files_failed: &mut u64) -> anyhow::Result<()> {
    // These are declared up front so that whichever reader is chosen outlives
    // the BufReader that borrows it.
    let mut file;
    let stdin;
    let mut stdin_lock;
    let mut bufreader: io::BufReader<&mut dyn Read>;
    if path == Path::new("-") {
        stdin = io::stdin();
        stdin_lock = stdin.lock();
        bufreader = io::BufReader::new(&mut stdin_lock);
    } else {
        file = File::open(path)?;
        bufreader = io::BufReader::new(&mut file);
    }
    let mut line = String::new();
    loop {
        line.clear();
        let n = bufreader.read_line(&mut line)?;
        // Zero bytes read means end of input.
        if n == 0 {
            return Ok(());
        }
        // check_one_line() prints errors and turns them into a success=false
        // return, so it doesn't return a Result.
        let success = check_one_line(&line, args);
        if !success {
            // We use `files_failed > 0` to indicate a mismatch, so it's important for correctness
            // that it's impossible for this counter to overflow.
*files_failed = files_failed.saturating_add(1);
        }
    }
}

// Entry point: parses the arguments, builds the thread pool, then either
// verifies checkfiles (--check) or hashes every input. The process exits with
// status 1 if anything failed and 0 otherwise.
fn main() -> anyhow::Result<()> {
    let args = Args::parse()?;
    let mut thread_pool_builder = rayon_core::ThreadPoolBuilder::new();
    if let Some(num_threads) = args.num_threads() {
        thread_pool_builder = thread_pool_builder.num_threads(num_threads);
    }
    let thread_pool = thread_pool_builder.build()?;
    thread_pool.install(|| {
        let mut files_failed = 0u64;
        // Note that file_args automatically includes `-` if nothing is given.
        for path in &args.file_args {
            if args.check() {
                check_one_checkfile(path, &args, &mut files_failed)?;
            } else {
                // Errors encountered in hashing are tolerated and printed to
                // stderr. This allows e.g. `b3sum *` to print errors for
                // non-files and keep going. However, if we encounter any
                // errors we'll still return non-zero at the end.
                let result = hash_one_input(path, &args);
                if let Err(e) = result {
                    files_failed = files_failed.saturating_add(1);
                    eprintln!("{}: {}: {}", NAME, path.to_string_lossy(), e);
                }
            }
        }
        if args.check() && files_failed > 0 {
            eprintln!(
                "{}: WARNING: {} computed checksum{} did NOT match",
                NAME,
                files_failed,
                if files_failed == 1 { "" } else { "s" },
            );
        }
        std::process::exit(if files_failed > 0 { 1 } else { 0 });
    })
}

#[cfg(test)]
mod test {
    use clap::CommandFactory;

    // Lets clap validate the declared command-line interface.
    #[test]
    fn test_args() {
        crate::Inner::command().debug_assert();
    }
}

================================================
FILE: third-party/blake3/b3sum/src/unit_tests.rs
================================================
use std::path::Path;

#[test]
fn test_parse_check_line() {
    // =========================
    // ===== Success Cases =====
    // =========================

    // the basic case
    let crate::ParsedCheckLine {
        file_string,
        is_escaped,
        file_path,
        expected_hash,
    } = crate::parse_check_line(
        "0909090909090909090909090909090909090909090909090909090909090909  foo",
    )
    .unwrap();
    assert_eq!(expected_hash, blake3::Hash::from([0x09; 32]));
    assert!(!is_escaped);
    assert_eq!(file_string, "foo");
    assert_eq!(file_path, Path::new("foo"));
// regular whitespace let crate::ParsedCheckLine { file_string, is_escaped, file_path, expected_hash, } = crate::parse_check_line( "fafafafafafafafafafafafafafafafafafafafafafafafafafafafafafafafa \t\r\n\n\r \t\r\n\n\r", ) .unwrap(); assert_eq!(expected_hash, blake3::Hash::from([0xfa; 32])); assert!(!is_escaped); assert_eq!(file_string, " \t\r\n\n\r \t"); assert_eq!(file_path, Path::new(" \t\r\n\n\r \t")); // path is one space let crate::ParsedCheckLine { file_string, is_escaped, file_path, expected_hash, } = crate::parse_check_line( "4242424242424242424242424242424242424242424242424242424242424242 ", ) .unwrap(); assert_eq!(expected_hash, blake3::Hash::from([0x42; 32])); assert!(!is_escaped); assert_eq!(file_string, " "); assert_eq!(file_path, Path::new(" ")); // *Unescaped* backslashes. Note that this line does *not* start with a // backslash, so something like "\" + "n" is interpreted as *two* // characters. We forbid all backslashes on Windows, so this test is // Unix-only. if cfg!(not(windows)) { let crate::ParsedCheckLine { file_string, is_escaped, file_path, expected_hash, } = crate::parse_check_line( "4343434343434343434343434343434343434343434343434343434343434343 fo\\a\\no", ) .unwrap(); assert_eq!(expected_hash, blake3::Hash::from([0x43; 32])); assert!(!is_escaped); assert_eq!(file_string, "fo\\a\\no"); assert_eq!(file_path, Path::new("fo\\a\\no")); } // escaped newlines let crate::ParsedCheckLine { file_string, is_escaped, file_path, expected_hash, } = crate::parse_check_line( "\\4444444444444444444444444444444444444444444444444444444444444444 fo\\r\\n\\n\\ro", ) .unwrap(); assert_eq!(expected_hash, blake3::Hash::from([0x44; 32])); assert!(is_escaped); assert_eq!(file_string, "fo\\r\\n\\n\\ro"); assert_eq!(file_path, Path::new("fo\r\n\n\ro")); // Escaped newline and backslash. Again because backslash is not allowed on // Windows, this test is Unix-only. 
if cfg!(not(windows)) { let crate::ParsedCheckLine { file_string, is_escaped, file_path, expected_hash, } = crate::parse_check_line( "\\4545454545454545454545454545454545454545454545454545454545454545 fo\\n\\\\o", ) .unwrap(); assert_eq!(expected_hash, blake3::Hash::from([0x45; 32])); assert!(is_escaped); assert_eq!(file_string, "fo\\n\\\\o"); assert_eq!(file_path, Path::new("fo\n\\o")); } // non-ASCII path let crate::ParsedCheckLine { file_string, is_escaped, file_path, expected_hash, } = crate::parse_check_line( "4646464646464646464646464646464646464646464646464646464646464646 否认", ) .unwrap(); assert_eq!(expected_hash, blake3::Hash::from([0x46; 32])); assert!(!is_escaped); assert_eq!(file_string, "否认"); assert_eq!(file_path, Path::new("否认")); // untagged separator " " in the file name let crate::ParsedCheckLine { file_string, is_escaped, file_path, expected_hash, } = crate::parse_check_line( "4747474747474747474747474747474747474747474747474747474747474747 foo bar", ) .unwrap(); assert_eq!(expected_hash, blake3::Hash::from([0x47; 32])); assert!(!is_escaped); assert_eq!(file_string, "foo bar"); assert_eq!(file_path, Path::new("foo bar")); // tagged separator ") = " in the file name let crate::ParsedCheckLine { file_string, is_escaped, file_path, expected_hash, } = crate::parse_check_line( "BLAKE3 (foo) = bar) = 4848484848484848484848484848484848484848484848484848484848484848", ) .unwrap(); assert_eq!(expected_hash, blake3::Hash::from([0x48; 32])); assert!(!is_escaped); assert_eq!(file_string, "foo) = bar"); assert_eq!(file_path, Path::new("foo) = bar")); // ========================= // ===== Failure Cases ===== // ========================= // too short crate::parse_check_line("").unwrap_err(); crate::parse_check_line("0").unwrap_err(); crate::parse_check_line("00").unwrap_err(); crate::parse_check_line("0000000000000000000000000000000000000000000000000000000000000000") .unwrap_err(); 
crate::parse_check_line("0000000000000000000000000000000000000000000000000000000000000000 ") .unwrap_err(); // not enough spaces crate::parse_check_line("0000000000000000000000000000000000000000000000000000000000000000 foo") .unwrap_err(); // capital letter hex crate::parse_check_line( "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA foo", ) .unwrap_err(); // non-hex hex crate::parse_check_line( "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx foo", ) .unwrap_err(); // non-ASCII hex crate::parse_check_line("你好, 我叫杰克. 认识你很高兴. 要不要吃个香蕉? foo").unwrap_err(); // invalid escape sequence crate::parse_check_line( "\\0000000000000000000000000000000000000000000000000000000000000000 fo\\o", ) .unwrap_err(); // truncated escape sequence crate::parse_check_line( "\\0000000000000000000000000000000000000000000000000000000000000000 foo\\", ) .unwrap_err(); // null char crate::parse_check_line( "0000000000000000000000000000000000000000000000000000000000000000 fo\0o", ) .unwrap_err(); // Unicode replacement char crate::parse_check_line( "0000000000000000000000000000000000000000000000000000000000000000 fo�o", ) .unwrap_err(); // On Windows only, backslashes are not allowed, escaped or otherwise. if cfg!(windows) { crate::parse_check_line( "0000000000000000000000000000000000000000000000000000000000000000 fo\\o", ) .unwrap_err(); crate::parse_check_line( "\\0000000000000000000000000000000000000000000000000000000000000000 fo\\\\o", ) .unwrap_err(); } } #[test] fn test_filepath_to_string() { let output = crate::filepath_to_string(Path::new("foo")); assert_eq!(output.filepath_string, "foo"); assert!(!output.is_escaped); let output = crate::filepath_to_string(Path::new("f\\ \t\r\noo")); if cfg!(windows) { // We normalize backslashes to forward slashes on Windows. 
assert_eq!(output.filepath_string, "f/ \t\\r\\noo"); } else { assert_eq!(output.filepath_string, "f\\\\ \t\\r\\noo"); } assert!(output.is_escaped); } ================================================ FILE: third-party/blake3/b3sum/tests/cli_tests.rs ================================================ use duct::cmd; use std::ffi::OsString; use std::fs; use std::io::prelude::*; use std::path::PathBuf; pub fn b3sum_exe() -> PathBuf { env!("CARGO_BIN_EXE_b3sum").into() } #[test] fn test_hash_one() { let expected = format!("{} -", blake3::hash(b"foo").to_hex()); let output = cmd!(b3sum_exe()).stdin_bytes("foo").read().unwrap(); assert_eq!(&*expected, output); } #[test] fn test_hash_one_tag() { let expected = format!("BLAKE3 (-) = {}", blake3::hash(b"foo").to_hex()); let output = cmd!(b3sum_exe(), "--tag") .stdin_bytes("foo") .read() .unwrap(); assert_eq!(&*expected, output); } #[test] fn test_hash_one_raw() { let expected = blake3::hash(b"foo").as_bytes().to_owned(); let output = cmd!(b3sum_exe(), "--raw") .stdin_bytes("foo") .stdout_capture() .run() .unwrap() .stdout; assert_eq!(expected, output.as_slice()); } #[test] fn test_hash_many() { let dir = tempfile::tempdir().unwrap(); let file1 = dir.path().join("file1"); fs::write(&file1, b"foo").unwrap(); let file2 = dir.path().join("file2"); fs::write(&file2, b"bar").unwrap(); let output = cmd!(b3sum_exe(), &file1, &file2).read().unwrap(); let foo_hash = blake3::hash(b"foo"); let bar_hash = blake3::hash(b"bar"); let expected = format!( "{} {}\n{} {}", foo_hash.to_hex(), // account for slash normalization on Windows file1.to_string_lossy().replace("\\", "/"), bar_hash.to_hex(), file2.to_string_lossy().replace("\\", "/"), ); assert_eq!(expected, output); let output_no_names = cmd!(b3sum_exe(), "--no-names", &file1, &file2) .read() .unwrap(); let expected_no_names = format!("{}\n{}", foo_hash.to_hex(), bar_hash.to_hex(),); assert_eq!(expected_no_names, output_no_names); } #[test] fn test_hash_many_tag() { let dir = 
tempfile::tempdir().unwrap(); let file1 = dir.path().join("file1"); fs::write(&file1, b"foo").unwrap(); let file2 = dir.path().join("file2"); fs::write(&file2, b"bar").unwrap(); let output = cmd!(b3sum_exe(), "--tag", &file1, &file2).read().unwrap(); let foo_hash = blake3::hash(b"foo"); let bar_hash = blake3::hash(b"bar"); let expected = format!( "BLAKE3 ({}) = {}\nBLAKE3 ({}) = {}", // account for slash normalization on Windows file1.to_string_lossy().replace("\\", "/"), foo_hash.to_hex(), file2.to_string_lossy().replace("\\", "/"), bar_hash.to_hex(), ); assert_eq!(expected, output); } #[test] fn test_missing_files() { let dir = tempfile::tempdir().unwrap(); let file1 = dir.path().join("file1"); fs::write(&file1, b"foo").unwrap(); let file2 = dir.path().join("file2"); fs::write(&file2, b"bar").unwrap(); let output = cmd!(b3sum_exe(), "file1", "missing_file", "file2") .dir(dir.path()) .stdout_capture() .stderr_capture() .unchecked() .run() .unwrap(); assert!(!output.status.success()); let foo_hash = blake3::hash(b"foo"); let bar_hash = blake3::hash(b"bar"); let expected_stdout = format!( "{} file1\n{} file2\n", foo_hash.to_hex(), bar_hash.to_hex(), ); assert_eq!(expected_stdout.as_bytes(), &output.stdout[..]); let bing_error = fs::File::open(dir.path().join("missing_file")).unwrap_err(); let expected_stderr = format!("b3sum: missing_file: {}\n", bing_error.to_string()); assert_eq!(expected_stderr.as_bytes(), &output.stderr[..]); } #[test] fn test_hash_length_and_seek() { let mut expected = [0; 100]; blake3::Hasher::new() .update(b"foo") .finalize_xof() .fill(&mut expected); let output = cmd!(b3sum_exe(), "--raw", "--length=100") .stdin_bytes("foo") .stdout_capture() .run() .unwrap() .stdout; assert_eq!(expected[..], output); let short_output = cmd!(b3sum_exe(), "--raw", "--length=99") .stdin_bytes("foo") .stdout_capture() .run() .unwrap() .stdout; assert_eq!(expected[..99], short_output); let seek1_output = cmd!(b3sum_exe(), "--raw", "--length=99", "--seek=1") 
.stdin_bytes("foo") .stdout_capture() .run() .unwrap() .stdout; assert_eq!(expected[1..], seek1_output); let seek99_output = cmd!(b3sum_exe(), "--raw", "--length=1", "--seek=99") .stdin_bytes("foo") .stdout_capture() .run() .unwrap() .stdout; assert_eq!(expected[99..], seek99_output); } #[test] fn test_keyed() { let key = [42; blake3::KEY_LEN]; let f = tempfile::NamedTempFile::new().unwrap(); f.as_file().write_all(b"foo").unwrap(); f.as_file().flush().unwrap(); let expected = blake3::keyed_hash(&key, b"foo").to_hex(); let output = cmd!(b3sum_exe(), "--keyed", "--no-names", f.path()) .stdin_bytes(&key[..]) .read() .unwrap(); assert_eq!(&*expected, &*output); // Make sure that keys of the wrong length lead to errors. for bad_length in [0, 1, blake3::KEY_LEN - 1, blake3::KEY_LEN + 1] { dbg!(bad_length); let output = cmd!(b3sum_exe(), "--keyed", f.path()) .stdin_bytes(vec![0; bad_length]) .stdout_capture() .stderr_capture() .unchecked() .run() .unwrap(); assert!(!output.status.success()); assert!(output.stdout.is_empty()); // Make sure the error message is relevant. 
let stderr = std::str::from_utf8(&output.stderr).unwrap(); assert!(stderr.contains("key bytes")); } } #[test] fn test_derive_key() { let context = "BLAKE3 2019-12-28 10:28:41 example context"; let f = tempfile::NamedTempFile::new().unwrap(); f.as_file().write_all(b"key material").unwrap(); f.as_file().flush().unwrap(); let expected = hex::encode(blake3::derive_key(context, b"key material")); let output = cmd!(b3sum_exe(), "--derive-key", context, "--no-names", f.path()) .read() .unwrap(); assert_eq!(&*expected, &*output); } #[test] fn test_no_mmap() { let f = tempfile::NamedTempFile::new().unwrap(); f.as_file().write_all(b"foo").unwrap(); f.as_file().flush().unwrap(); let expected = blake3::hash(b"foo").to_hex(); let output = cmd!(b3sum_exe(), "--no-mmap", "--no-names", f.path()) .read() .unwrap(); assert_eq!(&*expected, &*output); } #[test] fn test_length_without_value_is_an_error() { let result = cmd!(b3sum_exe(), "--length") .stdin_bytes("foo") .stderr_capture() .run(); assert!(result.is_err()); } #[test] fn test_raw_with_multi_files_is_an_error() { let f1 = tempfile::NamedTempFile::new().unwrap(); let f2 = tempfile::NamedTempFile::new().unwrap(); // Make sure it doesn't error with just one file let result = cmd!(b3sum_exe(), "--raw", f1.path()).stdout_capture().run(); assert!(result.is_ok()); // Make sure it errors when both file are passed let result = cmd!(b3sum_exe(), "--raw", f1.path(), f2.path()) .stderr_capture() .run(); assert!(result.is_err()); } #[test] #[cfg(unix)] fn test_newline_and_backslash_escaping_on_unix() { let empty_hash = blake3::hash(b"").to_hex(); let dir = tempfile::tempdir().unwrap(); fs::create_dir(dir.path().join("subdir")).unwrap(); let names = [ "abcdef", "abc\ndef", "abc\\def", "abc\rdef", "abc\r\ndef", "subdir/foo", ]; let mut paths = Vec::new(); for name in &names { let path = dir.path().join(name); println!("creating file at {:?}", path); fs::write(&path, b"").unwrap(); paths.push(path); } let output = cmd(b3sum_exe(), 
&names).dir(dir.path()).read().unwrap(); let expected = format!( "\ {0} abcdef \\{0} abc\\ndef \\{0} abc\\\\def \\{0} abc\\rdef \\{0} abc\\r\\ndef {0} subdir/foo", empty_hash, ); println!("output"); println!("======"); println!("{}", output); println!(); println!("expected"); println!("========"); println!("{}", expected); println!(); assert_eq!(expected, output); } #[test] #[cfg(windows)] fn test_slash_normalization_on_windows() { let empty_hash = blake3::hash(b"").to_hex(); let dir = tempfile::tempdir().unwrap(); fs::create_dir(dir.path().join("subdir")).unwrap(); // Note that filenames can't contain newlines or backslashes on Windows, so // we don't test escaping here. We only test forward slash and backslash as // directory separators. let names = ["abcdef", "subdir/foo", "subdir\\bar"]; let mut paths = Vec::new(); for name in &names { let path = dir.path().join(name); println!("creating file at {:?}", path); fs::write(&path, b"").unwrap(); paths.push(path); } let output = cmd(b3sum_exe(), &names).dir(dir.path()).read().unwrap(); let expected = format!( "\ {0} abcdef {0} subdir/foo {0} subdir/bar", empty_hash, ); println!("output"); println!("======"); println!("{}", output); println!(); println!("expected"); println!("========"); println!("{}", expected); println!(); assert_eq!(expected, output); } #[test] #[cfg(unix)] fn test_invalid_unicode_on_unix() { use std::os::unix::ffi::OsStringExt; let empty_hash = blake3::hash(b"").to_hex(); let dir = tempfile::tempdir().unwrap(); let names = ["abcdef".into(), OsString::from_vec(b"abc\xffdef".to_vec())]; let mut paths = Vec::new(); for name in &names { let path = dir.path().join(name); println!("creating file at {:?}", path); // Note: Some operating systems, macOS in particular, simply don't // allow invalid Unicode in filenames. On those systems, this write // will fail. That's fine, we'll just short-circuit this test in that // case. But assert that at least Linux allows this. 
let write_result = fs::write(&path, b""); if cfg!(target_os = "linux") { write_result.expect("Linux should allow invalid Unicode"); } else if write_result.is_err() { return; } paths.push(path); } let output = cmd(b3sum_exe(), &names).dir(dir.path()).read().unwrap(); let expected = format!( "\ {0} abcdef {0} abc�def", empty_hash, ); println!("output"); println!("======"); println!("{}", output); println!(); println!("expected"); println!("========"); println!("{}", expected); println!(); assert_eq!(expected, output); } #[test] #[cfg(windows)] fn test_invalid_unicode_on_windows() { use std::os::windows::ffi::OsStringExt; let empty_hash = blake3::hash(b"").to_hex(); let dir = tempfile::tempdir().unwrap(); let surrogate_char = 0xDC00; let bad_unicode_wchars = [ 'a' as u16, 'b' as u16, 'c' as u16, surrogate_char, 'd' as u16, 'e' as u16, 'f' as u16, ]; let bad_osstring = OsString::from_wide(&bad_unicode_wchars); let names = ["abcdef".into(), bad_osstring]; let mut paths = Vec::new(); for name in &names { let path = dir.path().join(name); println!("creating file at {:?}", path); fs::write(&path, b"").unwrap(); paths.push(path); } let output = cmd(b3sum_exe(), &names).dir(dir.path()).read().unwrap(); let expected = format!( "\ {0} abcdef {0} abc�def", empty_hash, ); println!("output"); println!("======"); println!("{}", output); println!(); println!("expected"); println!("========"); println!("{}", expected); println!(); assert_eq!(expected, output); } #[test] fn test_check() { // Make a directory full of files, and make sure the b3sum output in that // directory is what we expect. 
let a_hash = blake3::hash(b"a").to_hex(); let b_hash = blake3::hash(b"b").to_hex(); let cd_hash = blake3::hash(b"cd").to_hex(); for tagged in [false, true] { let dir = tempfile::tempdir().unwrap(); fs::write(dir.path().join("a"), b"a").unwrap(); fs::write(dir.path().join("b"), b"b").unwrap(); fs::create_dir(dir.path().join("c")).unwrap(); fs::write(dir.path().join("c/d"), b"cd").unwrap(); dbg!(tagged); let mut args = vec!["a", "b", "c/d"]; if tagged { args.push("--tag"); } let output = cmd(b3sum_exe(), args) .dir(dir.path()) .stdout_capture() .stderr_capture() .run() .unwrap(); let stdout = std::str::from_utf8(&output.stdout).unwrap(); let stderr = std::str::from_utf8(&output.stderr).unwrap(); let expected_checkfile = if tagged { format!( "BLAKE3 (a) = {}\n\ BLAKE3 (b) = {}\n\ BLAKE3 (c/d) = {}\n", a_hash, b_hash, cd_hash, ) } else { format!( "{} a\n\ {} b\n\ {} c/d\n", a_hash, b_hash, cd_hash, ) }; dbg!(&expected_checkfile); assert_eq!(expected_checkfile, stdout); assert_eq!("", stderr); // Now use the output we just validated as a checkfile, passed to stdin. let output = cmd!(b3sum_exe(), "--check") .stdin_bytes(expected_checkfile.as_bytes()) .dir(dir.path()) .stdout_capture() .stderr_capture() .run() .unwrap(); let stdout = std::str::from_utf8(&output.stdout).unwrap(); let stderr = std::str::from_utf8(&output.stderr).unwrap(); let expected_check_output = "\ a: OK\n\ b: OK\n\ c/d: OK\n"; assert_eq!(expected_check_output, stdout); assert_eq!("", stderr); // Check the same file, but with Windows-style newlines. 
let windows_style = expected_checkfile.replace("\n", "\r\n"); let output = cmd!(b3sum_exe(), "--check") .stdin_bytes(windows_style.as_bytes()) .dir(dir.path()) .stdout_capture() .stderr_capture() .run() .unwrap(); let stdout = std::str::from_utf8(&output.stdout).unwrap(); let stderr = std::str::from_utf8(&output.stderr).unwrap(); let expected_check_output = "\ a: OK\n\ b: OK\n\ c/d: OK\n"; assert_eq!(expected_check_output, stdout); assert_eq!("", stderr); // Now pass the same checkfile twice on the command line just for fun. let checkfile_path = dir.path().join("checkfile"); fs::write(&checkfile_path, &expected_checkfile).unwrap(); let output = cmd!(b3sum_exe(), "--check", &checkfile_path, &checkfile_path) .dir(dir.path()) .stdout_capture() .stderr_capture() .run() .unwrap(); let stdout = std::str::from_utf8(&output.stdout).unwrap(); let stderr = std::str::from_utf8(&output.stderr).unwrap(); let mut double_check_output = String::new(); double_check_output.push_str(&expected_check_output); double_check_output.push_str(&expected_check_output); assert_eq!(double_check_output, stdout); assert_eq!("", stderr); // Corrupt one of the files and check again. fs::write(dir.path().join("b"), b"CORRUPTION").unwrap(); let output = cmd!(b3sum_exe(), "--check", &checkfile_path) .dir(dir.path()) .stdout_capture() .stderr_capture() .unchecked() .run() .unwrap(); let stdout = std::str::from_utf8(&output.stdout).unwrap(); let stderr = std::str::from_utf8(&output.stderr).unwrap(); let expected_check_failure = "\ a: OK\n\ b: FAILED\n\ c/d: OK\n"; assert!(!output.status.success()); assert_eq!(expected_check_failure, stdout); assert_eq!( "b3sum: WARNING: 1 computed checksum did NOT match\n", stderr, ); // Delete one of the files and check again. 
fs::remove_file(dir.path().join("b")).unwrap();
        // `b` was just deleted, so opening it must fail. Keep the resulting OS
        // error so its text can be embedded in the expected output below.
        let open_file_error = fs::File::open(dir.path().join("b")).unwrap_err();
        let output = cmd!(b3sum_exe(), "--check", &checkfile_path)
            .dir(dir.path())
            .stdout_capture()
            .stderr_capture()
            .unchecked()
            .run()
            .unwrap();
        let stdout = std::str::from_utf8(&output.stdout).unwrap();
        let stderr = std::str::from_utf8(&output.stderr).unwrap();
        let expected_check_failure = format!(
            "a: OK\n\
             b: FAILED ({})\n\
             c/d: OK\n",
            open_file_error,
        );
        assert!(!output.status.success());
        assert_eq!(expected_check_failure, stdout);
        assert_eq!(
            "b3sum: WARNING: 1 computed checksum did NOT match\n",
            stderr,
        );

        // Confirm that --quiet suppresses the OKs but not the FAILEDs.
        let output = cmd!(b3sum_exe(), "--check", "--quiet", &checkfile_path)
            .dir(dir.path())
            .stdout_capture()
            .stderr_capture()
            .unchecked()
            .run()
            .unwrap();
        let stdout = std::str::from_utf8(&output.stdout).unwrap();
        let stderr = std::str::from_utf8(&output.stderr).unwrap();
        let expected_check_failure = format!("b: FAILED ({})\n", open_file_error);
        assert!(!output.status.success());
        assert_eq!(expected_check_failure, stdout);
        assert_eq!(
            "b3sum: WARNING: 1 computed checksum did NOT match\n",
            stderr,
        );
    }
}

// Feeds `b3sum --check` a series of malformed checkfile lines on stdin and
// asserts that each one is rejected: the process exits unsuccessfully, prints
// nothing on stdout, and reports the specific problem followed by the summary
// warning on stderr.
#[test]
fn test_check_invalid_characters() {
    // Check that a null character in the path fails.
    let output = cmd!(b3sum_exe(), "--check")
        .stdin_bytes("0000000000000000000000000000000000000000000000000000000000000000 \0")
        .stdout_capture()
        .stderr_capture()
        .unchecked()
        .run()
        .unwrap();
    let stdout = std::str::from_utf8(&output.stdout).unwrap();
    let stderr = std::str::from_utf8(&output.stderr).unwrap();
    let expected_stderr = "\
        b3sum: Null character in path\n\
        b3sum: WARNING: 1 computed checksum did NOT match\n";
    assert!(!output.status.success());
    assert_eq!("", stdout);
    assert_eq!(expected_stderr, stderr);

    // Check that a Unicode replacement character in the path fails.
    let output = cmd!(b3sum_exe(), "--check")
        .stdin_bytes("0000000000000000000000000000000000000000000000000000000000000000 �")
        .stdout_capture()
        .stderr_capture()
        .unchecked()
        .run()
        .unwrap();
    let stdout = std::str::from_utf8(&output.stdout).unwrap();
    let stderr = std::str::from_utf8(&output.stderr).unwrap();
    let expected_stderr = "\
        b3sum: Unicode replacement character in path\n\
        b3sum: WARNING: 1 computed checksum did NOT match\n";
    assert!(!output.status.success());
    assert_eq!("", stdout);
    assert_eq!(expected_stderr, stderr);

    // Check that an invalid escape sequence in the path fails.
    // (The leading backslash on the line is what turns unescaping on.)
    let output = cmd!(b3sum_exe(), "--check")
        .stdin_bytes("\\0000000000000000000000000000000000000000000000000000000000000000 \\a")
        .stdout_capture()
        .stderr_capture()
        .unchecked()
        .run()
        .unwrap();
    let stdout = std::str::from_utf8(&output.stdout).unwrap();
    let stderr = std::str::from_utf8(&output.stderr).unwrap();
    let expected_stderr = "\
        b3sum: Invalid backslash escape\n\
        b3sum: WARNING: 1 computed checksum did NOT match\n";
    assert!(!output.status.success());
    assert_eq!("", stdout);
    assert_eq!(expected_stderr, stderr);

    // Windows also forbids literal backslashes. Check for that if and only if
    // we're on Windows.
    if cfg!(windows) {
        let output = cmd!(b3sum_exe(), "--check")
            .stdin_bytes("0000000000000000000000000000000000000000000000000000000000000000 \\")
            .stdout_capture()
            .stderr_capture()
            .unchecked()
            .run()
            .unwrap();
        let stdout = std::str::from_utf8(&output.stdout).unwrap();
        let stderr = std::str::from_utf8(&output.stderr).unwrap();
        let expected_stderr = "\
            b3sum: Backslash in path\n\
            b3sum: WARNING: 1 computed checksum did NOT match\n";
        assert!(!output.status.success());
        assert_eq!("", stdout);
        assert_eq!(expected_stderr, stderr);
    }
}

// Runs `b3sum *` through the platform shell inside a temp directory holding
// two files, and checks that both files are hashed and listed in order.
#[test]
fn test_globbing() {
    // On Unix, globbing is provided by the shell. On Windows, globbing is
    // provided by us, using the `wild` crate.
    let dir = tempfile::tempdir().unwrap();
    let file1 = dir.path().join("file1");
    fs::write(&file1, b"foo").unwrap();
    let file2 = dir.path().join("file2");
    fs::write(&file2, b"bar").unwrap();

    let foo_hash = blake3::hash(b"foo");
    let bar_hash = blake3::hash(b"bar");
    // NOTE: This assumes that the glob will be expanded in alphabetical order,
    // to "file1 file2" rather than "file2 file1". So far, this seems to
    // be true (guaranteed?) of Unix shell behavior, and true in practice
    // with the `wild` crate on Windows. It's possible that this could
    // start failing in the future, though, or on some unknown platform.
    // If that ever happens, we'll need to relax this test somehow,
    // probably by just testing for both possible outputs. I'm not
    // handling that case in advance, though, because I'd prefer to hear
    // about it if it comes up.
    let expected = format!("{} file1\n{} file2", foo_hash.to_hex(), bar_hash.to_hex());

    // The glob is passed unexpanded inside a single command string, so it is
    // the spawned shell (or b3sum itself on Windows) that expands it.
    let star_command = format!("{} *", b3sum_exe().to_str().unwrap());
    let (exe, c_flag) = if cfg!(windows) {
        ("cmd.exe", "/C")
    } else {
        ("/bin/sh", "-c")
    };
    let output = cmd!(exe, c_flag, star_command)
        .dir(dir.path())
        .read()
        .unwrap();
    assert_eq!(expected, output);
}
================================================ FILE: third-party/blake3/b3sum/what_does_check_do.md ================================================ # How does `b3sum --check` behave exactly?
or: Are filepaths...text? Most of the time, `b3sum --check` is a drop-in replacement for `md5sum --check` and other Coreutils hashing tools. It consumes a checkfile (the output of a regular `b3sum` command), re-hashes all the files listed there, and returns success if all of those hashes are still correct. What makes this more complicated than it might seem, is that representing filepaths as text means we need to consider many possible edge cases of unrepresentable filepaths. This document describes all of these edge cases in detail. ## The simple case Here's the result of running `b3sum a b c/d` in a directory that contains those three files: ```bash $ echo hi > a $ echo lo > b $ mkdir c $ echo stuff > c/d $ b3sum a b c/d 0b8b60248fad7ac6dfac221b7e01a8b91c772421a15b387dd1fb2d6a94aee438 a 6ae4a57bbba24f79c461d30bcb4db973b9427d9207877e34d2d74528daa84115 b 2d477356c962e54784f1c5dc5297718d92087006f6ee96b08aeaf7f3cd252377 c/d ``` If we pipe that output into `b3sum --check`, it will exit with status zero (success) and print: ```bash $ b3sum a b c/d | b3sum --check a: OK b: OK c/d: OK ``` If we delete `b` and change the contents of `c/d`, and then use the same checkfile as above, `b3sum --check` will exit with a non-zero status (failure) and print: ```bash $ b3sum a b c/d > checkfile $ rm b $ echo more stuff >> c/d $ b3sum --check checkfile a: OK b: FAILED (No such file or directory (os error 2)) c/d: FAILED ``` In these typical cases, `b3sum` and `md5sum` have identical output for success and very similar output for failure. ## Escaping newlines and backslashes Since the checkfile format (the regular output format of `b3sum`) is newline-separated text, we need to worry about what happens when a filepath contains a newline, or worse. Suppose we create a file named `x[newline]x` (3 characters). 
One way to create such a file is with a Python one-liner like this: ```python >>> open("x\nx", "w") ``` Here's what happens when we hash that file with `b3sum`: ```bash $ b3sum x* \af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 x\nx ``` Notice two things. First, `b3sum` puts a single `\` character at the front of the line. This indicates that the filepath contains escape sequences that `b3sum --check` will need to unescape. Then, `b3sum` replaces the newline character in the filepath with the two-character escape sequence `\n`. Similarly, if the filepath contained carriage returns or backslashes, `b3sum` would escape those as `\r` and `\\` in the output. So far, all of this behavior is still identical to `md5sum`. (Note: Coreutils [introduced `\r` escaping](https://github.com/coreutils/coreutils/commit/ed1c58427d574fb4ff0cb8f915eb0d554000ceeb) in v9.0, September 2021.) ## Invalid Unicode This is where `b3sum` and `md5sum` diverge. Apart from the newline and backslash escapes described above, `md5sum` copies all other filepath bytes verbatim to its output. That means its output encoding is "ASCII plus whatever bytes we got from the command line". This creates two problems: 1. Printing something that isn't UTF-8 is kind of gross. 2. Windows support. What's the deal with Windows? To start with, there's a fundamental difference in how Unix and Windows represent filepaths. Unix filepaths are "usually UTF-8" and Windows filepaths are "usually UTF-16". That means that a file named `abc` is typically represented as the bytes `[97, 98, 99]` on Unix and as the bytes `[97, 0, 98, 0, 99, 0]` on Windows. The `md5sum` approach won't work if we plan on creating a checkfile on Unix and checking it on Windows, or vice versa. A more portable approach is to convert platform-specific bytes into some consistent Unicode encoding. (In practice this is going to be UTF-8, but in theory it could be anything.) 
Then when `--check` needs to open a file, we convert the Unicode representation back into platform-specific bytes. This makes important common cases like `abc`, and in fact even `abc[newline]def`, work as expected. Great! But...what did we mean above when we said *usually* UTF-8 and *usually* UTF-16? It turns out that not every possible sequence of bytes is valid UTF-8, and not every possible sequence of 16-bit wide chars is valid UTF-16. For example, the byte 0xFF (255) can never appear in any UTF-8 string. If we ask Python to decode it, it yells at us: ```python >>> b"\xFF".decode("UTF-8") UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte ``` However, tragically, we *can* create a file with that byte in its name (on Linux at least, though not usually on macOS): ```python >>> open(b"y\xFFy", "w") ``` So some filepaths aren't representable in Unicode at all. Our plan to "convert platform-specific bytes into some consistent Unicode encoding" isn't going to work for everything. What does `b3sum` do with the file above? ```bash $ b3sum y* af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 y�y ``` That � in there is a "Unicode replacement character". When we run into filepaths that we can't represent in Unicode, we replace the unrepresentable parts with these characters. On the checking side, to avoid any possible confusion between two different invalid filepaths, we automatically fail if we see a replacement character. Together with a few more details covered in the next section, this gives us an important set of properties: 1. Any file can be hashed locally. 2. Any file with a valid Unicode name not containing the � character can be checked. 3. Checking ambiguous or unrepresentable filepaths always fails. 4. Checkfiles are always valid UTF-8. 5. Checkfiles are portable between Unix and Windows. ## Formal Rules 1. 
When hashing, filepaths are represented in a platform-specific encoding, which can accommodate any filepath on the current platform. In Rust, this is `OsStr`/`OsString`. 2. In output, filepaths are first converted to UTF-8. Any non-Unicode segments are replaced with Unicode replacement characters (U+FFFD). In Rust, this is `OsStr::to_string_lossy`. 3. Then, if a filepath contains any backslashes (U+005C), newlines (U+000A), or carriage returns (U+000D), these characters are escaped as `\\`, `\n`, and `\r` respectively. 4. Finally, any output line containing an escape sequence is prefixed with a single backslash. 5. When checking, each line is parsed as UTF-8, separated by a newline (U+000A). Invalid UTF-8 is an error. 6. Then, if a line begins with a backslash, the filepath component is unescaped. Any escape sequence other than `\\`, `\n`, or `\r` is an error. If a line does not begin with a backslash, unescaping is not performed, and any backslashes in the filepath component are interpreted literally. (`b3sum` output never contains unescaped backslashes, but they can occur in checkfiles assembled by hand.) 7. Finally, if a filepath contains a Unicode replacement character (U+FFFD) or a null character (U+0000), it is an error. **Additionally, on Windows only:** 8. In output, all backslashes (U+005C) are replaced with forward slashes (U+002F). 9. When checking, after unescaping, if a filepath contains a backslash, it is an error. ================================================ FILE: third-party/blake3/benches/bench.rs ================================================ #![feature(test)] extern crate test; use arrayref::array_ref; use arrayvec::ArrayVec; use blake3::platform::{Platform, MAX_SIMD_DEGREE}; use blake3::OUT_LEN; use blake3::{BLOCK_LEN, CHUNK_LEN}; use rand::prelude::*; use test::Bencher; const KIB: usize = 1024; // This struct randomizes two things: // 1. The actual bytes of input. // 2. The page offset the input starts at.
pub struct RandomInput { buf: Vec, len: usize, offsets: Vec, offset_index: usize, } impl RandomInput { pub fn new(b: &mut Bencher, len: usize) -> Self { b.bytes += len as u64; let page_size: usize = page_size::get(); let mut buf = vec![0u8; len + page_size]; let mut rng = rand::rng(); rng.fill_bytes(&mut buf); let mut offsets: Vec = (0..page_size).collect(); offsets.shuffle(&mut rng); Self { buf, len, offsets, offset_index: 0, } } pub fn get(&mut self) -> &[u8] { let offset = self.offsets[self.offset_index]; self.offset_index += 1; if self.offset_index >= self.offsets.len() { self.offset_index = 0; } &self.buf[offset..][..self.len] } } fn bench_single_compression_fn(b: &mut Bencher, platform: Platform) { let mut state = [1u32; 8]; let mut r = RandomInput::new(b, 64); let input = array_ref!(r.get(), 0, 64); b.iter(|| platform.compress_in_place(&mut state, input, 64 as u8, 0, 0)); } #[bench] fn bench_single_compression_portable(b: &mut Bencher) { bench_single_compression_fn(b, Platform::portable()); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_single_compression_sse2(b: &mut Bencher) { if let Some(platform) = Platform::sse2() { bench_single_compression_fn(b, platform); } } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_single_compression_sse41(b: &mut Bencher) { if let Some(platform) = Platform::sse41() { bench_single_compression_fn(b, platform); } } #[bench] #[cfg(blake3_avx512_ffi)] fn bench_single_compression_avx512(b: &mut Bencher) { if let Some(platform) = Platform::avx512() { bench_single_compression_fn(b, platform); } } fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) { let degree = platform.simd_degree(); let mut inputs = Vec::new(); for _ in 0..degree { inputs.push(RandomInput::new(b, CHUNK_LEN)); } b.iter(|| { let input_arrays: ArrayVec<&[u8; CHUNK_LEN], MAX_SIMD_DEGREE> = inputs .iter_mut() .take(degree) .map(|i| array_ref!(i.get(), 0, CHUNK_LEN)) .collect(); let mut out = [0; 
MAX_SIMD_DEGREE * OUT_LEN]; platform.hash_many( &input_arrays[..], &[0; 8], 0, blake3::IncrementCounter::Yes, 0, 0, 0, &mut out, ); }); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_chunks_sse2(b: &mut Bencher) { if let Some(platform) = Platform::sse2() { bench_many_chunks_fn(b, platform); } } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_chunks_sse41(b: &mut Bencher) { if let Some(platform) = Platform::sse41() { bench_many_chunks_fn(b, platform); } } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_chunks_avx2(b: &mut Bencher) { if let Some(platform) = Platform::avx2() { bench_many_chunks_fn(b, platform); } } #[bench] #[cfg(blake3_avx512_ffi)] fn bench_many_chunks_avx512(b: &mut Bencher) { if let Some(platform) = Platform::avx512() { bench_many_chunks_fn(b, platform); } } #[bench] #[cfg(blake3_neon)] fn bench_many_chunks_neon(b: &mut Bencher) { bench_many_chunks_fn(b, Platform::neon().unwrap()); } #[bench] #[cfg(blake3_wasm32_simd)] fn bench_many_chunks_wasm(b: &mut Bencher) { bench_many_chunks_fn(b, Platform::wasm32_simd().unwrap()); } // TODO: When we get const generics we can unify this with the chunks code. 
fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) { let degree = platform.simd_degree(); let mut inputs = Vec::new(); for _ in 0..degree { inputs.push(RandomInput::new(b, BLOCK_LEN)); } b.iter(|| { let input_arrays: ArrayVec<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE> = inputs .iter_mut() .take(degree) .map(|i| array_ref!(i.get(), 0, BLOCK_LEN)) .collect(); let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN]; platform.hash_many( &input_arrays[..], &[0; 8], 0, blake3::IncrementCounter::No, 0, 0, 0, &mut out, ); }); } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_parents_sse2(b: &mut Bencher) { if let Some(platform) = Platform::sse2() { bench_many_parents_fn(b, platform); } } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_parents_sse41(b: &mut Bencher) { if let Some(platform) = Platform::sse41() { bench_many_parents_fn(b, platform); } } #[bench] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn bench_many_parents_avx2(b: &mut Bencher) { if let Some(platform) = Platform::avx2() { bench_many_parents_fn(b, platform); } } #[bench] #[cfg(blake3_avx512_ffi)] fn bench_many_parents_avx512(b: &mut Bencher) { if let Some(platform) = Platform::avx512() { bench_many_parents_fn(b, platform); } } #[bench] #[cfg(blake3_neon)] fn bench_many_parents_neon(b: &mut Bencher) { bench_many_parents_fn(b, Platform::neon().unwrap()); } #[bench] #[cfg(blake3_wasm32_simd)] fn bench_many_parents_wasm(b: &mut Bencher) { bench_many_parents_fn(b, Platform::wasm32_simd().unwrap()); } fn bench_atonce(b: &mut Bencher, len: usize) { let mut input = RandomInput::new(b, len); b.iter(|| blake3::hash(input.get())); } #[bench] fn bench_atonce_0001_block(b: &mut Bencher) { bench_atonce(b, BLOCK_LEN); } #[bench] fn bench_atonce_0001_kib(b: &mut Bencher) { bench_atonce(b, 1 * KIB); } #[bench] fn bench_atonce_0002_kib(b: &mut Bencher) { bench_atonce(b, 2 * KIB); } #[bench] fn bench_atonce_0004_kib(b: &mut Bencher) { bench_atonce(b, 4 * 
KIB); } #[bench] fn bench_atonce_0008_kib(b: &mut Bencher) { bench_atonce(b, 8 * KIB); } #[bench] fn bench_atonce_0016_kib(b: &mut Bencher) { bench_atonce(b, 16 * KIB); } #[bench] fn bench_atonce_0032_kib(b: &mut Bencher) { bench_atonce(b, 32 * KIB); } #[bench] fn bench_atonce_0064_kib(b: &mut Bencher) { bench_atonce(b, 64 * KIB); } #[bench] fn bench_atonce_0128_kib(b: &mut Bencher) { bench_atonce(b, 128 * KIB); } #[bench] fn bench_atonce_0256_kib(b: &mut Bencher) { bench_atonce(b, 256 * KIB); } #[bench] fn bench_atonce_0512_kib(b: &mut Bencher) { bench_atonce(b, 512 * KIB); } #[bench] fn bench_atonce_1024_kib(b: &mut Bencher) { bench_atonce(b, 1024 * KIB); } fn bench_incremental(b: &mut Bencher, len: usize) { let mut input = RandomInput::new(b, len); b.iter(|| blake3::Hasher::new().update(input.get()).finalize()); } #[bench] fn bench_incremental_0001_block(b: &mut Bencher) { bench_incremental(b, BLOCK_LEN); } #[bench] fn bench_incremental_0001_kib(b: &mut Bencher) { bench_incremental(b, 1 * KIB); } #[bench] fn bench_incremental_0002_kib(b: &mut Bencher) { bench_incremental(b, 2 * KIB); } #[bench] fn bench_incremental_0004_kib(b: &mut Bencher) { bench_incremental(b, 4 * KIB); } #[bench] fn bench_incremental_0008_kib(b: &mut Bencher) { bench_incremental(b, 8 * KIB); } #[bench] fn bench_incremental_0016_kib(b: &mut Bencher) { bench_incremental(b, 16 * KIB); } #[bench] fn bench_incremental_0032_kib(b: &mut Bencher) { bench_incremental(b, 32 * KIB); } #[bench] fn bench_incremental_0064_kib(b: &mut Bencher) { bench_incremental(b, 64 * KIB); } #[bench] fn bench_incremental_0128_kib(b: &mut Bencher) { bench_incremental(b, 128 * KIB); } #[bench] fn bench_incremental_0256_kib(b: &mut Bencher) { bench_incremental(b, 256 * KIB); } #[bench] fn bench_incremental_0512_kib(b: &mut Bencher) { bench_incremental(b, 512 * KIB); } #[bench] fn bench_incremental_1024_kib(b: &mut Bencher) { bench_incremental(b, 1024 * KIB); } fn bench_reference(b: &mut Bencher, len: usize) { let mut 
input = RandomInput::new(b, len); b.iter(|| { let mut hasher = reference_impl::Hasher::new(); hasher.update(input.get()); let mut out = [0; 32]; hasher.finalize(&mut out); out }); } #[bench] fn bench_reference_0001_block(b: &mut Bencher) { bench_reference(b, BLOCK_LEN); } #[bench] fn bench_reference_0001_kib(b: &mut Bencher) { bench_reference(b, 1 * KIB); } #[bench] fn bench_reference_0002_kib(b: &mut Bencher) { bench_reference(b, 2 * KIB); } #[bench] fn bench_reference_0004_kib(b: &mut Bencher) { bench_reference(b, 4 * KIB); } #[bench] fn bench_reference_0008_kib(b: &mut Bencher) { bench_reference(b, 8 * KIB); } #[bench] fn bench_reference_0016_kib(b: &mut Bencher) { bench_reference(b, 16 * KIB); } #[bench] fn bench_reference_0032_kib(b: &mut Bencher) { bench_reference(b, 32 * KIB); } #[bench] fn bench_reference_0064_kib(b: &mut Bencher) { bench_reference(b, 64 * KIB); } #[bench] fn bench_reference_0128_kib(b: &mut Bencher) { bench_reference(b, 128 * KIB); } #[bench] fn bench_reference_0256_kib(b: &mut Bencher) { bench_reference(b, 256 * KIB); } #[bench] fn bench_reference_0512_kib(b: &mut Bencher) { bench_reference(b, 512 * KIB); } #[bench] fn bench_reference_1024_kib(b: &mut Bencher) { bench_reference(b, 1024 * KIB); } #[cfg(feature = "rayon")] fn bench_rayon(b: &mut Bencher, len: usize) { let mut input = RandomInput::new(b, len); b.iter(|| blake3::Hasher::new().update_rayon(input.get()).finalize()); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0001_block(b: &mut Bencher) { bench_rayon(b, BLOCK_LEN); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0001_kib(b: &mut Bencher) { bench_rayon(b, 1 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0002_kib(b: &mut Bencher) { bench_rayon(b, 2 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0004_kib(b: &mut Bencher) { bench_rayon(b, 4 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0008_kib(b: &mut Bencher) { bench_rayon(b, 8 * KIB); } #[bench] #[cfg(feature = "rayon")] fn 
bench_rayon_0016_kib(b: &mut Bencher) { bench_rayon(b, 16 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0032_kib(b: &mut Bencher) { bench_rayon(b, 32 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0064_kib(b: &mut Bencher) { bench_rayon(b, 64 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0128_kib(b: &mut Bencher) { bench_rayon(b, 128 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0256_kib(b: &mut Bencher) { bench_rayon(b, 256 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_0512_kib(b: &mut Bencher) { bench_rayon(b, 512 * KIB); } #[bench] #[cfg(feature = "rayon")] fn bench_rayon_1024_kib(b: &mut Bencher) { bench_rayon(b, 1024 * KIB); } // This checks that update() splits up its input in increasing powers of 2, so // that it can recover a high degree of parallelism when the number of bytes // hashed so far is uneven. The performance of this benchmark should be // reasonably close to bench_incremental_0064_kib, within 80% or so. When we // had a bug in this logic (https://github.com/BLAKE3-team/BLAKE3/issues/69), // performance was less than half. 
#[bench] fn bench_two_updates(b: &mut Bencher) { let len = 65536; let mut input = RandomInput::new(b, len); b.iter(|| { let mut hasher = blake3::Hasher::new(); let input = input.get(); hasher.update(&input[..1]); hasher.update(&input[1..]); hasher.finalize() }); } fn bench_xof(b: &mut Bencher, len: usize) { b.bytes = len as u64; let mut output = [0u8; 64 * BLOCK_LEN]; let output_slice = &mut output[..len]; let mut xof = blake3::Hasher::new().finalize_xof(); b.iter(|| xof.fill(output_slice)); } #[bench] fn bench_xof_01_block(b: &mut Bencher) { bench_xof(b, 1 * BLOCK_LEN); } #[bench] fn bench_xof_02_blocks(b: &mut Bencher) { bench_xof(b, 2 * BLOCK_LEN); } #[bench] fn bench_xof_03_blocks(b: &mut Bencher) { bench_xof(b, 3 * BLOCK_LEN); } #[bench] fn bench_xof_04_blocks(b: &mut Bencher) { bench_xof(b, 4 * BLOCK_LEN); } #[bench] fn bench_xof_05_blocks(b: &mut Bencher) { bench_xof(b, 5 * BLOCK_LEN); } #[bench] fn bench_xof_06_blocks(b: &mut Bencher) { bench_xof(b, 6 * BLOCK_LEN); } #[bench] fn bench_xof_07_blocks(b: &mut Bencher) { bench_xof(b, 7 * BLOCK_LEN); } #[bench] fn bench_xof_08_blocks(b: &mut Bencher) { bench_xof(b, 8 * BLOCK_LEN); } #[bench] fn bench_xof_09_blocks(b: &mut Bencher) { bench_xof(b, 9 * BLOCK_LEN); } #[bench] fn bench_xof_10_blocks(b: &mut Bencher) { bench_xof(b, 10 * BLOCK_LEN); } #[bench] fn bench_xof_11_blocks(b: &mut Bencher) { bench_xof(b, 11 * BLOCK_LEN); } #[bench] fn bench_xof_12_blocks(b: &mut Bencher) { bench_xof(b, 12 * BLOCK_LEN); } #[bench] fn bench_xof_13_blocks(b: &mut Bencher) { bench_xof(b, 13 * BLOCK_LEN); } #[bench] fn bench_xof_14_blocks(b: &mut Bencher) { bench_xof(b, 14 * BLOCK_LEN); } #[bench] fn bench_xof_15_blocks(b: &mut Bencher) { bench_xof(b, 15 * BLOCK_LEN); } #[bench] fn bench_xof_16_blocks(b: &mut Bencher) { bench_xof(b, 16 * BLOCK_LEN); } #[bench] fn bench_xof_32_blocks(b: &mut Bencher) { bench_xof(b, 32 * BLOCK_LEN); } #[bench] fn bench_xof_64_blocks(b: &mut Bencher) { bench_xof(b, 64 * BLOCK_LEN); } 
================================================ FILE: third-party/blake3/build.rs ================================================ use std::env; fn defined(var: &str) -> bool { println!("cargo:rerun-if-env-changed={}", var); env::var_os(var).is_some() } fn is_pure() -> bool { defined("CARGO_FEATURE_PURE") } fn should_prefer_intrinsics() -> bool { defined("CARGO_FEATURE_PREFER_INTRINSICS") } fn is_neon() -> bool { defined("CARGO_FEATURE_NEON") } fn is_no_neon() -> bool { defined("CARGO_FEATURE_NO_NEON") } fn is_wasm32_simd() -> bool { defined("CARGO_FEATURE_WASM32_SIMD") } fn is_ci() -> bool { defined("BLAKE3_CI") } fn warn(warning: &str) { assert!(!warning.contains("\n")); println!("cargo:warning={}", warning); if is_ci() { println!("cargo:warning=Warnings in CI are treated as errors. Build failed."); std::process::exit(1); } } fn target_components() -> Vec { let target = env::var("TARGET").unwrap(); target.split("-").map(|s| s.to_string()).collect() } fn is_x86_64() -> bool { target_components()[0] == "x86_64" } fn is_windows_target() -> bool { env::var("CARGO_CFG_TARGET_OS").unwrap() == "windows" } fn use_msvc_asm() -> bool { const MSVC_NAMES: &[&str] = &["", "cl", "cl.exe"]; let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); let target_env = env::var("CARGO_CFG_TARGET_ENV").unwrap_or_default(); let target_windows_msvc = target_os == "windows" && target_env == "msvc"; let host_triple = env::var("HOST").unwrap_or_default(); let target_triple = env::var("TARGET").unwrap_or_default(); let cross_compiling = host_triple != target_triple; let cc = env::var("CC").unwrap_or_default().to_ascii_lowercase(); if !target_windows_msvc { // We are not building for Windows with the MSVC toolchain. false } else if !cross_compiling && MSVC_NAMES.contains(&&*cc) { // We are building on Windows with the MSVC toolchain (and not cross-compiling for another architecture or target). true } else { // We are cross-compiling to Windows with the MSVC toolchain. 
let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default(); let target_vendor = env::var("CARGO_CFG_TARGET_VENDOR").unwrap_or_default(); let cc = env::var(format!("CC_{target_arch}_{target_vendor}_windows_msvc")) .unwrap_or_default() .to_ascii_lowercase(); // Check if we are using the MSVC compiler. MSVC_NAMES.contains(&&*cc) } } fn is_x86_32() -> bool { let arch = &target_components()[0]; arch == "i386" || arch == "i586" || arch == "i686" } fn is_arm() -> bool { is_armv7() || is_aarch64() || target_components()[0] == "arm" } fn is_aarch64() -> bool { target_components()[0] == "aarch64" } fn is_armv7() -> bool { target_components()[0] == "armv7" } fn is_wasm32() -> bool { target_components()[0] == "wasm32" } fn endianness() -> String { let endianness = env::var("CARGO_CFG_TARGET_ENDIAN").unwrap(); assert!(endianness == "little" || endianness == "big"); endianness } fn is_little_endian() -> bool { endianness() == "little" } fn is_big_endian() -> bool { endianness() == "big" } // Windows targets may be using the MSVC toolchain or the MinGW toolchain. The // right compiler flags to use depend on the toolchain. (And we don't want to // use flag_if_supported, because we don't want features to be silently // disabled by old compilers.) fn is_windows_msvc() -> bool { // Some targets are only two components long, so check in steps. let second_component = &target_components()[1]; (second_component == "pc" || second_component == "win7") && target_components()[2] == "windows" && target_components()[3] == "msvc" } // MinGW toolchain uses 2 different targets depending on the main compiler. // Target for a general MinGW toolchain ends with `-gnu` (GCC is used as C // compiler). Target for a LLVM-MinGW toolchain (Clang is used as C compiler) // ends with `-gnullvm`. fn is_windows_gnu() -> bool { // Some targets are only two components long, so check in steps. 
let second_component = &target_components()[1]; (second_component == "pc" || second_component == "win7") && target_components()[2] == "windows" && target_components()[3] != "msvc" } fn new_build() -> cc::Build { let mut build = cc::Build::new(); if !is_windows_msvc() { build.flag("-std=c11"); } // Do NOT trigger a rebuild any time the env changes (e.g. $PATH). // This prevents all downstream crates from being rebuilt when `cargo check` // or `cargo build` are run in different environments, like Rust Analyzer // vs. in the terminal vs. in a Git pre-commit hook. build.emit_rerun_if_env_changed(false); build } #[derive(PartialEq)] enum CCompilerSupport { NoCompiler, NoAVX512, YesAVX512, } use CCompilerSupport::*; fn c_compiler_support() -> CCompilerSupport { let build = new_build(); let flags_checked; let support_result: Result = if is_windows_msvc() { flags_checked = "/arch:AVX512"; build.is_flag_supported("/arch:AVX512") } else { // Check for both of the flags we use. If -mavx512f works, then -mavx512vl // will probably always work too, but we might as well be thorough. flags_checked = "-mavx512f and -mavx512vl"; match build.is_flag_supported("-mavx512f") { Ok(true) => build.is_flag_supported("-mavx512vl"), false_or_error => false_or_error, } }; match support_result { Ok(true) => YesAVX512, Ok(false) => { warn(&format!( "The C compiler {:?} does not support {}.", build.get_compiler().path(), flags_checked, )); NoAVX512 } Err(e) => { println!("{:?}", e); warn(&format!( "No C compiler {:?} detected.", build.get_compiler().path() )); NoCompiler } } } fn build_sse2_sse41_avx2_rust_intrinsics() { // No C code to compile here. Set the cfg flags that enable the Rust SSE2, // SSE4.1, and AVX2 intrinsics modules. The regular Cargo build will compile // them. 
println!("cargo:rustc-cfg=blake3_sse2_rust"); println!("cargo:rustc-cfg=blake3_sse41_rust"); println!("cargo:rustc-cfg=blake3_avx2_rust"); } fn build_sse2_sse41_avx2_assembly() { // Build the assembly implementations for SSE4.1 and AVX2. This is // preferred, but it only supports x86_64. assert!(is_x86_64()); println!("cargo:rustc-cfg=blake3_sse2_ffi"); println!("cargo:rustc-cfg=blake3_sse41_ffi"); println!("cargo:rustc-cfg=blake3_avx2_ffi"); let mut build = new_build(); if is_windows_target() { if use_msvc_asm() { build.file("c/blake3_sse2_x86-64_windows_msvc.asm"); build.file("c/blake3_sse41_x86-64_windows_msvc.asm"); build.file("c/blake3_avx2_x86-64_windows_msvc.asm"); } else { build.file("c/blake3_sse2_x86-64_windows_gnu.S"); build.file("c/blake3_sse41_x86-64_windows_gnu.S"); build.file("c/blake3_avx2_x86-64_windows_gnu.S"); } } else { // All non-Windows implementations are assumed to support // Linux-style assembly. These files do contain a small // explicit workaround for macOS also. build.file("c/blake3_sse2_x86-64_unix.S"); build.file("c/blake3_sse41_x86-64_unix.S"); build.file("c/blake3_avx2_x86-64_unix.S"); } build.compile("blake3_sse2_sse41_avx2_assembly"); } fn build_avx512_c_intrinsics() { // This is required on 32-bit x86 targets, since the assembly // implementation doesn't support those. println!("cargo:rustc-cfg=blake3_avx512_ffi"); let mut build = new_build(); build.file("c/blake3_avx512.c"); if is_windows_msvc() { build.flag("/arch:AVX512"); } else { build.flag("-mavx512f"); build.flag("-mavx512vl"); } if is_windows_gnu() { // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782. build.flag("-fno-asynchronous-unwind-tables"); } build.compile("blake3_avx512_intrinsics"); } fn build_avx512_assembly() { // Build the assembly implementation for AVX-512. This is preferred, but it // only supports x86_64. 
assert!(is_x86_64()); println!("cargo:rustc-cfg=blake3_avx512_ffi"); let mut build = new_build(); let mut is_msvc = false; if is_windows_target() { if use_msvc_asm() { build.file("c/blake3_avx512_x86-64_windows_msvc.asm"); is_msvc = true; } else { build.file("c/blake3_avx512_x86-64_windows_gnu.S"); } } else { build.file("c/blake3_avx512_x86-64_unix.S"); } // Older versions of Clang require these flags, even for assembly. See // https://github.com/BLAKE3-team/BLAKE3/issues/79. if !is_msvc { build.flag("-mavx512f"); build.flag("-mavx512vl"); } build.compile("blake3_avx512_assembly"); } fn build_neon_c_intrinsics() { let mut build = new_build(); // Note that blake3_neon.c normally depends on the blake3_portable.c // for the single-instance compression function, but we expose // portable.rs over FFI instead. See ffi_neon.rs. build.file("c/blake3_neon.c"); // ARMv7 platforms that support NEON generally need the following // flags. AArch64 supports NEON by default and does not support -mpfu. if is_armv7() { build.flag("-mfpu=neon-vfpv4"); build.flag("-mfloat-abi=hard"); } build.compile("blake3_neon"); } fn build_wasm32_simd() { assert!(is_wasm32()); // No C code to compile here. Set the cfg flags that enable the Wasm SIMD. // The regular Cargo build will compile it. println!("cargo:rustc-cfg=blake3_wasm32_simd"); } fn main() -> Result<(), Box> { // As of Rust 1.80, unrecognized config names are warnings. Give Cargo all of our config names. let all_cfgs = [ "blake3_sse2_ffi", "blake3_sse2_rust", "blake3_sse41_ffi", "blake3_sse41_rust", "blake3_avx2_ffi", "blake3_avx2_rust", "blake3_avx512_ffi", "blake3_neon", "blake3_wasm32_simd", ]; for cfg_name in all_cfgs { // TODO: Switch this whole file to the new :: syntax when our MSRV reaches 1.77. 
// https://doc.rust-lang.org/cargo/reference/build-scripts.html#outputs-of-the-build-script println!("cargo:rustc-check-cfg=cfg({cfg_name}, values(none()))"); } if is_pure() && is_neon() { panic!("It doesn't make sense to enable both \"pure\" and \"neon\"."); } if is_no_neon() && is_neon() { panic!("It doesn't make sense to enable both \"no_neon\" and \"neon\"."); } if is_x86_64() || is_x86_32() { let support = c_compiler_support(); if is_x86_32() || should_prefer_intrinsics() || is_pure() || support == NoCompiler { build_sse2_sse41_avx2_rust_intrinsics(); } else { // We assume that all C compilers can assemble SSE4.1 and AVX2. We // don't explicitly check for support. build_sse2_sse41_avx2_assembly(); } if is_pure() || support == NoCompiler || support == NoAVX512 { // The binary will not include any AVX-512 code. } else if is_x86_32() || should_prefer_intrinsics() { build_avx512_c_intrinsics(); } else { build_avx512_assembly(); } } if is_neon() && is_big_endian() { panic!("The NEON implementation doesn't support big-endian ARM.") } if (is_arm() && is_neon()) || (!is_no_neon() && !is_pure() && is_aarch64() && is_little_endian()) { println!("cargo:rustc-cfg=blake3_neon"); build_neon_c_intrinsics(); } if is_wasm32() && is_wasm32_simd() { build_wasm32_simd(); } // The `cc` crate doesn't automatically emit rerun-if directives for the // environment variables it supports, in particular for $CC. We expect to // do a lot of benchmarking across different compilers, so we explicitly // add the variables that we're likely to need. println!("cargo:rerun-if-env-changed=CC"); println!("cargo:rerun-if-env-changed=CFLAGS"); // Ditto for source files, though these shouldn't change as often. for file in std::fs::read_dir("c")? 
{ println!( "cargo:rerun-if-changed={}", file?.path().to_str().expect("utf-8") ); } // When compiling with clang-cl for windows, it adds .asm files to the root // which we need to delete so cargo doesn't get angry if is_windows_target() && !use_msvc_asm() { let _ = std::fs::remove_file("blake3_avx2_x86-64_windows_gnu.asm"); let _ = std::fs::remove_file("blake3_avx512_x86-64_windows_gnu.asm"); let _ = std::fs::remove_file("blake3_sse2_x86-64_windows_gnu.asm"); let _ = std::fs::remove_file("blake3_sse41_x86-64_windows_gnu.asm"); } Ok(()) } ================================================ FILE: third-party/blake3/reference_impl/Cargo.toml ================================================ [package] name = "reference_impl" version = "0.0.0" edition = "2021" [lib] name = "reference_impl" path = "reference_impl.rs" ================================================ FILE: third-party/blake3/reference_impl/README.md ================================================ This is the reference implementation of BLAKE3. It is used for testing and as a readable example of the algorithms involved. Section 5.1 of [the BLAKE3 spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf) discusses this implementation. You can render docs for this implementation by running `cargo doc --open` in this directory. This implementation is a single file ([`reference_impl.rs`](reference_impl.rs)) with no dependencies. It is not optimized for performance. There are ports of this reference implementation to other languages: - [C](https://github.com/oconnor663/blake3_reference_impl_c) - [Python](https://github.com/oconnor663/pure_python_blake3) ================================================ FILE: third-party/blake3/reference_impl/reference_impl.rs ================================================ //! This is the reference implementation of BLAKE3. It is used for testing and //! as a readable example of the algorithms involved. Section 5.1 of [the BLAKE3 //! 
//! spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf)
//! discusses this implementation. You can render docs for this implementation
//! by running `cargo doc --open` in this directory.
//!
//! # Example
//!
//! ```
//! let mut hasher = reference_impl::Hasher::new();
//! hasher.update(b"abc");
//! hasher.update(b"def");
//! let mut hash = [0; 32];
//! hasher.finalize(&mut hash);
//! let mut extended_hash = [0; 500];
//! hasher.finalize(&mut extended_hash);
//! assert_eq!(hash, extended_hash[..32]);
//! ```

use core::cmp::min;

// Sizes in bytes. Root output is produced in blocks of 2 * OUT_LEN bytes, keys
// are KEY_LEN bytes, the block buffer holds BLOCK_LEN bytes, and a chunk is
// complete once it has absorbed CHUNK_LEN bytes.
const OUT_LEN: usize = 32;
const KEY_LEN: usize = 32;
const BLOCK_LEN: usize = 64;
const CHUNK_LEN: usize = 1024;

// Bit flags that get OR-ed into the `flags` word passed to `compress`.
const CHUNK_START: u32 = 1 << 0;
const CHUNK_END: u32 = 1 << 1;
const PARENT: u32 = 1 << 2;
const ROOT: u32 = 1 << 3;
const KEYED_HASH: u32 = 1 << 4;
const DERIVE_KEY_CONTEXT: u32 = 1 << 5;
const DERIVE_KEY_MATERIAL: u32 = 1 << 6;

// Initialization vector: the key words of the unkeyed hash, and (its first
// four words) part of every compression state.
const IV: [u32; 8] = [
    0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
];

// Message word schedule: `permute` sets new word i to old word
// MSG_PERMUTATION[i].
const MSG_PERMUTATION: [usize; 16] = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8];

// The mixing function, G, which mixes either a column or a diagonal.
// `a`, `b`, `c` and `d` are indices into `state`; `mx` and `my` are the two
// message words folded in. All additions wrap, and the rotation amounts are
// 16, 12, 8 and 7 bits, in that order.
fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, mx: u32, my: u32) {
    state[a] = state[a].wrapping_add(state[b]).wrapping_add(mx);
    state[d] = (state[d] ^ state[a]).rotate_right(16);
    state[c] = state[c].wrapping_add(state[d]);
    state[b] = (state[b] ^ state[c]).rotate_right(12);
    state[a] = state[a].wrapping_add(state[b]).wrapping_add(my);
    state[d] = (state[d] ^ state[a]).rotate_right(8);
    state[c] = state[c].wrapping_add(state[d]);
    state[b] = (state[b] ^ state[c]).rotate_right(7);
}

// One full round: four column mixes followed by four diagonal mixes, each
// consuming two of the sixteen message words in `m`.
fn round(state: &mut [u32; 16], m: &[u32; 16]) {
    // Mix the columns.
    g(state, 0, 4, 8, 12, m[0], m[1]);
    g(state, 1, 5, 9, 13, m[2], m[3]);
    g(state, 2, 6, 10, 14, m[4], m[5]);
    g(state, 3, 7, 11, 15, m[6], m[7]);
    // Mix the diagonals.
g(state, 0, 5, 10, 15, m[8], m[9]);
    g(state, 1, 6, 11, 12, m[10], m[11]);
    g(state, 2, 7, 8, 13, m[12], m[13]);
    g(state, 3, 4, 9, 14, m[14], m[15]);
}

// Reorder the sixteen message words in place: new word i is old word
// MSG_PERMUTATION[i].
fn permute(m: &mut [u32; 16]) {
    let mut shuffled = [0; 16];
    for (slot, &source_index) in shuffled.iter_mut().zip(MSG_PERMUTATION.iter()) {
        *slot = m[source_index];
    }
    *m = shuffled;
}

// The compression function. Returns the full 16-word state; callers take the
// first 8 words as a chaining value or serialize all 16 as root output.
fn compress(
    chaining_value: &[u32; 8],
    block_words: &[u32; 16],
    counter: u64,
    block_len: u32,
    flags: u32,
) -> [u32; 16] {
    // Words 0-7 are the input chaining value, words 8-11 the first four IV
    // words, then the counter (low half first), the block length and flags.
    let mut state = [0u32; 16];
    state[..8].copy_from_slice(chaining_value);
    state[8..12].copy_from_slice(&IV[..4]);
    state[12] = counter as u32;
    state[13] = (counter >> 32) as u32;
    state[14] = block_len;
    state[15] = flags;

    // Seven rounds in total, with the message words permuted before every
    // round except the first.
    let mut message = *block_words;
    round(&mut state, &message);
    for _ in 1..7 {
        permute(&mut message);
        round(&mut state, &message);
    }

    // XOR the upper half into the lower half, then the input chaining value
    // into the upper half.
    let (low, high) = state.split_at_mut(8);
    for ((lo, hi), cv) in low.iter_mut().zip(high.iter_mut()).zip(chaining_value.iter()) {
        *lo ^= *hi;
        *hi ^= *cv;
    }
    state
}

// Copy out the first eight words of a compression output.
fn first_8_words(compression_output: [u32; 16]) -> [u32; 8] {
    let mut leading = [0u32; 8];
    leading.copy_from_slice(&compression_output[..8]);
    leading
}

// Decode `bytes` as little-endian 32-bit words into `words`. The lengths are
// expected to match exactly; this is only checked in debug builds.
fn words_from_little_endian_bytes(bytes: &[u8], words: &mut [u32]) {
    debug_assert_eq!(bytes.len(), 4 * words.len());
    for (word, quad) in words.iter_mut().zip(bytes.chunks_exact(4)) {
        *word = u32::from_le_bytes([quad[0], quad[1], quad[2], quad[3]]);
    }
}

// Each chunk or parent node can produce either an 8-word chaining value or, by
// setting the ROOT flag, any number of final output bytes. The Output struct
// captures the state just prior to choosing between those two possibilities.
struct Output {
    input_chaining_value: [u32; 8],
    block_words: [u32; 16],
    counter: u64,
    block_len: u32,
    flags: u32,
}

impl Output {
    // Compress with the stored counter and flags and keep the first 8 words.
    fn chaining_value(&self) -> [u32; 8] {
        let full_state = compress(
            &self.input_chaining_value,
            &self.block_words,
            self.counter,
            self.block_len,
            self.flags,
        );
        first_8_words(full_state)
    }

    // Fill `out_slice` with root output. Each 64-byte output block comes from
    // one ROOT compression whose counter is the index of that output block.
    fn root_output_bytes(&self, out_slice: &mut [u8]) {
        for (block_index, out_block) in out_slice.chunks_mut(2 * OUT_LEN).enumerate() {
            let words = compress(
                &self.input_chaining_value,
                &self.block_words,
                block_index as u64,
                self.block_len,
                self.flags | ROOT,
            );
            // The output length might not be a multiple of 4.
            for (out_word, word) in out_block.chunks_mut(4).zip(words.iter()) {
                let word_bytes = word.to_le_bytes();
                out_word.copy_from_slice(&word_bytes[..out_word.len()]);
            }
        }
    }
}

// Incremental state for hashing a single chunk.
struct ChunkState {
    chaining_value: [u32; 8],
    chunk_counter: u64,
    block: [u8; BLOCK_LEN],
    block_len: u8,
    blocks_compressed: u8,
    flags: u32,
}

impl ChunkState {
    // Start an empty chunk with the given key words, chunk index and flags.
    fn new(key_words: [u32; 8], chunk_counter: u64, flags: u32) -> Self {
        Self {
            flags,
            chunk_counter,
            chaining_value: key_words,
            blocks_compressed: 0,
            block_len: 0,
            block: [0; BLOCK_LEN],
        }
    }

    // Total bytes absorbed so far: compressed blocks plus buffered bytes.
    fn len(&self) -> usize {
        let compressed_bytes = BLOCK_LEN * self.blocks_compressed as usize;
        compressed_bytes + self.block_len as usize
    }

    // CHUNK_START applies only to the first block of the chunk.
    fn start_flag(&self) -> u32 {
        match self.blocks_compressed {
            0 => CHUNK_START,
            _ => 0,
        }
    }

    fn update(&mut self, mut input: &[u8]) {
        while !input.is_empty() {
            // If the block buffer is full, compress it and clear it. More
            // input is coming, so this compression is not CHUNK_END.
            if self.block_len as usize == BLOCK_LEN {
                let mut block_words = [0; 16];
                words_from_little_endian_bytes(&self.block, &mut block_words);
                self.chaining_value = first_8_words(compress(
                    &self.chaining_value,
                    &block_words,
                    self.chunk_counter,
                    BLOCK_LEN as u32,
                    self.flags | self.start_flag(),
                ));
                self.blocks_compressed += 1;
                self.block = [0; BLOCK_LEN];
                self.block_len = 0;
            }

            // Copy input bytes into the block buffer.
let want = BLOCK_LEN - self.block_len as usize;
            let take = min(want, input.len());
            self.block[self.block_len as usize..][..take].copy_from_slice(&input[..take]);
            self.block_len += take as u8;
            input = &input[take..];
        }
    }

    // Capture the buffered (final) block of this chunk as an `Output`, with
    // CHUNK_END set and CHUNK_START added if no block was compressed yet.
    fn output(&self) -> Output {
        let final_flags = self.flags | self.start_flag() | CHUNK_END;
        let mut words = [0u32; 16];
        words_from_little_endian_bytes(&self.block, &mut words);
        Output {
            input_chaining_value: self.chaining_value,
            block_words: words,
            counter: self.chunk_counter,
            block_len: u32::from(self.block_len),
            flags: final_flags,
        }
    }
}

// Build the `Output` of a parent node whose message block is the left child's
// chaining value followed by the right child's.
fn parent_output(
    left_child_cv: [u32; 8],
    right_child_cv: [u32; 8],
    key_words: [u32; 8],
    flags: u32,
) -> Output {
    let mut children = [0u32; 16];
    let (left_half, right_half) = children.split_at_mut(8);
    left_half.copy_from_slice(&left_child_cv);
    right_half.copy_from_slice(&right_child_cv);
    Output {
        input_chaining_value: key_words,
        block_words: children,
        counter: 0,                  // Always 0 for parent nodes.
        block_len: BLOCK_LEN as u32, // Always BLOCK_LEN (64) for parent nodes.
        flags: PARENT | flags,
    }
}

// Chaining value of the parent node joining the two given children.
fn parent_cv(
    left_child_cv: [u32; 8],
    right_child_cv: [u32; 8],
    key_words: [u32; 8],
    flags: u32,
) -> [u32; 8] {
    let parent = parent_output(left_child_cv, right_child_cv, key_words, flags);
    parent.chaining_value()
}

/// An incremental hasher that can accept any number of writes.
pub struct Hasher {
    chunk_state: ChunkState,
    key_words: [u32; 8],
    cv_stack: [[u32; 8]; 54], // Space for 54 subtree chaining values:
    cv_stack_len: u8,         // 2^54 * CHUNK_LEN = 2^64
    flags: u32,
}

impl Hasher {
    // Shared constructor: an empty CV stack and a fresh chunk 0 that uses the
    // given key words and mode flags.
    fn new_internal(key_words: [u32; 8], flags: u32) -> Self {
        let first_chunk = ChunkState::new(key_words, 0, flags);
        Self {
            chunk_state: first_chunk,
            key_words,
            cv_stack: [[0; 8]; 54],
            cv_stack_len: 0,
            flags,
        }
    }

    /// Construct a new `Hasher` for the regular hash function.
    pub fn new() -> Self {
        Self::new_internal(IV, 0)
    }

    /// Construct a new `Hasher` for the keyed hash function.
pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self { let mut key_words = [0; 8]; words_from_little_endian_bytes(key, &mut key_words); Self::new_internal(key_words, KEYED_HASH) } /// Construct a new `Hasher` for the key derivation function. The context /// string should be hardcoded, globally unique, and application-specific. pub fn new_derive_key(context: &str) -> Self { let mut context_hasher = Self::new_internal(IV, DERIVE_KEY_CONTEXT); context_hasher.update(context.as_bytes()); let mut context_key = [0; KEY_LEN]; context_hasher.finalize(&mut context_key); let mut context_key_words = [0; 8]; words_from_little_endian_bytes(&context_key, &mut context_key_words); Self::new_internal(context_key_words, DERIVE_KEY_MATERIAL) } fn push_stack(&mut self, cv: [u32; 8]) { self.cv_stack[self.cv_stack_len as usize] = cv; self.cv_stack_len += 1; } fn pop_stack(&mut self) -> [u32; 8] { self.cv_stack_len -= 1; self.cv_stack[self.cv_stack_len as usize] } // Section 5.1.2 of the BLAKE3 spec explains this algorithm in more detail. fn add_chunk_chaining_value(&mut self, mut new_cv: [u32; 8], mut total_chunks: u64) { // This chunk might complete some subtrees. For each completed subtree, // its left child will be the current top entry in the CV stack, and // its right child will be the current value of `new_cv`. Pop each left // child off the stack, merge it with `new_cv`, and overwrite `new_cv` // with the result. After all these merges, push the final value of // `new_cv` onto the stack. The number of completed subtrees is given // by the number of trailing 0-bits in the new total number of chunks. while total_chunks & 1 == 0 { new_cv = parent_cv(self.pop_stack(), new_cv, self.key_words, self.flags); total_chunks >>= 1; } self.push_stack(new_cv); } /// Add input to the hash state. This can be called any number of times. pub fn update(&mut self, mut input: &[u8]) { while !input.is_empty() { // If the current chunk is complete, finalize it and reset the // chunk state. 
More input is coming, so this chunk is not ROOT. if self.chunk_state.len() == CHUNK_LEN { let chunk_cv = self.chunk_state.output().chaining_value(); let total_chunks = self.chunk_state.chunk_counter + 1; self.add_chunk_chaining_value(chunk_cv, total_chunks); self.chunk_state = ChunkState::new(self.key_words, total_chunks, self.flags); } // Compress input bytes into the current chunk state. let want = CHUNK_LEN - self.chunk_state.len(); let take = min(want, input.len()); self.chunk_state.update(&input[..take]); input = &input[take..]; } } /// Finalize the hash and write any number of output bytes. pub fn finalize(&self, out_slice: &mut [u8]) { // Starting with the Output from the current chunk, compute all the // parent chaining values along the right edge of the tree, until we // have the root Output. let mut output = self.chunk_state.output(); let mut parent_nodes_remaining = self.cv_stack_len as usize; while parent_nodes_remaining > 0 { parent_nodes_remaining -= 1; output = parent_output( self.cv_stack[parent_nodes_remaining], output.chaining_value(), self.key_words, self.flags, ); } output.root_output_bytes(out_slice); } } ================================================ FILE: third-party/blake3/src/ffi_avx2.rs ================================================ use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; // Note that there is no AVX2 implementation of compress_in_place or // compress_xof. // Unsafe because this may only be called on platforms supporting AVX2. pub unsafe fn hash_many( inputs: &[&[u8; N]], key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8], ) { unsafe { // The Rust hash_many implementations do bounds checking on the `out` // array, but the C implementations don't. Even though this is an unsafe // function, assert the bounds here. 
assert!(out.len() >= inputs.len() * OUT_LEN);
        ffi::blake3_hash_many_avx2(
            inputs.as_ptr() as *const *const u8,
            inputs.len(),
            N / BLOCK_LEN,
            key.as_ptr(),
            counter,
            increment_counter.yes(),
            flags,
            flags_start,
            flags_end,
            out.as_mut_ptr(),
        )
    }
}

// Raw declaration of the AVX2 entry point provided by the C/assembly build.
pub mod ffi {
    extern "C" {
        pub fn blake3_hash_many_avx2(
            inputs: *const *const u8,
            num_inputs: usize,
            blocks: usize,
            key: *const u32,
            counter: u64,
            increment_counter: bool,
            flags: u8,
            flags_start: u8,
            flags_end: u8,
            out: *mut u8,
        );
    }
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_hash_many() {
        // Skip silently on machines without AVX2.
        if !crate::platform::avx2_detected() {
            return;
        }
        crate::test::test_hash_many_fn(hash_many, hash_many);
    }
}

================================================
FILE: third-party/blake3/src/ffi_avx512.rs
================================================
use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};

// Unsafe because this may only be called on platforms supporting AVX-512.
pub unsafe fn compress_in_place(
    cv: &mut CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) {
    unsafe {
        ffi::blake3_compress_in_place_avx512(
            cv.as_mut_ptr(),
            block.as_ptr(),
            block_len,
            counter,
            flags,
        )
    }
}

// Unsafe because this may only be called on platforms supporting AVX-512.
pub unsafe fn compress_xof(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [u8; 64] {
    unsafe {
        // The C side writes the 64 output bytes into this buffer.
        let mut out = [0u8; 64];
        ffi::blake3_compress_xof_avx512(
            cv.as_ptr(),
            block.as_ptr(),
            block_len,
            counter,
            flags,
            out.as_mut_ptr(),
        );
        out
    }
}

// Unsafe because this may only be called on platforms supporting AVX-512.
// `N` is the byte length of every input; the C side receives it as a block
// count (`N / BLOCK_LEN`).
pub unsafe fn hash_many<const N: usize>(
    inputs: &[&[u8; N]],
    key: &CVWords,
    counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut [u8],
) {
    unsafe {
        // The Rust hash_many implementations do bounds checking on the `out`
        // array, but the C implementations don't. Even though this is an unsafe
        // function, assert the bounds here.
assert!(out.len() >= inputs.len() * OUT_LEN);
        ffi::blake3_hash_many_avx512(
            inputs.as_ptr() as *const *const u8,
            inputs.len(),
            N / BLOCK_LEN,
            key.as_ptr(),
            counter,
            increment_counter.yes(),
            flags,
            flags_start,
            flags_end,
            out.as_mut_ptr(),
        )
    }
}

// Unsafe because this may only be called on platforms supporting AVX-512.
// `out` is filled in whole blocks; the length check is debug-only.
#[cfg(unix)]
pub unsafe fn xof_many(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
    out: &mut [u8],
) {
    debug_assert_eq!(0, out.len() % BLOCK_LEN, "whole blocks only");
    let outblocks = out.len() / BLOCK_LEN;
    unsafe {
        ffi::blake3_xof_many_avx512(
            cv.as_ptr(),
            block.as_ptr(),
            block_len,
            counter,
            flags,
            out.as_mut_ptr(),
            outblocks,
        );
    }
}

// Raw declarations of the AVX-512 entry points provided by the C/assembly
// build. The XOF routine is only declared on unix targets.
pub mod ffi {
    extern "C" {
        pub fn blake3_compress_in_place_avx512(
            cv: *mut u32,
            block: *const u8,
            block_len: u8,
            counter: u64,
            flags: u8,
        );
        pub fn blake3_compress_xof_avx512(
            cv: *const u32,
            block: *const u8,
            block_len: u8,
            counter: u64,
            flags: u8,
            out: *mut u8,
        );
        pub fn blake3_hash_many_avx512(
            inputs: *const *const u8,
            num_inputs: usize,
            blocks: usize,
            key: *const u32,
            counter: u64,
            increment_counter: bool,
            flags: u8,
            flags_start: u8,
            flags_end: u8,
            out: *mut u8,
        );
        #[cfg(unix)]
        pub fn blake3_xof_many_avx512(
            cv: *const u32,
            block: *const u8,
            block_len: u8,
            counter: u64,
            flags: u8,
            out: *mut u8,
            outblocks: usize,
        );
    }
}

#[cfg(test)]
mod test {
    use super::*;

    // Each test is a no-op on machines where AVX-512 is not detected.

    #[test]
    fn test_compress() {
        if crate::platform::avx512_detected() {
            crate::test::test_compress_fn(compress_in_place, compress_xof);
        }
    }

    #[test]
    fn test_hash_many() {
        if crate::platform::avx512_detected() {
            crate::test::test_hash_many_fn(hash_many, hash_many);
        }
    }

    #[cfg(unix)]
    #[test]
    fn test_xof_many() {
        if crate::platform::avx512_detected() {
            crate::test::test_xof_many_fn(xof_many);
        }
    }
}

================================================
FILE: third-party/blake3/src/ffi_neon.rs
================================================
use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};

// Unsafe because this may only be called on
platforms supporting NEON. pub unsafe fn hash_many( inputs: &[&[u8; N]], key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8], ) { // The Rust hash_many implementations do bounds checking on the `out` // array, but the C implementations don't. Even though this is an unsafe // function, assert the bounds here. assert!(out.len() >= inputs.len() * OUT_LEN); ffi::blake3_hash_many_neon( inputs.as_ptr() as *const *const u8, inputs.len(), N / BLOCK_LEN, key.as_ptr(), counter, increment_counter.yes(), flags, flags_start, flags_end, out.as_mut_ptr(), ) } // blake3_neon.c normally depends on blake3_portable.c, because the NEON // implementation only provides 4x compression, and it relies on the portable // implementation for 1x compression. However, we expose the portable Rust // implementation here instead, to avoid linking in unnecessary code. #[no_mangle] pub extern "C" fn blake3_compress_in_place_portable( cv: *mut u32, block: *const u8, block_len: u8, counter: u64, flags: u8, ) { unsafe { crate::portable::compress_in_place( &mut *(cv as *mut [u32; 8]), &*(block as *const [u8; 64]), block_len, counter, flags, ) } } pub mod ffi { extern "C" { pub fn blake3_hash_many_neon( inputs: *const *const u8, num_inputs: usize, blocks: usize, key: *const u32, counter: u64, increment_counter: bool, flags: u8, flags_start: u8, flags_end: u8, out: *mut u8, ); } } #[cfg(test)] mod test { use super::*; #[test] fn test_hash_many() { // This entire file is gated on feature="neon", so NEON support is // assumed here. crate::test::test_hash_many_fn(hash_many, hash_many); } } ================================================ FILE: third-party/blake3/src/ffi_sse2.rs ================================================ use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN}; // Unsafe because this may only be called on platforms supporting SSE2. 
pub unsafe fn compress_in_place( cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) { unsafe { ffi::blake3_compress_in_place_sse2( cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags, ) } } // Unsafe because this may only be called on platforms supporting SSE2. pub unsafe fn compress_xof( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [u8; 64] { unsafe { let mut out = [0u8; 64]; ffi::blake3_compress_xof_sse2( cv.as_ptr(), block.as_ptr(), block_len, counter, flags, out.as_mut_ptr(), ); out } } // Unsafe because this may only be called on platforms supporting SSE2. pub unsafe fn hash_many( inputs: &[&[u8; N]], key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8], ) { unsafe { // The Rust hash_many implementations do bounds checking on the `out` // array, but the C implementations don't. Even though this is an unsafe // function, assert the bounds here. 
assert!(out.len() >= inputs.len() * OUT_LEN);
        ffi::blake3_hash_many_sse2(
            inputs.as_ptr() as *const *const u8,
            inputs.len(),
            N / BLOCK_LEN,
            key.as_ptr(),
            counter,
            increment_counter.yes(),
            flags,
            flags_start,
            flags_end,
            out.as_mut_ptr(),
        )
    }
}

// Raw declarations of the SSE2 entry points provided by the C/assembly build.
pub mod ffi {
    extern "C" {
        pub fn blake3_compress_in_place_sse2(
            cv: *mut u32,
            block: *const u8,
            block_len: u8,
            counter: u64,
            flags: u8,
        );
        pub fn blake3_compress_xof_sse2(
            cv: *const u32,
            block: *const u8,
            block_len: u8,
            counter: u64,
            flags: u8,
            out: *mut u8,
        );
        pub fn blake3_hash_many_sse2(
            inputs: *const *const u8,
            num_inputs: usize,
            blocks: usize,
            key: *const u32,
            counter: u64,
            increment_counter: bool,
            flags: u8,
            flags_start: u8,
            flags_end: u8,
            out: *mut u8,
        );
    }
}

#[cfg(test)]
mod test {
    use super::*;

    // Each test is a no-op on machines where SSE2 is not detected.

    #[test]
    fn test_compress() {
        if crate::platform::sse2_detected() {
            crate::test::test_compress_fn(compress_in_place, compress_xof);
        }
    }

    #[test]
    fn test_hash_many() {
        if crate::platform::sse2_detected() {
            crate::test::test_hash_many_fn(hash_many, hash_many);
        }
    }
}

================================================
FILE: third-party/blake3/src/ffi_sse41.rs
================================================
use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};

// Unsafe because this may only be called on platforms supporting SSE4.1.
pub unsafe fn compress_in_place(
    cv: &mut CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) {
    let cv_ptr = cv.as_mut_ptr();
    let block_ptr = block.as_ptr();
    unsafe { ffi::blake3_compress_in_place_sse41(cv_ptr, block_ptr, block_len, counter, flags) }
}

// Unsafe because this may only be called on platforms supporting SSE4.1.
pub unsafe fn compress_xof(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [u8; 64] {
    // The C side writes the 64 output bytes into this buffer.
    let mut xof_block = [0u8; 64];
    unsafe {
        ffi::blake3_compress_xof_sse41(
            cv.as_ptr(),
            block.as_ptr(),
            block_len,
            counter,
            flags,
            xof_block.as_mut_ptr(),
        );
    }
    xof_block
}

// Unsafe because this may only be called on platforms supporting SSE4.1.
pub unsafe fn hash_many<const N: usize>(
    inputs: &[&[u8; N]],
    key: &CVWords,
    counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut [u8],
) {
    // `N` is the byte length of every input; the C side receives it as a
    // block count (`N / BLOCK_LEN`).
    unsafe {
        // The Rust hash_many implementations do bounds checking on the `out`
        // array, but the C implementations don't. Even though this is an unsafe
        // function, assert the bounds here.
        assert!(out.len() >= inputs.len() * OUT_LEN);
        ffi::blake3_hash_many_sse41(
            inputs.as_ptr() as *const *const u8,
            inputs.len(),
            N / BLOCK_LEN,
            key.as_ptr(),
            counter,
            increment_counter.yes(),
            flags,
            flags_start,
            flags_end,
            out.as_mut_ptr(),
        )
    }
}

// Raw declarations of the SSE4.1 entry points provided by the C/assembly build.
pub mod ffi {
    extern "C" {
        pub fn blake3_compress_in_place_sse41(
            cv: *mut u32,
            block: *const u8,
            block_len: u8,
            counter: u64,
            flags: u8,
        );
        pub fn blake3_compress_xof_sse41(
            cv: *const u32,
            block: *const u8,
            block_len: u8,
            counter: u64,
            flags: u8,
            out: *mut u8,
        );
        pub fn blake3_hash_many_sse41(
            inputs: *const *const u8,
            num_inputs: usize,
            blocks: usize,
            key: *const u32,
            counter: u64,
            increment_counter: bool,
            flags: u8,
            flags_start: u8,
            flags_end: u8,
            out: *mut u8,
        );
    }
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_compress() {
        // Skip silently on machines without SSE4.1.
        if !crate::platform::sse41_detected() {
            return;
        }
        crate::test::test_compress_fn(compress_in_place, compress_xof);
    }

    #[test]
    fn test_hash_many() {
        // Skip silently on machines without SSE4.1.
        if !crate::platform::sse41_detected() {
            return;
        }
        crate::test::test_hash_many_fn(hash_many, hash_many);
    }
}

================================================
FILE: third-party/blake3/src/guts.rs
================================================
//! Deprecated in favor of [`hazmat`](crate::hazmat)

pub use crate::{BLOCK_LEN, CHUNK_LEN};

// Public newtype around the crate-internal chunk state.
#[derive(Clone, Debug)]
pub struct ChunkState(crate::ChunkState);

impl ChunkState {
    // Currently this type only supports the regular hash mode. If an
    // incremental user needs keyed_hash or derive_key, we can add that.
pub fn new(chunk_counter: u64) -> Self {
        // Regular hash mode: IV as the key words and no mode flags.
        let platform = crate::platform::Platform::detect();
        let inner = crate::ChunkState::new(crate::IV, chunk_counter, 0, platform);
        Self(inner)
    }

    #[inline]
    pub fn len(&self) -> usize {
        let inner = &self.0;
        inner.count()
    }

    // Feeds `input` to the wrapped state and returns `self` for chaining.
    #[inline]
    pub fn update(&mut self, input: &[u8]) -> &mut Self {
        let inner = &mut self.0;
        inner.update(input);
        self
    }

    // Root finalization yields the root hash; otherwise the chunk's chaining
    // value is returned, converted into a `Hash`.
    pub fn finalize(&self, is_root: bool) -> crate::Hash {
        let output = self.0.output();
        match is_root {
            true => output.root_hash(),
            false => output.chaining_value().into(),
        }
    }
}

// As above, this currently assumes the regular hash mode. If an incremental
// user needs keyed_hash or derive_key, we can add that.
pub fn parent_cv(
    left_child: &crate::Hash,
    right_child: &crate::Hash,
    is_root: bool,
) -> crate::Hash {
    let platform = crate::platform::Platform::detect();
    let output = crate::parent_node_output(
        left_child.as_bytes(),
        right_child.as_bytes(),
        crate::IV,
        0,
        platform,
    );
    match is_root {
        true => output.root_hash(),
        false => output.chaining_value().into(),
    }
}

================================================
FILE: third-party/blake3/src/hazmat.rs
================================================
//! Low-level tree manipulations and other sharp tools
//!
//! The target audience for this module is projects like [Bao](https://github.com/oconnor663/bao),
//! which work directly with the interior hashes ("chaining values") of BLAKE3 chunks and subtrees.
//! For example, you could use these functions to implement a BitTorrent-like protocol using the
//! BLAKE3 tree structure, or to hash an input that's distributed across different machines. These
//! use cases are advanced, and most applications don't need this module. Also:
//!
//!
//! //! **Warning:** This module is *hazardous material*. If you've heard folks say *don't roll your //! own crypto,* this is the sort of thing they're talking about. These functions have complicated //! requirements, and any mistakes will give you garbage output and/or break the security //! properties that BLAKE3 is supposed to have. Read section 2.1 of [the BLAKE3 //! paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf) to understand the //! tree structure you need to maintain. Test your code against [`blake3::hash`](../fn.hash.html) //! and make sure you can get the same outputs for [lots of different //! inputs](https://github.com/BLAKE3-team/BLAKE3/blob/master/test_vectors/test_vectors.json). //! //!
//! //! On the other hand: //! //!
//! //! **Encouragement:** Playing with these functions is a great way to learn how BLAKE3 works on the //! inside. Have fun! //! //!
//! //! The main entrypoint for this module is the [`HasherExt`] trait, particularly the //! [`set_input_offset`](HasherExt::set_input_offset) and //! [`finalize_non_root`](HasherExt::finalize_non_root) methods. These let you compute the chaining //! values of individual chunks or subtrees. You then combine these chaining values into larger //! subtrees using [`merge_subtrees_non_root`] and finally (once at the very top) //! [`merge_subtrees_root`] or [`merge_subtrees_root_xof`]. //! //! # Examples //! //! Here's an example of computing all the interior hashes in a 3-chunk tree: //! //! ```text //! root //! / \ //! parent \ //! / \ \ //! chunk0 chunk1 chunk2 //! ``` //! //! ``` //! # fn main() { //! use blake3::{Hasher, CHUNK_LEN}; //! use blake3::hazmat::{merge_subtrees_non_root, merge_subtrees_root, Mode}; //! use blake3::hazmat::HasherExt; // an extension trait for Hasher //! //! let chunk0 = [b'a'; CHUNK_LEN]; //! let chunk1 = [b'b'; CHUNK_LEN]; //! let chunk2 = [b'c'; 42]; // The final chunk can be short. //! //! // Compute the non-root hashes ("chaining values") of all three chunks. Chunks or subtrees //! // that don't begin at the start of the input use `set_input_offset` to say where they begin. //! let chunk0_cv = Hasher::new() //! // .set_input_offset(0) is the default. //! .update(&chunk0) //! .finalize_non_root(); //! let chunk1_cv = Hasher::new() //! .set_input_offset(CHUNK_LEN as u64) //! .update(&chunk1) //! .finalize_non_root(); //! let chunk2_cv = Hasher::new() //! .set_input_offset(2 * CHUNK_LEN as u64) //! .update(&chunk2) //! .finalize_non_root(); //! //! // Join the first two chunks with a non-root parent node and compute its chaining value. //! let parent_cv = merge_subtrees_non_root(&chunk0_cv, &chunk1_cv, Mode::Hash); //! //! // Join that parent node and the third chunk with a root parent node and compute the hash. //! let root_hash = merge_subtrees_root(&parent_cv, &chunk2_cv, Mode::Hash); //! //! 
// Double check that we got the right answer. //! let mut combined_input = Vec::new(); //! combined_input.extend_from_slice(&chunk0); //! combined_input.extend_from_slice(&chunk1); //! combined_input.extend_from_slice(&chunk2); //! assert_eq!(root_hash, blake3::hash(&combined_input)); //! # } //! ``` //! //! Hashing many chunks together is important for performance, because it allows the implementation //! to use SIMD parallelism internally. ([AVX-512](https://en.wikipedia.org/wiki/AVX-512) for //! example needs 16 chunks to really get going.) We can reproduce `parent_cv` by hashing `chunk0` //! and `chunk1` at the same time: //! //! ``` //! # fn main() { //! # use blake3::{Hasher, CHUNK_LEN}; //! # use blake3::hazmat::{Mode, HasherExt, merge_subtrees_non_root, merge_subtrees_root}; //! # let chunk0 = [b'a'; CHUNK_LEN]; //! # let chunk1 = [b'b'; CHUNK_LEN]; //! # let chunk0_cv = Hasher::new().update(&chunk0).finalize_non_root(); //! # let chunk1_cv = Hasher::new().set_input_offset(CHUNK_LEN as u64).update(&chunk1).finalize_non_root(); //! # let parent_cv = merge_subtrees_non_root(&chunk0_cv, &chunk1_cv, Mode::Hash); //! # let mut combined_input = Vec::new(); //! # combined_input.extend_from_slice(&chunk0); //! # combined_input.extend_from_slice(&chunk1); //! let left_subtree_cv = Hasher::new() //! // .set_input_offset(0) is the default. //! .update(&combined_input[..2 * CHUNK_LEN]) //! .finalize_non_root(); //! assert_eq!(left_subtree_cv, parent_cv); //! //! // Using multiple updates gives the same answer, though it's not as efficient. //! let mut subtree_hasher = Hasher::new(); //! // Again, .set_input_offset(0) is the default. //! subtree_hasher.update(&chunk0); //! subtree_hasher.update(&chunk1); //! assert_eq!(left_subtree_cv, subtree_hasher.finalize_non_root()); //! # } //! ``` //! //! However, hashing multiple chunks together **must** respect the overall tree structure. Hashing //! 
`chunk0` and `chunk1` together is valid, but hashing `chunk1` and `chunk2` together is //! incorrect and gives a garbage result that will never match a standard BLAKE3 hash. The //! implementation includes a few best-effort asserts to catch some of these mistakes, but these //! checks aren't guaranteed. For example, this second call to `update` currently panics: //! //! ```should_panic //! # fn main() { //! # use blake3::{Hasher, CHUNK_LEN}; //! # use blake3::hazmat::HasherExt; //! # let chunk0 = [b'a'; CHUNK_LEN]; //! # let chunk1 = [b'b'; CHUNK_LEN]; //! # let chunk2 = [b'c'; 42]; //! let oops = Hasher::new() //! .set_input_offset(CHUNK_LEN as u64) //! .update(&chunk1) //! // PANIC: "the subtree starting at 1024 contains at most 1024 bytes" //! .update(&chunk2) //! .finalize_non_root(); //! # } //! ``` //! //! For more on valid tree structures, see the docs for [`left_subtree_len`] and //! [`max_subtree_len`], and see section 2.1 of [the BLAKE3 //! paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). Note that the //! merging functions ([`merge_subtrees_root`] and friends) don't know the shape of the left and //! right subtrees you're giving them, and they can't help you catch mistakes. The best way to //! catch mistakes with these is to compare your root output to the [`blake3::hash`](crate::hash) //! of the same input. use crate::platform::Platform; use crate::{CVWords, Hasher, CHUNK_LEN, IV, KEY_LEN, OUT_LEN}; /// Extension methods for [`Hasher`]. This is the main entrypoint to the `hazmat` module. pub trait HasherExt { /// Similar to [`Hasher::new_derive_key`] but using a pre-hashed [`ContextKey`] from /// [`hash_derive_key_context`].
/// /// The [`hash_derive_key_context`] function is the _only_ valid source of the [`ContextKey`]. /// /// # Example /// /// ``` /// use blake3::Hasher; /// use blake3::hazmat::HasherExt; /// /// let context_key = blake3::hazmat::hash_derive_key_context("foo"); /// let mut hasher = Hasher::new_from_context_key(&context_key); /// hasher.update(b"bar"); /// let derived_key = *hasher.finalize().as_bytes(); /// /// assert_eq!(derived_key, blake3::derive_key("foo", b"bar")); /// ``` fn new_from_context_key(context_key: &ContextKey) -> Self; /// Configure the `Hasher` to process a chunk or subtree starting at `offset` bytes into the /// whole input. /// /// You must call this function before processing any input with [`update`](Hasher::update) or /// similar. This step isn't required for the first chunk, or for a subtree that includes the /// first chunk (i.e. when the `offset` is zero), but it's required for all other chunks and /// subtrees. /// /// The starting input offset of a subtree implies a maximum possible length for that subtree. /// See [`max_subtree_len`] and section 2.1 of [the BLAKE3 /// paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). Note that only /// subtrees along the right edge of the whole tree can have a length less than their maximum /// possible length. /// /// See the [module level examples](index.html#examples). /// /// # Panics /// /// This function panics if the `Hasher` has already accepted any input with /// [`update`](Hasher::update) or similar. /// /// This should always be paired with [`finalize_non_root`](HasherExt::finalize_non_root). It's /// never correct to use a non-zero input offset with [`finalize`](Hasher::finalize) or /// [`finalize_xof`](Hasher::finalize_xof). The `offset` must also be a multiple of /// `CHUNK_LEN`. Violating either of these rules will currently fail an assertion and panic, /// but this is not guaranteed.
fn set_input_offset(&mut self, offset: u64) -> &mut Self;

    /// Finalize the non-root hash ("chaining value") of the current chunk or subtree.
    ///
    /// Afterwards you can merge subtree chaining values into parent nodes using
    /// [`merge_subtrees_non_root`] and ultimately into the root node with either
    /// [`merge_subtrees_root`] (similar to [`Hasher::finalize`]) or [`merge_subtrees_root_xof`]
    /// (similar to [`Hasher::finalize_xof`]).
    ///
    /// See the [module level examples](index.html#examples), particularly the discussion of valid
    /// tree structures.
    fn finalize_non_root(&self) -> ChainingValue;
}

impl HasherExt for Hasher {
    fn new_from_context_key(context_key: &[u8; KEY_LEN]) -> Hasher {
        // The key bytes are converted to little-endian words, and the hasher is created with the
        // DERIVE_KEY_MATERIAL flag.
        let context_key_words = crate::platform::words_from_le_bytes_32(context_key);
        Hasher::new_internal(&context_key_words, crate::DERIVE_KEY_MATERIAL)
    }

    fn set_input_offset(&mut self, offset: u64) -> &mut Hasher {
        // The offset may only be set on a hasher that hasn't consumed any input yet.
        assert_eq!(self.count(), 0, "hasher has already accepted input");
        // Subtrees always start on a chunk boundary.
        assert_eq!(
            offset % CHUNK_LEN as u64,
            0,
            "offset ({offset}) must be a chunk boundary (divisible by {CHUNK_LEN})",
        );
        // The byte offset is stored as a chunk counter, both in the current chunk state and as the
        // hasher's initial chunk counter.
        let counter = offset / CHUNK_LEN as u64;
        self.chunk_state.chunk_counter = counter;
        self.initial_chunk_counter = counter;
        self
    }

    fn finalize_non_root(&self) -> ChainingValue {
        // A subtree must contain at least one byte of input.
        assert_ne!(self.count(), 0, "empty subtrees are never valid");
        // Take the chaining value of the final output rather than its root hash.
        self.final_output().chaining_value()
    }
}

/// The maximum length of a subtree in bytes, given its starting offset in bytes
///
/// If you try to hash more than this many bytes as one subtree, you'll end up merging parent nodes
/// that shouldn't be merged, and your output will be garbage. [`Hasher::update`] will currently
/// panic in this case, but this is not guaranteed.
///
/// For input offset zero (the default), there is no maximum length, and this function returns
/// `None`. For all other offsets it returns `Some`. Note that valid offsets must be a multiple of
/// [`CHUNK_LEN`] (1024); it's not possible to start hashing a chunk in the middle.
/// /// In the example tree below, chunks are numbered by their _0-based index_. The subtree that /// _starts_ with chunk 3, i.e. `input_offset = 3 * CHUNK_LEN`, includes only that one chunk, so /// its max length is `Some(CHUNK_LEN)`. The subtree that starts with chunk 6 includes chunk 7 but /// not chunk 8, so its max length is `Some(2 * CHUNK_LEN)`. The subtree that starts with chunk 12 /// includes chunks 13, 14, and 15, but if the tree were bigger it would not include chunk 16, so /// its max length is `Some(4 * CHUNK_LEN)`. One way to think about the rule here is that, if you /// go beyond the max subtree length from a given starting offset, you start dealing with subtrees /// that include chunks _to the left_ of where you started. /// /// ```text /// root /// / \ /// . . /// / \ / \ /// . . . . /// / \ / \ / \ / \ /// . . . . . . . . /// / \ / \ / \ / \ / \ / \ / \ / \ /// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 /// ``` /// /// The general rule turns out to be that for a subtree starting at a 0-based chunk index N greater /// than zero, the maximum number of chunks in that subtree is the largest power-of-two that /// divides N, which is given by `1 << N.trailing_zeros()`. /// /// This function can be useful for writing tests or debug assertions, but it's actually rare to /// use this for real control flow. Callers who split their input recursively using /// [`left_subtree_len`] will automatically satisfy the `max_subtree_len` bound and don't /// necessarily need to check. It's also common to choose some fixed power-of-two subtree size, say /// 64 chunks, and divide your input up into slices of that fixed length (with the final slice /// possibly short). This approach also automatically satisfies the `max_subtree_len` bound and /// doesn't need to check. Proving that this is true can be an interesting exercise. Note that /// chunks 0, 4, 8, and 12 all begin subtrees of at least 4 chunks in the example tree above. 
///
/// # Panics
///
/// This function currently panics if `input_offset` is not a multiple of `CHUNK_LEN`. This is not
/// guaranteed.
#[inline(always)]
pub fn max_subtree_len(input_offset: u64) -> Option<u64> {
    // A subtree starting at the very beginning of the input has no upper bound.
    if input_offset == 0 {
        return None;
    }
    assert_eq!(input_offset % CHUNK_LEN as u64, 0);
    let counter = input_offset / CHUNK_LEN as u64;
    // The largest power of two dividing the chunk index is the maximum number of chunks.
    let max_chunks: u64 = 1 << counter.trailing_zeros();
    Some(max_chunks * CHUNK_LEN as u64)
}

#[test]
fn test_max_subtree_len() {
    assert_eq!(max_subtree_len(0), None);
    // (chunk index, max chunks)
    let cases = [
        (1, 1),
        (2, 2),
        (3, 1),
        (4, 4),
        (5, 1),
        (6, 2),
        (7, 1),
        (8, 8),
    ];
    for (chunk_index, max_chunks) in cases {
        let input_offset = chunk_index * CHUNK_LEN as u64;
        assert_eq!(
            max_subtree_len(input_offset),
            Some(max_chunks * CHUNK_LEN as u64),
        );
    }
}

/// Given the length in bytes of either a complete input or a subtree input, return the number of
/// bytes that belong to its left child subtree. The rest belong to its right child subtree.
///
/// Concretely, this function returns the largest power-of-two number of bytes that's strictly less
/// than `input_len`. This leads to a tree where all left subtrees are "complete" and at least as
/// large as their sibling right subtrees, as specified in section 2.1 of [the BLAKE3
/// paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). For example, if an
/// input is exactly two chunks, its left and right subtrees both get one chunk. But if an input is
/// two chunks plus one more byte, then its left subtree gets two chunks, and its right subtree
/// only gets one byte.
///
/// This function isn't meaningful for one chunk of input, because chunks don't have children. It
/// currently panics in debug mode if `input_len <= CHUNK_LEN`.
///
/// # Example
///
/// Hash an input of random length as two subtrees:
///
/// ```
/// # #[cfg(feature = "std")] {
/// use blake3::hazmat::{left_subtree_len, merge_subtrees_root, HasherExt, Mode};
/// use blake3::{Hasher, CHUNK_LEN};
///
/// // Generate a random-length input. Note that to be split into two subtrees, the input length
/// // must be greater than CHUNK_LEN.
/// let input_len = rand::random_range(CHUNK_LEN + 1..1_000_000);
/// let mut input = vec![0; input_len];
/// rand::fill(&mut input[..]);
///
/// // Compute the left and right subtree hashes and then the root hash. left_subtree_len() tells
/// // us exactly where to split the input. Any other split would either panic (if we're lucky) or
/// // lead to an incorrect root hash.
/// let left_len = left_subtree_len(input_len as u64) as usize;
/// let left_subtree_cv = Hasher::new()
///     .update(&input[..left_len])
///     .finalize_non_root();
/// let right_subtree_cv = Hasher::new()
///     .set_input_offset(left_len as u64)
///     .update(&input[left_len..])
///     .finalize_non_root();
/// let root_hash = merge_subtrees_root(&left_subtree_cv, &right_subtree_cv, Mode::Hash);
///
/// // Double check the answer.
/// assert_eq!(root_hash, blake3::hash(&input));
/// # }
/// ```
#[inline(always)]
pub fn left_subtree_len(input_len: u64) -> u64 {
    debug_assert!(input_len > CHUNK_LEN as u64);
    // Round half of the input length up without computing `input_len + 1`, which would overflow
    // for `u64::MAX`.
    let half_rounded_up = input_len / 2 + input_len % 2;
    // Note that .next_power_of_two() is greater than *or equal*.
    half_rounded_up.next_power_of_two()
}

#[test]
fn test_left_subtree_len() {
    assert_eq!(left_subtree_len(1025), 1024);
    for boundary_case in [2, 4, 8, 16, 32, 64] {
        let input_len = boundary_case * CHUNK_LEN as u64;
        assert_eq!(left_subtree_len(input_len - 1), input_len / 2);
        assert_eq!(left_subtree_len(input_len), input_len / 2);
        assert_eq!(left_subtree_len(input_len + 1), input_len);
    }
    // The largest possible input length must not overflow while rounding up.
    assert_eq!(left_subtree_len(u64::MAX), 1 << 63);
}

/// The `mode` argument to [`merge_subtrees_root`] and friends
///
/// See the [module level examples](index.html#examples).
#[derive(Copy, Clone, Debug)] pub enum Mode<'a> { /// Corresponding to [`hash`](crate::hash) Hash, /// Corresponding to [`keyed_hash`](crate::hash) KeyedHash(&'a [u8; KEY_LEN]), /// Corresponding to [`derive_key`](crate::hash) /// /// The [`ContextKey`] comes from [`hash_derive_key_context`]. DeriveKeyMaterial(&'a ContextKey), } impl<'a> Mode<'a> { fn key_words(&self) -> CVWords { match self { Mode::Hash => *IV, Mode::KeyedHash(key) => crate::platform::words_from_le_bytes_32(key), Mode::DeriveKeyMaterial(cx_key) => crate::platform::words_from_le_bytes_32(cx_key), } } fn flags_byte(&self) -> u8 { match self { Mode::Hash => 0, Mode::KeyedHash(_) => crate::KEYED_HASH, Mode::DeriveKeyMaterial(_) => crate::DERIVE_KEY_MATERIAL, } } } /// "Chaining value" is the academic term for a non-root or non-final hash. /// /// Besides just sounding fancy, it turns out there are [security /// reasons](https://jacko.io/tree_hashing.html) to be careful about the difference between /// (root/final) hashes and (non-root/non-final) chaining values. pub type ChainingValue = [u8; OUT_LEN]; fn merge_subtrees_inner( left_child: &ChainingValue, right_child: &ChainingValue, mode: Mode, ) -> crate::Output { crate::parent_node_output( &left_child, &right_child, &mode.key_words(), mode.flags_byte(), Platform::detect(), ) } /// Compute a non-root parent node chaining value from two child chaining values. /// /// See the [module level examples](index.html#examples), particularly the discussion of valid tree /// structures. The left and right child chaining values can come from either /// [`Hasher::finalize_non_root`](HasherExt::finalize_non_root) or other calls to /// `merge_subtrees_non_root`. "Chaining value" is the academic term for a non-root or non-final /// hash. 
pub fn merge_subtrees_non_root(
    left_child: &ChainingValue,
    right_child: &ChainingValue,
    mode: Mode,
) -> ChainingValue {
    // Build the parent node and take its (non-root) chaining value.
    merge_subtrees_inner(left_child, right_child, mode).chaining_value()
}

/// Compute a root hash from two child chaining values.
///
/// See the [module level examples](index.html#examples), particularly the discussion of valid tree
/// structures. The left and right child chaining values can come from either
/// [`Hasher::finalize_non_root`](HasherExt::finalize_non_root) or [`merge_subtrees_non_root`].
/// "Chaining value" is the academic term for a non-root or non-final hash.
///
/// Note that inputs of [`CHUNK_LEN`] or less don't produce any parent nodes and can't be hashed
/// using this function. In that case you must get the root hash from [`Hasher::finalize`] (or just
/// [`blake3::hash`](crate::hash)).
pub fn merge_subtrees_root(
    left_child: &ChainingValue,
    right_child: &ChainingValue,
    mode: Mode,
) -> crate::Hash {
    // Build the parent node and finalize it as the root.
    merge_subtrees_inner(left_child, right_child, mode).root_hash()
}

/// Build a root [`OutputReader`](crate::OutputReader) from two child chaining values.
///
/// See also the [module level examples](index.html#examples), particularly the discussion of valid
/// tree structures. The left and right child chaining values can come from either
/// [`Hasher::finalize_non_root`](HasherExt::finalize_non_root) or [`merge_subtrees_non_root`].
/// "Chaining value" is the academic term for a non-root or non-final hash.
///
/// Note that inputs of [`CHUNK_LEN`] or less don't produce any parent nodes and can't be hashed
/// using this function. In that case you must get the `OutputReader` from
/// [`Hasher::finalize_xof`].
///
/// # Example
///
/// ```
/// use blake3::hazmat::{merge_subtrees_root_xof, HasherExt, Mode};
/// use blake3::{Hasher, CHUNK_LEN};
///
/// // Hash a 2-chunk subtree in steps. Note that only
/// // the final chunk can be shorter than CHUNK_LEN.
/// let chunk0 = &[42; CHUNK_LEN];
/// let chunk1 = b"hello world";
/// let chunk0_cv = Hasher::new()
///     .update(chunk0)
///     .finalize_non_root();
/// let chunk1_cv = Hasher::new()
///     .set_input_offset(CHUNK_LEN as u64)
///     .update(chunk1)
///     .finalize_non_root();
///
/// // Obtain a blake3::OutputReader at the root and extract 1000 bytes.
/// let mut output_reader = merge_subtrees_root_xof(&chunk0_cv, &chunk1_cv, Mode::Hash);
/// let mut output_bytes = [0; 1_000];
/// output_reader.fill(&mut output_bytes);
///
/// // Double check the answer.
/// let mut hasher = Hasher::new();
/// hasher.update(chunk0);
/// hasher.update(chunk1);
/// let mut expected = [0; 1_000];
/// hasher.finalize_xof().fill(&mut expected);
/// assert_eq!(output_bytes, expected);
/// ```
pub fn merge_subtrees_root_xof(
    left_child: &ChainingValue,
    right_child: &ChainingValue,
    mode: Mode,
) -> crate::OutputReader {
    // Wrap the parent node output in a reader instead of extracting a fixed-size hash.
    crate::OutputReader::new(merge_subtrees_inner(left_child, right_child, mode))
}

/// An alias to distinguish [`hash_derive_key_context`] outputs from other keys.
pub type ContextKey = [u8; KEY_LEN];

/// Hash a [`derive_key`](crate::derive_key) context string and return a [`ContextKey`].
///
/// The _only_ valid uses for the returned [`ContextKey`] are [`Hasher::new_from_context_key`] and
/// [`Mode::DeriveKeyMaterial`] (together with the merge subtree functions).
/// /// # Example /// /// ``` /// use blake3::Hasher; /// use blake3::hazmat::HasherExt; /// /// let context_key = blake3::hazmat::hash_derive_key_context("foo"); /// let mut hasher = Hasher::new_from_context_key(&context_key); /// hasher.update(b"bar"); /// let derived_key = *hasher.finalize().as_bytes(); /// /// assert_eq!(derived_key, blake3::derive_key("foo", b"bar")); /// ``` pub fn hash_derive_key_context(context: &str) -> ContextKey { crate::hash_all_at_once::( context.as_bytes(), IV, crate::DERIVE_KEY_CONTEXT, ) .root_hash() .0 } #[cfg(test)] mod test { use super::*; #[test] #[should_panic] fn test_empty_subtree_should_panic() { Hasher::new().finalize_non_root(); } #[test] #[should_panic] fn test_unaligned_offset_should_panic() { Hasher::new().set_input_offset(1); } #[test] #[should_panic] fn test_hasher_already_accepted_input_should_panic() { Hasher::new().update(b"x").set_input_offset(0); } #[test] #[should_panic] fn test_too_much_input_should_panic() { Hasher::new() .set_input_offset(CHUNK_LEN as u64) .update(&[0; CHUNK_LEN + 1]); } #[test] #[should_panic] fn test_set_input_offset_cant_finalize() { Hasher::new().set_input_offset(CHUNK_LEN as u64).finalize(); } #[test] #[should_panic] fn test_set_input_offset_cant_finalize_xof() { Hasher::new() .set_input_offset(CHUNK_LEN as u64) .finalize_xof(); } #[test] fn test_grouped_hash() { const MAX_CHUNKS: usize = (crate::test::TEST_CASES_MAX + 1) / CHUNK_LEN; let mut input_buf = [0; crate::test::TEST_CASES_MAX]; crate::test::paint_test_input(&mut input_buf); for subtree_chunks in [1, 2, 4, 8, 16, 32] { #[cfg(feature = "std")] dbg!(subtree_chunks); let subtree_len = subtree_chunks * CHUNK_LEN; for &case in crate::test::TEST_CASES { if case <= subtree_len { continue; } #[cfg(feature = "std")] dbg!(case); let input = &input_buf[..case]; let expected_hash = crate::hash(input); // Collect all the group chaining values. 
let mut chaining_values = arrayvec::ArrayVec::::new(); let mut subtree_offset = 0; while subtree_offset < input.len() { let take = core::cmp::min(subtree_len, input.len() - subtree_offset); let subtree_input = &input[subtree_offset..][..take]; let subtree_cv = Hasher::new() .set_input_offset(subtree_offset as u64) .update(subtree_input) .finalize_non_root(); chaining_values.push(subtree_cv); subtree_offset += take; } // Compress all the chaining_values together, layer by layer. assert!(chaining_values.len() >= 2); while chaining_values.len() > 2 { let n = chaining_values.len(); // Merge each side-by-side pair in place, overwriting the front half of the // array with the merged results. This moves us "up one level" in the tree. for i in 0..(n / 2) { chaining_values[i] = merge_subtrees_non_root( &chaining_values[2 * i], &chaining_values[2 * i + 1], Mode::Hash, ); } // If there's an odd CV out, it moves up. if n % 2 == 1 { chaining_values[n / 2] = chaining_values[n - 1]; } chaining_values.truncate(n / 2 + n % 2); } assert_eq!(chaining_values.len(), 2); let root_hash = merge_subtrees_root(&chaining_values[0], &chaining_values[1], Mode::Hash); assert_eq!(expected_hash, root_hash); } } } #[test] fn test_keyed_hash_xof() { let group0 = &[42; 4096]; let group1 = &[43; 4095]; let mut input = [0; 8191]; input[..4096].copy_from_slice(group0); input[4096..].copy_from_slice(group1); let key = &[44; 32]; let mut expected_output = [0; 100]; Hasher::new_keyed(&key) .update(&input) .finalize_xof() .fill(&mut expected_output); let mut hazmat_output = [0; 100]; let left = Hasher::new_keyed(key).update(group0).finalize_non_root(); let right = Hasher::new_keyed(key) .set_input_offset(group0.len() as u64) .update(group1) .finalize_non_root(); merge_subtrees_root_xof(&left, &right, Mode::KeyedHash(&key)).fill(&mut hazmat_output); assert_eq!(expected_output, hazmat_output); } #[test] fn test_derive_key() { let context = "foo"; let mut input = [0; 1025]; crate::test::paint_test_input(&mut 
input);
        let expected = crate::derive_key(context, &input);

        let cx_key = hash_derive_key_context(context);
        let left = Hasher::new_from_context_key(&cx_key)
            .update(&input[..1024])
            .finalize_non_root();
        let right = Hasher::new_from_context_key(&cx_key)
            .set_input_offset(1024)
            .update(&input[1024..])
            .finalize_non_root();
        let derived_key = merge_subtrees_root(&left, &right, Mode::DeriveKeyMaterial(&cx_key)).0;
        assert_eq!(expected, derived_key);
    }
}

================================================
FILE: third-party/blake3/src/io.rs
================================================
//! Helper functions for efficient IO.

// Read `reader` to the end through a 64 KiB stack buffer, feeding every byte to `hasher`, and
// return the total number of bytes read. Reads that fail with `ErrorKind::Interrupted` are
// retried; any other error is returned to the caller.
#[cfg(feature = "std")]
pub(crate) fn copy_wide(
    mut reader: impl std::io::Read,
    hasher: &mut crate::Hasher,
) -> std::io::Result<u64> {
    let mut buffer = [0; 65536];
    let mut total = 0;
    loop {
        match reader.read(&mut buffer) {
            // A zero-length read means end of input.
            Ok(0) => return Ok(total),
            Ok(n) => {
                hasher.update(&buffer[..n]);
                total += n as u64;
            }
            // see test_update_reader_interrupted
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
}

// Mmap a file, if it looks like a good idea. Return None in cases where we know mmap will fail, or
// if the file is short enough that mmapping isn't worth it. However, if we do try to mmap and it
// fails, return the error.
//
// SAFETY: Mmaps are fundamentally unsafe, because you can call invariant-checking functions like
// str::from_utf8 on them and then have them change out from under you. Letting a safe caller get
// their hands on an mmap, or even a &[u8] that's backed by an mmap, is unsound. However, because
// this function is crate-private, we can guarantee that all that can ever happen in the event of a
// race condition is that we either hash nonsense bytes or crash with SIGBUS or similar, neither of
// which should risk memory corruption in a safe caller.
//
// PARANOIA: But a data race...is a data race...is a data race...right?
Even if we know that no
// platform in the "real world" is ever going to do anything other than compute the "wrong answer"
// if we race on this mmap while we hash it, aren't we still supposed to feel bad about doing this?
// Well, maybe. This is IO, and IO gets special carve-outs in the memory model. Consider a
// memory-mapped register that returns random 32-bit words. (This is actually realistic if you have
// a hardware RNG.) It's probably sound to construct a *const i32 pointing to that register and do
// some raw pointer reads from it. Those reads should be volatile if you don't want the compiler to
// coalesce them, but either way the compiler isn't allowed to just _go nuts_ and insert
// should-never-happen branches to wipe your hard drive if two adjacent reads happen to give
// different values. As far as I'm aware, there's no such thing as a read that's allowed if it's
// volatile but prohibited if it's not (unlike atomics). As mentioned above, it's not ok to
// construct a safe &i32 to the register if you're going to leak that reference to unknown callers.
// But if you "know what you're doing," I don't think *const i32 and &i32 are fundamentally
// different here. Feedback needed.
#[cfg(feature = "mmap")]
pub(crate) fn maybe_mmap_file(file: &std::fs::File) -> std::io::Result<Option<memmap2::Mmap>> {
    let metadata = file.metadata()?;
    let file_size = metadata.len();
    if !metadata.is_file() {
        // Not a real file.
        Ok(None)
    } else if file_size < 16 * 1024 {
        // Mapping small files is not worth it, and some special files that can't be mapped report
        // a size of zero.
        Ok(None)
    } else {
        // A failed mapping is returned as an error rather than falling back to `None`.
        let map = unsafe { memmap2::Mmap::map(file)? };
        Ok(Some(map))
    }
}

================================================
FILE: third-party/blake3/src/join.rs
================================================
//! The multi-threading abstractions used by `Hasher::update_with_join`.
//!
//! Different implementations of the `Join` trait determine whether
//!
`Hasher::update_with_join` performs multi-threading on sufficiently large //! inputs. The `SerialJoin` implementation is single-threaded, and the //! `RayonJoin` implementation (gated by the `rayon` feature) is multi-threaded. //! Interfaces other than `Hasher::update_with_join`, like [`hash`](crate::hash) //! and [`Hasher::update`](crate::Hasher::update), always use `SerialJoin` //! internally. //! //! The `Join` trait is an almost exact copy of the [`rayon::join`] API, and //! `RayonJoin` is the only non-trivial implementation. Previously this trait //! was public, but currently it's been re-privatized, as it's both 1) of no //! value to most callers and 2) a pretty big implementation detail to commit //! to. //! //! [`rayon::join`]: https://docs.rs/rayon/1.3.0/rayon/fn.join.html /// The trait that abstracts over single-threaded and multi-threaded recursion. /// /// See the [`join` module docs](index.html) for more details. pub trait Join { fn join(oper_a: A, oper_b: B) -> (RA, RB) where A: FnOnce() -> RA + Send, B: FnOnce() -> RB + Send, RA: Send, RB: Send; } /// The trivial, serial implementation of `Join`. The left and right sides are /// executed one after the other, on the calling thread. The standalone hashing /// functions and the `Hasher::update` method use this implementation /// internally. /// /// See the [`join` module docs](index.html) for more details. pub enum SerialJoin {} impl Join for SerialJoin { #[inline] fn join(oper_a: A, oper_b: B) -> (RA, RB) where A: FnOnce() -> RA + Send, B: FnOnce() -> RB + Send, RA: Send, RB: Send, { (oper_a(), oper_b()) } } /// The Rayon-based implementation of `Join`. The left and right sides are /// executed on the Rayon thread pool, potentially in parallel. This /// implementation is gated by the `rayon` feature, which is off by default. /// /// See the [`join` module docs](index.html) for more details. 
#[cfg(feature = "rayon")]
pub enum RayonJoin {}

#[cfg(feature = "rayon")]
impl Join for RayonJoin {
    #[inline]
    fn join<A, B, RA, RB>(oper_a: A, oper_b: B) -> (RA, RB)
    where
        A: FnOnce() -> RA + Send,
        B: FnOnce() -> RB + Send,
        RA: Send,
        RB: Send,
    {
        // Delegate directly to Rayon's join.
        rayon_core::join(oper_a, oper_b)
    }
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_serial_join() {
        let oper_a = || 1 + 1;
        let oper_b = || 2 + 2;
        assert_eq!((2, 4), SerialJoin::join(oper_a, oper_b));
    }

    #[test]
    #[cfg(feature = "rayon")]
    fn test_rayon_join() {
        let oper_a = || 1 + 1;
        let oper_b = || 2 + 2;
        assert_eq!((2, 4), RayonJoin::join(oper_a, oper_b));
    }
}

================================================
FILE: third-party/blake3/src/lib.rs
================================================
//! The official Rust implementation of the [BLAKE3] cryptographic hash
//! function.
//!
//! # Examples
//!
//! ```
//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
//! // Hash an input all at once.
//! let hash1 = blake3::hash(b"foobarbaz");
//!
//! // Hash an input incrementally.
//! let mut hasher = blake3::Hasher::new();
//! hasher.update(b"foo");
//! hasher.update(b"bar");
//! hasher.update(b"baz");
//! let hash2 = hasher.finalize();
//! assert_eq!(hash1, hash2);
//!
//! // Extended output. OutputReader also implements Read and Seek.
//! # #[cfg(feature = "std")] {
//! let mut output = [0; 1000];
//! let mut output_reader = hasher.finalize_xof();
//! output_reader.fill(&mut output);
//! assert_eq!(hash1, output[..32]);
//! # }
//!
//! // Print a hash as hex.
//! println!("{}", hash1);
//! # Ok(())
//! # }
//! ```
//!
//! # Cargo Features
//!
//! The `std` feature (the only feature enabled by default) is required for
//! implementations of the [`Write`] and [`Seek`] traits, the
//! [`update_reader`](Hasher::update_reader) helper method, and runtime CPU
//! feature detection on x86. If this feature is disabled, the only way to use
//! the x86 SIMD implementations is to enable the corresponding instruction sets
//! globally, with e.g.
`RUSTFLAGS="-C target-cpu=native"`. The resulting binary //! will not be portable to other machines. //! //! The `rayon` feature (disabled by default, but enabled for [docs.rs]) adds //! the [`update_rayon`](Hasher::update_rayon) and (in combination with `mmap` //! below) [`update_mmap_rayon`](Hasher::update_mmap_rayon) methods, for //! multithreaded hashing. However, even if this feature is enabled, all other //! APIs remain single-threaded. //! //! The `mmap` feature (disabled by default, but enabled for [docs.rs]) adds the //! [`update_mmap`](Hasher::update_mmap) and (in combination with `rayon` above) //! [`update_mmap_rayon`](Hasher::update_mmap_rayon) helper methods for //! memory-mapped IO. //! //! The `zeroize` feature (disabled by default, but enabled for [docs.rs]) //! implements //! [`Zeroize`](https://docs.rs/zeroize/latest/zeroize/trait.Zeroize.html) for //! this crate's types. //! //! The `serde` feature (disabled by default, but enabled for [docs.rs]) implements //! [`serde::Serialize`](https://docs.rs/serde/latest/serde/trait.Serialize.html) and //! [`serde::Deserialize`](https://docs.rs/serde/latest/serde/trait.Deserialize.html) //! for [`Hash`](struct@Hash). //! //! The NEON implementation is enabled by default for AArch64 but requires the //! `neon` feature for other ARM targets. Not all ARMv7 CPUs support NEON, and //! enabling this feature will produce a binary that's not portable to CPUs //! without NEON support. //! //! The `wasm32_simd` feature enables the WASM SIMD implementation for all `wasm32-` //! targets. Similar to the `neon` feature, if `wasm32_simd` is enabled, WASM SIMD //! support is assumed. This may become the default in the future. //! //! The `traits-preview` feature enables implementations of traits from the //! RustCrypto [`digest`] crate, and re-exports that crate as `traits::digest`. //! However, the traits aren't stable, and they're expected to change in //! incompatible ways before that crate reaches 1.0. 
For that reason, this crate //! makes no SemVer guarantees for this feature, and callers who use it should //! expect breaking changes between patch versions. (The "-preview" feature name //! follows the conventions of the RustCrypto [`signature`] crate.) //! //! [`Hasher::update_rayon`]: struct.Hasher.html#method.update_rayon //! [BLAKE3]: https://blake3.io //! [Rayon]: https://github.com/rayon-rs/rayon //! [docs.rs]: https://docs.rs/ //! [`Write`]: https://doc.rust-lang.org/std/io/trait.Write.html //! [`Seek`]: https://doc.rust-lang.org/std/io/trait.Seek.html //! [`digest`]: https://crates.io/crates/digest //! [`signature`]: https://crates.io/crates/signature #![cfg_attr(not(feature = "std"), no_std)] #[cfg(test)] mod test; #[doc(hidden)] #[deprecated(since = "1.8.0", note = "use the hazmat module instead")] pub mod guts; pub mod hazmat; /// Undocumented and unstable, for benchmarks only. #[doc(hidden)] pub mod platform; // Platform-specific implementations of the compression function. These // BLAKE3-specific cfg flags are set in build.rs. #[cfg(blake3_avx2_rust)] #[path = "rust_avx2.rs"] mod avx2; #[cfg(blake3_avx2_ffi)] #[path = "ffi_avx2.rs"] mod avx2; #[cfg(blake3_avx512_ffi)] #[path = "ffi_avx512.rs"] mod avx512; #[cfg(blake3_neon)] #[path = "ffi_neon.rs"] mod neon; mod portable; #[cfg(blake3_sse2_rust)] #[path = "rust_sse2.rs"] mod sse2; #[cfg(blake3_sse2_ffi)] #[path = "ffi_sse2.rs"] mod sse2; #[cfg(blake3_sse41_rust)] #[path = "rust_sse41.rs"] mod sse41; #[cfg(blake3_sse41_ffi)] #[path = "ffi_sse41.rs"] mod sse41; #[cfg(blake3_wasm32_simd)] #[path = "wasm32_simd.rs"] mod wasm32_simd; #[cfg(feature = "traits-preview")] pub mod traits; mod io; mod join; use arrayref::{array_mut_ref, array_ref}; use arrayvec::{ArrayString, ArrayVec}; use core::cmp; use core::fmt; use platform::{Platform, MAX_SIMD_DEGREE, MAX_SIMD_DEGREE_OR_2}; #[cfg(feature = "zeroize")] use zeroize::Zeroize; /// The number of bytes in a [`Hash`](struct.Hash.html), 32. 
pub const OUT_LEN: usize = 32; /// The number of bytes in a key, 32. pub const KEY_LEN: usize = 32; /// The number of bytes in a block, 64. /// /// You don't usually need to think about this number. One case where it matters is calling /// [`OutputReader::fill`] in a loop, where using a `buf` argument that's a multiple of `BLOCK_LEN` /// avoids repeating work. pub const BLOCK_LEN: usize = 64; /// The number of bytes in a chunk, 1024. /// /// You don't usually need to think about this number, but it often comes up in benchmarks, because /// the maximum degree of parallelism used by the implementation equals the number of chunks. pub const CHUNK_LEN: usize = 1024; const MAX_DEPTH: usize = 54; // 2^54 * CHUNK_LEN = 2^64 // While iterating the compression function within a chunk, the CV is // represented as words, to avoid doing two extra endianness conversions for // each compression in the portable implementation. But the hash_many interface // needs to hash both input bytes and parent nodes, so it's better for its // output CVs to be represented as bytes. type CVWords = [u32; 8]; type CVBytes = [u8; 32]; // little-endian const IV: &CVWords = &[ 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, ]; const MSG_SCHEDULE: [[usize; 16]; 7] = [ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8], [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1], [10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6], [12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4], [9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7], [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13], ]; // These are the internal flags that we use to domain separate root/non-root, // chunk/parent, and chunk beginning/middle/end. These get set at the high end // of the block flags word in the compression function, so their values start // high and go down.
const CHUNK_START: u8 = 1 << 0; const CHUNK_END: u8 = 1 << 1; const PARENT: u8 = 1 << 2; const ROOT: u8 = 1 << 3; const KEYED_HASH: u8 = 1 << 4; const DERIVE_KEY_CONTEXT: u8 = 1 << 5; const DERIVE_KEY_MATERIAL: u8 = 1 << 6; #[inline] fn counter_low(counter: u64) -> u32 { counter as u32 } #[inline] fn counter_high(counter: u64) -> u32 { (counter >> 32) as u32 } /// An output of the default size, 32 bytes, which provides constant-time /// equality checking. /// /// `Hash` implements [`From`] and [`Into`] for `[u8; 32]`, and it provides /// [`from_bytes`] and [`as_bytes`] for explicit conversions between itself and /// `[u8; 32]`. However, byte arrays and slices don't provide constant-time /// equality checking, which is often a security requirement in software that /// handles private data. `Hash` doesn't implement [`Deref`] or [`AsRef`], to /// avoid situations where a type conversion happens implicitly and the /// constant-time property is accidentally lost. /// /// `Hash` provides the [`to_hex`] and [`from_hex`] methods for converting to /// and from hexadecimal. It also implements [`Display`] and [`FromStr`]. /// /// [`From`]: https://doc.rust-lang.org/std/convert/trait.From.html /// [`Into`]: https://doc.rust-lang.org/std/convert/trait.Into.html /// [`as_bytes`]: #method.as_bytes /// [`from_bytes`]: #method.from_bytes /// [`Deref`]: https://doc.rust-lang.org/stable/std/ops/trait.Deref.html /// [`AsRef`]: https://doc.rust-lang.org/std/convert/trait.AsRef.html /// [`to_hex`]: #method.to_hex /// [`from_hex`]: #method.from_hex /// [`Display`]: https://doc.rust-lang.org/std/fmt/trait.Display.html /// [`FromStr`]: https://doc.rust-lang.org/std/str/trait.FromStr.html #[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))] #[derive(Clone, Copy, Hash, Eq)] pub struct Hash([u8; OUT_LEN]); impl Hash { /// The raw bytes of the `Hash`. 
Note that byte arrays don't provide /// constant-time equality checking, so if you need to compare hashes, /// prefer the `Hash` type. #[inline] pub const fn as_bytes(&self) -> &[u8; OUT_LEN] { &self.0 } /// Create a `Hash` from its raw bytes representation. pub const fn from_bytes(bytes: [u8; OUT_LEN]) -> Self { Self(bytes) } /// Create a `Hash` from its raw bytes representation as a slice. /// /// Returns an error if the slice is not exactly 32 bytes long. pub fn from_slice(bytes: &[u8]) -> Result { Ok(Self::from_bytes(bytes.try_into()?)) } /// Encode a `Hash` in lowercase hexadecimal. /// /// The returned [`ArrayString`] is a fixed size and doesn't allocate memory /// on the heap. Note that [`ArrayString`] doesn't provide constant-time /// equality checking, so if you need to compare hashes, prefer the `Hash` /// type. /// /// [`ArrayString`]: https://docs.rs/arrayvec/0.5.1/arrayvec/struct.ArrayString.html pub fn to_hex(&self) -> ArrayString<{ 2 * OUT_LEN }> { let mut s = ArrayString::new(); let table = b"0123456789abcdef"; for &b in self.0.iter() { s.push(table[(b >> 4) as usize] as char); s.push(table[(b & 0xf) as usize] as char); } s } /// Decode a `Hash` from hexadecimal. Both uppercase and lowercase ASCII /// bytes are supported. /// /// Any byte outside the ranges `'0'...'9'`, `'a'...'f'`, and `'A'...'F'` /// results in an error. An input length other than 64 also results in an /// error. /// /// Note that `Hash` also implements `FromStr`, so `Hash::from_hex("...")` /// is equivalent to `"...".parse()`. 
pub fn from_hex(hex: impl AsRef<[u8]>) -> Result { fn hex_val(byte: u8) -> Result { match byte { b'A'..=b'F' => Ok(byte - b'A' + 10), b'a'..=b'f' => Ok(byte - b'a' + 10), b'0'..=b'9' => Ok(byte - b'0'), _ => Err(HexError(HexErrorInner::InvalidByte(byte))), } } let hex_bytes: &[u8] = hex.as_ref(); if hex_bytes.len() != OUT_LEN * 2 { return Err(HexError(HexErrorInner::InvalidLen(hex_bytes.len()))); } let mut hash_bytes: [u8; OUT_LEN] = [0; OUT_LEN]; for i in 0..OUT_LEN { hash_bytes[i] = 16 * hex_val(hex_bytes[2 * i])? + hex_val(hex_bytes[2 * i + 1])?; } Ok(Hash::from(hash_bytes)) } } impl From<[u8; OUT_LEN]> for Hash { #[inline] fn from(bytes: [u8; OUT_LEN]) -> Self { Self::from_bytes(bytes) } } impl From for [u8; OUT_LEN] { #[inline] fn from(hash: Hash) -> Self { hash.0 } } impl core::str::FromStr for Hash { type Err = HexError; fn from_str(s: &str) -> Result { Hash::from_hex(s) } } #[cfg(feature = "zeroize")] impl Zeroize for Hash { fn zeroize(&mut self) { // Destructuring to trigger compile error as a reminder to update this impl. let Self(bytes) = self; bytes.zeroize(); } } /// This implementation is constant-time. impl PartialEq for Hash { #[inline] fn eq(&self, other: &Hash) -> bool { constant_time_eq::constant_time_eq_32(&self.0, &other.0) } } /// This implementation is constant-time. impl PartialEq<[u8; OUT_LEN]> for Hash { #[inline] fn eq(&self, other: &[u8; OUT_LEN]) -> bool { constant_time_eq::constant_time_eq_32(&self.0, other) } } /// This implementation is constant-time if the target is 32 bytes long. impl PartialEq<[u8]> for Hash { #[inline] fn eq(&self, other: &[u8]) -> bool { constant_time_eq::constant_time_eq(&self.0, other) } } impl fmt::Display for Hash { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // Formatting field as `&str` to reduce code size since the `Debug` // dynamic dispatch table for `&str` is likely needed elsewhere already, // but that for `ArrayString<[u8; 64]>` is not. 
let hex = self.to_hex(); let hex: &str = hex.as_str(); f.write_str(hex) } } impl fmt::Debug for Hash { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // Formatting field as `&str` to reduce code size since the `Debug` // dynamic dispatch table for `&str` is likely needed elsewhere already, // but that for `ArrayString<[u8; 64]>` is not. let hex = self.to_hex(); let hex: &str = hex.as_str(); f.debug_tuple("Hash").field(&hex).finish() } } /// The error type for [`Hash::from_hex`]. /// /// The `.to_string()` representation of this error currently distinguishes between bad length /// errors and bad character errors. This is to help with logging and debugging, but it isn't a /// stable API detail, and it may change at any time. #[derive(Clone, Debug)] pub struct HexError(HexErrorInner); #[derive(Clone, Debug)] enum HexErrorInner { InvalidByte(u8), InvalidLen(usize), } impl fmt::Display for HexError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self.0 { HexErrorInner::InvalidByte(byte) => { if byte < 128 { write!(f, "invalid hex character: {:?}", byte as char) } else { write!(f, "invalid hex character: 0x{:x}", byte) } } HexErrorInner::InvalidLen(len) => { write!(f, "expected 64 hex bytes, received {}", len) } } } } #[cfg(feature = "std")] impl std::error::Error for HexError {} // Each chunk or parent node can produce either a 32-byte chaining value or, by // setting the ROOT flag, any number of final output bytes. The Output struct // captures the state just prior to choosing between those two possibilities. 
#[derive(Clone)] struct Output { input_chaining_value: CVWords, block: [u8; 64], block_len: u8, counter: u64, flags: u8, platform: Platform, } impl Output { fn chaining_value(&self) -> CVBytes { let mut cv = self.input_chaining_value; self.platform.compress_in_place( &mut cv, &self.block, self.block_len, self.counter, self.flags, ); platform::le_bytes_from_words_32(&cv) } fn root_hash(&self) -> Hash { debug_assert_eq!(self.counter, 0); let mut cv = self.input_chaining_value; self.platform .compress_in_place(&mut cv, &self.block, self.block_len, 0, self.flags | ROOT); Hash(platform::le_bytes_from_words_32(&cv)) } fn root_output_block(&self) -> [u8; 2 * OUT_LEN] { self.platform.compress_xof( &self.input_chaining_value, &self.block, self.block_len, self.counter, self.flags | ROOT, ) } } #[cfg(feature = "zeroize")] impl Zeroize for Output { fn zeroize(&mut self) { // Destructuring to trigger compile error as a reminder to update this impl. let Self { input_chaining_value, block, block_len, counter, flags, platform: _, } = self; input_chaining_value.zeroize(); block.zeroize(); block_len.zeroize(); counter.zeroize(); flags.zeroize(); } } #[derive(Clone)] struct ChunkState { cv: CVWords, chunk_counter: u64, buf: [u8; BLOCK_LEN], buf_len: u8, blocks_compressed: u8, flags: u8, platform: Platform, } impl ChunkState { fn new(key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform) -> Self { Self { cv: *key, chunk_counter, buf: [0; BLOCK_LEN], buf_len: 0, blocks_compressed: 0, flags, platform, } } fn count(&self) -> usize { BLOCK_LEN * self.blocks_compressed as usize + self.buf_len as usize } fn fill_buf(&mut self, input: &mut &[u8]) { let want = BLOCK_LEN - self.buf_len as usize; let take = cmp::min(want, input.len()); self.buf[self.buf_len as usize..][..take].copy_from_slice(&input[..take]); self.buf_len += take as u8; *input = &input[take..]; } fn start_flag(&self) -> u8 { if self.blocks_compressed == 0 { CHUNK_START } else { 0 } } // Try to avoid buffering as much 
as possible, by compressing directly from // the input slice when full blocks are available. fn update(&mut self, mut input: &[u8]) -> &mut Self { if self.buf_len > 0 { self.fill_buf(&mut input); if !input.is_empty() { debug_assert_eq!(self.buf_len as usize, BLOCK_LEN); let block_flags = self.flags | self.start_flag(); // borrowck self.platform.compress_in_place( &mut self.cv, &self.buf, BLOCK_LEN as u8, self.chunk_counter, block_flags, ); self.buf_len = 0; self.buf = [0; BLOCK_LEN]; self.blocks_compressed += 1; } } while input.len() > BLOCK_LEN { debug_assert_eq!(self.buf_len, 0); let block_flags = self.flags | self.start_flag(); // borrowck self.platform.compress_in_place( &mut self.cv, array_ref!(input, 0, BLOCK_LEN), BLOCK_LEN as u8, self.chunk_counter, block_flags, ); self.blocks_compressed += 1; input = &input[BLOCK_LEN..]; } self.fill_buf(&mut input); debug_assert!(input.is_empty()); debug_assert!(self.count() <= CHUNK_LEN); self } fn output(&self) -> Output { let block_flags = self.flags | self.start_flag() | CHUNK_END; Output { input_chaining_value: self.cv, block: self.buf, block_len: self.buf_len, counter: self.chunk_counter, flags: block_flags, platform: self.platform, } } } // Don't derive(Debug), because the state may be secret. impl fmt::Debug for ChunkState { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.debug_struct("ChunkState") .field("count", &self.count()) .field("chunk_counter", &self.chunk_counter) .field("flags", &self.flags) .field("platform", &self.platform) .finish() } } #[cfg(feature = "zeroize")] impl Zeroize for ChunkState { fn zeroize(&mut self) { // Destructuring to trigger compile error as a reminder to update this impl. 
let Self { cv, chunk_counter, buf, buf_len, blocks_compressed, flags, platform: _, } = self; cv.zeroize(); chunk_counter.zeroize(); buf.zeroize(); buf_len.zeroize(); blocks_compressed.zeroize(); flags.zeroize(); } } // IMPLEMENTATION NOTE // =================== // The recursive function compress_subtree_wide(), implemented below, is the // basis of high-performance BLAKE3. We use it both for all-at-once hashing, // and for the incremental input with Hasher (though we have to be careful with // subtree boundaries in the incremental case). compress_subtree_wide() applies // several optimizations at the same time: // - Multithreading with Rayon. // - Parallel chunk hashing with SIMD. // - Parallel parent hashing with SIMD. Note that while SIMD chunk hashing // maxes out at MAX_SIMD_DEGREE*CHUNK_LEN, parallel parent hashing continues // to benefit from larger inputs, because more levels of the tree benefit can // use full-width SIMD vectors for parent hashing. Without parallel parent // hashing, we lose about 10% of overall throughput on AVX2 and AVX-512. /// Undocumented and unstable, for benchmarks only. #[doc(hidden)] #[derive(Clone, Copy)] pub enum IncrementCounter { Yes, No, } impl IncrementCounter { #[inline] fn yes(&self) -> bool { match self { IncrementCounter::Yes => true, IncrementCounter::No => false, } } } // The largest power of two less than or equal to `n`, used in Hasher::update(). This is similar to // left_subtree_len(n), but note that left_subtree_len(n) is strictly less than `n`. fn largest_power_of_two_leq(n: usize) -> usize { ((n / 2) + 1).next_power_of_two() } // Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time // on a single thread. Write out the chunk chaining values and return the // number of chunks hashed. These chunks are never the root and never empty; // those cases use a different codepath. 
fn compress_chunks_parallel( input: &[u8], key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform, out: &mut [u8], ) -> usize { debug_assert!(!input.is_empty(), "empty chunks below the root"); debug_assert!(input.len() <= MAX_SIMD_DEGREE * CHUNK_LEN); let mut chunks_exact = input.chunks_exact(CHUNK_LEN); let mut chunks_array = ArrayVec::<&[u8; CHUNK_LEN], MAX_SIMD_DEGREE>::new(); for chunk in &mut chunks_exact { chunks_array.push(array_ref!(chunk, 0, CHUNK_LEN)); } platform.hash_many( &chunks_array, key, chunk_counter, IncrementCounter::Yes, flags, CHUNK_START, CHUNK_END, out, ); // Hash the remaining partial chunk, if there is one. Note that the empty // chunk (meaning the empty message) is a different codepath. let chunks_so_far = chunks_array.len(); if !chunks_exact.remainder().is_empty() { let counter = chunk_counter + chunks_so_far as u64; let mut chunk_state = ChunkState::new(key, counter, flags, platform); chunk_state.update(chunks_exact.remainder()); *array_mut_ref!(out, chunks_so_far * OUT_LEN, OUT_LEN) = chunk_state.output().chaining_value(); chunks_so_far + 1 } else { chunks_so_far } } // Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time // on a single thread. Write out the parent chaining values and return the // number of parents hashed. (If there's an odd input chaining value left over, // return it as an additional output.) These parents are never the root and // never empty; those cases use a different codepath. 
fn compress_parents_parallel( child_chaining_values: &[u8], key: &CVWords, flags: u8, platform: Platform, out: &mut [u8], ) -> usize { debug_assert_eq!(child_chaining_values.len() % OUT_LEN, 0, "wacky hash bytes"); let num_children = child_chaining_values.len() / OUT_LEN; debug_assert!(num_children >= 2, "not enough children"); debug_assert!(num_children <= 2 * MAX_SIMD_DEGREE_OR_2, "too many"); let mut parents_exact = child_chaining_values.chunks_exact(BLOCK_LEN); // Use MAX_SIMD_DEGREE_OR_2 rather than MAX_SIMD_DEGREE here, because of // the requirements of compress_subtree_wide(). let mut parents_array = ArrayVec::<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE_OR_2>::new(); for parent in &mut parents_exact { parents_array.push(array_ref!(parent, 0, BLOCK_LEN)); } platform.hash_many( &parents_array, key, 0, // Parents always use counter 0. IncrementCounter::No, flags | PARENT, 0, // Parents have no start flags. 0, // Parents have no end flags. out, ); // If there's an odd child left over, it becomes an output. let parents_so_far = parents_array.len(); if !parents_exact.remainder().is_empty() { out[parents_so_far * OUT_LEN..][..OUT_LEN].copy_from_slice(parents_exact.remainder()); parents_so_far + 1 } else { parents_so_far } } // The wide helper function returns (writes out) an array of chaining values // and returns the length of that array. The number of chaining values returned // is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, // if the input is shorter than that many chunks. The reason for maintaining a // wide array of chaining values going back up the tree, is to allow the // implementation to hash as many parents in parallel as possible. // // As a special case when the SIMD degree is 1, this function will still return // at least 2 outputs. This guarantees that this function doesn't perform the // root compression. (If it did, it would use the wrong flags, and also we // wouldn't be able to implement extendable output.) 
Note that this function is // not used when the whole input is only 1 chunk long; that's a different // codepath. // // Why not just have the caller split the input on the first update(), instead // of implementing this special rule? Because we don't want to limit SIMD or // multithreading parallelism for that update(). fn compress_subtree_wide( input: &[u8], key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform, out: &mut [u8], ) -> usize { // Note that the single chunk case does *not* bump the SIMD degree up to 2 // when it is 1. This allows Rayon the option of multithreading even the // 2-chunk case, which can help performance on smaller platforms. if input.len() <= platform.simd_degree() * CHUNK_LEN { return compress_chunks_parallel(input, key, chunk_counter, flags, platform, out); } // With more than simd_degree chunks, we need to recurse. Start by dividing // the input into left and right subtrees. (Note that this is only optimal // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree // of 3 or something, we'll need a more complicated strategy.) debug_assert_eq!(platform.simd_degree().count_ones(), 1, "power of 2"); let (left, right) = input.split_at(hazmat::left_subtree_len(input.len() as u64) as usize); let right_chunk_counter = chunk_counter + (left.len() / CHUNK_LEN) as u64; // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to // account for the special case of returning 2 outputs when the SIMD degree // is 1. let mut cv_array = [0; 2 * MAX_SIMD_DEGREE_OR_2 * OUT_LEN]; let degree = if left.len() == CHUNK_LEN { // The "simd_degree=1 and we're at the leaf nodes" case. debug_assert_eq!(platform.simd_degree(), 1); 1 } else { cmp::max(platform.simd_degree(), 2) }; let (left_out, right_out) = cv_array.split_at_mut(degree * OUT_LEN); // Recurse! For update_rayon(), this is where we take advantage of RayonJoin and use multiple // threads. 
let (left_n, right_n) = J::join( || compress_subtree_wide::(left, key, chunk_counter, flags, platform, left_out), || compress_subtree_wide::(right, key, right_chunk_counter, flags, platform, right_out), ); // The special case again. If simd_degree=1, then we'll have left_n=1 and // right_n=1. Rather than compressing them into a single output, return // them directly, to make sure we always have at least two outputs. debug_assert_eq!(left_n, degree); debug_assert!(right_n >= 1 && right_n <= left_n); if left_n == 1 { out[..2 * OUT_LEN].copy_from_slice(&cv_array[..2 * OUT_LEN]); return 2; } // Otherwise, do one layer of parent node compression. let num_children = left_n + right_n; compress_parents_parallel( &cv_array[..num_children * OUT_LEN], key, flags, platform, out, ) } // Hash a subtree with compress_subtree_wide(), and then condense the resulting // list of chaining values down to a single parent node. Don't compress that // last parent node, however. Instead, return its message bytes (the // concatenated chaining values of its children). This is necessary when the // first call to update() supplies a complete subtree, because the topmost // parent node of that subtree could end up being the root. It's also necessary // for extended output in the general case. // // As with compress_subtree_wide(), this function is not used on inputs of 1 // chunk or less. That's a different codepath. fn compress_subtree_to_parent_node( input: &[u8], key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform, ) -> [u8; BLOCK_LEN] { debug_assert!(input.len() > CHUNK_LEN); let mut cv_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN]; let mut num_cvs = compress_subtree_wide::(input, &key, chunk_counter, flags, platform, &mut cv_array); debug_assert!(num_cvs >= 2); // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, // compress_subtree_wide() returns more than 2 chaining values. Condense // them into 2 by forming parent nodes repeatedly. 
let mut out_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN / 2]; while num_cvs > 2 { let cv_slice = &cv_array[..num_cvs * OUT_LEN]; num_cvs = compress_parents_parallel(cv_slice, key, flags, platform, &mut out_array); cv_array[..num_cvs * OUT_LEN].copy_from_slice(&out_array[..num_cvs * OUT_LEN]); } *array_ref!(cv_array, 0, 2 * OUT_LEN) } // Hash a complete input all at once. Unlike compress_subtree_wide() and // compress_subtree_to_parent_node(), this function handles the 1 chunk case. fn hash_all_at_once(input: &[u8], key: &CVWords, flags: u8) -> Output { let platform = Platform::detect(); // If the whole subtree is one chunk, hash it directly with a ChunkState. if input.len() <= CHUNK_LEN { return ChunkState::new(key, 0, flags, platform) .update(input) .output(); } // Otherwise construct an Output object from the parent node returned by // compress_subtree_to_parent_node(). Output { input_chaining_value: *key, block: compress_subtree_to_parent_node::(input, key, 0, flags, platform), block_len: BLOCK_LEN as u8, counter: 0, flags: flags | PARENT, platform, } } /// The default hash function. /// /// For an incremental version that accepts multiple writes, see [`Hasher::new`], /// [`Hasher::update`], and [`Hasher::finalize`]. These two lines are equivalent: /// /// ``` /// let hash = blake3::hash(b"foo"); /// # let hash1 = hash; /// /// let hash = blake3::Hasher::new().update(b"foo").finalize(); /// # let hash2 = hash; /// # assert_eq!(hash1, hash2); /// ``` /// /// For output sizes other than 32 bytes, see [`Hasher::finalize_xof`] and /// [`OutputReader`]. /// /// This function is always single-threaded. For multithreading support, see /// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon). pub fn hash(input: &[u8]) -> Hash { hash_all_at_once::(input, IV, 0).root_hash() } /// The keyed hash function. /// /// This is suitable for use as a message authentication code, for example to /// replace an HMAC instance. 
In that use case, the constant-time equality /// checking provided by [`Hash`](struct.Hash.html) is almost always a security /// requirement, and callers need to be careful not to compare MACs as raw /// bytes. /// /// For an incremental version that accepts multiple writes, see [`Hasher::new_keyed`], /// [`Hasher::update`], and [`Hasher::finalize`]. These two lines are equivalent: /// /// ``` /// # const KEY: &[u8; 32] = &[0; 32]; /// let mac = blake3::keyed_hash(KEY, b"foo"); /// # let mac1 = mac; /// /// let mac = blake3::Hasher::new_keyed(KEY).update(b"foo").finalize(); /// # let mac2 = mac; /// # assert_eq!(mac1, mac2); /// ``` /// /// For output sizes other than 32 bytes, see [`Hasher::finalize_xof`], and [`OutputReader`]. /// /// This function is always single-threaded. For multithreading support, see /// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon). pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash { let key_words = platform::words_from_le_bytes_32(key); hash_all_at_once::(input, &key_words, KEYED_HASH).root_hash() } /// The key derivation function. /// /// Given cryptographic key material of any length and a context string of any /// length, this function outputs a 32-byte derived subkey. **The context string /// should be hardcoded, globally unique, and application-specific.** A good /// default format for such strings is `"[application] [commit timestamp] /// [purpose]"`, e.g., `"example.com 2019-12-25 16:18:03 session tokens v1"`. /// /// Key derivation is important when you want to use the same key in multiple /// algorithms or use cases. Using the same key with different cryptographic /// algorithms is generally forbidden, and deriving a separate subkey for each /// use case protects you from bad interactions. Derived keys also mitigate the /// damage from one part of your application accidentally leaking its key. 
/// /// As a rare exception to that general rule, however, it is possible to use /// `derive_key` itself with key material that you are already using with /// another algorithm. You might need to do this if you're adding features to /// an existing application, which does not yet use key derivation internally. /// However, you still must not share key material with algorithms that forbid /// key reuse entirely, like a one-time pad. For more on this, see sections 6.2 /// and 7.8 of the [BLAKE3 paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). /// /// Note that BLAKE3 is not a password hash, and **`derive_key` should never be /// used with passwords.** Instead, use a dedicated password hash like /// [Argon2]. Password hashes are entirely different from generic hash /// functions, with opposite design requirements. /// /// For an incremental version that accepts multiple writes, see [`Hasher::new_derive_key`], /// [`Hasher::update`], and [`Hasher::finalize`]. These two statements are equivalent: /// /// ``` /// # const CONTEXT: &str = "example.com 2019-12-25 16:18:03 session tokens v1"; /// let key = blake3::derive_key(CONTEXT, b"key material, not a password"); /// # let key1 = key; /// /// let key: [u8; 32] = blake3::Hasher::new_derive_key(CONTEXT) /// .update(b"key material, not a password") /// .finalize() /// .into(); /// # let key2 = key; /// # assert_eq!(key1, key2); /// ``` /// /// For output sizes other than 32 bytes, see [`Hasher::finalize_xof`], and [`OutputReader`]. /// /// This function is always single-threaded. For multithreading support, see /// [`Hasher::update_rayon`](struct.Hasher.html#method.update_rayon). 
/// /// [Argon2]: https://en.wikipedia.org/wiki/Argon2 pub fn derive_key(context: &str, key_material: &[u8]) -> [u8; OUT_LEN] { let context_key = hazmat::hash_derive_key_context(context); let context_key_words = platform::words_from_le_bytes_32(&context_key); hash_all_at_once::(key_material, &context_key_words, DERIVE_KEY_MATERIAL) .root_hash() .0 } fn parent_node_output( left_child: &CVBytes, right_child: &CVBytes, key: &CVWords, flags: u8, platform: Platform, ) -> Output { let mut block = [0; BLOCK_LEN]; block[..32].copy_from_slice(left_child); block[32..].copy_from_slice(right_child); Output { input_chaining_value: *key, block, block_len: BLOCK_LEN as u8, counter: 0, flags: flags | PARENT, platform, } } /// An incremental hash state that can accept any number of writes. /// /// The `rayon` and `mmap` Cargo features enable additional methods on this /// type related to multithreading and memory-mapped IO. /// /// When the `traits-preview` Cargo feature is enabled, this type implements /// several commonly used traits from the /// [`digest`](https://crates.io/crates/digest) crate. However, those /// traits aren't stable, and they're expected to change in incompatible ways /// before that crate reaches 1.0. For that reason, this crate makes no SemVer /// guarantees for this feature, and callers who use it should expect breaking /// changes between patch versions. /// /// # Examples /// /// ``` /// # fn main() -> Result<(), Box> { /// // Hash an input incrementally. /// let mut hasher = blake3::Hasher::new(); /// hasher.update(b"foo"); /// hasher.update(b"bar"); /// hasher.update(b"baz"); /// assert_eq!(hasher.finalize(), blake3::hash(b"foobarbaz")); /// /// // Extended output. OutputReader also implements Read and Seek. 
/// # #[cfg(feature = "std")] { /// let mut output = [0; 1000]; /// let mut output_reader = hasher.finalize_xof(); /// output_reader.fill(&mut output); /// assert_eq!(&output[..32], blake3::hash(b"foobarbaz").as_bytes()); /// # } /// # Ok(()) /// # } /// ``` #[derive(Clone)] pub struct Hasher { key: CVWords, chunk_state: ChunkState, initial_chunk_counter: u64, // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk // requires a 4th entry, rather than merging everything down to 1, because // we don't know whether more input is coming. This is different from how // the reference implementation does things. cv_stack: ArrayVec, } impl Hasher { fn new_internal(key: &CVWords, flags: u8) -> Self { Self { key: *key, chunk_state: ChunkState::new(key, 0, flags, Platform::detect()), initial_chunk_counter: 0, cv_stack: ArrayVec::new(), } } /// Construct a new `Hasher` for the regular hash function. pub fn new() -> Self { Self::new_internal(IV, 0) } /// Construct a new `Hasher` for the keyed hash function. See /// [`keyed_hash`]. /// /// [`keyed_hash`]: fn.keyed_hash.html pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self { let key_words = platform::words_from_le_bytes_32(key); Self::new_internal(&key_words, KEYED_HASH) } /// Construct a new `Hasher` for the key derivation function. See /// [`derive_key`]. The context string should be hardcoded, globally /// unique, and application-specific. /// /// [`derive_key`]: fn.derive_key.html pub fn new_derive_key(context: &str) -> Self { let context_key = hazmat::hash_derive_key_context(context); let context_key_words = platform::words_from_le_bytes_32(&context_key); Self::new_internal(&context_key_words, DERIVE_KEY_MATERIAL) } /// Reset the `Hasher` to its initial state. /// /// This is functionally the same as overwriting the `Hasher` with a new /// one, using the same key or context string if any. 
pub fn reset(&mut self) -> &mut Self { self.chunk_state = ChunkState::new( &self.key, 0, self.chunk_state.flags, self.chunk_state.platform, ); self.cv_stack.clear(); self } // As described in push_cv() below, we do "lazy merging", delaying merges // until right before the next CV is about to be added. This is different // from the reference implementation. Another difference is that we aren't // always merging 1 chunk at a time. Instead, each CV might represent any // power-of-two number of chunks, as long as the smaller-above-larger stack // order is maintained. Instead of the "count the trailing 0-bits" // algorithm described in the spec (which assumes you're adding one chunk // at a time), we use a "count the total number of 1-bits" variant (which // doesn't assume that). The principle is the same: each CV that should // remain in the stack is represented by a 1-bit in the total number of // chunks (or bytes) so far. fn merge_cv_stack(&mut self, chunk_counter: u64) { // Account for non-zero cases of Hasher::set_input_offset, where there are no prior // subtrees in the stack. Note that initial_chunk_counter is always 0 for callers who don't // use the hazmat module. let post_merge_stack_len = (chunk_counter - self.initial_chunk_counter).count_ones() as usize; while self.cv_stack.len() > post_merge_stack_len { let right_child = self.cv_stack.pop().unwrap(); let left_child = self.cv_stack.pop().unwrap(); let parent_output = parent_node_output( &left_child, &right_child, &self.key, self.chunk_state.flags, self.chunk_state.platform, ); self.cv_stack.push(parent_output.chaining_value()); } } // In reference_impl.rs, we merge the new CV with existing CVs from the // stack before pushing it. We can do that because we know more input is // coming, so we know none of the merges are root. // // This setting is different. We want to feed as much input as possible to // compress_subtree_wide(), without setting aside anything for the // chunk_state. 
If the user gives us 64 KiB, we want to parallelize over // all 64 KiB at once as a single subtree, if at all possible. // // This leads to two problems: // 1) This 64 KiB input might be the only call that ever gets made to // update. In this case, the root node of the 64 KiB subtree would be // the root node of the whole tree, and it would need to be ROOT // finalized. We can't compress it until we know. // 2) This 64 KiB input might complete a larger tree, whose root node is // similarly going to be the root of the whole tree. For example, // maybe we have 196 KiB (that is, 128 + 64) hashed so far. We can't // compress the node at the root of the 256 KiB subtree until we know // how to finalize it. // // The second problem is solved with "lazy merging". That is, when we're // about to add a CV to the stack, we don't merge it with anything first, // as the reference impl does. Instead we do merges using the *previous* CV // that was added, which is sitting on top of the stack, and we put the new // CV (unmerged) on top of the stack afterwards. This guarantees that we // never merge the root node until finalize(). // // Solving the first problem requires an additional tool, // compress_subtree_to_parent_node(). That function always returns the top // *two* chaining values of the subtree it's compressing. We then do lazy // merging with each of them separately, so that the second CV will always // remain unmerged. (That also helps us support extendable output when // we're hashing an input all-at-once.) fn push_cv(&mut self, new_cv: &CVBytes, chunk_counter: u64) { self.merge_cv_stack(chunk_counter); self.cv_stack.push(*new_cv); } /// Add input bytes to the hash state. You can call this any number of times. /// /// This method is always single-threaded. For multithreading support, see /// [`update_rayon`](#method.update_rayon) (enabled with the `rayon` Cargo feature). 
/// /// Note that the degree of SIMD parallelism that `update` can use is limited by the size of /// this input buffer. See [`update_reader`](#method.update_reader). pub fn update(&mut self, input: &[u8]) -> &mut Self { self.update_with_join::(input) } fn update_with_join(&mut self, mut input: &[u8]) -> &mut Self { let input_offset = self.initial_chunk_counter * CHUNK_LEN as u64; if let Some(max) = hazmat::max_subtree_len(input_offset) { let remaining = max - self.count(); assert!( input.len() as u64 <= remaining, "the subtree starting at {} contains at most {} bytes (found {})", CHUNK_LEN as u64 * self.initial_chunk_counter, max, input.len(), ); } // If we have some partial chunk bytes in the internal chunk_state, we // need to finish that chunk first. if self.chunk_state.count() > 0 { let want = CHUNK_LEN - self.chunk_state.count(); let take = cmp::min(want, input.len()); self.chunk_state.update(&input[..take]); input = &input[take..]; if !input.is_empty() { // We've filled the current chunk, and there's more input // coming, so we know it's not the root and we can finalize it. // Then we'll proceed to hashing whole chunks below. debug_assert_eq!(self.chunk_state.count(), CHUNK_LEN); let chunk_cv = self.chunk_state.output().chaining_value(); self.push_cv(&chunk_cv, self.chunk_state.chunk_counter); self.chunk_state = ChunkState::new( &self.key, self.chunk_state.chunk_counter + 1, self.chunk_state.flags, self.chunk_state.platform, ); } else { return self; } } // Now the chunk_state is clear, and we have more input. If there's // more than a single chunk (so, definitely not the root chunk), hash // the largest whole subtree we can, with the full benefits of SIMD and // multithreading parallelism. Two restrictions: // - The subtree has to be a power-of-2 number of chunks. Only subtrees // along the right edge can be incomplete, and we don't know where // the right edge is going to be until we get to finalize(). 
// - The subtree must evenly divide the total number of chunks up until // this point (if total is not 0). If the current incomplete subtree // is only waiting for 1 more chunk, we can't hash a subtree of 4 // chunks. We have to complete the current subtree first. // Because we might need to break up the input to form powers of 2, or // to evenly divide what we already have, this part runs in a loop. while input.len() > CHUNK_LEN { debug_assert_eq!(self.chunk_state.count(), 0, "no partial chunk data"); debug_assert_eq!(CHUNK_LEN.count_ones(), 1, "power of 2 chunk len"); let mut subtree_len = largest_power_of_two_leq(input.len()); let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64; // Shrink the subtree_len until it evenly divides the count so far. // We know that subtree_len itself is a power of 2, so we can use a // bitmasking trick instead of an actual remainder operation. (Note // that if the caller consistently passes power-of-2 inputs of the // same size, as is hopefully typical, this loop condition will // always fail, and subtree_len will always be the full length of // the input.) // // An aside: We don't have to shrink subtree_len quite this much. // For example, if count_so_far is 1, we could pass 2 chunks to // compress_subtree_to_parent_node. Since we'll get 2 CVs back, // we'll still get the right answer in the end, and we might get to // use 2-way SIMD parallelism. The problem with this optimization, // is that it gets us stuck always hashing 2 chunks. The total // number of chunks will remain odd, and we'll never graduate to // higher degrees of parallelism. See // https://github.com/BLAKE3-team/BLAKE3/issues/69. while (subtree_len - 1) as u64 & count_so_far != 0 { subtree_len /= 2; } // The shrunken subtree_len might now be 1 chunk long. If so, hash // that one chunk by itself. Otherwise, compress the subtree into a // pair of CVs. 
let subtree_chunks = (subtree_len / CHUNK_LEN) as u64; if subtree_len <= CHUNK_LEN { debug_assert_eq!(subtree_len, CHUNK_LEN); self.push_cv( &ChunkState::new( &self.key, self.chunk_state.chunk_counter, self.chunk_state.flags, self.chunk_state.platform, ) .update(&input[..subtree_len]) .output() .chaining_value(), self.chunk_state.chunk_counter, ); } else { // This is the high-performance happy path, though getting here // depends on the caller giving us a long enough input. let cv_pair = compress_subtree_to_parent_node::( &input[..subtree_len], &self.key, self.chunk_state.chunk_counter, self.chunk_state.flags, self.chunk_state.platform, ); let left_cv = array_ref!(cv_pair, 0, 32); let right_cv = array_ref!(cv_pair, 32, 32); // Push the two CVs we received into the CV stack in order. Because // the stack merges lazily, this guarantees we aren't merging the // root. self.push_cv(left_cv, self.chunk_state.chunk_counter); self.push_cv( right_cv, self.chunk_state.chunk_counter + (subtree_chunks / 2), ); } self.chunk_state.chunk_counter += subtree_chunks; input = &input[subtree_len..]; } // What remains is 1 chunk or less. Add it to the chunk state. debug_assert!(input.len() <= CHUNK_LEN); if !input.is_empty() { self.chunk_state.update(input); // Having added some input to the chunk_state, we know what's in // the CV stack won't become the root node, and we can do an extra // merge. This simplifies finalize(). self.merge_cv_stack(self.chunk_state.chunk_counter); } self } fn final_output(&self) -> Output { // If the current chunk is the only chunk, that makes it the root node // also. Convert it directly into an Output. Otherwise, we need to // merge subtrees below. if self.cv_stack.is_empty() { debug_assert_eq!(self.chunk_state.chunk_counter, self.initial_chunk_counter); return self.chunk_state.output(); } // If there are any bytes in the ChunkState, finalize that chunk and // merge its CV with everything in the CV stack. 
In that case, the work // we did at the end of update() above guarantees that the stack // doesn't contain any unmerged subtrees that need to be merged first. // (This is important, because if there were two chunk hashes sitting // on top of the stack, they would need to merge with each other, and // merging a new chunk hash into them would be incorrect.) // // If there are no bytes in the ChunkState, we'll merge what's already // in the stack. In this case it's fine if there are unmerged chunks on // top, because we'll merge them with each other. Note that the case of // the empty chunk is taken care of above. let mut output: Output; let mut num_cvs_remaining = self.cv_stack.len(); if self.chunk_state.count() > 0 { debug_assert_eq!( self.cv_stack.len(), (self.chunk_state.chunk_counter - self.initial_chunk_counter).count_ones() as usize, "cv stack does not need a merge", ); output = self.chunk_state.output(); } else { debug_assert!(self.cv_stack.len() >= 2); output = parent_node_output( &self.cv_stack[num_cvs_remaining - 2], &self.cv_stack[num_cvs_remaining - 1], &self.key, self.chunk_state.flags, self.chunk_state.platform, ); num_cvs_remaining -= 2; } while num_cvs_remaining > 0 { output = parent_node_output( &self.cv_stack[num_cvs_remaining - 1], &output.chaining_value(), &self.key, self.chunk_state.flags, self.chunk_state.platform, ); num_cvs_remaining -= 1; } output } /// Finalize the hash state and return the [`Hash`](struct.Hash.html) of /// the input. /// /// This method is idempotent. Calling it twice will give the same result. /// You can also add more input and finalize again. pub fn finalize(&self) -> Hash { assert_eq!( self.initial_chunk_counter, 0, "set_input_offset must be used with finalize_non_root", ); self.final_output().root_hash() } /// Finalize the hash state and return an [`OutputReader`], which can /// supply any number of output bytes. /// /// This method is idempotent. Calling it twice will give the same result. 
/// You can also add more input and finalize again.
///
/// [`OutputReader`]: struct.OutputReader.html
pub fn finalize_xof(&self) -> OutputReader {
    // Root finalization is only allowed for hashers that start at input offset 0.
    assert_eq!(
        self.initial_chunk_counter, 0,
        "set_input_offset must be used with finalize_non_root",
    );
    let root = self.final_output();
    OutputReader::new(root)
}

/// Return the total number of bytes hashed so far.
///
/// [`hazmat::HasherExt::set_input_offset`] does not affect this value. This only counts bytes
/// passed to [`update`](Hasher::update).
pub fn count(&self) -> u64 {
    // Account for non-zero cases of Hasher::set_input_offset. Note that initial_chunk_counter
    // is always 0 for callers who don't use the hazmat module.
    let whole_chunks = self.chunk_state.chunk_counter - self.initial_chunk_counter;
    // Bytes buffered in the current, not yet finalized chunk.
    let buffered = self.chunk_state.count() as u64;
    whole_chunks * CHUNK_LEN as u64 + buffered
}

/// As [`update`](Hasher::update), but reading from a
/// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) implementation.
///
/// [`Hasher`] implements
/// [`std::io::Write`](https://doc.rust-lang.org/std/io/trait.Write.html), so it's possible to
/// use [`std::io::copy`](https://doc.rust-lang.org/std/io/fn.copy.html) to update a [`Hasher`]
/// from any reader. Unfortunately, this standard approach can limit performance, because
/// `copy` currently uses an internal 8 KiB buffer that isn't big enough to take advantage of
/// all SIMD instruction sets. (In particular, [AVX-512](https://en.wikipedia.org/wiki/AVX-512)
/// needs a 16 KiB buffer.) `update_reader` avoids this performance problem and is slightly
/// more convenient.
///
/// The internal buffer size this method uses may change at any time, and it may be different
/// for different targets. The only guarantee is that it will be large enough for all of this
/// crate's SIMD implementations on the current platform.
///
/// The most common implementer of
/// [`std::io::Read`](https://doc.rust-lang.org/std/io/trait.Read.html) might be
/// [`std::fs::File`](https://doc.rust-lang.org/std/fs/struct.File.html), but note that memory
/// mapping can be faster than this method for hashing large files. See
/// [`update_mmap`](Hasher::update_mmap) and [`update_mmap_rayon`](Hasher::update_mmap_rayon),
/// which require the `mmap` and (for the latter) `rayon` Cargo features.
///
/// This method requires the `std` Cargo feature, which is enabled by default.
///
/// # Example
///
/// ```no_run
/// # use std::fs::File;
/// # use std::io;
/// # fn main() -> io::Result<()> {
/// // Hash standard input.
/// let mut hasher = blake3::Hasher::new();
/// hasher.update_reader(std::io::stdin().lock())?;
/// println!("{}", hasher.finalize());
/// # Ok(())
/// # }
/// ```
#[cfg(feature = "std")]
pub fn update_reader(&mut self, reader: impl std::io::Read) -> std::io::Result<&mut Self> {
    // Delegate to this crate's copy_wide helper; any I/O error it reports is
    // returned to the caller unchanged.
    io::copy_wide(reader, self)?;
    // Hand the hasher back so calls can be chained.
    Ok(self)
}

/// As [`update`](Hasher::update), but using Rayon-based multithreading
/// internally.
///
/// This method is gated by the `rayon` Cargo feature, which is disabled by
/// default but enabled on [docs.rs](https://docs.rs).
///
/// To get any performance benefit from multithreading, the input buffer
/// needs to be large. As a rule of thumb on x86_64, `update_rayon` is
/// _slower_ than `update` for inputs under 128 KiB. That threshold varies
/// quite a lot across different processors, and it's important to benchmark
/// your specific use case. See also the performance warning associated with
/// [`update_mmap_rayon`](Hasher::update_mmap_rayon).
///
/// If you already have a large buffer in memory, and you want to hash it
/// with multiple threads, this method is a good option. However, reading a
/// file into memory just to call this method can be a performance mistake,
/// both because it requires lots of memory and because single-threaded
/// reads can be slow.
For hashing whole files, see /// [`update_mmap_rayon`](Hasher::update_mmap_rayon), which is gated by both /// the `rayon` and `mmap` Cargo features. #[cfg(feature = "rayon")] pub fn update_rayon(&mut self, input: &[u8]) -> &mut Self { self.update_with_join::(input) } /// As [`update`](Hasher::update), but reading the contents of a file using memory mapping. /// /// Not all files can be memory mapped, and memory mapping small files can be slower than /// reading them the usual way. In those cases, this method will fall back to standard file IO. /// The heuristic for whether to use memory mapping is currently very simple (file size >= /// 16 KiB), and it might change at any time. /// /// Like [`update`](Hasher::update), this method is single-threaded. In this author's /// experience, memory mapping improves single-threaded performance by ~10% for large files /// that are already in cache. This probably varies between platforms, and as always it's a /// good idea to benchmark your own use case. In comparison, the multithreaded /// [`update_mmap_rayon`](Hasher::update_mmap_rayon) method can have a much larger impact on /// performance. /// /// There's a correctness reason that this method takes /// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) instead of /// [`File`](https://doc.rust-lang.org/std/fs/struct.File.html): reading from a memory-mapped /// file ignores the seek position of the original file handle (it neither respects the current /// position nor updates the position). This difference in behavior would've caused /// `update_mmap` and [`update_reader`](Hasher::update_reader) to give different answers and /// have different side effects in some cases. Taking a /// [`Path`](https://doc.rust-lang.org/stable/std/path/struct.Path.html) avoids this problem by /// making it clear that a new [`File`](https://doc.rust-lang.org/std/fs/struct.File.html) is /// opened internally. 
///
/// This method requires the `mmap` Cargo feature, which is disabled by default but enabled on
/// [docs.rs](https://docs.rs).
///
/// # Example
///
/// ```no_run
/// # use std::io;
/// # use std::path::Path;
/// # fn main() -> io::Result<()> {
/// let path = Path::new("file.dat");
/// let mut hasher = blake3::Hasher::new();
/// hasher.update_mmap(path)?;
/// println!("{}", hasher.finalize());
/// # Ok(())
/// # }
/// ```
#[cfg(feature = "mmap")]
pub fn update_mmap(&mut self, path: impl AsRef<std::path::Path>) -> std::io::Result<&mut Self> {
    // Open a fresh handle; failure to open is returned to the caller.
    let file = std::fs::File::open(path.as_ref())?;
    if let Some(mmap) = io::maybe_mmap_file(&file)? {
        // The helper produced a mapping: hash it in one single-threaded call.
        self.update(&mmap);
    } else {
        // No mapping was produced: fall back to reading through the file handle.
        io::copy_wide(&file, self)?;
    }
    Ok(self)
}

/// As [`update_rayon`](Hasher::update_rayon), but reading the contents of a file using
/// memory mapping. This is the default behavior of `b3sum`.
///
/// For large files that are likely to be in cache, this can be much faster than
/// single-threaded hashing. When benchmarks report that BLAKE3 is 10x or 20x faster than other
/// cryptographic hashes, this is usually what they're measuring. However...
///
/// **Performance Warning:** There are cases where multithreading hurts performance. The worst
/// case is [a large file on a spinning disk](https://github.com/BLAKE3-team/BLAKE3/issues/31),
/// where simultaneous reads from multiple threads can cause "thrashing" (i.e. the disk spends
/// more time seeking around than reading data). Windows tends to be somewhat worse about this,
/// in part because it's less likely than Linux to keep very large files in cache. More
/// generally, if your CPU cores are already busy, then multithreading will add overhead
/// without improving performance. If your code runs in different environments that you don't
/// control and can't measure, then unfortunately there's no one-size-fits-all answer for
/// whether multithreading is a good idea.
///
/// The memory mapping behavior of this function is the same as
/// [`update_mmap`](Hasher::update_mmap), and the heuristic for when to fall back to standard
/// file IO might change at any time.
///
/// This method requires both the `mmap` and `rayon` Cargo features, which are disabled by
/// default but enabled on [docs.rs](https://docs.rs).
///
/// # Example
///
/// ```no_run
/// # use std::io;
/// # use std::path::Path;
/// # fn main() -> io::Result<()> {
/// # #[cfg(feature = "rayon")]
/// # {
/// let path = Path::new("big_file.dat");
/// let mut hasher = blake3::Hasher::new();
/// hasher.update_mmap_rayon(path)?;
/// println!("{}", hasher.finalize());
/// # }
/// # Ok(())
/// # }
/// ```
#[cfg(feature = "mmap")]
#[cfg(feature = "rayon")]
pub fn update_mmap_rayon(
    &mut self,
    path: impl AsRef<std::path::Path>,
) -> std::io::Result<&mut Self> {
    // Open a fresh handle; failure to open is returned to the caller.
    let file = std::fs::File::open(path.as_ref())?;
    if let Some(mmap) = io::maybe_mmap_file(&file)? {
        // The helper produced a mapping: hash it with the Rayon-based update.
        self.update_rayon(&mmap);
    } else {
        // No mapping was produced: fall back to reading through the file handle.
        io::copy_wide(&file, self)?;
    }
    Ok(self)
}
} // closes the enclosing impl block of Hasher

// Don't derive(Debug), because the state may be secret.
impl fmt::Debug for Hasher {
    // Only the flags and platform of the chunk state are printed; the key,
    // buffered input and CV stack are deliberately left out.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("Hasher")
            .field("flags", &self.chunk_state.flags)
            .field("platform", &self.chunk_state.platform)
            .finish()
    }
}

impl Default for Hasher {
    /// Equivalent to `Hasher::new()`.
    #[inline]
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(feature = "std")]
impl std::io::Write for Hasher {
    /// This is equivalent to [`update`](#method.update).
    #[inline]
    fn write(&mut self, input: &[u8]) -> std::io::Result<usize> {
        self.update(input);
        // The whole buffer is always consumed.
        Ok(input.len())
    }

    /// Does nothing and always succeeds.
    #[inline]
    fn flush(&mut self) -> std::io::Result<()> {
        Ok(())
    }
}

#[cfg(feature = "zeroize")]
impl Zeroize for Hasher {
    fn zeroize(&mut self) {
        // Destructuring to trigger compile error as a reminder to update this impl.
        let Self {
            key,
            chunk_state,
            initial_chunk_counter,
            cv_stack,
        } = self;

        key.zeroize();
        chunk_state.zeroize();
        initial_chunk_counter.zeroize();
        cv_stack.zeroize();
    }
}

/// An incremental reader for extended output, returned by
/// [`Hasher::finalize_xof`](struct.Hasher.html#method.finalize_xof).
///
/// Shorter BLAKE3 outputs are prefixes of longer ones, and explicitly requesting a short output is
/// equivalent to truncating the default-length output. Note that this is a difference between
/// BLAKE2 and BLAKE3.
///
/// # Security notes
///
/// Outputs shorter than the default length of 32 bytes (256 bits) provide less security. An N-bit
/// BLAKE3 output is intended to provide N bits of first and second preimage resistance and N/2
/// bits of collision resistance, for any N up to 256. Longer outputs don't provide any additional
/// security.
///
/// Avoid relying on the secrecy of the output offset, that is, the number of output bytes read or
/// the arguments to [`seek`](struct.OutputReader.html#method.seek) or
/// [`set_position`](struct.OutputReader.html#method.set_position). [_Block-Cipher-Based Tree
/// Hashing_ by Aldo Gunsing](https://eprint.iacr.org/2022/283) shows that an attacker who knows
/// both the message and the key (if any) can easily determine the offset of an extended output.
/// For comparison, AES-CTR has a similar property: if you know the key, you can decrypt a block
/// from an unknown position in the output stream to recover its block index. Callers with strong
/// secret keys aren't affected in practice, but secret offsets are a [design
/// smell](https://en.wikipedia.org/wiki/Design_smell) in any case.
#[derive(Clone)]
pub struct OutputReader {
    // The root Output that bytes are derived from; its `counter` field is the
    // index of the current output block.
    inner: Output,
    // Byte offset inside the current output block (kept below BLOCK_LEN).
    position_within_block: u8,
}

impl OutputReader {
    // Wrap a root Output, starting at offset 0 within whatever block
    // `inner.counter` currently designates.
    fn new(inner: Output) -> Self {
        Self {
            inner,
            position_within_block: 0,
        }
    }

    // This helper function handles both the case where the output buffer is
    // shorter than one block, and the case where our position_within_block is
    // non-zero.
    fn fill_one_block(&mut self, buf: &mut &mut [u8]) {
        // Recompute the current block and skip the bytes already handed out.
        let output_block: [u8; BLOCK_LEN] = self.inner.root_output_block();
        let output_bytes = &output_block[self.position_within_block as usize..];
        let take = cmp::min(buf.len(), output_bytes.len());
        buf[..take].copy_from_slice(&output_bytes[..take]);
        self.position_within_block += take as u8;
        // Reaching the end of the block moves on to the next block counter.
        if self.position_within_block == BLOCK_LEN as u8 {
            self.inner.counter += 1;
            self.position_within_block = 0;
        }
        // Advance the dest buffer. mem::take() is a borrowck workaround.
        *buf = &mut core::mem::take(buf)[take..];
    }

    /// Fill a buffer with output bytes and advance the position of the
    /// `OutputReader`. This is equivalent to [`Read::read`], except that it
    /// doesn't return a `Result`. Both methods always fill the entire buffer.
    ///
    /// Note that `OutputReader` doesn't buffer output bytes internally, so
    /// calling `fill` repeatedly with a short-length or odd-length slice will
    /// end up performing the same compression multiple times. If you're
    /// reading output in a loop, prefer a slice length that's a multiple of
    /// [`BLOCK_LEN`] (64 bytes).
    ///
    /// The maximum output size of BLAKE3 is 2<sup>64</sup>-1 bytes. If you try
    /// to extract more than that, for example by seeking near the end and
    /// reading further, the behavior is unspecified.
    ///
    /// [`Read::read`]: #method.read
    pub fn fill(&mut self, mut buf: &mut [u8]) {
        if buf.is_empty() {
            return;
        }

        // If we're partway through a block, try to get to a block boundary.
        if self.position_within_block != 0 {
            self.fill_one_block(&mut buf);
        }

        // Emit every whole block that still fits with a single xof_many call.
        let full_blocks = buf.len() / BLOCK_LEN;
        let full_blocks_len = full_blocks * BLOCK_LEN;
        if full_blocks > 0 {
            debug_assert_eq!(0, self.position_within_block);
            self.inner.platform.xof_many(
                &self.inner.input_chaining_value,
                &self.inner.block,
                self.inner.block_len,
                self.inner.counter,
                self.inner.flags | ROOT,
                &mut buf[..full_blocks_len],
            );
            self.inner.counter += full_blocks as u64;
            buf = &mut buf[full_blocks * BLOCK_LEN..];
        }

        // Whatever is left is shorter than one block.
        if !buf.is_empty() {
            debug_assert!(buf.len() < BLOCK_LEN);
            self.fill_one_block(&mut buf);
            debug_assert!(buf.is_empty());
        }
    }

    /// Return the current read position in the output stream. This is
    /// equivalent to [`Seek::stream_position`], except that it doesn't return
    /// a `Result`. The position of a new `OutputReader` starts at 0, and each
    /// call to [`fill`] or [`Read::read`] moves the position forward by the
    /// number of bytes read.
    ///
    /// [`Seek::stream_position`]: #method.stream_position
    /// [`fill`]: #method.fill
    /// [`Read::read`]: #method.read
    pub fn position(&self) -> u64 {
        self.inner.counter * BLOCK_LEN as u64 + self.position_within_block as u64
    }

    /// Seek to a new read position in the output stream. This is equivalent to
    /// calling [`Seek::seek`] with [`SeekFrom::Start`], except that it doesn't
    /// return a `Result`.
    ///
    /// [`Seek::seek`]: #method.seek
    /// [`SeekFrom::Start`]: https://doc.rust-lang.org/std/io/enum.SeekFrom.html
    pub fn set_position(&mut self, position: u64) {
        // Split the absolute byte position into a block index and an offset
        // inside that block.
        self.position_within_block = (position % BLOCK_LEN as u64) as u8;
        self.inner.counter = position / BLOCK_LEN as u64;
    }
}

// Don't derive(Debug), because the state may be secret.
impl fmt::Debug for OutputReader { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.debug_struct("OutputReader") .field("position", &self.position()) .finish() } } #[cfg(feature = "std")] impl std::io::Read for OutputReader { #[inline] fn read(&mut self, buf: &mut [u8]) -> std::io::Result { self.fill(buf); Ok(buf.len()) } } #[cfg(feature = "std")] impl std::io::Seek for OutputReader { fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result { let max_position = u64::max_value() as i128; let target_position: i128 = match pos { std::io::SeekFrom::Start(x) => x as i128, std::io::SeekFrom::Current(x) => self.position() as i128 + x as i128, std::io::SeekFrom::End(_) => { return Err(std::io::Error::new( std::io::ErrorKind::InvalidInput, "seek from end not supported", )); } }; if target_position < 0 { return Err(std::io::Error::new( std::io::ErrorKind::InvalidInput, "seek before start", )); } self.set_position(cmp::min(target_position, max_position) as u64); Ok(self.position()) } } #[cfg(feature = "zeroize")] impl Zeroize for OutputReader { fn zeroize(&mut self) { // Destructuring to trigger compile error as a reminder to update this impl. let Self { inner, position_within_block, } = self; inner.zeroize(); position_within_block.zeroize(); } } ================================================ FILE: third-party/blake3/src/platform.rs ================================================ use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN}; use arrayref::{array_mut_ref, array_ref}; cfg_if::cfg_if! { if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { cfg_if::cfg_if! 
{ if #[cfg(blake3_avx512_ffi)] { pub const MAX_SIMD_DEGREE: usize = 16; } else { pub const MAX_SIMD_DEGREE: usize = 8; } } } else if #[cfg(blake3_neon)] { pub const MAX_SIMD_DEGREE: usize = 4; } else if #[cfg(blake3_wasm32_simd)] { pub const MAX_SIMD_DEGREE: usize = 4; } else { pub const MAX_SIMD_DEGREE: usize = 1; } } // There are some places where we want a static size that's equal to the // MAX_SIMD_DEGREE, but also at least 2. Constant contexts aren't currently // allowed to use cmp::max, so we have to hardcode this additional constant // value. Get rid of this once cmp::max is a const fn. cfg_if::cfg_if! { if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { cfg_if::cfg_if! { if #[cfg(blake3_avx512_ffi)] { pub const MAX_SIMD_DEGREE_OR_2: usize = 16; } else { pub const MAX_SIMD_DEGREE_OR_2: usize = 8; } } } else if #[cfg(blake3_neon)] { pub const MAX_SIMD_DEGREE_OR_2: usize = 4; } else if #[cfg(blake3_wasm32_simd)] { pub const MAX_SIMD_DEGREE_OR_2: usize = 4; } else { pub const MAX_SIMD_DEGREE_OR_2: usize = 2; } } #[derive(Clone, Copy, Debug)] pub enum Platform { Portable, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] SSE2, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] SSE41, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] AVX2, #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] AVX512, #[cfg(blake3_neon)] NEON, #[cfg(blake3_wasm32_simd)] #[allow(non_camel_case_types)] WASM32_SIMD, } impl Platform { #[allow(unreachable_code)] pub fn detect() -> Self { #[cfg(miri)] { return Platform::Portable; } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { #[cfg(blake3_avx512_ffi)] { if avx512_detected() { return Platform::AVX512; } } if avx2_detected() { return Platform::AVX2; } if sse41_detected() { return Platform::SSE41; } if sse2_detected() { return Platform::SSE2; } } // We don't use dynamic feature detection for NEON. If the "neon" // feature is on, NEON is assumed to be supported. 
#[cfg(blake3_neon)] { return Platform::NEON; } #[cfg(blake3_wasm32_simd)] { return Platform::WASM32_SIMD; } Platform::Portable } pub fn simd_degree(&self) -> usize { let degree = match self { Platform::Portable => 1, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE2 => 4, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 => 4, #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX2 => 8, #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => 16, #[cfg(blake3_neon)] Platform::NEON => 4, #[cfg(blake3_wasm32_simd)] Platform::WASM32_SIMD => 4, }; debug_assert!(degree <= MAX_SIMD_DEGREE); degree } pub fn compress_in_place( &self, cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) { match self { Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags), // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE2 => unsafe { crate::sse2::compress_in_place(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 | Platform::AVX2 => unsafe { crate::sse41::compress_in_place(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { crate::avx512::compress_in_place(cv, block, block_len, counter, flags) }, // No NEON compress_in_place() implementation yet. 
#[cfg(blake3_neon)] Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags), #[cfg(blake3_wasm32_simd)] Platform::WASM32_SIMD => { crate::wasm32_simd::compress_in_place(cv, block, block_len, counter, flags) } } } pub fn compress_xof( &self, cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [u8; 64] { match self { Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags), // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE2 => unsafe { crate::sse2::compress_xof(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 | Platform::AVX2 => unsafe { crate::sse41::compress_xof(cv, block, block_len, counter, flags) }, // Safe because detect() checked for platform support. #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { crate::avx512::compress_xof(cv, block, block_len, counter, flags) }, // No NEON compress_xof() implementation yet. #[cfg(blake3_neon)] Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags), #[cfg(blake3_wasm32_simd)] Platform::WASM32_SIMD => { crate::wasm32_simd::compress_xof(cv, block, block_len, counter, flags) } } } // IMPLEMENTATION NOTE // =================== // hash_many() applies two optimizations. The critically important // optimization is the high-performance parallel SIMD hashing mode, // described in detail in the spec. This more than doubles throughput per // thread. Another optimization is keeping the state vectors transposed // from block to block within a chunk. When state vectors are transposed // after every block, there's a small but measurable performance loss. // Compressing chunks with a dedicated loop avoids this. 
pub fn hash_many( &self, inputs: &[&[u8; N]], key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8], ) { match self { Platform::Portable => portable::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ), // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE2 => unsafe { crate::sse2::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ) }, // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::SSE41 => unsafe { crate::sse41::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ) }, // Safe because detect() checked for platform support. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX2 => unsafe { crate::avx2::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ) }, // Safe because detect() checked for platform support. #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { crate::avx512::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ) }, // Assumed to be safe if the "neon" feature is on. #[cfg(blake3_neon)] Platform::NEON => unsafe { crate::neon::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ) }, // Assumed to be safe if the "wasm32_simd" feature is on. 
#[cfg(blake3_wasm32_simd)] Platform::WASM32_SIMD => unsafe { crate::wasm32_simd::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ) }, } } pub fn xof_many( &self, cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, mut counter: u64, flags: u8, out: &mut [u8], ) { debug_assert_eq!(0, out.len() % BLOCK_LEN, "whole blocks only"); if out.is_empty() { // The current assembly implementation always outputs at least 1 block. return; } match self { // Safe because detect() checked for platform support. #[cfg(blake3_avx512_ffi)] #[cfg(unix)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Platform::AVX512 => unsafe { crate::avx512::xof_many(cv, block, block_len, counter, flags, out) }, _ => { // For platforms without an optimized xof_many, fall back to a loop over // compress_xof. This is still faster than portable code. for out_block in out.chunks_exact_mut(BLOCK_LEN) { // TODO: Use array_chunks_mut here once that's stable. let out_array: &mut [u8; BLOCK_LEN] = out_block.try_into().unwrap(); *out_array = self.compress_xof(cv, block, block_len, counter, flags); counter += 1; } } } } // Explicit platform constructors, for benchmarks. pub fn portable() -> Self { Self::Portable } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn sse2() -> Option { if sse2_detected() { Some(Self::SSE2) } else { None } } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn sse41() -> Option { if sse41_detected() { Some(Self::SSE41) } else { None } } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn avx2() -> Option { if avx2_detected() { Some(Self::AVX2) } else { None } } #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub fn avx512() -> Option { if avx512_detected() { Some(Self::AVX512) } else { None } } #[cfg(blake3_neon)] pub fn neon() -> Option { // Assumed to be safe if the "neon" feature is on. 
Some(Self::NEON) } #[cfg(blake3_wasm32_simd)] pub fn wasm32_simd() -> Option { // Assumed to be safe if the "wasm32_simd" feature is on. Some(Self::WASM32_SIMD) } } // Note that AVX-512 is divided into multiple featuresets, and we use two of // them, F and VL. #[cfg(blake3_avx512_ffi)] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] #[allow(unreachable_code)] pub fn avx512_detected() -> bool { if cfg!(miri) { return false; } // A testing-only short-circuit. if cfg!(feature = "no_avx512") { return false; } // Static check, e.g. for building with target-cpu=native. #[cfg(all(target_feature = "avx512f", target_feature = "avx512vl"))] { return true; } // Dynamic check, if std is enabled. #[cfg(feature = "std")] { if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") { return true; } } false } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] #[allow(unreachable_code)] pub fn avx2_detected() -> bool { if cfg!(miri) { return false; } // A testing-only short-circuit. if cfg!(feature = "no_avx2") { return false; } // Static check, e.g. for building with target-cpu=native. #[cfg(target_feature = "avx2")] { return true; } // Dynamic check, if std is enabled. #[cfg(feature = "std")] { if is_x86_feature_detected!("avx2") { return true; } } false } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] #[allow(unreachable_code)] pub fn sse41_detected() -> bool { if cfg!(miri) { return false; } // A testing-only short-circuit. if cfg!(feature = "no_sse41") { return false; } // Static check, e.g. for building with target-cpu=native. #[cfg(target_feature = "sse4.1")] { return true; } // Dynamic check, if std is enabled. 
#[cfg(feature = "std")] { if is_x86_feature_detected!("sse4.1") { return true; } } false } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[inline(always)] #[allow(unreachable_code)] pub fn sse2_detected() -> bool { if cfg!(miri) { return false; } // A testing-only short-circuit. if cfg!(feature = "no_sse2") { return false; } // Static check, e.g. for building with target-cpu=native. #[cfg(target_feature = "sse2")] { return true; } // Dynamic check, if std is enabled. #[cfg(feature = "std")] { if is_x86_feature_detected!("sse2") { return true; } } false } #[inline(always)] pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] { let mut out = [0; 8]; out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4)); out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4)); out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4)); out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4)); out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4)); out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4)); out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4)); out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4)); out } #[inline(always)] pub fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] { let mut out = [0; 16]; out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4)); out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4)); out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4)); out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4)); out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4)); out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4)); out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4)); out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4)); out[8] = u32::from_le_bytes(*array_ref!(bytes, 8 * 4, 4)); out[9] = u32::from_le_bytes(*array_ref!(bytes, 9 * 4, 4)); out[10] = u32::from_le_bytes(*array_ref!(bytes, 10 * 4, 4)); out[11] = u32::from_le_bytes(*array_ref!(bytes, 11 * 4, 4)); out[12] = 
u32::from_le_bytes(*array_ref!(bytes, 12 * 4, 4)); out[13] = u32::from_le_bytes(*array_ref!(bytes, 13 * 4, 4)); out[14] = u32::from_le_bytes(*array_ref!(bytes, 14 * 4, 4)); out[15] = u32::from_le_bytes(*array_ref!(bytes, 15 * 4, 4)); out } #[inline(always)] pub fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] { let mut out = [0; 32]; *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); out } #[inline(always)] pub fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] { let mut out = [0; 64]; *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes(); *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes(); *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes(); *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes(); *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes(); *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes(); *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes(); *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes(); *array_mut_ref!(out, 8 * 4, 4) = words[8].to_le_bytes(); *array_mut_ref!(out, 9 * 4, 4) = words[9].to_le_bytes(); *array_mut_ref!(out, 10 * 4, 4) = words[10].to_le_bytes(); *array_mut_ref!(out, 11 * 4, 4) = words[11].to_le_bytes(); *array_mut_ref!(out, 12 * 4, 4) = words[12].to_le_bytes(); *array_mut_ref!(out, 13 * 4, 4) = words[13].to_le_bytes(); *array_mut_ref!(out, 14 * 4, 4) = words[14].to_le_bytes(); *array_mut_ref!(out, 15 * 4, 4) = words[15].to_le_bytes(); out } ================================================ FILE: third-party/blake3/src/portable.rs ================================================ use crate::{ 
counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE,
OUT_LEN,
};
use arrayref::{array_mut_ref, array_ref};

/// The quarter-round mixing function: mixes message words `x` and `y` into the
/// four state words at indices `a`, `b`, `c`, `d` using wrapping adds, XORs
/// and right-rotations by 16, 12, 8 and 7 bits.
#[inline(always)]
fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) {
    state[a] = state[a].wrapping_add(state[b]).wrapping_add(x);
    state[d] = (state[d] ^ state[a]).rotate_right(16);
    state[c] = state[c].wrapping_add(state[d]);
    state[b] = (state[b] ^ state[c]).rotate_right(12);
    state[a] = state[a].wrapping_add(state[b]).wrapping_add(y);
    state[d] = (state[d] ^ state[a]).rotate_right(8);
    state[c] = state[c].wrapping_add(state[d]);
    state[b] = (state[b] ^ state[c]).rotate_right(7);
}

/// Applies one full round to `state`: four column mixes followed by four
/// diagonal mixes, with message words picked by `MSG_SCHEDULE[round]`.
#[inline(always)]
fn round(state: &mut [u32; 16], msg: &[u32; 16], round: usize) {
    // Select the message schedule based on the round.
    let schedule = MSG_SCHEDULE[round];

    // Mix the columns.
    g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
    g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
    g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
    g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);

    // Mix the diagonals.
    g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
    g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
    g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
    g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
}

/// Builds the 16-word state from `cv`, the first four `IV` words, the split
/// counter, `block_len` and `flags`, then runs the seven rounds over `block`.
/// Returns the raw state; callers apply the final XOR feed-forward.
#[inline(always)]
fn compress_pre(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [u32; 16] {
    let block_words = crate::platform::words_from_le_bytes_64(block);

    let mut state = [
        cv[0],
        cv[1],
        cv[2],
        cv[3],
        cv[4],
        cv[5],
        cv[6],
        cv[7],
        IV[0],
        IV[1],
        IV[2],
        IV[3],
        counter_low(counter),
        counter_high(counter),
        block_len as u32,
        flags as u32,
    ];

    round(&mut state, &block_words, 0);
    round(&mut state, &block_words, 1);
    round(&mut state, &block_words, 2);
    round(&mut state, &block_words, 3);
    round(&mut state, &block_words, 4);
    round(&mut state, &block_words, 5);
    round(&mut state, &block_words, 6);

    state
}

/// Compresses `block` and overwrites `cv` with the new chaining value, i.e.
/// the XOR of the low and high halves of the post-round state.
pub fn compress_in_place(
    cv: &mut CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) {
    let state = compress_pre(cv, block, block_len, counter, flags);

    cv[0] = state[0] ^ state[8];
    cv[1] = state[1] ^ state[9];
    cv[2] = state[2] ^ state[10];
    cv[3] = state[3] ^ state[11];
    cv[4] = state[4] ^ state[12];
    cv[5] = state[5] ^ state[13];
    cv[6] = state[6] ^ state[14];
    cv[7] = state[7] ^ state[15];
}

/// Compresses `block` and returns the full 64-byte output: the low half is
/// XORed with the high half, the high half is XORed with the input `cv`, and
/// the sixteen words are serialized little-endian.
pub fn compress_xof(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [u8; 64] {
    let mut state = compress_pre(cv, block, block_len, counter, flags);
    state[0] ^= state[8];
    state[1] ^= state[9];
    state[2] ^= state[10];
    state[3] ^= state[11];
    state[4] ^= state[12];
    state[5] ^= state[13];
    state[6] ^= state[14];
    state[7] ^= state[15];
    state[8] ^= cv[0];
    state[9] ^= cv[1];
    state[10] ^= cv[2];
    state[11] ^= cv[3];
    state[12] ^= cv[4];
    state[13] ^= cv[5];
    state[14] ^= cv[6];
    state[15] ^= cv[7];
    crate::platform::le_bytes_from_words_64(&state)
}

/// Hashes one `N`-byte input, block by block, starting from `key`, and writes
/// the resulting chaining value to `out` as little-endian bytes.
///
/// `flags_start` is OR-ed into the first block's flags and `flags_end` into
/// the last block's; `N` must be a multiple of `BLOCK_LEN` (debug-asserted).
/// The `<const N: usize>` parameter is required for `&[u8; N]` to name a type.
pub fn hash1<const N: usize>(
    input: &[u8; N],
    key: &CVWords,
    counter: u64,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut CVBytes,
) {
    debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks");
    let mut cv = *key;
    let mut block_flags = flags | flags_start;
    let mut slice = &input[..];
    while slice.len() >= BLOCK_LEN {
        if slice.len() == BLOCK_LEN {
            block_flags |= flags_end;
        }
        compress_in_place(
            &mut cv,
            array_ref!(slice, 0, BLOCK_LEN),
            BLOCK_LEN as u8,
            counter,
            block_flags,
        );
        // Only the first block carries flags_start.
        block_flags = flags;
        slice = &slice[BLOCK_LEN..];
    }
    *out = crate::platform::le_bytes_from_words_32(&cv);
}

/// Hashes each input with `hash1`, writing `OUT_LEN` bytes per input into
/// consecutive chunks of `out`. The counter advances by one per input when
/// `increment_counter.yes()`.
pub fn hash_many<const N: usize>(
    inputs: &[&[u8; N]],
    key: &CVWords,
    mut counter: u64,
    increment_counter: IncrementCounter,
    flags: u8,
    flags_start: u8,
    flags_end: u8,
    out: &mut [u8],
) {
    debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short");
    for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) {
        hash1(
            input,
            key,
            counter,
            flags,
            flags_start,
            flags_end,
            array_mut_ref!(output, 0, OUT_LEN),
        );
        if increment_counter.yes() {
            counter += 1;
        }
    }
}

#[cfg(test)]
pub mod test {
    use super::*;

    // This is basically testing the portable implementation against itself,
    // but it also checks that compress_in_place and compress_xof are
    // consistent. And there are tests against the reference implementation and
    // against hardcoded test vectors elsewhere.
    #[test]
    fn test_compress() {
        crate::test::test_compress_fn(compress_in_place, compress_xof);
    }

    // Ditto.
    #[test]
    fn test_hash_many() {
        crate::test::test_hash_many_fn(hash_many, hash_many);
    }
}

================================================
FILE: third-party/blake3/src/rust_avx2.rs
================================================
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

use crate::{
    counter_high, counter_low, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN,
};
use arrayref::{array_mut_ref, mut_array_refs};

pub const DEGREE: usize = 8;

/// Loads 32 bytes from `src` with no alignment requirement.
#[inline(always)]
unsafe fn loadu(src: *const u8) -> __m256i {
    // This is an unaligned load, so the pointer cast is allowed.
    _mm256_loadu_si256(src as *const __m256i)
}

/// Stores 32 bytes to `dest` with no alignment requirement.
#[inline(always)]
unsafe fn storeu(src: __m256i, dest: *mut u8) {
    // This is an unaligned store, so the pointer cast is allowed.
    _mm256_storeu_si256(dest as *mut __m256i, src)
}

/// Lane-wise 32-bit addition.
#[inline(always)]
unsafe fn add(a: __m256i, b: __m256i) -> __m256i {
    _mm256_add_epi32(a, b)
}

/// Bitwise XOR of the two vectors.
#[inline(always)]
unsafe fn xor(a: __m256i, b: __m256i) -> __m256i {
    _mm256_xor_si256(a, b)
}

/// Broadcasts `x` to all eight 32-bit lanes.
#[inline(always)]
unsafe fn set1(x: u32) -> __m256i {
    _mm256_set1_epi32(x as i32)
}

/// Builds a vector whose lanes, from lowest to highest, are `a` through `h`.
#[inline(always)]
unsafe fn set8(a: u32, b: u32, c: u32, d: u32, e: u32, f: u32, g: u32, h: u32) -> __m256i {
    _mm256_setr_epi32(
        a as i32, b as i32, c as i32, d as i32, e as i32, f as i32, g as i32, h as i32,
    )
}

// These rotations are the "simple/shifts version". For the
// "complicated/shuffles version", see
// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66.
// For a discussion of the tradeoffs, see
// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug
// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better
// on recent x86 chips.
/// Rotates every 32-bit lane right by 16 bits.
#[inline(always)]
unsafe fn rot16(x: __m256i) -> __m256i {
    _mm256_or_si256(_mm256_srli_epi32(x, 16), _mm256_slli_epi32(x, 32 - 16))
}

/// Rotates every 32-bit lane right by 12 bits.
#[inline(always)]
unsafe fn rot12(x: __m256i) -> __m256i {
    _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12))
}

/// Rotates every 32-bit lane right by 8 bits.
#[inline(always)]
unsafe fn rot8(x: __m256i) -> __m256i {
    _mm256_or_si256(_mm256_srli_epi32(x, 8), _mm256_slli_epi32(x, 32 - 8))
}

/// Rotates every 32-bit lane right by 7 bits.
#[inline(always)]
unsafe fn rot7(x: __m256i) -> __m256i {
    _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7))
}

/// Applies round `r` to the sixteen state vectors `v`, using the message
/// vectors `m` selected through `MSG_SCHEDULE[r]`.
///
/// The statement sequence is the add / xor / rotate mixing step (rotations by
/// 16, 12, 8, 7) written out four-wide: first over the index groups
/// (0,4,8,12) .. (3,7,11,15), then over (0,5,10,15), (1,6,11,12), (2,7,8,13),
/// (3,4,9,14). Statement order matters; do not reorder.
#[inline(always)]
unsafe fn round(v: &mut [__m256i; 16], m: &[__m256i; 16], r: usize) {
    // Column step, first message word of each pair.
    v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]);
    v[0] = add(v[0], v[4]);
    v[1] = add(v[1], v[5]);
    v[2] = add(v[2], v[6]);
    v[3] = add(v[3], v[7]);
    v[12] = xor(v[12], v[0]);
    v[13] = xor(v[13], v[1]);
    v[14] = xor(v[14], v[2]);
    v[15] = xor(v[15], v[3]);
    v[12] = rot16(v[12]);
    v[13] = rot16(v[13]);
    v[14] = rot16(v[14]);
    v[15] = rot16(v[15]);
    v[8] = add(v[8], v[12]);
    v[9] = add(v[9], v[13]);
    v[10] = add(v[10], v[14]);
    v[11] = add(v[11], v[15]);
    v[4] = xor(v[4], v[8]);
    v[5] = xor(v[5], v[9]);
    v[6] = xor(v[6], v[10]);
    v[7] = xor(v[7], v[11]);
    v[4] = rot12(v[4]);
    v[5] = rot12(v[5]);
    v[6] = rot12(v[6]);
    v[7] = rot12(v[7]);
    // Column step, second message word of each pair.
    v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]);
    v[0] = add(v[0], v[4]);
    v[1] = add(v[1], v[5]);
    v[2] = add(v[2], v[6]);
    v[3] = add(v[3], v[7]);
    v[12] = xor(v[12], v[0]);
    v[13] = xor(v[13], v[1]);
    v[14] = xor(v[14], v[2]);
    v[15] = xor(v[15], v[3]);
    v[12] = rot8(v[12]);
    v[13] = rot8(v[13]);
    v[14] = rot8(v[14]);
    v[15] = rot8(v[15]);
    v[8] = add(v[8], v[12]);
    v[9] = add(v[9], v[13]);
    v[10] = add(v[10], v[14]);
    v[11] = add(v[11], v[15]);
    v[4] = xor(v[4], v[8]);
    v[5] = xor(v[5], v[9]);
    v[6] = xor(v[6], v[10]);
    v[7] = xor(v[7], v[11]);
    v[4] = rot7(v[4]);
    v[5] = rot7(v[5]);
    v[6] = rot7(v[6]);
    v[7] = rot7(v[7]);

    // Diagonal step, first message word of each pair.
    v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot16(v[15]);
    v[12] = rot16(v[12]);
    v[13] = rot16(v[13]);
    v[14] = rot16(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot12(v[5]);
    v[6] = rot12(v[6]);
    v[7] = rot12(v[7]);
    v[4] = rot12(v[4]);
    // Diagonal step, second message word of each pair.
    v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]);
    v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]);
    v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]);
    v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]);
    v[0] = add(v[0], v[5]);
    v[1] = add(v[1], v[6]);
    v[2] = add(v[2], v[7]);
    v[3] = add(v[3], v[4]);
    v[15] = xor(v[15], v[0]);
    v[12] = xor(v[12], v[1]);
    v[13] = xor(v[13], v[2]);
    v[14] = xor(v[14], v[3]);
    v[15] = rot8(v[15]);
    v[12] = rot8(v[12]);
    v[13] = rot8(v[13]);
    v[14] = rot8(v[14]);
    v[10] = add(v[10], v[15]);
    v[11] = add(v[11], v[12]);
    v[8] = add(v[8], v[13]);
    v[9] = add(v[9], v[14]);
    v[5] = xor(v[5], v[10]);
    v[6] = xor(v[6], v[11]);
    v[7] = xor(v[7], v[8]);
    v[4] = xor(v[4], v[9]);
    v[5] = rot7(v[5]);
    v[6] = rot7(v[6]);
    v[7] = rot7(v[7]);
    v[4] = rot7(v[4]);
}

/// Recombines the 128-bit halves of `a` and `b`: the first result is
/// (low half of `a`, low half of `b`), the second is (high half of `a`, high
/// half of `b`).
#[inline(always)]
unsafe fn interleave128(a: __m256i, b: __m256i) -> (__m256i, __m256i) {
    (
        _mm256_permute2x128_si256(a, b, 0x20),
        _mm256_permute2x128_si256(a, b, 0x31),
    )
}

// There are several ways to do a transposition.
We could do it naively, with 8 separate // _mm256_set_epi32 instructions, referencing each of the 32 words explicitly. Or we could copy // the vecs into contiguous storage and then use gather instructions. This third approach is to use // a series of unpack instructions to interleave the vectors. In my benchmarks, interleaving is the // fastest approach. To test this, run `cargo +nightly bench --bench libtest load_8` in the // https://github.com/oconnor663/bao_experiments repo. #[inline(always)] unsafe fn transpose_vecs(vecs: &mut [__m256i; DEGREE]) { // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high is 22/33/66/77. let ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); let ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); let cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); let cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); let ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); let ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); let gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); let gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is 11/33. let abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); let abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); let abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); let abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); let efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); let efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); let efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); let efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); // Interleave 128-bit lanes. 
let (abcdefgh_0, abcdefgh_4) = interleave128(abcd_04, efgh_04); let (abcdefgh_1, abcdefgh_5) = interleave128(abcd_15, efgh_15); let (abcdefgh_2, abcdefgh_6) = interleave128(abcd_26, efgh_26); let (abcdefgh_3, abcdefgh_7) = interleave128(abcd_37, efgh_37); vecs[0] = abcdefgh_0; vecs[1] = abcdefgh_1; vecs[2] = abcdefgh_2; vecs[3] = abcdefgh_3; vecs[4] = abcdefgh_4; vecs[5] = abcdefgh_5; vecs[6] = abcdefgh_6; vecs[7] = abcdefgh_7; } #[inline(always)] unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m256i; 16] { let mut vecs = [ loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[4].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[5].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[6].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[7].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[4].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[5].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[6].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[7].add(block_offset + 1 * 4 * DEGREE)), ]; for i in 0..DEGREE { _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0); } let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE); transpose_vecs(squares.0); transpose_vecs(squares.1); vecs } #[inline(always)] unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m256i, __m256i) { let mask = if increment_counter.yes() { !0 } else { 0 }; ( set8( counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)), counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)), counter_low(counter + (mask & 4)), 
counter_low(counter + (mask & 5)), counter_low(counter + (mask & 6)), counter_low(counter + (mask & 7)), ), set8( counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)), counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)), counter_high(counter + (mask & 4)), counter_high(counter + (mask & 5)), counter_high(counter + (mask & 6)), counter_high(counter + (mask & 7)), ), ) } #[target_feature(enable = "avx2")] pub unsafe fn hash8( inputs: &[*const u8; DEGREE], blocks: usize, key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8; DEGREE * OUT_LEN], ) { let mut h_vecs = [ set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), ]; let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); let mut block_flags = flags | flags_start; for block in 0..blocks { if block + 1 == blocks { block_flags |= flags_end; } let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only let block_flags_vec = set1(block_flags as u32); let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); // The transposed compression function. Note that inlining this // manually here improves compile times by a lot, compared to factoring // it out into its own function and making it #[inline(always)]. Just // guessing, it might have something to do with loop unrolling. 
let mut v = [ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, ]; round(&mut v, &msg_vecs, 0); round(&mut v, &msg_vecs, 1); round(&mut v, &msg_vecs, 2); round(&mut v, &msg_vecs, 3); round(&mut v, &msg_vecs, 4); round(&mut v, &msg_vecs, 5); round(&mut v, &msg_vecs, 6); h_vecs[0] = xor(v[0], v[8]); h_vecs[1] = xor(v[1], v[9]); h_vecs[2] = xor(v[2], v[10]); h_vecs[3] = xor(v[3], v[11]); h_vecs[4] = xor(v[4], v[12]); h_vecs[5] = xor(v[5], v[13]); h_vecs[6] = xor(v[6], v[14]); h_vecs[7] = xor(v[7], v[15]); block_flags = flags; } transpose_vecs(&mut h_vecs); storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); storeu(h_vecs[1], out.as_mut_ptr().add(1 * 4 * DEGREE)); storeu(h_vecs[2], out.as_mut_ptr().add(2 * 4 * DEGREE)); storeu(h_vecs[3], out.as_mut_ptr().add(3 * 4 * DEGREE)); storeu(h_vecs[4], out.as_mut_ptr().add(4 * 4 * DEGREE)); storeu(h_vecs[5], out.as_mut_ptr().add(5 * 4 * DEGREE)); storeu(h_vecs[6], out.as_mut_ptr().add(6 * 4 * DEGREE)); storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); } #[target_feature(enable = "avx2")] pub unsafe fn hash_many( mut inputs: &[&[u8; N]], key: &CVWords, mut counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, mut out: &mut [u8], ) { debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { // Safe because the layout of arrays is guaranteed, and because the // `blocks` count is determined statically from the argument type. 
let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); let blocks = N / BLOCK_LEN; hash8( input_ptrs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, array_mut_ref!(out, 0, DEGREE * OUT_LEN), ); if increment_counter.yes() { counter += DEGREE as u64; } inputs = &inputs[DEGREE..]; out = &mut out[DEGREE * OUT_LEN..]; } crate::sse41::hash_many( inputs, key, counter, increment_counter, flags, flags_start, flags_end, out, ); } #[cfg(test)] mod test { use super::*; #[test] fn test_transpose() { if !crate::platform::avx2_detected() { return; } #[target_feature(enable = "avx2")] unsafe fn transpose_wrapper(vecs: &mut [__m256i; DEGREE]) { transpose_vecs(vecs); } let mut matrix = [[0 as u32; DEGREE]; DEGREE]; for i in 0..DEGREE { for j in 0..DEGREE { matrix[i][j] = (i * DEGREE + j) as u32; } } unsafe { let mut vecs: [__m256i; DEGREE] = core::mem::transmute(matrix); transpose_wrapper(&mut vecs); matrix = core::mem::transmute(vecs); } for i in 0..DEGREE { for j in 0..DEGREE { // Reversed indexes from above. assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); } } } #[test] fn test_hash_many() { if !crate::platform::avx2_detected() { return; } crate::test::test_hash_many_fn(hash_many, hash_many); } } ================================================ FILE: third-party/blake3/src/rust_sse2.rs ================================================ #[cfg(target_arch = "x86")] use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; use crate::{ counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, }; use arrayref::{array_mut_ref, array_ref, mut_array_refs}; pub const DEGREE: usize = 4; #[inline(always)] unsafe fn loadu(src: *const u8) -> __m128i { // This is an unaligned load, so the pointer cast is allowed. 
_mm_loadu_si128(src as *const __m128i) } #[inline(always)] unsafe fn storeu(src: __m128i, dest: *mut u8) { // This is an unaligned store, so the pointer cast is allowed. _mm_storeu_si128(dest as *mut __m128i, src) } #[inline(always)] unsafe fn add(a: __m128i, b: __m128i) -> __m128i { _mm_add_epi32(a, b) } #[inline(always)] unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { _mm_xor_si128(a, b) } #[inline(always)] unsafe fn set1(x: u32) -> __m128i { _mm_set1_epi32(x as i32) } #[inline(always)] unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i { _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32) } // These rotations are the "simple/shifts version". For the // "complicated/shuffles version", see // https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. // For a discussion of the tradeoffs, see // https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug // (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better // on recent x86 chips. 
#[inline(always)] unsafe fn rot16(a: __m128i) -> __m128i { _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16)) } #[inline(always)] unsafe fn rot12(a: __m128i) -> __m128i { _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12)) } #[inline(always)] unsafe fn rot8(a: __m128i) -> __m128i { _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8)) } #[inline(always)] unsafe fn rot7(a: __m128i) -> __m128i { _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7)) } #[inline(always)] unsafe fn g1( row0: &mut __m128i, row1: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i, m: __m128i, ) { *row0 = add(add(*row0, m), *row1); *row3 = xor(*row3, *row0); *row3 = rot16(*row3); *row2 = add(*row2, *row3); *row1 = xor(*row1, *row2); *row1 = rot12(*row1); } #[inline(always)] unsafe fn g2( row0: &mut __m128i, row1: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i, m: __m128i, ) { *row0 = add(add(*row0, m), *row1); *row3 = xor(*row3, *row0); *row3 = rot8(*row3); *row2 = add(*row2, *row3); *row1 = xor(*row1, *row2); *row1 = rot7(*row1); } // Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479. macro_rules! _MM_SHUFFLE { ($z:expr, $y:expr, $x:expr, $w:expr) => { ($z << 6) | ($y << 4) | ($x << 2) | $w }; } macro_rules! shuffle2 { ($a:expr, $b:expr, $c:expr) => { _mm_castps_si128(_mm_shuffle_ps( _mm_castsi128_ps($a), _mm_castsi128_ps($b), $c, )) }; } // Note the optimization here of leaving row1 as the unrotated row, rather than // row0. All the message loads below are adjusted to compensate for this. 
// See discussion at https://github.com/sneves/blake2-avx2/pull/4
/// Rotates the lanes of `row0`, `row3` and `row2` so that the following
/// `g1`/`g2` calls mix diagonals; `row1` is the row left in place.
#[inline(always)]
unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
    *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3));
    *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
    *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1));
}

/// Inverse of `diagonalize`: applies the opposite lane rotations to `row0`
/// and `row2`, and the same (self-inverse) swap to `row3`.
#[inline(always)]
unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
    *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1));
    *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
    *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3));
}

/// Selects 16-bit lanes from `a` or `b`: bit `i` of `imm8` set takes lane `i`
/// from `b`, clear takes it from `a`. Built from compare/and/or operations
/// rather than a dedicated blend instruction.
#[inline(always)]
unsafe fn blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
    // One distinct bit per 16-bit lane, lowest lane = bit 0.
    let bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
    let mut mask = _mm_set1_epi16(imm8 as i16);
    mask = _mm_and_si128(mask, bits);
    // Lanes whose bit is set in imm8 become all-ones.
    mask = _mm_cmpeq_epi16(mask, bits);
    _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a))
}

/// Runs the seven compression rounds over `block` and returns the four state
/// rows without the final feed-forward XOR.
///
/// The rows start as `cv[0..4]`, `cv[4..8]`, the first four `IV` words, and
/// (counter low, counter high, `block_len`, `flags`). Message words are kept
/// in four vectors that are re-permuted between rounds.
#[inline(always)]
unsafe fn compress_pre(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [__m128i; 4] {
    let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8);
    let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8);
    let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]);
    let row3 = &mut set4(
        counter_low(counter),
        counter_high(counter),
        block_len as u32,
        flags as u32,
    );

    let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE));
    let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE));
    let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE));
    let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE));

    let mut t0;
    let mut t1;
    let mut t2;
    let mut t3;
    let mut tt;

    // Round 1. The first round permutes the message words from the original
    // input order, into the groups that get mixed in parallel.
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8
    t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14
    g1(row0, row1, row2, row3, t2);
    t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9
    t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 2. This round and all following rounds apply a fixed permutation
    // to the message words from the round before.
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 3
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 4
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 5
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 6
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);
    m0 = t0;
    m1 = t1;
    m2 = t2;
    m3 = t3;

    // Round 7
    t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
    t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
    g1(row0, row1, row2, row3, t0);
    t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
    tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
    t1 = blend_epi16(tt, t1, 0xCC);
    g2(row0, row1, row2, row3, t1);
    diagonalize(row0, row2, row3);
    t2 = _mm_unpacklo_epi64(m3, m1);
    tt = blend_epi16(t2, m2, 0xC0);
    t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
    g1(row0, row1, row2, row3, t2);
    t3 = _mm_unpackhi_epi32(m1, m3);
    tt = _mm_unpacklo_epi32(m2, t3);
    t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
    g2(row0, row1, row2, row3, t3);
    undiagonalize(row0, row2, row3);

    [*row0, *row1, *row2, *row3]
}

/// Compresses `block` and overwrites `cv` with the new chaining value:
/// `row0 ^ row2` goes to `cv[0..4]` and `row1 ^ row3` to `cv[4..8]`.
/// The caller must ensure SSE2 is available.
#[target_feature(enable = "sse2")]
pub unsafe fn compress_in_place(
    cv: &mut CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) {
    let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags);
    storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8);
    storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8);
}

/// Compresses `block` and returns all 64 output bytes: the first two rows are
/// XORed with the last two, and the last two are XORed with the input `cv`.
/// The caller must ensure SSE2 is available.
#[target_feature(enable = "sse2")]
pub unsafe fn compress_xof(
    cv: &CVWords,
    block: &[u8; BLOCK_LEN],
    block_len: u8,
    counter: u64,
    flags: u8,
) -> [u8; 64] {
    let [mut row0, mut row1, mut row2, mut row3] =
        compress_pre(cv, block, block_len, counter, flags);
    row0 = xor(row0, row2);
    row1 = xor(row1, row3);
    row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8));
    row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8));
    // Reinterprets the four 16-byte rows as one 64-byte array.
    core::mem::transmute([row0, row1, row2, row3])
}

#[inline(always)]
unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) {
    v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]);
    v[1] 
= add(v[1], m[MSG_SCHEDULE[r][2] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); v[0] = add(v[0], v[4]); v[1] = add(v[1], v[5]); v[2] = add(v[2], v[6]); v[3] = add(v[3], v[7]); v[12] = xor(v[12], v[0]); v[13] = xor(v[13], v[1]); v[14] = xor(v[14], v[2]); v[15] = xor(v[15], v[3]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[15] = rot16(v[15]); v[8] = add(v[8], v[12]); v[9] = add(v[9], v[13]); v[10] = add(v[10], v[14]); v[11] = add(v[11], v[15]); v[4] = xor(v[4], v[8]); v[5] = xor(v[5], v[9]); v[6] = xor(v[6], v[10]); v[7] = xor(v[7], v[11]); v[4] = rot12(v[4]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); v[0] = add(v[0], v[4]); v[1] = add(v[1], v[5]); v[2] = add(v[2], v[6]); v[3] = add(v[3], v[7]); v[12] = xor(v[12], v[0]); v[13] = xor(v[13], v[1]); v[14] = xor(v[14], v[2]); v[15] = xor(v[15], v[3]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[15] = rot8(v[15]); v[8] = add(v[8], v[12]); v[9] = add(v[9], v[13]); v[10] = add(v[10], v[14]); v[11] = add(v[11], v[15]); v[4] = xor(v[4], v[8]); v[5] = xor(v[5], v[9]); v[6] = xor(v[6], v[10]); v[7] = xor(v[7], v[11]); v[4] = rot7(v[4]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); v[0] = add(v[0], v[5]); v[1] = add(v[1], v[6]); v[2] = add(v[2], v[7]); v[3] = add(v[3], v[4]); v[15] = xor(v[15], v[0]); v[12] = xor(v[12], v[1]); v[13] = xor(v[13], v[2]); v[14] = xor(v[14], v[3]); v[15] = rot16(v[15]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[10] = add(v[10], v[15]); 
v[11] = add(v[11], v[12]); v[8] = add(v[8], v[13]); v[9] = add(v[9], v[14]); v[5] = xor(v[5], v[10]); v[6] = xor(v[6], v[11]); v[7] = xor(v[7], v[8]); v[4] = xor(v[4], v[9]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[4] = rot12(v[4]); v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); v[0] = add(v[0], v[5]); v[1] = add(v[1], v[6]); v[2] = add(v[2], v[7]); v[3] = add(v[3], v[4]); v[15] = xor(v[15], v[0]); v[12] = xor(v[12], v[1]); v[13] = xor(v[13], v[2]); v[14] = xor(v[14], v[3]); v[15] = rot8(v[15]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[10] = add(v[10], v[15]); v[11] = add(v[11], v[12]); v[8] = add(v[8], v[13]); v[9] = add(v[9], v[14]); v[5] = xor(v[5], v[10]); v[6] = xor(v[6], v[11]); v[7] = xor(v[7], v[8]); v[4] = xor(v[4], v[9]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[4] = rot7(v[4]); } #[inline(always)] unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) { // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is // 22/33. Note that this doesn't split the vector into two lanes, as the // AVX2 counterparts do. let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); // Interleave 64-bit lanes. 
let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); vecs[0] = abcd_0; vecs[1] = abcd_1; vecs[2] = abcd_2; vecs[3] = abcd_3; } #[inline(always)] unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] { let mut vecs = [ loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), ]; for i in 0..DEGREE { _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0); } let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE); transpose_vecs(squares.0); transpose_vecs(squares.1); transpose_vecs(squares.2); transpose_vecs(squares.3); vecs } #[inline(always)] unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { let mask = if increment_counter.yes() { !0 } else { 0 }; ( set4( counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)), counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)), ), set4( counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)), counter_high(counter + (mask & 2)), counter_high(counter + 
(mask & 3)), ), ) } #[target_feature(enable = "sse2")] pub unsafe fn hash4( inputs: &[*const u8; DEGREE], blocks: usize, key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8; DEGREE * OUT_LEN], ) { let mut h_vecs = [ set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), ]; let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); let mut block_flags = flags | flags_start; for block in 0..blocks { if block + 1 == blocks { block_flags |= flags_end; } let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only let block_flags_vec = set1(block_flags as u32); let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); // The transposed compression function. Note that inlining this // manually here improves compile times by a lot, compared to factoring // it out into its own function and making it #[inline(always)]. Just // guessing, it might have something to do with loop unrolling. let mut v = [ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, ]; round(&mut v, &msg_vecs, 0); round(&mut v, &msg_vecs, 1); round(&mut v, &msg_vecs, 2); round(&mut v, &msg_vecs, 3); round(&mut v, &msg_vecs, 4); round(&mut v, &msg_vecs, 5); round(&mut v, &msg_vecs, 6); h_vecs[0] = xor(v[0], v[8]); h_vecs[1] = xor(v[1], v[9]); h_vecs[2] = xor(v[2], v[10]); h_vecs[3] = xor(v[3], v[11]); h_vecs[4] = xor(v[4], v[12]); h_vecs[5] = xor(v[5], v[13]); h_vecs[6] = xor(v[6], v[14]); h_vecs[7] = xor(v[7], v[15]); block_flags = flags; } let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE); transpose_vecs(squares.0); transpose_vecs(squares.1); // The first four vecs now contain the first half of each output, and the // second four vecs contain the second half of each output. 
storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); } #[target_feature(enable = "sse2")] unsafe fn hash1( input: &[u8; N], key: &CVWords, counter: u64, flags: u8, flags_start: u8, flags_end: u8, out: &mut CVBytes, ) { debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks"); let mut cv = *key; let mut block_flags = flags | flags_start; let mut slice = &input[..]; while slice.len() >= BLOCK_LEN { if slice.len() == BLOCK_LEN { block_flags |= flags_end; } compress_in_place( &mut cv, array_ref!(slice, 0, BLOCK_LEN), BLOCK_LEN as u8, counter, block_flags, ); block_flags = flags; slice = &slice[BLOCK_LEN..]; } *out = core::mem::transmute(cv); // x86 is little-endian } #[target_feature(enable = "sse2")] pub unsafe fn hash_many( mut inputs: &[&[u8; N]], key: &CVWords, mut counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, mut out: &mut [u8], ) { debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { // Safe because the layout of arrays is guaranteed, and because the // `blocks` count is determined statically from the argument type. 
let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); let blocks = N / BLOCK_LEN; hash4( input_ptrs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, array_mut_ref!(out, 0, DEGREE * OUT_LEN), ); if increment_counter.yes() { counter += DEGREE as u64; } inputs = &inputs[DEGREE..]; out = &mut out[DEGREE * OUT_LEN..]; } for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { hash1( input, key, counter, flags, flags_start, flags_end, array_mut_ref!(output, 0, OUT_LEN), ); if increment_counter.yes() { counter += 1; } } } #[cfg(test)] mod test { use super::*; #[test] fn test_transpose() { if !crate::platform::sse2_detected() { return; } #[target_feature(enable = "sse2")] unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { transpose_vecs(vecs); } let mut matrix = [[0 as u32; DEGREE]; DEGREE]; for i in 0..DEGREE { for j in 0..DEGREE { matrix[i][j] = (i * DEGREE + j) as u32; } } unsafe { let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); transpose_wrapper(&mut vecs); matrix = core::mem::transmute(vecs); } for i in 0..DEGREE { for j in 0..DEGREE { // Reversed indexes from above. 
assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); } } } #[test] fn test_compress() { if !crate::platform::sse2_detected() { return; } crate::test::test_compress_fn(compress_in_place, compress_xof); } #[test] fn test_hash_many() { if !crate::platform::sse2_detected() { return; } crate::test::test_hash_many_fn(hash_many, hash_many); } } ================================================ FILE: third-party/blake3/src/rust_sse41.rs ================================================ #[cfg(target_arch = "x86")] use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; use crate::{ counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, }; use arrayref::{array_mut_ref, array_ref, mut_array_refs}; pub const DEGREE: usize = 4; #[inline(always)] unsafe fn loadu(src: *const u8) -> __m128i { // This is an unaligned load, so the pointer cast is allowed. _mm_loadu_si128(src as *const __m128i) } #[inline(always)] unsafe fn storeu(src: __m128i, dest: *mut u8) { // This is an unaligned store, so the pointer cast is allowed. _mm_storeu_si128(dest as *mut __m128i, src) } #[inline(always)] unsafe fn add(a: __m128i, b: __m128i) -> __m128i { _mm_add_epi32(a, b) } #[inline(always)] unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { _mm_xor_si128(a, b) } #[inline(always)] unsafe fn set1(x: u32) -> __m128i { _mm_set1_epi32(x as i32) } #[inline(always)] unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i { _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32) } // These rotations are the "simple/shifts version". For the // "complicated/shuffles version", see // https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. // For a discussion of the tradeoffs, see // https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug // (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better // on recent x86 chips. 
#[inline(always)] unsafe fn rot16(a: __m128i) -> __m128i { _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16)) } #[inline(always)] unsafe fn rot12(a: __m128i) -> __m128i { _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12)) } #[inline(always)] unsafe fn rot8(a: __m128i) -> __m128i { _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8)) } #[inline(always)] unsafe fn rot7(a: __m128i) -> __m128i { _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7)) } #[inline(always)] unsafe fn g1( row0: &mut __m128i, row1: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i, m: __m128i, ) { *row0 = add(add(*row0, m), *row1); *row3 = xor(*row3, *row0); *row3 = rot16(*row3); *row2 = add(*row2, *row3); *row1 = xor(*row1, *row2); *row1 = rot12(*row1); } #[inline(always)] unsafe fn g2( row0: &mut __m128i, row1: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i, m: __m128i, ) { *row0 = add(add(*row0, m), *row1); *row3 = xor(*row3, *row0); *row3 = rot8(*row3); *row2 = add(*row2, *row3); *row1 = xor(*row1, *row2); *row1 = rot7(*row1); } // Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479. macro_rules! _MM_SHUFFLE { ($z:expr, $y:expr, $x:expr, $w:expr) => { ($z << 6) | ($y << 4) | ($x << 2) | $w }; } macro_rules! shuffle2 { ($a:expr, $b:expr, $c:expr) => { _mm_castps_si128(_mm_shuffle_ps( _mm_castsi128_ps($a), _mm_castsi128_ps($b), $c, )) }; } // Note the optimization here of leaving row1 as the unrotated row, rather than // row0. All the message loads below are adjusted to compensate for this. 
See // discussion at https://github.com/sneves/blake2-avx2/pull/4 #[inline(always)] unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3)); *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1)); } #[inline(always)] unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1)); *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3)); } #[inline(always)] unsafe fn compress_pre( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [__m128i; 4] { let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8); let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8); let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); let row3 = &mut set4( counter_low(counter), counter_high(counter), block_len as u32, flags as u32, ); let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE)); let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE)); let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE)); let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE)); let mut t0; let mut t1; let mut t2; let mut t3; let mut tt; // Round 1. The first round permutes the message words from the original // input order, into the groups that get mixed in parallel. 
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0 g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1 g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14 g1(row0, row1, row2, row3, t2); t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15 g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 2. This round and all following rounds apply a fixed permutation // to the message words from the round before. t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 3 t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = 
_mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 4 t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 5 t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 6 t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = 
_mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 7 t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); g1(row0, row1, row2, row3, t0); t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); t1 = _mm_blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = _mm_unpacklo_epi64(m3, m1); tt = _mm_blend_epi16(t2, m2, 0xC0); t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); g1(row0, row1, row2, row3, t2); t3 = _mm_unpackhi_epi32(m1, m3); tt = _mm_unpacklo_epi32(m2, t3); t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); [*row0, *row1, *row2, *row3] } #[target_feature(enable = "sse4.1")] pub unsafe fn compress_in_place( cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) { let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags); storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); } #[target_feature(enable = "sse4.1")] pub unsafe fn compress_xof( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [u8; 64] { let [mut row0, mut row1, mut row2, mut row3] = compress_pre(cv, block, block_len, counter, flags); row0 = xor(row0, row2); row1 = xor(row1, row3); row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8)); row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8)); core::mem::transmute([row0, row1, row2, row3]) } #[inline(always)] unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 
16], r: usize) { v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); v[0] = add(v[0], v[4]); v[1] = add(v[1], v[5]); v[2] = add(v[2], v[6]); v[3] = add(v[3], v[7]); v[12] = xor(v[12], v[0]); v[13] = xor(v[13], v[1]); v[14] = xor(v[14], v[2]); v[15] = xor(v[15], v[3]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[15] = rot16(v[15]); v[8] = add(v[8], v[12]); v[9] = add(v[9], v[13]); v[10] = add(v[10], v[14]); v[11] = add(v[11], v[15]); v[4] = xor(v[4], v[8]); v[5] = xor(v[5], v[9]); v[6] = xor(v[6], v[10]); v[7] = xor(v[7], v[11]); v[4] = rot12(v[4]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); v[0] = add(v[0], v[4]); v[1] = add(v[1], v[5]); v[2] = add(v[2], v[6]); v[3] = add(v[3], v[7]); v[12] = xor(v[12], v[0]); v[13] = xor(v[13], v[1]); v[14] = xor(v[14], v[2]); v[15] = xor(v[15], v[3]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[15] = rot8(v[15]); v[8] = add(v[8], v[12]); v[9] = add(v[9], v[13]); v[10] = add(v[10], v[14]); v[11] = add(v[11], v[15]); v[4] = xor(v[4], v[8]); v[5] = xor(v[5], v[9]); v[6] = xor(v[6], v[10]); v[7] = xor(v[7], v[11]); v[4] = rot7(v[4]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); v[0] = add(v[0], v[5]); v[1] = add(v[1], v[6]); v[2] = add(v[2], v[7]); v[3] = add(v[3], v[4]); v[15] = xor(v[15], v[0]); v[12] = xor(v[12], v[1]); v[13] = xor(v[13], v[2]); v[14] = xor(v[14], v[3]); v[15] = rot16(v[15]); v[12] = rot16(v[12]); 
v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[10] = add(v[10], v[15]); v[11] = add(v[11], v[12]); v[8] = add(v[8], v[13]); v[9] = add(v[9], v[14]); v[5] = xor(v[5], v[10]); v[6] = xor(v[6], v[11]); v[7] = xor(v[7], v[8]); v[4] = xor(v[4], v[9]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[4] = rot12(v[4]); v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); v[0] = add(v[0], v[5]); v[1] = add(v[1], v[6]); v[2] = add(v[2], v[7]); v[3] = add(v[3], v[4]); v[15] = xor(v[15], v[0]); v[12] = xor(v[12], v[1]); v[13] = xor(v[13], v[2]); v[14] = xor(v[14], v[3]); v[15] = rot8(v[15]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[10] = add(v[10], v[15]); v[11] = add(v[11], v[12]); v[8] = add(v[8], v[13]); v[9] = add(v[9], v[14]); v[5] = xor(v[5], v[10]); v[6] = xor(v[6], v[11]); v[7] = xor(v[7], v[8]); v[4] = xor(v[4], v[9]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[4] = rot7(v[4]); } #[inline(always)] unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) { // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is // 22/33. Note that this doesn't split the vector into two lanes, as the // AVX2 counterparts do. let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); // Interleave 64-bit lanes. 
let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); vecs[0] = abcd_0; vecs[1] = abcd_1; vecs[2] = abcd_2; vecs[3] = abcd_3; } #[inline(always)] unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] { let mut vecs = [ loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), ]; for i in 0..DEGREE { _mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0); } let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE); transpose_vecs(squares.0); transpose_vecs(squares.1); transpose_vecs(squares.2); transpose_vecs(squares.3); vecs } #[inline(always)] unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { let mask = if increment_counter.yes() { !0 } else { 0 }; ( set4( counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)), counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)), ), set4( counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)), counter_high(counter + (mask & 2)), counter_high(counter + 
(mask & 3)), ), ) } #[target_feature(enable = "sse4.1")] pub unsafe fn hash4( inputs: &[*const u8; DEGREE], blocks: usize, key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8; DEGREE * OUT_LEN], ) { let mut h_vecs = [ set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), ]; let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); let mut block_flags = flags | flags_start; for block in 0..blocks { if block + 1 == blocks { block_flags |= flags_end; } let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only let block_flags_vec = set1(block_flags as u32); let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); // The transposed compression function. Note that inlining this // manually here improves compile times by a lot, compared to factoring // it out into its own function and making it #[inline(always)]. Just // guessing, it might have something to do with loop unrolling. let mut v = [ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, ]; round(&mut v, &msg_vecs, 0); round(&mut v, &msg_vecs, 1); round(&mut v, &msg_vecs, 2); round(&mut v, &msg_vecs, 3); round(&mut v, &msg_vecs, 4); round(&mut v, &msg_vecs, 5); round(&mut v, &msg_vecs, 6); h_vecs[0] = xor(v[0], v[8]); h_vecs[1] = xor(v[1], v[9]); h_vecs[2] = xor(v[2], v[10]); h_vecs[3] = xor(v[3], v[11]); h_vecs[4] = xor(v[4], v[12]); h_vecs[5] = xor(v[5], v[13]); h_vecs[6] = xor(v[6], v[14]); h_vecs[7] = xor(v[7], v[15]); block_flags = flags; } let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE); transpose_vecs(squares.0); transpose_vecs(squares.1); // The first four vecs now contain the first half of each output, and the // second four vecs contain the second half of each output. 
storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); } #[target_feature(enable = "sse4.1")] unsafe fn hash1( input: &[u8; N], key: &CVWords, counter: u64, flags: u8, flags_start: u8, flags_end: u8, out: &mut CVBytes, ) { debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks"); let mut cv = *key; let mut block_flags = flags | flags_start; let mut slice = &input[..]; while slice.len() >= BLOCK_LEN { if slice.len() == BLOCK_LEN { block_flags |= flags_end; } compress_in_place( &mut cv, array_ref!(slice, 0, BLOCK_LEN), BLOCK_LEN as u8, counter, block_flags, ); block_flags = flags; slice = &slice[BLOCK_LEN..]; } *out = core::mem::transmute(cv); // x86 is little-endian } #[target_feature(enable = "sse4.1")] pub unsafe fn hash_many( mut inputs: &[&[u8; N]], key: &CVWords, mut counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, mut out: &mut [u8], ) { debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { // Safe because the layout of arrays is guaranteed, and because the // `blocks` count is determined statically from the argument type. 
let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); let blocks = N / BLOCK_LEN; hash4( input_ptrs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, array_mut_ref!(out, 0, DEGREE * OUT_LEN), ); if increment_counter.yes() { counter += DEGREE as u64; } inputs = &inputs[DEGREE..]; out = &mut out[DEGREE * OUT_LEN..]; } for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { hash1( input, key, counter, flags, flags_start, flags_end, array_mut_ref!(output, 0, OUT_LEN), ); if increment_counter.yes() { counter += 1; } } } #[cfg(test)] mod test { use super::*; #[test] fn test_transpose() { if !crate::platform::sse41_detected() { return; } #[target_feature(enable = "sse4.1")] unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { transpose_vecs(vecs); } let mut matrix = [[0 as u32; DEGREE]; DEGREE]; for i in 0..DEGREE { for j in 0..DEGREE { matrix[i][j] = (i * DEGREE + j) as u32; } } unsafe { let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); transpose_wrapper(&mut vecs); matrix = core::mem::transmute(vecs); } for i in 0..DEGREE { for j in 0..DEGREE { // Reversed indexes from above. assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); } } } #[test] fn test_compress() { if !crate::platform::sse41_detected() { return; } crate::test::test_compress_fn(compress_in_place, compress_xof); } #[test] fn test_hash_many() { if !crate::platform::sse41_detected() { return; } crate::test::test_hash_many_fn(hash_many, hash_many); } } ================================================ FILE: third-party/blake3/src/test.rs ================================================ use crate::{CVBytes, CVWords, IncrementCounter, BLOCK_LEN, CHUNK_LEN, OUT_LEN}; use arrayref::array_ref; use arrayvec::ArrayVec; use core::usize; use rand::prelude::*; // Interesting input lengths to run tests on. 
pub const TEST_CASES: &[usize] = &[ 0, 1, 2, 3, 4, 5, 6, 7, 8, BLOCK_LEN - 1, BLOCK_LEN, BLOCK_LEN + 1, 2 * BLOCK_LEN - 1, 2 * BLOCK_LEN, 2 * BLOCK_LEN + 1, CHUNK_LEN - 1, CHUNK_LEN, CHUNK_LEN + 1, 2 * CHUNK_LEN, 2 * CHUNK_LEN + 1, 3 * CHUNK_LEN, 3 * CHUNK_LEN + 1, 4 * CHUNK_LEN, 4 * CHUNK_LEN + 1, 5 * CHUNK_LEN, 5 * CHUNK_LEN + 1, 6 * CHUNK_LEN, 6 * CHUNK_LEN + 1, 7 * CHUNK_LEN, 7 * CHUNK_LEN + 1, 8 * CHUNK_LEN, 8 * CHUNK_LEN + 1, 16 * CHUNK_LEN - 1, 16 * CHUNK_LEN, // AVX512's bandwidth 16 * CHUNK_LEN + 1, 31 * CHUNK_LEN - 1, 31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1 31 * CHUNK_LEN + 1, 100 * CHUNK_LEN, // subtrees larger than MAX_SIMD_DEGREE chunks ]; pub const TEST_CASES_MAX: usize = 100 * CHUNK_LEN; // There's a test to make sure these two are equal below. pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend"; pub const TEST_KEY_WORDS: CVWords = [ 1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521, ]; // Paint the input with a repeating byte pattern. We use a cycle length of 251, // because that's the largest prime number less than 256. This makes it // unlikely to swapping any two adjacent input blocks or chunks will give the // same answer. pub fn paint_test_input(buf: &mut [u8]) { for (i, b) in buf.iter_mut().enumerate() { *b = (i % 251) as u8; } } type CompressInPlaceFn = unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8); type CompressXofFn = unsafe fn( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [u8; 64]; // A shared helper function for platform-specific tests. pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) { let initial_state = TEST_KEY_WORDS; let block_len: u8 = 61; let mut block = [0; BLOCK_LEN]; paint_test_input(&mut block[..block_len as usize]); // Use a counter with set bits in both 32-bit words. 
let counter = (5u64 << 32) + 6; let flags = crate::CHUNK_END | crate::ROOT | crate::KEYED_HASH; let portable_out = crate::portable::compress_xof(&initial_state, &block, block_len, counter as u64, flags); let mut test_state = initial_state; unsafe { compress_in_place_fn(&mut test_state, &block, block_len, counter as u64, flags) }; let test_state_bytes = crate::platform::le_bytes_from_words_32(&test_state); let test_xof = unsafe { compress_xof_fn(&initial_state, &block, block_len, counter as u64, flags) }; assert_eq!(&portable_out[..32], &test_state_bytes[..]); assert_eq!(&portable_out[..], &test_xof[..]); } type HashManyFn = unsafe fn( inputs: &[&A], key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8], ); // A shared helper function for platform-specific tests. pub fn test_hash_many_fn( hash_many_chunks_fn: HashManyFn<[u8; CHUNK_LEN]>, hash_many_parents_fn: HashManyFn<[u8; 2 * OUT_LEN]>, ) { // Test a few different initial counter values. // - 0: The base case. // - u32::MAX: The low word of the counter overflows for all inputs except the first. // - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR // when you're supposed to ANDNOT... let initial_counters = [0, u32::MAX as u64, i32::MAX as u64]; for counter in initial_counters { #[cfg(feature = "std")] dbg!(counter); // 31 (16 + 8 + 4 + 2 + 1) inputs const NUM_INPUTS: usize = 31; let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS]; crate::test::paint_test_input(&mut input_buf); // First hash chunks. 
let mut chunks = ArrayVec::<&[u8; CHUNK_LEN], NUM_INPUTS>::new(); for i in 0..NUM_INPUTS { chunks.push(array_ref!(input_buf, i * CHUNK_LEN, CHUNK_LEN)); } let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN]; crate::portable::hash_many( &chunks, &TEST_KEY_WORDS, counter, IncrementCounter::Yes, crate::KEYED_HASH, crate::CHUNK_START, crate::CHUNK_END, &mut portable_chunks_out, ); let mut test_chunks_out = [0; NUM_INPUTS * OUT_LEN]; unsafe { hash_many_chunks_fn( &chunks[..], &TEST_KEY_WORDS, counter, IncrementCounter::Yes, crate::KEYED_HASH, crate::CHUNK_START, crate::CHUNK_END, &mut test_chunks_out, ); } for n in 0..NUM_INPUTS { #[cfg(feature = "std")] dbg!(n); assert_eq!( &portable_chunks_out[n * OUT_LEN..][..OUT_LEN], &test_chunks_out[n * OUT_LEN..][..OUT_LEN] ); } // Then hash parents. let mut parents = ArrayVec::<&[u8; 2 * OUT_LEN], NUM_INPUTS>::new(); for i in 0..NUM_INPUTS { parents.push(array_ref!(input_buf, i * 2 * OUT_LEN, 2 * OUT_LEN)); } let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN]; crate::portable::hash_many( &parents, &TEST_KEY_WORDS, counter, IncrementCounter::No, crate::KEYED_HASH | crate::PARENT, 0, 0, &mut portable_parents_out, ); let mut test_parents_out = [0; NUM_INPUTS * OUT_LEN]; unsafe { hash_many_parents_fn( &parents[..], &TEST_KEY_WORDS, counter, IncrementCounter::No, crate::KEYED_HASH | crate::PARENT, 0, 0, &mut test_parents_out, ); } for n in 0..NUM_INPUTS { #[cfg(feature = "std")] dbg!(n); assert_eq!( &portable_parents_out[n * OUT_LEN..][..OUT_LEN], &test_parents_out[n * OUT_LEN..][..OUT_LEN] ); } } } #[allow(unused)] type XofManyFunction = unsafe fn( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, out: &mut [u8], ); // A shared helper function for platform-specific tests. 
#[allow(unused)] pub fn test_xof_many_fn(xof_many_function: XofManyFunction) { let mut block = [0; BLOCK_LEN]; let block_len = 42; crate::test::paint_test_input(&mut block[..block_len]); let cv = [40, 41, 42, 43, 44, 45, 46, 47]; let flags = crate::KEYED_HASH; // Test a few different initial counter values. // - 0: The base case. // - u32::MAX: The low word of the counter overflows for all inputs except the first. // - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR // when you're supposed to ANDNOT... let initial_counters = [0, u32::MAX as u64, i32::MAX as u64]; for counter in initial_counters { #[cfg(feature = "std")] dbg!(counter); // 31 (16 + 8 + 4 + 2 + 1) outputs const OUTPUT_SIZE: usize = 31 * BLOCK_LEN; let mut portable_out = [0u8; OUTPUT_SIZE]; for (i, out_block) in portable_out.chunks_exact_mut(64).enumerate() { out_block.copy_from_slice(&crate::portable::compress_xof( &cv, &block, block_len as u8, counter + i as u64, flags, )); } let mut test_out = [0u8; OUTPUT_SIZE]; unsafe { xof_many_function(&cv, &block, block_len as u8, counter, flags, &mut test_out); } assert_eq!(portable_out, test_out); } // Test that xof_many doesn't write more blocks than requested. Note that the current assembly // implementation always outputs at least one block, so we don't test the zero case. 
for block_count in 1..=32 { let mut array = [0; BLOCK_LEN * 33]; let output_start = 17; let output_len = block_count * BLOCK_LEN; let output_end = output_start + output_len; let output = &mut array[output_start..output_end]; unsafe { xof_many_function(&cv, &block, block_len as u8, 0, flags, output); } for i in 0..array.len() { if i < output_start || output_end <= i { assert_eq!(0, array[i], "index {i}"); } } } } #[test] fn test_key_bytes_equal_key_words() { assert_eq!( TEST_KEY_WORDS, crate::platform::words_from_le_bytes_32(&TEST_KEY), ); } #[test] fn test_reference_impl_size() { // Because the Rust compiler optimizes struct layout, it's possible that // some future version of the compiler will produce a different size. If // that happens, we can either disable this test, or test for multiple // expected values. For now, the purpose of this test is to make sure we // notice if that happens. assert_eq!(1880, core::mem::size_of::()); } #[test] fn test_counter_words() { let counter: u64 = (1 << 32) + 2; assert_eq!(crate::counter_low(counter), 2); assert_eq!(crate::counter_high(counter), 1); } #[test] fn test_largest_power_of_two_leq() { let input_output = &[ // The zero case is nonsensical, but it does work. 
(0, 1), (1, 1), (2, 2), (3, 2), (4, 4), (5, 4), (6, 4), (7, 4), (8, 8), // the largest possible usize (usize::MAX, (usize::MAX >> 1) + 1), ]; for &(input, output) in input_output { assert_eq!( output, crate::largest_power_of_two_leq(input), "wrong output for n={}", input ); } } #[test] fn test_compare_reference_impl() { const OUT: usize = 303; // more than 64, not a multiple of 4 let mut input_buf = [0; TEST_CASES_MAX]; paint_test_input(&mut input_buf); for &case in TEST_CASES { let input = &input_buf[..case]; #[cfg(feature = "std")] dbg!(case); // regular { let mut reference_hasher = reference_impl::Hasher::new(); reference_hasher.update(input); let mut expected_out = [0; OUT]; reference_hasher.finalize(&mut expected_out); // all at once let test_out = crate::hash(input); assert_eq!(test_out, *array_ref!(expected_out, 0, 32)); // incremental let mut hasher = crate::Hasher::new(); hasher.update(input); assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); assert_eq!(hasher.finalize(), test_out); // incremental (rayon) #[cfg(feature = "rayon")] { let mut hasher = crate::Hasher::new(); hasher.update_rayon(input); assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); assert_eq!(hasher.finalize(), test_out); } // xof let mut extended = [0; OUT]; hasher.finalize_xof().fill(&mut extended); assert_eq!(extended, expected_out); } // keyed { let mut reference_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); reference_hasher.update(input); let mut expected_out = [0; OUT]; reference_hasher.finalize(&mut expected_out); // all at once let test_out = crate::keyed_hash(&TEST_KEY, input); assert_eq!(test_out, *array_ref!(expected_out, 0, 32)); // incremental let mut hasher = crate::Hasher::new_keyed(&TEST_KEY); hasher.update(input); assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); assert_eq!(hasher.finalize(), test_out); // incremental (rayon) #[cfg(feature = "rayon")] { let mut hasher = crate::Hasher::new_keyed(&TEST_KEY); 
hasher.update_rayon(input); assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); assert_eq!(hasher.finalize(), test_out); } // xof let mut extended = [0; OUT]; hasher.finalize_xof().fill(&mut extended); assert_eq!(extended, expected_out); } // derive_key { let context = "BLAKE3 2019-12-27 16:13:59 example context (not the test vector one)"; let mut reference_hasher = reference_impl::Hasher::new_derive_key(context); reference_hasher.update(input); let mut expected_out = [0; OUT]; reference_hasher.finalize(&mut expected_out); // all at once let test_out = crate::derive_key(context, input); assert_eq!(test_out, expected_out[..32]); // incremental let mut hasher = crate::Hasher::new_derive_key(context); hasher.update(input); assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32)); // incremental (rayon) #[cfg(feature = "rayon")] { let mut hasher = crate::Hasher::new_derive_key(context); hasher.update_rayon(input); assert_eq!(hasher.finalize(), *array_ref!(expected_out, 0, 32)); assert_eq!(hasher.finalize(), *array_ref!(test_out, 0, 32)); } // xof let mut extended = [0; OUT]; hasher.finalize_xof().fill(&mut extended); assert_eq!(extended, expected_out); } } } #[test] fn test_compare_reference_impl_long_xof() { let mut reference_output = [0u8; 32 * BLOCK_LEN - 1]; let mut reference_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); reference_hasher.update(b"hello world"); reference_hasher.finalize(&mut reference_output); let mut test_output = [0u8; 32 * BLOCK_LEN - 1]; let mut test_hasher = crate::Hasher::new_keyed(&TEST_KEY); test_hasher.update(b"hello world"); test_hasher.finalize_xof().fill(&mut test_output); assert_eq!(reference_output, test_output); } #[test] fn test_xof_partial_blocks() { const OUT_LEN: usize = 6 * BLOCK_LEN; let mut reference_out = [0u8; OUT_LEN]; reference_impl::Hasher::new().finalize(&mut reference_out); let mut all_at_once_out = [0u8; OUT_LEN]; 
crate::Hasher::new() .finalize_xof() .fill(&mut all_at_once_out); assert_eq!(reference_out, all_at_once_out); let mut partial_out = [0u8; OUT_LEN]; let partial_start = 32; let partial_end = OUT_LEN - 32; let mut xof = crate::Hasher::new().finalize_xof(); xof.fill(&mut partial_out[..partial_start]); xof.fill(&mut partial_out[partial_start..partial_end]); xof.fill(&mut partial_out[partial_end..]); assert_eq!(reference_out, partial_out); } fn reference_hash(input: &[u8]) -> crate::Hash { let mut hasher = reference_impl::Hasher::new(); hasher.update(input); let mut bytes = [0; 32]; hasher.finalize(&mut bytes); bytes.into() } #[test] fn test_compare_update_multiple() { // Don't use all the long test cases here, since that's unnecessarily slow // in debug mode. let mut short_test_cases = TEST_CASES; while *short_test_cases.last().unwrap() > 4 * CHUNK_LEN { short_test_cases = &short_test_cases[..short_test_cases.len() - 1]; } assert_eq!(*short_test_cases.last().unwrap(), 4 * CHUNK_LEN); let mut input_buf = [0; 2 * TEST_CASES_MAX]; paint_test_input(&mut input_buf); for &first_update in short_test_cases { #[cfg(feature = "std")] dbg!(first_update); let first_input = &input_buf[..first_update]; let mut test_hasher = crate::Hasher::new(); test_hasher.update(first_input); for &second_update in short_test_cases { #[cfg(feature = "std")] dbg!(second_update); let second_input = &input_buf[first_update..][..second_update]; let total_input = &input_buf[..first_update + second_update]; // Clone the hasher with first_update bytes already written, so // that the next iteration can reuse it. 
let mut test_hasher = test_hasher.clone(); test_hasher.update(second_input); let expected = reference_hash(total_input); assert_eq!(expected, test_hasher.finalize()); } } } #[test] fn test_fuzz_hasher() { const INPUT_MAX: usize = 4 * CHUNK_LEN; let mut input_buf = [0; 3 * INPUT_MAX]; paint_test_input(&mut input_buf); // Don't do too many iterations in debug mode, to keep the tests under a // second or so. CI should run tests in release mode also. Provide an // environment variable for specifying a larger number of fuzz iterations. let num_tests = if cfg!(debug_assertions) { 100 } else { 10_000 }; // Use a fixed RNG seed for reproducibility. let mut rng = rand_chacha::ChaCha8Rng::from_seed([1; 32]); for _num_test in 0..num_tests { #[cfg(feature = "std")] dbg!(_num_test); let mut hasher = crate::Hasher::new(); let mut total_input = 0; // For each test, write 3 inputs of random length. for _ in 0..3 { let input_len = rng.random_range(0..(INPUT_MAX + 1)); #[cfg(feature = "std")] dbg!(input_len); let input = &input_buf[total_input..][..input_len]; hasher.update(input); total_input += input_len; } let expected = reference_hash(&input_buf[..total_input]); assert_eq!(expected, hasher.finalize()); } } #[test] fn test_fuzz_xof() { let mut input_buf = [0u8; 3 * BLOCK_LEN]; paint_test_input(&mut input_buf); // Don't do too many iterations in debug mode, to keep the tests under a // second or so. CI should run tests in release mode also. Provide an // environment variable for specifying a larger number of fuzz iterations. let num_tests = if cfg!(debug_assertions) { 100 } else { 2500 }; // Use a fixed RNG seed for reproducibility. 
let mut rng = rand_chacha::ChaCha8Rng::from_seed([1; 32]); for _num_test in 0..num_tests { #[cfg(feature = "std")] dbg!(_num_test); // 31 (16 + 8 + 4 + 2 + 1) outputs let mut output_buf = [0; 31 * CHUNK_LEN]; let input_len = rng.random_range(0..input_buf.len()); let mut xof = crate::Hasher::new() .update(&input_buf[..input_len]) .finalize_xof(); let partial_start = rng.random_range(0..output_buf.len()); let partial_end = rng.random_range(partial_start..output_buf.len()); xof.fill(&mut output_buf[..partial_start]); xof.fill(&mut output_buf[partial_start..partial_end]); xof.fill(&mut output_buf[partial_end..]); let mut reference_buf = [0; 31 * CHUNK_LEN]; let mut reference_hasher = reference_impl::Hasher::new(); reference_hasher.update(&input_buf[..input_len]); reference_hasher.finalize(&mut reference_buf); assert_eq!(reference_buf, output_buf); } } #[test] fn test_xof_seek() { let mut out = [0; 533]; let mut hasher = crate::Hasher::new(); hasher.update(b"foo"); hasher.finalize_xof().fill(&mut out); assert_eq!(hasher.finalize().as_bytes(), &out[0..32]); let mut reader = hasher.finalize_xof(); reader.set_position(303); let mut out2 = [0; 102]; reader.fill(&mut out2); assert_eq!(&out[303..][..102], &out2[..]); #[cfg(feature = "std")] { use std::io::prelude::*; let mut reader = hasher.finalize_xof(); reader.seek(std::io::SeekFrom::Start(303)).unwrap(); let mut out3 = Vec::new(); reader.by_ref().take(102).read_to_end(&mut out3).unwrap(); assert_eq!(&out[303..][..102], &out3[..]); assert_eq!( reader.seek(std::io::SeekFrom::Current(0)).unwrap(), 303 + 102 ); reader.seek(std::io::SeekFrom::Current(-5)).unwrap(); assert_eq!( reader.seek(std::io::SeekFrom::Current(0)).unwrap(), 303 + 102 - 5 ); let mut out4 = [0; 17]; assert_eq!(reader.read(&mut out4).unwrap(), 17); assert_eq!(&out[303 + 102 - 5..][..17], &out4[..]); assert_eq!( reader.seek(std::io::SeekFrom::Current(0)).unwrap(), 303 + 102 - 5 + 17 ); assert!(reader.seek(std::io::SeekFrom::End(0)).is_err()); 
assert!(reader.seek(std::io::SeekFrom::Current(-1000)).is_err()); } } #[test] fn test_msg_schedule_permutation() { let permutation = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]; let mut generated = [[0; 16]; 7]; generated[0] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; for round in 1..7 { for i in 0..16 { generated[round][i] = generated[round - 1][permutation[i]]; } } assert_eq!(generated, crate::MSG_SCHEDULE); } #[test] fn test_reset() { let mut hasher = crate::Hasher::new(); hasher.update(&[42; 3 * CHUNK_LEN + 7]); hasher.reset(); hasher.update(&[42; CHUNK_LEN + 3]); assert_eq!(hasher.finalize(), crate::hash(&[42; CHUNK_LEN + 3])); let key = &[99; crate::KEY_LEN]; let mut keyed_hasher = crate::Hasher::new_keyed(key); keyed_hasher.update(&[42; 3 * CHUNK_LEN + 7]); keyed_hasher.reset(); keyed_hasher.update(&[42; CHUNK_LEN + 3]); assert_eq!( keyed_hasher.finalize(), crate::keyed_hash(key, &[42; CHUNK_LEN + 3]), ); let context = "BLAKE3 2020-02-12 10:20:58 reset test"; let mut kdf = crate::Hasher::new_derive_key(context); kdf.update(&[42; 3 * CHUNK_LEN + 7]); kdf.reset(); kdf.update(&[42; CHUNK_LEN + 3]); let expected = crate::derive_key(context, &[42; CHUNK_LEN + 3]); assert_eq!(kdf.finalize(), expected); } #[test] fn test_hex_encoding_decoding() { let digest_str = "04e0bb39f30b1a3feb89f536c93be15055482df748674b00d26e5a75777702e9"; let mut hasher = crate::Hasher::new(); hasher.update(b"foo"); let digest = hasher.finalize(); assert_eq!(digest.to_hex().as_str(), digest_str); #[cfg(feature = "std")] assert_eq!(digest.to_string(), digest_str); // Test round trip let digest = crate::Hash::from_hex(digest_str).unwrap(); assert_eq!(digest.to_hex().as_str(), digest_str); // Test uppercase let digest = crate::Hash::from_hex(digest_str.to_uppercase()).unwrap(); assert_eq!(digest.to_hex().as_str(), digest_str); // Test string parsing via FromStr let digest: crate::Hash = digest_str.parse().unwrap(); assert_eq!(digest.to_hex().as_str(), digest_str); // 
Test errors let bad_len = "04e0bb39f30b1"; let _result = crate::Hash::from_hex(bad_len).unwrap_err(); #[cfg(feature = "std")] assert_eq!(_result.to_string(), "expected 64 hex bytes, received 13"); let bad_char = "Z4e0bb39f30b1a3feb89f536c93be15055482df748674b00d26e5a75777702e9"; let _result = crate::Hash::from_hex(bad_char).unwrap_err(); #[cfg(feature = "std")] assert_eq!(_result.to_string(), "invalid hex character: 'Z'"); let _result = crate::Hash::from_hex([128; 64]).unwrap_err(); #[cfg(feature = "std")] assert_eq!(_result.to_string(), "invalid hex character: 0x80"); } // This test is a minimized failure case for the Windows SSE2 bug described in // https://github.com/BLAKE3-team/BLAKE3/issues/206. // // Before that issue was fixed, this test would fail on Windows in the following configuration: // // cargo test --features=no_avx512,no_avx2,no_sse41 --release // // Bugs like this one (stomping on a caller's register) are very sensitive to the details of // surrounding code, so it's not especially likely that this test will catch another bug (or even // the same bug) in the future. Still, there's no harm in keeping it. #[test] fn test_issue_206_windows_sse2() { // This stupid loop has to be here to trigger the bug. I don't know why. for _ in &[0] { // The length 65 (two blocks) is significant. It doesn't repro with 64 (one block). It also // doesn't repro with an all-zero input. let input = &[0xff; 65]; let expected_hash = [ 183, 235, 50, 217, 156, 24, 190, 219, 2, 216, 176, 255, 224, 53, 28, 95, 57, 148, 179, 245, 162, 90, 37, 121, 0, 142, 219, 62, 234, 204, 225, 161, ]; // This throwaway call has to be here to trigger the bug. crate::Hasher::new().update(input); // This assert fails when the bug is triggered.
assert_eq!(crate::Hasher::new().update(input).finalize(), expected_hash); } } #[test] fn test_hash_conversions() { let bytes1 = [42; 32]; let hash1: crate::Hash = bytes1.into(); let bytes2: [u8; 32] = hash1.into(); assert_eq!(bytes1, bytes2); let bytes3 = *hash1.as_bytes(); assert_eq!(bytes1, bytes3); let hash2 = crate::Hash::from_bytes(bytes1); assert_eq!(hash1, hash2); let hex = hash1.to_hex(); let hash3 = crate::Hash::from_hex(hex.as_bytes()).unwrap(); assert_eq!(hash1, hash3); let slice1: &[u8] = bytes1.as_slice(); let hash4 = crate::Hash::from_slice(slice1).expect("correct length"); assert_eq!(hash1, hash4); assert!(crate::Hash::from_slice(&[]).is_err()); assert!(crate::Hash::from_slice(&[42]).is_err()); assert!(crate::Hash::from_slice([42; 31].as_slice()).is_err()); assert!(crate::Hash::from_slice([42; 33].as_slice()).is_err()); assert!(crate::Hash::from_slice([42; 100].as_slice()).is_err()); } #[test] const fn test_hash_const_conversions() { let bytes = [42; 32]; let hash = crate::Hash::from_bytes(bytes); _ = hash.as_bytes(); } #[cfg(feature = "zeroize")] #[test] fn test_zeroize() { use zeroize::Zeroize; let mut hash = crate::Hash([42; 32]); hash.zeroize(); assert_eq!(hash.0, [0u8; 32]); let mut hasher = crate::Hasher { chunk_state: crate::ChunkState { cv: [42; 8], chunk_counter: 42, buf: [42; 64], buf_len: 42, blocks_compressed: 42, flags: 42, platform: crate::Platform::Portable, }, initial_chunk_counter: 42, key: [42; 8], cv_stack: [[42; 32]; { crate::MAX_DEPTH + 1 }].into(), }; hasher.zeroize(); assert_eq!(hasher.chunk_state.cv, [0; 8]); assert_eq!(hasher.chunk_state.chunk_counter, 0); assert_eq!(hasher.chunk_state.buf, [0; 64]); assert_eq!(hasher.chunk_state.buf_len, 0); assert_eq!(hasher.chunk_state.blocks_compressed, 0); assert_eq!(hasher.chunk_state.flags, 0); assert!(matches!( hasher.chunk_state.platform, crate::Platform::Portable )); assert_eq!(hasher.initial_chunk_counter, 0); assert_eq!(hasher.key, [0; 8]); assert_eq!(&*hasher.cv_stack, &[[0u8; 
32]; 0]); let mut output_reader = crate::OutputReader { inner: crate::Output { input_chaining_value: [42; 8], block: [42; 64], counter: 42, block_len: 42, flags: 42, platform: crate::Platform::Portable, }, position_within_block: 42, }; output_reader.zeroize(); assert_eq!(output_reader.inner.input_chaining_value, [0; 8]); assert_eq!(output_reader.inner.block, [0; 64]); assert_eq!(output_reader.inner.counter, 0); assert_eq!(output_reader.inner.block_len, 0); assert_eq!(output_reader.inner.flags, 0); assert!(matches!( output_reader.inner.platform, crate::Platform::Portable )); assert_eq!(output_reader.position_within_block, 0); } #[test] #[cfg(feature = "std")] fn test_update_reader() -> Result<(), std::io::Error> { // This is a brief test, since update_reader() is mostly a wrapper around update(), which already // has substantial testing. let mut input = vec![0; 1_000_000]; paint_test_input(&mut input); assert_eq!( crate::Hasher::new().update_reader(&input[..])?.finalize(), crate::hash(&input), ); Ok(()) } #[test] #[cfg(feature = "std")] fn test_update_reader_interrupted() -> std::io::Result<()> { use std::io; struct InterruptingReader<'a> { already_interrupted: bool, slice: &'a [u8], } impl<'a> InterruptingReader<'a> { fn new(slice: &'a [u8]) -> Self { Self { already_interrupted: false, slice, } } } impl<'a> io::Read for InterruptingReader<'a> { fn read(&mut self, buf: &mut [u8]) -> io::Result { if !self.already_interrupted { self.already_interrupted = true; return Err(io::Error::from(io::ErrorKind::Interrupted)); } let take = std::cmp::min(self.slice.len(), buf.len()); buf[..take].copy_from_slice(&self.slice[..take]); self.slice = &self.slice[take..]; Ok(take) } } let input = b"hello world"; let mut reader = InterruptingReader::new(input); let mut hasher = crate::Hasher::new(); hasher.update_reader(&mut reader)?; assert_eq!(hasher.finalize(), crate::hash(input)); Ok(()) } #[test] #[cfg(feature = "mmap")] // NamedTempFile isn't Miri-compatible #[cfg(not(miri))] fn 
test_mmap() -> Result<(), std::io::Error> { // This is a brief test, since update_mmap() is mostly a wrapper around update(), which already // has substantial testing. use std::io::prelude::*; let mut input = vec![0; 1_000_000]; paint_test_input(&mut input); let mut tempfile = tempfile::NamedTempFile::new()?; tempfile.write_all(&input)?; tempfile.flush()?; assert_eq!( crate::Hasher::new() .update_mmap(tempfile.path())? .finalize(), crate::hash(&input), ); Ok(()) } #[test] #[cfg(feature = "mmap")] #[cfg(target_os = "linux")] fn test_mmap_virtual_file() -> Result<(), std::io::Error> { // Virtual files like /proc/version can't be mmapped, because their contents don't actually // exist anywhere in memory. Make sure we fall back to regular file IO in these cases. // Currently this is handled with a length check, where the assumption is that virtual files // will always report length 0. If that assumption ever breaks, hopefully this test will catch // it. let virtual_filepath = "/proc/version"; let mut mmap_hasher = crate::Hasher::new(); // We'll fail right here if the fallback doesn't work. mmap_hasher.update_mmap(virtual_filepath)?; let mut read_hasher = crate::Hasher::new(); read_hasher.update_reader(std::fs::File::open(virtual_filepath)?)?; assert_eq!(mmap_hasher.finalize(), read_hasher.finalize()); Ok(()) } #[test] #[cfg(feature = "mmap")] #[cfg(feature = "rayon")] // NamedTempFile isn't Miri-compatible #[cfg(not(miri))] fn test_mmap_rayon() -> Result<(), std::io::Error> { // This is a brief test, since update_mmap_rayon() is mostly a wrapper around update_rayon(), // which already has substantial testing. use std::io::prelude::*; let mut input = vec![0; 1_000_000]; paint_test_input(&mut input); let mut tempfile = tempfile::NamedTempFile::new()?; tempfile.write_all(&input)?; tempfile.flush()?; assert_eq!( crate::Hasher::new() .update_mmap_rayon(tempfile.path())? 
.finalize(), crate::hash(&input), ); Ok(()) } #[test] #[cfg(feature = "std")] #[cfg(feature = "serde")] fn test_serde() { // Henrik suggested that we use 0xfe / 254 for byte test data instead of 0xff / 255, due to the // fact that 0xfe is not a well formed CBOR item. let hash: crate::Hash = [0xfe; 32].into(); let json = serde_json::to_string(&hash).unwrap(); assert_eq!( json, "[254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]", ); let hash2: crate::Hash = serde_json::from_str(&json).unwrap(); assert_eq!(hash, hash2); let mut cbor = Vec::::new(); ciborium::into_writer(&hash, &mut cbor).unwrap(); assert_eq!( cbor, [ 0x98, 0x20, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, 0x18, 0xfe, ] ); let hash_from_cbor: crate::Hash = ciborium::from_reader(&cbor[..]).unwrap(); assert_eq!(hash_from_cbor, hash); // Version 1.5.2 of this crate changed the default serialization format to a bytestring // (instead of an array/list) to save bytes on the wire. That was a backwards compatibility // mistake for non-self-describing formats, and it's been reverted. Since some small number of // serialized bytestrings will probably exist forever in the wild, we shold test that we can // still deserialize these from self-describing formats. 
let bytestring_cbor: &[u8] = &[ 0x58, 0x20, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, ]; let hash_from_bytestring_cbor: crate::Hash = ciborium::from_reader(bytestring_cbor).unwrap(); assert_eq!(hash_from_bytestring_cbor, hash); } // `cargo +nightly miri test` currently works, but it takes forever, because some of our test // inputs are quite large. Most of our unsafe code is platform specific and incompatible with Miri // anyway, but we'd like it to be possible for callers to run their own tests under Miri, assuming // they don't use incompatible features like Rayon or mmap. This test should get reasonable // coverage of our public API without using any large inputs, so we can run it in CI and catch // obvious breaks. (For example, constant_time_eq is not compatible with Miri.) #[test] fn test_miri_smoketest() { let mut hasher = crate::Hasher::new_derive_key("Miri smoketest"); hasher.update(b"foo"); #[cfg(feature = "std")] hasher.update_reader(&b"bar"[..]).unwrap(); assert_eq!(hasher.finalize(), hasher.finalize()); let mut reader = hasher.finalize_xof(); reader.set_position(999999); reader.fill(&mut [0]); } // I had to move these tests out of the deprecated guts module, because leaving them there causes // an un-silenceable warning: https://github.com/rust-lang/rust/issues/47238 #[cfg(test)] #[allow(deprecated)] mod guts_tests { use crate::guts::*; #[test] fn test_chunk() { assert_eq!( crate::hash(b"foo"), ChunkState::new(0).update(b"foo").finalize(true) ); } #[test] fn test_parents() { let mut hasher = crate::Hasher::new(); let mut buf = [0; crate::CHUNK_LEN]; buf[0] = 'a' as u8; hasher.update(&buf); let chunk0_cv = ChunkState::new(0).update(&buf).finalize(false); buf[0] = 'b' as u8; hasher.update(&buf); let chunk1_cv = ChunkState::new(1).update(&buf).finalize(false); hasher.update(b"c"); let chunk2_cv = 
ChunkState::new(2).update(b"c").finalize(false); let parent = parent_cv(&chunk0_cv, &chunk1_cv, false); let root = parent_cv(&parent, &chunk2_cv, true); assert_eq!(hasher.finalize(), root); } } ================================================ FILE: third-party/blake3/src/traits.rs ================================================ //! Implementations of commonly used traits like `Digest` and `Mac` from the //! [`digest`](https://crates.io/crates/digest) crate. pub use digest; use crate::{Hasher, OutputReader}; use digest::crypto_common; use digest::generic_array::{typenum::U32, typenum::U64, GenericArray}; impl digest::HashMarker for Hasher {} impl digest::Update for Hasher { #[inline] fn update(&mut self, data: &[u8]) { self.update(data); } } impl digest::Reset for Hasher { #[inline] fn reset(&mut self) { self.reset(); // the inherent method } } impl digest::OutputSizeUser for Hasher { type OutputSize = U32; } impl digest::FixedOutput for Hasher { #[inline] fn finalize_into(self, out: &mut GenericArray) { out.copy_from_slice(self.finalize().as_bytes()); } } impl digest::FixedOutputReset for Hasher { #[inline] fn finalize_into_reset(&mut self, out: &mut GenericArray) { out.copy_from_slice(self.finalize().as_bytes()); self.reset(); } } impl digest::ExtendableOutput for Hasher { type Reader = OutputReader; #[inline] fn finalize_xof(self) -> Self::Reader { Hasher::finalize_xof(&self) } } impl digest::ExtendableOutputReset for Hasher { #[inline] fn finalize_xof_reset(&mut self) -> Self::Reader { let reader = Hasher::finalize_xof(self); self.reset(); reader } } impl digest::XofReader for OutputReader { #[inline] fn read(&mut self, buffer: &mut [u8]) { self.fill(buffer); } } impl crypto_common::KeySizeUser for Hasher { type KeySize = U32; } impl crypto_common::BlockSizeUser for Hasher { type BlockSize = U64; } impl digest::MacMarker for Hasher {} impl digest::KeyInit for Hasher { #[inline] fn new(key: &digest::Key) -> Self { let key_bytes: [u8; 32] = (*key).into(); 
Hasher::new_keyed(&key_bytes) } } #[cfg(test)] mod test { use super::*; #[test] fn test_digest_traits() { // Inherent methods. let mut hasher1 = crate::Hasher::new(); hasher1.update(b"foo"); hasher1.update(b"bar"); hasher1.update(b"baz"); let out1 = hasher1.finalize(); let mut xof1 = [0; 301]; hasher1.finalize_xof().fill(&mut xof1); assert_eq!(out1.as_bytes(), &xof1[..32]); // Trait implementations. let mut hasher2: crate::Hasher = digest::Digest::new(); digest::Digest::update(&mut hasher2, b"xxx"); digest::Digest::reset(&mut hasher2); digest::Digest::update(&mut hasher2, b"foo"); digest::Digest::update(&mut hasher2, b"bar"); digest::Digest::update(&mut hasher2, b"baz"); let out2 = digest::Digest::finalize(hasher2.clone()); let mut xof2 = [0; 301]; digest::XofReader::read( &mut digest::ExtendableOutput::finalize_xof(hasher2.clone()), &mut xof2, ); assert_eq!(out1.as_bytes(), &out2[..]); assert_eq!(xof1[..], xof2[..]); // Again with the resetting variants. let mut hasher3: crate::Hasher = digest::Digest::new(); digest::Digest::update(&mut hasher3, b"foobarbaz"); let mut out3 = [0; 32]; digest::FixedOutputReset::finalize_into_reset( &mut hasher3, GenericArray::from_mut_slice(&mut out3), ); digest::Digest::update(&mut hasher3, b"foobarbaz"); let mut out4 = [0; 32]; digest::FixedOutputReset::finalize_into_reset( &mut hasher3, GenericArray::from_mut_slice(&mut out4), ); digest::Digest::update(&mut hasher3, b"foobarbaz"); let mut xof3 = [0; 301]; digest::XofReader::read( &mut digest::ExtendableOutputReset::finalize_xof_reset(&mut hasher3), &mut xof3, ); digest::Digest::update(&mut hasher3, b"foobarbaz"); let mut xof4 = [0; 301]; digest::XofReader::read( &mut digest::ExtendableOutputReset::finalize_xof_reset(&mut hasher3), &mut xof4, ); assert_eq!(out1.as_bytes(), &out3[..]); assert_eq!(out1.as_bytes(), &out4[..]); assert_eq!(xof1[..], xof3[..]); assert_eq!(xof1[..], xof4[..]); } #[test] fn test_mac_trait() { // Inherent methods. 
let key = b"some super secret key bytes fooo"; let mut hasher1 = crate::Hasher::new_keyed(key); hasher1.update(b"foo"); hasher1.update(b"bar"); hasher1.update(b"baz"); let out1 = hasher1.finalize(); // Trait implementation. let generic_key = (*key).into(); let mut hasher2: crate::Hasher = digest::Mac::new(&generic_key); digest::Mac::update(&mut hasher2, b"xxx"); digest::Mac::reset(&mut hasher2); digest::Mac::update(&mut hasher2, b"foo"); digest::Mac::update(&mut hasher2, b"bar"); digest::Mac::update(&mut hasher2, b"baz"); let out2 = digest::Mac::finalize(hasher2); assert_eq!(out1.as_bytes(), out2.into_bytes().as_slice()); } fn expected_hmac_blake3(key: &[u8], input: &[u8]) -> [u8; 32] { // See https://en.wikipedia.org/wiki/HMAC. let key_hash; let key_prime = if key.len() <= 64 { key } else { key_hash = *crate::hash(key).as_bytes(); &key_hash }; let mut ipad = [0x36; 64]; let mut opad = [0x5c; 64]; for i in 0..key_prime.len() { ipad[i] ^= key_prime[i]; opad[i] ^= key_prime[i]; } let mut inner_state = crate::Hasher::new(); inner_state.update(&ipad); inner_state.update(input); let mut outer_state = crate::Hasher::new(); outer_state.update(&opad); outer_state.update(inner_state.finalize().as_bytes()); outer_state.finalize().into() } #[test] fn test_hmac_compatibility() { use hmac::{Mac, SimpleHmac}; // Test a short key. let mut x = SimpleHmac::::new_from_slice(b"key").unwrap(); hmac::digest::Update::update(&mut x, b"data"); let output = x.finalize().into_bytes(); assert_ne!(output.len(), 0); let expected = expected_hmac_blake3(b"key", b"data"); assert_eq!(expected, output.as_ref()); // Test a range of key and data lengths, particularly to exercise the long-key logic. 
let mut input_bytes = [0; crate::test::TEST_CASES_MAX]; crate::test::paint_test_input(&mut input_bytes); for &input_len in crate::test::TEST_CASES { #[cfg(feature = "std")] dbg!(input_len); let input = &input_bytes[..input_len]; let mut x = SimpleHmac::::new_from_slice(input).unwrap(); hmac::digest::Update::update(&mut x, input); let output = x.finalize().into_bytes(); assert_ne!(output.len(), 0); let expected = expected_hmac_blake3(input, input); assert_eq!(expected, output.as_ref()); } } } ================================================ FILE: third-party/blake3/src/wasm32_simd.rs ================================================ /* * This code is based on rust_sse2.rs of the same distribution, and is subject to further improvements. * Some comments are left intact even if their applicability is questioned. * * Performance measurements with a primitive benchmark with ~16Kb of data: * * | M1 native | 11,610 ns | * | M1 Wasm SIMD | 13,355 ns | * | M1 Wasm | 22,037 ns | * | x64 native | 6,713 ns | * | x64 Wasm SIMD | 11,985 ns | * | x64 Wasm | 25,978 ns | * * wasmtime v12.0.1 was used on both platforms. */ use core::arch::wasm32::*; use crate::{ counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, }; use arrayref::{array_mut_ref, array_ref, mut_array_refs}; pub const DEGREE: usize = 4; #[inline(always)] unsafe fn loadu(src: *const u8) -> v128 { // This is an unaligned load, so the pointer cast is allowed. v128_load(src as *const v128) } #[inline(always)] unsafe fn storeu(src: v128, dest: *mut u8) { // This is an unaligned store, so the pointer cast is allowed. 
v128_store(dest as *mut v128, src) } #[inline(always)] fn add(a: v128, b: v128) -> v128 { i32x4_add(a, b) } #[inline(always)] fn xor(a: v128, b: v128) -> v128 { v128_xor(a, b) } #[inline(always)] fn set1(x: u32) -> v128 { i32x4_splat(x as i32) } #[inline(always)] fn set4(a: u32, b: u32, c: u32, d: u32) -> v128 { i32x4(a as i32, b as i32, c as i32, d as i32) } // These rotations are the "simple/shifts version". For the // "complicated/shuffles version", see // https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. // For a discussion of the tradeoffs, see // https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug // (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better // on recent x86 chips. #[inline(always)] fn rot16(a: v128) -> v128 { v128_or(u32x4_shr(a, 16), u32x4_shl(a, 32 - 16)) } #[inline(always)] fn rot12(a: v128) -> v128 { v128_or(u32x4_shr(a, 12), u32x4_shl(a, 32 - 12)) } #[inline(always)] fn rot8(a: v128) -> v128 { v128_or(u32x4_shr(a, 8), u32x4_shl(a, 32 - 8)) } #[inline(always)] fn rot7(a: v128) -> v128 { v128_or(u32x4_shr(a, 7), u32x4_shl(a, 32 - 7)) } #[inline(always)] fn g1(row0: &mut v128, row1: &mut v128, row2: &mut v128, row3: &mut v128, m: v128) { *row0 = add(add(*row0, m), *row1); *row3 = xor(*row3, *row0); *row3 = rot16(*row3); *row2 = add(*row2, *row3); *row1 = xor(*row1, *row2); *row1 = rot12(*row1); } #[inline(always)] fn g2(row0: &mut v128, row1: &mut v128, row2: &mut v128, row3: &mut v128, m: v128) { *row0 = add(add(*row0, m), *row1); *row3 = xor(*row3, *row0); *row3 = rot8(*row3); *row2 = add(*row2, *row3); *row1 = xor(*row1, *row2); *row1 = rot7(*row1); } // It could be a function, but arithmetic in const generics is too limited yet. macro_rules!
shuffle { ($a: expr, $b: expr, $z:expr, $y:expr, $x:expr, $w:expr) => { i32x4_shuffle::<{ $w }, { $x }, { $y + 4 }, { $z + 4 }>($a, $b) }; } #[inline(always)] fn unpacklo_epi64(a: v128, b: v128) -> v128 { i64x2_shuffle::<0, 2>(a, b) } #[inline(always)] fn unpackhi_epi64(a: v128, b: v128) -> v128 { i64x2_shuffle::<1, 3>(a, b) } #[inline(always)] fn unpacklo_epi32(a: v128, b: v128) -> v128 { i32x4_shuffle::<0, 4, 1, 5>(a, b) } #[inline(always)] fn unpackhi_epi32(a: v128, b: v128) -> v128 { i32x4_shuffle::<2, 6, 3, 7>(a, b) } #[inline(always)] fn shuffle_epi32( a: v128, ) -> v128 { // Please note that generic arguments in declaration and implementation are in // different order. // second arg is actually ignored. i32x4_shuffle::(a, a) } #[inline(always)] fn blend_epi16(a: v128, b: v128, imm8: i32) -> v128 { // imm8 is always constant; it allows to implement this function with // i16x8_shuffle. However, it is marginally slower on x64. let bits = i16x8(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80); let mut mask = i16x8_splat(imm8 as i16); mask = v128_and(mask, bits); mask = i16x8_eq(mask, bits); // The swapped argument order is equivalent to mask negation. v128_bitselect(b, a, mask) } // Note the optimization here of leaving row1 as the unrotated row, rather than // row0. All the message loads below are adjusted to compensate for this.
See // discussion at https://github.com/sneves/blake2-avx2/pull/4 #[inline(always)] fn diagonalize(row0: &mut v128, row2: &mut v128, row3: &mut v128) { *row0 = shuffle_epi32::<2, 1, 0, 3>(*row0); *row3 = shuffle_epi32::<1, 0, 3, 2>(*row3); *row2 = shuffle_epi32::<0, 3, 2, 1>(*row2); } #[inline(always)] fn undiagonalize(row0: &mut v128, row2: &mut v128, row3: &mut v128) { *row0 = shuffle_epi32::<0, 3, 2, 1>(*row0); *row3 = shuffle_epi32::<1, 0, 3, 2>(*row3); *row2 = shuffle_epi32::<2, 1, 0, 3>(*row2); } #[inline(always)] fn compress_pre( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [v128; 4] { // safe because CVWords is [u32; 8] let row0 = &mut unsafe { loadu(cv.as_ptr().add(0) as *const u8) }; let row1 = &mut unsafe { loadu(cv.as_ptr().add(4) as *const u8) }; let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); let row3 = &mut set4( counter_low(counter), counter_high(counter), block_len as u32, flags as u32, ); // safe because block is &[u8; 64] let mut m0 = unsafe { loadu(block.as_ptr().add(0 * 4 * DEGREE)) }; let mut m1 = unsafe { loadu(block.as_ptr().add(1 * 4 * DEGREE)) }; let mut m2 = unsafe { loadu(block.as_ptr().add(2 * 4 * DEGREE)) }; let mut m3 = unsafe { loadu(block.as_ptr().add(3 * 4 * DEGREE)) }; let mut t0; let mut t1; let mut t2; let mut t3; let mut tt; // Round 1. The first round permutes the message words from the original // input order, into the groups that get mixed in parallel. t0 = shuffle!(m0, m1, 2, 0, 2, 0); // 6 4 2 0 g1(row0, row1, row2, row3, t0); t1 = shuffle!(m0, m1, 3, 1, 3, 1); // 7 5 3 1 g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = shuffle!(m2, m3, 2, 0, 2, 0); // 14 12 10 8 t2 = shuffle_epi32::<2, 1, 0, 3>(t2); // 12 10 8 14 g1(row0, row1, row2, row3, t2); t3 = shuffle!(m2, m3, 3, 1, 3, 1); // 15 13 11 9 t3 = shuffle_epi32::<2, 1, 0, 3>(t3); // 13 11 9 15 g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 2. 
This round and all following rounds apply a fixed permutation // to the message words from the round before. t0 = shuffle!(m0, m1, 3, 1, 1, 2); t0 = shuffle_epi32::<0, 3, 2, 1>(t0); g1(row0, row1, row2, row3, t0); t1 = shuffle!(m2, m3, 3, 3, 2, 2); tt = shuffle_epi32::<0, 0, 3, 3>(m0); t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = shuffle_epi32::<1, 3, 2, 0>(tt); g1(row0, row1, row2, row3, t2); t3 = unpackhi_epi32(m1, m3); tt = unpacklo_epi32(m2, t3); t3 = shuffle_epi32::<0, 1, 3, 2>(tt); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 3 t0 = shuffle!(m0, m1, 3, 1, 1, 2); t0 = shuffle_epi32::<0, 3, 2, 1>(t0); g1(row0, row1, row2, row3, t0); t1 = shuffle!(m2, m3, 3, 3, 2, 2); tt = shuffle_epi32::<0, 0, 3, 3>(m0); t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = shuffle_epi32::<1, 3, 2, 0>(tt); g1(row0, row1, row2, row3, t2); t3 = unpackhi_epi32(m1, m3); tt = unpacklo_epi32(m2, t3); t3 = shuffle_epi32::<0, 1, 3, 2>(tt); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 4 t0 = shuffle!(m0, m1, 3, 1, 1, 2); t0 = shuffle_epi32::<0, 3, 2, 1>(t0); g1(row0, row1, row2, row3, t0); t1 = shuffle!(m2, m3, 3, 3, 2, 2); tt = shuffle_epi32::<0, 0, 3, 3>(m0); t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = shuffle_epi32::<1, 3, 2, 0>(tt); g1(row0, row1, row2, row3, t2); t3 = unpackhi_epi32(m1, m3); tt = unpacklo_epi32(m2, t3); t3 = shuffle_epi32::<0, 1, 3, 2>(tt); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 5 t0 = shuffle!(m0, m1, 3, 1, 1, 2); t0 = 
shuffle_epi32::<0, 3, 2, 1>(t0); g1(row0, row1, row2, row3, t0); t1 = shuffle!(m2, m3, 3, 3, 2, 2); tt = shuffle_epi32::<0, 0, 3, 3>(m0); t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = shuffle_epi32::<1, 3, 2, 0>(tt); g1(row0, row1, row2, row3, t2); t3 = unpackhi_epi32(m1, m3); tt = unpacklo_epi32(m2, t3); t3 = shuffle_epi32::<0, 1, 3, 2>(tt); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 6 t0 = shuffle!(m0, m1, 3, 1, 1, 2); t0 = shuffle_epi32::<0, 3, 2, 1>(t0); g1(row0, row1, row2, row3, t0); t1 = shuffle!(m2, m3, 3, 3, 2, 2); tt = shuffle_epi32::<0, 0, 3, 3>(m0); t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = shuffle_epi32::<1, 3, 2, 0>(tt); g1(row0, row1, row2, row3, t2); t3 = unpackhi_epi32(m1, m3); tt = unpacklo_epi32(m2, t3); t3 = shuffle_epi32::<0, 1, 3, 2>(tt); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); m0 = t0; m1 = t1; m2 = t2; m3 = t3; // Round 7 t0 = shuffle!(m0, m1, 3, 1, 1, 2); t0 = shuffle_epi32::<0, 3, 2, 1>(t0); g1(row0, row1, row2, row3, t0); t1 = shuffle!(m2, m3, 3, 3, 2, 2); tt = shuffle_epi32::<0, 0, 3, 3>(m0); t1 = blend_epi16(tt, t1, 0xCC); g2(row0, row1, row2, row3, t1); diagonalize(row0, row2, row3); t2 = unpacklo_epi64(m3, m1); tt = blend_epi16(t2, m2, 0xC0); t2 = shuffle_epi32::<1, 3, 2, 0>(tt); g1(row0, row1, row2, row3, t2); t3 = unpackhi_epi32(m1, m3); tt = unpacklo_epi32(m2, t3); t3 = shuffle_epi32::<0, 1, 3, 2>(tt); g2(row0, row1, row2, row3, t3); undiagonalize(row0, row2, row3); [*row0, *row1, *row2, *row3] } #[target_feature(enable = "simd128")] pub fn compress_in_place( cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) { let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, 
counter, flags); // it stores in reversed order... // safe because CVWords is [u32; 8] unsafe { storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); } } #[target_feature(enable = "simd128")] pub fn compress_xof( cv: &CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8, ) -> [u8; 64] { let [mut row0, mut row1, mut row2, mut row3] = compress_pre(cv, block, block_len, counter, flags); row0 = xor(row0, row2); row1 = xor(row1, row3); // safe because CVWords is [u32; 8] row2 = xor(row2, unsafe { loadu(cv.as_ptr().add(0) as *const u8) }); row3 = xor(row3, unsafe { loadu(cv.as_ptr().add(4) as *const u8) }); // It seems to be architecture dependent, but works. // safe because sizes match, and every state of u8 is valid. unsafe { core::mem::transmute([row0, row1, row2, row3]) } } #[inline(always)] fn round(v: &mut [v128; 16], m: &[v128; 16], r: usize) { v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]); v[0] = add(v[0], v[4]); v[1] = add(v[1], v[5]); v[2] = add(v[2], v[6]); v[3] = add(v[3], v[7]); v[12] = xor(v[12], v[0]); v[13] = xor(v[13], v[1]); v[14] = xor(v[14], v[2]); v[15] = xor(v[15], v[3]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[15] = rot16(v[15]); v[8] = add(v[8], v[12]); v[9] = add(v[9], v[13]); v[10] = add(v[10], v[14]); v[11] = add(v[11], v[15]); v[4] = xor(v[4], v[8]); v[5] = xor(v[5], v[9]); v[6] = xor(v[6], v[10]); v[7] = xor(v[7], v[11]); v[4] = rot12(v[4]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]); v[0] = add(v[0], v[4]); v[1] = add(v[1], v[5]); v[2] = add(v[2], v[6]); v[3] = 
add(v[3], v[7]); v[12] = xor(v[12], v[0]); v[13] = xor(v[13], v[1]); v[14] = xor(v[14], v[2]); v[15] = xor(v[15], v[3]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[15] = rot8(v[15]); v[8] = add(v[8], v[12]); v[9] = add(v[9], v[13]); v[10] = add(v[10], v[14]); v[11] = add(v[11], v[15]); v[4] = xor(v[4], v[8]); v[5] = xor(v[5], v[9]); v[6] = xor(v[6], v[10]); v[7] = xor(v[7], v[11]); v[4] = rot7(v[4]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = rot7(v[7]); v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]); v[0] = add(v[0], v[5]); v[1] = add(v[1], v[6]); v[2] = add(v[2], v[7]); v[3] = add(v[3], v[4]); v[15] = xor(v[15], v[0]); v[12] = xor(v[12], v[1]); v[13] = xor(v[13], v[2]); v[14] = xor(v[14], v[3]); v[15] = rot16(v[15]); v[12] = rot16(v[12]); v[13] = rot16(v[13]); v[14] = rot16(v[14]); v[10] = add(v[10], v[15]); v[11] = add(v[11], v[12]); v[8] = add(v[8], v[13]); v[9] = add(v[9], v[14]); v[5] = xor(v[5], v[10]); v[6] = xor(v[6], v[11]); v[7] = xor(v[7], v[8]); v[4] = xor(v[4], v[9]); v[5] = rot12(v[5]); v[6] = rot12(v[6]); v[7] = rot12(v[7]); v[4] = rot12(v[4]); v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]); v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]); v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]); v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]); v[0] = add(v[0], v[5]); v[1] = add(v[1], v[6]); v[2] = add(v[2], v[7]); v[3] = add(v[3], v[4]); v[15] = xor(v[15], v[0]); v[12] = xor(v[12], v[1]); v[13] = xor(v[13], v[2]); v[14] = xor(v[14], v[3]); v[15] = rot8(v[15]); v[12] = rot8(v[12]); v[13] = rot8(v[13]); v[14] = rot8(v[14]); v[10] = add(v[10], v[15]); v[11] = add(v[11], v[12]); v[8] = add(v[8], v[13]); v[9] = add(v[9], v[14]); v[5] = xor(v[5], v[10]); v[6] = xor(v[6], v[11]); v[7] = xor(v[7], v[8]); v[4] = xor(v[4], v[9]); v[5] = rot7(v[5]); v[6] = rot7(v[6]); v[7] = 
rot7(v[7]); v[4] = rot7(v[4]); } #[inline(always)] fn transpose_vecs(vecs: &mut [v128; DEGREE]) { // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is // 22/33. Note that this doesn't split the vector into two lanes, as the // AVX2 counterparts do. let ab_01 = unpacklo_epi32(vecs[0], vecs[1]); let ab_23 = unpackhi_epi32(vecs[0], vecs[1]); let cd_01 = unpacklo_epi32(vecs[2], vecs[3]); let cd_23 = unpackhi_epi32(vecs[2], vecs[3]); // Interleave 64-bit lanes. let abcd_0 = unpacklo_epi64(ab_01, cd_01); let abcd_1 = unpackhi_epi64(ab_01, cd_01); let abcd_2 = unpacklo_epi64(ab_23, cd_23); let abcd_3 = unpackhi_epi64(ab_23, cd_23); vecs[0] = abcd_0; vecs[1] = abcd_1; vecs[2] = abcd_2; vecs[3] = abcd_3; } #[inline(always)] unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [v128; 16] { let mut vecs = [ loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), ]; let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE); transpose_vecs(squares.0); transpose_vecs(squares.1); transpose_vecs(squares.2); transpose_vecs(squares.3); vecs } #[inline(always)] fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (v128, 
v128) { let mask = if increment_counter.yes() { !0 } else { 0 }; ( set4( counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)), counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)), ), set4( counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)), counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)), ), ) } #[target_feature(enable = "simd128")] pub unsafe fn hash4( inputs: &[*const u8; DEGREE], blocks: usize, key: &CVWords, counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, out: &mut [u8; DEGREE * OUT_LEN], ) { let mut h_vecs = [ set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]), set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]), ]; let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); let mut block_flags = flags | flags_start; for block in 0..blocks { if block + 1 == blocks { block_flags |= flags_end; } let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only let block_flags_vec = set1(block_flags as u32); let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); // The transposed compression function. Note that inlining this // manually here improves compile times by a lot, compared to factoring // it out into its own function and making it #[inline(always)]. Just // guessing, it might have something to do with loop unrolling. 
let mut v = [ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3], h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7], set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]), counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec, ]; round(&mut v, &msg_vecs, 0); round(&mut v, &msg_vecs, 1); round(&mut v, &msg_vecs, 2); round(&mut v, &msg_vecs, 3); round(&mut v, &msg_vecs, 4); round(&mut v, &msg_vecs, 5); round(&mut v, &msg_vecs, 6); h_vecs[0] = xor(v[0], v[8]); h_vecs[1] = xor(v[1], v[9]); h_vecs[2] = xor(v[2], v[10]); h_vecs[3] = xor(v[3], v[11]); h_vecs[4] = xor(v[4], v[12]); h_vecs[5] = xor(v[5], v[13]); h_vecs[6] = xor(v[6], v[14]); h_vecs[7] = xor(v[7], v[15]); block_flags = flags; } let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE); transpose_vecs(squares.0); transpose_vecs(squares.1); // The first four vecs now contain the first half of each output, and the // second four vecs contain the second half of each output. storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); } #[target_feature(enable = "simd128")] unsafe fn hash1( input: &[u8; N], key: &CVWords, counter: u64, flags: u8, flags_start: u8, flags_end: u8, out: &mut CVBytes, ) { debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks"); let mut cv = *key; let mut block_flags = flags | flags_start; let mut slice = &input[..]; while slice.len() >= BLOCK_LEN { if slice.len() == BLOCK_LEN { block_flags |= flags_end; } compress_in_place( &mut cv, array_ref!(slice, 0, BLOCK_LEN), BLOCK_LEN as u8, counter, block_flags, ); block_flags = flags; slice = &slice[BLOCK_LEN..]; } *out = core::mem::transmute(cv); } 
#[target_feature(enable = "simd128")] pub unsafe fn hash_many( mut inputs: &[&[u8; N]], key: &CVWords, mut counter: u64, increment_counter: IncrementCounter, flags: u8, flags_start: u8, flags_end: u8, mut out: &mut [u8], ) { debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN { // Safe because the layout of arrays is guaranteed, and because the // `blocks` count is determined statically from the argument type. let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); let blocks = N / BLOCK_LEN; hash4( input_ptrs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, array_mut_ref!(out, 0, DEGREE * OUT_LEN), ); if increment_counter.yes() { counter += DEGREE as u64; } inputs = &inputs[DEGREE..]; out = &mut out[DEGREE * OUT_LEN..]; } for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) { hash1( input, key, counter, flags, flags_start, flags_end, array_mut_ref!(output, 0, OUT_LEN), ); if increment_counter.yes() { counter += 1; } } } #[cfg(test)] mod test { use super::*; #[test] fn test_transpose() { #[target_feature(enable = "simd128")] fn transpose_wrapper(vecs: &mut [v128; DEGREE]) { transpose_vecs(vecs); } let mut matrix = [[0 as u32; DEGREE]; DEGREE]; for i in 0..DEGREE { for j in 0..DEGREE { matrix[i][j] = (i * DEGREE + j) as u32; } } unsafe { let mut vecs: [v128; DEGREE] = core::mem::transmute(matrix); transpose_wrapper(&mut vecs); matrix = core::mem::transmute(vecs); } for i in 0..DEGREE { for j in 0..DEGREE { // Reversed indexes from above. 
assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); } } } #[test] fn test_compress() { crate::test::test_compress_fn(compress_in_place, compress_xof); } #[test] fn test_hash_many() { crate::test::test_hash_many_fn(hash_many, hash_many); } } ================================================ FILE: third-party/blake3/test_vectors/Cargo.toml ================================================ [package] name = "test_vectors" version = "0.0.0" edition = "2021" [features] neon = ["blake3/neon"] prefer_intrinsics = ["blake3/prefer_intrinsics"] pure = ["blake3/pure"] wasm32_simd = ["blake3/wasm32_simd"] [dependencies] # If you ever change these path dependencies, you'll probably need to update # cross_test.sh, or CI will break. I'm sorry >.< blake3 = { path = "../" } hex = "0.4.0" reference_impl = { path = "../reference_impl" } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" ================================================ FILE: third-party/blake3/test_vectors/cross_test.sh ================================================ #! /usr/bin/env bash # This hacky script works around the fact that `cross test` does not support # path dependencies. (It uses a docker shared folder to let the guest access # project files, so parent directories aren't available.) Solve this problem by # copying the entire project to a temp dir and rearranging paths to put # "blake3" and "reference_impl" underneath "test_vectors", so that everything # is accessible. Hopefully this will just run on CI forever and no one will # ever read this and discover my deep shame. set -e -u -o pipefail project_root="$(realpath "$(dirname "$BASH_SOURCE")/..")" tmpdir="$(mktemp -d)" echo "Running cross tests in $tmpdir" cd "$tmpdir" git clone "$project_root" blake3 mv blake3/test_vectors . 
mv blake3/reference_impl test_vectors mv blake3 test_vectors cd test_vectors sed -i 's|blake3 = { path = "../" }|blake3 = { path = "./blake3" }|' Cargo.toml sed -i 's|reference_impl = { path = "../reference_impl" }|reference_impl = { path = "reference_impl" }|' Cargo.toml cross test "$@" ================================================ FILE: third-party/blake3/test_vectors/src/bin/generate.rs ================================================ fn main() { // The trailing newline is included. print!("{}", test_vectors::generate_json()); } ================================================ FILE: third-party/blake3/test_vectors/src/lib.rs ================================================ use blake3::{BLOCK_LEN, CHUNK_LEN}; use serde::{Deserialize, Serialize}; // Reading files at runtime requires special configuration under WASM/WASI, so including this at // compile time is simpler. const TEST_VECTORS_JSON: &str = include_str!("../test_vectors.json"); // A non-multiple of 4 is important, since one possible bug is to fail to emit // partial words. pub const OUTPUT_LEN: usize = 2 * BLOCK_LEN + 3; pub const TEST_CASES: &[usize] = &[ 0, 1, 2, 3, 4, 5, 6, 7, 8, BLOCK_LEN - 1, BLOCK_LEN, BLOCK_LEN + 1, 2 * BLOCK_LEN - 1, 2 * BLOCK_LEN, 2 * BLOCK_LEN + 1, CHUNK_LEN - 1, CHUNK_LEN, CHUNK_LEN + 1, 2 * CHUNK_LEN, 2 * CHUNK_LEN + 1, 3 * CHUNK_LEN, 3 * CHUNK_LEN + 1, 4 * CHUNK_LEN, 4 * CHUNK_LEN + 1, 5 * CHUNK_LEN, 5 * CHUNK_LEN + 1, 6 * CHUNK_LEN, 6 * CHUNK_LEN + 1, 7 * CHUNK_LEN, 7 * CHUNK_LEN + 1, 8 * CHUNK_LEN, 8 * CHUNK_LEN + 1, 16 * CHUNK_LEN, // AVX512's bandwidth 31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1 100 * CHUNK_LEN, // subtrees larger than MAX_SIMD_DEGREE chunks ]; pub const TEST_KEY: &[u8; blake3::KEY_LEN] = b"whats the Elvish word for friend"; pub const TEST_CONTEXT: &str = "BLAKE3 2019-12-27 16:29:52 test vectors context"; const COMMENT: &str = r#" Each test is an input length and three outputs, one for each of the hash, keyed_hash, and derive_key modes. 
The input in each case is filled with a repeating sequence of 251 bytes: 0, 1, 2, ..., 249, 250, 0, 1, ..., and so on. The key used with keyed_hash is the 32-byte ASCII string "whats the Elvish word for friend", also given in the `key` field below. The context string used with derive_key is the ASCII string "BLAKE3 2019-12-27 16:29:52 test vectors context", also given in the `context_string` field below. Outputs are encoded as hexadecimal. Each case is an extended output, and implementations should also check that the first 32 bytes match their default-length output. "#; // Paint the input with a repeating byte pattern. We use a cycle length of 251, // because that's the largest prime number less than 256. This makes it // unlikely to swapping any two adjacent input blocks or chunks will give the // same answer. pub fn paint_test_input(buf: &mut [u8]) { for (i, b) in buf.iter_mut().enumerate() { *b = (i % 251) as u8; } } #[derive(Debug, Serialize, Deserialize)] pub struct Cases { pub _comment: String, pub key: String, pub context_string: String, pub cases: Vec, } #[derive(Debug, Serialize, Deserialize)] pub struct Case { pub input_len: usize, pub hash: String, pub keyed_hash: String, pub derive_key: String, } pub fn generate_json() -> String { let mut cases = Vec::new(); for &input_len in TEST_CASES { let mut input = vec![0; input_len]; paint_test_input(&mut input); let mut hash_out = [0; OUTPUT_LEN]; blake3::Hasher::new() .update(&input) .finalize_xof() .fill(&mut hash_out); let mut keyed_hash_out = [0; OUTPUT_LEN]; blake3::Hasher::new_keyed(TEST_KEY) .update(&input) .finalize_xof() .fill(&mut keyed_hash_out); let mut derive_key_out = [0; OUTPUT_LEN]; blake3::Hasher::new_derive_key(TEST_CONTEXT) .update(&input) .finalize_xof() .fill(&mut derive_key_out); cases.push(Case { input_len, hash: hex::encode(&hash_out[..]), keyed_hash: hex::encode(&keyed_hash_out[..]), derive_key: hex::encode(&derive_key_out[..]), }); } let mut json = serde_json::to_string_pretty(&Cases { 
_comment: COMMENT.trim().replace("\n", " "), key: std::str::from_utf8(TEST_KEY).unwrap().to_string(), context_string: TEST_CONTEXT.to_string(), cases, }) .unwrap(); // Add a trailing newline. json.push('\n'); json } pub fn parse_test_cases() -> Cases { serde_json::from_str(TEST_VECTORS_JSON).expect("failed to parse test_vectors.json") } #[cfg(test)] mod tests { use super::*; fn test_reference_impl_all_at_once( key: &[u8; blake3::KEY_LEN], input: &[u8], expected_hash: &[u8], expected_keyed_hash: &[u8], expected_derive_key: &[u8], ) { let mut out = vec![0; expected_hash.len()]; let mut hasher = reference_impl::Hasher::new(); hasher.update(input); hasher.finalize(&mut out); assert_eq!(expected_hash, &out[..]); let mut out = vec![0; expected_keyed_hash.len()]; let mut hasher = reference_impl::Hasher::new_keyed(key); hasher.update(input); hasher.finalize(&mut out); assert_eq!(expected_keyed_hash, &out[..]); let mut out = vec![0; expected_derive_key.len()]; let mut hasher = reference_impl::Hasher::new_derive_key(TEST_CONTEXT); hasher.update(input); hasher.finalize(&mut out); assert_eq!(expected_derive_key, &out[..]); } fn test_reference_impl_one_at_a_time( key: &[u8; blake3::KEY_LEN], input: &[u8], expected_hash: &[u8], expected_keyed_hash: &[u8], expected_derive_key: &[u8], ) { let mut out = vec![0; expected_hash.len()]; let mut hasher = reference_impl::Hasher::new(); for &b in input { hasher.update(&[b]); } hasher.finalize(&mut out); assert_eq!(expected_hash, &out[..]); let mut out = vec![0; expected_keyed_hash.len()]; let mut hasher = reference_impl::Hasher::new_keyed(key); for &b in input { hasher.update(&[b]); } hasher.finalize(&mut out); assert_eq!(expected_keyed_hash, &out[..]); let mut out = vec![0; expected_derive_key.len()]; let mut hasher = reference_impl::Hasher::new_derive_key(TEST_CONTEXT); for &b in input { hasher.update(&[b]); } hasher.finalize(&mut out); assert_eq!(expected_derive_key, &out[..]); } fn test_incremental_all_at_once( key: &[u8; 
blake3::KEY_LEN], input: &[u8], expected_hash: &[u8], expected_keyed_hash: &[u8], expected_derive_key: &[u8], ) { let mut out = vec![0; expected_hash.len()]; let mut hasher = blake3::Hasher::new(); hasher.update(input); hasher.finalize_xof().fill(&mut out); assert_eq!(expected_hash, &out[..]); assert_eq!(&expected_hash[..32], hasher.finalize().as_bytes()); let mut out = vec![0; expected_keyed_hash.len()]; let mut hasher = blake3::Hasher::new_keyed(key); hasher.update(input); hasher.finalize_xof().fill(&mut out); assert_eq!(expected_keyed_hash, &out[..]); assert_eq!(&expected_keyed_hash[..32], hasher.finalize().as_bytes()); let mut out = vec![0; expected_derive_key.len()]; let mut hasher = blake3::Hasher::new_derive_key(TEST_CONTEXT); hasher.update(input); hasher.finalize_xof().fill(&mut out); assert_eq!(expected_derive_key, &out[..]); assert_eq!(&expected_derive_key[..32], hasher.finalize().as_bytes()); } fn test_incremental_one_at_a_time( key: &[u8; blake3::KEY_LEN], input: &[u8], expected_hash: &[u8], expected_keyed_hash: &[u8], expected_derive_key: &[u8], ) { let mut out = vec![0; expected_hash.len()]; let mut hasher = blake3::Hasher::new(); for i in 0..input.len() { hasher.update(&[input[i]]); assert_eq!(i as u64 + 1, hasher.count()); } hasher.finalize_xof().fill(&mut out); assert_eq!(expected_hash, &out[..]); assert_eq!(&expected_hash[..32], hasher.finalize().as_bytes()); let mut out = vec![0; expected_keyed_hash.len()]; let mut hasher = blake3::Hasher::new_keyed(key); for i in 0..input.len() { hasher.update(&[input[i]]); assert_eq!(i as u64 + 1, hasher.count()); } hasher.finalize_xof().fill(&mut out); assert_eq!(expected_keyed_hash, &out[..]); assert_eq!(&expected_keyed_hash[..32], hasher.finalize().as_bytes()); let mut out = vec![0; expected_derive_key.len()]; let mut hasher = blake3::Hasher::new_derive_key(TEST_CONTEXT); for i in 0..input.len() { hasher.update(&[input[i]]); assert_eq!(i as u64 + 1, hasher.count()); } hasher.finalize_xof().fill(&mut out); 
assert_eq!(expected_derive_key, &out[..]); assert_eq!(&expected_derive_key[..32], hasher.finalize().as_bytes()); } fn test_recursive( key: &[u8; blake3::KEY_LEN], input: &[u8], expected_hash: &[u8], expected_keyed_hash: &[u8], expected_derive_key: &[u8], ) { assert_eq!(&expected_hash[..32], blake3::hash(input).as_bytes()); assert_eq!( &expected_keyed_hash[..32], blake3::keyed_hash(key, input).as_bytes(), ); assert_eq!( expected_derive_key[..32], blake3::derive_key(TEST_CONTEXT, input) ); } #[test] fn run_test_vectors() { let cases = parse_test_cases(); let key: &[u8; blake3::KEY_LEN] = cases.key.as_bytes().try_into().unwrap(); for case in &cases.cases { dbg!(case.input_len); let mut input = vec![0; case.input_len]; paint_test_input(&mut input); let expected_hash = hex::decode(&case.hash).unwrap(); let expected_keyed_hash = hex::decode(&case.keyed_hash).unwrap(); let expected_derive_key = hex::decode(&case.derive_key).unwrap(); test_reference_impl_all_at_once( key, &input, &expected_hash, &expected_keyed_hash, &expected_derive_key, ); test_reference_impl_one_at_a_time( key, &input, &expected_hash, &expected_keyed_hash, &expected_derive_key, ); test_incremental_all_at_once( key, &input, &expected_hash, &expected_keyed_hash, &expected_derive_key, ); test_incremental_one_at_a_time( key, &input, &expected_hash, &expected_keyed_hash, &expected_derive_key, ); test_recursive( key, &input, &expected_hash, &expected_keyed_hash, &expected_derive_key, ); } } #[test] fn test_checked_in_vectors_up_to_date() { // Replace Windows newlines, in case Git is configured to alter // newlines when files are checked out. let json = TEST_VECTORS_JSON.replace("\r\n", "\n"); if generate_json() != json { panic!("Checked-in test_vectors.json is not up to date. 
Regenerate with `cargo run --bin generate > ./test_vectors.json`."); } } } ================================================ FILE: third-party/blake3/test_vectors/test_vectors.json ================================================ { "_comment": "Each test is an input length and three outputs, one for each of the hash, keyed_hash, and derive_key modes. The input in each case is filled with a repeating sequence of 251 bytes: 0, 1, 2, ..., 249, 250, 0, 1, ..., and so on. The key used with keyed_hash is the 32-byte ASCII string \"whats the Elvish word for friend\", also given in the `key` field below. The context string used with derive_key is the ASCII string \"BLAKE3 2019-12-27 16:29:52 test vectors context\", also given in the `context_string` field below. Outputs are encoded as hexadecimal. Each case is an extended output, and implementations should also check that the first 32 bytes match their default-length output.", "key": "whats the Elvish word for friend", "context_string": "BLAKE3 2019-12-27 16:29:52 test vectors context", "cases": [ { "input_len": 0, "hash": "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262e00f03e7b69af26b7faaf09fcd333050338ddfe085b8cc869ca98b206c08243a26f5487789e8f660afe6c99ef9e0c52b92e7393024a80459cf91f476f9ffdbda7001c22e159b402631f277ca96f2defdf1078282314e763699a31c5363165421cce14d", "keyed_hash": "92b2b75604ed3c761f9d6f62392c8a9227ad0ea3f09573e783f1498a4ed60d26b18171a2f22a4b94822c701f107153dba24918c4bae4d2945c20ece13387627d3b73cbf97b797d5e59948c7ef788f54372df45e45e4293c7dc18c1d41144a9758be58960856be1eabbe22c2653190de560ca3b2ac4aa692a9210694254c371e851bc8f", "derive_key": "2cc39783c223154fea8dfb7c1b1660f2ac2dcbd1c1de8277b0b0dd39b7e50d7d905630c8be290dfcf3e6842f13bddd573c098c3f17361f1f206b8cad9d088aa4a3f746752c6b0ce6a83b0da81d59649257cdf8eb3e9f7d4998e41021fac119deefb896224ac99f860011f73609e6e0e4540f93b273e56547dfd3aa1a035ba6689d89a0" }, { "input_len": 1, "hash": 
"2d3adedff11b61f14c886e35afa036736dcd87a74d27b5c1510225d0f592e213c3a6cb8bf623e20cdb535f8d1a5ffb86342d9c0b64aca3bce1d31f60adfa137b358ad4d79f97b47c3d5e79f179df87a3b9776ef8325f8329886ba42f07fb138bb502f4081cbcec3195c5871e6c23e2cc97d3c69a613eba131e5f1351f3f1da786545e5", "keyed_hash": "6d7878dfff2f485635d39013278ae14f1454b8c0a3a2d34bc1ab38228a80c95b6568c0490609413006fbd428eb3fd14e7756d90f73a4725fad147f7bf70fd61c4e0cf7074885e92b0e3f125978b4154986d4fb202a3f331a3fb6cf349a3a70e49990f98fe4289761c8602c4e6ab1138d31d3b62218078b2f3ba9a88e1d08d0dd4cea11", "derive_key": "b3e2e340a117a499c6cf2398a19ee0d29cca2bb7404c73063382693bf66cb06c5827b91bf889b6b97c5477f535361caefca0b5d8c4746441c57617111933158950670f9aa8a05d791daae10ac683cbef8faf897c84e6114a59d2173c3f417023a35d6983f2c7dfa57e7fc559ad751dbfb9ffab39c2ef8c4aafebc9ae973a64f0c76551" }, { "input_len": 2, "hash": "7b7015bb92cf0b318037702a6cdd81dee41224f734684c2c122cd6359cb1ee63d8386b22e2ddc05836b7c1bb693d92af006deb5ffbc4c70fb44d0195d0c6f252faac61659ef86523aa16517f87cb5f1340e723756ab65efb2f91964e14391de2a432263a6faf1d146937b35a33621c12d00be8223a7f1919cec0acd12097ff3ab00ab1", "keyed_hash": "5392ddae0e0a69d5f40160462cbd9bd889375082ff224ac9c758802b7a6fd20a9ffbf7efd13e989a6c246f96d3a96b9d279f2c4e63fb0bdff633957acf50ee1a5f658be144bab0f6f16500dee4aa5967fc2c586d85a04caddec90fffb7633f46a60786024353b9e5cebe277fcd9514217fee2267dcda8f7b31697b7c54fab6a939bf8f", "derive_key": "1f166565a7df0098ee65922d7fea425fb18b9943f19d6161e2d17939356168e6daa59cae19892b2d54f6fc9f475d26031fd1c22ae0a3e8ef7bdb23f452a15e0027629d2e867b1bb1e6ab21c71297377750826c404dfccc2406bd57a83775f89e0b075e59a7732326715ef912078e213944f490ad68037557518b79c0086de6d6f6cdd2" }, { "input_len": 3, "hash": "e1be4d7a8ab5560aa4199eea339849ba8e293d55ca0a81006726d184519e647f5b49b82f805a538c68915c1ae8035c900fd1d4b13902920fd05e1450822f36de9454b7e9996de4900c8e723512883f93f4345f8a58bfe64ee38d3ad71ab027765d25cdd0e448328a8e7a683b9a6af8b0af94fa09010d9186890b096a08471e4230a134", "keyed_hash": 
"39e67b76b5a007d4921969779fe666da67b5213b096084ab674742f0d5ec62b9b9142d0fab08e1b161efdbb28d18afc64d8f72160c958e53a950cdecf91c1a1bbab1a9c0f01def762a77e2e8545d4dec241e98a89b6db2e9a5b070fc110caae2622690bd7b76c02ab60750a3ea75426a6bb8803c370ffe465f07fb57def95df772c39f", "derive_key": "440aba35cb006b61fc17c0529255de438efc06a8c9ebf3f2ddac3b5a86705797f27e2e914574f4d87ec04c379e12789eccbfbc15892626042707802dbe4e97c3ff59dca80c1e54246b6d055154f7348a39b7d098b2b4824ebe90e104e763b2a447512132cede16243484a55a4e40a85790038bb0dcf762e8c053cabae41bbe22a5bff7" }, { "input_len": 4, "hash": "f30f5ab28fe047904037f77b6da4fea1e27241c5d132638d8bedce9d40494f328f603ba4564453e06cdcee6cbe728a4519bbe6f0d41e8a14b5b225174a566dbfa61b56afb1e452dc08c804f8c3143c9e2cc4a31bb738bf8c1917b55830c6e65797211701dc0b98daa1faeaa6ee9e56ab606ce03a1a881e8f14e87a4acf4646272cfd12", "keyed_hash": "7671dde590c95d5ac9616651ff5aa0a27bee5913a348e053b8aa9108917fe070116c0acff3f0d1fa97ab38d813fd46506089118147d83393019b068a55d646251ecf81105f798d76a10ae413f3d925787d6216a7eb444e510fd56916f1d753a5544ecf0072134a146b2615b42f50c179f56b8fae0788008e3e27c67482349e249cb86a", "derive_key": "f46085c8190d69022369ce1a18880e9b369c135eb93f3c63550d3e7630e91060fbd7d8f4258bec9da4e05044f88b91944f7cab317a2f0c18279629a3867fad0662c9ad4d42c6f27e5b124da17c8c4f3a94a025ba5d1b623686c6099d202a7317a82e3d95dae46a87de0555d727a5df55de44dab799a20dffe239594d6e99ed17950910" }, { "input_len": 5, "hash": "b40b44dfd97e7a84a996a91af8b85188c66c126940ba7aad2e7ae6b385402aa2ebcfdac6c5d32c31209e1f81a454751280db64942ce395104e1e4eaca62607de1c2ca748251754ea5bbe8c20150e7f47efd57012c63b3c6a6632dc1c7cd15f3e1c999904037d60fac2eb9397f2adbe458d7f264e64f1e73aa927b30988e2aed2f03620", "keyed_hash": "73ac69eecf286894d8102018a6fc729f4b1f4247d3703f69bdc6a5fe3e0c84616ab199d1f2f3e53bffb17f0a2209fe8b4f7d4c7bae59c2bc7d01f1ff94c67588cc6b38fa6024886f2c078bfe09b5d9e6584cd6c521c3bb52f4de7687b37117a2dbbec0d59e92fa9a8cc3240d4432f91757aabcae03e87431dac003e7d73574bfdd8218", "derive_key": 
"1f24eda69dbcb752847ec3ebb5dd42836d86e58500c7c98d906ecd82ed9ae47f6f48a3f67e4e43329c9a89b1ca526b9b35cbf7d25c1e353baffb590fd79be58ddb6c711f1a6b60e98620b851c688670412fcb0435657ba6b638d21f0f2a04f2f6b0bd8834837b10e438d5f4c7c2c71299cf7586ea9144ed09253d51f8f54dd6bff719d" }, { "input_len": 6, "hash": "06c4e8ffb6872fad96f9aaca5eee1553eb62aed0ad7198cef42e87f6a616c844611a30c4e4f37fe2fe23c0883cde5cf7059d88b657c7ed2087e3d210925ede716435d6d5d82597a1e52b9553919e804f5656278bd739880692c94bff2824d8e0b48cac1d24682699e4883389dc4f2faa2eb3b4db6e39debd5061ff3609916f3e07529a", "keyed_hash": "82d3199d0013035682cc7f2a399d4c212544376a839aa863a0f4c91220ca7a6dc2ffb3aa05f2631f0fa9ac19b6e97eb7e6669e5ec254799350c8b8d189e8807800842a5383c4d907c932f34490aaf00064de8cdb157357bde37c1504d2960034930887603abc5ccb9f5247f79224baff6120a3c622a46d7b1bcaee02c5025460941256", "derive_key": "be96b30b37919fe4379dfbe752ae77b4f7e2ab92f7ff27435f76f2f065f6a5f435ae01a1d14bd5a6b3b69d8cbd35f0b01ef2173ff6f9b640ca0bd4748efa398bf9a9c0acd6a66d9332fdc9b47ffe28ba7ab6090c26747b85f4fab22f936b71eb3f64613d8bd9dfabe9bb68da19de78321b481e5297df9e40ec8a3d662f3e1479c65de0" }, { "input_len": 7, "hash": "3f8770f387faad08faa9d8414e9f449ac68e6ff0417f673f602a646a891419fe66036ef6e6d1a8f54baa9fed1fc11c77cfb9cff65bae915045027046ebe0c01bf5a941f3bb0f73791d3fc0b84370f9f30af0cd5b0fc334dd61f70feb60dad785f070fef1f343ed933b49a5ca0d16a503f599a365a4296739248b28d1a20b0e2cc8975c", "keyed_hash": "af0a7ec382aedc0cfd626e49e7628bc7a353a4cb108855541a5651bf64fbb28a7c5035ba0f48a9c73dabb2be0533d02e8fd5d0d5639a18b2803ba6bf527e1d145d5fd6406c437b79bcaad6c7bdf1cf4bd56a893c3eb9510335a7a798548c6753f74617bede88bef924ba4b334f8852476d90b26c5dc4c3668a2519266a562c6c8034a6", "derive_key": "dc3b6485f9d94935329442916b0d059685ba815a1fa2a14107217453a7fc9f0e66266db2ea7c96843f9d8208e600a73f7f45b2f55b9e6d6a7ccf05daae63a3fdd10b25ac0bd2e224ce8291f88c05976d575df998477db86fb2cfbbf91725d62cb57acfeb3c2d973b89b503c2b60dde85a7802b69dc1ac2007d5623cbea8cbfb6b181f5" }, { "input_len": 8, 
"hash": "2351207d04fc16ade43ccab08600939c7c1fa70a5c0aaca76063d04c3228eaeb725d6d46ceed8f785ab9f2f9b06acfe398c6699c6129da084cb531177445a682894f9685eaf836999221d17c9a64a3a057000524cd2823986db378b074290a1a9b93a22e135ed2c14c7e20c6d045cd00b903400374126676ea78874d79f2dd7883cf5c", "keyed_hash": "be2f5495c61cba1bb348a34948c004045e3bd4dae8f0fe82bf44d0da245a060048eb5e68ce6dea1eb0229e144f578b3aa7e9f4f85febd135df8525e6fe40c6f0340d13dd09b255ccd5112a94238f2be3c0b5b7ecde06580426a93e0708555a265305abf86d874e34b4995b788e37a823491f25127a502fe0704baa6bfdf04e76c13276", "derive_key": "2b166978cef14d9d438046c720519d8b1cad707e199746f1562d0c87fbd32940f0e2545a96693a66654225ebbaac76d093bfa9cd8f525a53acb92a861a98c42e7d1c4ae82e68ab691d510012edd2a728f98cd4794ef757e94d6546961b4f280a51aac339cc95b64a92b83cc3f26d8af8dfb4c091c240acdb4d47728d23e7148720ef04" }, { "input_len": 63, "hash": "e9bc37a594daad83be9470df7f7b3798297c3d834ce80ba85d6e207627b7db7b1197012b1e7d9af4d7cb7bdd1f3bb49a90a9b5dec3ea2bbc6eaebce77f4e470cbf4687093b5352f04e4a4570fba233164e6acc36900e35d185886a827f7ea9bdc1e5c3ce88b095a200e62c10c043b3e9bc6cb9b6ac4dfa51794b02ace9f98779040755", "keyed_hash": "bb1eb5d4afa793c1ebdd9fb08def6c36d10096986ae0cfe148cd101170ce37aea05a63d74a840aecd514f654f080e51ac50fd617d22610d91780fe6b07a26b0847abb38291058c97474ef6ddd190d30fc318185c09ca1589d2024f0a6f16d45f11678377483fa5c005b2a107cb9943e5da634e7046855eaa888663de55d6471371d55d", "derive_key": "b6451e30b953c206e34644c6803724e9d2725e0893039cfc49584f991f451af3b89e8ff572d3da4f4022199b9563b9d70ebb616efff0763e9abec71b550f1371e233319c4c4e74da936ba8e5bbb29a598e007a0bbfa929c99738ca2cc098d59134d11ff300c39f82e2fce9f7f0fa266459503f64ab9913befc65fddc474f6dc1c67669" }, { "input_len": 64, "hash": "4eed7141ea4a5cd4b788606bd23f46e212af9cacebacdc7d1f4c6dc7f2511b98fc9cc56cb831ffe33ea8e7e1d1df09b26efd2767670066aa82d023b1dfe8ab1b2b7fbb5b97592d46ffe3e05a6a9b592e2949c74160e4674301bc3f97e04903f8c6cf95b863174c33228924cdef7ae47559b10b294acd660666c4538833582b43f82d74", "keyed_hash": 
"ba8ced36f327700d213f120b1a207a3b8c04330528586f414d09f2f7d9ccb7e68244c26010afc3f762615bbac552a1ca909e67c83e2fd5478cf46b9e811efccc93f77a21b17a152ebaca1695733fdb086e23cd0eb48c41c034d52523fc21236e5d8c9255306e48d52ba40b4dac24256460d56573d1312319afcf3ed39d72d0bfc69acb", "derive_key": "a5c4a7053fa86b64746d4bb688d06ad1f02a18fce9afd3e818fefaa7126bf73e9b9493a9befebe0bf0c9509fb3105cfa0e262cde141aa8e3f2c2f77890bb64a4cca96922a21ead111f6338ad5244f2c15c44cb595443ac2ac294231e31be4a4307d0a91e874d36fc9852aeb1265c09b6e0cda7c37ef686fbbcab97e8ff66718be048bb" }, { "input_len": 65, "hash": "de1e5fa0be70df6d2be8fffd0e99ceaa8eb6e8c93a63f2d8d1c30ecb6b263dee0e16e0a4749d6811dd1d6d1265c29729b1b75a9ac346cf93f0e1d7296dfcfd4313b3a227faaaaf7757cc95b4e87a49be3b8a270a12020233509b1c3632b3485eef309d0abc4a4a696c9decc6e90454b53b000f456a3f10079072baaf7a981653221f2c", "keyed_hash": "c0a4edefa2d2accb9277c371ac12fcdbb52988a86edc54f0716e1591b4326e72d5e795f46a596b02d3d4bfb43abad1e5d19211152722ec1f20fef2cd413e3c22f2fc5da3d73041275be6ede3517b3b9f0fc67ade5956a672b8b75d96cb43294b9041497de92637ed3f2439225e683910cb3ae923374449ca788fb0f9bea92731bc26ad", "derive_key": "51fd05c3c1cfbc8ed67d139ad76f5cf8236cd2acd26627a30c104dfd9d3ff8a82b02e8bd36d8498a75ad8c8e9b15eb386970283d6dd42c8ae7911cc592887fdbe26a0a5f0bf821cd92986c60b2502c9be3f98a9c133a7e8045ea867e0828c7252e739321f7c2d65daee4468eb4429efae469a42763f1f94977435d10dccae3e3dce88d" }, { "input_len": 127, "hash": "d81293fda863f008c09e92fc382a81f5a0b4a1251cba1634016a0f86a6bd640de3137d477156d1fde56b0cf36f8ef18b44b2d79897bece12227539ac9ae0a5119da47644d934d26e74dc316145dcb8bb69ac3f2e05c242dd6ee06484fcb0e956dc44355b452c5e2bbb5e2b66e99f5dd443d0cbcaaafd4beebaed24ae2f8bb672bcef78", "keyed_hash": "c64200ae7dfaf35577ac5a9521c47863fb71514a3bcad18819218b818de85818ee7a317aaccc1458f78d6f65f3427ec97d9c0adb0d6dacd4471374b621b7b5f35cd54663c64dbe0b9e2d95632f84c611313ea5bd90b71ce97b3cf645776f3adc11e27d135cbadb9875c2bf8d3ae6b02f8a0206aba0c35bfe42574011931c9a255ce6dc", "derive_key": 
"c91c090ceee3a3ac81902da31838012625bbcd73fcb92e7d7e56f78deba4f0c3feeb3974306966ccb3e3c69c337ef8a45660ad02526306fd685c88542ad00f759af6dd1adc2e50c2b8aac9f0c5221ff481565cf6455b772515a69463223202e5c371743e35210bbbbabd89651684107fd9fe493c937be16e39cfa7084a36207c99bea3" }, { "input_len": 128, "hash": "f17e570564b26578c33bb7f44643f539624b05df1a76c81f30acd548c44b45efa69faba091427f9c5c4caa873aa07828651f19c55bad85c47d1368b11c6fd99e47ecba5820a0325984d74fe3e4058494ca12e3f1d3293d0010a9722f7dee64f71246f75e9361f44cc8e214a100650db1313ff76a9f93ec6e84edb7add1cb4a95019b0c", "keyed_hash": "b04fe15577457267ff3b6f3c947d93be581e7e3a4b018679125eaf86f6a628ecd86bbe0001f10bda47e6077b735016fca8119da11348d93ca302bbd125bde0db2b50edbe728a620bb9d3e6f706286aedea973425c0b9eedf8a38873544cf91badf49ad92a635a93f71ddfcee1eae536c25d1b270956be16588ef1cfef2f1d15f650bd5", "derive_key": "81720f34452f58a0120a58b6b4608384b5c51d11f39ce97161a0c0e442ca022550e7cd651e312f0b4c6afb3c348ae5dd17d2b29fab3b894d9a0034c7b04fd9190cbd90043ff65d1657bbc05bfdecf2897dd894c7a1b54656d59a50b51190a9da44db426266ad6ce7c173a8c0bbe091b75e734b4dadb59b2861cd2518b4e7591e4b83c9" }, { "input_len": 129, "hash": "683aaae9f3c5ba37eaaf072aed0f9e30bac0865137bae68b1fde4ca2aebdcb12f96ffa7b36dd78ba321be7e842d364a62a42e3746681c8bace18a4a8a79649285c7127bf8febf125be9de39586d251f0d41da20980b70d35e3dac0eee59e468a894fa7e6a07129aaad09855f6ad4801512a116ba2b7841e6cfc99ad77594a8f2d181a7", "keyed_hash": "d4a64dae6cdccbac1e5287f54f17c5f985105457c1a2ec1878ebd4b57e20d38f1c9db018541eec241b748f87725665b7b1ace3e0065b29c3bcb232c90e37897fa5aaee7e1e8a2ecfcd9b51463e42238cfdd7fee1aecb3267fa7f2128079176132a412cd8aaf0791276f6b98ff67359bd8652ef3a203976d5ff1cd41885573487bcd683", "derive_key": "938d2d4435be30eafdbb2b7031f7857c98b04881227391dc40db3c7b21f41fc18d72d0f9c1de5760e1941aebf3100b51d64644cb459eb5d20258e233892805eb98b07570ef2a1787cd48e117c8d6a63a68fd8fc8e59e79dbe63129e88352865721c8d5f0cf183f85e0609860472b0d6087cefdd186d984b21542c1c780684ed6832d8d" }, { "input_len": 
1023, "hash": "10108970eeda3eb932baac1428c7a2163b0e924c9a9e25b35bba72b28f70bd11a182d27a591b05592b15607500e1e8dd56bc6c7fc063715b7a1d737df5bad3339c56778957d870eb9717b57ea3d9fb68d1b55127bba6a906a4a24bbd5acb2d123a37b28f9e9a81bbaae360d58f85e5fc9d75f7c370a0cc09b6522d9c8d822f2f28f485", "keyed_hash": "c951ecdf03288d0fcc96ee3413563d8a6d3589547f2c2fb36d9786470f1b9d6e890316d2e6d8b8c25b0a5b2180f94fb1a158ef508c3cde45e2966bd796a696d3e13efd86259d756387d9becf5c8bf1ce2192b87025152907b6d8cc33d17826d8b7b9bc97e38c3c85108ef09f013e01c229c20a83d9e8efac5b37470da28575fd755a10", "derive_key": "74a16c1c3d44368a86e1ca6df64be6a2f64cce8f09220787450722d85725dea59c413264404661e9e4d955409dfe4ad3aa487871bcd454ed12abfe2c2b1eb7757588cf6cb18d2eccad49e018c0d0fec323bec82bf1644c6325717d13ea712e6840d3e6e730d35553f59eff5377a9c350bcc1556694b924b858f329c44ee64b884ef00d" }, { "input_len": 1024, "hash": "42214739f095a406f3fc83deb889744ac00df831c10daa55189b5d121c855af71cf8107265ecdaf8505b95d8fcec83a98a6a96ea5109d2c179c47a387ffbb404756f6eeae7883b446b70ebb144527c2075ab8ab204c0086bb22b7c93d465efc57f8d917f0b385c6df265e77003b85102967486ed57db5c5ca170ba441427ed9afa684e", "keyed_hash": "75c46f6f3d9eb4f55ecaaee480db732e6c2105546f1e675003687c31719c7ba4a78bc838c72852d4f49c864acb7adafe2478e824afe51c8919d06168414c265f298a8094b1ad813a9b8614acabac321f24ce61c5a5346eb519520d38ecc43e89b5000236df0597243e4d2493fd626730e2ba17ac4d8824d09d1a4a8f57b8227778e2de", "derive_key": "7356cd7720d5b66b6d0697eb3177d9f8d73a4a5c5e968896eb6a6896843027066c23b601d3ddfb391e90d5c8eccdef4ae2a264bce9e612ba15e2bc9d654af1481b2e75dbabe615974f1070bba84d56853265a34330b4766f8e75edd1f4a1650476c10802f22b64bd3919d246ba20a17558bc51c199efdec67e80a227251808d8ce5bad" }, { "input_len": 1025, "hash": "d00278ae47eb27b34faecf67b4fe263f82d5412916c1ffd97c8cb7fb814b8444f4c4a22b4b399155358a994e52bf255de60035742ec71bd08ac275a1b51cc6bfe332b0ef84b409108cda080e6269ed4b3e2c3f7d722aa4cdc98d16deb554e5627be8f955c98e1d5f9565a9194cad0c4285f93700062d9595adb992ae68ff12800ab67a", 
"keyed_hash": "357dc55de0c7e382c900fd6e320acc04146be01db6a8ce7210b7189bd664ea69362396b77fdc0d2634a552970843722066c3c15902ae5097e00ff53f1e116f1cd5352720113a837ab2452cafbde4d54085d9cf5d21ca613071551b25d52e69d6c81123872b6f19cd3bc1333edf0c52b94de23ba772cf82636cff4542540a7738d5b930", "derive_key": "effaa245f065fbf82ac186839a249707c3bddf6d3fdda22d1b95a3c970379bcb5d31013a167509e9066273ab6e2123bc835b408b067d88f96addb550d96b6852dad38e320b9d940f86db74d398c770f462118b35d2724efa13da97194491d96dd37c3c09cbef665953f2ee85ec83d88b88d11547a6f911c8217cca46defa2751e7f3ad" }, { "input_len": 2048, "hash": "e776b6028c7cd22a4d0ba182a8bf62205d2ef576467e838ed6f2529b85fba24a9a60bf80001410ec9eea6698cd537939fad4749edd484cb541aced55cd9bf54764d063f23f6f1e32e12958ba5cfeb1bf618ad094266d4fc3c968c2088f677454c288c67ba0dba337b9d91c7e1ba586dc9a5bc2d5e90c14f53a8863ac75655461cea8f9", "keyed_hash": "879cf1fa2ea0e79126cb1063617a05b6ad9d0b696d0d757cf053439f60a99dd10173b961cd574288194b23ece278c330fbb8585485e74967f31352a8183aa782b2b22f26cdcadb61eed1a5bc144b8198fbb0c13abbf8e3192c145d0a5c21633b0ef86054f42809df823389ee40811a5910dcbd1018af31c3b43aa55201ed4edaac74fe", "derive_key": "7b2945cb4fef70885cc5d78a87bf6f6207dd901ff239201351ffac04e1088a23e2c11a1ebffcea4d80447867b61badb1383d842d4e79645d48dd82ccba290769caa7af8eaa1bd78a2a5e6e94fbdab78d9c7b74e894879f6a515257ccf6f95056f4e25390f24f6b35ffbb74b766202569b1d797f2d4bd9d17524c720107f985f4ddc583" }, { "input_len": 2049, "hash": "5f4d72f40d7a5f82b15ca2b2e44b1de3c2ef86c426c95c1af0b687952256303096de31d71d74103403822a2e0bc1eb193e7aecc9643a76b7bbc0c9f9c52e8783aae98764ca468962b5c2ec92f0c74eb5448d519713e09413719431c802f948dd5d90425a4ecdadece9eb178d80f26efccae630734dff63340285adec2aed3b51073ad3", "keyed_hash": "9f29700902f7c86e514ddc4df1e3049f258b2472b6dd5267f61bf13983b78dd5f9a88abfefdfa1e00b418971f2b39c64ca621e8eb37fceac57fd0c8fc8e117d43b81447be22d5d8186f8f5919ba6bcc6846bd7d50726c06d245672c2ad4f61702c646499ee1173daa061ffe15bf45a631e2946d616a4c345822f1151284712f76b2b0e", 
"derive_key": "2ea477c5515cc3dd606512ee72bb3e0e758cfae7232826f35fb98ca1bcbdf27316d8e9e79081a80b046b60f6a263616f33ca464bd78d79fa18200d06c7fc9bffd808cc4755277a7d5e09da0f29ed150f6537ea9bed946227ff184cc66a72a5f8c1e4bd8b04e81cf40fe6dc4427ad5678311a61f4ffc39d195589bdbc670f63ae70f4b6" }, { "input_len": 3072, "hash": "b98cb0ff3623be03326b373de6b9095218513e64f1ee2edd2525c7ad1e5cffd29a3f6b0b978d6608335c09dc94ccf682f9951cdfc501bfe47b9c9189a6fc7b404d120258506341a6d802857322fbd20d3e5dae05b95c88793fa83db1cb08e7d8008d1599b6209d78336e24839724c191b2a52a80448306e0daa84a3fdb566661a37e11", "keyed_hash": "044a0e7b172a312dc02a4c9a818c036ffa2776368d7f528268d2e6b5df19177022f302d0529e4174cc507c463671217975e81dab02b8fdeb0d7ccc7568dd22574c783a76be215441b32e91b9a904be8ea81f7a0afd14bad8ee7c8efc305ace5d3dd61b996febe8da4f56ca0919359a7533216e2999fc87ff7d8f176fbecb3d6f34278b", "derive_key": "050df97f8c2ead654d9bb3ab8c9178edcd902a32f8495949feadcc1e0480c46b3604131bbd6e3ba573b6dd682fa0a63e5b165d39fc43a625d00207607a2bfeb65ff1d29292152e26b298868e3b87be95d6458f6f2ce6118437b632415abe6ad522874bcd79e4030a5e7bad2efa90a7a7c67e93f0a18fb28369d0a9329ab5c24134ccb0" }, { "input_len": 3073, "hash": "7124b49501012f81cc7f11ca069ec9226cecb8a2c850cfe644e327d22d3e1cd39a27ae3b79d68d89da9bf25bc27139ae65a324918a5f9b7828181e52cf373c84f35b639b7fccbb985b6f2fa56aea0c18f531203497b8bbd3a07ceb5926f1cab74d14bd66486d9a91eba99059a98bd1cd25876b2af5a76c3e9eed554ed72ea952b603bf", "keyed_hash": "68dede9bef00ba89e43f31a6825f4cf433389fedae75c04ee9f0cf16a427c95a96d6da3fe985054d3478865be9a092250839a697bbda74e279e8a9e69f0025e4cfddd6cfb434b1cd9543aaf97c635d1b451a4386041e4bb100f5e45407cbbc24fa53ea2de3536ccb329e4eb9466ec37093a42cf62b82903c696a93a50b702c80f3c3c5", "derive_key": "72613c9ec9ff7e40f8f5c173784c532ad852e827dba2bf85b2ab4b76f7079081576288e552647a9d86481c2cae75c2dd4e7c5195fb9ada1ef50e9c5098c249d743929191441301c69e1f48505a4305ec1778450ee48b8e69dc23a25960fe33070ea549119599760a8a2d28aeca06b8c5e9ba58bc19e11fe57b6ee98aa44b2a8e6b14a5" }, { 
"input_len": 4096, "hash": "015094013f57a5277b59d8475c0501042c0b642e531b0a1c8f58d2163229e9690289e9409ddb1b99768eafe1623da896faf7e1114bebeadc1be30829b6f8af707d85c298f4f0ff4d9438aef948335612ae921e76d411c3a9111df62d27eaf871959ae0062b5492a0feb98ef3ed4af277f5395172dbe5c311918ea0074ce0036454f620", "keyed_hash": "befc660aea2f1718884cd8deb9902811d332f4fc4a38cf7c7300d597a081bfc0bbb64a36edb564e01e4b4aaf3b060092a6b838bea44afebd2deb8298fa562b7b597c757b9df4c911c3ca462e2ac89e9a787357aaf74c3b56d5c07bc93ce899568a3eb17d9250c20f6c5f6c1e792ec9a2dcb715398d5a6ec6d5c54f586a00403a1af1de", "derive_key": "1e0d7f3db8c414c97c6307cbda6cd27ac3b030949da8e23be1a1a924ad2f25b9d78038f7b198596c6cc4a9ccf93223c08722d684f240ff6569075ed81591fd93f9fff1110b3a75bc67e426012e5588959cc5a4c192173a03c00731cf84544f65a2fb9378989f72e9694a6a394a8a30997c2e67f95a504e631cd2c5f55246024761b245" }, { "input_len": 4097, "hash": "9b4052b38f1c5fc8b1f9ff7ac7b27cd242487b3d890d15c96a1c25b8aa0fb99505f91b0b5600a11251652eacfa9497b31cd3c409ce2e45cfe6c0a016967316c426bd26f619eab5d70af9a418b845c608840390f361630bd497b1ab44019316357c61dbe091ce72fc16dc340ac3d6e009e050b3adac4b5b2c92e722cffdc46501531956", "keyed_hash": "00df940cd36bb9fa7cbbc3556744e0dbc8191401afe70520ba292ee3ca80abbc606db4976cfdd266ae0abf667d9481831ff12e0caa268e7d3e57260c0824115a54ce595ccc897786d9dcbf495599cfd90157186a46ec800a6763f1c59e36197e9939e900809f7077c102f888caaf864b253bc41eea812656d46742e4ea42769f89b83f", "derive_key": "aca51029626b55fda7117b42a7c211f8c6e9ba4fe5b7a8ca922f34299500ead8a897f66a400fed9198fd61dd2d58d382458e64e100128075fc54b860934e8de2e84170734b06e1d212a117100820dbc48292d148afa50567b8b84b1ec336ae10d40c8c975a624996e12de31abbe135d9d159375739c333798a80c64ae895e51e22f3ad" }, { "input_len": 5120, "hash": 
"9cadc15fed8b5d854562b26a9536d9707cadeda9b143978f319ab34230535833acc61c8fdc114a2010ce8038c853e121e1544985133fccdd0a2d507e8e615e611e9a0ba4f47915f49e53d721816a9198e8b30f12d20ec3689989175f1bf7a300eee0d9321fad8da232ece6efb8e9fd81b42ad161f6b9550a069e66b11b40487a5f5059", "keyed_hash": "2c493e48e9b9bf31e0553a22b23503c0a3388f035cece68eb438d22fa1943e209b4dc9209cd80ce7c1f7c9a744658e7e288465717ae6e56d5463d4f80cdb2ef56495f6a4f5487f69749af0c34c2cdfa857f3056bf8d807336a14d7b89bf62bef2fb54f9af6a546f818dc1e98b9e07f8a5834da50fa28fb5874af91bf06020d1bf0120e", "derive_key": "7a7acac8a02adcf3038d74cdd1d34527de8a0fcc0ee3399d1262397ce5817f6055d0cefd84d9d57fe792d65a278fd20384ac6c30fdb340092f1a74a92ace99c482b28f0fc0ef3b923e56ade20c6dba47e49227166251337d80a037e987ad3a7f728b5ab6dfafd6e2ab1bd583a95d9c895ba9c2422c24ea0f62961f0dca45cad47bfa0d" }, { "input_len": 5121, "hash": "628bd2cb2004694adaab7bbd778a25df25c47b9d4155a55f8fbd79f2fe154cff96adaab0613a6146cdaabe498c3a94e529d3fc1da2bd08edf54ed64d40dcd6777647eac51d8277d70219a9694334a68bc8f0f23e20b0ff70ada6f844542dfa32cd4204ca1846ef76d811cdb296f65e260227f477aa7aa008bac878f72257484f2b6c95", "keyed_hash": "6ccf1c34753e7a044db80798ecd0782a8f76f33563accaddbfbb2e0ea4b2d0240d07e63f13667a8d1490e5e04f13eb617aea16a8c8a5aaed1ef6fbde1b0515e3c81050b361af6ead126032998290b563e3caddeaebfab592e155f2e161fb7cba939092133f23f9e65245e58ec23457b78a2e8a125588aad6e07d7f11a85b88d375b72d", "derive_key": "b07f01e518e702f7ccb44a267e9e112d403a7b3f4883a47ffbed4b48339b3c341a0add0ac032ab5aaea1e4e5b004707ec5681ae0fcbe3796974c0b1cf31a194740c14519273eedaabec832e8a784b6e7cfc2c5952677e6c3f2c3914454082d7eb1ce1766ac7d75a4d3001fc89544dd46b5147382240d689bbbaefc359fb6ae30263165" }, { "input_len": 6144, "hash": "3e2e5b74e048f3add6d21faab3f83aa44d3b2278afb83b80b3c35164ebeca2054d742022da6fdda444ebc384b04a54c3ac5839b49da7d39f6d8a9db03deab32aade156c1c0311e9b3435cde0ddba0dce7b26a376cad121294b689193508dd63151603c6ddb866ad16c2ee41585d1633a2cea093bea714f4c5d6b903522045b20395c83", "keyed_hash": 
"3d6b6d21281d0ade5b2b016ae4034c5dec10ca7e475f90f76eac7138e9bc8f1dc35754060091dc5caf3efabe0603c60f45e415bb3407db67e6beb3d11cf8e4f7907561f05dace0c15807f4b5f389c841eb114d81a82c02a00b57206b1d11fa6e803486b048a5ce87105a686dee041207e095323dfe172df73deb8c9532066d88f9da7e", "derive_key": "2a95beae63ddce523762355cf4b9c1d8f131465780a391286a5d01abb5683a1597099e3c6488aab6c48f3c15dbe1942d21dbcdc12115d19a8b8465fb54e9053323a9178e4275647f1a9927f6439e52b7031a0b465c861a3fc531527f7758b2b888cf2f20582e9e2c593709c0a44f9c6e0f8b963994882ea4168827823eef1f64169fef" }, { "input_len": 6145, "hash": "f1323a8631446cc50536a9f705ee5cb619424d46887f3c376c695b70e0f0507f18a2cfdd73c6e39dd75ce7c1c6e3ef238fd54465f053b25d21044ccb2093beb015015532b108313b5829c3621ce324b8e14229091b7c93f32db2e4e63126a377d2a63a3597997d4f1cba59309cb4af240ba70cebff9a23d5e3ff0cdae2cfd54e070022", "keyed_hash": "9ac301e9e39e45e3250a7e3b3df701aa0fb6889fbd80eeecf28dbc6300fbc539f3c184ca2f59780e27a576c1d1fb9772e99fd17881d02ac7dfd39675aca918453283ed8c3169085ef4a466b91c1649cc341dfdee60e32231fc34c9c4e0b9a2ba87ca8f372589c744c15fd6f985eec15e98136f25beeb4b13c4e43dc84abcc79cd4646c", "derive_key": "379bcc61d0051dd489f686c13de00d5b14c505245103dc040d9e4dd1facab8e5114493d029bdbd295aaa744a59e31f35c7f52dba9c3642f773dd0b4262a9980a2aef811697e1305d37ba9d8b6d850ef07fe41108993180cf779aeece363704c76483458603bbeeb693cffbbe5588d1f3535dcad888893e53d977424bb707201569a8d2" }, { "input_len": 7168, "hash": "61da957ec2499a95d6b8023e2b0e604ec7f6b50e80a9678b89d2628e99ada77a5707c321c83361793b9af62a40f43b523df1c8633cecb4cd14d00bdc79c78fca5165b863893f6d38b02ff7236c5a9a8ad2dba87d24c547cab046c29fc5bc1ed142e1de4763613bb162a5a538e6ef05ed05199d751f9eb58d332791b8d73fb74e4fce95", "keyed_hash": "b42835e40e9d4a7f42ad8cc04f85a963a76e18198377ed84adddeaecacc6f3fca2f01d5277d69bb681c70fa8d36094f73ec06e452c80d2ff2257ed82e7ba348400989a65ee8daa7094ae0933e3d2210ac6395c4af24f91c2b590ef87d7788d7066ea3eaebca4c08a4f14b9a27644f99084c3543711b64a070b94f2c9d1d8a90d035d52", "derive_key": 
"11c37a112765370c94a51415d0d651190c288566e295d505defdad895dae223730d5a5175a38841693020669c7638f40b9bc1f9f39cf98bda7a5b54ae24218a800a2116b34665aa95d846d97ea988bfcb53dd9c055d588fa21ba78996776ea6c40bc428b53c62b5f3ccf200f647a5aae8067f0ea1976391fcc72af1945100e2a6dcb88" }, { "input_len": 7169, "hash": "a003fc7a51754a9b3c7fae0367ab3d782dccf28855a03d435f8cfe74605e781798a8b20534be1ca9eb2ae2df3fae2ea60e48c6fb0b850b1385b5de0fe460dbe9d9f9b0d8db4435da75c601156df9d047f4ede008732eb17adc05d96180f8a73548522840779e6062d643b79478a6e8dbce68927f36ebf676ffa7d72d5f68f050b119c8", "keyed_hash": "ed9b1a922c046fdb3d423ae34e143b05ca1bf28b710432857bf738bcedbfa5113c9e28d72fcbfc020814ce3f5d4fc867f01c8f5b6caf305b3ea8a8ba2da3ab69fabcb438f19ff11f5378ad4484d75c478de425fb8e6ee809b54eec9bdb184315dc856617c09f5340451bf42fd3270a7b0b6566169f242e533777604c118a6358250f54", "derive_key": "554b0a5efea9ef183f2f9b931b7497995d9eb26f5c5c6dad2b97d62fc5ac31d99b20652c016d88ba2a611bbd761668d5eda3e568e940faae24b0d9991c3bd25a65f770b89fdcadabcb3d1a9c1cb63e69721cacf1ae69fefdcef1e3ef41bc5312ccc17222199e47a26552c6adc460cf47a72319cb5039369d0060eaea59d6c65130f1dd" }, { "input_len": 8192, "hash": "aae792484c8efe4f19e2ca7d371d8c467ffb10748d8a5a1ae579948f718a2a635fe51a27db045a567c1ad51be5aa34c01c6651c4d9b5b5ac5d0fd58cf18dd61a47778566b797a8c67df7b1d60b97b19288d2d877bb2df417ace009dcb0241ca1257d62712b6a4043b4ff33f690d849da91ea3bf711ed583cb7b7a7da2839ba71309bbf", "keyed_hash": "dc9637c8845a770b4cbf76b8daec0eebf7dc2eac11498517f08d44c8fc00d58a4834464159dcbc12a0ba0c6d6eb41bac0ed6585cabfe0aca36a375e6c5480c22afdc40785c170f5a6b8a1107dbee282318d00d915ac9ed1143ad40765ec120042ee121cd2baa36250c618adaf9e27260fda2f94dea8fb6f08c04f8f10c78292aa46102", "derive_key": "ad01d7ae4ad059b0d33baa3c01319dcf8088094d0359e5fd45d6aeaa8b2d0c3d4c9e58958553513b67f84f8eac653aeeb02ae1d5672dcecf91cd9985a0e67f4501910ecba25555395427ccc7241d70dc21c190e2aadee875e5aae6bf1912837e53411dabf7a56cbf8e4fb780432b0d7fe6cec45024a0788cf5874616407757e9e6bef7" }, { "input_len": 
8193, "hash": "bab6c09cb8ce8cf459261398d2e7aef35700bf488116ceb94a36d0f5f1b7bc3bb2282aa69be089359ea1154b9a9286c4a56af4de975a9aa4a5c497654914d279bea60bb6d2cf7225a2fa0ff5ef56bbe4b149f3ed15860f78b4e2ad04e158e375c1e0c0b551cd7dfc82f1b155c11b6b3ed51ec9edb30d133653bb5709d1dbd55f4e1ff6", "keyed_hash": "954a2a75420c8d6547e3ba5b98d963e6fa6491addc8c023189cc519821b4a1f5f03228648fd983aef045c2fa8290934b0866b615f585149587dda2299039965328835a2b18f1d63b7e300fc76ff260b571839fe44876a4eae66cbac8c67694411ed7e09df51068a22c6e67d6d3dd2cca8ff12e3275384006c80f4db68023f24eebba57", "derive_key": "af1e0346e389b17c23200270a64aa4e1ead98c61695d917de7d5b00491c9b0f12f20a01d6d622edf3de026a4db4e4526225debb93c1237934d71c7340bb5916158cbdafe9ac3225476b6ab57a12357db3abbad7a26c6e66290e44034fb08a20a8d0ec264f309994d2810c49cfba6989d7abb095897459f5425adb48aba07c5fb3c83c0" }, { "input_len": 16384, "hash": "f875d6646de28985646f34ee13be9a576fd515f76b5b0a26bb324735041ddde49d764c270176e53e97bdffa58d549073f2c660be0e81293767ed4e4929f9ad34bbb39a529334c57c4a381ffd2a6d4bfdbf1482651b172aa883cc13408fa67758a3e47503f93f87720a3177325f7823251b85275f64636a8f1d599c2e49722f42e93893", "keyed_hash": "9e9fc4eb7cf081ea7c47d1807790ed211bfec56aa25bb7037784c13c4b707b0df9e601b101e4cf63a404dfe50f2e1865bb12edc8fca166579ce0c70dba5a5c0fc960ad6f3772183416a00bd29d4c6e651ea7620bb100c9449858bf14e1ddc9ecd35725581ca5b9160de04060045993d972571c3e8f71e9d0496bfa744656861b169d65", "derive_key": "160e18b5878cd0df1c3af85eb25a0db5344d43a6fbd7a8ef4ed98d0714c3f7e160dc0b1f09caa35f2f417b9ef309dfe5ebd67f4c9507995a531374d099cf8ae317542e885ec6f589378864d3ea98716b3bbb65ef4ab5e0ab5bb298a501f19a41ec19af84a5e6b428ecd813b1a47ed91c9657c3fba11c406bc316768b58f6802c9e9b57" }, { "input_len": 31744, "hash": "62b6960e1a44bcc1eb1a611a8d6235b6b4b78f32e7abc4fb4c6cdcce94895c47860cc51f2b0c28a7b77304bd55fe73af663c02d3f52ea053ba43431ca5bab7bfea2f5e9d7121770d88f70ae9649ea713087d1914f7f312147e247f87eb2d4ffef0ac978bf7b6579d57d533355aa20b8b77b13fd09748728a5cc327a8ec470f4013226f", 
"keyed_hash": "efa53b389ab67c593dba624d898d0f7353ab99e4ac9d42302ee64cbf9939a4193a7258db2d9cd32a7a3ecfce46144114b15c2fcb68a618a976bd74515d47be08b628be420b5e830fade7c080e351a076fbc38641ad80c736c8a18fe3c66ce12f95c61c2462a9770d60d0f77115bbcd3782b593016a4e728d4c06cee4505cb0c08a42ec", "derive_key": "39772aef80e0ebe60596361e45b061e8f417429d529171b6764468c22928e28e9759adeb797a3fbf771b1bcea30150a020e317982bf0d6e7d14dd9f064bc11025c25f31e81bd78a921db0174f03dd481d30e93fd8e90f8b2fee209f849f2d2a52f31719a490fb0ba7aea1e09814ee912eba111a9fde9d5c274185f7bae8ba85d300a2b" }, { "input_len": 102400, "hash": "bc3e3d41a1146b069abffad3c0d44860cf664390afce4d9661f7902e7943e085e01c59dab908c04c3342b816941a26d69c2605ebee5ec5291cc55e15b76146e6745f0601156c3596cb75065a9c57f35585a52e1ac70f69131c23d611ce11ee4ab1ec2c009012d236648e77be9295dd0426f29b764d65de58eb7d01dd42248204f45f8e", "keyed_hash": "1c35d1a5811083fd7119f5d5d1ba027b4d01c0c6c49fb6ff2cf75393ea5db4a7f9dbdd3e1d81dcbca3ba241bb18760f207710b751846faaeb9dff8262710999a59b2aa1aca298a032d94eacfadf1aa192418eb54808db23b56e34213266aa08499a16b354f018fc4967d05f8b9d2ad87a7278337be9693fc638a3bfdbe314574ee6fc4", "derive_key": "4652cff7a3f385a6103b5c260fc1593e13c778dbe608efb092fe7ee69df6e9c6d83a3e041bc3a48df2879f4a0a3ed40e7c961c73eff740f3117a0504c2dff4786d44fb17f1549eb0ba585e40ec29bf7732f0b7e286ff8acddc4cb1e23b87ff5d824a986458dcc6a04ac83969b80637562953df51ed1a7e90a7926924d2763778be8560" } ] } ================================================ FILE: third-party/blake3/tools/compiler_version/Cargo.toml ================================================ [package] name = "compiler_version" version = "0.0.0" edition = "2021" [build-dependencies] cc = "1.0.50" ================================================ FILE: third-party/blake3/tools/compiler_version/build.rs ================================================ fn main() { let build = cc::Build::new(); let compiler = build.get_compiler(); let compiler_path = compiler.path().to_string_lossy(); 
println!("cargo:rustc-env=COMPILER_PATH={}", compiler_path); } ================================================ FILE: third-party/blake3/tools/compiler_version/src/main.rs ================================================ use std::process::Command; fn main() { // Print the rustc version. Command::new(env!("CARGO")) .args(&["rustc", "--quiet", "--", "--version"]) .status() .unwrap(); println!(); // Print the Cargo version. Command::new(env!("CARGO")) .args(&["--version"]) .status() .unwrap(); println!(); // Print the C compiler version. This relies on C compiler detection done // in build.rs, which sets the COMPILER_PATH variable. let compiler_path = env!("COMPILER_PATH"); let mut compiler_command = Command::new(compiler_path); // Use the --version flag on everything other than MSVC. if !cfg!(target_env = "msvc") { compiler_command.arg("--version"); } let _ = compiler_command.status().unwrap(); } ================================================ FILE: third-party/blake3/tools/instruction_set_support/Cargo.toml ================================================ [package] name = "instruction_set_support" version = "0.0.0" edition = "2021" [dependencies] ================================================ FILE: third-party/blake3/tools/instruction_set_support/src/main.rs ================================================ fn main() { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { dbg!(is_x86_feature_detected!("sse2")); dbg!(is_x86_feature_detected!("sse4.1")); dbg!(is_x86_feature_detected!("avx2")); dbg!(is_x86_feature_detected!("avx512f")); dbg!(is_x86_feature_detected!("avx512vl")); } } ================================================ FILE: third-party/blake3/tools/release.md ================================================ # Release checklist - Make sure `cargo outdated -R` is clean in the root and in b3sum/. - Bump the version in the root Cargo.toml. - Bump the version in b3sum/Cargo.toml. - Bump the dependency version too, if new features are used. 
- Delete b3sum/Cargo.lock and recreate it with `cargo build` or similar. - Update the `-h` output in b3sum/README.md if it's changed. - Bump `BLAKE3_VERSION_STRING` in c/blake3.h. - Bump `VERSION` in c/CMakeLists.txt. - Make a version bump commit with change notes. - `git push` and make sure CI is green. - `git tag` the version bump commit with the new version number. - `git push --tags` - `cargo publish` in the root. - `cargo publish` in b3sum/. ================================================ FILE: third-party/mimalloc/.gitattributes ================================================ # default behavior is to always use unix style line endings * text eol=lf *.png binary *.pdn binary *.jpg binary *.sln binary *.suo binary *.vcproj binary *.patch binary *.dll binary *.lib binary *.exe binary ================================================ FILE: third-party/mimalloc/.gitignore ================================================ build ide/vs20??/*.db ide/vs20??/*.opendb ide/vs20??/*.user ide/vs20??/.vs ide/vs20??/VTune* out/ docs/ *.zip *.tar *.gz .vscode .DS_STore ================================================ FILE: third-party/mimalloc/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.18) project(libmimalloc C CXX) set(CMAKE_C_STANDARD 11) set(CMAKE_CXX_STANDARD 17) option(MI_SECURE "Use full security mitigations (like guard pages, allocation randomization, double-free mitigation, and free-list corruption detection)" OFF) option(MI_DEBUG_FULL "Use full internal heap invariant checking in DEBUG mode (expensive)" OFF) option(MI_PADDING "Enable padding to detect heap block overflow (always on in DEBUG or SECURE mode, or with Valgrind/ASAN)" OFF) option(MI_OVERRIDE "Override the standard malloc interface (i.e. 
define entry points for 'malloc', 'free', etc)" ON) option(MI_XMALLOC "Enable abort() call on memory allocation failure by default" OFF) option(MI_SHOW_ERRORS "Show error and warning messages by default (only enabled by default in DEBUG mode)" OFF) option(MI_TRACK_VALGRIND "Compile with Valgrind support (adds a small overhead)" OFF) option(MI_TRACK_ASAN "Compile with address sanitizer support (adds a small overhead)" OFF) option(MI_TRACK_ETW "Compile with Windows event tracing (ETW) support (adds a small overhead)" OFF) option(MI_USE_CXX "Use the C++ compiler to compile the library (instead of the C compiler)" OFF) option(MI_OPT_ARCH "Only for optimized builds: turn on architecture specific optimizations (for arm64: '-march=armv8.1-a' (2016))" OFF) option(MI_SEE_ASM "Generate assembly files" OFF) option(MI_OSX_INTERPOSE "Use interpose to override standard malloc on macOS" ON) option(MI_OSX_ZONE "Use malloc zone to override standard malloc on macOS" ON) option(MI_WIN_REDIRECT "Use redirection module ('mimalloc-redirect') on Windows if compiling mimalloc as a DLL" ON) option(MI_WIN_USE_FIXED_TLS "Use a fixed TLS slot on Windows to avoid extra tests in the malloc fast path" OFF) option(MI_LOCAL_DYNAMIC_TLS "Use local-dynamic-tls, a slightly slower but dlopen-compatible thread local storage mechanism (Unix)" OFF) option(MI_LIBC_MUSL "Set this when linking with musl libc" OFF) option(MI_BUILD_SHARED "Build shared library" ON) option(MI_BUILD_STATIC "Build static library" ON) option(MI_BUILD_OBJECT "Build object library" ON) option(MI_BUILD_TESTS "Build test executables" ON) option(MI_DEBUG_TSAN "Build with thread sanitizer (needs clang)" OFF) option(MI_DEBUG_UBSAN "Build with undefined-behavior sanitizer (needs clang++)" OFF) option(MI_GUARDED "Build with guard pages behind certain object allocations (implies MI_NO_PADDING=ON)" OFF) option(MI_SKIP_COLLECT_ON_EXIT "Skip collecting memory on program exit" OFF) option(MI_NO_PADDING "Force no use of padding even in DEBUG 
mode etc." OFF) option(MI_INSTALL_TOPLEVEL "Install directly into $CMAKE_INSTALL_PREFIX instead of PREFIX/lib/mimalloc-version" OFF) option(MI_NO_THP "Disable transparent huge pages support on Linux/Android for the mimalloc process only" OFF) option(MI_EXTRA_CPPDEFS "Extra pre-processor definitions (use as `-DMI_EXTRA_CPPDEFS=\"opt1=val1;opt2=val2\"`)" "") # negated options for vcpkg features option(MI_NO_USE_CXX "Use plain C compilation (has priority over MI_USE_CXX)" OFF) option(MI_NO_OPT_ARCH "Do not use architecture specific optimizations (like '-march=armv8.1-a' for example) (has priority over MI_OPT_ARCH)" OFF) # deprecated options option(MI_WIN_USE_FLS "Use Fiber local storage on Windows to detect thread termination (deprecated)" OFF) option(MI_CHECK_FULL "Use full internal invariant checking in DEBUG mode (deprecated, use MI_DEBUG_FULL instead)" OFF) option(MI_USE_LIBATOMIC "Explicitly link with -latomic (on older systems) (deprecated and detected automatically)" OFF) include(CheckLinkerFlag) # requires cmake 3.18 include(CheckIncludeFiles) include(GNUInstallDirs) include("cmake/mimalloc-config-version.cmake") set(mi_sources src/alloc.c src/alloc-aligned.c src/alloc-posix.c src/arena.c src/bitmap.c src/heap.c src/init.c src/libc.c src/options.c src/os.c src/page.c src/random.c src/segment.c src/segment-map.c src/stats.c src/prim/prim.c) set(mi_cflags "") set(mi_cflags_static "") # extra flags for a static library build set(mi_cflags_dynamic "") # extra flags for a shared-object library build set(mi_libraries "") if(MI_EXTRA_CPPDEFS) set(mi_defines ${MI_EXTRA_CPPDEFS}) else() set(mi_defines "") endif() # pass git revision as a define if(EXISTS "${CMAKE_SOURCE_DIR}/.git/index") find_package(Git) if(GIT_FOUND) execute_process(COMMAND ${GIT_EXECUTABLE} "describe" OUTPUT_VARIABLE mi_git_describe RESULT_VARIABLE mi_git_res ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) if(mi_git_res EQUAL "0") list(APPEND mi_defines "MI_GIT_DESCRIBE=${mi_git_describe}") # add to 
dependencies so we rebuild if the git head commit changes set_property(GLOBAL APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${CMAKE_SOURCE_DIR}/.git/index") endif() endif() endif() # ----------------------------------------------------------------------------- # Convenience: set default build type and compiler depending on the build directory # ----------------------------------------------------------------------------- message(STATUS "") if (NOT CMAKE_BUILD_TYPE) if ("${CMAKE_BINARY_DIR}" MATCHES ".*((D|d)ebug|asan|tsan|ubsan|valgrind)$" OR MI_DEBUG_FULL) message(STATUS "No build type selected, default to 'Debug'") set(CMAKE_BUILD_TYPE "Debug") else() message(STATUS "No build type selected, default to 'Release'") set(CMAKE_BUILD_TYPE "Release") endif() endif() if (CMAKE_GENERATOR MATCHES "^Visual Studio.*$") message(STATUS "Note: when building with Visual Studio the build type is specified when building.") message(STATUS "For example: 'cmake --build . --config=Release") endif() if("${CMAKE_BINARY_DIR}" MATCHES ".*(S|s)ecure$") message(STATUS "Default to secure build") set(MI_SECURE "ON") endif() # Determine architecture set(MI_OPT_ARCH_FLAGS "") set(MI_ARCH "unknown") if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86|i[3456]86)$" OR CMAKE_GENERATOR_PLATFORM MATCHES "^(x86|Win32)$") set(MI_ARCH "x86") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|x64|amd64|AMD64)$" OR CMAKE_GENERATOR_PLATFORM STREQUAL "x64" OR "x86_64" IN_LIST CMAKE_OSX_ARCHITECTURES) # must be before arm64 set(MI_ARCH "x64") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|armv[89].?|ARM64)$" OR CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64" OR "arm64" IN_LIST CMAKE_OSX_ARCHITECTURES) set(MI_ARCH "arm64") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|armv[34567]|ARM)$") set(MI_ARCH "arm32") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv|riscv32|riscv64)$") if(CMAKE_SIZEOF_VOID_P EQUAL 4) # if() has no '==' operator, the pointer-size check needs EQUAL set(MI_ARCH "riscv32") else() set(MI_ARCH "riscv64") endif() else() set(MI_ARCH ${CMAKE_SYSTEM_PROCESSOR}) endif()
message(STATUS "Architecture: ${MI_ARCH}") # (${CMAKE_SYSTEM_PROCESSOR}, ${CMAKE_GENERATOR_PLATFORM}, ${CMAKE_GENERATOR})") # negative overrides (mainly to support vcpkg features) if(MI_NO_USE_CXX) set(MI_USE_CXX "OFF") endif() if(MI_NO_OPT_ARCH) set(MI_OPT_ARCH "OFF") elseif(MI_ARCH STREQUAL "arm64") set(MI_OPT_ARCH "ON") # enable armv8.1-a by default on arm64 unless MI_NO_OPT_ARCH is set endif() # ----------------------------------------------------------------------------- # Process options # ----------------------------------------------------------------------------- if(CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "MSVC") set(MI_CLANG_CL "ON") endif() # put -Wall early so other warnings can be disabled selectively if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang") if (MI_CLANG_CL) list(APPEND mi_cflags -W) else() list(APPEND mi_cflags -Wall -Wextra -Wpedantic) endif() endif() if(CMAKE_C_COMPILER_ID MATCHES "GNU") list(APPEND mi_cflags -Wall -Wextra) endif() if(CMAKE_C_COMPILER_ID MATCHES "Intel") list(APPEND mi_cflags -Wall) endif() if(CMAKE_C_COMPILER_ID MATCHES "MSVC|Intel") set(MI_USE_CXX "ON") endif() if(MI_OVERRIDE) message(STATUS "Override standard malloc (MI_OVERRIDE=ON)") if(APPLE) if(MI_OSX_ZONE) # use zone's on macOS message(STATUS " Use malloc zone to override malloc (MI_OSX_ZONE=ON)") list(APPEND mi_sources src/prim/osx/alloc-override-zone.c) list(APPEND mi_defines MI_OSX_ZONE=1) if (NOT MI_OSX_INTERPOSE) message(STATUS " WARNING: zone overriding usually also needs interpose (use -DMI_OSX_INTERPOSE=ON)") endif() endif() if(MI_OSX_INTERPOSE) # use interpose on macOS message(STATUS " Use interpose to override malloc (MI_OSX_INTERPOSE=ON)") list(APPEND mi_defines MI_OSX_INTERPOSE=1) if (NOT MI_OSX_ZONE) message(STATUS " WARNING: interpose usually also needs zone overriding (use -DMI_OSX_ZONE=ON)") endif() endif() if(MI_USE_CXX AND MI_OSX_INTERPOSE) message(STATUS " WARNING: if dynamically overriding
malloc/free, it is more reliable to build mimalloc as C code (use -DMI_USE_CXX=OFF)") endif() endif() endif() if(WIN32) if (NOT MI_WIN_REDIRECT) # use a negative define for backward compatibility list(APPEND mi_defines MI_WIN_NOREDIRECT=1) endif() endif() if(MI_SECURE) message(STATUS "Set full secure build (MI_SECURE=ON)") list(APPEND mi_defines MI_SECURE=4) endif() if(MI_TRACK_VALGRIND) CHECK_INCLUDE_FILES("valgrind/valgrind.h;valgrind/memcheck.h" MI_HAS_VALGRINDH) if (NOT MI_HAS_VALGRINDH) set(MI_TRACK_VALGRIND OFF) message(WARNING "Cannot find the 'valgrind/valgrind.h' and 'valgrind/memcheck.h' -- install valgrind first?") message(STATUS "Disabling Valgrind support (MI_TRACK_VALGRIND=OFF)") else() message(STATUS "Compile with Valgrind support (MI_TRACK_VALGRIND=ON)") list(APPEND mi_defines MI_TRACK_VALGRIND=1) endif() endif() if(MI_TRACK_ASAN) if (APPLE AND MI_OVERRIDE) set(MI_TRACK_ASAN OFF) message(WARNING "Cannot enable address sanitizer support on macOS if MI_OVERRIDE is ON (MI_TRACK_ASAN=OFF)") endif() if (MI_TRACK_VALGRIND) set(MI_TRACK_ASAN OFF) message(WARNING "Cannot enable address sanitizer support with also Valgrind support enabled (MI_TRACK_ASAN=OFF)") endif() if(MI_TRACK_ASAN) CHECK_INCLUDE_FILES("sanitizer/asan_interface.h" MI_HAS_ASANH) if (NOT MI_HAS_ASANH) set(MI_TRACK_ASAN OFF) message(WARNING "Cannot find the 'sanitizer/asan_interface.h' -- install address sanitizer support first") message(STATUS "Compile **without** address sanitizer support (MI_TRACK_ASAN=OFF)") else() message(STATUS "Compile with address sanitizer support (MI_TRACK_ASAN=ON)") list(APPEND mi_defines MI_TRACK_ASAN=1) list(APPEND mi_cflags -fsanitize=address) list(APPEND mi_libraries -fsanitize=address) endif() endif() endif() if(MI_TRACK_ETW) if(NOT WIN32) set(MI_TRACK_ETW OFF) message(WARNING "Can only enable ETW support on Windows (MI_TRACK_ETW=OFF)") endif() if (MI_TRACK_VALGRIND OR MI_TRACK_ASAN) set(MI_TRACK_ETW OFF) message(WARNING "Cannot enable ETW support with also 
Valgrind or ASAN support enabled (MI_TRACK_ETW=OFF)") endif() if(MI_TRACK_ETW) message(STATUS "Compile with Windows event tracing support (MI_TRACK_ETW=ON)") list(APPEND mi_defines MI_TRACK_ETW=1) endif() endif() if(MI_GUARDED) message(STATUS "Compile guard pages behind certain object allocations (MI_GUARDED=ON)") list(APPEND mi_defines MI_GUARDED=1) if(NOT MI_NO_PADDING) message(STATUS " Disabling padding due to guard pages (MI_NO_PADDING=ON)") set(MI_NO_PADDING ON) endif() endif() if(MI_SEE_ASM) message(STATUS "Generate assembly listings (MI_SEE_ASM=ON)") list(APPEND mi_cflags -save-temps) if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang") message(STATUS "No GNU Line marker") list(APPEND mi_cflags -Wno-gnu-line-marker) endif() endif() if(MI_CHECK_FULL) message(STATUS "The MI_CHECK_FULL option is deprecated, use MI_DEBUG_FULL instead") set(MI_DEBUG_FULL "ON") endif() if (MI_SKIP_COLLECT_ON_EXIT) message(STATUS "Skip collecting memory on program exit (MI_SKIP_COLLECT_ON_EXIT=ON)") list(APPEND mi_defines MI_SKIP_COLLECT_ON_EXIT=1) endif() if(MI_DEBUG_FULL) message(STATUS "Set debug level to full internal invariant checking (MI_DEBUG_FULL=ON)") list(APPEND mi_defines MI_DEBUG=3) # full invariant checking endif() if(MI_NO_PADDING) message(STATUS "Suppress any padding of heap blocks (MI_NO_PADDING=ON)") list(APPEND mi_defines MI_PADDING=0) else() if(MI_PADDING) message(STATUS "Enable explicit padding of heap blocks (MI_PADDING=ON)") list(APPEND mi_defines MI_PADDING=1) endif() endif() if(MI_XMALLOC) message(STATUS "Enable abort() calls on memory allocation failure (MI_XMALLOC=ON)") list(APPEND mi_defines MI_XMALLOC=1) endif() if(MI_SHOW_ERRORS) message(STATUS "Enable printing of error and warning messages by default (MI_SHOW_ERRORS=ON)") list(APPEND mi_defines MI_SHOW_ERRORS=1) endif() if(MI_DEBUG_TSAN) if(CMAKE_C_COMPILER_ID MATCHES "Clang") message(STATUS "Build with thread sanitizer (MI_DEBUG_TSAN=ON)") list(APPEND mi_defines MI_TSAN=1) list(APPEND mi_cflags 
-fsanitize=thread -g -O1) list(APPEND mi_libraries -fsanitize=thread) else() message(WARNING "Can only use thread sanitizer with clang (MI_DEBUG_TSAN=ON but ignored)") endif() endif() if(MI_DEBUG_UBSAN) if(CMAKE_BUILD_TYPE MATCHES "Debug") if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") message(STATUS "Build with undefined-behavior sanitizer (MI_DEBUG_UBSAN=ON)") list(APPEND mi_defines MI_UBSAN=1) list(APPEND mi_cflags -fsanitize=undefined -g -fno-sanitize-recover=undefined) list(APPEND mi_libraries -fsanitize=undefined) if (NOT MI_USE_CXX) message(STATUS "(switch to use C++ due to MI_DEBUG_UBSAN)") set(MI_USE_CXX "ON") endif() else() message(WARNING "Can only use undefined-behavior sanitizer with clang++ (MI_DEBUG_UBSAN=ON but ignored)") endif() else() message(WARNING "Can only use undefined-behavior sanitizer with a debug build (CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})") endif() endif() if(MI_USE_CXX) message(STATUS "Use the C++ compiler to compile (MI_USE_CXX=ON)") set_source_files_properties(${mi_sources} PROPERTIES LANGUAGE CXX ) set_source_files_properties(src/static.c test/test-api.c test/test-api-fill test/test-stress PROPERTIES LANGUAGE CXX ) if(CMAKE_CXX_COMPILER_ID MATCHES "AppleClang|Clang") list(APPEND mi_cflags -Wno-deprecated) endif() if(CMAKE_CXX_COMPILER_ID MATCHES "Intel" AND NOT CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM") list(APPEND mi_cflags -Kc++) endif() endif() if(CMAKE_SYSTEM_NAME MATCHES "Linux|Android") if(MI_NO_THP) message(STATUS "Disable transparent huge pages support (MI_NO_THP=ON)") list(APPEND mi_defines MI_NO_THP=1) endif() endif() if(MI_LIBC_MUSL) message(STATUS "Assume using musl libc (MI_LIBC_MUSL=ON)") list(APPEND mi_defines MI_LIBC_MUSL=1) endif() if(MI_WIN_USE_FLS) message(STATUS "Use the Fiber API to detect thread termination (deprecated) (MI_WIN_USE_FLS=ON)") list(APPEND mi_defines MI_WIN_USE_FLS=1) endif() if(MI_WIN_USE_FIXED_TLS) message(STATUS "Use fixed TLS slot on Windows to avoid extra tests in the malloc fast path 
(MI_WIN_USE_FIXED_TLS=ON)") list(APPEND mi_defines MI_WIN_USE_FIXED_TLS=1) endif() # Check /proc/cpuinfo for an SV39 MMU and limit the virtual address bits. # (this will skip the aligned hinting in that case. Issue #939, #949) if (EXISTS /proc/cpuinfo) file(STRINGS /proc/cpuinfo mi_sv39_mmu REGEX "^mmu[ \t]+:[ \t]+sv39$") if (mi_sv39_mmu) MESSAGE( STATUS "Set virtual address bits to 39 (SV39 MMU detected)" ) list(APPEND mi_defines MI_DEFAULT_VIRTUAL_ADDRESS_BITS=39) endif() endif() # On Haiku use `-DCMAKE_INSTALL_PREFIX` instead, issue #788 # if(CMAKE_SYSTEM_NAME MATCHES "Haiku") # SET(CMAKE_INSTALL_LIBDIR ~/config/non-packaged/lib) # SET(CMAKE_INSTALL_INCLUDEDIR ~/config/non-packaged/headers) # endif() # Compiler flags if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU" AND NOT MI_CLANG_CL) list(APPEND mi_cflags -Wno-unknown-pragmas -fvisibility=hidden) if(NOT MI_USE_CXX) list(APPEND mi_cflags -Wstrict-prototypes) endif() if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang") list(APPEND mi_cflags -Wno-static-in-inline) endif() endif() if(CMAKE_C_COMPILER_ID MATCHES "Intel") list(APPEND mi_cflags -fvisibility=hidden) endif() if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM_NAME MATCHES "Haiku" AND NOT MI_CLANG_CL) if(MI_LOCAL_DYNAMIC_TLS) list(APPEND mi_cflags -ftls-model=local-dynamic) else() if(MI_LIBC_MUSL) # with musl we use local-dynamic for the static build, see issue #644 list(APPEND mi_cflags_static -ftls-model=local-dynamic) list(APPEND mi_cflags_dynamic -ftls-model=initial-exec) message(STATUS "Use local dynamic TLS for the static build (since MI_LIBC_MUSL=ON)") else() list(APPEND mi_cflags -ftls-model=initial-exec) endif() endif() if(MI_OVERRIDE) list(APPEND mi_cflags -fno-builtin-malloc) endif() endif() if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM_NAME MATCHES "Haiku") if(MI_OPT_ARCH) if(APPLE AND CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_OSX_ARCHITECTURES) # to support 
multi-arch binaries (#999) if("arm64" IN_LIST CMAKE_OSX_ARCHITECTURES) list(APPEND MI_OPT_ARCH_FLAGS "-Xarch_arm64;-march=armv8.1-a") endif() elseif(MI_ARCH STREQUAL "arm64") set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a") # fast atomics endif() endif() endif() if (MSVC AND MSVC_VERSION GREATER_EQUAL 1914) list(APPEND mi_cflags /Zc:__cplusplus) if(MI_OPT_ARCH AND NOT MI_CLANG_CL) if(MI_ARCH STREQUAL "arm64") set(MI_OPT_ARCH_FLAGS "/arch:armv8.1") # fast atomics endif() endif() endif() if(MINGW) add_definitions(-D_WIN32_WINNT=0x600) # issue #976 endif() if(MI_OPT_ARCH_FLAGS) list(APPEND mi_cflags ${MI_OPT_ARCH_FLAGS}) message(STATUS "Architecture specific optimization is enabled (with ${MI_OPT_ARCH_FLAGS}) (MI_OPT_ARCH=ON)") endif() # extra needed libraries # we prefer -l test over `find_library` as sometimes core libraries # like `libatomic` are not on the system path (see issue #898) function(find_link_library libname outlibname) check_linker_flag(C "-l${libname}" mi_has_lib${libname}) if (mi_has_lib${libname}) message(VERBOSE "link library: -l${libname}") set(${outlibname} ${libname} PARENT_SCOPE) else() # search for the requested library name, keeping a separate result variable per library because find_library() reuses a cached result find_library(MI_LIBPATH_${libname} ${libname}) if (MI_LIBPATH_${libname}) message(VERBOSE "link library ${libname} at ${MI_LIBPATH_${libname}}") set(${outlibname} ${MI_LIBPATH_${libname}} PARENT_SCOPE) else() message(VERBOSE "link library not found: ${libname}") set(${outlibname} "" PARENT_SCOPE) endif() endif() endfunction() if(WIN32) list(APPEND mi_libraries psapi shell32 user32 advapi32 bcrypt) else() find_link_library("pthread" MI_LIB_PTHREAD) if(MI_LIB_PTHREAD) list(APPEND mi_libraries "${MI_LIB_PTHREAD}") endif() find_link_library("rt" MI_LIB_RT) if(MI_LIB_RT) list(APPEND mi_libraries "${MI_LIB_RT}") endif() find_link_library("atomic" MI_LIB_ATOMIC) if(MI_LIB_ATOMIC) list(APPEND mi_libraries "${MI_LIB_ATOMIC}") endif() endif() # ----------------------------------------------------------------------------- # Install and output names #
----------------------------------------------------------------------------- # dynamic/shared library and symlinks always go to /usr/local/lib equivalent # we use ${CMAKE_INSTALL_BINDIR} and ${CMAKE_INSTALL_LIBDIR}. # static libraries and object files, includes, and cmake config files # are either installed at top level, or use versioned directories for side-by-side installation (default) if (MI_INSTALL_TOPLEVEL) set(mi_install_objdir "${CMAKE_INSTALL_LIBDIR}") set(mi_install_incdir "${CMAKE_INSTALL_INCLUDEDIR}") set(mi_install_cmakedir "${CMAKE_INSTALL_LIBDIR}/cmake/mimalloc") else() set(mi_install_objdir "${CMAKE_INSTALL_LIBDIR}/mimalloc-${mi_version}") # for static library and object files set(mi_install_incdir "${CMAKE_INSTALL_INCLUDEDIR}/mimalloc-${mi_version}") # for includes set(mi_install_cmakedir "${CMAKE_INSTALL_LIBDIR}/cmake/mimalloc-${mi_version}") # for cmake package info endif() set(mi_libname "mimalloc") if(MI_SECURE) set(mi_libname "${mi_libname}-secure") endif() if(MI_TRACK_VALGRIND) set(mi_libname "${mi_libname}-valgrind") endif() if(MI_TRACK_ASAN) set(mi_libname "${mi_libname}-asan") endif() string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_LC) list(APPEND mi_defines "MI_CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE_LC}") #todo: multi-config project needs $ ? if(NOT(CMAKE_BUILD_TYPE_LC MATCHES "^(release|relwithdebinfo|minsizerel|none)$")) set(mi_libname "${mi_libname}-${CMAKE_BUILD_TYPE_LC}") #append build type (e.g. 
-debug) if not a release version endif() if(MI_BUILD_SHARED) list(APPEND mi_build_targets "shared") endif() if(MI_BUILD_STATIC) list(APPEND mi_build_targets "static") endif() if(MI_BUILD_OBJECT) list(APPEND mi_build_targets "object") endif() if(MI_BUILD_TESTS) list(APPEND mi_build_targets "tests") endif() message(STATUS "") message(STATUS "Library name : ${mi_libname}") message(STATUS "Version : ${mi_version}.${mi_version_patch}") message(STATUS "Build type : ${CMAKE_BUILD_TYPE_LC}") if(MI_USE_CXX) message(STATUS "C++ Compiler : ${CMAKE_CXX_COMPILER}") else() message(STATUS "C Compiler : ${CMAKE_C_COMPILER}") endif() message(STATUS "Compiler flags : ${mi_cflags}") message(STATUS "Compiler defines : ${mi_defines}") message(STATUS "Link libraries : ${mi_libraries}") message(STATUS "Build targets : ${mi_build_targets}") message(STATUS "") # ----------------------------------------------------------------------------- # Main targets # ----------------------------------------------------------------------------- # shared library if(MI_BUILD_SHARED) add_library(mimalloc SHARED ${mi_sources}) set_target_properties(mimalloc PROPERTIES VERSION ${mi_version} SOVERSION ${mi_version_major} OUTPUT_NAME ${mi_libname} ) target_compile_definitions(mimalloc PRIVATE ${mi_defines} MI_SHARED_LIB MI_SHARED_LIB_EXPORT) target_compile_options(mimalloc PRIVATE ${mi_cflags} ${mi_cflags_dynamic}) target_link_libraries(mimalloc PRIVATE ${mi_libraries}) target_include_directories(mimalloc PUBLIC $ $ ) install(TARGETS mimalloc EXPORT mimalloc ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}) install(EXPORT mimalloc DESTINATION ${mi_install_cmakedir}) if(WIN32) # On windows, the import library name for the dll would clash with the static mimalloc.lib library # so we postfix the dll import library with `.dll.lib` (and also the .pdb debug file) set_property(TARGET mimalloc PROPERTY ARCHIVE_OUTPUT_NAME 
"${mi_libname}.dll" ) install(FILES "$/${mi_libname}.dll.lib" DESTINATION ${CMAKE_INSTALL_LIBDIR}) set_property(TARGET mimalloc PROPERTY PDB_NAME "${mi_libname}.dll") # don't try to install the pdb since it may not be generated depending on the configuration # install(FILES "$/${mi_libname}.dll.pdb" DESTINATION ${CMAKE_INSTALL_LIBDIR}) endif() if(WIN32 AND MI_WIN_REDIRECT) # On windows, link and copy the mimalloc redirection dll too. if(CMAKE_GENERATOR_PLATFORM STREQUAL "arm64ec") set(MIMALLOC_REDIRECT_SUFFIX "-arm64ec") elseif(MI_ARCH STREQUAL "x64") set(MIMALLOC_REDIRECT_SUFFIX "") if(CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64") message(STATUS "Note: x64 code emulated on Windows for arm64 should use an arm64ec build of 'mimalloc.dll'") message(STATUS " together with 'mimalloc-redirect-arm64ec.dll'. See the 'bin\\readme.md' for more information.") endif() elseif(MI_ARCH STREQUAL "x86") set(MIMALLOC_REDIRECT_SUFFIX "32") else() set(MIMALLOC_REDIRECT_SUFFIX "-${MI_ARCH}") # -arm64 etc. endif() target_link_libraries(mimalloc PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/bin/mimalloc-redirect${MIMALLOC_REDIRECT_SUFFIX}.lib) # the DLL import library add_custom_command(TARGET mimalloc POST_BUILD COMMAND "${CMAKE_COMMAND}" -E copy "${CMAKE_CURRENT_SOURCE_DIR}/bin/mimalloc-redirect${MIMALLOC_REDIRECT_SUFFIX}.dll" $ COMMENT "Copy mimalloc-redirect${MIMALLOC_REDIRECT_SUFFIX}.dll to output directory") install(FILES "$/mimalloc-redirect${MIMALLOC_REDIRECT_SUFFIX}.dll" DESTINATION ${CMAKE_INSTALL_BINDIR}) endif() endif() # static library if (MI_BUILD_STATIC) add_library(mimalloc-static STATIC ${mi_sources}) set_property(TARGET mimalloc-static PROPERTY OUTPUT_NAME ${mi_libname}) set_property(TARGET mimalloc-static PROPERTY POSITION_INDEPENDENT_CODE ON) target_compile_definitions(mimalloc-static PRIVATE ${mi_defines} MI_STATIC_LIB) target_compile_options(mimalloc-static PRIVATE ${mi_cflags} ${mi_cflags_static}) target_link_libraries(mimalloc-static PRIVATE ${mi_libraries}) 
target_include_directories(mimalloc-static PUBLIC $ $ ) install(TARGETS mimalloc-static EXPORT mimalloc DESTINATION ${mi_install_objdir} LIBRARY) install(EXPORT mimalloc DESTINATION ${mi_install_cmakedir}) endif() # install include files install(FILES include/mimalloc.h DESTINATION ${mi_install_incdir}) install(FILES include/mimalloc-override.h DESTINATION ${mi_install_incdir}) install(FILES include/mimalloc-new-delete.h DESTINATION ${mi_install_incdir}) install(FILES include/mimalloc-stats.h DESTINATION ${mi_install_incdir}) install(FILES cmake/mimalloc-config.cmake DESTINATION ${mi_install_cmakedir}) install(FILES cmake/mimalloc-config-version.cmake DESTINATION ${mi_install_cmakedir}) # single object file for more predictable static overriding if (MI_BUILD_OBJECT) add_library(mimalloc-obj OBJECT src/static.c) set_property(TARGET mimalloc-obj PROPERTY POSITION_INDEPENDENT_CODE ON) target_compile_definitions(mimalloc-obj PRIVATE ${mi_defines}) target_compile_options(mimalloc-obj PRIVATE ${mi_cflags} ${mi_cflags_static}) target_include_directories(mimalloc-obj PUBLIC $ $ ) # Copy the generated object file (`static.o`) to the output directory (as `mimalloc.o`) if(CMAKE_GENERATOR MATCHES "^Visual Studio.*$") set(mimalloc-obj-static "${CMAKE_CURRENT_BINARY_DIR}/mimalloc-obj.dir/$/static${CMAKE_C_OUTPUT_EXTENSION}") else() set(mimalloc-obj-static "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/mimalloc-obj.dir/src/static.c${CMAKE_C_OUTPUT_EXTENSION}") endif() set(mimalloc-obj-out "${CMAKE_CURRENT_BINARY_DIR}/${mi_libname}${CMAKE_C_OUTPUT_EXTENSION}") add_custom_command(OUTPUT ${mimalloc-obj-out} DEPENDS mimalloc-obj COMMAND "${CMAKE_COMMAND}" -E copy "${mimalloc-obj-static}" "${mimalloc-obj-out}") add_custom_target(mimalloc-obj-target ALL DEPENDS ${mimalloc-obj-out}) # the following seems to lead to cmake warnings/errors on some systems, disable for now :-( # install(TARGETS mimalloc-obj EXPORT mimalloc DESTINATION ${mi_install_objdir}) # the FILES expression can also be: $ # 
but that fails cmake versions less than 3.10 so we leave it as is for now install(FILES ${mimalloc-obj-static} DESTINATION ${mi_install_objdir} RENAME ${mi_libname}${CMAKE_C_OUTPUT_EXTENSION} ) endif() # pkg-config file support set(mi_pc_libraries "") foreach(item IN LISTS mi_libraries) if(item MATCHES " *[-].*") set(mi_pc_libraries "${mi_pc_libraries} ${item}") else() set(mi_pc_libraries "${mi_pc_libraries} -l${item}") endif() endforeach() include("cmake/JoinPaths.cmake") join_paths(mi_pc_includedir "\${prefix}" "${CMAKE_INSTALL_INCLUDEDIR}") join_paths(mi_pc_libdir "\${prefix}" "${CMAKE_INSTALL_LIBDIR}") configure_file(mimalloc.pc.in mimalloc.pc @ONLY) install(FILES "${CMAKE_CURRENT_BINARY_DIR}/mimalloc.pc" DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") # ----------------------------------------------------------------------------- # API surface testing # ----------------------------------------------------------------------------- if (MI_BUILD_TESTS) enable_testing() # static link tests foreach(TEST_NAME api api-fill stress) add_executable(mimalloc-test-${TEST_NAME} test/test-${TEST_NAME}.c) target_compile_definitions(mimalloc-test-${TEST_NAME} PRIVATE ${mi_defines}) target_compile_options(mimalloc-test-${TEST_NAME} PRIVATE ${mi_cflags}) target_include_directories(mimalloc-test-${TEST_NAME} PRIVATE include) if(MI_BUILD_SHARED AND (MI_TRACK_ASAN OR MI_DEBUG_TSAN OR MI_DEBUG_UBSAN)) target_link_libraries(mimalloc-test-${TEST_NAME} PRIVATE mimalloc ${mi_libraries}) else() target_link_libraries(mimalloc-test-${TEST_NAME} PRIVATE mimalloc-static ${mi_libraries}) endif() add_test(NAME test-${TEST_NAME} COMMAND mimalloc-test-${TEST_NAME}) endforeach() # dynamic override test if(MI_BUILD_SHARED AND NOT (MI_TRACK_ASAN OR MI_DEBUG_TSAN OR MI_DEBUG_UBSAN)) add_executable(mimalloc-test-stress-dynamic test/test-stress.c) target_compile_definitions(mimalloc-test-stress-dynamic PRIVATE ${mi_defines} "USE_STD_MALLOC=1") if(WIN32) 
target_compile_definitions(mimalloc-test-stress-dynamic PRIVATE "MI_LINK_VERSION=1") endif() target_compile_options(mimalloc-test-stress-dynamic PRIVATE ${mi_cflags}) target_include_directories(mimalloc-test-stress-dynamic PRIVATE include) target_link_libraries(mimalloc-test-stress-dynamic PRIVATE mimalloc ${mi_libraries}) # mi_version if(WIN32) add_test(NAME test-stress-dynamic COMMAND ${CMAKE_COMMAND} -E env MIMALLOC_SHOW_STATS=1 $) else() if(APPLE) set(LD_PRELOAD "DYLD_INSERT_LIBRARIES") else() set(LD_PRELOAD "LD_PRELOAD") endif() add_test(NAME test-stress-dynamic COMMAND ${CMAKE_COMMAND} -E env MIMALLOC_SHOW_STATS=1 ${LD_PRELOAD}=$ $) endif() endif() endif() # ----------------------------------------------------------------------------- # Set override properties # ----------------------------------------------------------------------------- if (MI_OVERRIDE) if (MI_BUILD_SHARED) target_compile_definitions(mimalloc PRIVATE MI_MALLOC_OVERRIDE) endif() if(NOT WIN32) # It is only possible to override malloc on Windows when building as a DLL. 
if (MI_BUILD_STATIC) target_compile_definitions(mimalloc-static PRIVATE MI_MALLOC_OVERRIDE) endif() if (MI_BUILD_OBJECT) target_compile_definitions(mimalloc-obj PRIVATE MI_MALLOC_OVERRIDE) endif() endif() endif() ================================================ FILE: third-party/mimalloc/LICENSE ================================================ MIT License Copyright (c) 2018-2025 Microsoft Corporation, Daan Leijen Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: third-party/mimalloc/SECURITY.md ================================================ ## Security Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin). 
If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. ## Reporting Security Issues **Please do not report security vulnerabilities through public GitHub issues.** Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) * Full paths of source file(s) related to the manifestation of the issue * The location of the affected source code (tag/branch/commit or direct URL) * Any special configuration required to reproduce the issue * Step-by-step instructions to reproduce the issue * Proof-of-concept or exploit code (if possible) * Impact of the issue, including how an attacker might exploit the issue This information will help us triage your report more quickly. If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. 
## Preferred Languages We prefer all communications to be in English. ## Policy Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). ================================================ FILE: third-party/mimalloc/azure-pipelines.yml ================================================ # Starter pipeline # Start with a minimal pipeline that you can customize to build and deploy your code. # Add steps that build, run tests, deploy, and more: # https://aka.ms/yaml trigger: branches: include: - master - dev - dev2 - dev3 tags: include: - v* jobs: - job: displayName: Windows 2022 pool: vmImage: windows-2022 strategy: matrix: Debug: BuildType: debug cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON MSBuildConfiguration: Debug Release: BuildType: release cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release MSBuildConfiguration: Release Secure: BuildType: secure cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON MSBuildConfiguration: Release steps: - task: CMake@1 inputs: workingDirectory: $(BuildType) cmakeArgs: .. 
$(cmakeExtraArgs) - task: MSBuild@1 inputs: solution: $(BuildType)/libmimalloc.sln configuration: '$(MSBuildConfiguration)' msbuildArguments: -m - script: ctest --verbose --timeout 240 -C $(MSBuildConfiguration) workingDirectory: $(BuildType) displayName: CTest #- script: $(BuildType)\$(BuildType)\mimalloc-test-stress # displayName: TestStress #- upload: $(Build.SourcesDirectory)/$(BuildType) # artifact: mimalloc-windows-$(BuildType) - job: displayName: Ubuntu 22.04 pool: vmImage: ubuntu-22.04 strategy: matrix: Debug: CC: gcc CXX: g++ BuildType: debug cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON Release: CC: gcc CXX: g++ BuildType: release cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release Secure: CC: gcc CXX: g++ BuildType: secure cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON Debug++: CC: gcc CXX: g++ BuildType: debug-cxx cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -DMI_USE_CXX=ON Debug Clang: CC: clang CXX: clang++ BuildType: debug-clang cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON Release Clang: CC: clang CXX: clang++ BuildType: release-clang cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release Secure Clang: CC: clang CXX: clang++ BuildType: secure-clang cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON Debug++ Clang: CC: clang CXX: clang++ BuildType: debug-clang-cxx cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -DMI_USE_CXX=ON Debug ASAN Clang: CC: clang CXX: clang++ BuildType: debug-asan-clang cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -DMI_TRACK_ASAN=ON Debug UBSAN Clang: CC: clang CXX: clang++ BuildType: debug-ubsan-clang cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -DMI_DEBUG_UBSAN=ON Debug TSAN Clang++: CC: clang CXX: clang++ BuildType: debug-tsan-clang-cxx cmakeExtraArgs: -DCMAKE_BUILD_TYPE=RelWithDebInfo -DMI_USE_CXX=ON -DMI_DEBUG_TSAN=ON Debug Guarded Clang: CC: clang CXX: clang BuildType: debug-guarded-clang cmakeExtraArgs: -DCMAKE_BUILD_TYPE=RelWithDebInfo 
-DMI_DEBUG_FULL=ON -DMI_GUARDED=ON steps: - task: CMake@1 inputs: workingDirectory: $(BuildType) cmakeArgs: .. $(cmakeExtraArgs) - script: make -j$(nproc) -C $(BuildType) displayName: Make - script: ctest --verbose --timeout 240 workingDirectory: $(BuildType) displayName: CTest env: MIMALLOC_GUARDED_SAMPLE_RATE: 1000 # - upload: $(Build.SourcesDirectory)/$(BuildType) # artifact: mimalloc-ubuntu-$(BuildType) - job: displayName: macOS 14 (Sonoma) pool: vmImage: macOS-14 strategy: matrix: Debug: BuildType: debug cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON Release: BuildType: release cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release Secure: BuildType: secure cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON steps: - task: CMake@1 inputs: workingDirectory: $(BuildType) cmakeArgs: .. $(cmakeExtraArgs) - script: make -j$(sysctl -n hw.ncpu) -C $(BuildType) displayName: Make - script: ctest --verbose --timeout 240 workingDirectory: $(BuildType) displayName: CTest # - upload: $(Build.SourcesDirectory)/$(BuildType) # artifact: mimalloc-macos-$(BuildType) # ---------------------------------------------------------- # Other OS versions (just debug mode) # ---------------------------------------------------------- - job: displayName: Windows 2019 pool: vmImage: windows-2019 strategy: matrix: Debug: BuildType: debug cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON MSBuildConfiguration: Debug Release: BuildType: release cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release MSBuildConfiguration: Release steps: - task: CMake@1 inputs: workingDirectory: $(BuildType) cmakeArgs: .. 
$(cmakeExtraArgs) - task: MSBuild@1 inputs: solution: $(BuildType)/libmimalloc.sln configuration: '$(MSBuildConfiguration)' msbuildArguments: -m - script: ctest --verbose --timeout 240 -C $(MSBuildConfiguration) workingDirectory: $(BuildType) displayName: CTest - job: displayName: Ubuntu 24.04 pool: vmImage: ubuntu-24.04 strategy: matrix: Debug: CC: gcc CXX: g++ BuildType: debug cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON Debug++: CC: gcc CXX: g++ BuildType: debug-cxx cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -DMI_USE_CXX=ON Debug Clang: CC: clang CXX: clang++ BuildType: debug-clang cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON Debug++ Clang: CC: clang CXX: clang++ BuildType: debug-clang-cxx cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -DMI_USE_CXX=ON Release Clang: CC: clang CXX: clang++ BuildType: release-clang cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release steps: - task: CMake@1 inputs: workingDirectory: $(BuildType) cmakeArgs: .. $(cmakeExtraArgs) - script: make -j$(nproc) -C $(BuildType) displayName: Make - script: ctest --verbose --timeout 240 workingDirectory: $(BuildType) displayName: CTest - job: displayName: macOS 15 (Sequoia) pool: vmImage: macOS-15 strategy: matrix: Debug: BuildType: debug cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON Release: BuildType: release cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release steps: - task: CMake@1 inputs: workingDirectory: $(BuildType) cmakeArgs: .. 
$(cmakeExtraArgs) - script: make -j$(sysctl -n hw.ncpu) -C $(BuildType) displayName: Make - script: ctest --verbose --timeout 240 workingDirectory: $(BuildType) displayName: CTest ================================================ FILE: third-party/mimalloc/bin/readme.md ================================================ # Windows Override We use a separate redirection DLL to override mimalloc on Windows such that we redirect all malloc/free calls that go through the (dynamic) C runtime allocator, including those from other DLL's or libraries. As it intercepts all allocation calls on a low level, it can be used on large programs that include other 3rd party components. There are four requirements to make the overriding work well: 1. Use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch). 2. Link your program explicitly with the `mimalloc.dll.lib` export library for the `mimalloc.dll` -- which contains all mimalloc functionality. To ensure the `mimalloc.dll` is actually loaded at run-time it is easiest to insert some call to the mimalloc API in the `main` function, like `mi_version()` (or use the `/include:mi_version` switch on the linker, or similarly, `#pragma comment(linker, "/include:mi_version")` in some source file). See the `mimalloc-test-override` project for an example on how to use this. 3. The `mimalloc-redirect.dll` must be put in the same folder as the main `mimalloc.dll` at runtime (as it is a dependency of that DLL). The redirection DLL ensures that all calls to the C runtime malloc API get redirected to mimalloc functions (which reside in `mimalloc.dll`). 4. Ensure the `mimalloc.dll` comes as early as possible in the import list of the final executable (so it can intercept all potential allocations). You can use `minject -l ` to check this if needed. 
```csharp ┌──────────────┐ │ Your Program │ └────┬─────────┘ │ │ mi_version() ┌───────────────┐ ┌───────────────────────┐ ├──────────────►│ mimalloc.dll ├────►│ mimalloc-redirect.dll │ │ └──────┬────────┘ └───────────────────────┘ │ ▼ │ malloc() etc. ┌──────────────┐ ├──────────────►│ ucrtbase.dll │ │ └──────────────┘ │ │ └──────────────► ... ``` For best performance on Windows with C++, it is also recommended to override the `new`/`delete` operations (by including [`mimalloc-new-delete.h`](../include/mimalloc-new-delete.h) in a single(!) source file in your project). The environment variable `MIMALLOC_DISABLE_REDIRECT=1` can be used to disable dynamic overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc was successfully redirected. ### Other Platforms You always link with `mimalloc.dll` but for different platforms you may need a specific redirection DLL: - __x64__: `mimalloc-redirect.dll`. - __x86__: `mimalloc-redirect32.dll`. Use for older 32-bit Windows programs. - __arm64__: `mimalloc-redirect-arm64.dll`. Use for native Windows arm64 programs. - __arm64ec__: `mimalloc-redirect-arm64ec.dll`. The [arm64ec] ABI is "emulation compatible" mode on Windows arm64. Unfortunately we cannot run x64 code emulated on Windows arm64 with the x64 mimalloc override directly (since the C runtime always uses `arm64ec`). Instead: 1. Build the program as normal for x64 and link as normal with the x64 `mimalloc.lib` export library. 2. Now separately build `mimalloc.dll` in `arm64ec` mode and _overwrite_ your previous (x64) `mimalloc.dll` -- the loader can handle the mix of arm64ec and x64 code. Now use `mimalloc-redirect-arm64ec.dll` to match your new arm64ec `mimalloc.dll`. The main program stays as is and can be fully x64 or contain more arm64ec modules. At runtime, the arm64ec `mimalloc.dll` will run with native arm64 instructions while the rest of the program runs emulated x64.
[arm64ec]: https://learn.microsoft.com/en-us/windows/arm/arm64ec ### Minject We cannot always re-link an executable with `mimalloc.dll`, and similarly, we cannot always ensure that the DLL comes first in the import table of the final executable. In many cases though we can patch existing executables without any recompilation if they are linked with the dynamic C runtime (`ucrtbase.dll`) -- just put the `mimalloc.dll` into the import table (and put `mimalloc-redirect.dll` in the same directory). Such patching can be done for example with [CFF Explorer](https://ntcore.com/?page_id=388). The `minject` program can also do this from the command line. Use `minject --help` for options: ``` > minject --help minject: Injects the mimalloc dll into the import table of a 64-bit executable, and/or ensures that it comes first in het import table. usage: > minject [options] options: -h --help show this help -v --verbose be verbose -l --list only list imported modules -i --inplace update the exe in-place (make sure there is a backup!) -f --force always overwrite without prompting --postfix=

use

as a postfix to the mimalloc dll. e.g. use --postfix=debug to link with mimalloc-debug.dll notes: Without '--inplace' an injected is generated with the same name ending in '-mi'. Ensure 'mimalloc-redirect.dll' is in the same folder as the mimalloc dll. examples: > minject --list myprogram.exe > minject --force --inplace myprogram.exe ``` For x86 32-bit binaries, use `minject32`, and for arm64 binaries use `minject-arm64`.
================================================
FILE: third-party/mimalloc/cmake/JoinPaths.cmake
================================================
# This module provides function for joining paths
# known from most languages
#
# SPDX-License-Identifier: (MIT OR CC0-1.0)
# Copyright 2020 Jan Tojnar
# https://github.com/jtojnar/cmake-snips
#
# Modelled after Python’s os.path.join
# https://docs.python.org/3.7/library/os.path.html#os.path.join
# Windows not supported

# join_paths(<joined_path> <first_path_segment> [<segment>...])
#
#   joined_path         name of the variable, set in the caller's scope, that
#                       receives the joined path
#   first_path_segment  the path to start from
#   <segment>...        further segments (ARGN), applied from left to right
#
# Empty segments are ignored. An absolute segment throws away everything
# accumulated so far and becomes the new path; any other segment is appended
# after a "/" separator.
function(join_paths joined_path first_path_segment)
  set(result "${first_path_segment}")
  foreach(segment IN LISTS ARGN)
    # Nothing to append for an empty segment.
    if("${segment}" STREQUAL "")
      continue()
    endif()
    if(IS_ABSOLUTE "${segment}")
      # An absolute segment restarts the path.
      set(result "${segment}")
    else()
      set(result "${result}/${segment}")
    endif()
  endforeach()
  # Hand the result back through the variable named by the caller.
  set(${joined_path} "${result}" PARENT_SCOPE)
endfunction()

================================================ FILE: third-party/mimalloc/cmake/mimalloc-config-version.cmake ================================================ set(mi_version_major 2) set(mi_version_minor 2) set(mi_version_patch 2) set(mi_version ${mi_version_major}.${mi_version_minor}) set(PACKAGE_VERSION ${mi_version}) if(PACKAGE_FIND_VERSION_MAJOR) if("${PACKAGE_FIND_VERSION_MAJOR}" EQUAL "${mi_version_major}") if ("${PACKAGE_FIND_VERSION_MINOR}" EQUAL "${mi_version_minor}") set(PACKAGE_VERSION_EXACT TRUE) elseif("${PACKAGE_FIND_VERSION_MINOR}" LESS "${mi_version_minor}") set(PACKAGE_VERSION_COMPATIBLE TRUE) else() set(PACKAGE_VERSION_UNSUITABLE TRUE) endif() else() set(PACKAGE_VERSION_UNSUITABLE
TRUE) endif() endif()

================================================
FILE: third-party/mimalloc/cmake/mimalloc-config.cmake
================================================
# Package configuration file for mimalloc.
#
# Loads the exported targets from mimalloc.cmake (next to this file) and
# derives the following variables from the location of this file:
#   MIMALLOC_CMAKE_DIR    parent directory of the directory holding this file
#   MIMALLOC_VERSION_DIR  name of the directory holding this file
#   MIMALLOC_LIBRARY_DIR  MIMALLOC_CMAKE_DIR with "/lib/cmake" replaced by "/lib"
#   MIMALLOC_INCLUDE_DIR  include directory (depends on the install layout)
#   MIMALLOC_OBJECT_DIR   object-file directory (depends on the install layout)
#   MIMALLOC_TARGET_DIR   legacy alias of MIMALLOC_LIBRARY_DIR
include(${CMAKE_CURRENT_LIST_DIR}/mimalloc.cmake)

# CMAKE_CURRENT_LIST_DIR is e.g. /usr/local/lib/cmake/mimalloc-2.0;
# PATH yields the directory one level up, NAME yields the last component.
get_filename_component(MIMALLOC_CMAKE_DIR "${CMAKE_CURRENT_LIST_DIR}" PATH)
get_filename_component(MIMALLOC_VERSION_DIR "${CMAKE_CURRENT_LIST_DIR}" NAME)
string(REPLACE "/lib/cmake" "/lib" MIMALLOC_LIBRARY_DIR "${MIMALLOC_CMAKE_DIR}")

# STREQUAL, not EQUAL: EQUAL compares numbers, so it is never true for the
# non-numeric directory name "mimalloc" and the top-level branch below could
# never be taken.
if("${MIMALLOC_VERSION_DIR}" STREQUAL "mimalloc")
  # top level install
  string(REPLACE "/lib/cmake" "/include" MIMALLOC_INCLUDE_DIR "${MIMALLOC_CMAKE_DIR}")
  set(MIMALLOC_OBJECT_DIR "${MIMALLOC_LIBRARY_DIR}")
else()
  # versioned
  string(REPLACE "/lib/cmake/" "/include/" MIMALLOC_INCLUDE_DIR "${CMAKE_CURRENT_LIST_DIR}")
  string(REPLACE "/lib/cmake/" "/lib/" MIMALLOC_OBJECT_DIR "${CMAKE_CURRENT_LIST_DIR}")
endif()
set(MIMALLOC_TARGET_DIR "${MIMALLOC_LIBRARY_DIR}") # legacy

================================================ FILE: third-party/mimalloc/contrib/docker/alpine/Dockerfile ================================================ # alpine image FROM alpine # Install tools RUN apk add build-base make cmake RUN apk add git RUN apk add vim RUN mkdir -p /home/dev WORKDIR /home/dev # Get mimalloc RUN git clone https://github.com/microsoft/mimalloc -b dev2 RUN mkdir -p mimalloc/out/release RUN mkdir -p mimalloc/out/debug # Build mimalloc debug WORKDIR /home/dev/mimalloc/out/debug RUN cmake ../..
-DMI_DEBUG_FULL=ON RUN make -j RUN make test CMD ["/bin/sh"] ================================================ FILE: third-party/mimalloc/contrib/docker/alpine-arm32v7/Dockerfile ================================================ # install from an image # download first an appropriate tar.gz image into the current directory # from: FROM scratch # Substitute the image name that was downloaded ADD alpine-minirootfs-20240329-armv7.tar.gz / # Install tools RUN apk add build-base make cmake RUN apk add git RUN apk add vim RUN mkdir -p /home/dev WORKDIR /home/dev # Get mimalloc RUN git clone https://github.com/microsoft/mimalloc -b dev2 RUN mkdir -p mimalloc/out/release RUN mkdir -p mimalloc/out/debug # Build mimalloc debug WORKDIR /home/dev/mimalloc/out/debug RUN cmake ../.. -DMI_DEBUG_FULL=ON RUN make -j RUN make test CMD ["/bin/sh"] ================================================ FILE: third-party/mimalloc/contrib/docker/manylinux-x64/Dockerfile ================================================ FROM quay.io/pypa/manylinux2014_x86_64 # Install tools RUN yum install -y openssl-devel RUN yum install -y gcc gcc-c++ kernel-devel make RUN yum install -y git cmake RUN yum install -y vim RUN mkdir -p /home/dev WORKDIR /home/dev # Get mimalloc RUN git clone https://github.com/microsoft/mimalloc -b dev2 RUN mkdir -p mimalloc/out/release RUN mkdir -p mimalloc/out/debug # Build mimalloc debug WORKDIR /home/dev/mimalloc/out/debug RUN cmake ../.. -DMI_DEBUG_FULL=ON RUN make -j RUN make test CMD ["/bin/sh"] ================================================ FILE: third-party/mimalloc/contrib/docker/readme.md ================================================ Various example docker files used for testing. Usage: ``` > cd > docker build -t -mimalloc . 
> docker run -it -mimalloc >> make test ```

================================================
FILE: third-party/mimalloc/contrib/vcpkg/portfile.cmake
================================================
# vcpkg port script for mimalloc: fetch the sources at a pinned revision,
# configure them with options derived from the selected vcpkg features and
# library linkage, install, and then adjust the installed package layout.
vcpkg_from_github(
  OUT_SOURCE_PATH SOURCE_PATH
  REPO microsoft/mimalloc
  HEAD_REF master

  # The "REF" can be a commit hash, branch name (dev2), or a version (v2.2.1).
  # REF "v${VERSION}"
  REF e2db21e9ba9fb9172b7b0aa0fe9b8742525e8774

  # The sha512 is the hash of the tar.gz bundle.
  # (To get the sha512, run `vcpkg install mimalloc[override] --overlay-ports=<dir>`
  # with <dir> being the directory of this port, and copy the sha from the error message.)
  SHA512 8cbb601fdf8b46dd6a9c0d314d6da9d4960699853829e96d2470753867f90689fb4caeaf30d628943fd388670dc11902dbecc9cc7c329b99a510524a09bdb612
)

# Pairs of <vcpkg feature name> <CMake option>; the resulting option list is
# stored in FEATURE_OPTIONS and passed to the configure step below.
vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS
  FEATURES
    c           MI_NO_USE_CXX
    guarded     MI_GUARDED
    secure      MI_SECURE
    override    MI_OVERRIDE
    optarch     MI_OPT_ARCH
    optsimd     MI_OPT_SIMD
    xmalloc     MI_XMALLOC
    asm         MI_SEE_ASM
)

# MI_BUILD_STATIC / MI_BUILD_SHARED receive the result of comparing the
# requested vcpkg linkage against "static" / "dynamic".
string(COMPARE EQUAL "${VCPKG_LIBRARY_LINKAGE}" "static" MI_BUILD_STATIC)
string(COMPARE EQUAL "${VCPKG_LIBRARY_LINKAGE}" "dynamic" MI_BUILD_SHARED)

# FEATURE_OPTIONS comes last in OPTIONS, after the fixed -D settings.
vcpkg_cmake_configure(
  SOURCE_PATH "${SOURCE_PATH}"
  OPTIONS
    -DMI_USE_CXX=ON
    -DMI_BUILD_TESTS=OFF
    -DMI_BUILD_OBJECT=ON
    -DMI_BUILD_STATIC=${MI_BUILD_STATIC}
    -DMI_BUILD_SHARED=${MI_BUILD_SHARED}
    -DMI_INSTALL_TOPLEVEL=ON
    ${FEATURE_OPTIONS}
)

vcpkg_cmake_install()
vcpkg_copy_pdbs()

# Ship the find_package wrapper and the usage text with the package.
file(COPY
  "${CMAKE_CURRENT_LIST_DIR}/vcpkg-cmake-wrapper.cmake"
  "${CMAKE_CURRENT_LIST_DIR}/usage"
  DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}"
)
vcpkg_cmake_config_fixup(CONFIG_PATH lib/cmake/mimalloc)

if(VCPKG_LIBRARY_LINKAGE STREQUAL "dynamic")
  # todo: why is this needed?
  # Rewrites the `!defined(MI_SHARED_LIB)` text in the installed header to a
  # constant 0, keeping the original text as a trailing comment.
  vcpkg_replace_string(
    "${CURRENT_PACKAGES_DIR}/include/mimalloc.h"
    "!defined(MI_SHARED_LIB)"
    "0 // !defined(MI_SHARED_LIB)"
  )
endif()

# Remove the debug copies of the include and share directories.
file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include")
file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/share")

vcpkg_fixup_pkgconfig()
vcpkg_install_copyright(FILE_LIST "${SOURCE_PATH}/LICENSE")

================================================ FILE: third-party/mimalloc/contrib/vcpkg/readme.md ================================================ # Vcpkg support This directory is meant to provide the sources for the official [vcpkg port] of mimalloc, but can also be used to override the official port with your own variant. For example, you can edit the [`portfile.cmake`](portfile.cmake) to check out a specific commit, version, or branch of mimalloc, or set further options.
You can install such a custom port as: ```sh $ vcpkg install "mimalloc[override]" --recurse --overlay-ports=./contrib/vcpkg ``` This will also show the correct sha512 hash if you use a custom version. Another way is to refer to the overlay from the [vcpkg-configuration.json](https://learn.microsoft.com/en-us/vcpkg/reference/vcpkg-configuration-json) file. See also the vcpkg [documentation](https://learn.microsoft.com/en-us/vcpkg/produce/update-package-version) for more information. # Using mimalloc from vcpkg When using [cmake with vcpkg](https://learn.microsoft.com/en-us/vcpkg/get_started/get-started?pivots=shell-powershell), you can use mimalloc from the `CMakeLists.txt` as: ```cmake find_package(mimalloc CONFIG REQUIRED) target_link_libraries(main PRIVATE mimalloc) ``` See [`test/CMakeLists.txt`](../../test/CMakeLists.txt) for more examples. # Acknowledgements The original port for vcpkg was contributed by many people, including: @vicroms, @myd7349, @PhoubeHui, @LilyWangL, @JonLiu1993, @RT2Code, Remy Tassoux, @wangao, @BillyONeal, @jiayuehua, @dg0yt, @gerar-ryan-immersaview, @nickdademo, and @jimwang118 -- Thank you so much! [vcpkg port]: https://github.com/microsoft/vcpkg/tree/master/ports/mimalloc ================================================ FILE: third-party/mimalloc/contrib/vcpkg/usage ================================================ Use the following CMake targets to import mimalloc: find_package(mimalloc CONFIG REQUIRED) target_link_libraries(main PRIVATE mimalloc) And use mimalloc in your sources as: #include <mimalloc.h> #include <stdio.h> int main(int argc, char** argv) { int* p = mi_malloc_tp(int); *p = mi_version(); printf("mimalloc version: %d\n", *p); mi_free(p); return 0; } When dynamically overriding on Windows, ensure `mimalloc.dll` is linked through some call to mimalloc (e.g. `mi_version()`), and that the `mimalloc-redirect.dll` is in the same directory. See https://github.com/microsoft/mimalloc/blob/dev/bin/readme.md for detailed information.
================================================
FILE: third-party/mimalloc/contrib/vcpkg/vcpkg-cmake-wrapper.cmake
================================================
# Wrapper around find_package(mimalloc) for vcpkg: forwards the call, then
# points the MIMALLOC_*_DIR variables at the vcpkg installation tree and makes
# sure a `mimalloc` target exists.
_find_package(${ARGS})

# Only override the directories when the MIMALLOC_CMAKE_DIR/MIMALLOC_VERSION_DIR
# pair set by the package configuration refers to the directory of this file.
if(CMAKE_CURRENT_LIST_DIR STREQUAL "${MIMALLOC_CMAKE_DIR}/${MIMALLOC_VERSION_DIR}")
  set(MIMALLOC_INCLUDE_DIR "${VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/include")
  # As in vcpkg.cmake
  # An undefined CMAKE_BUILD_TYPE or any capitalisation of "debug" selects the debug libraries.
  if(NOT DEFINED CMAKE_BUILD_TYPE OR CMAKE_BUILD_TYPE MATCHES "^[Dd][Ee][Bb][Uu][Gg]$")
    set(MIMALLOC_LIBRARY_DIR "${VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/debug/lib")
  else()
    set(MIMALLOC_LIBRARY_DIR "${VCPKG_INSTALLED_DIR}/${VCPKG_TARGET_TRIPLET}/lib")
  endif()
  # Object and (legacy) target directories follow the library directory.
  set(MIMALLOC_OBJECT_DIR "${MIMALLOC_LIBRARY_DIR}")
  set(MIMALLOC_TARGET_DIR "${MIMALLOC_LIBRARY_DIR}")
endif()

# vcpkg always configures either a static or dynamic library.
# ensure to always expose the mimalloc target as either the static or dynamic build.
if(TARGET mimalloc-static AND NOT TARGET mimalloc)
  # Only the static target exists: provide `mimalloc` as an interface target that links to it.
  add_library(mimalloc INTERFACE IMPORTED)
  set_target_properties(mimalloc PROPERTIES INTERFACE_LINK_LIBRARIES mimalloc-static)
endif()

================================================ FILE: third-party/mimalloc/contrib/vcpkg/vcpkg.json ================================================ { "name": "mimalloc", "version": "2.2.2", "port-version": 1, "description": "Compact general purpose allocator with excellent performance", "homepage": "https://github.com/microsoft/mimalloc", "license": "MIT", "supports": "!uwp", "dependencies": [ { "name": "vcpkg-cmake", "host": true }, { "name": "vcpkg-cmake-config", "host": true } ], "features": { "c": { "description": "Use C11 compilation (this can still override new/delete)" }, "override": { "description": "Override the standard malloc/free interface" }, "secure": { "description": "Use full security mitigations (like guard pages and randomization)" }, "guarded": { "description": "Use build that support guard pages after objects controlled with MIMALLOC_GUARDED_SAMPLE_RATE" }, "xmalloc": {
"description": "If out-of-memory, call abort() instead of returning NULL" }, "optarch": { "description": "Use architecture specific optimizations (on x64: '-march=haswell;-mavx2', on arm64: '-march=armv8.1-a')" }, "optsimd": { "description": "Allow use of SIMD instructions (avx2 or neon) (requires 'optarch' to be enabled)" }, "asm": { "description": "Generate assembly files" } } } ================================================ FILE: third-party/mimalloc/doc/doxyfile ================================================ # Doxyfile 1.11.0 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. # # All text after a double hash (##) is considered a comment and is placed in # front of the TAG it is preceding. # # All text after a single hash (#) is considered a comment and will be ignored. # The format is: # TAG = value [value, ...] # For lists, items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (\" \"). # # Note: # # Use doxygen to compare the used configuration file with the template # configuration file: # doxygen -x [configFile] # Use doxygen to compare the used configuration file with the template # configuration file without replacing the environment variables or CMake type # replacement variables: # doxygen -x_noenv [configFile] #--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- # This tag specifies the encoding used for all characters in the configuration # file that follow. The default is UTF-8 which is also the encoding used for all # text before the first occurrence of this tag. Doxygen uses libiconv (or the # iconv built into libc) for the transcoding. See # https://www.gnu.org/software/libiconv/ for the list of possible encodings. # The default value is: UTF-8. 
DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or a sequence of words surrounded by # double-quotes, unless you are using Doxywizard) that should identify the # project for which the documentation is generated. This name is used in the # title of most generated pages and in a few other places. # The default value is: My Project. PROJECT_NAME = mi-malloc # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version # control system is used. PROJECT_NUMBER = 1.8/2.1 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a # quick idea about the purpose of the project. Keep the description short. PROJECT_BRIEF = # With the PROJECT_LOGO tag one can specify a logo or an icon that is included # in the documentation. The maximum height of the logo should not exceed 55 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy # the logo to the output directory. PROJECT_LOGO = mimalloc-logo.svg # With the PROJECT_ICON tag one can specify an icon that is included in the tabs # when the HTML document is shown. Doxygen will copy the logo to the output # directory. PROJECT_ICON = # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path # into which the generated documentation will be written. If a relative path is # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. OUTPUT_DIRECTORY = .. # If the CREATE_SUBDIRS tag is set to YES then doxygen will create up to 4096 # sub-directories (in 2 levels) under the output directory of each output format # and will distribute the generated files over these directories. 
Enabling this # option can be useful when feeding doxygen a huge amount of source files, where # putting all generated files in the same directory would otherwise cause # performance problems for the file system. Adapt CREATE_SUBDIRS_LEVEL to # control the number of sub-directories. # The default value is: NO. CREATE_SUBDIRS = NO # Controls the number of sub-directories that will be created when # CREATE_SUBDIRS tag is set to YES. Level 0 represents 16 directories, and every # level increment doubles the number of directories, resulting in 4096 # directories at level 8 which is the default and also the maximum value. The # sub-directories are organized in 2 levels, the first level always has a fixed # number of 16 directories. # Minimum value: 0, maximum value: 8, default value: 8. # This tag requires that the tag CREATE_SUBDIRS is set to YES. CREATE_SUBDIRS_LEVEL = 8 # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII # characters to appear in the names of generated files. If set to NO, non-ASCII # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode # U+3044. # The default value is: NO. ALLOW_UNICODE_NAMES = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. # Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Bulgarian, # Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish, Dutch, English # (United States), Esperanto, Farsi (Persian), Finnish, French, German, Greek, # Hindi, Hungarian, Indonesian, Italian, Japanese, Japanese-en (Japanese with # English messages), Korean, Korean-en (Korean with English messages), Latvian, # Lithuanian, Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, # Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, # Swedish, Turkish, Ukrainian and Vietnamese.
# The default value is: English. OUTPUT_LANGUAGE = English # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. # The default value is: YES. BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief # description of a member or function before the detailed description # # Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. # The default value is: YES. REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator that is # used to form the text in various listings. Each string in this list, if found # as the leading text of the brief description, will be stripped from the text # and the result, after processing the whole list, is used as the annotated # text. Otherwise, the brief description is used as-is. If left blank, the # following values are used ($name is automatically replaced with the name of # the entity):The $name class, The $name widget, The $name file, is, provides, # specifies, contains, represents, a, an and the. ABBREVIATE_BRIEF = "The $name class" \ "The $name widget" \ "The $name file" \ is \ provides \ specifies \ contains \ represents \ a \ an \ the # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # doxygen will generate a detailed section even if there is only a brief # description. # The default value is: NO. ALWAYS_DETAILED_SEC = NO # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. # The default value is: NO. 
INLINE_INHERITED_MEMB = NO # If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path # before file names in the file list and in the header files. If set to NO the # shortest path that makes the file name unique will be used # The default value is: YES. FULL_PATH_NAMES = YES # The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. # Stripping is only done if one of the specified strings matches the left-hand # part of the path. The tag can be used to show relative paths in the file list. # If left blank the directory from which doxygen is run is used as the path to # strip. # # Note that you can specify absolute paths here, but also relative paths, which # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which # header file to include in order to use a class. If left blank only the name of # the header file containing the class definition is used. Otherwise one should # specify the list of include paths that are normally passed to the compiler # using the -I flag. STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful if your file system doesn't # support long names like on DOS, Mac, or CD-ROM. # The default value is: NO. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the # first line (until the first dot) of a Javadoc-style comment as the brief # description. If set to NO, the Javadoc-style will behave just like regular Qt- # style comments (thus requiring an explicit @brief command for a brief # description.) # The default value is: NO.
JAVADOC_AUTOBRIEF = YES # If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line # such as # /*************** # as being the beginning of a Javadoc-style comment "banner". If set to NO, the # Javadoc-style will behave just like regular comments and it will not be # interpreted by doxygen. # The default value is: NO. JAVADOC_BANNER = NO # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. If # set to NO, the Qt-style will behave just like regular Qt-style comments (thus # requiring an explicit \brief command for a brief description.) # The default value is: NO. QT_AUTOBRIEF = NO # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a # multi-line C++ special comment block (i.e. a block of //! or /// comments) as # a brief description. This used to be the default behavior. The new default is # to treat a multi-line C++ comment block as a detailed description. Set this # tag to YES if you prefer the old behavior instead. # # Note that setting this tag to YES also means that rational rose comments are # not recognized any more. # The default value is: NO. MULTILINE_CPP_IS_BRIEF = NO # By default Python docstrings are displayed as preformatted text and doxygen's # special commands cannot be used. By setting PYTHON_DOCSTRING to NO the # doxygen's special commands can be used and the contents of the docstring # documentation blocks is shown as doxygen documentation. # The default value is: YES. PYTHON_DOCSTRING = YES # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the # documentation from any documented member that it re-implements. # The default value is: YES. INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new # page for each member. If set to NO, the documentation of a member will be part # of the file/class/namespace that contains it. 
# The default value is: NO. SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen # uses this value to replace tabs by spaces in code fragments. # Minimum value: 1, maximum value: 16, default value: 4. TAB_SIZE = 2 # This tag can be used to specify a number of aliases that act as commands in # the documentation. An alias has the form: # name=value # For example adding # "sideeffect=@par Side Effects:^^" # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading # "Side Effects:". Note that you cannot put \n's in the value part of an alias # to insert newlines (in the resulting output). You can put ^^ in the value part # of an alias to insert a newline as if a physical newline was in the original # file. When you need a literal { or } or , in the value part of an alias you # have to escape them by means of a backslash (\), this can lead to conflicts # with the commands \{ and \} for these it is advised to use the version @{ and # @} or use a double escape (\\{ and \\}) ALIASES = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For # instance, some of the names that are used will be different. The list of all # members will be omitted, etc. # The default value is: NO. OPTIMIZE_OUTPUT_FOR_C = YES # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or # Python sources only. Doxygen will then generate output that is more tailored # for that language. For instance, namespaces will be presented as packages, # qualified scopes will look different, etc. # The default value is: NO. OPTIMIZE_OUTPUT_JAVA = NO # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran # sources. Doxygen will then generate output that is tailored for Fortran. # The default value is: NO. 
OPTIMIZE_FOR_FORTRAN = NO # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for VHDL. # The default value is: NO. OPTIMIZE_OUTPUT_VHDL = NO # Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice # sources only. Doxygen will then generate output that is more tailored for that # language. For instance, namespaces will be presented as modules, types will be # separated into more groups, etc. # The default value is: NO. OPTIMIZE_OUTPUT_SLICE = NO # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and # language is one of the parsers supported by doxygen: IDL, Java, JavaScript, # Csharp (C#), C, C++, Lex, D, PHP, md (Markdown), Objective-C, Python, Slice, # VHDL, Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: # FortranFree, unknown formatted Fortran: Fortran. In the latter case the parser # tries to guess whether the code is fixed or free formatted code, this is the # default for Fortran type files). For instance to make doxygen treat .inc files # as Fortran files (default is PHP), and .f files as C (default is Fortran), # use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. When specifying no_extension you should add # * to the FILE_PATTERNS. # # Note see also the list of default file extension mappings. EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable # documentation.
See https://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in # case of backward compatibility issues. # The default value is: YES. MARKDOWN_SUPPORT = YES # When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up # to that level are automatically included in the table of contents, even if # they do not have an id attribute. # Note: This feature currently applies only to Markdown headings. # Minimum value: 0, maximum value: 99, default value: 6. # This tag requires that the tag MARKDOWN_SUPPORT is set to YES. TOC_INCLUDE_HEADINGS = 0 # The MARKDOWN_ID_STYLE tag can be used to specify the algorithm used to # generate identifiers for the Markdown headings. Note: Every identifier is # unique. # Possible values are: DOXYGEN use a fixed 'autotoc_md' string followed by a # sequence number starting at 0 and GITHUB use the lower case version of title # with any whitespace replaced by '-' and punctuation characters removed. # The default value is: DOXYGEN. # This tag requires that the tag MARKDOWN_SUPPORT is set to YES. MARKDOWN_ID_STYLE = DOXYGEN # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or # globally by setting AUTOLINK_SUPPORT to NO. # The default value is: YES. AUTOLINK_SUPPORT = YES # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should set this # tag to YES in order to let doxygen match functions declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); # versus func(std::string) {}).
This also makes the inheritance and # collaboration diagrams that involve STL classes more complete and accurate. # The default value is: NO. BUILTIN_STL_SUPPORT = NO # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. # The default value is: NO. CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: # https://www.riverbankcomputing.com/software) sources only. Doxygen will parse # them like normal C++ but will assume all classes use public instead of private # inheritance when no explicit protection keyword is present. # The default value is: NO. SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate # getter and setter methods for a property. Setting this option to YES will make # doxygen replace the get and set methods by a property in the documentation. # This will only work if the methods are indeed getting or setting a simple # type. If this is not the case, or you want to show the methods anyway, you # should set this option to NO. # The default value is: YES. IDL_PROPERTY_SUPPORT = YES # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. # The default value is: NO. DISTRIBUTE_GROUP_DOC = NO # If one adds a struct or class to a group and this option is enabled, then also # any nested class or struct is added to the same group. By default this option # is disabled and one has to add nested compounds explicitly via \ingroup. # The default value is: NO. GROUP_NESTED_COMPOUNDS = NO # Set the SUBGROUPING tag to YES to allow class member groups of the same type # (for instance a group of public functions) to be put as a subgroup of that # type (e.g. under the Public Functions section).
Set it to NO to prevent # subgrouping. Alternatively, this can be done per class using the # \nosubgrouping command. # The default value is: YES. SUBGROUPING = YES # When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions # are shown inside the group in which they are included (e.g. using \ingroup) # instead of on a separate page (for HTML and Man pages) or section (for LaTeX # and RTF). # # Note that this feature does not work in combination with # SEPARATE_MEMBER_PAGES. # The default value is: NO. INLINE_GROUPED_CLASSES = NO # When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions # with only public data fields or simple typedef fields will be shown inline in # the documentation of the scope in which they are defined (i.e. file, # namespace, or group documentation), provided this scope is documented. If set # to NO, structs, classes, and unions are shown on a separate page (for HTML and # Man pages) or section (for LaTeX and RTF). # The default value is: NO. INLINE_SIMPLE_STRUCTS = YES # When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or # enum is documented as struct, union, or enum with the name of the typedef. So # typedef struct TypeS {} TypeT, will appear in the documentation as a struct # with name TypeT. When disabled the typedef will appear as a member of a file, # namespace, or class. And the struct will be named TypeS. This can typically be # useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. # The default value is: NO. TYPEDEF_HIDES_STRUCT = YES # The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This # cache is used to resolve symbols given their name and scope. Since this can be # an expensive process and often the same symbol appears multiple times in the # code, doxygen keeps a cache of pre-resolved symbols. 
If the cache is too small # doxygen will become slower. If the cache is too large, memory is wasted. The # cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range # is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 # symbols. At the end of a run doxygen will report the cache usage and suggest # the optimal cache size from a speed point of view. # Minimum value: 0, maximum value: 9, default value: 0. LOOKUP_CACHE_SIZE = 0 # The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use # during processing. When set to 0 doxygen will base this on the number of # cores available in the system. You can set it explicitly to a value larger # than 0 to get more control over the balance between CPU load and processing # speed. At this moment only the input processing can be done using multiple # threads. Since this is still an experimental feature the default is set to 1, # which effectively disables parallel processing. Please report any issues you # encounter. Generating dot graphs in parallel is controlled by the # DOT_NUM_THREADS setting. # Minimum value: 0, maximum value: 32, default value: 1. NUM_PROC_THREADS = 1 # If the TIMESTAMP tag is set different from NO then each generated page will # contain the date or date and time when the page was generated. Setting this to # NO can help when comparing the output of multiple runs. # Possible values are: YES, NO, DATETIME and DATE. # The default value is: NO. TIMESTAMP = NO #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- # If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in # documentation are documented, even if no documentation was available. Private # class members and static file members will be hidden unless the # EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
# Note: This will also disable the warnings about undocumented members that are # normally produced when WARNINGS is set to YES. # The default value is: NO. EXTRACT_ALL = YES # If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will # be included in the documentation. # The default value is: NO. EXTRACT_PRIVATE = NO # If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual # methods of a class will be included in the documentation. # The default value is: NO. EXTRACT_PRIV_VIRTUAL = NO # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO. EXTRACT_PACKAGE = NO # If the EXTRACT_STATIC tag is set to YES, all static members of a file will be # included in the documentation. # The default value is: NO. EXTRACT_STATIC = NO # If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined # locally in source files will be included in the documentation. If set to NO, # only classes defined in header files are included. Does not have any effect # for Java sources. # The default value is: YES. EXTRACT_LOCAL_CLASSES = YES # This flag is only useful for Objective-C code. If set to YES, local methods, # which are defined in the implementation section but not in the interface are # included in the documentation. If set to NO, only methods in the interface are # included. # The default value is: NO. EXTRACT_LOCAL_METHODS = NO # If this flag is set to YES, the members of anonymous namespaces will be # extracted and appear in the documentation as a namespace called # 'anonymous_namespace{file}', where file will be replaced with the base name of # the file that contains the anonymous namespace. By default anonymous namespace # are hidden. # The default value is: NO. EXTRACT_ANON_NSPACES = NO # If this flag is set to YES, the name of an unnamed parameter in a declaration # will be determined by the corresponding definition. 
By default unnamed # parameters remain unnamed in the output. # The default value is: YES. RESOLVE_UNNAMED_PARAMS = YES # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation # section is generated. This option has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. If set # to NO, these classes will be included in the various overviews. This option # will also hide undocumented C++ concepts if enabled. This option has no effect # if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend # declarations. If set to NO, these declarations will be included in the # documentation. # The default value is: NO. HIDE_FRIEND_COMPOUNDS = NO # If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any # documentation blocks found inside the body of a function. If set to NO, these # blocks will be appended to the function's detailed documentation block. # The default value is: NO. HIDE_IN_BODY_DOCS = NO # The INTERNAL_DOCS tag determines if documentation that is typed after a # \internal command is included. If the tag is set to NO then the documentation # will be excluded. Set it to YES to include the internal documentation. # The default value is: NO. INTERNAL_DOCS = NO # With the correct setting of option CASE_SENSE_NAMES doxygen will better be # able to match the capabilities of the underlying filesystem. In case the # filesystem is case sensitive (i.e. it supports files in the same directory # whose names only differ in casing), the option must be set to YES to properly # deal with such files in case they appear in the input. 
For filesystems that # are not case sensitive the option should be set to NO to properly deal with # output files written for symbols that only differ in casing, such as for two # classes, one named CLASS and the other named Class, and to also support # references to files without having to specify the exact matching casing. On # Windows (including Cygwin) and MacOS, users should typically set this option # to NO, whereas on Linux or other Unix flavors it should typically be set to # YES. # Possible values are: SYSTEM, NO and YES. # The default value is: SYSTEM. CASE_SENSE_NAMES = NO # If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with # their full class and namespace scopes in the documentation. If set to YES, the # scope will be hidden. # The default value is: NO. HIDE_SCOPE_NAMES = NO # If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will # append additional text to a page's title, such as Class Reference. If set to # YES the compound reference will be hidden. # The default value is: NO. HIDE_COMPOUND_REFERENCE= NO # If the SHOW_HEADERFILE tag is set to YES then the documentation for a class # will show which file needs to be included to use the class. # The default value is: YES. SHOW_HEADERFILE = YES # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of # the files that are included by a file in the documentation of that file. # The default value is: YES. SHOW_INCLUDE_FILES = YES # If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each # grouped member an include statement to the documentation, telling the reader # which file to include in order to use the member. # The default value is: NO. SHOW_GROUPED_MEMB_INC = NO # If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include # files with double quotes in the documentation rather than with sharp brackets. # The default value is: NO. 
FORCE_LOCAL_INCLUDES = NO # If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the # documentation for inline members. # The default value is: YES. INLINE_INFO = YES # If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the # (detailed) documentation of file and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. # The default value is: YES. SORT_MEMBER_DOCS = YES # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief # descriptions of file, namespace and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. Note that # this will also influence the order of the classes in the class list. # The default value is: NO. SORT_BRIEF_DOCS = NO # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the # (brief and detailed) documentation of class members so that constructors and # destructors are listed first. If set to NO the constructors will appear in the # respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. # Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief # member documentation. # Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting # detailed member documentation. # The default value is: NO. SORT_MEMBERS_CTORS_1ST = NO # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy # of group names into alphabetical order. If set to NO the group names will # appear in their defined order. # The default value is: NO. SORT_GROUP_NAMES = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by # fully-qualified names, including namespaces. If set to NO, the class list will # be sorted only by class name, not including the namespace part. # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. 
# Note: This option applies only to the class list, not to the alphabetical # list. # The default value is: NO. SORT_BY_SCOPE_NAME = NO # If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper # type resolution of all parameters of a function it will reject a match between # the prototype and the implementation of a member function even if there is # only one candidate or it is obvious which candidate to choose by doing a # simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still # accept a match between prototype and implementation in such cases. # The default value is: NO. STRICT_PROTO_MATCHING = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo # list. This list is created by putting \todo commands in the documentation. # The default value is: YES. GENERATE_TODOLIST = YES # The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test # list. This list is created by putting \test commands in the documentation. # The default value is: YES. GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug # list. This list is created by putting \bug commands in the documentation. # The default value is: YES. GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) # the deprecated list. This list is created by putting \deprecated commands in # the documentation. # The default value is: YES. GENERATE_DEPRECATEDLIST= YES # The ENABLED_SECTIONS tag can be used to enable conditional documentation # sections, marked by \if ... \endif and \cond # ... \endcond blocks. ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the # documentation. If the initializer consists of more lines than specified here # it will be hidden. Use a value of 0 to hide initializers completely. 
The # appearance of the value of individual variables and macros / defines can be # controlled using \showinitializer or \hideinitializer command in the # documentation regardless of this setting. # Minimum value: 0, maximum value: 10000, default value: 30. MAX_INITIALIZER_LINES = 0 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated at # the bottom of the documentation of classes and structs. If set to YES, the # list will mention the files that were used to generate the documentation. # The default value is: YES. SHOW_USED_FILES = NO # Set the SHOW_FILES tag to NO to disable the generation of the Files page. This # will remove the Files entry from the Quick Index and from the Folder Tree View # (if specified). # The default value is: YES. SHOW_FILES = NO # Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces # page. This will remove the Namespaces entry from the Quick Index and from the # Folder Tree View (if specified). # The default value is: YES. SHOW_NAMESPACES = YES # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from # the version control system). Doxygen will invoke the program by executing (via # popen()) the command command input-file, where command is the value of the # FILE_VERSION_FILTER tag, and input-file is the name of an input file provided # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. 
You can # optionally specify a file name after the option, if omitted DoxygenLayout.xml # will be used as the name of the layout file. See also section "Changing the # layout of pages" for information. # # Note that if you run doxygen from a directory containing a file called # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib # extension is automatically appended if omitted. This requires the bibtex tool # to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. CITE_BIB_FILES = #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated to # standard output by doxygen. If QUIET is set to YES this implies that the # messages are off. # The default value is: NO. QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES # this implies that the warnings are on. # # Tip: Turn warnings on while writing the documentation. # The default value is: YES. WARNINGS = YES # If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate # warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag # will automatically be disabled. # The default value is: YES. 
WARN_IF_UNDOCUMENTED = YES # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as documenting some parameters in # a documented function twice, or documenting parameters that don't exist or # using markup commands wrongly. # The default value is: YES. WARN_IF_DOC_ERROR = YES # If WARN_IF_INCOMPLETE_DOC is set to YES, doxygen will warn about incomplete # function parameter documentation. If set to NO, doxygen will accept that some # parameters have no documentation without warning. # The default value is: YES. WARN_IF_INCOMPLETE_DOC = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return # value. If set to NO, doxygen will only warn about wrong parameter # documentation, but not about the absence of documentation. If EXTRACT_ALL is # set to YES then this flag will automatically be disabled. See also # WARN_IF_INCOMPLETE_DOC # The default value is: NO. WARN_NO_PARAMDOC = NO # If WARN_IF_UNDOC_ENUM_VAL option is set to YES, doxygen will warn about # undocumented enumeration values. If set to NO, doxygen will accept # undocumented enumeration values. If EXTRACT_ALL is set to YES then this flag # will automatically be disabled. # The default value is: NO. WARN_IF_UNDOC_ENUM_VAL = NO # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when # a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS # then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but # at the end of the doxygen process doxygen will return with a non-zero status. 
# If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS_PRINT then doxygen behaves # like FAIL_ON_WARNINGS but in case no WARN_LOGFILE is defined doxygen will not # write the warning messages in between other messages but write them at the end # of a run. In case a WARN_LOGFILE is defined the warning messages will, # besides being in the defined file, also be shown at the end of a run, unless # the WARN_LOGFILE is defined as - (i.e. standard output (stdout)); in that case # the behavior will remain as with the setting FAIL_ON_WARNINGS. # Possible values are: NO, YES, FAIL_ON_WARNINGS and FAIL_ON_WARNINGS_PRINT. # The default value is: NO. WARN_AS_ERROR = NO # The WARN_FORMAT tag determines the format of the warning messages that doxygen # can produce. The string should contain the $file, $line, and $text tags, which # will be replaced by the file and line number from which the warning originated # and the warning text. Optionally the format may contain $version, which will # be replaced by the version of the file (if it could be obtained via # FILE_VERSION_FILTER) # See also: WARN_LINE_FORMAT # The default value is: $file:$line: $text. WARN_FORMAT = "$file:$line: $text" # In the $text part of the WARN_FORMAT command it is possible that a reference # to a more specific place is given. To make it easier to jump to this place # (outside of doxygen) the user can define a custom "cut" / "paste" string. # Example: # WARN_LINE_FORMAT = "'vi $file +$line'" # See also: WARN_FORMAT # The default value is: at line $line of file $file. WARN_LINE_FORMAT = "at line $line of file $file" # The WARN_LOGFILE tag can be used to specify a file to which warning and error # messages should be written. If left blank the output is written to standard # error (stderr). In case the file specified cannot be opened for writing the # warning and error messages are written to standard error. When - is # specified as the file, the warning and error messages are written to standard output # (stdout).
WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag is used to specify the files and/or directories that contain # documented source files. You may enter file names like myfile.cpp or # directories like /usr/src/myproject. Separate the files or directories with # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. INPUT = mimalloc-doc.h # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv # documentation (see: # https://www.gnu.org/software/libiconv/) for the list of possible encodings. # See also: INPUT_FILE_ENCODING # The default value is: UTF-8. INPUT_ENCODING = UTF-8 # This tag can be used to specify the character encoding of the source files # that doxygen parses. The INPUT_FILE_ENCODING tag can be used to specify # character encoding on a per file pattern basis. Doxygen will compare the file # name with each pattern and apply the encoding (instead of the default # INPUT_ENCODING) if there is a match. The character encodings are a list of the # form: pattern=encoding (like *.php=ISO-8859-1). # See also: INPUT_ENCODING for further information on supported encodings. INPUT_FILE_ENCODING = # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen.
# # Note the list of default checked file patterns might differ from the list of # default file extension mappings. # # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cxxm, # *.cpp, *.cppm, *.ccm, *.c++, *.c++m, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, # *.idl, *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp, *.h++, *.ixx, *.l, *.cs, *.d, # *.php, *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to # be provided as doxygen C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, # *.f18, *.f, *.for, *.vhd, *.vhdl, *.ucf, *.qsf and *.ice. FILE_PATTERNS = *.c \ *.cc \ *.cxx \ *.cpp \ *.c++ \ *.java \ *.ii \ *.ixx \ *.ipp \ *.i++ \ *.inl \ *.idl \ *.ddl \ *.odl \ *.h \ *.hh \ *.hxx \ *.hpp \ *.h++ \ *.cs \ *.d \ *.php \ *.php4 \ *.php5 \ *.phtml \ *.inc \ *.m \ *.markdown \ *.md \ *.mm \ *.dox \ *.py \ *.pyw \ *.f90 \ *.f95 \ *.f03 \ *.f08 \ *.f \ *.for \ *.tcl \ *.vhd \ *.vhdl \ *.ucf \ *.qsf # The RECURSIVE tag can be used to specify whether or not subdirectories should # be searched for input files as well. # The default value is: NO. RECURSIVE = NO # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. # # Note that relative paths are relative to the directory from which doxygen is # run. EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded # from the input. # The default value is: NO. EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories. 
# # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, # ANamespace::AClass, ANamespace::*Test EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. If left blank all # files are included. EXAMPLE_PATTERNS = * # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude commands # irrespective of the value of the RECURSIVE tag. # The default value is: NO. EXAMPLE_RECURSIVE = NO # The IMAGE_PATH tag can be used to specify one or more files or directories # that contain images that are to be included in the documentation (see the # \image command). IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program # by executing (via popen()) the command: # # <filter> <input-file> # # where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the # name of an input file. Doxygen will then use the output that the filter # program writes to standard output. If FILTER_PATTERNS is specified, this tag # will be ignored. # # Note that the filter must not add or remove lines; it is applied before the # code is scanned, but not when the output code is generated.
If lines are added # or removed, the anchors will not be placed correctly. # # Note that doxygen will use the data processed and written to standard output # for further processing, therefore nothing else, like debug statements or used # commands (so in case of a Windows batch file always use @echo OFF), should be # written to standard output. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the # filter if there is a match. The filters are a list of the form: pattern=filter # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen. FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for # producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). # The default value is: NO. FILTER_SOURCE_FILES = NO # The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file # pattern. A pattern will override the setting for FILTER_PATTERN (if any) and # it is also possible to disable source filtering for a specific pattern using # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. 
FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. USE_MDFILE_AS_MAINPAGE = # The Fortran standard specifies that for fixed formatted Fortran code all # characters from position 72 are to be considered as comment. A common # extension is to allow longer lines before the automatic comment starts. The # setting FORTRAN_COMMENT_AFTER will also make it possible that longer lines can # be processed before the automatic comment starts. # Minimum value: 7, maximum value: 10000, default value: 72. FORTRAN_COMMENT_AFTER = 72 #--------------------------------------------------------------------------- # Configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will be # generated. Documented entities will be cross-referenced with these sources. # # Note: To get rid of all source code in the generated output, make sure that # also VERBATIM_HEADERS is set to NO. # The default value is: NO. SOURCE_BROWSER = NO # Setting the INLINE_SOURCES tag to YES will include the body of functions, # multi-line macros, enums or list initialized variables directly into the # documentation. # The default value is: NO. INLINE_SOURCES = NO # Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any # special comment blocks from generated source code fragments. Normal C, C++ and # Fortran comments will always remain visible. # The default value is: YES. STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES then for each documented # entity all documented functions referencing it will be listed. # The default value is: NO. 
REFERENCED_BY_RELATION = NO # If the REFERENCES_RELATION tag is set to YES then for each documented function # all documented entities called/used by that function will be listed. # The default value is: NO. REFERENCES_RELATION = NO # If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set # to YES then the hyperlinks from functions in REFERENCES_RELATION and # REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will # link to the documentation. # The default value is: YES. REFERENCES_LINK_SOURCE = YES # If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the # source code will show a tooltip with additional information such as prototype, # brief description and links to the definition and documentation. Since this # will make the HTML file larger and loading of large files a bit slower, you # can opt to disable this feature. # The default value is: YES. # This tag requires that the tag SOURCE_BROWSER is set to YES. SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system # (see https://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. # # To use it do the following: # - Install the latest version of global # - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # # Doxygen will invoke htags (and that will in turn invoke gtags), so these # tools must be available from the command line (i.e. in the search path). # # The result: instead of the source browser generated by doxygen, the links to # source code will now point to the output of htags. # The default value is: NO. # This tag requires that the tag SOURCE_BROWSER is set to YES. 
USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a # verbatim copy of the header file for each class for which an include is # specified. Set to NO to disable this. # See also: Section \class. # The default value is: YES. VERBATIM_HEADERS = YES # If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the # clang parser (see: # http://clang.llvm.org/) for more accurate parsing at the cost of reduced # performance. This can be particularly helpful with template rich C++ code for # which doxygen's built-in parser lacks the necessary type information. # Note: The availability of this option depends on whether or not doxygen was # generated with the -Duse_libclang=ON option for CMake. # The default value is: NO. CLANG_ASSISTED_PARSING = NO # If the CLANG_ASSISTED_PARSING tag is set to YES and the CLANG_ADD_INC_PATHS # tag is set to YES then doxygen will add the directory of each input to the # include path. # The default value is: YES. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. CLANG_ADD_INC_PATHS = YES # If clang assisted parsing is enabled you can provide the compiler with command # line options that you would normally use when invoking the compiler. Note that # the include paths will already be set by doxygen for the files and directories # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. CLANG_OPTIONS = # If clang assisted parsing is enabled you can provide the clang parser with the # path to the directory containing a file called compile_commands.json. This # file is the compilation database (see: # http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) containing the # options used when the source files were built. This is equivalent to # specifying the -p option to a clang tool, such as clang-check. These options # will then be passed to the parser. Any options specified with CLANG_OPTIONS # will be added as well.
# Note: The availability of this option depends on whether or not doxygen was # generated with the -Duse_libclang=ON option for CMake. CLANG_DATABASE_PATH = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all # compounds will be generated. Enable this if the project contains a lot of # classes, structs, unions or interfaces. # The default value is: YES. ALPHABETICAL_INDEX = YES # The IGNORE_PREFIX tag can be used to specify a prefix (or a list of prefixes) # that should be ignored while generating the index headers. The IGNORE_PREFIX # tag works for classes, function and member names. The entity will be placed in # the alphabetical list under the first letter of the entity name that remains # after removing the prefix. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output # The default value is: YES. GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of # it. # The default directory is: html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_OUTPUT = docs # The HTML_FILE_EXTENSION tag can be used to specify the file extension for each # generated HTML page (for example: .htm, .php, .asp). # The default value is: .html. # This tag requires that the tag GENERATE_HTML is set to YES. 
HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a user-defined HTML header file for # each generated HTML page. If the tag is left blank doxygen will generate a # standard header. # # To get valid HTML, the header file must include any scripts and style sheets # that doxygen needs, which depends on the configuration options used (e.g. # the setting GENERATE_TREEVIEW). It is highly recommended to start with a # default header using # doxygen -w html new_header.html new_footer.html new_stylesheet.css # YourConfigFile # and then modify the file new_header.html. See also section "Doxygen usage" # for information on how to generate the default header that doxygen normally # uses. # Note: The header is subject to change so you typically have to regenerate the # default header when upgrading to a newer version of doxygen. For a description # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard # footer. See HTML_HEADER for more information on how to generate a default # footer and what special commands can be used inside the footer. See also # section "Doxygen usage" for information on how to generate the default footer # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of # the HTML output. If left blank doxygen will generate a default style sheet. # See also section "Doxygen usage" for information on how to generate the style # sheet that doxygen normally uses.
# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as # it is more robust and this tag (HTML_STYLESHEET) will in the future become # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets # created by doxygen. Using this option one can overrule certain style aspects. # This is preferred over using HTML_STYLESHEET since it does not replace the # standard style sheet and is therefore more robust against future updates. # Doxygen will copy the style sheet files to the output directory. # Note: The order of the extra style sheet files is of importance (e.g. the last # style sheet in the list overrules the setting of the previous ones in the # list). # Note: Since the styling of scrollbars can currently not be overruled in # Webkit/Chromium, the styling will be left out of the default doxygen.css if # one or more extra stylesheets have been specified. So if scrollbar # customization is desired it has to be added explicitly. For an example see the # documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_STYLESHEET = mimalloc-doxygen.css # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note # that these files will be copied to the base HTML output directory. Use the # $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these # files. In the HTML_STYLESHEET file, use the file name only. Also note that the # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_FILES = # The HTML_COLORSTYLE tag can be used to specify if the generated HTML output # should be rendered with a dark or light theme. 
# Possible values are: LIGHT always generates light mode output, DARK always # generates dark mode output, AUTO_LIGHT automatically sets the mode according # to the user preference, uses light mode if no preference is set (the default), # AUTO_DARK automatically sets the mode according to the user preference, uses # dark mode if no preference is set and TOGGLE allows a user to switch between # light and dark mode via a button. # The default value is: AUTO_LIGHT. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE = LIGHT # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to # this color. Hue is specified as an angle on a color-wheel, see # https://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 is # purple, and 360 is red again. # Minimum value: 0, maximum value: 359, default value: 220. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_HUE = 189 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors # in the HTML output. For a value of 0 the output will use gray-scales only. A # value of 255 will produce the most vivid colors. # Minimum value: 0, maximum value: 255, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_SAT = 12 # The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the # luminance component of the colors in the HTML output. Values below 100 # gradually make the output lighter, whereas values above 100 make the output # darker. The value divided by 100 is the actual gamma applied, so 80 represents # a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not # change the gamma. # Minimum value: 40, maximum value: 240, default value: 80. # This tag requires that the tag GENERATE_HTML is set to YES.
HTML_COLORSTYLE_GAMMA = 240 # If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML # documentation will contain a main index with vertical navigation menus that # are dynamically created via JavaScript. If disabled, the navigation index will # consist of multiple levels of tabs that are statically embedded in every HTML # page. Disable this option to support browsers that do not have JavaScript, # like the Qt help browser. # The default value is: YES. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_DYNAMIC_MENUS = NO # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_DYNAMIC_SECTIONS = NO # If the HTML_CODE_FOLDING tag is set to YES then classes and functions can be # dynamically folded and expanded in the generated HTML source code. # The default value is: YES. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_CODE_FOLDING = YES # If the HTML_COPY_CLIPBOARD tag is set to YES then doxygen will show an icon in # the top right corner of code and text fragments that allows the user to copy # its content to the clipboard. Note this only works if supported by the browser # and the web page is served via a secure context (see: # https://www.w3.org/TR/secure-contexts/), i.e. using the https: or file: # protocol. # The default value is: YES. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COPY_CLIPBOARD = YES # Doxygen stores a couple of settings persistently in the browser (via e.g. # cookies). By default these settings apply to all HTML pages generated by # doxygen across all projects. The HTML_PROJECT_COOKIE tag can be used to store # the settings under a project specific key, such that the user preferences will # be stored separately.
# This tag requires that the tag GENERATE_HTML is set to YES. HTML_PROJECT_COOKIE = # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries # shown in the various tree structured indices initially; the user can expand # and collapse entries dynamically later on. Doxygen will expand the tree to # such a level that at most the specified number of entries are visible (unless # a fully collapsed tree already exceeds this amount). So setting the number of # entries to 1 will produce a fully collapsed tree by default. 0 is a special value # representing an infinite number of entries and will result in a fully expanded # tree by default. # Minimum value: 0, maximum value: 9999, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development # environment (see: # https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To # create a documentation set, doxygen will generate a Makefile in the HTML # output directory. Running make will produce the docset in that directory and # running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at # startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy # genXcode/_index.html for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_DOCSET = NO # This tag determines the name of the docset feed. A documentation feed provides # an umbrella under which multiple documentation sets from a single provider # (such as a company or product suite) can be grouped. # The default value is: Doxygen generated docs. # This tag requires that the tag GENERATE_DOCSET is set to YES.
DOCSET_FEEDNAME = "Doxygen generated docs" # This tag determines the URL of the docset feed. A documentation feed provides # an umbrella under which multiple documentation sets from a single provider # (such as a company or product suite) can be grouped. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_FEEDURL = # This tag specifies a string that should uniquely identify the documentation # set bundle. This should be a reverse domain-name style string, e.g. # com.mycompany.MyDocSet. Doxygen will append .docset to the name. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_BUNDLE_ID = org.doxygen.Project # The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify # the documentation publisher. This should be a reverse domain-name style # string, e.g. com.mycompany.MyDocSet.documentation. # The default value is: org.doxygen.Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_ID = org.doxygen.Publisher # The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. # The default value is: Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop # on Windows. In the beginning of 2021 Microsoft took the original page, with # a.o. the download links, offline (the HTML help workshop was already many years # in maintenance mode). You can download the HTML help workshop from the web # archives at Installation executable (see: # http://web.archive.org/web/20160201063255/http://download.microsoft.com/downlo # ad/0/A/9/0A939EF6-E31C-430F-A3DF-DFAE7960D564/htmlhelp.exe).
# # The HTML Help Workshop contains a compiler that can convert all HTML output # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML # files are now used as the Windows 98 help format, and will replace the old # Windows help format (.hlp) on all Windows platforms in the future. Compressed # HTML files also contain an index, a table of contents, and you can search for # words in the documentation. The HTML workshop also contains a viewer for # compressed HTML files. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_HTMLHELP = NO # The CHM_FILE tag can be used to specify the file name of the resulting .chm # file. You can add a path in front of the file if the result should not be # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, # doxygen will try to run the HTML help compiler on the generated index.hhp. # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the main .chm file (NO). # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. GENERATE_CHI = NO # The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it # enables the Previous and Next buttons. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
BINARY_TOC = NO # The TOC_EXPAND flag can be set to YES to add extra items for group members to # the table of contents of the HTML help documentation and to the tree view. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. TOC_EXPAND = NO # The SITEMAP_URL tag is used to specify the full URL of the place where the # generated documentation will be placed on the server by the user during the # deployment of the documentation. The generated sitemap is called sitemap.xml # and placed on the directory specified by HTML_OUTPUT. In case no SITEMAP_URL # is specified no sitemap is generated. For information about the sitemap # protocol see https://www.sitemaps.org # This tag requires that the tag GENERATE_HTML is set to YES. SITEMAP_URL = # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that # can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help # (.qch) of the generated HTML documentation. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_QHP = NO # If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify # the file name of the resulting .qch file. The path specified is relative to # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace # (see: # https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. 
For more information please see Qt Help Project / Virtual # Folders (see: # https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom # Filters (see: # https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom # Filters (see: # https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location (absolute path # including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to # run qhelpgenerator on the generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. To # install this plugin and make it available under the help contents menu in # Eclipse, the contents of the directory containing the HTML and XML files needs # to be copied into the plugins directory of eclipse. The name of the directory # within the plugins directory should be the same as the ECLIPSE_DOC_ID value. 
# After copying Eclipse needs to be restarted before the help appears. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_ECLIPSEHELP = NO # A unique identifier for the Eclipse help plugin. When installing the plugin # the directory name containing the HTML and XML files should also have this # name. Each documentation set should have its own identifier. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. ECLIPSE_DOC_ID = org.doxygen.Project # If you want full control over the layout of the generated HTML pages it might # be necessary to disable the index and replace it with your own. The # DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top # of each HTML page. A value of NO enables the index and the value YES disables # it. Since the tabs in the index contain the same information as the navigation # tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. DISABLE_INDEX = YES # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. If the tag # value is set to YES, a side panel will be generated containing a tree-like # index structure (just like the one that is generated for HTML Help). For this # to work a browser that supports JavaScript, DHTML, CSS and frames is required # (i.e. any modern browser). Windows users are probably better off using the # HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can # further fine tune the look of the index (see "Fine-tuning the output"). As an # example, the default style sheet generated by doxygen has an example that # shows how to put an image at the root of the tree instead of the PROJECT_NAME. 
# Since the tree basically has the same information as the tab index, you could # consider setting DISABLE_INDEX to YES when enabling this option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_TREEVIEW = YES # When both GENERATE_TREEVIEW and DISABLE_INDEX are set to YES, then the # FULL_SIDEBAR option determines if the side bar is limited to only the treeview # area (value NO) or if it should extend to the full height of the window (value # YES). Setting this to YES gives a layout similar to # https://docs.readthedocs.io with more room for contents, but less room for the # project logo, title, and description. If either GENERATE_TREEVIEW or # DISABLE_INDEX is set to NO, this option has no effect. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. FULL_SIDEBAR = NO # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. # # Note that a value of 0 will completely suppress the enum values from appearing # in the overview section. # Minimum value: 0, maximum value: 20, default value: 4. # This tag requires that the tag GENERATE_HTML is set to YES. ENUM_VALUES_PER_LINE = 4 # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used # to set the initial width (in pixels) of the frame in which the tree is shown. # Minimum value: 0, maximum value: 1500, default value: 250. # This tag requires that the tag GENERATE_HTML is set to YES. TREEVIEW_WIDTH = 180 # If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to # external symbols imported via tag files in a separate window. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. EXT_LINKS_IN_WINDOW = NO # If the OBFUSCATE_EMAILS tag is set to YES, doxygen will obfuscate email # addresses. # The default value is: YES. 
# This tag requires that the tag GENERATE_HTML is set to YES. OBFUSCATE_EMAILS = YES # If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg # tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see # https://inkscape.org) to generate formulas as SVG images instead of PNGs for # the HTML output. These images will generally look nicer at scaled resolutions. # Possible values are: png (the default) and svg (looks nicer but requires the # pdf2svg or inkscape tool). # The default value is: png. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FORMULA_FORMAT = png # Use this tag to change the font size of LaTeX formulas included as images in # the HTML documentation. When you change the font size after a successful # doxygen run you need to manually remove any form_*.png images from the HTML # output directory to force them to be regenerated. # Minimum value: 8, maximum value: 50, default value: 10. # This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_FONTSIZE = 10 # The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands # to create new LaTeX commands to be used in formulas as building blocks. See # the section "Including formulas" for details. FORMULA_MACROFILE = # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see # https://www.mathjax.org) which uses client side JavaScript for the rendering # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX # installed or if you want formulas to look prettier in the HTML output. When # enabled you may also need to install MathJax separately and configure the path # to it using the MATHJAX_RELPATH option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. USE_MATHJAX = NO # With MATHJAX_VERSION it is possible to specify the MathJax version to be used.
# Note that the different versions of MathJax have different requirements with # regards to the different settings, so it is possible that also other MathJax # settings have to be changed when switching between the different MathJax # versions. # Possible values are: MathJax_2 and MathJax_3. # The default value is: MathJax_2. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_VERSION = MathJax_2 # When MathJax is enabled you can set the default output format to be used for # the MathJax output. For more details about the output format see MathJax # version 2 (see: # http://docs.mathjax.org/en/v2.7-latest/output.html) and MathJax version 3 # (see: # http://docs.mathjax.org/en/latest/web/components/output.html). # Possible values are: HTML-CSS (which is slower, but has the best # compatibility. This is the name for Mathjax version 2, for MathJax version 3 # this will be translated into chtml), NativeMML (i.e. MathML. Only supported # for MathJax 2. For MathJax version 3 chtml will be used instead.), chtml (This # is the name for Mathjax version 3, for MathJax version 2 this will be # translated into HTML-CSS) and SVG. # The default value is: HTML-CSS. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_FORMAT = HTML-CSS # When MathJax is enabled you need to specify the location relative to the HTML # output directory using the MATHJAX_RELPATH option. The destination directory # should contain the MathJax.js script. For instance, if the mathjax directory # is located at the same level as the HTML output directory, then # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of # MathJax from https://www.mathjax.org before deployment. 
The default value is: # - in case of MathJax version 2: https://cdn.jsdelivr.net/npm/mathjax@2 # - in case of MathJax version 3: https://cdn.jsdelivr.net/npm/mathjax@3 # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax # extension names that should be enabled during MathJax rendering. For example # for MathJax version 2 (see # https://docs.mathjax.org/en/v2.7-latest/tex.html#tex-and-latex-extensions): # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # For example for MathJax version 3 (see # http://docs.mathjax.org/en/latest/input/tex/extensions/index.html): # MATHJAX_EXTENSIONS = ams # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site # (see: # http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and # should work on any modern browser. Note that when using HTML help # (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) # there is already a search function so this one should typically be disabled. # For large projects the javascript based search engine can be slow, then # enabling SERVER_BASED_SEARCH may provide a better solution. 
It is possible to # search using the keyboard; to jump to the search box use + S # (what the is depends on the OS and browser, but it is typically # , /Node, # Edge and Graph Attributes specification You need to make sure dot is able # to find the font, which can be done by putting it in a standard location or by # setting the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the # directory containing the font. Default graphviz fontsize is 14. # The default value is: fontname=Helvetica,fontsize=10. # This tag requires that the tag HAVE_DOT is set to YES. DOT_COMMON_ATTR = "fontname=Helvetica,fontsize=10" # DOT_EDGE_ATTR is concatenated with DOT_COMMON_ATTR. For elegant style you can # add 'arrowhead=open, arrowtail=open, arrowsize=0.5'. Complete documentation about # arrows shapes. # The default value is: labelfontname=Helvetica,labelfontsize=10. # This tag requires that the tag HAVE_DOT is set to YES. DOT_EDGE_ATTR = "labelfontname=Helvetica,labelfontsize=10" # DOT_NODE_ATTR is concatenated with DOT_COMMON_ATTR. For view without boxes # around nodes set 'shape=plain' or 'shape=plaintext' Shapes specification # The default value is: shape=box,height=0.2,width=0.4. # This tag requires that the tag HAVE_DOT is set to YES. DOT_NODE_ATTR = "shape=box,height=0.2,width=0.4" # You can set the path where dot can find font specified with fontname in # DOT_COMMON_ATTR and others dot attributes. # This tag requires that the tag HAVE_DOT is set to YES. DOT_FONTPATH = # If the CLASS_GRAPH tag is set to YES or GRAPH or BUILTIN then doxygen will # generate a graph for each documented class showing the direct and indirect # inheritance relations. In case the CLASS_GRAPH tag is set to YES or GRAPH and # HAVE_DOT is enabled as well, then dot will be used to draw the graph. In case # the CLASS_GRAPH tag is set to YES and HAVE_DOT is disabled or if the # CLASS_GRAPH tag is set to BUILTIN, then the built-in generator will be used. 
# If the CLASS_GRAPH tag is set to TEXT the direct and indirect inheritance # relations will be shown as texts / links. Explicit enabling an inheritance # graph or choosing a different representation for an inheritance graph of a # specific class, can be accomplished by means of the command \inheritancegraph. # Disabling an inheritance graph can be accomplished by means of the command # \hideinheritancegraph. # Possible values are: NO, YES, TEXT, GRAPH and BUILTIN. # The default value is: YES. CLASS_GRAPH = YES # If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a # graph for each documented class showing the direct and indirect implementation # dependencies (inheritance, containment, and class references variables) of the # class with other documented classes. Explicit enabling a collaboration graph, # when COLLABORATION_GRAPH is set to NO, can be accomplished by means of the # command \collaborationgraph. Disabling a collaboration graph can be # accomplished by means of the command \hidecollaborationgraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. COLLABORATION_GRAPH = YES # If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for # groups, showing the direct groups dependencies. Explicit enabling a group # dependency graph, when GROUP_GRAPHS is set to NO, can be accomplished by means # of the command \groupgraph. Disabling a group graph can be accomplished by # means of the command \hidegroupgraph. See also the chapter Grouping in the # manual. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. GROUP_GRAPHS = YES # If the UML_LOOK tag is set to YES, doxygen will generate inheritance and # collaboration diagrams in a style similar to the OMG's Unified Modeling # Language. # The default value is: NO. # This tag requires that the tag HAVE_DOT is set to YES.
UML_LOOK = NO # If the UML_LOOK tag is enabled, the fields and methods are shown inside the # class node. If there are many fields or methods and many nodes the graph may # become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the # number of items for each type to make the size more manageable. Set this to 0 # for no limit. Note that the threshold may be exceeded by 50% before the limit # is enforced. So when you set the threshold to 10, up to 15 fields may appear, # but if the number exceeds 15, the total amount of fields shown is limited to # 10. # Minimum value: 0, maximum value: 100, default value: 10. # This tag requires that the tag UML_LOOK is set to YES. UML_LIMIT_NUM_FIELDS = 10 # If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and # methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS # tag is set to YES, doxygen will add type and arguments for attributes and # methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen # will not generate fields with class member information in the UML graphs. The # class diagrams will look similar to the default class diagrams but using UML # notation for the relationships. # Possible values are: NO, YES and NONE. # The default value is: NO. # This tag requires that the tag UML_LOOK is set to YES. DOT_UML_DETAILS = NO # The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters # to display on a single line. If the actual line length exceeds this threshold # significantly it will be wrapped across multiple lines. Some heuristics are # applied to avoid ugly line breaks. # Minimum value: 0, maximum value: 1000, default value: 17. # This tag requires that the tag HAVE_DOT is set to YES. DOT_WRAP_THRESHOLD = 17 # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and # collaboration graphs will show the relations between templates and their # instances. # The default value is: NO. 
# This tag requires that the tag HAVE_DOT is set to YES. TEMPLATE_RELATIONS = NO # If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to # YES then doxygen will generate a graph for each documented file showing the # direct and indirect include dependencies of the file with other documented # files. Explicit enabling an include graph, when INCLUDE_GRAPH is set to NO, # can be accomplished by means of the command \includegraph. Disabling an # include graph can be accomplished by means of the command \hideincludegraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. INCLUDE_GRAPH = YES # If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are # set to YES then doxygen will generate a graph for each documented file showing # the direct and indirect include dependencies of the file with other documented # files. Explicit enabling an included by graph, when INCLUDED_BY_GRAPH is set # to NO, can be accomplished by means of the command \includedbygraph. Disabling # an included by graph can be accomplished by means of the command # \hideincludedbygraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. INCLUDED_BY_GRAPH = YES # If the CALL_GRAPH tag is set to YES then doxygen will generate a call # dependency graph for every global function or class method. # # Note that enabling this option will significantly increase the time of a run. # So in most cases it will be better to enable call graphs for selected # functions only using the \callgraph command. Disabling a call graph can be # accomplished by means of the command \hidecallgraph. # The default value is: NO. # This tag requires that the tag HAVE_DOT is set to YES. CALL_GRAPH = NO # If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller # dependency graph for every global function or class method. # # Note that enabling this option will significantly increase the time of a run.
# So in most cases it will be better to enable caller graphs for selected # functions only using the \callergraph command. Disabling a caller graph can be # accomplished by means of the command \hidecallergraph. # The default value is: NO. # This tag requires that the tag HAVE_DOT is set to YES. CALLER_GRAPH = NO # If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will generate a graphical # hierarchy of all classes instead of a textual one. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. GRAPHICAL_HIERARCHY = YES # If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the # dependencies a directory has on other directories in a graphical way. The # dependency relations are determined by the #include relations between the # files in the directories. Explicit enabling a directory graph, when # DIRECTORY_GRAPH is set to NO, can be accomplished by means of the command # \directorygraph. Disabling a directory graph can be accomplished by means of # the command \hidedirectorygraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. DIRECTORY_GRAPH = YES # The DIR_GRAPH_MAX_DEPTH tag can be used to limit the maximum number of levels # of child directories generated in directory dependency graphs by dot. # Minimum value: 1, maximum value: 25, default value: 1. # This tag requires that the tag DIRECTORY_GRAPH is set to YES. DIR_GRAPH_MAX_DEPTH = 1 # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images # generated by dot. For an explanation of the image formats see the section # output formats in the documentation of the dot tool (Graphviz (see: # https://www.graphviz.org/)). # Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order # to make the SVG files visible in IE 9+ (other browsers do not have this # requirement).
# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo, # png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and # png:gdiplus:gdiplus. # The default value is: png. # This tag requires that the tag HAVE_DOT is set to YES. DOT_IMAGE_FORMAT = png # If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to # enable generation of interactive SVG images that allow zooming and panning. # # Note that this requires a modern browser other than Internet Explorer. Tested # and working are Firefox, Chrome, Safari, and Opera. # Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make # the SVG files visible. Older versions of IE do not have SVG support. # The default value is: NO. # This tag requires that the tag HAVE_DOT is set to YES. INTERACTIVE_SVG = NO # The DOT_PATH tag can be used to specify the path where the dot tool can be # found. If left blank, it is assumed the dot tool can be found in the path. # This tag requires that the tag HAVE_DOT is set to YES. DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the \dotfile # command). # This tag requires that the tag HAVE_DOT is set to YES. DOTFILE_DIRS = # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. # If left empty dia is assumed to be found in the default search path. DIA_PATH = # The DIAFILE_DIRS tag can be used to specify one or more directories that # contain dia files that are included in the documentation (see the \diafile # command). DIAFILE_DIRS = # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the # path where java can find the plantuml.jar file or to the filename of jar file # to be used. 
If left blank, it is assumed PlantUML is not used or called during # a preprocessing step. Doxygen will generate a warning when it encounters a # \startuml command in this case and will not generate output for the diagram. PLANTUML_JAR_PATH = # When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a # configuration file for plantuml. PLANTUML_CFG_FILE = # When using plantuml, the specified paths are searched for files specified by # the !include statement in a plantuml block. PLANTUML_INCLUDE_PATH = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes # that will be shown in the graph. If the number of nodes in a graph becomes # larger than this value, doxygen will truncate the graph, which is visualized # by representing a node as a red box. Note that if the number of direct # children of the root node in a graph is already larger than # DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that # the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. # Minimum value: 0, maximum value: 10000, default value: 50. # This tag requires that the tag HAVE_DOT is set to YES. DOT_GRAPH_MAX_NODES = 50 # The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs # generated by dot. A depth value of 3 means that only nodes reachable from the # root by following a path via at most 3 edges will be shown. Nodes that lay # further from the root node will be omitted. Note that setting this option to 1 # or 2 may greatly reduce the computation time needed for large code bases. Also # note that the size of a graph can be further restricted by # DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. # Minimum value: 0, maximum value: 1000, default value: 0. # This tag requires that the tag HAVE_DOT is set to YES. MAX_DOT_GRAPH_DEPTH = 0 # Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output # files in one run (i.e. 
multiple -o and -T options on the command line). This # makes dot run faster, but since only newer versions of dot (>1.8.10) support # this, this feature is disabled by default. # The default value is: NO. # This tag requires that the tag HAVE_DOT is set to YES. DOT_MULTI_TARGETS = NO # If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page # explaining the meaning of the various boxes and arrows in the dot generated # graphs. # Note: This tag requires that UML_LOOK isn't set, i.e. the doxygen internal # graphical representation for inheritance and collaboration diagrams is used. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. GENERATE_LEGEND = YES # If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate # files that are used to generate the various graphs. # # Note: This setting is not only used for dot files but also for msc temporary # files. # The default value is: YES. DOT_CLEANUP = YES # You can define message sequence charts within doxygen comments using the \msc # command. If the MSCGEN_TOOL tag is left empty (the default), then doxygen will # use a built-in version of mscgen tool to produce the charts. Alternatively, # the MSCGEN_TOOL tag can also specify the name of an external tool. For instance, # specifying prog as the value, doxygen will call the tool as prog -T # <outfile_format> -o <outputfile> <inputfile>. The external tool should support # output file formats "png", "eps", "svg", and "ismap". MSCGEN_TOOL = # The MSCFILE_DIRS tag can be used to specify one or more directories that # contain msc files that are included in the documentation (see the \mscfile # command).
MSCFILE_DIRS = ================================================ FILE: third-party/mimalloc/doc/mimalloc-doc.h ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2025, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #error "documentation file only!" /*! \mainpage This is the API documentation of the [mimalloc](https://github.com/microsoft/mimalloc) allocator (pronounced "me-malloc") -- a general purpose allocator with excellent [performance](bench.html) characteristics. Initially developed by Daan Leijen for the run-time systems of the [Koka](https://github.com/koka-lang/koka) and [Lean](https://github.com/leanprover/lean) languages. It is a drop-in replacement for `malloc` and can be used in other programs without code changes, for example, on Unix you can use it as: ``` > LD_PRELOAD=/usr/bin/libmimalloc.so myprogram ``` Notable aspects of the design include: - __small and consistent__: the library is about 8k LOC using simple and consistent data structures. This makes it very suitable to integrate and adapt in other projects. For runtime systems it provides hooks for a monotonic _heartbeat_ and deferred freeing (for bounded worst-case times with reference counting). Partly due to its simplicity, mimalloc has been ported to many systems (Windows, macOS, Linux, WASM, various BSD's, Haiku, MUSL, etc) and has excellent support for dynamic overriding. At the same time, it is an industrial strength allocator that runs (very) large scale distributed services on thousands of machines with excellent worst case latencies. 
- __free list sharding__: instead of one big free list (per size class) we have many smaller lists per "mimalloc page" which reduces fragmentation and increases locality -- things that are allocated close in time get allocated close in memory. (A mimalloc page contains blocks of one size class and is usually 64KiB on a 64-bit system). - __free list multi-sharding__: the big idea! Not only do we shard the free list per mimalloc page, but for each page we have multiple free lists. In particular, there is one list for thread-local `free` operations, and another one for concurrent `free` operations. Free-ing from another thread can now be a single CAS without needing sophisticated coordination between threads. Since there will be thousands of separate free lists, contention is naturally distributed over the heap, and the chance of contending on a single location will be low -- this is quite similar to randomized algorithms like skip lists where adding a random oracle removes the need for a more complex algorithm. - __eager page purging__: when a "page" becomes empty (with increased chance due to free list sharding) the memory is marked to the OS as unused (reset or decommitted) reducing (real) memory pressure and fragmentation, especially in long running programs. - __secure__: _mimalloc_ can be built in secure mode, adding guard pages, randomized allocation, encrypted free lists, etc. to protect against various heap vulnerabilities. The performance penalty is usually around 10% on average over our benchmarks. - __first-class heaps__: efficiently create and use multiple heaps to allocate across different regions. A heap can be destroyed at once instead of deallocating each object separately. - __bounded__: it does not suffer from _blowup_ \[1\], has bounded worst-case allocation times (_wcat_) (up to OS primitives), bounded space overhead (~0.2% meta-data, with low internal fragmentation), and has no internal points of contention using only atomic operations.
- __fast__: In our benchmarks (see [below](#bench)), _mimalloc_ outperforms other leading allocators (_jemalloc_, _tcmalloc_, _Hoard_, etc), and often uses less memory. A nice property is that it does consistently well over a wide range of benchmarks. There is also good huge OS page support for larger server programs. You can read more on the design of _mimalloc_ in the [technical report](https://www.microsoft.com/en-us/research/publication/mimalloc-free-list-sharding-in-action) which also has detailed benchmark results. Further information: - \ref build - \ref using - \ref environment - \ref overrides - \ref bench - \ref malloc - \ref extended - \ref aligned - \ref heap - \ref typed - \ref analysis - \ref options - \ref posix - \ref cpp */ /// \defgroup malloc Basic Allocation /// The basic allocation interface. /// \{ /// Free previously allocated memory. /// The pointer `p` must have been allocated before (or be \a NULL). /// @param p pointer to free, or \a NULL. void mi_free(void* p); /// Allocate \a size bytes. /// @param size number of bytes to allocate. /// @returns pointer to the allocated memory or \a NULL if out of memory. /// Returns a unique pointer if called with \a size 0. void* mi_malloc(size_t size); /// Allocate zero-initialized `size` bytes. /// @param size The size in bytes. /// @returns Pointer to newly allocated zero initialized memory, /// or \a NULL if out of memory. void* mi_zalloc(size_t size); /// Allocate zero-initialized \a count elements of \a size bytes. /// @param count number of elements. /// @param size size of each element. /// @returns pointer to the allocated memory /// of \a size*\a count bytes, or \a NULL if either out of memory /// or when `count*size` overflows. /// /// Returns a unique pointer if called with either \a size or \a count of 0. /// @see mi_zalloc() void* mi_calloc(size_t count, size_t size); /// Re-allocate memory to \a newsize bytes. /// @param p pointer to previously allocated memory (or \a NULL). 
/// @param newsize the new required size in bytes. /// @returns pointer to the re-allocated memory /// of \a newsize bytes, or \a NULL if out of memory. /// If \a NULL is returned, the pointer \a p is not freed. /// Otherwise the original pointer is either freed or returned /// as the reallocated result (in case it fits in-place with the /// new size). If the pointer \a p is \a NULL, it behaves as /// \a mi_malloc(\a newsize). If \a newsize is larger than the /// original \a size allocated for \a p, the bytes after \a size /// are uninitialized. void* mi_realloc(void* p, size_t newsize); /// Re-allocate memory to \a count elements of \a size bytes, with extra memory initialized to zero. /// @param p Pointer to a previously allocated block (or \a NULL). /// @param count The number of elements. /// @param size The size of each element. /// @returns A pointer to a re-allocated block of \a count * \a size bytes, or \a NULL /// if out of memory or if \a count * \a size overflows. /// /// If there is no overflow, it behaves exactly like `mi_rezalloc(p,count*size)`. /// @see mi_reallocn() /// @see [recallocarray()](http://man.openbsd.org/reallocarray) (on BSD). void* mi_recalloc(void* p, size_t count, size_t size); /// Try to re-allocate memory to \a newsize bytes _in place_. /// @param p pointer to previously allocated memory (or \a NULL). /// @param newsize the new required size in bytes. /// @returns pointer to the re-allocated memory /// of \a newsize bytes (always equal to \a p), /// or \a NULL if either out of memory or if /// the memory could not be expanded in place. /// If \a NULL is returned, the pointer \a p is not freed. /// Otherwise the original pointer is returned /// as the reallocated result since it fits in-place with the /// new size. If \a newsize is larger than the /// original \a size allocated for \a p, the bytes after \a size /// are uninitialized. void* mi_expand(void* p, size_t newsize); /// Allocate \a count elements of \a size bytes. 
/// @param count The number of elements. /// @param size The size of each element. /// @returns A pointer to a block of \a count * \a size bytes, or \a NULL /// if out of memory or if \a count * \a size overflows. /// /// If there is no overflow, it behaves exactly like `mi_malloc(count*size)`. /// @see mi_calloc() /// @see mi_zallocn() void* mi_mallocn(size_t count, size_t size); /// Re-allocate memory to \a count elements of \a size bytes. /// @param p Pointer to a previously allocated block (or \a NULL). /// @param count The number of elements. /// @param size The size of each element. /// @returns A pointer to a re-allocated block of \a count * \a size bytes, or \a NULL /// if out of memory or if \a count * \a size overflows. /// /// If there is no overflow, it behaves exactly like `mi_realloc(p,count*size)`. /// @see [reallocarray()](http://man.openbsd.org/reallocarray) (on BSD) void* mi_reallocn(void* p, size_t count, size_t size); /// Re-allocate memory to \a newsize bytes. /// @param p pointer to previously allocated memory (or \a NULL). /// @param newsize the new required size in bytes. /// @returns pointer to the re-allocated memory /// of \a newsize bytes, or \a NULL if out of memory. /// /// In contrast to mi_realloc(), if \a NULL is returned, the original pointer /// \a p is freed (if it was not \a NULL itself). /// Otherwise the original pointer is either freed or returned /// as the reallocated result (in case it fits in-place with the /// new size). If the pointer \a p is \a NULL, it behaves as /// \a mi_malloc(\a newsize). If \a newsize is larger than the /// original \a size allocated for \a p, the bytes after \a size /// are uninitialized. /// /// @see [reallocf](https://www.freebsd.org/cgi/man.cgi?query=reallocf) (on BSD) void* mi_reallocf(void* p, size_t newsize); /// Allocate and duplicate a string. /// @param s string to duplicate (or \a NULL).
/// @returns a pointer to newly allocated memory initialized /// to string \a s, or \a NULL if either out of memory or if /// \a s is \a NULL. /// /// Replacement for the standard [strdup()](http://pubs.opengroup.org/onlinepubs/9699919799/functions/strdup.html) /// such that mi_free() can be used on the returned result. char* mi_strdup(const char* s); /// Allocate and duplicate a string up to \a n bytes. /// @param s string to duplicate (or \a NULL). /// @param n maximum number of bytes to copy (excluding the terminating zero). /// @returns a pointer to newly allocated memory initialized /// to string \a s up to the first \a n bytes (and always zero terminated), /// or \a NULL if either out of memory or if \a s is \a NULL. /// /// Replacement for the standard [strndup()](http://pubs.opengroup.org/onlinepubs/9699919799/functions/strndup.html) /// such that mi_free() can be used on the returned result. char* mi_strndup(const char* s, size_t n); /// Resolve a file path name. /// @param fname File name. /// @param resolved_name Should be \a NULL (but can also point to a buffer /// of at least \a PATH_MAX bytes). /// @returns If successful a pointer to the resolved absolute file name, or /// \a NULL on failure (with \a errno set to the error code). /// /// If \a resolved_name was \a NULL, the returned result should be freed with /// mi_free(). /// /// Replacement for the standard [realpath()](http://pubs.opengroup.org/onlinepubs/9699919799/functions/realpath.html) /// such that mi_free() can be used on the returned result (if \a resolved_name was \a NULL). char* mi_realpath(const char* fname, char* resolved_name); /// \} // ------------------------------------------------------ // Extended functionality // ------------------------------------------------------ /// \defgroup extended Extended Functions /// Extended functionality. 
/// \{ /// Maximum size allowed for small allocations in /// #mi_malloc_small and #mi_zalloc_small (usually `128*sizeof(void*)` (= 1KB on 64-bit systems)) #define MI_SMALL_SIZE_MAX (128*sizeof(void*)) /// Allocate a small object. /// @param size The size in bytes, can be at most #MI_SMALL_SIZE_MAX. /// @returns a pointer to newly allocated memory of at least \a size /// bytes, or \a NULL if out of memory. /// This function is meant for use in run-time systems for best /// performance and does not check if \a size was indeed small -- use /// with care! void* mi_malloc_small(size_t size); /// Allocate a zero initialized small object. /// @param size The size in bytes, can be at most #MI_SMALL_SIZE_MAX. /// @returns a pointer to newly allocated zero-initialized memory of at /// least \a size bytes, or \a NULL if out of memory. /// This function is meant for use in run-time systems for best /// performance and does not check if \a size was indeed small -- use /// with care! void* mi_zalloc_small(size_t size); /// Return the available bytes in a memory block. /// @param p Pointer to previously allocated memory (or \a NULL) /// @returns Returns the available bytes in the memory block, or /// 0 if \a p was \a NULL. /// /// The returned size can be /// used to call \a mi_expand successfully. /// The returned size is always at least equal to the /// allocated size of \a p. /// /// @see [_msize](https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/msize?view=vs-2017) (Windows) /// @see [malloc_usable_size](http://man7.org/linux/man-pages/man3/malloc_usable_size.3.html) (Linux) /// @see mi_good_size() size_t mi_usable_size(void* p); /// Return the used allocation size. /// @param size The minimal required size in bytes. /// @returns the size `n` that will be allocated, where `n >= size`. /// /// Generally, `mi_usable_size(mi_malloc(size)) == mi_good_size(size)`. /// This can be used to reduce internal wasted space when /// allocating buffers for example. 
/// /// @see mi_usable_size() size_t mi_good_size(size_t size); /// Eagerly free memory. /// @param force If \a true, aggressively return memory to the OS (can be expensive!) /// /// Regular code should not have to call this function. It can be beneficial /// in very narrow circumstances; in particular, when a long running thread /// allocates a lot of blocks that are freed by other threads it may improve /// resource usage by calling this every once in a while. void mi_collect(bool force); /// Deprecated /// @param out Ignored, outputs to the registered output function or stderr by default. /// /// Most detailed when using a debug build. void mi_stats_print(void* out); /// Print the main statistics. /// @param out An output function or \a NULL for the default. /// @param arg Optional argument passed to \a out (if not \a NULL) /// /// Most detailed when using a debug build. void mi_stats_print_out(mi_output_fun* out, void* arg); /// Reset statistics. void mi_stats_reset(void); /// Merge thread local statistics with the main statistics and reset. void mi_stats_merge(void); /// Initialize mimalloc on a thread. /// Should not be used as on most systems (pthreads, windows) this is done /// automatically. void mi_thread_init(void); /// Uninitialize mimalloc on a thread. /// Should not be used as on most systems (pthreads, windows) this is done /// automatically. Ensures that any memory that is not freed yet (but will /// be freed by other threads in the future) is properly handled. void mi_thread_done(void); /// Print out heap statistics for this thread. /// @param out An output function or \a NULL for the default. /// @param arg Optional argument passed to \a out (if not \a NULL) /// /// Most detailed when using a debug build. void mi_thread_stats_print_out(mi_output_fun* out, void* arg); /// Type of deferred free functions. /// @param force If \a true all outstanding items should be freed. /// @param heartbeat A monotonically increasing count. 
/// @param arg Argument that was passed at registration to hold extra state. /// /// @see mi_register_deferred_free typedef void (mi_deferred_free_fun)(bool force, unsigned long long heartbeat, void* arg); /// Register a deferred free function. /// @param deferred_free Address of a deferred free-ing function or \a NULL to unregister. /// @param arg Argument that will be passed on to the deferred free function. /// /// Some runtime systems use deferred free-ing, for example when using /// reference counting to limit the worst case free time. /// Such systems can register (re-entrant) deferred free function /// to free more memory on demand. When the \a force parameter is /// \a true all possible memory should be freed. /// The per-thread \a heartbeat parameter is monotonically increasing /// and guaranteed to be deterministic if the program allocates /// deterministically. The \a deferred_free function is guaranteed /// to be called deterministically after some number of allocations /// (regardless of freeing or available free memory). /// At most one \a deferred_free function can be active. void mi_register_deferred_free(mi_deferred_free_fun* deferred_free, void* arg); /// Type of output functions. /// @param msg Message to output. /// @param arg Argument that was passed at registration to hold extra state. /// /// @see mi_register_output() typedef void (mi_output_fun)(const char* msg, void* arg); /// Register an output function. /// @param out The output function, use `NULL` to output to stderr. /// @param arg Argument that will be passed on to the output function. /// /// The `out` function is called to output any information from mimalloc, /// like verbose or warning messages. void mi_register_output(mi_output_fun* out, void* arg); /// Type of error callback functions. /// @param err Error code (see mi_register_error() for a complete list). /// @param arg Argument that was passed at registration to hold extra state. 
/// /// @see mi_register_error() typedef void (mi_error_fun)(int err, void* arg); /// Register an error callback function. /// @param errfun The error function that is called on an error (use \a NULL for default) /// @param arg Extra argument that will be passed on to the error function. /// /// The \a errfun function is called on an error in mimalloc after emitting /// an error message (through the output function). It is always legal to just /// return from the \a errfun function in which case allocation functions generally /// return \a NULL or ignore the condition. The default function only calls abort() /// when compiled in secure mode with an \a EFAULT error. The possible error /// codes are: /// * \a EAGAIN: Double free was detected (only in debug and secure mode). /// * \a EFAULT: Corrupted free list or meta-data was detected (only in debug and secure mode). /// * \a ENOMEM: Not enough memory available to satisfy the request. /// * \a EOVERFLOW: Too large a request, for example in mi_calloc(), the \a count and \a size parameters are too large. /// * \a EINVAL: Trying to free or re-allocate an invalid pointer. void mi_register_error(mi_error_fun* errfun, void* arg); /// Is a pointer part of our heap? /// @param p The pointer to check. /// @returns \a true if this is a pointer into our heap. /// This function is relatively fast. bool mi_is_in_heap_region(const void* p); /// Reserve OS memory for use by mimalloc. Reserved areas are used /// before allocating from the OS again. By reserving a large area upfront, /// allocation can be more efficient, and can be better managed on systems /// without `mmap`/`VirtualAlloc` (like WASM for example). /// @param size The size to reserve. /// @param commit Commit the memory upfront. /// @param allow_large Allow large OS pages (2MiB) to be used? /// @return \a 0 if successful, and an error code otherwise (e.g. `ENOMEM`).
int mi_reserve_os_memory(size_t size, bool commit, bool allow_large); /// Manage a particular memory area for use by mimalloc. /// This is just like `mi_reserve_os_memory` except that the area should already be /// allocated in some manner and available for use by mimalloc. /// @param start Start of the memory area. /// @param size The size of the memory area. /// @param is_committed Is the area already committed? /// @param is_large Does it consist of large OS pages? Set this to \a true as well for memory /// that should not be decommitted or protected (like rdma etc.) /// @param is_zero Does the area consist of zeros? /// @param numa_node Possible associated numa node or `-1`. /// @return \a true if successful, and \a false on error. bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node); /// Reserve \a pages of huge OS pages (1GiB) evenly divided over \a numa_nodes nodes, /// but stops after at most `timeout_msecs` milli-seconds. /// @param pages The number of 1GiB pages to reserve. /// @param numa_nodes The number of nodes to evenly divide the pages over, or 0 for using the actual number of NUMA nodes. /// @param timeout_msecs Maximum number of milli-seconds to try reserving, or 0 for no timeout. /// @returns 0 if successful, \a ENOMEM if running out of memory, or \a ETIMEDOUT if timed out. /// /// The reserved memory is used by mimalloc to satisfy allocations. /// May quit before \a timeout_msecs are expired if it estimates it will take more than /// 1.5 times \a timeout_msecs. The time limit is needed because on some operating systems /// it can take a long time to reserve contiguous memory if the physical memory is /// fragmented. int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs); /// Reserve \a pages of huge OS pages (1GiB) at a specific \a numa_node, /// but stops after at most `timeout_msecs` milli-seconds. /// @param pages The number of 1GiB pages to reserve.
/// @param numa_node The NUMA node where the memory is reserved (start at 0). Use -1 for no affinity. /// @param timeout_msecs Maximum number of milli-seconds to try reserving, or 0 for no timeout. /// @returns 0 if successful, \a ENOMEM if running out of memory, or \a ETIMEDOUT if timed out. /// /// The reserved memory is used by mimalloc to satisfy allocations. /// May quit before \a timeout_msecs are expired if it estimates it will take more than /// 1.5 times \a timeout_msecs. The time limit is needed because on some operating systems /// it can take a long time to reserve contiguous memory if the physical memory is /// fragmented. int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs); /// Is the C runtime \a malloc API redirected? /// @returns \a true if all malloc API calls are redirected to mimalloc. /// /// Currently only used on Windows. bool mi_is_redirected(); /// Return process information (time and memory usage). /// @param elapsed_msecs Optional. Elapsed wall-clock time of the process in milli-seconds. /// @param user_msecs Optional. User time in milli-seconds (as the sum over all threads). /// @param system_msecs Optional. System time in milli-seconds. /// @param current_rss Optional. Current working set size (touched pages). /// @param peak_rss Optional. Peak working set size (touched pages). /// @param current_commit Optional. Current committed memory (backed by the page file). /// @param peak_commit Optional. Peak committed memory (backed by the page file). /// @param page_faults Optional. Count of hard page faults. /// /// The \a current_rss is precise on Windows and MacOSX; other systems estimate /// this using \a current_commit. The \a commit is precise on Windows but estimated /// on other systems as the amount of read/write accessible memory reserved by mimalloc. 
void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults); /// @brief Show all current arenas. /// @param show_inuse Show the arena blocks that are in use. /// @param show_abandoned Show the abandoned arena blocks. /// @param show_purge Show arena blocks scheduled for purging. void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge); /// Mimalloc uses large (virtual) memory areas, called "arena"s, from the OS to manage its memory. /// Each arena has an associated identifier. typedef int mi_arena_id_t; /// @brief Return the base address and size of an arena. /// @param arena_id The arena identifier. /// @param size Returned size in bytes of the (virtual) arena area. /// @return base address of the arena. void* mi_arena_area(mi_arena_id_t arena_id, size_t* size); /// @brief Reserve huge OS pages (1GiB) into a single arena. /// @param pages Number of 1GiB pages to reserve. /// @param numa_node The associated NUMA node, or -1 for no NUMA preference. /// @param timeout_msecs Max amount of milli-seconds this operation is allowed to take. (0 is infinite) /// @param exclusive If exclusive, only a heap associated with this arena can allocate in it. /// @param arena_id The arena identifier. /// @return 0 if successful, \a ENOMEM if running out of memory, or \a ETIMEDOUT if timed out. int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id); /// @brief Reserve OS memory to be managed in an arena. /// @param size Size to reserve. /// @param commit Should the memory be initially committed? /// @param allow_large Allow the use of large OS pages? /// @param exclusive Is the returned arena exclusive? /// @param arena_id The new arena identifier. /// @return Zero on success, an error code otherwise.
int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id); /// @brief Manage externally allocated memory as a mimalloc arena. This memory will not be freed by mimalloc. /// @param start Start address of the area. /// @param size Size in bytes of the area. /// @param is_committed Is the memory already committed? /// @param is_large Does it consist of (pinned) large OS pages? /// @param is_zero Is the memory zero-initialized? /// @param numa_node Associated NUMA node, or -1 to have no NUMA preference. /// @param exclusive Is the arena exclusive (where only heaps associated with the arena can allocate in it) /// @param arena_id The new arena identifier. /// @return `true` if successful. bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id); /// @brief Create a new heap that only allocates in the specified arena. /// @param arena_id The arena identifier. /// @return The new heap or `NULL`. mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id); /// @brief Create a new heap /// @param heap_tag The heap tag associated with this heap; heaps only reclaim memory between heaps with the same tag. /// @param allow_destroy Is \a mi_heap_destroy allowed? Not allowing this allows the heap to reclaim memory from terminated threads. /// @param arena_id If not 0, the heap will only allocate from the specified arena. /// @return A new heap or `NULL` on failure. /// /// The \a arena_id can be used by runtimes to allocate only in a specified pre-reserved arena. /// This is used for example for a compressed pointer heap in Koka. /// The \a heap_tag enables heaps to keep objects of a certain type isolated to heaps with that tag. /// This is used for example in the CPython integration. mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id); /// A process can associate threads with sub-processes. 
/// A sub-process will not reclaim memory from (abandoned heaps/threads) /// other subprocesses. typedef void* mi_subproc_id_t; /// @brief Get the main sub-process identifier. mi_subproc_id_t mi_subproc_main(void); /// @brief Create a fresh sub-process (with no associated threads yet). /// @return The new sub-process identifier. mi_subproc_id_t mi_subproc_new(void); /// @brief Delete a previously created sub-process. /// @param subproc The sub-process identifier. /// Only delete sub-processes if all associated threads have terminated. void mi_subproc_delete(mi_subproc_id_t subproc); /// Add the current thread to the given sub-process. /// This should be called right after a thread is created (and no allocation has taken place yet) void mi_subproc_add_current_thread(mi_subproc_id_t subproc); /// \} // ------------------------------------------------------ // Aligned allocation // ------------------------------------------------------ /// \defgroup aligned Aligned Allocation /// /// Allocating aligned memory blocks. /// Note that `alignment` always follows `size` for consistency with the unaligned /// allocation API, but unfortunately this differs from `posix_memalign` and `aligned_alloc` in the C library. /// /// \{ /// Allocate \a size bytes aligned by \a alignment. /// @param size number of bytes to allocate. /// @param alignment the minimal alignment of the allocated memory. /// @returns pointer to the allocated memory or \a NULL if out of memory, /// or if the alignment is not a power of 2 (including 0). The \a size is unrestricted /// (and does not have to be an integral multiple of the \a alignment). /// The returned pointer is aligned by \a alignment, i.e. `(uintptr_t)p % alignment == 0`. /// Returns a unique pointer if called with \a size 0. /// /// Note that `alignment` always follows `size` for consistency with the unaligned /// allocation API, but unfortunately this differs from `posix_memalign` and `aligned_alloc` in the C library. 
/// /// @see [aligned_alloc](https://en.cppreference.com/w/c/memory/aligned_alloc) (in the standard C11 library, with switched arguments!) /// @see [_aligned_malloc](https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-malloc?view=vs-2017) (on Windows) /// @see [aligned_alloc](http://man.openbsd.org/reallocarray) (on BSD, with switched arguments!) /// @see [posix_memalign](https://linux.die.net/man/3/posix_memalign) (on Posix, with switched arguments!) /// @see [memalign](https://linux.die.net/man/3/posix_memalign) (on Linux, with switched arguments!) void* mi_malloc_aligned(size_t size, size_t alignment); void* mi_zalloc_aligned(size_t size, size_t alignment); void* mi_calloc_aligned(size_t count, size_t size, size_t alignment); void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment); /// Allocate \a size bytes aligned by \a alignment at a specified \a offset. /// @param size number of bytes to allocate. /// @param alignment the minimal alignment of the allocated memory at \a offset. /// @param offset the offset that should be aligned. /// @returns pointer to the allocated memory or \a NULL if out of memory, /// or if the alignment is not a power of 2 (including 0). The \a size is unrestricted /// (and does not have to be an integral multiple of the \a alignment). /// The returned pointer is aligned by \a alignment, i.e. `(uintptr_t)p % alignment == 0`. /// Returns a unique pointer if called with \a size 0. 
/// /// @see [_aligned_offset_malloc](https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/aligned-offset-malloc?view=vs-2017) (on Windows) void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset); void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset); void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset); void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset); /// \} /// \defgroup heap Heap Allocation /// /// First-class heaps that can be destroyed in one go. /// /// \{ /// Type of first-class heaps. /// A heap can only be used for allocation in /// the thread that created this heap! Any allocated /// blocks can be freed or reallocated by any other thread though. struct mi_heap_s; /// Type of first-class heaps. /// A heap can only be used for (re)allocation in /// the thread that created this heap! Any allocated /// blocks can be freed by any other thread though. typedef struct mi_heap_s mi_heap_t; /// Create a new heap that can be used for allocation. mi_heap_t* mi_heap_new(); /// Delete a previously allocated heap. /// This will release resources and migrate any /// still allocated blocks in this heap (efficiently) /// to the default heap. /// /// If \a heap is the default heap, the default /// heap is set to the backing heap. void mi_heap_delete(mi_heap_t* heap); /// Destroy a heap, freeing all its still allocated blocks. /// Use with care as this will free all blocks still /// allocated in the heap. However, this can be a very /// efficient way to free all heap memory in one go. /// /// If \a heap is the default heap, the default /// heap is set to the backing heap. void mi_heap_destroy(mi_heap_t* heap); /// Set the default heap to use in the current thread for mi_malloc() et al. /// @param heap The new default heap. /// @returns The previous default heap. 
mi_heap_t* mi_heap_set_default(mi_heap_t* heap); /// Get the default heap that is used for mi_malloc() et al. (for the current thread). /// @returns The current default heap. mi_heap_t* mi_heap_get_default(); /// Get the backing heap. /// The _backing_ heap is the initial default heap for /// a thread and always available for allocations. /// It cannot be destroyed or deleted /// except by exiting the thread. mi_heap_t* mi_heap_get_backing(); /// Release outstanding resources in a specific heap. void mi_heap_collect(mi_heap_t* heap, bool force); /// Allocate in a specific heap. /// @see mi_malloc() void* mi_heap_malloc(mi_heap_t* heap, size_t size); /// Allocate a small object in a specific heap. /// \a size must be smaller or equal to MI_SMALL_SIZE_MAX(). /// @see mi_malloc() void* mi_heap_malloc_small(mi_heap_t* heap, size_t size); /// Allocate zero-initialized in a specific heap. /// @see mi_zalloc() void* mi_heap_zalloc(mi_heap_t* heap, size_t size); /// Allocate \a count zero-initialized elements in a specific heap. /// @see mi_calloc() void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size); /// Allocate \a count elements in a specific heap. /// @see mi_mallocn() void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size); /// Duplicate a string in a specific heap. /// @see mi_strdup() char* mi_heap_strdup(mi_heap_t* heap, const char* s); /// Duplicate a string of at most length \a n in a specific heap. /// @see mi_strndup() char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n); /// Resolve a file path name using a specific \a heap to allocate the result. 
/// @see mi_realpath() char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name); void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize); void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size); void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize); void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment); void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset); void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment); void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset); void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment); void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset); void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment); void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset); /// \} /// \defgroup zeroinit Zero initialized re-allocation /// /// The zero-initialized re-allocations are only valid on memory that was /// originally allocated with zero initialization too. /// e.g. `mi_calloc`, `mi_zalloc`, `mi_zalloc_aligned` etc. 
/// see /// /// \{ void* mi_rezalloc(void* p, size_t newsize); void* mi_recalloc(void* p, size_t newcount, size_t size) ; void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment); void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset); void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment); void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset); void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize); void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t newcount, size_t size); void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment); void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset); void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment); void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset); /// \} /// \defgroup typed Typed Macros /// /// Typed allocation macros. For example: /// ``` /// int* p = mi_malloc_tp(int) /// ``` /// /// \{ /// Allocate a block of type \a tp. /// @param tp The type of the block to allocate. /// @returns A pointer to an object of type \a tp, or /// \a NULL if out of memory. /// /// **Example:** /// ``` /// int* p = mi_malloc_tp(int) /// ``` /// /// @see mi_malloc() #define mi_malloc_tp(tp) ((tp*)mi_malloc(sizeof(tp))) /// Allocate a zero-initialized block of type \a tp. #define mi_zalloc_tp(tp) ((tp*)mi_zalloc(sizeof(tp))) /// Allocate \a count zero-initialized blocks of type \a tp. #define mi_calloc_tp(tp,count) ((tp*)mi_calloc(count,sizeof(tp))) /// Allocate \a count blocks of type \a tp. #define mi_mallocn_tp(tp,count) ((tp*)mi_mallocn(count,sizeof(tp))) /// Re-allocate to \a count blocks of type \a tp. 
#define mi_reallocn_tp(p,tp,count) ((tp*)mi_reallocn(p,count,sizeof(tp))) /// Allocate a block of type \a tp in a heap \a hp. #define mi_heap_malloc_tp(hp,tp) ((tp*)mi_heap_malloc(hp,sizeof(tp))) /// Allocate a zero-initialized block of type \a tp in a heap \a hp. #define mi_heap_zalloc_tp(hp,tp) ((tp*)mi_heap_zalloc(hp,sizeof(tp))) /// Allocate \a count zero-initialized blocks of type \a tp in a heap \a hp. #define mi_heap_calloc_tp(hp,tp,count) ((tp*)mi_heap_calloc(hp,count,sizeof(tp))) /// Allocate \a count blocks of type \a tp in a heap \a hp. #define mi_heap_mallocn_tp(hp,tp,count) ((tp*)mi_heap_mallocn(hp,count,sizeof(tp))) /// Re-allocate to \a count blocks of type \a tp in a heap \a hp. #define mi_heap_reallocn_tp(hp,p,tp,count) ((tp*)mi_heap_reallocn(p,count,sizeof(tp))) /// Re-allocate to \a count zero initialized blocks of type \a tp in a heap \a hp. #define mi_heap_recalloc_tp(hp,p,tp,count) ((tp*)mi_heap_recalloc(p,count,sizeof(tp))) /// \} /// \defgroup analysis Heap Introspection /// /// Inspect the heap at runtime. /// /// \{ /// Does a heap contain a pointer to a previously allocated block? /// @param heap The heap. /// @param p Pointer to a previously allocated block (in any heap)-- cannot be some /// random pointer! /// @returns \a true if the block pointed to by \a p is in the \a heap. /// @see mi_heap_check_owned() bool mi_heap_contains_block(mi_heap_t* heap, const void* p); /// Check safely if any pointer is part of a heap. /// @param heap The heap. /// @param p Any pointer -- not required to be previously allocated by us. /// @returns \a true if \a p points to a block in \a heap. /// /// Note: expensive function, linear in the pages in the heap. /// @see mi_heap_contains_block() /// @see mi_heap_get_default() bool mi_heap_check_owned(mi_heap_t* heap, const void* p); /// Check safely if any pointer is part of the default heap of this thread. /// @param p Any pointer -- not required to be previously allocated by us. 
/// @returns \a true if \a p points to a block in default heap of this thread. /// /// Note: expensive function, linear in the pages in the heap. /// @see mi_heap_contains_block() /// @see mi_heap_get_default() bool mi_check_owned(const void* p); /// An area of heap space contains blocks of a single size. /// The bytes in freed blocks are `committed - used`. typedef struct mi_heap_area_s { void* blocks; ///< start of the area containing heap blocks size_t reserved; ///< bytes reserved for this area size_t committed; ///< current committed bytes of this area size_t used; ///< bytes in use by allocated blocks size_t block_size; ///< size in bytes of one block size_t full_block_size; ///< size in bytes of a full block including padding and metadata. int heap_tag; ///< heap tag associated with this area (see \a mi_heap_new_ex) } mi_heap_area_t; /// Visitor function passed to mi_heap_visit_blocks() /// @returns \a true if ok, \a false to stop visiting (i.e. break) /// /// This function is always first called for every \a area /// with \a block as a \a NULL pointer. If \a visit_all_blocks /// was \a true, the function is then called for every allocated /// block in that area. typedef bool (mi_block_visit_fun)(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg); /// Visit all areas and blocks in a heap. /// @param heap The heap to visit. /// @param visit_all_blocks If \a true visits all allocated blocks, otherwise /// \a visitor is only called for every heap area. /// @param visitor This function is called for every area in the heap /// (with \a block as \a NULL). If \a visit_all_blocks is /// \a true, \a visitor is also called for every allocated /// block in every area (with `block!=NULL`). /// return \a false from this function to stop visiting early. /// @param arg Extra argument passed to \a visitor. /// @returns \a true if all areas and blocks were visited. 
bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_blocks, mi_block_visit_fun* visitor, void* arg);

/// @brief Visit all areas and blocks in abandoned heaps.
/// @param subproc_id The sub-process id associated with the abandoned heaps.
/// @param heap_tag Visit only abandoned memory with the specified heap tag, use -1 to visit all abandoned memory.
/// @param visit_blocks If \a true visits all allocated blocks, otherwise
///   \a visitor is only called for every heap area.
/// @param visitor This function is called for every area in the heap
///   (with \a block as \a NULL). If \a visit_blocks is
///   \a true, \a visitor is also called for every allocated
///   block in every area (with `block!=NULL`).
///   Return \a false from this function to stop visiting early.
/// @param arg Extra argument passed to the \a visitor.
/// @return \a true if all areas and blocks were visited.
///
/// Note: requires the option `mi_option_visit_abandoned` to be set
/// at the start of the program.
bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg);

/// \}

/// \defgroup options Runtime Options
///
/// Set runtime behavior.
///
/// \{

/// Runtime options.
typedef enum mi_option_e {
  // stable options
  mi_option_show_errors, ///< Print error messages.
  mi_option_show_stats, ///< Print statistics on termination.
  mi_option_verbose, ///< Print verbose messages.
  mi_option_max_errors, ///< issue at most N error messages
  mi_option_max_warnings, ///< issue at most N warning messages

  // advanced options
  mi_option_reserve_huge_os_pages, ///< reserve N huge OS pages (1GiB pages) at startup
  mi_option_reserve_huge_os_pages_at, ///< Reserve the huge OS pages at the specific NUMA node N.
  mi_option_reserve_os_memory, ///< reserve specified amount of OS memory in an arena at startup (internally, this value is in KiB; use `mi_option_get_size`)
  mi_option_allow_large_os_pages, ///< allow large (2 or 4 MiB) OS pages, implies eager commit. If false, also disables THP for the process.
  mi_option_purge_decommits, ///< should a memory purge decommit? (=1). Set to 0 to use memory reset on a purge (instead of decommit)
  mi_option_arena_reserve, ///< initial memory size for arena reservation (= 1 GiB on 64-bit) (internally, this value is in KiB; use `mi_option_get_size`)
  mi_option_os_tag, ///< tag used for OS logging (macOS only for now) (=100)
  mi_option_retry_on_oom, ///< retry on out-of-memory for N milliseconds (=400), set to 0 to disable retries. (only on windows)

  // experimental options
  mi_option_eager_commit, ///< eager commit segments? (after `eager_commit_delay` segments) (enabled by default).
  mi_option_eager_commit_delay, ///< the first N segments per thread are not eagerly committed (but per page in the segment on demand)
  mi_option_arena_eager_commit, ///< eager commit arenas? Use 2 to enable just on overcommit systems (=2)
  mi_option_abandoned_page_purge, ///< immediately purge delayed purges on thread termination
  mi_option_purge_delay, ///< memory purging is delayed by N milliseconds; use 0 for immediate purging or -1 for no purging at all. (=10)
  mi_option_use_numa_nodes, ///< 0 = use all available numa nodes, otherwise use at most N nodes.
  mi_option_disallow_os_alloc, ///< 1 = do not use OS memory for allocation (but only programmatically reserved arenas)
  mi_option_limit_os_alloc, ///< If set to 1, do not use OS memory for allocation (but only pre-reserved arenas)
  mi_option_max_segment_reclaim, ///< max. percentage of the abandoned segments that can be reclaimed per try (=10%)
  mi_option_destroy_on_exit, ///< if set, release all memory on exit; sometimes used for dynamic unloading but can be unsafe
  mi_option_arena_purge_mult, ///< multiplier for `purge_delay` for the purging delay for arenas (=10)
  mi_option_abandoned_reclaim_on_free, ///< allow to reclaim an abandoned segment on a free (=1)
  mi_option_purge_extend_delay, ///< extend purge delay on each subsequent delay (=1)
  mi_option_disallow_arena_alloc, ///< 1 = do not use arenas for allocation (except if using specific arena ids)
  mi_option_visit_abandoned, ///< allow visiting heap blocks from abandoned threads (=0)
  _mi_option_last
} mi_option_t;

// NOTE(review): the accessors below are declared without documentation.
// The enum comments above point to `mi_option_get_size` for options whose
// value is kept internally in KiB.
bool mi_option_is_enabled(mi_option_t option);
void mi_option_enable(mi_option_t option);
void mi_option_disable(mi_option_t option);
void mi_option_set_enabled(mi_option_t option, bool enable);
void mi_option_set_enabled_default(mi_option_t option, bool enable);

long mi_option_get(mi_option_t option);
long mi_option_get_clamp(mi_option_t option, long min, long max);
size_t mi_option_get_size(mi_option_t option);

void mi_option_set(mi_option_t option, long value);
void mi_option_set_default(mi_option_t option, long value);

/// \}

/// \defgroup posix Posix
///
/// `mi_` prefixed implementations of various Posix, Unix, and C++ allocation functions.
/// Defined for convenience as all redirect to the regular mimalloc API.
///
/// \{

/// Just as `free` but also checks if the pointer `p` belongs to our heap.
void mi_cfree(void* p);
void* mi__expand(void* p, size_t newsize);

// NOTE(review): the declarations below have no individual documentation here;
// the group description only states that they redirect to the regular
// mimalloc API.
void* mi_recalloc(void* p, size_t count, size_t size);
size_t mi_malloc_size(const void* p);
size_t mi_malloc_good_size(size_t size);
size_t mi_malloc_usable_size(const void *p);

int mi_posix_memalign(void** p, size_t alignment, size_t size);
int mi__posix_memalign(void** p, size_t alignment, size_t size);
void* mi_memalign(size_t alignment, size_t size);
void* mi_valloc(size_t size);
void* mi_pvalloc(size_t size);
void* mi_aligned_alloc(size_t alignment, size_t size);

unsigned short* mi_wcsdup(const unsigned short* s);
unsigned char* mi_mbsdup(const unsigned char* s);
int mi_dupenv_s(char** buf, size_t* size, const char* name);
int mi_wdupenv_s(unsigned short** buf, size_t* size, const unsigned short* name);

/// Corresponds to [reallocarray](https://www.freebsd.org/cgi/man.cgi?query=reallocarray&sektion=3&manpath=freebsd-release-ports)
/// in FreeBSD.
void* mi_reallocarray(void* p, size_t count, size_t size);

/// Corresponds to [reallocarr](https://man.netbsd.org/reallocarr.3) in NetBSD.
int mi_reallocarr(void* p, size_t count, size_t size);

void* mi_aligned_recalloc(void* p, size_t newcount, size_t size, size_t alignment);
void* mi_aligned_offset_recalloc(void* p, size_t newcount, size_t size, size_t alignment, size_t offset);

void mi_free_size(void* p, size_t size);
void mi_free_size_aligned(void* p, size_t size, size_t alignment);
void mi_free_aligned(void* p, size_t alignment);

/// \}

/// \defgroup cpp C++ wrappers
///
/// `mi_` prefixed implementations of various allocation functions
/// that use C++ semantics on out-of-memory, generally calling
/// `std::get_new_handler` and raising a `std::bad_alloc` exception on failure.
///
/// Note: use the `mimalloc-new-delete.h` header to override the \a new
/// and \a delete operators globally. The wrappers here are mostly
/// for convenience for library writers that need to interface with
/// mimalloc from C++.
/// /// \{ /// like mi_malloc(), but when out of memory, use `std::get_new_handler` and raise `std::bad_alloc` exception on failure. void* mi_new(std::size_t n) noexcept(false); /// like mi_mallocn(), but when out of memory, use `std::get_new_handler` and raise `std::bad_alloc` exception on failure. void* mi_new_n(size_t count, size_t size) noexcept(false); /// like mi_malloc_aligned(), but when out of memory, use `std::get_new_handler` and raise `std::bad_alloc` exception on failure. void* mi_new_aligned(std::size_t n, std::align_val_t alignment) noexcept(false); /// like `mi_malloc`, but when out of memory, use `std::get_new_handler` but return \a NULL on failure. void* mi_new_nothrow(size_t n); /// like `mi_malloc_aligned`, but when out of memory, use `std::get_new_handler` but return \a NULL on failure. void* mi_new_aligned_nothrow(size_t n, size_t alignment); /// like mi_realloc(), but when out of memory, use `std::get_new_handler` and raise `std::bad_alloc` exception on failure. void* mi_new_realloc(void* p, size_t newsize); /// like mi_reallocn(), but when out of memory, use `std::get_new_handler` and raise `std::bad_alloc` exception on failure. void* mi_new_reallocn(void* p, size_t newcount, size_t size); /// \a std::allocator implementation for mimalloc for use in STL containers. /// For example: /// ``` /// std::vector > vec; /// vec.push_back(1); /// vec.pop_back(); /// ``` template struct mi_stl_allocator { } /// \} /*! \page build Building Checkout the sources from GitHub: ``` git clone https://github.com/microsoft/mimalloc ``` ## Windows Open `ide/vs2019/mimalloc.sln` in Visual Studio 2019 and build (or `ide/vs2017/mimalloc.sln`). The `mimalloc` project builds a static library (in `out/msvc-x64`), while the `mimalloc-override` project builds a DLL for overriding malloc in the entire program. ## macOS, Linux, BSD, etc. We use [`cmake`](https://cmake.org)1 as the build system: ``` > mkdir -p out/release > cd out/release > cmake ../.. 
> make ``` This builds the library as a shared (dynamic) library (`.so` or `.dylib`), a static library (`.a`), and as a single object file (`.o`). `> sudo make install` (install the library and header files in `/usr/local/lib` and `/usr/local/include`) You can build the debug version which does many internal checks and maintains detailed statistics as: ``` > mkdir -p out/debug > cd out/debug > cmake -DCMAKE_BUILD_TYPE=Debug ../.. > make ``` This will name the shared library as `libmimalloc-debug.so`. Finally, you can build a _secure_ version that uses guard pages, encrypted free lists, etc., as: ``` > mkdir -p out/secure > cd out/secure > cmake -DMI_SECURE=ON ../.. > make ``` This will name the shared library as `libmimalloc-secure.so`. Use `ccmake`<sup>2</sup> instead of `cmake` to see and customize all the available build options. Notes: 1. Install CMake: `sudo apt-get install cmake` 2. Install CCMake: `sudo apt-get install cmake-curses-gui` */ /*! \page using Using the library ### Build The preferred usage is including `<mimalloc.h>`, linking with the shared- or static library, and using the `mi_malloc` API exclusively for allocation. For example, ``` gcc -o myprogram -lmimalloc myfile.c ``` mimalloc uses only safe OS calls (`mmap` and `VirtualAlloc`) and can co-exist with other allocators linked to the same program. If you use `cmake`, you can simply use: ``` find_package(mimalloc 2.1 REQUIRED) ``` in your `CMakeLists.txt` to find a locally installed mimalloc. Then use either: ``` target_link_libraries(myapp PUBLIC mimalloc) ``` to link with the shared (dynamic) library, or: ``` target_link_libraries(myapp PUBLIC mimalloc-static) ``` to link with the static library. See `test\CMakeLists.txt` for an example. ### C++ For best performance in C++ programs, it is also recommended to override the global `new` and `delete` operators.
For convenience, mimalloc provides [`mimalloc-new-delete.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-new-delete.h) which does this for you -- just include it in a single(!) source file in your project. In C++, mimalloc also provides the `mi_stl_allocator` struct which implements the `std::allocator` interface. For example: ``` std::vector<some_struct, mi_stl_allocator<some_struct>> vec; vec.push_back(some_struct()); ``` ### Statistics You can pass environment variables to print verbose messages (`MIMALLOC_VERBOSE=1`) and statistics (`MIMALLOC_SHOW_STATS=1`) (in the debug version): ``` > env MIMALLOC_SHOW_STATS=1 ./cfrac 175451865205073170563711388363 175451865205073170563711388363 = 374456281610909315237213 * 468551 heap stats: peak total freed unit normal 2: 16.4 kb 17.5 mb 17.5 mb 16 b ok normal 3: 16.3 kb 15.2 mb 15.2 mb 24 b ok normal 4: 64 b 4.6 kb 4.6 kb 32 b ok normal 5: 80 b 118.4 kb 118.4 kb 40 b ok normal 6: 48 b 48 b 48 b 48 b ok normal 17: 960 b 960 b 960 b 320 b ok heap stats: peak total freed unit normal: 33.9 kb 32.8 mb 32.8 mb 1 b ok huge: 0 b 0 b 0 b 1 b ok total: 33.9 kb 32.8 mb 32.8 mb 1 b ok malloc requested: 32.8 mb committed: 58.2 kb 58.2 kb 58.2 kb 1 b ok reserved: 2.0 mb 2.0 mb 2.0 mb 1 b ok reset: 0 b 0 b 0 b 1 b ok segments: 1 1 1 -abandoned: 0 pages: 6 6 6 -abandoned: 0 mmaps: 3 mmap fast: 0 mmap slow: 1 threads: 0 elapsed: 2.022s process: user: 1.781s, system: 0.016s, faults: 756, reclaims: 0, rss: 2.7 mb ``` The above model of using the `mi_` prefixed API is not always possible though in existing programs that already use the standard malloc interface, and another option is to override the standard malloc interface completely and redirect all calls to the _mimalloc_ library instead. See \ref overrides for more info. */ /*!
\page environment Environment Options You can set further options either programmatically (using [`mi_option_set`](https://microsoft.github.io/mimalloc/group__options.html)), or via environment variables: - `MIMALLOC_SHOW_STATS=1`: show statistics when the program terminates. - `MIMALLOC_VERBOSE=1`: show verbose messages. - `MIMALLOC_SHOW_ERRORS=1`: show error and warning messages. Advanced options: - `MIMALLOC_ARENA_EAGER_COMMIT=2`: turns on eager commit for the large arenas (usually 1GiB) from which mimalloc allocates segments and pages. Set this to 2 (default) to only enable this on overcommit systems (e.g. Linux). Set this to 1 to enable explicitly on other systems as well (like Windows or macOS) which may improve performance (as the whole arena is committed at once). Note that eager commit only increases the commit but not the actual peak resident set size (rss) so it is generally ok to enable this. - `MIMALLOC_PURGE_DELAY=N`: the delay in `N` milli-seconds (by default `10`) after which mimalloc will purge OS pages that are not in use. This signals to the OS that the underlying physical memory can be reused which can reduce memory fragmentation especially in long running (server) programs. Setting `N` to `0` purges immediately when a page becomes unused which can improve memory usage but also decreases performance. Setting `N` to a higher value like `100` can improve performance (sometimes by a lot) at the cost of potentially using more memory at times. Setting it to `-1` disables purging completely. - `MIMALLOC_PURGE_DECOMMITS=1`: By default "purging" memory means unused memory is decommitted (`MEM_DECOMMIT` on Windows, `MADV_DONTNEED` (which decreases rss immediately) on `mmap` systems). Set this to 0 to instead "reset" unused memory on a purge (`MEM_RESET` on Windows, generally `MADV_FREE` (which does not decrease rss immediately) on `mmap` systems).
Mimalloc generally does not "free" OS memory but only "purges" OS memory, in other words, it tries to keep virtual address ranges and decommits within those ranges (to make the underlying physical memory available to other processes). Further options for large workloads and services: - `MIMALLOC_USE_NUMA_NODES=N`: pretend there are at most `N` NUMA nodes. If not set, the actual NUMA nodes are detected at runtime. Setting `N` to 1 may avoid problems in some virtual environments. Also, setting it to a lower number than the actual NUMA nodes is fine and will only cause threads to potentially allocate more memory across actual NUMA nodes (but this can happen in any case as NUMA local allocation is always a best effort but not guaranteed). - `MIMALLOC_ALLOW_LARGE_OS_PAGES=1`: use large OS pages (2 or 4MiB) when available; for some workloads this can significantly improve performance. When this option is disabled (default), it also disables transparent huge pages (THP) for the process (on Linux and Android). On Linux the default setting is 2 -- this enables the use of large pages through THP only. Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs to explicitly give permissions for large OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]). However, sometimes the OS is very slow to reserve contiguous physical memory for large OS pages so use with care on systems that can have fragmented memory (for that reason, we generally recommend to use `MIMALLOC_RESERVE_HUGE_OS_PAGES` instead whenever possible). - `MIMALLOC_RESERVE_HUGE_OS_PAGES=N`: where `N` is the number of 1GiB _huge_ OS pages. This reserves the huge pages at startup and sometimes this can give a large (latency) performance improvement on big workloads. Usually it is better to not use `MIMALLOC_ALLOW_LARGE_OS_PAGES=1` in combination with this setting. 
Just like large OS pages, use with care as reserving contiguous physical memory can take a long time when memory is fragmented (but reserving the huge pages is done at startup only once). Note that we usually need to explicitly give permission for huge OS pages (as on [Windows][windows-huge] and [Linux][linux-huge])). With huge OS pages, it may be beneficial to set the setting `MIMALLOC_EAGER_COMMIT_DELAY=N` (`N` is 1 by default) to delay the initial `N` segments (of 4MiB) of a thread to not allocate in the huge OS pages; this prevents threads that are short lived and allocate just a little to take up space in the huge OS page area (which cannot be purged as huge OS pages are pinned to physical memory). The huge pages are usually allocated evenly among NUMA nodes. We can use `MIMALLOC_RESERVE_HUGE_OS_PAGES_AT=N` where `N` is the numa node (starting at 0) to allocate all the huge pages at a specific numa node instead. Use caution when using `fork` in combination with either large or huge OS pages: on a fork, the OS uses copy-on-write for all pages in the original process including the huge OS pages. When any memory is now written in that area, the OS will copy the entire 1GiB huge page (or 2MiB large page) which can cause the memory usage to grow in large increments. [linux-huge]: https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/5/html/tuning_and_optimizing_red_hat_enterprise_linux_for_oracle_9i_and_10g_databases/sect-oracle_9i_and_10g_tuning_guide-large_memory_optimization_big_pages_and_huge_pages-configuring_huge_pages_in_red_hat_enterprise_linux_4_or_5 [windows-huge]: https://docs.microsoft.com/en-us/sql/database-engine/configure-windows/enable-the-lock-pages-in-memory-option-windows?view=sql-server-2017 */ /*! \page overrides Overriding Malloc Overriding the standard `malloc` (and `new`) can be done either _dynamically_ or _statically_. ## Dynamic override This is the recommended way to override the standard malloc interface. 
### Dynamic Override on Linux, BSD On these ELF-based systems we preload the mimalloc shared library so all calls to the standard `malloc` interface are resolved to the _mimalloc_ library. ``` > env LD_PRELOAD=/usr/lib/libmimalloc.so myprogram ``` You can set extra environment variables to check that mimalloc is running, like: ``` > env MIMALLOC_VERBOSE=1 LD_PRELOAD=/usr/lib/libmimalloc.so myprogram ``` or run with the debug version to get detailed statistics: ``` > env MIMALLOC_SHOW_STATS=1 LD_PRELOAD=/usr/lib/libmimalloc-debug.so myprogram ``` ### Dynamic Override on MacOS On macOS we can also preload the mimalloc shared library so all calls to the standard `malloc` interface are resolved to the _mimalloc_ library. ``` > env DYLD_INSERT_LIBRARIES=/usr/lib/libmimalloc.dylib myprogram ``` Note that certain security restrictions may apply when doing this from the [shell](https://stackoverflow.com/questions/43941322/dyld-insert-libraries-ignored-when-calling-application-through-bash). ### Dynamic Override on Windows Dynamically overriding on mimalloc on Windows is robust and has the particular advantage to be able to redirect all malloc/free calls that go through the (dynamic) C runtime allocator, including those from other DLL's or libraries. As it intercepts all allocation calls on a low level, it can be used reliably on large programs that include other 3rd party components. There are four requirements to make the overriding work well: 1. Use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch). 2. Link your program explicitly with the `mimalloc.lib` export library for the `mimalloc.dll`. (which must be compiled with `-DMI_OVERRIDE=ON`, which is the default though). 
To ensure the `mimalloc.dll` is actually loaded at run-time it is easiest to insert some call to the mimalloc API in the `main` function, like `mi_version()` (or use the `/include:mi_version` switch on the linker command, or similarly, `#pragma comment(linker, "/include:mi_version")` in some source file). See the `mimalloc-test-override` project for an example on how to use this. 3. The `mimalloc-redirect.dll` must be put in the same directory as the main `mimalloc.dll` at runtime (as it is a dependency of that DLL). The redirection DLL ensures that all calls to the C runtime malloc API get redirected to mimalloc functions (which reside in `mimalloc.dll`). 4. Ensure the `mimalloc.dll` comes as early as possible in the import list of the final executable (so it can intercept all potential allocations). You can use `minject -l <exe>` to check this if needed. For best performance on Windows with C++, it is also recommended to override the `new`/`delete` operations (by including [`mimalloc-new-delete.h`](include/mimalloc-new-delete.h) in a single(!) source file in your project). The environment variable `MIMALLOC_DISABLE_REDIRECT=1` can be used to disable dynamic overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc was successfully redirected. For different platforms than x64, you may need a specific [redirection dll](bin). Furthermore, we cannot always re-link an executable or ensure `mimalloc.dll` comes first in the import table. In such cases the [`minject`](bin) tool can be used to patch the executable's import tables. ## Static override On Unix-like systems, you can also statically link with _mimalloc_ to override the standard malloc interface. The recommended way is to link the final program with the _mimalloc_ single object file (`mimalloc.o`). We use an object file instead of a library file as linkers give preference to that over archives to resolve symbols.
To ensure that the standard malloc interface resolves to the _mimalloc_ library, link it as the first object file. For example: ``` > gcc -o myprogram mimalloc.o myfile1.c ... ``` Another way to override statically that works on all platforms, is to link statically to mimalloc (as shown in the introduction) and include a header file in each source file that re-defines `malloc` etc. to `mi_malloc`. This is provided by [`mimalloc-override.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-override.h). This only works reliably though if all sources are under your control or otherwise mixing of pointers from different heaps may occur! ## List of Overrides: The specific functions that get redirected to the _mimalloc_ library are: ``` // C void* malloc(size_t size); void* calloc(size_t size, size_t n); void* realloc(void* p, size_t newsize); void free(void* p); void* aligned_alloc(size_t alignment, size_t size); char* strdup(const char* s); char* strndup(const char* s, size_t n); char* realpath(const char* fname, char* resolved_name); // C++ void operator delete(void* p); void operator delete[](void* p); void* operator new(std::size_t n) noexcept(false); void* operator new[](std::size_t n) noexcept(false); void* operator new( std::size_t n, std::align_val_t align) noexcept(false); void* operator new[]( std::size_t n, std::align_val_t align) noexcept(false); void* operator new ( std::size_t count, const std::nothrow_t& tag); void* operator new[]( std::size_t count, const std::nothrow_t& tag); void* operator new ( std::size_t count, std::align_val_t al, const std::nothrow_t&); void* operator new[]( std::size_t count, std::align_val_t al, const std::nothrow_t&); // Posix int posix_memalign(void** p, size_t alignment, size_t size); // Linux void* memalign(size_t alignment, size_t size); void* valloc(size_t size); void* pvalloc(size_t size); size_t malloc_usable_size(void *p); void* reallocf(void* p, size_t newsize); // macOS void vfree(void* p); size_t 
malloc_size(const void* p); size_t malloc_good_size(size_t size); // BSD void* reallocarray( void* p, size_t count, size_t size ); void* reallocf(void* p, size_t newsize); void cfree(void* p); // NetBSD int reallocarr(void* p, size_t count, size_t size); // Windows void* _expand(void* p, size_t newsize); size_t _msize(void* p); void* _malloc_dbg(size_t size, int block_type, const char* fname, int line); void* _realloc_dbg(void* p, size_t newsize, int block_type, const char* fname, int line); void* _calloc_dbg(size_t count, size_t size, int block_type, const char* fname, int line); void* _expand_dbg(void* p, size_t size, int block_type, const char* fname, int line); size_t _msize_dbg(void* p, int block_type); void _free_dbg(void* p, int block_type); ``` */ /*! \page bench Performance We tested _mimalloc_ against many other top allocators over a wide range of benchmarks, ranging from various real world programs to synthetic benchmarks that see how the allocator behaves under more extreme circumstances. In our benchmarks, _mimalloc_ always outperforms all other leading allocators (_jemalloc_, _tcmalloc_, _Hoard_, etc) (Jan 2021), and usually uses less memory (up to 25% more in the worst case). A nice property is that it does *consistently* well over the wide range of benchmarks. See the [Performance](https://github.com/microsoft/mimalloc#Performance) section in the _mimalloc_ repository for benchmark results, or the technical report for detailed benchmark results.
*/ ================================================ FILE: third-party/mimalloc/doc/mimalloc-doxygen.css ================================================ #projectlogo img { padding: 1ex; } tt, code, kbd, samp, div.memproto, div.fragment, div.line, table.memname { font-family: Consolas, Monaco, Inconsolata, "Courier New", monospace; } .image img, .textblock img { max-width: 99%; max-height: 350px; } table.memname, .memname{ font-weight: bold; } code { background-color: #EEE; padding: 0ex 0.25ex; } body { margin: 1ex 1ex 0ex 1ex; border: 1px solid black; } .contents table, .contents div, .contents p, .contents dl { font-size: 16px; line-height: 1.44; } body #nav-tree .label { font-size: 14px; } a{ text-decoration: underline; } #side-nav { margin-left: 1ex; border-left: 1px solid black; } #nav-tree { padding-left: 1ex; } #nav-path { display: none; } div.fragment { background-color: #EEE; padding: 0.25ex 0.5ex; border-color: black; } #nav-sync img { display: none; } h1,h2,h3,h4,h5,h6 { transition:none; } .memtitle { background-image: none; background-color: #EEE; } table.memproto, .memproto { text-shadow: none; font-size: 110%; } ================================================ FILE: third-party/mimalloc/ide/vs2022/mimalloc-lib.vcxproj ================================================  Debug ARM64 Debug ARM64EC Debug Win32 Release ARM64 Release ARM64EC Release Win32 Debug x64 Release x64 15.0 {ABB5EAE7-B3E6-432E-B636-333449892EA6} mimalloc-lib 10.0 mimalloc-lib StaticLibrary true v143 StaticLibrary false v143 true StaticLibrary true v143 StaticLibrary true v143 StaticLibrary true v143 StaticLibrary false v143 true StaticLibrary false v143 true StaticLibrary false v143 true $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .lib mimalloc $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .lib mimalloc 
$(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .lib mimalloc $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .lib mimalloc $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .lib mimalloc $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .lib mimalloc $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .lib mimalloc $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .lib mimalloc false Level4 Disabled true Default ../../include MI_DEBUG=3;%(PreprocessorDefinitions); CompileAsCpp false stdcpp20 Level4 Disabled true Default ../../include MI_DEBUG=3;MI_GUARDED=0;%(PreprocessorDefinitions); CompileAsCpp false stdcpp20 Level4 Disabled true Default ../../include MI_DEBUG=3;MI_GUARDED=0;%(PreprocessorDefinitions); CompileAsCpp false stdcpp20 Level4 Disabled true Default ../../include MI_DEBUG=3;MI_GUARDED=0;%(PreprocessorDefinitions); CompileAsCpp false stdcpp20 Level4 MaxSpeed true Default ../../include %(PreprocessorDefinitions);NDEBUG AssemblyAndSourceCode $(IntDir) false false Default CompileAsCpp true stdcpp20 true true Level4 MaxSpeed true Default ../../include %(PreprocessorDefinitions);NDEBUG AssemblyAndSourceCode $(IntDir) false false Default CompileAsC true stdcpp20 true true Level4 MaxSpeed true Default ../../include %(PreprocessorDefinitions);NDEBUG AssemblyAndSourceCode $(IntDir) false false Default CompileAsCpp true stdcpp20 CPUExtensionRequirementsARMv81 Sync true true Level4 MaxSpeed true Default ../../include %(PreprocessorDefinitions);NDEBUG AssemblyAndSourceCode 
$(IntDir) false false Default CompileAsCpp true stdcpp20 CPUExtensionRequirementsARMv81 Sync true true false false false false false false false false true true true true true true true true true true true true true true true true false false false true true true true true true true true true true true true true true true true true true true true true true true true ================================================ FILE: third-party/mimalloc/ide/vs2022/mimalloc-lib.vcxproj.filters ================================================  Sources Sources Sources Sources Sources Sources Sources Sources Sources Sources Sources Sources Sources Sources Sources Sources Sources Sources Sources Sources Sources Headers Headers Headers Headers Headers Headers Headers Headers Headers Headers {1430490c-e711-4ace-a1b8-36f4d5105873} {461c78ef-04b0-44d1-a0ca-7d488abaa592} ================================================ FILE: third-party/mimalloc/ide/vs2022/mimalloc-override-dll.vcxproj ================================================  Debug ARM64 Debug ARM64EC Debug Win32 Release ARM64 Release ARM64EC Release Win32 Debug x64 Release x64 15.0 {ABB5EAE7-B3E6-432E-B636-333449892EA7} mimalloc-override-dll 10.0 mimalloc-override-dll DynamicLibrary true v143 DynamicLibrary false v143 DynamicLibrary true v143 DynamicLibrary true v143 DynamicLibrary true v143 DynamicLibrary false v143 DynamicLibrary false v143 DynamicLibrary false v143 $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .dll mimalloc $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .dll mimalloc $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .dll mimalloc $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ 
$(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .dll mimalloc $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .dll mimalloc $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .dll mimalloc $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .dll mimalloc $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .dll mimalloc false Level3 Disabled true true ../../include MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); MultiThreadedDebugDLL false CompileAsCpp $(ProjectDir)\..\..\bin\mimalloc-redirect32.lib;%(AdditionalDependencies) Default false $(OutDir)$(TargetName).dll.lib $(OutDir)$(TargetName).dll.pdb COPY /Y "$(ProjectDir)..\..\bin\mimalloc-redirect32.dll" "$(OutputPath)" Copy mimalloc-redirect32.dll to the output directory Level3 Disabled true true ../../include MI_DEBUG=4;MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); MultiThreadedDebugDLL false CompileAsCpp $(ProjectDir)\..\..\bin\mimalloc-redirect.lib;%(AdditionalDependencies) Default false $(OutDir)$(TargetName).dll.lib $(OutDir)$(TargetName).dll.pdb COPY /Y "$(ProjectDir)..\..\bin\mimalloc-redirect.dll" "$(OutputPath)" copy mimalloc-redirect.dll to the output directory Level3 Disabled true true ../../include MI_DEBUG=4;MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); MultiThreadedDebugDLL false CompileAsCpp $(ProjectDir)\..\..\bin\mimalloc-redirect-arm64.lib;%(AdditionalDependencies) Default false $(OutDir)$(TargetName).dll.lib $(OutDir)$(TargetName).dll.pdb COPY /Y "$(ProjectDir)..\..\bin\mimalloc-redirect-arm64.dll" "$(OutputPath)" copy 
mimalloc-redirect-arm64.dll to the output directory Level3 Disabled true true ../../include MI_DEBUG=4;MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); MultiThreadedDebugDLL false CompileAsCpp $(ProjectDir)\..\..\bin\mimalloc-redirect-arm64ec.lib;%(AdditionalDependencies) Default false $(OutDir)$(TargetName).dll.lib $(OutDir)$(TargetName).dll.pdb COPY /Y "$(ProjectDir)..\..\bin\mimalloc-redirect-arm64ec.dll" "$(OutputPath)" copy mimalloc-redirect-arm64ec.dll to the output directory Level3 MaxSpeed true true true ../../include MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions);NDEBUG AssemblyAndSourceCode $(IntDir) false MultiThreadedDLL CompileAsCpp false true true $(ProjectDir)\..\..\bin\mimalloc-redirect32.lib;%(AdditionalDependencies) Default false $(OutDir)$(TargetName).dll.lib $(OutDir)$(TargetName).dll.pdb COPY /Y "$(ProjectDir)..\..\bin\mimalloc-redirect32.dll" "$(OutputPath)" Copy mimalloc-redirect32.dll to the output directory Level3 MaxSpeed true true true ../../include MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions);NDEBUG AssemblyAndSourceCode $(IntDir) false MultiThreadedDLL CompileAsCpp false true true $(ProjectDir)\..\..\bin\mimalloc-redirect.lib;%(AdditionalDependencies) Default false $(OutDir)$(TargetName).dll.lib $(OutDir)$(TargetName).dll.pdb COPY /Y "$(ProjectDir)..\..\bin\mimalloc-redirect.dll" "$(OutputPath)" copy mimalloc-redirect.dll to the output directory Level3 MaxSpeed true true true ../../include MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions);NDEBUG AssemblyAndSourceCode $(IntDir) false MultiThreadedDLL CompileAsCpp false CPUExtensionRequirementsARMv81 true true $(ProjectDir)\..\..\bin\mimalloc-redirect-arm64.lib;%(AdditionalDependencies) Default false $(OutDir)$(TargetName).dll.lib $(OutDir)$(TargetName).dll.pdb COPY /Y "$(ProjectDir)..\..\bin\mimalloc-redirect-arm64.dll" "$(OutputPath)" copy 
mimalloc-redirect-arm64.dll to the output directory Level3 MaxSpeed true true true ../../include MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions);NDEBUG AssemblyAndSourceCode $(IntDir) false MultiThreadedDLL CompileAsCpp false CPUExtensionRequirementsARMv81 true true $(ProjectDir)\..\..\bin\mimalloc-redirect-arm64ec.lib;%(AdditionalDependencies) Default false $(OutDir)$(TargetName).dll.lib $(OutDir)$(TargetName).dll.pdb COPY /Y "$(ProjectDir)..\..\bin\mimalloc-redirect-arm64ec.dll" "$(OutputPath)" copy mimalloc-redirect-arm64ec.dll to the output directory false false false false false false false false true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true true ================================================ FILE: third-party/mimalloc/ide/vs2022/mimalloc-override-dll.vcxproj.filters ================================================  Sources Sources Sources Sources Sources Sources Sources Sources Sources Sources Sources Sources Sources Sources Sources Sources Sources Sources Sources Sources Headers Headers Headers Headers Headers Headers Headers Headers Headers Headers Headers Headers {262c6c21-e270-4ba6-bd63-4ac999307e4e} {94b40bdc-a741-45dd-81aa-c05fabcd2970} ================================================ FILE: third-party/mimalloc/ide/vs2022/mimalloc-override-test-dep.vcxproj ================================================ Debug ARM64 Debug ARM64EC Debug Win32 Release ARM64 Release ARM64EC Release Win32 Debug x64 Release x64 15.0 {FEF7869F-750E-4C21-A04D-22707CC66879} mimalloc-test-override-dep 10.0 mimalloc-test-override-dep DynamicLibrary true v143 DynamicLibrary false v143 true DynamicLibrary true v143 DynamicLibrary true v143 DynamicLibrary true v143 DynamicLibrary false v143 true DynamicLibrary false v143 true DynamicLibrary false v143 true $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ 
$(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ false Level3 Disabled true true ..\..\include MultiThreadedDebugDLL Sync Default false Console kernel32.lib;%(AdditionalDependencies) Level3 Disabled true true ..\..\include MultiThreadedDebugDLL Sync Default false Console kernel32.lib;%(AdditionalDependencies) Level3 Disabled true true ..\..\include MultiThreadedDebugDLL Sync Default false Console kernel32.lib;%(AdditionalDependencies) Level3 Disabled true true ..\..\include MultiThreadedDebugDLL Sync Default false Console kernel32.lib;%(AdditionalDependencies) Level3 MaxSpeed true true true true ..\..\include _MBCS;%(PreprocessorDefinitions);NDEBUG MultiThreadedDLL true true Console kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) Level3 MaxSpeed true true true true ..\..\include _MBCS;%(PreprocessorDefinitions);NDEBUG MultiThreadedDLL true true Console 
kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) Level3 MaxSpeed true true true true ..\..\include _MBCS;%(PreprocessorDefinitions);NDEBUG MultiThreadedDLL true true Console kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) Level3 MaxSpeed true true true true ..\..\include _MBCS;%(PreprocessorDefinitions);NDEBUG MultiThreadedDLL true true Console kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) ================================================ FILE: third-party/mimalloc/ide/vs2022/mimalloc-override-test.vcxproj ================================================ Debug ARM64 Debug ARM64EC Debug Win32 Release ARM64 Release ARM64EC Release Win32 Debug x64 Release x64 15.0 {FEF7868F-750E-4C21-A04D-22707CC66879} mimalloc-override-test 10.0 mimalloc-test-override Application true v143 Application false v143 true Application true v143 Application true v143 Application true v143 Application false v143 true Application false v143 true Application false v143 true $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ 
$(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ false Level3 Disabled true true ..\..\include MultiThreadedDebugDLL Sync Default false Console kernel32.lib;%(AdditionalDependencies) Level3 Disabled true true ..\..\include MultiThreadedDebugDLL Sync Default false Console kernel32.lib;%(AdditionalDependencies) Level3 Disabled true true ..\..\include MultiThreadedDebugDLL Sync Default false Console kernel32.lib;%(AdditionalDependencies) Level3 Disabled true true ..\..\include MultiThreadedDebugDLL Sync Default false Console kernel32.lib;%(AdditionalDependencies) Level3 MaxSpeed true true true true ..\..\include _MBCS;%(PreprocessorDefinitions);NDEBUG MultiThreadedDLL true true Console kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) Level3 MaxSpeed true true true true ..\..\include _MBCS;%(PreprocessorDefinitions);NDEBUG MultiThreadedDLL true true Console kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) Level3 MaxSpeed true true true true ..\..\include _MBCS;%(PreprocessorDefinitions);NDEBUG MultiThreadedDLL true true Console kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) Level3 MaxSpeed true true true true ..\..\include _MBCS;%(PreprocessorDefinitions);NDEBUG MultiThreadedDLL true true Console 
kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) {abb5eae7-b3e6-432e-b636-333449892ea7} {fef7869f-750e-4c21-a04d-22707cc66879} ================================================ FILE: third-party/mimalloc/ide/vs2022/mimalloc-test-api.vcxproj ================================================ Debug ARM64 Debug ARM64EC Debug Win32 Release ARM64 Release ARM64EC Release Win32 Debug x64 Release x64 15.0 {FFF7958F-750E-4C21-A04D-22707CC66878} mimalloc-test-api 10.0 mimalloc-test-api Application true v143 Application false v143 true Application true v143 Application true v143 Application true v143 Application false v143 true Application false v143 true Application false v143 true $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ false Level3 Disabled true true ..\..\include Console Level3 Disabled true true ..\..\include Console Level3 Disabled true true ..\..\include Console 
Level3 Disabled true true ..\..\include Console Level3 MaxSpeed true true true true ..\..\include %(PreprocessorDefinitions);NDEBUG true true Console Level3 MaxSpeed true true true true ..\..\include %(PreprocessorDefinitions);NDEBUG true true Console Level3 MaxSpeed true true true true ..\..\include %(PreprocessorDefinitions);NDEBUG true true Console Level3 MaxSpeed true true true true ..\..\include %(PreprocessorDefinitions);NDEBUG true true Console true true true true true true true true false false false {abb5eae7-b3e6-432e-b636-333449892ea6} ================================================ FILE: third-party/mimalloc/ide/vs2022/mimalloc-test-stress.vcxproj ================================================ Debug ARM64 Debug ARM64EC Debug Win32 Release ARM64 Release ARM64EC Release Win32 Debug x64 Release x64 15.0 {FEF7958F-750E-4C21-A04D-22707CC66878} mimalloc-test-stress 10.0 mimalloc-test-stress Application true v143 Application false v143 true Application true v143 Application true v143 Application true v143 Application false v143 true Application false v143 true Application false v143 true $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ 
$(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ false Level3 Disabled true true ..\..\include Console Level3 Disabled true true ..\..\include Console Level3 Disabled true true ..\..\include Console Level3 Disabled true true ..\..\include Console Level3 MaxSpeed true true true true ..\..\include %(PreprocessorDefinitions);NDEBUG true true Console Level3 MaxSpeed true true true true ..\..\include %(PreprocessorDefinitions);NDEBUG true true Console Level3 MaxSpeed true true true true ..\..\include %(PreprocessorDefinitions);NDEBUG CPUExtensionRequirementsARMv81 true true Console Level3 MaxSpeed true true true true ..\..\include %(PreprocessorDefinitions);NDEBUG CPUExtensionRequirementsARMv81 true true Console false false false false false false false false {abb5eae7-b3e6-432e-b636-333449892ea6} ================================================ FILE: third-party/mimalloc/ide/vs2022/mimalloc-test.vcxproj ================================================ Debug ARM64 Debug ARM64EC Debug Win32 Release ARM64 Release ARM64EC Release Win32 Debug x64 Release x64 15.0 {FEF7858F-750E-4C21-A04D-22707CC66878} mimalloctest 10.0 mimalloc-test-static Application true v143 Application false v143 true Application true v143 Application true v143 Application true v143 Application false v143 true Application false v143 true Application false v143 true $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ 
$(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ false Level3 Disabled true true ..\..\include stdcpp17 Console Level3 Disabled true true ..\..\include stdcpp17 Console Level3 Disabled true true ..\..\include stdcpp17 Console Level3 Disabled true true ..\..\include stdcpp17 Console Level3 MaxSpeed true true true true ..\..\include _MBCS;%(PreprocessorDefinitions);NDEBUG stdcpp17 true true Console Level3 MaxSpeed true true true true ..\..\include _MBCS;%(PreprocessorDefinitions);NDEBUG stdcpp17 true true Console Level3 MaxSpeed true true true true ..\..\include _MBCS;%(PreprocessorDefinitions);NDEBUG stdcpp17 true true Console Level3 MaxSpeed true true true true ..\..\include _MBCS;%(PreprocessorDefinitions);NDEBUG stdcpp17 true true Console {abb5eae7-b3e6-432e-b636-333449892ea6} ================================================ FILE: third-party/mimalloc/ide/vs2022/mimalloc.sln ================================================  Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 17 VisualStudioVersion = 17.12.35527.113 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-lib", "mimalloc-lib.vcxproj", "{ABB5EAE7-B3E6-432E-B636-333449892EA6}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-test-static", "mimalloc-test.vcxproj", "{FEF7858F-750E-4C21-A04D-22707CC66878}" EndProject 
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-override-dll", "mimalloc-override-dll.vcxproj", "{ABB5EAE7-B3E6-432E-B636-333449892EA7}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-test-override-dep", "mimalloc-override-test-dep.vcxproj", "{FEF7869F-750E-4C21-A04D-22707CC66879}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-test-override", "mimalloc-override-test.vcxproj", "{FEF7868F-750E-4C21-A04D-22707CC66879}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-test-stress", "mimalloc-test-stress.vcxproj", "{FEF7958F-750E-4C21-A04D-22707CC66878}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mimalloc-test-api", "mimalloc-test-api.vcxproj", "{FFF7958F-750E-4C21-A04D-22707CC66878}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|ARM64 = Debug|ARM64 Debug|ARM64EC = Debug|ARM64EC Debug|x64 = Debug|x64 Debug|x86 = Debug|x86 Release|ARM64 = Release|ARM64 Release|ARM64EC = Release|ARM64EC Release|x64 = Release|x64 Release|x86 = Release|x86 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|ARM64.ActiveCfg = Debug|ARM64 {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|ARM64.Build.0 = Debug|ARM64 {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|ARM64EC.ActiveCfg = Debug|ARM64EC {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|ARM64EC.Build.0 = Debug|ARM64EC {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|x64.ActiveCfg = Debug|x64 {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|x64.Build.0 = Debug|x64 {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|x86.ActiveCfg = Debug|Win32 {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Debug|x86.Build.0 = Debug|Win32 {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|ARM64.ActiveCfg = Release|ARM64 {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|ARM64.Build.0 = Release|ARM64 {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|ARM64EC.ActiveCfg = Release|ARM64EC 
{ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|ARM64EC.Build.0 = Release|ARM64EC {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|x64.ActiveCfg = Release|x64 {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|x64.Build.0 = Release|x64 {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|x86.ActiveCfg = Release|Win32 {ABB5EAE7-B3E6-432E-B636-333449892EA6}.Release|x86.Build.0 = Release|Win32 {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|ARM64.ActiveCfg = Debug|ARM64 {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|ARM64.Build.0 = Debug|ARM64 {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|ARM64EC.ActiveCfg = Debug|ARM64EC {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|ARM64EC.Build.0 = Debug|ARM64EC {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|x64.ActiveCfg = Debug|x64 {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|x64.Build.0 = Debug|x64 {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|x86.ActiveCfg = Debug|Win32 {FEF7858F-750E-4C21-A04D-22707CC66878}.Debug|x86.Build.0 = Debug|Win32 {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|ARM64.ActiveCfg = Release|ARM64 {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|ARM64.Build.0 = Release|ARM64 {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|ARM64EC.ActiveCfg = Release|ARM64EC {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|ARM64EC.Build.0 = Release|ARM64EC {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|x64.ActiveCfg = Release|x64 {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|x64.Build.0 = Release|x64 {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|x86.ActiveCfg = Release|Win32 {FEF7858F-750E-4C21-A04D-22707CC66878}.Release|x86.Build.0 = Release|Win32 {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|ARM64.ActiveCfg = Debug|ARM64 {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|ARM64.Build.0 = Debug|ARM64 {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|ARM64EC.ActiveCfg = Debug|ARM64EC {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|ARM64EC.Build.0 = Debug|ARM64EC {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|x64.ActiveCfg = Debug|x64 
{ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|x64.Build.0 = Debug|x64 {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|x86.ActiveCfg = Debug|Win32 {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Debug|x86.Build.0 = Debug|Win32 {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|ARM64.ActiveCfg = Release|ARM64 {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|ARM64.Build.0 = Release|ARM64 {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|ARM64EC.ActiveCfg = Release|ARM64EC {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|ARM64EC.Build.0 = Release|ARM64EC {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|x64.ActiveCfg = Release|x64 {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|x64.Build.0 = Release|x64 {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|x86.ActiveCfg = Release|Win32 {ABB5EAE7-B3E6-432E-B636-333449892EA7}.Release|x86.Build.0 = Release|Win32 {FEF7869F-750E-4C21-A04D-22707CC66879}.Debug|ARM64.ActiveCfg = Debug|ARM64 {FEF7869F-750E-4C21-A04D-22707CC66879}.Debug|ARM64.Build.0 = Debug|ARM64 {FEF7869F-750E-4C21-A04D-22707CC66879}.Debug|ARM64EC.ActiveCfg = Debug|ARM64EC {FEF7869F-750E-4C21-A04D-22707CC66879}.Debug|ARM64EC.Build.0 = Debug|ARM64EC {FEF7869F-750E-4C21-A04D-22707CC66879}.Debug|x64.ActiveCfg = Debug|x64 {FEF7869F-750E-4C21-A04D-22707CC66879}.Debug|x64.Build.0 = Debug|x64 {FEF7869F-750E-4C21-A04D-22707CC66879}.Debug|x86.ActiveCfg = Debug|Win32 {FEF7869F-750E-4C21-A04D-22707CC66879}.Debug|x86.Build.0 = Debug|Win32 {FEF7869F-750E-4C21-A04D-22707CC66879}.Release|ARM64.ActiveCfg = Release|ARM64 {FEF7869F-750E-4C21-A04D-22707CC66879}.Release|ARM64.Build.0 = Release|ARM64 {FEF7869F-750E-4C21-A04D-22707CC66879}.Release|ARM64EC.ActiveCfg = Release|ARM64EC {FEF7869F-750E-4C21-A04D-22707CC66879}.Release|ARM64EC.Build.0 = Release|ARM64EC {FEF7869F-750E-4C21-A04D-22707CC66879}.Release|x64.ActiveCfg = Release|x64 {FEF7869F-750E-4C21-A04D-22707CC66879}.Release|x64.Build.0 = Release|x64 {FEF7869F-750E-4C21-A04D-22707CC66879}.Release|x86.ActiveCfg = Release|Win32 
{FEF7869F-750E-4C21-A04D-22707CC66879}.Release|x86.Build.0 = Release|Win32 {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|ARM64.ActiveCfg = Debug|ARM64 {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|ARM64.Build.0 = Debug|ARM64 {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|ARM64EC.ActiveCfg = Debug|ARM64EC {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|ARM64EC.Build.0 = Debug|ARM64EC {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|x64.ActiveCfg = Debug|x64 {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|x64.Build.0 = Debug|x64 {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|x86.ActiveCfg = Debug|Win32 {FEF7868F-750E-4C21-A04D-22707CC66879}.Debug|x86.Build.0 = Debug|Win32 {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|ARM64.ActiveCfg = Release|ARM64 {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|ARM64.Build.0 = Release|ARM64 {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|ARM64EC.ActiveCfg = Release|ARM64EC {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|ARM64EC.Build.0 = Release|ARM64EC {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|x64.ActiveCfg = Release|x64 {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|x64.Build.0 = Release|x64 {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|x86.ActiveCfg = Release|Win32 {FEF7868F-750E-4C21-A04D-22707CC66879}.Release|x86.Build.0 = Release|Win32 {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|ARM64.ActiveCfg = Debug|ARM64 {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|ARM64.Build.0 = Debug|ARM64 {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|ARM64EC.ActiveCfg = Debug|ARM64EC {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|ARM64EC.Build.0 = Debug|ARM64EC {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|x64.ActiveCfg = Debug|x64 {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|x64.Build.0 = Debug|x64 {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|x86.ActiveCfg = Debug|Win32 {FEF7958F-750E-4C21-A04D-22707CC66878}.Debug|x86.Build.0 = Debug|Win32 {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|ARM64.ActiveCfg = Release|ARM64 
{FEF7958F-750E-4C21-A04D-22707CC66878}.Release|ARM64.Build.0 = Release|ARM64 {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|ARM64EC.ActiveCfg = Release|ARM64EC {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|ARM64EC.Build.0 = Release|ARM64EC {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|x64.ActiveCfg = Release|x64 {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|x64.Build.0 = Release|x64 {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|x86.ActiveCfg = Release|Win32 {FEF7958F-750E-4C21-A04D-22707CC66878}.Release|x86.Build.0 = Release|Win32 {FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|ARM64.ActiveCfg = Debug|ARM64 {FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|ARM64.Build.0 = Debug|ARM64 {FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|ARM64EC.ActiveCfg = Debug|ARM64EC {FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|ARM64EC.Build.0 = Debug|ARM64EC {FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|x64.ActiveCfg = Debug|x64 {FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|x64.Build.0 = Debug|x64 {FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|x86.ActiveCfg = Debug|Win32 {FFF7958F-750E-4C21-A04D-22707CC66878}.Debug|x86.Build.0 = Debug|Win32 {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|ARM64.ActiveCfg = Release|ARM64 {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|ARM64.Build.0 = Release|ARM64 {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|ARM64EC.ActiveCfg = Release|ARM64EC {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|ARM64EC.Build.0 = Release|ARM64EC {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|x64.ActiveCfg = Release|x64 {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|x64.Build.0 = Release|x64 {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|x86.ActiveCfg = Release|Win32 {FFF7958F-750E-4C21-A04D-22707CC66878}.Release|x86.Build.0 = Release|Win32 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {4297F93D-486A-4243-995F-7D32F59AE82A} EndGlobalSection EndGlobal 
================================================
FILE: third-party/mimalloc/include/mimalloc/atomic.h
================================================
/* ----------------------------------------------------------------------------
Copyright (c) 2018-2024 Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#pragma once
#ifndef MIMALLOC_ATOMIC_H
#define MIMALLOC_ATOMIC_H

// include windows.h or pthreads.h
#if defined(_WIN32)
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
#elif !defined(__wasi__) && (!defined(__EMSCRIPTEN__) || defined(__EMSCRIPTEN_PTHREADS__))
#define MI_USE_PTHREADS
#include <pthread.h>
#endif

// --------------------------------------------------------------------------------------------
// Atomics
// We need to be portable between C, C++, and MSVC.
// We base the primitives on the C/C++ atomics and create a minimal wrapper for MSVC in C compilation mode.
// This is why we try to use only `uintptr_t` and `<type>*` as atomic types.
// To gain better insight in the range of used atomics, we use explicitly named memory order operations
// instead of passing the memory order as a parameter.
// -----------------------------------------------------------------------------------------------

// Back-end selection: every branch provides `mi_atomic(name)`, `mi_memory_order(name)` and
// `MI_ATOMIC_VAR_INIT(x)`; the C++ and MSVC branches additionally define `_Atomic(tp)`.
#if defined(__cplusplus)
// Use C++ atomics
#include <atomic>
#define  _Atomic(tp)              std::atomic<tp>
#define  mi_atomic(name)          std::atomic_##name
#define  mi_memory_order(name)    std::memory_order_##name
#if (__cplusplus >= 202002L)      // c++20, see issue #571
 #define MI_ATOMIC_VAR_INIT(x)    x
#elif !defined(ATOMIC_VAR_INIT)
 #define MI_ATOMIC_VAR_INIT(x)    x
#else
 #define MI_ATOMIC_VAR_INIT(x)    ATOMIC_VAR_INIT(x)
#endif
#elif defined(_MSC_VER)
// Use MSVC C wrapper for C11 atomics
#define  _Atomic(tp)              tp
#define  MI_ATOMIC_VAR_INIT(x)    x
#define  mi_atomic(name)          mi_atomic_##name
#define  mi_memory_order(name)    mi_memory_order_##name
#else
// Use C11 atomics
#include <stdatomic.h>
#define  mi_atomic(name)          atomic_##name
#define  mi_memory_order(name)    memory_order_##name
#if (__STDC_VERSION__ >= 201710L) // c17, see issue #735
 #define MI_ATOMIC_VAR_INIT(x)    x
#elif !defined(ATOMIC_VAR_INIT)
 #define MI_ATOMIC_VAR_INIT(x)    x
#else
 #define MI_ATOMIC_VAR_INIT(x)    ATOMIC_VAR_INIT(x)
#endif
#endif

// Various defines for all used memory orders in mimalloc
#define mi_atomic_cas_weak(p,expected,desired,mem_success,mem_fail)  \
  mi_atomic(compare_exchange_weak_explicit)(p,expected,desired,mem_success,mem_fail)

#define mi_atomic_cas_strong(p,expected,desired,mem_success,mem_fail)  \
  mi_atomic(compare_exchange_strong_explicit)(p,expected,desired,mem_success,mem_fail)

#define mi_atomic_load_acquire(p)                mi_atomic(load_explicit)(p,mi_memory_order(acquire))
#define mi_atomic_load_relaxed(p)                mi_atomic(load_explicit)(p,mi_memory_order(relaxed))
#define mi_atomic_store_release(p,x)             mi_atomic(store_explicit)(p,x,mi_memory_order(release))
#define mi_atomic_store_relaxed(p,x)             mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_exchange_relaxed(p,x)          mi_atomic(exchange_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_exchange_release(p,x)          mi_atomic(exchange_explicit)(p,x,mi_memory_order(release))
#define mi_atomic_exchange_acq_rel(p,x)          mi_atomic(exchange_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_cas_weak_release(p,exp,des)    mi_atomic_cas_weak(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
#define mi_atomic_cas_weak_acq_rel(p,exp,des)    mi_atomic_cas_weak(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
#define mi_atomic_cas_strong_release(p,exp,des)  mi_atomic_cas_strong(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
#define mi_atomic_cas_strong_acq_rel(p,exp,des)  mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))

#define mi_atomic_add_relaxed(p,x)               mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_sub_relaxed(p,x)               mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_add_acq_rel(p,x)               mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_sub_acq_rel(p,x)               mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_and_acq_rel(p,x)               mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_or_acq_rel(p,x)                mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(acq_rel))

#define mi_atomic_increment_relaxed(p)           mi_atomic_add_relaxed(p,(uintptr_t)1)
#define mi_atomic_decrement_relaxed(p)           mi_atomic_sub_relaxed(p,(uintptr_t)1)
#define mi_atomic_increment_acq_rel(p)           mi_atomic_add_acq_rel(p,(uintptr_t)1)
#define mi_atomic_decrement_acq_rel(p)           mi_atomic_sub_acq_rel(p,(uintptr_t)1)

// Forward declarations; the definitions follow later in this header.
static inline void mi_atomic_yield(void);
static inline intptr_t mi_atomic_addi(_Atomic(intptr_t)*p, intptr_t add);
static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub);


#if defined(__cplusplus) || !defined(_MSC_VER)

// In C++/C11 atomics we have polymorphic atomics so can use the typed `ptr` variants (where `tp` is the type of atomic value)
// We use these macros so we can provide a typed wrapper in MSVC in C compilation mode as well
#define mi_atomic_load_ptr_acquire(tp,p)                mi_atomic_load_acquire(p)
#define mi_atomic_load_ptr_relaxed(tp,p)                mi_atomic_load_relaxed(p)

// In C++ we need to add casts to help resolve templates if NULL is passed
#if defined(__cplusplus)
#define mi_atomic_store_ptr_release(tp,p,x)             mi_atomic_store_release(p,(tp*)x)
#define mi_atomic_store_ptr_relaxed(tp,p,x)             mi_atomic_store_relaxed(p,(tp*)x)
#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des)    mi_atomic_cas_weak_release(p,exp,(tp*)des)
#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des)    mi_atomic_cas_weak_acq_rel(p,exp,(tp*)des)
#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des)  mi_atomic_cas_strong_release(p,exp,(tp*)des)
#define mi_atomic_exchange_ptr_relaxed(tp,p,x)          mi_atomic_exchange_relaxed(p,(tp*)x)
#define mi_atomic_exchange_ptr_release(tp,p,x)          mi_atomic_exchange_release(p,(tp*)x)
#define mi_atomic_exchange_ptr_acq_rel(tp,p,x)          mi_atomic_exchange_acq_rel(p,(tp*)x)
#else
#define mi_atomic_store_ptr_release(tp,p,x)             mi_atomic_store_release(p,x)
#define mi_atomic_store_ptr_relaxed(tp,p,x)             mi_atomic_store_relaxed(p,x)
#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des)    mi_atomic_cas_weak_release(p,exp,des)
#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des)    mi_atomic_cas_weak_acq_rel(p,exp,des)
#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des)  mi_atomic_cas_strong_release(p,exp,des)
#define mi_atomic_exchange_ptr_relaxed(tp,p,x)          mi_atomic_exchange_relaxed(p,x)
#define mi_atomic_exchange_ptr_release(tp,p,x)          mi_atomic_exchange_release(p,x)
#define mi_atomic_exchange_ptr_acq_rel(tp,p,x)          mi_atomic_exchange_acq_rel(p,x)
#endif

// These are used by the statistics

// Atomically add `add` to `*p` (relaxed order); returns the value held before the addition.
static inline int64_t mi_atomic_addi64_relaxed(volatile int64_t* p, int64_t add) {
  return mi_atomic(fetch_add_explicit)((_Atomic(int64_t)*)p, add, mi_memory_order(relaxed));
}

// Atomically add the value loaded from `*padd` to `*p`; the write is skipped when that value is 0.
static inline void mi_atomic_void_addi64_relaxed(volatile int64_t* p, const volatile int64_t* padd) {
  const int64_t add = mi_atomic_load_relaxed((_Atomic(int64_t)*)padd);
  if (add != 0) {
    mi_atomic(fetch_add_explicit)((_Atomic(int64_t)*)p, add, mi_memory_order(relaxed));
  }
}

// Raise `*p` to `x` when `x` is larger than the stored value.
static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) {
  int64_t current = mi_atomic_load_relaxed((_Atomic(int64_t)*)p);
  // A failed compare-exchange stores the observed value in `current`, so the
  // loop re-tests `current < x` against fresh data before retrying.
  while (current < x && !mi_atomic_cas_weak_release((_Atomic(int64_t)*)p, &current, x)) { /* nothing */ };
}

// Used by timers
#define mi_atomic_loadi64_acquire(p)            mi_atomic(load_explicit)(p,mi_memory_order(acquire))
#define mi_atomic_loadi64_relaxed(p)            mi_atomic(load_explicit)(p,mi_memory_order(relaxed))
#define mi_atomic_storei64_release(p,x)         mi_atomic(store_explicit)(p,x,mi_memory_order(release))
#define mi_atomic_storei64_relaxed(p,x)         mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed))

#define mi_atomic_casi64_strong_acq_rel(p,e,d)  mi_atomic_cas_strong_acq_rel(p,e,d)
#define mi_atomic_addi64_acq_rel(p,i)           mi_atomic_add_acq_rel(p,i)


#elif defined(_MSC_VER)

// Legacy MSVC plain C compilation wrapper that uses Interlocked operations to model C11 atomics.
#include <intrin.h>
#ifdef _WIN64
typedef LONG64   msc_intptr_t;
#define MI_64(f) f##64
#else
typedef LONG     msc_intptr_t;
#define MI_64(f) f
#endif

typedef enum mi_memory_order_e {
  mi_memory_order_relaxed,
  mi_memory_order_consume,
  mi_memory_order_acquire,
  mi_memory_order_release,
  mi_memory_order_acq_rel,
  mi_memory_order_seq_cst
} mi_memory_order;

// The read-modify-write wrappers below ignore the requested memory order and
// forward to the corresponding Interlocked intrinsic; each returns the previous value.
static inline uintptr_t mi_atomic_fetch_add_explicit(_Atomic(uintptr_t)*p, uintptr_t add, mi_memory_order mo) {
  (void)(mo);
  return (uintptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add);
}
static inline uintptr_t mi_atomic_fetch_sub_explicit(_Atomic(uintptr_t)*p, uintptr_t sub, mi_memory_order mo) {
  (void)(mo);
  return (uintptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, -((msc_intptr_t)sub));
}
static inline uintptr_t mi_atomic_fetch_and_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) {
  (void)(mo);
  return (uintptr_t)MI_64(_InterlockedAnd)((volatile msc_intptr_t*)p, (msc_intptr_t)x);
}
static inline uintptr_t mi_atomic_fetch_or_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) {
  (void)(mo);
  return (uintptr_t)MI_64(_InterlockedOr)((volatile msc_intptr_t*)p, (msc_intptr_t)x);
}
// Returns true when `*p` equalled `*expected` and was replaced by `desired`;
// otherwise stores the value that was read into `*expected` and returns false.
static inline bool mi_atomic_compare_exchange_strong_explicit(_Atomic(uintptr_t)*p, uintptr_t* expected, uintptr_t desired, mi_memory_order mo1, mi_memory_order mo2) {
  (void)(mo1); (void)(mo2);
  uintptr_t read = (uintptr_t)MI_64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)(*expected));
  if (read == *expected) {
    return true;
  }
  else {
    *expected = read;
    return false;
  }
}
// The weak variant simply forwards to the strong one.
static inline bool mi_atomic_compare_exchange_weak_explicit(_Atomic(uintptr_t)*p, uintptr_t* expected, uintptr_t desired, mi_memory_order mo1, mi_memory_order mo2) {
  return mi_atomic_compare_exchange_strong_explicit(p, expected, desired, mo1, mo2);
}
static inline uintptr_t mi_atomic_exchange_explicit(_Atomic(uintptr_t)*p, uintptr_t exchange, mi_memory_order mo) {
  (void)(mo);
  return (uintptr_t)MI_64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange);
}
// Modelled as an interlocked exchange on a local dummy variable.
static inline void mi_atomic_thread_fence(mi_memory_order mo) {
  (void)(mo);
  _Atomic(uintptr_t) x = 0;
  mi_atomic_exchange_explicit(&x, 1, mo);
}
// Plain read on x86/x64; elsewhere a non-relaxed load goes through a compare-exchange of x with x.
static inline uintptr_t mi_atomic_load_explicit(_Atomic(uintptr_t) const* p, mi_memory_order mo) {
  (void)(mo);
#if defined(_M_IX86) || defined(_M_X64)
  return *p;
#else
  uintptr_t x = *p;
  if (mo > mi_memory_order_relaxed) {
    while (!mi_atomic_compare_exchange_weak_explicit((_Atomic(uintptr_t)*)p, &x, x, mo, mi_memory_order_relaxed)) { /* nothing */ };
  }
  return x;
#endif
}
// Plain write on x86/x64; elsewhere an interlocked exchange.
static inline void mi_atomic_store_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) {
  (void)(mo);
#if defined(_M_IX86) || defined(_M_X64)
  *p = x;
#else
  mi_atomic_exchange_explicit(p, x, mo);
#endif
}
// 64-bit load: plain read on x64, otherwise a compare-exchange loop.
static inline int64_t mi_atomic_loadi64_explicit(_Atomic(int64_t)*p, mi_memory_order mo) {
  (void)(mo);
#if defined(_M_X64)
  return *p;
#else
  int64_t old = *p;
  int64_t x = old;
  while ((old = InterlockedCompareExchange64(p, x, old)) != x) {
    x = old;
  }
  return x;
#endif
}
// 64-bit store: plain write on x64, otherwise an interlocked exchange.
// NOTE(review): `x_M_IX86` is not the spelling used elsewhere in this file (`_M_IX86`), so as
// written only x64 takes the plain-store path; presumably deliberate for 32-bit x86 -- confirm.
static inline void mi_atomic_storei64_explicit(_Atomic(int64_t)*p, int64_t x, mi_memory_order mo) {
  (void)(mo);
#if defined(x_M_IX86) || defined(_M_X64)
  *p = x;
#else
  InterlockedExchange64(p, x);
#endif
}

// These are used by the statistics
static inline int64_t mi_atomic_addi64_relaxed(volatile _Atomic(int64_t)*p, int64_t add) {
#ifdef _WIN64
  return (int64_t)mi_atomic_addi((int64_t*)p, add);
#else
  int64_t current;
  int64_t sum;
  do {
    current = *p;
    sum = current + add;
  } while (_InterlockedCompareExchange64(p, sum, current) != current);
  return current;
#endif
}
static inline void mi_atomic_void_addi64_relaxed(volatile int64_t* p, const volatile int64_t* padd) {
  const int64_t add = *padd;
  if (add != 0) {
    mi_atomic_addi64_relaxed((volatile _Atomic(int64_t)*)p, add);
  }
}

static inline void mi_atomic_maxi64_relaxed(volatile _Atomic(int64_t)*p, int64_t x) {
  int64_t current;
  do {
    current = *p;
  } while (current < x && _InterlockedCompareExchange64(p, x, current) != current);
}

static inline void mi_atomic_addi64_acq_rel(volatile _Atomic(int64_t*)p, int64_t i) {
  mi_atomic_addi64_relaxed(p, i);
}

static inline bool mi_atomic_casi64_strong_acq_rel(volatile _Atomic(int64_t*)p, int64_t* exp, int64_t des) {
  int64_t read = _InterlockedCompareExchange64(p, des, *exp);
  if (read == *exp) {
    return true;
  }
  else {
    *exp = read;
    return false;
  }
}

// The pointer macros cast to `uintptr_t`.
#define mi_atomic_load_ptr_acquire(tp,p)                (tp*)mi_atomic_load_acquire((_Atomic(uintptr_t)*)(p))
#define mi_atomic_load_ptr_relaxed(tp,p)                (tp*)mi_atomic_load_relaxed((_Atomic(uintptr_t)*)(p))
#define mi_atomic_store_ptr_release(tp,p,x)             mi_atomic_store_release((_Atomic(uintptr_t)*)(p),(uintptr_t)(x))
#define mi_atomic_store_ptr_relaxed(tp,p,x)             mi_atomic_store_relaxed((_Atomic(uintptr_t)*)(p),(uintptr_t)(x))
#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des)    mi_atomic_cas_weak_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des)    mi_atomic_cas_weak_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des)  mi_atomic_cas_strong_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
#define mi_atomic_exchange_ptr_relaxed(tp,p,x)          (tp*)mi_atomic_exchange_relaxed((_Atomic(uintptr_t)*)(p),(uintptr_t)x)
#define mi_atomic_exchange_ptr_release(tp,p,x)          (tp*)mi_atomic_exchange_release((_Atomic(uintptr_t)*)(p),(uintptr_t)x)
#define mi_atomic_exchange_ptr_acq_rel(tp,p,x)          (tp*)mi_atomic_exchange_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t)x)

#define mi_atomic_loadi64_acquire(p)    mi_atomic(loadi64_explicit)(p,mi_memory_order(acquire))
#define mi_atomic_loadi64_relaxed(p)    mi_atomic(loadi64_explicit)(p,mi_memory_order(relaxed))
#define mi_atomic_storei64_release(p,x) mi_atomic(storei64_explicit)(p,x,mi_memory_order(release))
#define mi_atomic_storei64_relaxed(p,x) mi_atomic(storei64_explicit)(p,x,mi_memory_order(relaxed))


#endif


// Atomically add a signed value; returns the previous value.
static inline intptr_t mi_atomic_addi(_Atomic(intptr_t)*p, intptr_t add) {
  return (intptr_t)mi_atomic_add_acq_rel((_Atomic(uintptr_t)*)p, (uintptr_t)add);
}

// Atomically subtract a signed value; returns the previous value.
static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub) { return (intptr_t)mi_atomic_addi(p, -sub); } // ---------------------------------------------------------------------- // Once and Guard // ---------------------------------------------------------------------- typedef _Atomic(uintptr_t) mi_atomic_once_t; // Returns true only on the first invocation static inline bool mi_atomic_once( mi_atomic_once_t* once ) { if (mi_atomic_load_relaxed(once) != 0) return false; // quick test uintptr_t expected = 0; return mi_atomic_cas_strong_acq_rel(once, &expected, (uintptr_t)1); // try to set to 1 } typedef _Atomic(uintptr_t) mi_atomic_guard_t; // Allows only one thread to execute at a time #define mi_atomic_guard(guard) \ uintptr_t _mi_guard_expected = 0; \ for(bool _mi_guard_once = true; \ _mi_guard_once && mi_atomic_cas_strong_acq_rel(guard,&_mi_guard_expected,(uintptr_t)1); \ (mi_atomic_store_release(guard,(uintptr_t)0), _mi_guard_once = false) ) // ---------------------------------------------------------------------- // Yield // ---------------------------------------------------------------------- #if defined(__cplusplus) #include static inline void mi_atomic_yield(void) { std::this_thread::yield(); } #elif defined(_WIN32) static inline void mi_atomic_yield(void) { YieldProcessor(); } #elif defined(__SSE2__) #include static inline void mi_atomic_yield(void) { _mm_pause(); } #elif (defined(__GNUC__) || defined(__clang__)) && \ (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__armel__) || defined(__ARMEL__) || \ defined(__aarch64__) || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)) || defined(__POWERPC__) #if defined(__x86_64__) || defined(__i386__) static inline void mi_atomic_yield(void) { __asm__ volatile ("pause" ::: "memory"); } #elif defined(__aarch64__) static inline void mi_atomic_yield(void) { __asm__ volatile("wfe"); } #elif (defined(__arm__) && __ARM_ARCH__ >= 7) static inline void 
mi_atomic_yield(void) { __asm__ volatile("yield" ::: "memory"); } #elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__POWERPC__) #ifdef __APPLE__ static inline void mi_atomic_yield(void) { __asm__ volatile ("or r27,r27,r27" ::: "memory"); } #else static inline void mi_atomic_yield(void) { __asm__ __volatile__ ("or 27,27,27" ::: "memory"); } #endif #elif defined(__armel__) || defined(__ARMEL__) static inline void mi_atomic_yield(void) { __asm__ volatile ("nop" ::: "memory"); } #endif #elif defined(__sun) // Fallback for other archs #include static inline void mi_atomic_yield(void) { smt_pause(); } #elif defined(__wasi__) #include static inline void mi_atomic_yield(void) { sched_yield(); } #else #include static inline void mi_atomic_yield(void) { sleep(0); } #endif // ---------------------------------------------------------------------- // Locks // These do not have to be recursive and should be light-weight // in-process only locks. Only used for reserving arena's and to // maintain the abandoned list. 
// ---------------------------------------------------------------------- #if _MSC_VER #pragma warning(disable:26110) // unlock with holding lock #endif #define mi_lock(lock) for(bool _go = (mi_lock_acquire(lock),true); _go; (mi_lock_release(lock), _go=false) ) #if defined(_WIN32) #if 1 #define mi_lock_t SRWLOCK // slim reader-writer lock static inline bool mi_lock_try_acquire(mi_lock_t* lock) { return TryAcquireSRWLockExclusive(lock); } static inline void mi_lock_acquire(mi_lock_t* lock) { AcquireSRWLockExclusive(lock); } static inline void mi_lock_release(mi_lock_t* lock) { ReleaseSRWLockExclusive(lock); } static inline void mi_lock_init(mi_lock_t* lock) { InitializeSRWLock(lock); } static inline void mi_lock_done(mi_lock_t* lock) { (void)(lock); } #else #define mi_lock_t CRITICAL_SECTION static inline bool mi_lock_try_acquire(mi_lock_t* lock) { return TryEnterCriticalSection(lock); } static inline void mi_lock_acquire(mi_lock_t* lock) { EnterCriticalSection(lock); } static inline void mi_lock_release(mi_lock_t* lock) { LeaveCriticalSection(lock); } static inline void mi_lock_init(mi_lock_t* lock) { InitializeCriticalSection(lock); } static inline void mi_lock_done(mi_lock_t* lock) { DeleteCriticalSection(lock); } #endif #elif defined(MI_USE_PTHREADS) void _mi_error_message(int err, const char* fmt, ...); #define mi_lock_t pthread_mutex_t static inline bool mi_lock_try_acquire(mi_lock_t* lock) { return (pthread_mutex_trylock(lock) == 0); } static inline void mi_lock_acquire(mi_lock_t* lock) { const int err = pthread_mutex_lock(lock); if (err != 0) { _mi_error_message(err, "internal error: lock cannot be acquired\n"); } } static inline void mi_lock_release(mi_lock_t* lock) { pthread_mutex_unlock(lock); } static inline void mi_lock_init(mi_lock_t* lock) { pthread_mutex_init(lock, NULL); } static inline void mi_lock_done(mi_lock_t* lock) { pthread_mutex_destroy(lock); } #elif defined(__cplusplus) #include #define mi_lock_t std::mutex static inline bool 
mi_lock_try_acquire(mi_lock_t* lock) { return lock->try_lock(); } static inline void mi_lock_acquire(mi_lock_t* lock) { lock->lock(); } static inline void mi_lock_release(mi_lock_t* lock) { lock->unlock(); } static inline void mi_lock_init(mi_lock_t* lock) { (void)(lock); } static inline void mi_lock_done(mi_lock_t* lock) { (void)(lock); } #else // fall back to poor man's locks. // this should only be the case in a single-threaded environment (like __wasi__) #define mi_lock_t _Atomic(uintptr_t) static inline bool mi_lock_try_acquire(mi_lock_t* lock) { uintptr_t expected = 0; return mi_atomic_cas_strong_acq_rel(lock, &expected, (uintptr_t)1); } static inline void mi_lock_acquire(mi_lock_t* lock) { for (int i = 0; i < 1000; i++) { // for at most 1000 tries? if (mi_lock_try_acquire(lock)) return; mi_atomic_yield(); } } static inline void mi_lock_release(mi_lock_t* lock) { mi_atomic_store_release(lock, (uintptr_t)0); } static inline void mi_lock_init(mi_lock_t* lock) { mi_lock_release(lock); } static inline void mi_lock_done(mi_lock_t* lock) { (void)(lock); } #endif #endif // __MIMALLOC_ATOMIC_H ================================================ FILE: third-party/mimalloc/include/mimalloc/internal.h ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2023, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #pragma once #ifndef MIMALLOC_INTERNAL_H #define MIMALLOC_INTERNAL_H // -------------------------------------------------------------------------- // This file contains the internal API's of mimalloc and various utility // functions and macros. 
// -------------------------------------------------------------------------- #include "types.h" #include "track.h" #if (MI_DEBUG>0) #define mi_trace_message(...) _mi_trace_message(__VA_ARGS__) #else #define mi_trace_message(...) #endif #define MI_CACHE_LINE 64 #if defined(_MSC_VER) #pragma warning(disable:4127) // suppress constant conditional warning (due to MI_SECURE paths) #pragma warning(disable:26812) // unscoped enum warning #define mi_decl_noinline __declspec(noinline) #define mi_decl_thread __declspec(thread) #define mi_decl_cache_align __declspec(align(MI_CACHE_LINE)) #define mi_decl_weak #define mi_decl_hidden #elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc #define mi_decl_noinline __attribute__((noinline)) #define mi_decl_thread __thread #define mi_decl_cache_align __attribute__((aligned(MI_CACHE_LINE))) #define mi_decl_weak __attribute__((weak)) #define mi_decl_hidden __attribute__((visibility("hidden"))) #elif __cplusplus >= 201103L // c++11 #define mi_decl_noinline #define mi_decl_thread thread_local #define mi_decl_cache_align alignas(MI_CACHE_LINE) #define mi_decl_weak #define mi_decl_hidden #else #define mi_decl_noinline #define mi_decl_thread __thread // hope for the best :-) #define mi_decl_cache_align #define mi_decl_weak #define mi_decl_hidden #endif #if defined(__EMSCRIPTEN__) && !defined(__wasi__) #define __wasi__ #endif #if defined(__cplusplus) #define mi_decl_externc extern "C" #else #define mi_decl_externc #endif // "libc.c" #include int _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args); int _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...); char _mi_toupper(char c); int _mi_strnicmp(const char* s, const char* t, size_t n); void _mi_strlcpy(char* dest, const char* src, size_t dest_size); void _mi_strlcat(char* dest, const char* src, size_t dest_size); size_t _mi_strlen(const char* s); size_t _mi_strnlen(const char* s, size_t max_len); bool _mi_getenv(const char* 
name, char* result, size_t result_size); // "options.c" void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message); void _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...); void _mi_message(const char* fmt, ...); void _mi_warning_message(const char* fmt, ...); void _mi_verbose_message(const char* fmt, ...); void _mi_trace_message(const char* fmt, ...); void _mi_options_init(void); long _mi_option_get_fast(mi_option_t option); void _mi_error_message(int err, const char* fmt, ...); // random.c void _mi_random_init(mi_random_ctx_t* ctx); void _mi_random_init_weak(mi_random_ctx_t* ctx); void _mi_random_reinit_if_weak(mi_random_ctx_t * ctx); void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx); uintptr_t _mi_random_next(mi_random_ctx_t* ctx); uintptr_t _mi_heap_random_next(mi_heap_t* heap); uintptr_t _mi_os_random_weak(uintptr_t extra_seed); static inline uintptr_t _mi_random_shuffle(uintptr_t x); // init.c extern mi_decl_cache_align mi_stats_t _mi_stats_main; extern mi_decl_hidden mi_decl_cache_align const mi_page_t _mi_page_empty; void _mi_process_load(void); void mi_cdecl _mi_process_done(void); bool _mi_is_redirected(void); bool _mi_allocator_init(const char** message); void _mi_allocator_done(void); bool _mi_is_main_thread(void); size_t _mi_current_thread_count(void); bool _mi_preloading(void); // true while the C runtime is not initialized yet void _mi_thread_done(mi_heap_t* heap); void _mi_thread_data_collect(void); void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap); mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id); void _mi_heap_guarded_init(mi_heap_t* heap); // os.c void _mi_os_init(void); // called from process init void* _mi_os_alloc(size_t size, mi_memid_t* memid); void _mi_os_free(void* p, size_t size, mi_memid_t memid); void _mi_os_free_ex(void* p, size_t 
size, bool still_committed, mi_memid_t memid); size_t _mi_os_page_size(void); size_t _mi_os_good_alloc_size(size_t size); bool _mi_os_has_overcommit(void); bool _mi_os_has_virtual_reserve(void); bool _mi_os_reset(void* addr, size_t size); bool _mi_os_commit(void* p, size_t size, bool* is_zero); bool _mi_os_decommit(void* addr, size_t size); bool _mi_os_protect(void* addr, size_t size); bool _mi_os_unprotect(void* addr, size_t size); bool _mi_os_purge(void* p, size_t size); bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, size_t stat_size); void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid); void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid); void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size); bool _mi_os_use_large_page(size_t size, size_t alignment); size_t _mi_os_large_page_size(void); void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid); // arena.c mi_arena_id_t _mi_arena_id_none(void); void _mi_arena_free(void* p, size_t size, size_t still_committed_size, mi_memid_t memid); void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid); void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid); bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id); bool _mi_arena_contains(const void* p); void _mi_arenas_collect(bool force_purge); void _mi_arena_unsafe_destroy_all(void); bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment); void _mi_arena_segment_mark_abandoned(mi_segment_t* segment); void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid); void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size); typedef 
struct mi_arena_field_cursor_s { // abstract struct size_t os_list_count; // max entries to visit in the OS abandoned list size_t start; // start arena idx (may need to be wrapped) size_t end; // end arena idx (exclusive, may need to be wrapped) size_t bitmap_idx; // current bit idx for an arena mi_subproc_t* subproc; // only visit blocks in this sub-process bool visit_all; // ensure all abandoned blocks are seen (blocking) bool hold_visit_lock; // if the subproc->abandoned_os_visit_lock is held } mi_arena_field_cursor_t; void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, bool visit_all, mi_arena_field_cursor_t* current); mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous); void _mi_arena_field_cursor_done(mi_arena_field_cursor_t* current); // "segment-map.c" void _mi_segment_map_allocated_at(const mi_segment_t* segment); void _mi_segment_map_freed_at(const mi_segment_t* segment); void _mi_segment_map_unsafe_destroy(void); // "segment.c" mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld); void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld); void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld); bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segments_tld_t* tld); void _mi_segment_collect(mi_segment_t* segment, bool force); #if MI_HUGE_PAGE_ABANDON void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); #else void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); #endif uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size); // page start for any page void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld); void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld); bool _mi_segment_attempt_reclaim(mi_heap_t* heap, 
mi_segment_t* segment); bool _mi_segment_visit_blocks(mi_segment_t* segment, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); // "page.c" void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc; void _mi_page_retire(mi_page_t* page) mi_attr_noexcept; // free the page if there are no other pages with many free blocks void _mi_page_unfull(mi_page_t* page); void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force); // free the page void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq); // abandon the page, to be picked up by another thread... void _mi_page_force_abandon(mi_page_t* page); void _mi_heap_delayed_free_all(mi_heap_t* heap); bool _mi_heap_delayed_free_partial(mi_heap_t* heap); void _mi_heap_collect_retired(mi_heap_t* heap, bool force); void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never); bool _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never); size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append); void _mi_deferred_free(mi_heap_t* heap, bool force); void _mi_page_free_collect(mi_page_t* page,bool force); void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page); // callback from segments size_t _mi_bin_size(size_t bin); // for stats size_t _mi_bin(size_t size); // for stats // "heap.c" void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag); void _mi_heap_destroy_pages(mi_heap_t* heap); void _mi_heap_collect_abandon(mi_heap_t* heap); void _mi_heap_set_default_direct(mi_heap_t* heap); bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid); void _mi_heap_unsafe_destroy_all(mi_heap_t* heap); mi_heap_t* _mi_heap_by_tag(mi_heap_t* heap, uint8_t tag); void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page); bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, 
mi_block_visit_fun* visitor, void* arg); // "stats.c" void _mi_stats_done(mi_stats_t* stats); mi_msecs_t _mi_clock_now(void); mi_msecs_t _mi_clock_end(mi_msecs_t start); mi_msecs_t _mi_clock_start(void); // "alloc.c" void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept; // called from `_mi_malloc_generic` void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` void* _mi_page_malloc_zeroed(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept; void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept; mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p); bool _mi_free_delayed_block(mi_block_t* block); void _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size); #if MI_DEBUG>1 bool _mi_page_is_valid(mi_page_t* page); #endif // ------------------------------------------------------ // Branches // ------------------------------------------------------ #if defined(__GNUC__) || defined(__clang__) #define mi_unlikely(x) (__builtin_expect(!!(x),false)) #define mi_likely(x) (__builtin_expect(!!(x),true)) #elif (defined(__cplusplus) && (__cplusplus >= 202002L)) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L) #define mi_unlikely(x) (x) [[unlikely]] #define mi_likely(x) (x) [[likely]] #else #define mi_unlikely(x) (x) #define mi_likely(x) (x) #endif #ifndef __has_builtin #define __has_builtin(x) 0 #endif /* 
----------------------------------------------------------- Error codes passed to `_mi_fatal_error` All are recoverable but EFAULT is a serious error and aborts by default in secure mode. For portability define undefined error codes using common Unix codes: ----------------------------------------------------------- */ #include #ifndef EAGAIN // double free #define EAGAIN (11) #endif #ifndef ENOMEM // out of memory #define ENOMEM (12) #endif #ifndef EFAULT // corrupted free-list or meta-data #define EFAULT (14) #endif #ifndef EINVAL // trying to free an invalid pointer #define EINVAL (22) #endif #ifndef EOVERFLOW // count*size overflow #define EOVERFLOW (75) #endif /* ----------------------------------------------------------- Inlined definitions ----------------------------------------------------------- */ #define MI_UNUSED(x) (void)(x) #if (MI_DEBUG>0) #define MI_UNUSED_RELEASE(x) #else #define MI_UNUSED_RELEASE(x) MI_UNUSED(x) #endif #define MI_INIT4(x) x(),x(),x(),x() #define MI_INIT8(x) MI_INIT4(x),MI_INIT4(x) #define MI_INIT16(x) MI_INIT8(x),MI_INIT8(x) #define MI_INIT32(x) MI_INIT16(x),MI_INIT16(x) #define MI_INIT64(x) MI_INIT32(x),MI_INIT32(x) #define MI_INIT128(x) MI_INIT64(x),MI_INIT64(x) #define MI_INIT256(x) MI_INIT128(x),MI_INIT128(x) #define MI_INIT74(x) MI_INIT64(x),MI_INIT8(x),x(),x() #include // initialize a local variable to zero; use memset as compilers optimize constant sized memset's #define _mi_memzero_var(x) memset(&x,0,sizeof(x)) // Is `x` a power of two? (0 is considered a power of two) static inline bool _mi_is_power_of_two(uintptr_t x) { return ((x & (x - 1)) == 0); } // Is a pointer aligned? static inline bool _mi_is_aligned(void* p, size_t alignment) { mi_assert_internal(alignment != 0); return (((uintptr_t)p % alignment) == 0); } // Align upwards static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) { mi_assert_internal(alignment != 0); uintptr_t mask = alignment - 1; if ((alignment & mask) == 0) { // power of two? 
return ((sz + mask) & ~mask); } else { return (((sz + mask)/alignment)*alignment); } } // Align downwards static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) { mi_assert_internal(alignment != 0); uintptr_t mask = alignment - 1; if ((alignment & mask) == 0) { // power of two? return (sz & ~mask); } else { return ((sz / alignment) * alignment); } } // Align a pointer upwards static inline void* mi_align_up_ptr(void* p, size_t alignment) { return (void*)_mi_align_up((uintptr_t)p, alignment); } // Align a pointer downwards static inline void* mi_align_down_ptr(void* p, size_t alignment) { return (void*)_mi_align_down((uintptr_t)p, alignment); } // Divide upwards: `s <= _mi_divide_up(s,d)*d < s+d`. static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) { mi_assert_internal(divider != 0); return (divider == 0 ? size : ((size + divider - 1) / divider)); } // clamp an integer static inline size_t _mi_clamp(size_t sz, size_t min, size_t max) { if (sz < min) return min; else if (sz > max) return max; else return sz; } // Is memory zero initialized? static inline bool mi_mem_is_zero(const void* p, size_t size) { for (size_t i = 0; i < size; i++) { if (((uint8_t*)p)[i] != 0) return false; } return true; } // Align a byte size to a size in _machine words_, // i.e. byte size == `wsize*sizeof(void*)`. 
static inline size_t _mi_wsize_from_size(size_t size) {
  // the rounding below adds `sizeof(uintptr_t) - 1`, so `size` must leave room for that
  mi_assert_internal(size <= SIZE_MAX - sizeof(uintptr_t));
  return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t);
}

// Overflow detecting multiply
// `mi_mul_overflow` stores `count*size` in `*total` and returns `true` if the
// multiplication overflowed `size_t`.
#if __has_builtin(__builtin_umul_overflow) || (defined(__GNUC__) && (__GNUC__ >= 5))
#include <limits.h>  // UINT_MAX, ULONG_MAX
#if defined(_CLOCK_T)  // for Illumos
#undef _CLOCK_T
#endif
static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
  // pick the builtin whose operand type has the same width as `size_t`
  #if (SIZE_MAX == ULONG_MAX)
    return __builtin_umull_overflow(count, size, (unsigned long *)total);
  #elif (SIZE_MAX == UINT_MAX)
    return __builtin_umul_overflow(count, size, (unsigned int *)total);
  #else
    return __builtin_umulll_overflow(count, size, (unsigned long long *)total);
  #endif
}
#else /* __builtin_umul_overflow is unavailable */
static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
  #define MI_MUL_COULD_OVERFLOW ((size_t)1 << (4*sizeof(size_t)))  // sqrt(SIZE_MAX)
  *total = count * size;
  // the division is only evaluated when at least one operand reaches sqrt(SIZE_MAX)
  // note: gcc/clang optimize this to directly check the overflow flag
  return ((size >= MI_MUL_COULD_OVERFLOW || count >= MI_MUL_COULD_OVERFLOW) && size > 0 && (SIZE_MAX / size) < count);
}
#endif

// Safe multiply `count*size` into `total`; return `true` on overflow.
static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* total) { if (count==1) { // quick check for the case where count is one (common for C++ allocators) *total = size; return false; } else if mi_unlikely(mi_mul_overflow(count, size, total)) { #if MI_DEBUG > 0 _mi_error_message(EOVERFLOW, "allocation request is too large (%zu * %zu bytes)\n", count, size); #endif *total = SIZE_MAX; return true; } else return false; } /*---------------------------------------------------------------------------------------- Heap functions ------------------------------------------------------------------------------------------- */ extern mi_decl_hidden const mi_heap_t _mi_heap_empty; // read-only empty heap, initial value of the thread local default heap static inline bool mi_heap_is_backing(const mi_heap_t* heap) { return (heap->tld->heap_backing == heap); } static inline bool mi_heap_is_initialized(mi_heap_t* heap) { mi_assert_internal(heap != NULL); return (heap != NULL && heap != &_mi_heap_empty); } static inline uintptr_t _mi_ptr_cookie(const void* p) { extern mi_decl_hidden mi_heap_t _mi_heap_main; mi_assert_internal(_mi_heap_main.cookie != 0); return ((uintptr_t)p ^ _mi_heap_main.cookie); } /* ----------------------------------------------------------- Pages ----------------------------------------------------------- */ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t size) { mi_assert_internal(size <= (MI_SMALL_SIZE_MAX + MI_PADDING_SIZE)); const size_t idx = _mi_wsize_from_size(size); mi_assert_internal(idx < MI_PAGES_DIRECT); return heap->pages_free_direct[idx]; } // Segment that contains the pointer // Large aligned blocks may be aligned at N*MI_SEGMENT_SIZE (inside a huge segment > MI_SEGMENT_SIZE), // and we need align "down" to the segment info which is `MI_SEGMENT_SIZE` bytes before it; // therefore we align one byte before `p`. // We check for NULL afterwards on 64-bit systems to improve codegen for `mi_free`. 
static inline mi_segment_t* _mi_ptr_segment(const void* p) {
  // mask the address of the byte *before* `p`, so a pointer lying exactly on a
  // segment boundary resolves to the segment that precedes it
  mi_segment_t* const segment = (mi_segment_t*)(((uintptr_t)p - 1) & ~MI_SEGMENT_MASK);
  #if MI_INTPTR_SIZE <= 4
  return (p==NULL ? NULL : segment);  // small pointers: test `p` itself
  #else
  return ((intptr_t)segment <= 0 ? NULL : segment);  // otherwise: any result that is <= 0 as a signed value yields NULL
  #endif
}

// View a slice as a page. The slice must be the first one of its run
// (asserted: `slice_offset == 0` and `slice_count > 0`).
static inline mi_page_t* mi_slice_to_page(mi_slice_t* s) {
  mi_assert_internal(s->slice_offset== 0 && s->slice_count > 0);
  return (mi_page_t*)(s);
}

// View a page as its slice; the same first-slice invariant is asserted.
static inline mi_slice_t* mi_page_to_slice(mi_page_t* p) {
  mi_assert_internal(p->slice_offset== 0 && p->slice_count > 0);
  return (mi_slice_t*)(p);
}

// Segment belonging to a page
// In debug builds this also checks that the page lies inside the segment's `slices` array.
static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) {
  mi_assert_internal(page!=NULL);
  mi_segment_t* segment = _mi_ptr_segment(page);
  mi_assert_internal(segment == NULL || ((mi_slice_t*)page >= segment->slices && (mi_slice_t*)page < segment->slices + segment->slice_entries));
  return segment;
}

// Step back from any slice to the first slice of its run.
// `slice_offset` is applied as a byte offset (note the `uint8_t*` arithmetic).
static inline mi_slice_t* mi_slice_first(const mi_slice_t* slice) {
  mi_slice_t* start = (mi_slice_t*)((uint8_t*)slice - slice->slice_offset);
  mi_assert_internal(start >= _mi_ptr_segment(slice)->slices);
  mi_assert_internal(start->slice_offset == 0);
  mi_assert_internal(start + start->slice_count > slice);
  return start;
}

// Get the page containing the pointer (performance critical as it is called in mi_free)
static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) {
  mi_assert_internal(p > (void*)segment);
  ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment;  // byte offset of `p` within the segment
  mi_assert_internal(diff > 0 && diff <= (ptrdiff_t)MI_SEGMENT_SIZE);
  size_t idx = (size_t)diff >> MI_SEGMENT_SLICE_SHIFT;  // index of the slice covering that offset
  mi_assert_internal(idx <= segment->slice_entries);
  mi_slice_t* slice0 = (mi_slice_t*)&segment->slices[idx];
  mi_slice_t* slice = mi_slice_first(slice0);  // adjust to the block that holds the page data
  mi_assert_internal(slice->slice_offset == 0);
  mi_assert_internal(slice >= segment->slices && slice < segment->slices + segment->slice_entries);
  return mi_slice_to_page(slice);
}
// Quick page start for initialized pages static inline uint8_t* mi_page_start(const mi_page_t* page) { mi_assert_internal(page->page_start != NULL); mi_assert_expensive(_mi_segment_page_start(_mi_page_segment(page),page,NULL) == page->page_start); return page->page_start; } // Get the page containing the pointer static inline mi_page_t* _mi_ptr_page(void* p) { mi_assert_internal(p!=NULL); return _mi_segment_page_of(_mi_ptr_segment(p), p); } // Get the block size of a page (special case for huge objects) static inline size_t mi_page_block_size(const mi_page_t* page) { mi_assert_internal(page->block_size > 0); return page->block_size; } static inline bool mi_page_is_huge(const mi_page_t* page) { mi_assert_internal((page->is_huge && _mi_page_segment(page)->kind == MI_SEGMENT_HUGE) || (!page->is_huge && _mi_page_segment(page)->kind != MI_SEGMENT_HUGE)); return page->is_huge; } // Get the usable block size of a page without fixed padding. // This may still include internal padding due to alignment and rounding up size classes. 
static inline size_t mi_page_usable_block_size(const mi_page_t* page) { return mi_page_block_size(page) - MI_PADDING_SIZE; } // size of a segment static inline size_t mi_segment_size(mi_segment_t* segment) { return segment->segment_slices * MI_SEGMENT_SLICE_SIZE; } static inline uint8_t* mi_segment_end(mi_segment_t* segment) { return (uint8_t*)segment + mi_segment_size(segment); } // Thread free access static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) { return (mi_block_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & ~3); } static inline mi_delayed_t mi_page_thread_free_flag(const mi_page_t* page) { return (mi_delayed_t)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & 3); } // Heap access static inline mi_heap_t* mi_page_heap(const mi_page_t* page) { return (mi_heap_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xheap)); } static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING); mi_atomic_store_release(&page->xheap,(uintptr_t)heap); if (heap != NULL) { page->heap_tag = heap->tag; } } // Thread free flag helpers static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) { return (mi_block_t*)(tf & ~0x03); } static inline mi_delayed_t mi_tf_delayed(mi_thread_free_t tf) { return (mi_delayed_t)(tf & 0x03); } static inline mi_thread_free_t mi_tf_make(mi_block_t* block, mi_delayed_t delayed) { return (mi_thread_free_t)((uintptr_t)block | (uintptr_t)delayed); } static inline mi_thread_free_t mi_tf_set_delayed(mi_thread_free_t tf, mi_delayed_t delayed) { return mi_tf_make(mi_tf_block(tf),delayed); } static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t* block) { return mi_tf_make(block, mi_tf_delayed(tf)); } // are all blocks in a page freed? // note: needs up-to-date used count, (as the `xthread_free` list may not be empty). see `_mi_page_collect_free`. 
static inline bool mi_page_all_free(const mi_page_t* page) { mi_assert_internal(page != NULL); return (page->used == 0); } // are there any available blocks? static inline bool mi_page_has_any_available(const mi_page_t* page) { mi_assert_internal(page != NULL && page->reserved > 0); return (page->used < page->reserved || (mi_page_thread_free(page) != NULL)); } // are there immediately available blocks, i.e. blocks available on the free list. static inline bool mi_page_immediate_available(const mi_page_t* page) { mi_assert_internal(page != NULL); return (page->free != NULL); } // is more than 7/8th of a page in use? static inline bool mi_page_is_mostly_used(const mi_page_t* page) { if (page==NULL) return true; uint16_t frac = page->reserved / 8U; return (page->reserved - page->used <= frac); } static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size) { return &((mi_heap_t*)heap)->pages[_mi_bin(size)]; } //----------------------------------------------------------- // Page flags //----------------------------------------------------------- static inline bool mi_page_is_in_full(const mi_page_t* page) { return page->flags.x.in_full; } static inline void mi_page_set_in_full(mi_page_t* page, bool in_full) { page->flags.x.in_full = in_full; } static inline bool mi_page_has_aligned(const mi_page_t* page) { return page->flags.x.has_aligned; } static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) { page->flags.x.has_aligned = has_aligned; } /* ------------------------------------------------------------------- Guarded objects ------------------------------------------------------------------- */ #if MI_GUARDED static inline bool mi_block_ptr_is_guarded(const mi_block_t* block, const void* p) { const ptrdiff_t offset = (uint8_t*)p - (uint8_t*)block; return (offset >= (ptrdiff_t)(sizeof(mi_block_t)) && block->next == MI_BLOCK_TAG_GUARDED); } static inline bool mi_heap_malloc_use_guarded(mi_heap_t* heap, size_t size) { // this code 
is written to result in fast assembly as it is on the hot path for allocation const size_t count = heap->guarded_sample_count - 1; // if the rate was 0, this will underflow and count for a long time.. if mi_likely(count != 0) { // no sample heap->guarded_sample_count = count; return false; } else if (size >= heap->guarded_size_min && size <= heap->guarded_size_max) { // use guarded allocation heap->guarded_sample_count = heap->guarded_sample_rate; // reset return (heap->guarded_sample_rate != 0); } else { // failed size criteria, rewind count (but don't write to an empty heap) if (heap->guarded_sample_rate != 0) { heap->guarded_sample_count = 1; } return false; } } mi_decl_restrict void* _mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept; #endif /* ------------------------------------------------------------------- Encoding/Decoding the free list next pointers This is to protect against buffer overflow exploits where the free list is mutated. Many hardened allocators xor the next pointer `p` with a secret key `k1`, as `p^k1`. This prevents overwriting with known values but might be still too weak: if the attacker can guess the pointer `p` this can reveal `k1` (since `p^k1^p == k1`). Moreover, if multiple blocks can be read as well, the attacker can xor both as `(p1^k1) ^ (p2^k1) == p1^p2` which may reveal a lot about the pointers (and subsequently `k1`). Instead mimalloc uses an extra key `k2` and encodes as `((p^k2)<<> (MI_INTPTR_BITS - shift)))); } static inline uintptr_t mi_rotr(uintptr_t x, uintptr_t shift) { shift %= MI_INTPTR_BITS; return (shift==0 ? x : ((x >> shift) | (x << (MI_INTPTR_BITS - shift)))); } static inline void* mi_ptr_decode(const void* null, const mi_encoded_t x, const uintptr_t* keys) { void* p = (void*)(mi_rotr(x - keys[0], keys[0]) ^ keys[1]); return (p==null ? NULL : p); } static inline mi_encoded_t mi_ptr_encode(const void* null, const void* p, const uintptr_t* keys) { uintptr_t x = (uintptr_t)(p==NULL ? 
null : p); return mi_rotl(x ^ keys[1], keys[0]) + keys[0]; } static inline uint32_t mi_ptr_encode_canary(const void* null, const void* p, const uintptr_t* keys) { const uint32_t x = (uint32_t)(mi_ptr_encode(null,p,keys)); // make the lowest byte 0 to prevent spurious read overflows which could be a security issue (issue #951) #ifdef MI_BIG_ENDIAN return (x & 0x00FFFFFF); #else return (x & 0xFFFFFF00); #endif } static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* block, const uintptr_t* keys ) { mi_track_mem_defined(block,sizeof(mi_block_t)); mi_block_t* next; #ifdef MI_ENCODE_FREELIST next = (mi_block_t*)mi_ptr_decode(null, block->next, keys); #else MI_UNUSED(keys); MI_UNUSED(null); next = (mi_block_t*)block->next; #endif mi_track_mem_noaccess(block,sizeof(mi_block_t)); return next; } static inline void mi_block_set_nextx(const void* null, mi_block_t* block, const mi_block_t* next, const uintptr_t* keys) { mi_track_mem_undefined(block,sizeof(mi_block_t)); #ifdef MI_ENCODE_FREELIST block->next = mi_ptr_encode(null, next, keys); #else MI_UNUSED(keys); MI_UNUSED(null); block->next = (mi_encoded_t)next; #endif mi_track_mem_noaccess(block,sizeof(mi_block_t)); } static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t* block) { #ifdef MI_ENCODE_FREELIST mi_block_t* next = mi_block_nextx(page,block,page->keys); // check for free list corruption: is `next` at least in the same page? // TODO: check if `next` is `page->block_size` aligned? 
if mi_unlikely(next!=NULL && !mi_is_in_same_page(block, next)) { _mi_error_message(EFAULT, "corrupted free list entry of size %zub at %p: value 0x%zx\n", mi_page_block_size(page), block, (uintptr_t)next); next = NULL; } return next; #else MI_UNUSED(page); return mi_block_nextx(page,block,NULL); #endif } static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, const mi_block_t* next) { #ifdef MI_ENCODE_FREELIST mi_block_set_nextx(page,block,next, page->keys); #else MI_UNUSED(page); mi_block_set_nextx(page,block,next,NULL); #endif } // ------------------------------------------------------------------- // commit mask // ------------------------------------------------------------------- static inline void mi_commit_mask_create_empty(mi_commit_mask_t* cm) { for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { cm->mask[i] = 0; } } static inline void mi_commit_mask_create_full(mi_commit_mask_t* cm) { for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { cm->mask[i] = ~((size_t)0); } } static inline bool mi_commit_mask_is_empty(const mi_commit_mask_t* cm) { for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { if (cm->mask[i] != 0) return false; } return true; } static inline bool mi_commit_mask_is_full(const mi_commit_mask_t* cm) { for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { if (cm->mask[i] != ~((size_t)0)) return false; } return true; } // defined in `segment.c`: size_t _mi_commit_mask_committed_size(const mi_commit_mask_t* cm, size_t total); size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx); #define mi_commit_mask_foreach(cm,idx,count) \ idx = 0; \ while ((count = _mi_commit_mask_next_run(cm,&idx)) > 0) { #define mi_commit_mask_foreach_end() \ idx += count; \ } /* ----------------------------------------------------------- memory id's ----------------------------------------------------------- */ static inline mi_memid_t _mi_memid_create(mi_memkind_t memkind) { mi_memid_t memid; 
_mi_memzero_var(memid); memid.memkind = memkind; return memid; } static inline mi_memid_t _mi_memid_none(void) { return _mi_memid_create(MI_MEM_NONE); } static inline mi_memid_t _mi_memid_create_os(bool committed, bool is_zero, bool is_large) { mi_memid_t memid = _mi_memid_create(MI_MEM_OS); memid.initially_committed = committed; memid.initially_zero = is_zero; memid.is_pinned = is_large; return memid; } // ------------------------------------------------------------------- // Fast "random" shuffle // ------------------------------------------------------------------- static inline uintptr_t _mi_random_shuffle(uintptr_t x) { if (x==0) { x = 17; } // ensure we don't get stuck in generating zeros #if (MI_INTPTR_SIZE>=8) // by Sebastiano Vigna, see: x ^= x >> 30; x *= 0xbf58476d1ce4e5b9UL; x ^= x >> 27; x *= 0x94d049bb133111ebUL; x ^= x >> 31; #elif (MI_INTPTR_SIZE==4) // by Chris Wellons, see: x ^= x >> 16; x *= 0x7feb352dUL; x ^= x >> 15; x *= 0x846ca68bUL; x ^= x >> 16; #endif return x; } // ------------------------------------------------------------------- // Optimize numa node access for the common case (= one node) // ------------------------------------------------------------------- int _mi_os_numa_node_get(void); size_t _mi_os_numa_node_count_get(void); extern mi_decl_hidden _Atomic(size_t) _mi_numa_node_count; static inline int _mi_os_numa_node(void) { if mi_likely(mi_atomic_load_relaxed(&_mi_numa_node_count) == 1) { return 0; } else return _mi_os_numa_node_get(); } static inline size_t _mi_os_numa_node_count(void) { const size_t count = mi_atomic_load_relaxed(&_mi_numa_node_count); if mi_likely(count > 0) { return count; } else return _mi_os_numa_node_count_get(); } // ----------------------------------------------------------------------- // Count bits: trailing or leading zeros (with MI_INTPTR_BITS on all zero) // ----------------------------------------------------------------------- #if defined(__GNUC__) #include // LONG_MAX #define 
MI_HAVE_FAST_BITSCAN static inline size_t mi_clz(size_t x) { if (x==0) return MI_SIZE_BITS; #if (SIZE_MAX == ULONG_MAX) return __builtin_clzl(x); #else return __builtin_clzll(x); #endif } static inline size_t mi_ctz(size_t x) { if (x==0) return MI_SIZE_BITS; #if (SIZE_MAX == ULONG_MAX) return __builtin_ctzl(x); #else return __builtin_ctzll(x); #endif } #elif defined(_MSC_VER) #include // LONG_MAX #include // BitScanReverse64 #define MI_HAVE_FAST_BITSCAN static inline size_t mi_clz(size_t x) { if (x==0) return MI_SIZE_BITS; unsigned long idx; #if (SIZE_MAX == ULONG_MAX) _BitScanReverse(&idx, x); #else _BitScanReverse64(&idx, x); #endif return ((MI_SIZE_BITS - 1) - idx); } static inline size_t mi_ctz(size_t x) { if (x==0) return MI_SIZE_BITS; unsigned long idx; #if (SIZE_MAX == ULONG_MAX) _BitScanForward(&idx, x); #else _BitScanForward64(&idx, x); #endif return idx; } #else static inline size_t mi_ctz_generic32(uint32_t x) { // de Bruijn multiplication, see static const uint8_t debruijn[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 }; if (x==0) return 32; return debruijn[(uint32_t)((x & -(int32_t)x) * (uint32_t)(0x077CB531U)) >> 27]; } static inline size_t mi_clz_generic32(uint32_t x) { // de Bruijn multiplication, see static const uint8_t debruijn[32] = { 31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1, 23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0 }; if (x==0) return 32; x |= x >> 1; x |= x >> 2; x |= x >> 4; x |= x >> 8; x |= x >> 16; return debruijn[(uint32_t)(x * (uint32_t)(0x07C4ACDDU)) >> 27]; } static inline size_t mi_ctz(size_t x) { if (x==0) return MI_SIZE_BITS; #if (MI_SIZE_BITS <= 32) return mi_ctz_generic32((uint32_t)x); #else const uint32_t lo = (uint32_t)x; if (lo != 0) { return mi_ctz_generic32(lo); } else { return (32 + mi_ctz_generic32((uint32_t)(x>>32))); } #endif } static inline size_t mi_clz(size_t x) { if (x==0) return MI_SIZE_BITS; #if 
(MI_SIZE_BITS <= 32) return mi_clz_generic32((uint32_t)x); #else const uint32_t hi = (uint32_t)(x>>32); if (hi != 0) { return mi_clz_generic32(hi); } else { return 32 + mi_clz_generic32((uint32_t)x); } #endif } #endif // "bit scan reverse": Return index of the highest bit (or MI_SIZE_BITS if `x` is zero) static inline size_t mi_bsr(size_t x) { return (x==0 ? MI_SIZE_BITS : MI_SIZE_BITS - 1 - mi_clz(x)); } // --------------------------------------------------------------------------------- // Provide our own `_mi_memcpy` for potential performance optimizations. // // For now, only on Windows with msvc/clang-cl we optimize to `rep movsb` if // we happen to run on x86/x64 cpu's that have "fast short rep movsb" (FSRM) support // (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017). See also issue #201 and pr #253. // --------------------------------------------------------------------------------- #if !MI_TRACK_ENABLED && defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64)) #include extern bool _mi_cpu_has_fsrm; extern bool _mi_cpu_has_erms; static inline void _mi_memcpy(void* dst, const void* src, size_t n) { if ((_mi_cpu_has_fsrm && n <= 128) || (_mi_cpu_has_erms && n > 128)) { __movsb((unsigned char*)dst, (const unsigned char*)src, n); } else { memcpy(dst, src, n); } } static inline void _mi_memzero(void* dst, size_t n) { if ((_mi_cpu_has_fsrm && n <= 128) || (_mi_cpu_has_erms && n > 128)) { __stosb((unsigned char*)dst, 0, n); } else { memset(dst, 0, n); } } #else static inline void _mi_memcpy(void* dst, const void* src, size_t n) { memcpy(dst, src, n); } static inline void _mi_memzero(void* dst, size_t n) { memset(dst, 0, n); } #endif // ------------------------------------------------------------------------------- // The `_mi_memcpy_aligned` can be used if the pointers are machine-word aligned // This is used for example in `mi_realloc`. 
// ------------------------------------------------------------------------------- #if (defined(__GNUC__) && (__GNUC__ >= 4)) || defined(__clang__) // On GCC/CLang we provide a hint that the pointers are word aligned. static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) { mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0)); void* adst = __builtin_assume_aligned(dst, MI_INTPTR_SIZE); const void* asrc = __builtin_assume_aligned(src, MI_INTPTR_SIZE); _mi_memcpy(adst, asrc, n); } static inline void _mi_memzero_aligned(void* dst, size_t n) { mi_assert_internal((uintptr_t)dst % MI_INTPTR_SIZE == 0); void* adst = __builtin_assume_aligned(dst, MI_INTPTR_SIZE); _mi_memzero(adst, n); } #else // Default fallback on `_mi_memcpy` static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) { mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0)); _mi_memcpy(dst, src, n); } static inline void _mi_memzero_aligned(void* dst, size_t n) { mi_assert_internal((uintptr_t)dst % MI_INTPTR_SIZE == 0); _mi_memzero(dst, n); } #endif #endif ================================================ FILE: third-party/mimalloc/include/mimalloc/prim.h ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #pragma once #ifndef MIMALLOC_PRIM_H #define MIMALLOC_PRIM_H // -------------------------------------------------------------------------- // This file specifies the primitive portability API. 
// Each OS/host needs to implement these primitives, see `src/prim`
// for implementations on Windows, macOS, WASI, and Linux/Unix.
//
// note: on all primitive functions, we always have result parameters != NULL, and:
//  addr != NULL and page aligned
//  size > 0     and page aligned
//  the return value is an error code as an `int` where 0 is success
// --------------------------------------------------------------------------

// OS memory configuration
// (filled in through the `config` out-parameter of `_mi_prim_mem_init`)
typedef struct mi_os_mem_config_s {
  size_t  page_size;              // default to 4KiB
  size_t  large_page_size;        // 0 if not supported, usually 2MiB (4MiB on Windows)
  size_t  alloc_granularity;      // smallest allocation size (usually 4KiB, on Windows 64KiB)
  size_t  physical_memory_in_kib; // physical memory size in KiB
  size_t  virtual_address_bits;   // usually 48 or 56 bits on 64-bit systems. (used to determine secure randomization)
  bool    has_overcommit;         // can we reserve more memory than can be actually committed?
  bool    has_partial_free;       // can allocated blocks be freed partially? (true for mmap, false for VirtualAlloc)
  bool    has_virtual_reserve;    // supports virtual address space reservation? (if true we can reserve virtual address space without using commit or physical memory)
} mi_os_mem_config_t;

// Initialize
void _mi_prim_mem_init( mi_os_mem_config_t* config );

// Free OS memory
// Returns an error code, or 0 on success (see the note at the top of this file).
int _mi_prim_free(void* addr, size_t size );

// Allocate OS memory. Return NULL on error.
// The `try_alignment` is just a hint and the returned pointer does not have to be aligned.
// If `commit` is false, the virtual memory range only needs to be reserved (with no access)
// which will later be committed explicitly using `_mi_prim_commit`.
// `is_zero` is set to true if the memory was zero initialized (as on most OS's)
// The `hint_addr` address is either `NULL` or a preferred allocation address but can be ignored.
// The allocated address is passed back through `addr`; `is_large` is a result parameter as well.
// pre: !commit => !allow_large
//      try_alignment >= _mi_os_page_size() and a power of 2
int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr);

// Commit memory. Returns error code or 0 on success.
// For example, on Linux this would make the memory PROT_READ|PROT_WRITE.
// `is_zero` is set to true if the memory was zero initialized (e.g. on Windows)
int _mi_prim_commit(void* addr, size_t size, bool* is_zero);

// Decommit memory. Returns error code or 0 on success. The `needs_recommit` result is true
// if the memory would need to be re-committed. For example, on Windows this is always true,
// but on Linux we could use MADV_DONTNEED to decommit which does not need a recommit.
// pre: needs_recommit != NULL
int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit);

// Reset memory. The range keeps being accessible but the content might be reset.
// Returns error code or 0 on success.
int _mi_prim_reset(void* addr, size_t size);

// Protect memory. Returns error code or 0 on success.
int _mi_prim_protect(void* addr, size_t size, bool protect);

// Allocate huge (1GiB) pages possibly associated with a NUMA node.
// `is_zero` is set to true if the memory was zero initialized (as on most OS's)
// pre: size > 0  and a multiple of 1GiB.
//      numa_node is either negative (don't care), or a numa node number.
int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr);

// Return the current NUMA node
size_t _mi_prim_numa_node(void);

// Return the number of logical NUMA nodes
size_t _mi_prim_numa_node_count(void);

// Clock ticks
mi_msecs_t _mi_prim_clock_now(void);

// Return process information (only for statistics)
typedef struct mi_process_info_s {
  mi_msecs_t  elapsed;
  mi_msecs_t  utime;
  mi_msecs_t  stime;
  size_t      current_rss;
  size_t      peak_rss;
  size_t      current_commit;
  size_t      peak_commit;
  size_t      page_faults;
} mi_process_info_t;

// Results are written to `*pinfo`.
void _mi_prim_process_info(mi_process_info_t* pinfo);

// Default stderr output. (only for warnings etc. with verbose enabled)
// msg != NULL && _mi_strlen(msg) > 0
void _mi_prim_out_stderr( const char* msg );

// Get an environment variable. (only for options)
// name != NULL, result != NULL, result_size >= 64
bool _mi_prim_getenv(const char* name, char* result, size_t result_size);


// Fill a buffer with strong randomness; return `false` on error or if
// there is no strong randomization available.
bool _mi_prim_random_buf(void* buf, size_t buf_len);

// Called on the first thread start, and should ensure `_mi_thread_done` is called on thread termination.
void _mi_prim_thread_init_auto_done(void);

// Called on process exit and may take action to clean up resources associated with the thread auto done.
void _mi_prim_thread_done_auto_done(void);

// Called when the default heap for a thread changes
void _mi_prim_thread_associate_default_heap(mi_heap_t* heap);


//-------------------------------------------------------------------
// Access to TLS (thread local storage) slots.
// We need fast access to both a unique thread id (in `free.c:mi_free`) and
// to a thread-local heap pointer (in `alloc.c:mi_malloc`).
// To achieve this we use specialized code for various platforms.
//-------------------------------------------------------------------

// On some libc + platform combinations we can directly access a thread-local storage (TLS) slot.
// The TLS layout depends on both the OS and libc implementation so we use specific tests for each main platform.
// If you test on another platform and it works please send a PR :-)
// see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register.
//
// Note: we would like to prefer `__builtin_thread_pointer()` nowadays instead of using assembly,
// but unfortunately we can not detect support reliably (see issue #883)
// We also use it on Apple OS as we use a TLS slot for the default heap there.
#if defined(__GNUC__) && ( \
           (defined(__GLIBC__)   && (defined(__x86_64__) || defined(__i386__) || (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__))) \
        || (defined(__APPLE__)   && (defined(__x86_64__) || defined(__aarch64__) || defined(__POWERPC__))) \
        || (defined(__BIONIC__)  && (defined(__x86_64__) || defined(__i386__) || (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__))) \
        || (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
        || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
      )

#define MI_HAS_TLS_SLOT    1

// Read the pointer stored in TLS slot number `slot`.
// Here `slot` is an index: the x86 variants load from byte offset `slot*sizeof(void*)`
// relative to the GS/FS segment, the ARM variants read the thread register into `tcb`
// and return `tcb[slot]`, and Apple PowerPC goes through `pthread_getspecific`.
static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept {
  void* res;
  const size_t ofs = (slot*sizeof(void*));
  #if defined(__i386__)
    __asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86 32-bit always uses GS
  #elif defined(__APPLE__) && defined(__x86_64__)
    __asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 macOSX uses GS
  #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
    __asm__("movl %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x32 ABI
  #elif defined(__x86_64__)
    __asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 Linux, BSD uses FS
  #elif defined(__arm__)
    void** tcb; MI_UNUSED(ofs);
    // read the thread register and clear its low 2 bits (the `bic ... #3`)
    __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
    res = tcb[slot];
  #elif defined(__aarch64__)
    void** tcb; MI_UNUSED(ofs);
    #if defined(__APPLE__) // M1, issue #343
    // Apple reads `tpidrro_el0` and clears the low 3 bits (the `bic ... #7`)
    __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
    #else
    __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
    #endif
    res = tcb[slot];
  #elif defined(__APPLE__) && defined(__POWERPC__) // ppc, issue #781
    MI_UNUSED(ofs);
    res = pthread_getspecific(slot);
  #endif
  return res;
}

// setting a tls slot is only used on macOS for now
// Mirrors `mi_prim_tls_slot`: stores `value` into TLS slot number `slot`
// using the same per-platform addressing.
static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexcept {
  const size_t ofs = (slot*sizeof(void*));
  #if defined(__i386__)
    __asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // 32-bit always uses GS
  #elif defined(__APPLE__) && defined(__x86_64__)
    __asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 macOS uses GS
  #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
    __asm__("movl %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x32 ABI
  #elif defined(__x86_64__)
    __asm__("movq %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 Linux, BSD uses FS
  #elif defined(__arm__)
    void** tcb; MI_UNUSED(ofs);
    __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
    tcb[slot] = value;
  #elif defined(__aarch64__)
    void** tcb; MI_UNUSED(ofs);
    #if defined(__APPLE__) // M1, issue #343
    __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
    #else
    __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
    #endif
    tcb[slot] = value;
  #elif defined(__APPLE__) && defined(__POWERPC__) // ppc, issue #781
    MI_UNUSED(ofs);
    pthread_setspecific(slot, value);
  #endif
}

#elif _WIN32 && MI_WIN_USE_FIXED_TLS && !defined(MI_WIN_USE_FLS)

// On windows we can store the thread-local heap at a fixed TLS slot to avoid
// thread-local initialization checks in the fast path. This uses a fixed location
// in the TCB though (last user-reserved slot by default) which may clash with other applications.

#define MI_HAS_TLS_SLOT  2              // 2 = we can reliably initialize the slot (saving a test on each malloc)

#if MI_WIN_USE_FIXED_TLS > 1
#define MI_TLS_SLOT     (MI_WIN_USE_FIXED_TLS)
#elif MI_SIZE_SIZE == 4
#define MI_TLS_SLOT     (0x710)         // Last user-reserved slot
// #define MI_TLS_SLOT  (0xF0C)         // Last TlsSlot (might clash with other app reserved slot)
#else
#define MI_TLS_SLOT     (0x888)         // Last user-reserved slot
// #define MI_TLS_SLOT  (0x1678)        // Last TlsSlot (might clash with other app reserved slot)
#endif

// Read the pointer stored at `slot` in the TEB.
// Note: unlike the variant above, `slot` is used as a byte offset here: it is passed
// unscaled to `__readgsqword`/`__readfsdword`, and divided by `sizeof(void*)` when
// indexing the TEB as a pointer array.
static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept {
  #if (_M_X64 || _M_AMD64) && !defined(_M_ARM64EC)
  return (void*)__readgsqword((unsigned long)slot);   // direct load at offset from gs
  #elif _M_IX86 && !defined(_M_ARM64EC)
  return (void*)__readfsdword((unsigned long)slot);   // direct load at offset from fs
  #else
  return ((void**)NtCurrentTeb())[slot / sizeof(void*)];
  #endif
}

// Store `value` at byte offset `slot` in the TEB (always via `NtCurrentTeb`).
static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexcept {
  ((void**)NtCurrentTeb())[slot / sizeof(void*)] = value;
}

#endif



//-------------------------------------------------------------------
// Get a fast unique thread id.
//
// Getting the thread id should be performant as it is called in the
// fast path of `_mi_free` and we specialize for various platforms as
// inlined definitions. Regular code should call `init.c:_mi_thread_id()`.
// We only require _mi_prim_thread_id() to return a unique id
// for each thread (unequal to zero).
//-------------------------------------------------------------------


// Do we have __builtin_thread_pointer? This would be the preferred way to get a unique thread id
// but unfortunately, it seems we cannot test for this reliably at this time (see issue #883)
// Nevertheless, it seems needed on older graviton platforms (see issue #851).
// For now, we only enable this for specific platforms.
#if !defined(__APPLE__)  /* on apple (M1) the wrong register is read (tpidr_el0 instead of tpidrro_el0) so fall back to TLS slot assembly */ \
    && !defined(__CYGWIN__) \
    && !defined(MI_LIBC_MUSL) \
    && (!defined(__clang_major__) || __clang_major__ >= 14)  /* older clang versions emit bad code; fall back to using the TLS slot */
  #if    (defined(__GNUC__) && (__GNUC__ >= 7)  && defined(__aarch64__)) /* aarch64 for older gcc versions (issue #851) */ \
      || (defined(__GNUC__) && (__GNUC__ >= 11) && defined(__x86_64__)) \
      || (defined(__clang_major__) && (__clang_major__ >= 14) && (defined(__aarch64__) || defined(__x86_64__)))
    // when set, `_mi_prim_thread_id` below is implemented with `__builtin_thread_pointer()`
    #define MI_USE_BUILTIN_THREAD_POINTER  1
  #endif
#endif



// defined in `init.c`; do not use these directly
extern mi_decl_thread mi_heap_t* _mi_heap_default;  // default heap to allocate from
extern bool _mi_process_is_initialized;             // has mi_process_init been called?

// forward declaration; exactly one of the platform variants below provides the definition
static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept;

// Get a unique id for the current thread.
#if defined(MI_PRIM_THREAD_ID) static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { return MI_PRIM_THREAD_ID(); // used for example by CPython for a free threaded build (see python/cpython#115488) } #elif defined(_WIN32) static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { // Windows: works on Intel and ARM in both 32- and 64-bit return (uintptr_t)NtCurrentTeb(); } #elif MI_USE_BUILTIN_THREAD_POINTER static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { // Works on most Unix based platforms with recent compilers return (uintptr_t)__builtin_thread_pointer(); } #elif MI_HAS_TLS_SLOT static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { #if defined(__BIONIC__) // issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id // see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86 return (uintptr_t)mi_prim_tls_slot(1); #else // in all our other targets, slot 0 is the thread id // glibc: https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/x86_64/nptl/tls.h // apple: https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L36 return (uintptr_t)mi_prim_tls_slot(0); #endif } #else // otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms). static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { return (uintptr_t)&_mi_heap_default; } #endif /* ---------------------------------------------------------------------------------------- Get the thread local default heap: `_mi_prim_get_default_heap()` This is inlined here as it is on the fast path for allocation functions. On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a __thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures that the storage will always be available (allocated on the thread stacks). 
On some platforms though we cannot use that when overriding `malloc` since the underlying TLS implementation (or the loader) will call itself `malloc` on a first access and recurse. We try to circumvent this in an efficient way: - macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the loader itself calls `malloc` even before the modules are initialized. - OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS). - DragonFly: defaults are working but seem slow compared to freeBSD (see PR #323) ------------------------------------------------------------------------------------------- */ static inline mi_heap_t* mi_prim_get_default_heap(void); #if defined(MI_MALLOC_OVERRIDE) #if defined(__APPLE__) // macOS #define MI_TLS_SLOT 89 // seems unused? // other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89) // see #elif defined(__OpenBSD__) // use end bytes of a name; goes wrong if anyone uses names > 23 characters (ptrhread specifies 16) // see #define MI_TLS_PTHREAD_SLOT_OFS (6*sizeof(int) + 4*sizeof(void*) + 24) // #elif defined(__DragonFly__) // #warning "mimalloc is not working correctly on DragonFly yet." // #define MI_TLS_PTHREAD_SLOT_OFS (4 + 1*sizeof(void*)) // offset `uniqueid` (also used by gdb?) 
#elif defined(__ANDROID__) // See issue #381 #define MI_TLS_PTHREAD #endif #endif #if MI_TLS_SLOT # if !defined(MI_HAS_TLS_SLOT) # error "trying to use a TLS slot for the default heap, but the mi_prim_tls_slot primitives are not defined" # endif static inline mi_heap_t* mi_prim_get_default_heap(void) { mi_heap_t* heap = (mi_heap_t*)mi_prim_tls_slot(MI_TLS_SLOT); #if MI_HAS_TLS_SLOT == 1 // check if the TLS slot is initialized if mi_unlikely(heap == NULL) { #ifdef __GNUC__ __asm(""); // prevent conditional load of the address of _mi_heap_empty #endif heap = (mi_heap_t*)&_mi_heap_empty; } #endif return heap; } #elif defined(MI_TLS_PTHREAD_SLOT_OFS) static inline mi_heap_t** mi_prim_tls_pthread_heap_slot(void) { pthread_t self = pthread_self(); #if defined(__DragonFly__) if (self==NULL) return NULL; #endif return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS); } static inline mi_heap_t* mi_prim_get_default_heap(void) { mi_heap_t** pheap = mi_prim_tls_pthread_heap_slot(); if mi_unlikely(pheap == NULL) return _mi_heap_main_get(); mi_heap_t* heap = *pheap; if mi_unlikely(heap == NULL) return (mi_heap_t*)&_mi_heap_empty; return heap; } #elif defined(MI_TLS_PTHREAD) extern pthread_key_t _mi_heap_default_key; static inline mi_heap_t* mi_prim_get_default_heap(void) { mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key)); return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); } #else // default using a thread local variable; used on most platforms. 
static inline mi_heap_t* mi_prim_get_default_heap(void) { #if defined(MI_TLS_RECURSE_GUARD) if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get(); #endif return _mi_heap_default; } #endif // mi_prim_get_default_heap() #endif // MIMALLOC_PRIM_H ================================================ FILE: third-party/mimalloc/include/mimalloc/track.h ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2023, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #pragma once #ifndef MIMALLOC_TRACK_H #define MIMALLOC_TRACK_H /* ------------------------------------------------------------------------------------------------------ Track memory ranges with macros for tools like Valgrind address sanitizer, or other memory checkers. These can be defined for tracking allocation: #define mi_track_malloc_size(p,reqsize,size,zero) #define mi_track_free_size(p,_size) The macros are set up such that the size passed to `mi_track_free_size` always matches the size of `mi_track_malloc_size`. (currently, `size == mi_usable_size(p)`). The `reqsize` is what the user requested, and `size >= reqsize`. The `size` is either byte precise (and `size==reqsize`) if `MI_PADDING` is enabled, or otherwise it is the usable block size which may be larger than the original request. Use `_mi_block_size_of(void* p)` to get the full block size that was allocated (including padding etc). The `zero` parameter is `true` if the allocated block is zero initialized. 
Optional: #define mi_track_align(p,alignedp,offset,size) #define mi_track_resize(p,oldsize,newsize) #define mi_track_init() The `mi_track_align` is called right after a `mi_track_malloc` for aligned pointers in a block. The corresponding `mi_track_free` still uses the block start pointer and original size (corresponding to the `mi_track_malloc`). The `mi_track_resize` is currently unused but could be called on reallocations within a block. `mi_track_init` is called at program start. The following macros are for tools like asan and valgrind to track whether memory is defined, undefined, or not accessible at all: #define mi_track_mem_defined(p,size) #define mi_track_mem_undefined(p,size) #define mi_track_mem_noaccess(p,size) -------------------------------------------------------------------------------------------------------*/ #if MI_TRACK_VALGRIND // valgrind tool #define MI_TRACK_ENABLED 1 #define MI_TRACK_HEAP_DESTROY 1 // track free of individual blocks on heap_destroy #define MI_TRACK_TOOL "valgrind" #include #include #define mi_track_malloc_size(p,reqsize,size,zero) VALGRIND_MALLOCLIKE_BLOCK(p,size,MI_PADDING_SIZE /*red zone*/,zero) #define mi_track_free_size(p,_size) VALGRIND_FREELIKE_BLOCK(p,MI_PADDING_SIZE /*red zone*/) #define mi_track_resize(p,oldsize,newsize) VALGRIND_RESIZEINPLACE_BLOCK(p,oldsize,newsize,MI_PADDING_SIZE /*red zone*/) #define mi_track_mem_defined(p,size) VALGRIND_MAKE_MEM_DEFINED(p,size) #define mi_track_mem_undefined(p,size) VALGRIND_MAKE_MEM_UNDEFINED(p,size) #define mi_track_mem_noaccess(p,size) VALGRIND_MAKE_MEM_NOACCESS(p,size) #elif MI_TRACK_ASAN // address sanitizer #define MI_TRACK_ENABLED 1 #define MI_TRACK_HEAP_DESTROY 0 #define MI_TRACK_TOOL "asan" #include #define mi_track_malloc_size(p,reqsize,size,zero) ASAN_UNPOISON_MEMORY_REGION(p,size) #define mi_track_free_size(p,size) ASAN_POISON_MEMORY_REGION(p,size) #define mi_track_mem_defined(p,size) ASAN_UNPOISON_MEMORY_REGION(p,size) #define mi_track_mem_undefined(p,size) 
ASAN_UNPOISON_MEMORY_REGION(p,size) #define mi_track_mem_noaccess(p,size) ASAN_POISON_MEMORY_REGION(p,size) #elif MI_TRACK_ETW // windows event tracing #define MI_TRACK_ENABLED 1 #define MI_TRACK_HEAP_DESTROY 1 #define MI_TRACK_TOOL "ETW" #include "../src/prim/windows/etw.h" #define mi_track_init() EventRegistermicrosoft_windows_mimalloc(); #define mi_track_malloc_size(p,reqsize,size,zero) EventWriteETW_MI_ALLOC((UINT64)(p), size) #define mi_track_free_size(p,size) EventWriteETW_MI_FREE((UINT64)(p), size) #else // no tracking #define MI_TRACK_ENABLED 0 #define MI_TRACK_HEAP_DESTROY 0 #define MI_TRACK_TOOL "none" #define mi_track_malloc_size(p,reqsize,size,zero) #define mi_track_free_size(p,_size) #endif // ------------------- // Utility definitions #ifndef mi_track_resize #define mi_track_resize(p,oldsize,newsize) mi_track_free_size(p,oldsize); mi_track_malloc(p,newsize,false) #endif #ifndef mi_track_align #define mi_track_align(p,alignedp,offset,size) mi_track_mem_noaccess(p,offset) #endif #ifndef mi_track_init #define mi_track_init() #endif #ifndef mi_track_mem_defined #define mi_track_mem_defined(p,size) #endif #ifndef mi_track_mem_undefined #define mi_track_mem_undefined(p,size) #endif #ifndef mi_track_mem_noaccess #define mi_track_mem_noaccess(p,size) #endif #if MI_PADDING #define mi_track_malloc(p,reqsize,zero) \ if ((p)!=NULL) { \ mi_assert_internal(mi_usable_size(p)==(reqsize)); \ mi_track_malloc_size(p,reqsize,reqsize,zero); \ } #else #define mi_track_malloc(p,reqsize,zero) \ if ((p)!=NULL) { \ mi_assert_internal(mi_usable_size(p)>=(reqsize)); \ mi_track_malloc_size(p,reqsize,mi_usable_size(p),zero); \ } #endif #endif ================================================ FILE: third-party/mimalloc/include/mimalloc/types.h ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or 
modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #pragma once #ifndef MIMALLOC_TYPES_H #define MIMALLOC_TYPES_H // -------------------------------------------------------------------------- // This file contains the main type definitions for mimalloc: // mi_heap_t : all data for a thread-local heap, contains // lists of all managed heap pages. // mi_segment_t : a larger chunk of memory (32GiB) from where pages // are allocated. A segment is divided in slices (64KiB) from // which pages are allocated. // mi_page_t : a "mimalloc" page (usually 64KiB or 512KiB) from // where objects are allocated. // Note: we write "OS page" for OS memory pages while // using plain "page" for mimalloc pages (`mi_page_t`). // -------------------------------------------------------------------------- #include #include // ptrdiff_t #include // uintptr_t, uint16_t, etc #include "atomic.h" // _Atomic #ifdef _MSC_VER #pragma warning(disable:4214) // bitfield is not int #endif // Minimal alignment necessary. On most platforms 16 bytes are needed // due to SSE registers for example. This must be at least `sizeof(void*)` #ifndef MI_MAX_ALIGN_SIZE #define MI_MAX_ALIGN_SIZE 16 // sizeof(max_align_t) #endif // ------------------------------------------------------ // Variants // ------------------------------------------------------ // Define NDEBUG in the release version to disable assertions. // #define NDEBUG // Define MI_TRACK_ to enable tracking support // #define MI_TRACK_VALGRIND 1 // #define MI_TRACK_ASAN 1 // #define MI_TRACK_ETW 1 // Define MI_STAT as 1 to maintain statistics; set it to 2 to have detailed statistics (but costs some performance). 
// #define MI_STAT 1 // Define MI_SECURE to enable security mitigations // #define MI_SECURE 1 // guard page around metadata // #define MI_SECURE 2 // guard page around each mimalloc page // #define MI_SECURE 3 // encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free) // #define MI_SECURE 4 // checks for double free. (may be more expensive) #if !defined(MI_SECURE) #define MI_SECURE 0 #endif // Define MI_DEBUG for debug mode // #define MI_DEBUG 1 // basic assertion checks and statistics, check double free, corrupted free list, and invalid pointer free. // #define MI_DEBUG 2 // + internal assertion checks // #define MI_DEBUG 3 // + extensive internal invariant checking (cmake -DMI_DEBUG_FULL=ON) #if !defined(MI_DEBUG) #if !defined(NDEBUG) || defined(_DEBUG) #define MI_DEBUG 2 #else #define MI_DEBUG 0 #endif #endif // Use guard pages behind objects of a certain size (set by the MIMALLOC_DEBUG_GUARDED_MIN/MAX options) // Padding should be disabled when using guard pages // #define MI_GUARDED 1 #if defined(MI_GUARDED) #define MI_PADDING 0 #endif // Reserve extra padding at the end of each block to be more resilient against heap block overflows. // The padding can detect buffer overflow on free. #if !defined(MI_PADDING) && (MI_SECURE>=3 || MI_DEBUG>=1 || (MI_TRACK_VALGRIND || MI_TRACK_ASAN || MI_TRACK_ETW)) #define MI_PADDING 1 #endif // Check padding bytes; allows byte-precise buffer overflow detection #if !defined(MI_PADDING_CHECK) && MI_PADDING && (MI_SECURE>=3 || MI_DEBUG>=1) #define MI_PADDING_CHECK 1 #endif // Encoded free lists allow detection of corrupted free lists // and can detect buffer overflows, modify after free, and double `free`s. #if (MI_SECURE>=3 || MI_DEBUG>=1) #define MI_ENCODE_FREELIST 1 #endif // We used to abandon huge pages in order to eagerly deallocate it if freed from another thread. // Unfortunately, that makes it not possible to visit them during a heap walk or include them in a // `mi_heap_destroy`. 
We therefore instead reset/decommit the huge blocks nowadays if freed from // another thread so the memory becomes "virtually" available (and eventually gets properly freed by // the owning thread). // #define MI_HUGE_PAGE_ABANDON 1 // ------------------------------------------------------ // Platform specific values // ------------------------------------------------------ // ------------------------------------------------------ // Size of a pointer. // We assume that `sizeof(void*)==sizeof(intptr_t)` // and it holds for all platforms we know of. // // However, the C standard only requires that: // p == (void*)((intptr_t)p)) // but we also need: // i == (intptr_t)((void*)i) // or otherwise one might define an intptr_t type that is larger than a pointer... // ------------------------------------------------------ #if INTPTR_MAX > INT64_MAX # define MI_INTPTR_SHIFT (4) // assume 128-bit (as on arm CHERI for example) #elif INTPTR_MAX == INT64_MAX # define MI_INTPTR_SHIFT (3) #elif INTPTR_MAX == INT32_MAX # define MI_INTPTR_SHIFT (2) #else #error platform pointers must be 32, 64, or 128 bits #endif #if SIZE_MAX == UINT64_MAX # define MI_SIZE_SHIFT (3) typedef int64_t mi_ssize_t; #elif SIZE_MAX == UINT32_MAX # define MI_SIZE_SHIFT (2) typedef int32_t mi_ssize_t; #else #error platform objects must be 32 or 64 bits #endif #if (SIZE_MAX/2) > LONG_MAX # define MI_ZU(x) x##ULL # define MI_ZI(x) x##LL #else # define MI_ZU(x) x##UL # define MI_ZI(x) x##L #endif #define MI_INTPTR_SIZE (1< 4 #define MI_SEGMENT_SHIFT ( 9 + MI_SEGMENT_SLICE_SHIFT) // 32MiB #else #define MI_SEGMENT_SHIFT ( 7 + MI_SEGMENT_SLICE_SHIFT) // 4MiB on 32-bit #endif #endif #ifndef MI_SMALL_PAGE_SHIFT #define MI_SMALL_PAGE_SHIFT (MI_SEGMENT_SLICE_SHIFT) // 64KiB #endif #ifndef MI_MEDIUM_PAGE_SHIFT #define MI_MEDIUM_PAGE_SHIFT ( 3 + MI_SMALL_PAGE_SHIFT) // 512KiB #endif // Derived constants #define MI_SEGMENT_SIZE (MI_ZU(1)<= 655360) #error "mimalloc internal: define more bins" #endif // Maximum block size 
for which blocks are guaranteed to be block size aligned. (see `segment.c:_mi_segment_page_start`)
#define MI_MAX_ALIGN_GUARANTEE            (MI_MEDIUM_OBJ_SIZE_MAX)

// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated huge page segments
#define MI_BLOCK_ALIGNMENT_MAX            (MI_SEGMENT_SIZE >> 1)

// Maximum slice count (255) for which we can find the page for interior pointers
#define MI_MAX_SLICE_OFFSET_COUNT         ((MI_BLOCK_ALIGNMENT_MAX / MI_SEGMENT_SLICE_SIZE) - 1)

// we never allocate more than PTRDIFF_MAX
// on 64-bit+ systems we also limit the maximum allocation size such that the slice count fits in 32-bits. (issue #877)
// Note: the second clause must name `MI_SEGMENT_SLICE_SIZE`, the same macro used in the
// definition below. A misspelled (undefined) identifier silently evaluates to 0 inside
// `#if`, which would make that clause always true instead of checking the real limit.
#if (PTRDIFF_MAX > INT32_MAX) && (PTRDIFF_MAX >= (MI_SEGMENT_SLICE_SIZE * UINT32_MAX))
#define MI_MAX_ALLOC_SIZE   (MI_SEGMENT_SLICE_SIZE * (UINT32_MAX-1))
#else
#define MI_MAX_ALLOC_SIZE   PTRDIFF_MAX
#endif


// ------------------------------------------------------
// Mimalloc pages contain allocated blocks
// ------------------------------------------------------

// The free lists use encoded next fields
// (Only actually encodes when MI_ENCODE_FREELIST is defined.)
typedef uintptr_t  mi_encoded_t;

// thread id's
typedef size_t     mi_threadid_t;

// free lists contain blocks
// (a block only carries the encoded link to the next block)
typedef struct mi_block_s {
  mi_encoded_t next;
} mi_block_t;

#if MI_GUARDED
// we always align guarded pointers in a block at an offset
// the block `next` field is then used as a tag to distinguish regular offset aligned blocks from guarded ones
#define MI_BLOCK_TAG_ALIGNED   ((mi_encoded_t)(0))
#define MI_BLOCK_TAG_GUARDED   (~MI_BLOCK_TAG_ALIGNED)
#endif


// The delayed flags are used for efficient multi-threaded free-ing
typedef enum mi_delayed_e {
  MI_USE_DELAYED_FREE   = 0, // push on the owning heap thread delayed list
  MI_DELAYED_FREEING    = 1, // temporary: another thread is accessing the owning heap
  MI_NO_DELAYED_FREE    = 2, // optimize: push on page local thread free queue if another block is already in the heap thread delayed free list
  MI_NEVER_DELAYED_FREE = 3  // sticky: used for abandoned pages without an owning heap; this only resets on page reclaim
} mi_delayed_t;


// The `in_full` and `has_aligned` page flags are put in a union to efficiently
// test if both are false (`full_aligned == 0`) in the `mi_free` routine.
#if !MI_TSAN
typedef union mi_page_flags_s {
  uint8_t full_aligned;     // both flags viewed as one byte
  struct {
    uint8_t in_full : 1;
    uint8_t has_aligned : 1;
  } x;
} mi_page_flags_t;
#else
// under thread sanitizer, use a byte for each flag to suppress warning, issue #130
typedef union mi_page_flags_s {
  uint32_t full_aligned;    // both flag bytes viewed as one word
  struct {
    uint8_t in_full;
    uint8_t has_aligned;
  } x;
} mi_page_flags_t;
#endif

// Thread free list.
// We use the bottom 2 bits of the pointer for mi_delayed_t flags
typedef uintptr_t mi_thread_free_t;

// A page contains blocks of one specific size (`block_size`).
// Each page has three lists of free blocks:
// `free` for blocks that can be allocated,
// `local_free` for freed blocks that are not yet available to `mi_malloc`
// `thread_free` for freed blocks by other threads
// The `local_free` and `thread_free` lists are migrated to the `free` list
// when it is exhausted. The separate `local_free` list is necessary to
// implement a monotonic heartbeat. The `thread_free` list is needed for
// avoiding atomic operations in the common case.
//
// `used - |thread_free|` == actual blocks that are in use (alive)
// `used - |thread_free| + |free| + |local_free| == capacity`
//
// We don't count `freed` (as |free|) but use `used` to reduce
// the number of memory accesses in the `mi_page_all_free` function(s).
//
// Notes:
// - Access is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc`
// - Using `uint16_t` does not seem to slow things down
// - The size is 12 words on 64-bit which helps the page index calculations
//   (and 14 words on 32-bit, and encoded free lists add 2 words)
// - `xthread_free` uses the bottom bits as delayed-free flags to optimize
//   concurrent frees where only the first concurrent free adds to the owning
//   heap `thread_delayed_free` list (see `free.c:mi_free_block_mt`).
//   The invariant is that no-delayed-free is only set if there is
//   at least one block that will be added, or has already been added, to
//   the owning heap `thread_delayed_free` list. This guarantees that pages
//   will be freed correctly even if only other threads free blocks.
// Page meta-data. The member order below is the memory layout; do not reorder.
typedef struct mi_page_s {
  // "owned" by the segment
  uint32_t              slice_count;       // slices in this page (0 if not a page)
  uint32_t              slice_offset;      // distance from the actual page data slice (0 if a page)
  uint8_t               is_committed:1;    // `true` if the page virtual memory is committed
  uint8_t               is_zero_init:1;    // `true` if the page was initially zero initialized
  uint8_t               is_huge:1;         // `true` if the page is in a huge segment (`segment->kind == MI_SEGMENT_HUGE`)
                                           // padding
  // layout like this to optimize access in `mi_malloc` and `mi_free`
  // NOTE(review): `capacity` is not the first member of this struct as declared here; the
  // "must be the first field" remark below presumably refers to the first field of the
  // region that `segment.c:page_clear` resets -- confirm against that function.
  uint16_t              capacity;          // number of blocks committed, must be the first field, see `segment.c:page_clear`
  uint16_t              reserved;          // number of blocks reserved in memory
  mi_page_flags_t       flags;             // `in_full` and `has_aligned` flags (8 bits)
  uint8_t               free_is_zero:1;    // `true` if the blocks in the free list are zero initialized
  uint8_t               retire_expire:7;   // expiration count for retired blocks

  mi_block_t*           free;              // list of available free blocks (`malloc` allocates from this list)
  mi_block_t*           local_free;        // list of deferred free blocks by this thread (migrates to `free`)
  uint16_t              used;              // number of blocks in use (including blocks in `thread_free`)
  uint8_t               block_size_shift;  // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`)
  uint8_t               heap_tag;          // tag of the owning heap, used to separate heaps by object type
                                           // padding
  size_t                block_size;        // size available in each block (always `>0`)
  uint8_t*              page_start;        // start of the page area containing the blocks

  #if (MI_ENCODE_FREELIST || MI_PADDING)
  uintptr_t             keys[2];           // two random keys to encode the free lists (see `_mi_block_next`) or padding canary
  #endif

  _Atomic(mi_thread_free_t) xthread_free;  // list of deferred free blocks freed by other threads
  _Atomic(uintptr_t)        xheap;

  struct mi_page_s*     next;              // next page owned by this thread with the same `block_size`
  struct mi_page_s*     prev;              // previous page owned by this thread with the same `block_size`

  // 64-bit 11 words, 32-bit 13 words, (+2 for secure)
  void* padding[1];
} mi_page_t;



// ------------------------------------------------------
// Mimalloc segments contain mimalloc pages
// ------------------------------------------------------

typedef enum mi_page_kind_e {
  MI_PAGE_SMALL,    // small blocks go into 64KiB pages inside a segment
  MI_PAGE_MEDIUM,   // medium blocks go into 512KiB pages inside a segment
  MI_PAGE_LARGE,    // larger blocks go into a single page spanning a whole segment
  MI_PAGE_HUGE      // a huge page is a single page in a segment of variable size
                    // used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or an alignment `> MI_BLOCK_ALIGNMENT_MAX`.
} mi_page_kind_t;

typedef enum mi_segment_kind_e {
  MI_SEGMENT_NORMAL, // MI_SEGMENT_SIZE size with pages inside.
  MI_SEGMENT_HUGE,   // segment with just one huge page inside.
} mi_segment_kind_t;

// ------------------------------------------------------
// A segment holds a commit mask where a bit is set if
// the corresponding MI_COMMIT_SIZE area is committed.
// The MI_COMMIT_SIZE must be a multiple of the slice
// size. If it is equal we have the most fine grained
// decommit (but setting it higher can be more efficient).
// The MI_MINIMAL_COMMIT_SIZE is the minimal amount that will
// be committed in one go which can be set higher than
// MI_COMMIT_SIZE for efficiency (while the decommit mask
// is still tracked in fine-grained MI_COMMIT_SIZE chunks)
// ------------------------------------------------------

#define MI_MINIMAL_COMMIT_SIZE      (1*MI_SEGMENT_SLICE_SIZE)
#define MI_COMMIT_SIZE              (MI_SEGMENT_SLICE_SIZE)              // 64KiB
#define MI_COMMIT_MASK_BITS         (MI_SEGMENT_SIZE / MI_COMMIT_SIZE)
#define MI_COMMIT_MASK_FIELD_BITS    MI_SIZE_BITS
#define MI_COMMIT_MASK_FIELD_COUNT  (MI_COMMIT_MASK_BITS / MI_COMMIT_MASK_FIELD_BITS)

// The mask is stored as whole `size_t` fields, so the bit count must divide evenly.
#if (MI_COMMIT_MASK_BITS != (MI_COMMIT_MASK_FIELD_COUNT * MI_COMMIT_MASK_FIELD_BITS))
#error "the segment size must be exactly divisible by the (commit size * size_t bits)"
#endif

// One bit per MI_COMMIT_SIZE chunk of a segment.
typedef struct mi_commit_mask_s {
  size_t mask[MI_COMMIT_MASK_FIELD_COUNT];
} mi_commit_mask_t;

typedef mi_page_t  mi_slice_t;   // a slice shares the page meta-data layout
typedef int64_t    mi_msecs_t;


// ---------------------------------------------------------------
// a memory id tracks the provenance of arena/OS allocated memory
// ---------------------------------------------------------------

// Memory can reside in arenas, be directly OS allocated, or be statically allocated. The memid keeps track of this.
// The enumerator order matters: `mi_memkind_is_os` tests the contiguous range MI_MEM_OS..MI_MEM_OS_REMAP.
typedef enum mi_memkind_e {
  MI_MEM_NONE,      // not allocated
  MI_MEM_EXTERNAL,  // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example)
  MI_MEM_STATIC,    // allocated in a static area and should not be freed (for arena meta data for example)
  MI_MEM_OS,        // allocated from the OS
  MI_MEM_OS_HUGE,   // allocated as huge OS pages (usually 1GiB, pinned to physical memory)
  MI_MEM_OS_REMAP,  // allocated in a remappable area (i.e. using `mremap`)
  MI_MEM_ARENA      // allocated from an arena (the usual case)
} mi_memkind_t;

// Returns `true` for the three OS-backed kinds (MI_MEM_OS, MI_MEM_OS_HUGE, MI_MEM_OS_REMAP).
static inline bool mi_memkind_is_os(mi_memkind_t memkind) {
  return (memkind >= MI_MEM_OS && memkind <= MI_MEM_OS_REMAP);
}

typedef struct mi_memid_os_info {
  void*         base;               // actual base address of the block (used for offset aligned allocations)
  size_t        size;               // full allocation size
} mi_memid_os_info_t;

typedef struct mi_memid_arena_info {
  size_t        block_index;        // index in the arena
  mi_arena_id_t id;                 // arena id (>= 1)
  bool          is_exclusive;       // this arena can only be used for specific arena allocations
} mi_memid_arena_info_t;

typedef struct mi_memid_s {
  union {
    mi_memid_os_info_t    os;       // only used for MI_MEM_OS
    mi_memid_arena_info_t arena;    // only used for MI_MEM_ARENA
  } mem;
  bool          is_pinned;          // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large (2Mib) or huge (1GiB) OS pages)
  bool          initially_committed;// `true` if the memory was originally allocated as committed
  bool          initially_zero;     // `true` if the memory was originally zero initialized
  mi_memkind_t  memkind;
} mi_memid_t;


// -----------------------------------------------------------------------------------------
// Segments are large allocated memory blocks (32mb on 64 bit) from arenas or the OS.
//
// Inside segments we allocate fixed size mimalloc pages (`mi_page_t`) that contain blocks.
// The start of a segment is this structure with a fixed number of slice entries (`slices`)
// usually followed by a guard OS page and the actual allocation area with pages.
// While a page is not allocated, we view its data as a `mi_slice_t` (instead of a `mi_page_t`).
// Of any free area, the first slice has the info and `slice_offset == 0`; for any subsequent
// slices part of the area, the `slice_offset` is the byte offset back to the first slice
// (so we can quickly find the page info on a free, `internal.h:_mi_segment_page_of`).
// For slices, the `block_size` field is repurposed to signify if a slice is used (`1`) or not (`0`). // Small and medium pages use a fixed amount of slices to reduce slice fragmentation, while // large and huge pages span a variable amount of slices. typedef struct mi_subproc_s mi_subproc_t; typedef struct mi_segment_s { // constant fields mi_memid_t memid; // memory id for arena/OS allocation bool allow_decommit; // can we decommmit the memory bool allow_purge; // can we purge the memory (reset or decommit) size_t segment_size; mi_subproc_t* subproc; // segment belongs to sub process // segment fields mi_msecs_t purge_expire; // purge slices in the `purge_mask` after this time mi_commit_mask_t purge_mask; // slices that can be purged mi_commit_mask_t commit_mask; // slices that are currently committed // from here is zero initialized struct mi_segment_s* next; // the list of freed segments in the cache (must be first field, see `segment.c:mi_segment_init`) bool was_reclaimed; // true if it was reclaimed (used to limit on-free reclamation) bool dont_free; // can be temporarily true to ensure the segment is not freed size_t abandoned; // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`) size_t abandoned_visits; // count how often this segment is visited during abondoned reclamation (to force reclaim if it takes too long) size_t used; // count of pages in use uintptr_t cookie; // verify addresses in debug mode: `mi_ptr_cookie(segment) == segment->cookie` struct mi_segment_s* abandoned_os_next; // only used for abandoned segments outside arena's, and only if `mi_option_visit_abandoned` is enabled struct mi_segment_s* abandoned_os_prev; size_t segment_slices; // for huge segments this may be different from `MI_SLICES_PER_SEGMENT` size_t segment_info_slices; // initial count of slices that we are using for segment info and possible guard pages. 
// layout like this to optimize access in `mi_free` mi_segment_kind_t kind; size_t slice_entries; // entries in the `slices` array, at most `MI_SLICES_PER_SEGMENT` _Atomic(mi_threadid_t) thread_id; // unique id of the thread owning this segment mi_slice_t slices[MI_SLICES_PER_SEGMENT+1]; // one extra final entry for huge blocks with large alignment } mi_segment_t; // ------------------------------------------------------ // Heaps // Provide first-class heaps to allocate from. // A heap just owns a set of pages for allocation and // can only be allocate/reallocate from the thread that created it. // Freeing blocks can be done from any thread though. // Per thread, the segments are shared among its heaps. // Per thread, there is always a default heap that is // used for allocation; it is initialized to statically // point to an empty heap to avoid initialization checks // in the fast path. // ------------------------------------------------------ // Thread local data typedef struct mi_tld_s mi_tld_t; // Pages of a certain block size are held in a queue. typedef struct mi_page_queue_s { mi_page_t* first; mi_page_t* last; size_t block_size; } mi_page_queue_t; #define MI_BIN_FULL (MI_BIN_HUGE+1) // Random context typedef struct mi_random_cxt_s { uint32_t input[16]; uint32_t output[16]; int output_available; bool weak; } mi_random_ctx_t; // In debug mode there is a padding structure at the end of the blocks to check for buffer overflows #if (MI_PADDING) typedef struct mi_padding_s { uint32_t canary; // encoded block value to check validity of the padding (in case of overflow) uint32_t delta; // padding bytes before the block. 
(mi_usable_size(p) - delta == exact allocated bytes) } mi_padding_t; #define MI_PADDING_SIZE (sizeof(mi_padding_t)) #define MI_PADDING_WSIZE ((MI_PADDING_SIZE + MI_INTPTR_SIZE - 1) / MI_INTPTR_SIZE) #else #define MI_PADDING_SIZE 0 #define MI_PADDING_WSIZE 0 #endif #define MI_PAGES_DIRECT (MI_SMALL_WSIZE_MAX + MI_PADDING_WSIZE + 1) // A heap owns a set of pages. struct mi_heap_s { mi_tld_t* tld; _Atomic(mi_block_t*) thread_delayed_free; mi_threadid_t thread_id; // thread this heap belongs too mi_arena_id_t arena_id; // arena id if the heap belongs to a specific arena (or 0) uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`) uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list mi_random_ctx_t random; // random number context used for secure allocation size_t page_count; // total number of pages in the `pages` queues. size_t page_retired_min; // smallest retired index (retired pages are fully free, but still in the page queues) size_t page_retired_max; // largest retired index into the `pages` array. long generic_count; // how often is `_mi_malloc_generic` called? long generic_collect_count; // how often is `_mi_malloc_generic` called without collecting? mi_heap_t* next; // list of heaps per thread bool no_reclaim; // `true` if this heap should not reclaim abandoned pages uint8_t tag; // custom tag, can be used for separating heaps based on the object types #if MI_GUARDED size_t guarded_size_min; // minimal size for guarded objects size_t guarded_size_max; // maximal size for guarded objects size_t guarded_sample_rate; // sample rate (set to 0 to disable guarded pages) size_t guarded_sample_seed; // starting sample count size_t guarded_sample_count; // current sample count (counting down to 0) #endif mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size. 
mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") }; // ------------------------------------------------------ // Sub processes do not reclaim or visit segments // from other sub processes. These are essentially the // static variables of a process. // ------------------------------------------------------ struct mi_subproc_s { _Atomic(size_t) abandoned_count; // count of abandoned segments for this sub-process _Atomic(size_t) abandoned_os_list_count; // count of abandoned segments in the os-list mi_lock_t abandoned_os_lock; // lock for the abandoned os segment list (outside of arena's) (this lock protect list operations) mi_lock_t abandoned_os_visit_lock; // ensure only one thread per subproc visits the abandoned os list mi_segment_t* abandoned_os_list; // doubly-linked list of abandoned segments outside of arena's (in OS allocated memory) mi_segment_t* abandoned_os_list_tail; // the tail-end of the list mi_memid_t memid; // provenance of this memory block }; // ------------------------------------------------------ // Thread Local data // ------------------------------------------------------ // A "span" is is an available range of slices. The span queues keep // track of slice spans of at most the given `slice_count` (but more than the previous size class). typedef struct mi_span_queue_s { mi_slice_t* first; mi_slice_t* last; size_t slice_count; } mi_span_queue_t; #define MI_SEGMENT_BIN_MAX (35) // 35 == mi_segment_bin(MI_SLICES_PER_SEGMENT) // Segments thread local data typedef struct mi_segments_tld_s { mi_span_queue_t spans[MI_SEGMENT_BIN_MAX+1]; // free slice spans inside segments size_t count; // current number of segments; size_t peak_count; // peak number of segments size_t current_size; // current size of all segments size_t peak_size; // peak size of all segments size_t reclaim_count;// number of reclaimed (abandoned) segments mi_subproc_t* subproc; // sub-process this thread belongs to. 
mi_stats_t* stats; // points to tld stats } mi_segments_tld_t; // Thread local data struct mi_tld_s { unsigned long long heartbeat; // monotonic heartbeat count bool recurse; // true if deferred was called; used to prevent infinite recursion. mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) mi_segments_tld_t segments; // segment tld mi_stats_t stats; // statistics }; // ------------------------------------------------------ // Debug // ------------------------------------------------------ #if !defined(MI_DEBUG_UNINIT) #define MI_DEBUG_UNINIT (0xD0) #endif #if !defined(MI_DEBUG_FREED) #define MI_DEBUG_FREED (0xDF) #endif #if !defined(MI_DEBUG_PADDING) #define MI_DEBUG_PADDING (0xDE) #endif #if (MI_DEBUG) // use our own assertion to print without memory allocation void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line, const char* func ); #define mi_assert(expr) ((expr) ? 
(void)0 : _mi_assert_fail(#expr,__FILE__,__LINE__,__func__)) #else #define mi_assert(x) #endif #if (MI_DEBUG>1) #define mi_assert_internal mi_assert #else #define mi_assert_internal(x) #endif #if (MI_DEBUG>2) #define mi_assert_expensive mi_assert #else #define mi_assert_expensive(x) #endif // ------------------------------------------------------ // Statistics // ------------------------------------------------------ #ifndef MI_STAT #if (MI_DEBUG>0) #define MI_STAT 2 #else #define MI_STAT 0 #endif #endif // add to stat keeping track of the peak void _mi_stat_increase(mi_stat_count_t* stat, size_t amount); void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount); // counters can just be increased void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); #if (MI_STAT) #define mi_stat_increase(stat,amount) _mi_stat_increase( &(stat), amount) #define mi_stat_decrease(stat,amount) _mi_stat_decrease( &(stat), amount) #define mi_stat_counter_increase(stat,amount) _mi_stat_counter_increase( &(stat), amount) #else #define mi_stat_increase(stat,amount) ((void)0) #define mi_stat_decrease(stat,amount) ((void)0) #define mi_stat_counter_increase(stat,amount) ((void)0) #endif #define mi_heap_stat_counter_increase(heap,stat,amount) mi_stat_counter_increase( (heap)->tld->stats.stat, amount) #define mi_heap_stat_increase(heap,stat,amount) mi_stat_increase( (heap)->tld->stats.stat, amount) #define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount) #endif ================================================ FILE: third-party/mimalloc/include/mimalloc-new-delete.h ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2020 Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. 
-----------------------------------------------------------------------------*/ #pragma once #ifndef MIMALLOC_NEW_DELETE_H #define MIMALLOC_NEW_DELETE_H // ---------------------------------------------------------------------------- // This header provides convenient overrides for the new and // delete operations in C++. // // This header should be included in only one source file! // // On Windows, or when linking dynamically with mimalloc, these // can be more performant than the standard new-delete operations. // See // --------------------------------------------------------------------------- #if defined(__cplusplus) #include #include #if defined(_MSC_VER) && defined(_Ret_notnull_) && defined(_Post_writable_byte_size_) // stay consistent with VCRT definitions #define mi_decl_new(n) mi_decl_nodiscard mi_decl_restrict _Ret_notnull_ _Post_writable_byte_size_(n) #define mi_decl_new_nothrow(n) mi_decl_nodiscard mi_decl_restrict _Ret_maybenull_ _Success_(return != NULL) _Post_writable_byte_size_(n) #else #define mi_decl_new(n) mi_decl_nodiscard mi_decl_restrict #define mi_decl_new_nothrow(n) mi_decl_nodiscard mi_decl_restrict #endif void operator delete(void* p) noexcept { mi_free(p); }; void operator delete[](void* p) noexcept { mi_free(p); }; void operator delete (void* p, const std::nothrow_t&) noexcept { mi_free(p); } void operator delete[](void* p, const std::nothrow_t&) noexcept { mi_free(p); } mi_decl_new(n) void* operator new(std::size_t n) noexcept(false) { return mi_new(n); } mi_decl_new(n) void* operator new[](std::size_t n) noexcept(false) { return mi_new(n); } mi_decl_new_nothrow(n) void* operator new (std::size_t n, const std::nothrow_t& tag) noexcept { (void)(tag); return mi_new_nothrow(n); } mi_decl_new_nothrow(n) void* operator new[](std::size_t n, const std::nothrow_t& tag) noexcept { (void)(tag); return mi_new_nothrow(n); } #if (__cplusplus >= 201402L || _MSC_VER >= 1916) void operator delete (void* p, std::size_t n) noexcept { mi_free_size(p,n); 
}; void operator delete[](void* p, std::size_t n) noexcept { mi_free_size(p,n); }; #endif #if (__cplusplus > 201402L || defined(__cpp_aligned_new)) void operator delete (void* p, std::align_val_t al) noexcept { mi_free_aligned(p, static_cast(al)); } void operator delete[](void* p, std::align_val_t al) noexcept { mi_free_aligned(p, static_cast(al)); } void operator delete (void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast(al)); }; void operator delete[](void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast(al)); }; void operator delete (void* p, std::align_val_t al, const std::nothrow_t&) noexcept { mi_free_aligned(p, static_cast(al)); } void operator delete[](void* p, std::align_val_t al, const std::nothrow_t&) noexcept { mi_free_aligned(p, static_cast(al)); } void* operator new (std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast(al)); } void* operator new[](std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast(al)); } void* operator new (std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept { return mi_new_aligned_nothrow(n, static_cast(al)); } void* operator new[](std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept { return mi_new_aligned_nothrow(n, static_cast(al)); } #endif #endif #endif // MIMALLOC_NEW_DELETE_H ================================================ FILE: third-party/mimalloc/include/mimalloc-override.h ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2020 Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. 
-----------------------------------------------------------------------------*/ #pragma once #ifndef MIMALLOC_OVERRIDE_H #define MIMALLOC_OVERRIDE_H /* ---------------------------------------------------------------------------- This header can be used to statically redirect malloc/free and new/delete to the mimalloc variants. This can be useful if one can include this file on each source file in a project (but be careful when using external code to not accidentally mix pointers from different allocators). -----------------------------------------------------------------------------*/ #include // Standard C allocation #define malloc(n) mi_malloc(n) #define calloc(n,c) mi_calloc(n,c) #define realloc(p,n) mi_realloc(p,n) #define free(p) mi_free(p) #define strdup(s) mi_strdup(s) #define strndup(s,n) mi_strndup(s,n) #define realpath(f,n) mi_realpath(f,n) // Microsoft extensions #define _expand(p,n) mi_expand(p,n) #define _msize(p) mi_usable_size(p) #define _recalloc(p,n,c) mi_recalloc(p,n,c) #define _strdup(s) mi_strdup(s) #define _strndup(s,n) mi_strndup(s,n) #define _wcsdup(s) (wchar_t*)mi_wcsdup((const unsigned short*)(s)) #define _mbsdup(s) mi_mbsdup(s) #define _dupenv_s(b,n,v) mi_dupenv_s(b,n,v) #define _wdupenv_s(b,n,v) mi_wdupenv_s((unsigned short*)(b),n,(const unsigned short*)(v)) // Various Posix and Unix variants #define reallocf(p,n) mi_reallocf(p,n) #define malloc_size(p) mi_usable_size(p) #define malloc_usable_size(p) mi_usable_size(p) #define malloc_good_size(sz) mi_malloc_good_size(sz) #define cfree(p) mi_free(p) #define valloc(n) mi_valloc(n) #define pvalloc(n) mi_pvalloc(n) #define reallocarray(p,s,n) mi_reallocarray(p,s,n) #define reallocarr(p,s,n) mi_reallocarr(p,s,n) #define memalign(a,n) mi_memalign(a,n) #define aligned_alloc(a,n) mi_aligned_alloc(a,n) #define posix_memalign(p,a,n) mi_posix_memalign(p,a,n) #define _posix_memalign(p,a,n) mi_posix_memalign(p,a,n) // Microsoft aligned variants #define _aligned_malloc(n,a) mi_malloc_aligned(n,a) 
#define _aligned_realloc(p,n,a) mi_realloc_aligned(p,n,a) #define _aligned_recalloc(p,s,n,a) mi_aligned_recalloc(p,s,n,a) #define _aligned_msize(p,a,o) mi_usable_size(p) #define _aligned_free(p) mi_free(p) #define _aligned_offset_malloc(n,a,o) mi_malloc_aligned_at(n,a,o) #define _aligned_offset_realloc(p,n,a,o) mi_realloc_aligned_at(p,n,a,o) #define _aligned_offset_recalloc(p,s,n,a,o) mi_recalloc_aligned_at(p,s,n,a,o) #endif // MIMALLOC_OVERRIDE_H ================================================ FILE: third-party/mimalloc/include/mimalloc-stats.h ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2025, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #pragma once #ifndef MIMALLOC_STATS_H #define MIMALLOC_STATS_H #include #include #define MI_STAT_VERSION 1 // increased on every backward incompatible change // count allocation over time typedef struct mi_stat_count_s { int64_t total; // total allocated int64_t peak; // peak allocation int64_t current; // current allocation } mi_stat_count_t; // counters only increase typedef struct mi_stat_counter_s { int64_t total; // total count } mi_stat_counter_t; #define MI_STAT_FIELDS() \ MI_STAT_COUNT(pages) /* count of mimalloc pages */ \ MI_STAT_COUNT(reserved) /* reserved memory bytes */ \ MI_STAT_COUNT(committed) /* committed bytes */ \ MI_STAT_COUNT(reset) /* reset bytes */ \ MI_STAT_COUNT(purged) /* purged bytes */ \ MI_STAT_COUNT(page_committed) /* committed memory inside pages */ \ MI_STAT_COUNT(pages_abandoned) /* abandonded pages count */ \ MI_STAT_COUNT(threads) /* number of threads */ \ MI_STAT_COUNT(malloc_normal) /* allocated bytes <= MI_LARGE_OBJ_SIZE_MAX */ \ 
MI_STAT_COUNT(malloc_huge) /* allocated bytes in huge pages */ \ MI_STAT_COUNT(malloc_requested) /* malloc requested bytes */ \ \ MI_STAT_COUNTER(mmap_calls) \ MI_STAT_COUNTER(commit_calls) \ MI_STAT_COUNTER(reset_calls) \ MI_STAT_COUNTER(purge_calls) \ MI_STAT_COUNTER(arena_count) /* number of memory arena's */ \ MI_STAT_COUNTER(malloc_normal_count) /* number of blocks <= MI_LARGE_OBJ_SIZE_MAX */ \ MI_STAT_COUNTER(malloc_huge_count) /* number of huge bloks */ \ MI_STAT_COUNTER(malloc_guarded_count) /* number of allocations with guard pages */ \ \ /* internal statistics */ \ MI_STAT_COUNTER(arena_rollback_count) \ MI_STAT_COUNTER(arena_purges) \ MI_STAT_COUNTER(pages_extended) /* number of page extensions */ \ MI_STAT_COUNTER(pages_retire) /* number of pages that are retired */ \ MI_STAT_COUNTER(page_searches) /* searches for a fresh page */ \ /* only on v1 and v2 */ \ MI_STAT_COUNT(segments) \ MI_STAT_COUNT(segments_abandoned) \ MI_STAT_COUNT(segments_cache) \ MI_STAT_COUNT(_segments_reserved) \ /* only on v3 */ \ MI_STAT_COUNTER(pages_reclaim_on_alloc) \ MI_STAT_COUNTER(pages_reclaim_on_free) \ MI_STAT_COUNTER(pages_reabandon_full) \ MI_STAT_COUNTER(pages_unabandon_busy_wait) \ // Define the statistics structure #define MI_BIN_HUGE (73U) // see types.h #define MI_STAT_COUNT(stat) mi_stat_count_t stat; #define MI_STAT_COUNTER(stat) mi_stat_counter_t stat; typedef struct mi_stats_s { int version; MI_STAT_FIELDS() // future extension mi_stat_count_t _stat_reserved[4]; mi_stat_counter_t _stat_counter_reserved[4]; // size segregated statistics mi_stat_count_t malloc_bins[MI_BIN_HUGE+1]; // allocation per size bin mi_stat_count_t page_bins[MI_BIN_HUGE+1]; // pages allocated per size bin } mi_stats_t; #undef MI_STAT_COUNT #undef MI_STAT_COUNTER // Exported definitions #ifdef __cplusplus extern "C" { #endif mi_decl_export void mi_stats_get( size_t stats_size, mi_stats_t* stats ) mi_attr_noexcept; mi_decl_export char* mi_stats_get_json( size_t buf_size, char* buf ) 
mi_attr_noexcept; // use mi_free to free the result if the input buf == NULL #ifdef __cplusplus } #endif #endif // MIMALLOC_STATS_H ================================================ FILE: third-party/mimalloc/include/mimalloc.h ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2025, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #pragma once #ifndef MIMALLOC_H #define MIMALLOC_H #define MI_MALLOC_VERSION 222 // major + 2 digits minor // ------------------------------------------------------ // Compiler specific attributes // ------------------------------------------------------ #ifdef __cplusplus #if (__cplusplus >= 201103L) || (_MSC_VER > 1900) // C++11 #define mi_attr_noexcept noexcept #else #define mi_attr_noexcept throw() #endif #else #define mi_attr_noexcept #endif #if defined(__cplusplus) && (__cplusplus >= 201703) #define mi_decl_nodiscard [[nodiscard]] #elif (defined(__GNUC__) && (__GNUC__ >= 4)) || defined(__clang__) // includes clang, icc, and clang-cl #define mi_decl_nodiscard __attribute__((warn_unused_result)) #elif defined(_HAS_NODISCARD) #define mi_decl_nodiscard _NODISCARD #elif (_MSC_VER >= 1700) #define mi_decl_nodiscard _Check_return_ #else #define mi_decl_nodiscard #endif #if defined(_MSC_VER) || defined(__MINGW32__) #if !defined(MI_SHARED_LIB) #define mi_decl_export #elif defined(MI_SHARED_LIB_EXPORT) #define mi_decl_export __declspec(dllexport) #else #define mi_decl_export __declspec(dllimport) #endif #if defined(__MINGW32__) #define mi_decl_restrict #define mi_attr_malloc __attribute__((malloc)) #else #if (_MSC_VER >= 1900) && !defined(__EDG__) #define mi_decl_restrict __declspec(allocator) 
__declspec(restrict) #else #define mi_decl_restrict __declspec(restrict) #endif #define mi_attr_malloc #endif #define mi_cdecl __cdecl #define mi_attr_alloc_size(s) #define mi_attr_alloc_size2(s1,s2) #define mi_attr_alloc_align(p) #elif defined(__GNUC__) // includes clang and icc #if defined(MI_SHARED_LIB) && defined(MI_SHARED_LIB_EXPORT) #define mi_decl_export __attribute__((visibility("default"))) #else #define mi_decl_export #endif #define mi_cdecl // leads to warnings... __attribute__((cdecl)) #define mi_decl_restrict #define mi_attr_malloc __attribute__((malloc)) #if (defined(__clang_major__) && (__clang_major__ < 4)) || (__GNUC__ < 5) #define mi_attr_alloc_size(s) #define mi_attr_alloc_size2(s1,s2) #define mi_attr_alloc_align(p) #elif defined(__INTEL_COMPILER) #define mi_attr_alloc_size(s) __attribute__((alloc_size(s))) #define mi_attr_alloc_size2(s1,s2) __attribute__((alloc_size(s1,s2))) #define mi_attr_alloc_align(p) #else #define mi_attr_alloc_size(s) __attribute__((alloc_size(s))) #define mi_attr_alloc_size2(s1,s2) __attribute__((alloc_size(s1,s2))) #define mi_attr_alloc_align(p) __attribute__((alloc_align(p))) #endif #else #define mi_cdecl #define mi_decl_export #define mi_decl_restrict #define mi_attr_malloc #define mi_attr_alloc_size(s) #define mi_attr_alloc_size2(s1,s2) #define mi_attr_alloc_align(p) #endif // ------------------------------------------------------ // Includes // ------------------------------------------------------ #include // size_t #include // bool #include // INTPTR_MAX #ifdef __cplusplus extern "C" { #endif // ------------------------------------------------------ // Standard malloc interface // ------------------------------------------------------ mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc(size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2); 
mi_decl_nodiscard mi_decl_export void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(2); mi_decl_export void* mi_expand(void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(2); mi_decl_export void mi_free(void* p) mi_attr_noexcept; mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_strdup(const char* s) mi_attr_noexcept mi_attr_malloc; mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_attr_noexcept mi_attr_malloc; mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept mi_attr_malloc; // ------------------------------------------------------ // Extended functionality // ------------------------------------------------------ #define MI_SMALL_WSIZE_MAX (128) #define MI_SMALL_SIZE_MAX (MI_SMALL_WSIZE_MAX*sizeof(void*)) mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_small(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_small(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2); mi_decl_nodiscard mi_decl_export void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_alloc_size2(2,3); mi_decl_nodiscard mi_decl_export void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(2); mi_decl_nodiscard mi_decl_export size_t mi_usable_size(const void* p) mi_attr_noexcept; mi_decl_nodiscard mi_decl_export size_t mi_good_size(size_t size) mi_attr_noexcept; // ------------------------------------------------------ // Internals // ------------------------------------------------------ 
typedef void (mi_cdecl mi_deferred_free_fun)(bool force, unsigned long long heartbeat, void* arg); mi_decl_export void mi_register_deferred_free(mi_deferred_free_fun* deferred_free, void* arg) mi_attr_noexcept; typedef void (mi_cdecl mi_output_fun)(const char* msg, void* arg); mi_decl_export void mi_register_output(mi_output_fun* out, void* arg) mi_attr_noexcept; typedef void (mi_cdecl mi_error_fun)(int err, void* arg); mi_decl_export void mi_register_error(mi_error_fun* fun, void* arg); mi_decl_export void mi_collect(bool force) mi_attr_noexcept; mi_decl_export int mi_version(void) mi_attr_noexcept; mi_decl_export void mi_stats_reset(void) mi_attr_noexcept; mi_decl_export void mi_stats_merge(void) mi_attr_noexcept; mi_decl_export void mi_stats_print(void* out) mi_attr_noexcept; // backward compatibility: `out` is ignored and should be NULL mi_decl_export void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept; mi_decl_export void mi_options_print(void) mi_attr_noexcept; mi_decl_export void mi_process_init(void) mi_attr_noexcept; mi_decl_export void mi_thread_init(void) mi_attr_noexcept; mi_decl_export void mi_thread_done(void) mi_attr_noexcept; mi_decl_export void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept; mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept; // ------------------------------------------------------------------------------------- // Aligned allocation // Note that `alignment` always follows `size` for consistency with unaligned // allocation, but unfortunately this differs from `posix_memalign` and `aligned_alloc`. 
// ------------------------------------------------------------------------------------- mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2) mi_attr_alloc_align(3); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2); mi_decl_nodiscard mi_decl_export void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(2) mi_attr_alloc_align(3); mi_decl_nodiscard mi_decl_export void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(2); // ------------------------------------------------------------------------------------- // Heaps: first-class, but can only allocate from the same thread that created it. 
// ------------------------------------------------------------------------------------- struct mi_heap_s; typedef struct mi_heap_s mi_heap_t; mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new(void); mi_decl_export void mi_heap_delete(mi_heap_t* heap); mi_decl_export void mi_heap_destroy(mi_heap_t* heap); mi_decl_export mi_heap_t* mi_heap_set_default(mi_heap_t* heap); mi_decl_export mi_heap_t* mi_heap_get_default(void); mi_decl_export mi_heap_t* mi_heap_get_backing(void); mi_decl_export void mi_heap_collect(mi_heap_t* heap, bool force) mi_attr_noexcept; mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); mi_decl_nodiscard mi_decl_export void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(3); mi_decl_nodiscard mi_decl_export void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_alloc_size2(3,4); mi_decl_nodiscard mi_decl_export void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(3); mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept mi_attr_malloc; mi_decl_nodiscard mi_decl_export 
mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept mi_attr_malloc; mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept mi_attr_malloc; mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(3); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(3); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3) mi_attr_alloc_align(4); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3); mi_decl_nodiscard mi_decl_export void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(3) mi_attr_alloc_align(4); mi_decl_nodiscard mi_decl_export void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(3); // -------------------------------------------------------------------------------- // Zero initialized 
re-allocation. // Only valid on memory that was originally allocated with zero initialization too. // e.g. `mi_calloc`, `mi_zalloc`, `mi_zalloc_aligned` etc. // see // -------------------------------------------------------------------------------- mi_decl_nodiscard mi_decl_export void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(2); mi_decl_nodiscard mi_decl_export void* mi_recalloc(void* p, size_t newcount, size_t size) mi_attr_noexcept mi_attr_alloc_size2(2,3); mi_decl_nodiscard mi_decl_export void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(2) mi_attr_alloc_align(3); mi_decl_nodiscard mi_decl_export void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(2); mi_decl_nodiscard mi_decl_export void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept mi_attr_alloc_size2(2,3) mi_attr_alloc_align(4); mi_decl_nodiscard mi_decl_export void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size2(2,3); mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(3); mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t newcount, size_t size) mi_attr_noexcept mi_attr_alloc_size2(3,4); mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(3) mi_attr_alloc_align(4); mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(3); mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept 
mi_attr_alloc_size2(3,4) mi_attr_alloc_align(5); mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size2(3,4); // ------------------------------------------------------ // Analysis // ------------------------------------------------------ mi_decl_export bool mi_heap_contains_block(mi_heap_t* heap, const void* p); mi_decl_export bool mi_heap_check_owned(mi_heap_t* heap, const void* p); mi_decl_export bool mi_check_owned(const void* p); // An area of heap space contains blocks of a single size. typedef struct mi_heap_area_s { void* blocks; // start of the area containing heap blocks size_t reserved; // bytes reserved for this area (virtual) size_t committed; // current available bytes for this area size_t used; // number of allocated blocks size_t block_size; // size in bytes of each block size_t full_block_size; // size in bytes of a full block including padding and metadata. 
int heap_tag; // heap tag associated with this area } mi_heap_area_t; typedef bool (mi_cdecl mi_block_visit_fun)(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg); mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); // Experimental mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept; mi_decl_nodiscard mi_decl_export bool mi_is_redirected(void) mi_attr_noexcept; mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept; mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept; mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept; mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept; mi_decl_export void mi_debug_show_arenas(void) mi_attr_noexcept; mi_decl_export void mi_arenas_print(void) mi_attr_noexcept; // Experimental: heaps associated with specific memory arena's typedef int mi_arena_id_t; mi_decl_export void* mi_arena_area(mi_arena_id_t arena_id, size_t* size); mi_decl_export int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; mi_decl_export int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; mi_decl_export bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; #if MI_MALLOC_VERSION >= 182 // Create a heap that only allocates in the specified arena mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id); #endif // 
Experimental: allow sub-processes whose memory areas stay separated (and no reclamation between them) // Used for example for separate interpreters in one process. typedef void* mi_subproc_id_t; mi_decl_export mi_subproc_id_t mi_subproc_main(void); mi_decl_export mi_subproc_id_t mi_subproc_new(void); mi_decl_export void mi_subproc_delete(mi_subproc_id_t subproc); mi_decl_export void mi_subproc_add_current_thread(mi_subproc_id_t subproc); // this should be called right after a thread is created (and no allocation has taken place yet) // Experimental: visit abandoned heap areas (that are not owned by a specific heap) mi_decl_export bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); // Experimental: objects followed by a guard page. // A sample rate of 0 disables guarded objects, while 1 uses a guard page for every object. // A seed of 0 uses a random start point. Only objects within the size bound are eligible for guard pages. mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t sample_rate, size_t seed); mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, size_t max); // Experimental: communicate that the thread is part of a threadpool mi_decl_export void mi_thread_set_in_threadpool(void) mi_attr_noexcept; // Experimental: create a new heap with a specified heap tag. Set `allow_destroy` to false to allow the thread // to reclaim abandoned memory (with a compatible heap_tag and arena_id) but in that case `mi_heap_destroy` will // fall back to `mi_heap_delete`.
mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id);

// deprecated
mi_decl_export int  mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept;
mi_decl_export void mi_collect_reduce(size_t target_thread_owned) mi_attr_noexcept;


// ------------------------------------------------------
// Convenience
// ------------------------------------------------------

// Typed wrappers: each macro passes `sizeof(tp)` (and the element count `n`
// where present) to the matching allocation function and casts the result to `tp*`.
#define mi_malloc_tp(tp)                ((tp*)mi_malloc(sizeof(tp)))
#define mi_zalloc_tp(tp)                ((tp*)mi_zalloc(sizeof(tp)))
#define mi_calloc_tp(tp,n)              ((tp*)mi_calloc(n,sizeof(tp)))
#define mi_mallocn_tp(tp,n)             ((tp*)mi_mallocn(n,sizeof(tp)))
#define mi_reallocn_tp(p,tp,n)          ((tp*)mi_reallocn(p,n,sizeof(tp)))
#define mi_recalloc_tp(p,tp,n)          ((tp*)mi_recalloc(p,n,sizeof(tp)))

// The same wrappers for allocation in an explicit heap `hp`.
#define mi_heap_malloc_tp(hp,tp)        ((tp*)mi_heap_malloc(hp,sizeof(tp)))
#define mi_heap_zalloc_tp(hp,tp)        ((tp*)mi_heap_zalloc(hp,sizeof(tp)))
#define mi_heap_calloc_tp(hp,tp,n)      ((tp*)mi_heap_calloc(hp,n,sizeof(tp)))
#define mi_heap_mallocn_tp(hp,tp,n)     ((tp*)mi_heap_mallocn(hp,n,sizeof(tp)))
#define mi_heap_reallocn_tp(hp,p,tp,n)  ((tp*)mi_heap_reallocn(hp,p,n,sizeof(tp)))
#define mi_heap_recalloc_tp(hp,p,tp,n)  ((tp*)mi_heap_recalloc(hp,p,n,sizeof(tp)))


// ------------------------------------------------------
// Options
// ------------------------------------------------------

// Identifiers of the runtime options. The enumerators are implicitly numbered
// in declaration order, so their order must not change; `_mi_option_last`
// terminates the list and the legacy names after it are aliases.
typedef enum mi_option_e {
  // stable options
  mi_option_show_errors,                // print error messages
  mi_option_show_stats,                 // print statistics on termination
  mi_option_verbose,                    // print verbose messages

  // advanced options
  mi_option_eager_commit,               // eager commit segments? (after `eager_commit_delay` segments) (=1)
  mi_option_arena_eager_commit,         // eager commit arenas? Use 2 to enable just on overcommit systems (=2)
  mi_option_purge_decommits,            // should a memory purge decommit? (=1). Set to 0 to use memory reset on a purge (instead of decommit)
  mi_option_allow_large_os_pages,       // allow large (2 or 4 MiB) OS pages, implies eager commit. If false, also disables THP for the process.
  mi_option_reserve_huge_os_pages,      // reserve N huge OS pages (1GiB pages) at startup
  mi_option_reserve_huge_os_pages_at,   // reserve huge OS pages at a specific NUMA node
  mi_option_reserve_os_memory,          // reserve specified amount of OS memory in an arena at startup (internally, this value is in KiB; use `mi_option_get_size`)
  mi_option_deprecated_segment_cache,
  mi_option_deprecated_page_reset,
  mi_option_abandoned_page_purge,       // immediately purge delayed purges on thread termination
  mi_option_deprecated_segment_reset,
  mi_option_eager_commit_delay,         // the first N segments per thread are not eagerly committed (but per page in the segment on demand)
  mi_option_purge_delay,                // memory purging is delayed by N milliseconds; use 0 for immediate purging or -1 for no purging at all. (=10)
  mi_option_use_numa_nodes,             // 0 = use all available numa nodes, otherwise use at most N nodes.
  mi_option_disallow_os_alloc,          // 1 = do not use OS memory for allocation (but only programmatically reserved arenas)
  mi_option_os_tag,                     // tag used for OS logging (macOS only for now) (=100)
  mi_option_max_errors,                 // issue at most N error messages
  mi_option_max_warnings,               // issue at most N warning messages
  mi_option_max_segment_reclaim,        // max. percentage of the abandoned segments that can be reclaimed per try (=10%)
  mi_option_destroy_on_exit,            // if set, release all memory on exit; sometimes used for dynamic unloading but can be unsafe
  mi_option_arena_reserve,              // initial memory size for arena reservation (= 1 GiB on 64-bit) (internally, this value is in KiB; use `mi_option_get_size`)
  mi_option_arena_purge_mult,           // multiplier for `purge_delay` for the purging delay for arenas (=10)
  mi_option_purge_extend_delay,
  mi_option_abandoned_reclaim_on_free,  // allow reclaiming an abandoned segment on a free (=1)
  mi_option_disallow_arena_alloc,       // 1 = do not use arenas for allocation (except if using specific arena ids)
  mi_option_retry_on_oom,               // retry on out-of-memory for N milliseconds (=400), set to 0 to disable retries. (only on windows)
  mi_option_visit_abandoned,            // allow visiting heap blocks from abandoned threads (=0)
  mi_option_guarded_min,                // only used when building with MI_GUARDED: minimal rounded object size for guarded objects (=0)
  mi_option_guarded_max,                // only used when building with MI_GUARDED: maximal rounded object size for guarded objects (=0)
  mi_option_guarded_precise,            // disregard minimal alignment requirement to always place guarded blocks exactly in front of a guard page (=0)
  mi_option_guarded_sample_rate,        // 1 out of N allocations in the min/max range will be guarded (=1000)
  mi_option_guarded_sample_seed,        // can be set to allow for a (more) deterministic re-execution when a guard page is triggered (=0)
  mi_option_target_segments_per_thread, // experimental (=0)
  mi_option_generic_collect,            // collect heaps every N (=10000) generic allocation calls
  _mi_option_last,

  // legacy option names
  mi_option_large_os_pages       = mi_option_allow_large_os_pages,
  mi_option_eager_region_commit  = mi_option_arena_eager_commit,
  mi_option_reset_decommits      = mi_option_purge_decommits,
  mi_option_reset_delay          = mi_option_purge_delay,
  mi_option_abandoned_page_reset = mi_option_abandoned_page_purge,
  mi_option_limit_os_alloc       = mi_option_disallow_os_alloc
} mi_option_t;


// Query and update option values.
mi_decl_nodiscard mi_decl_export bool   mi_option_is_enabled(mi_option_t option);
mi_decl_export void                     mi_option_enable(mi_option_t option);
mi_decl_export void                     mi_option_disable(mi_option_t option);
mi_decl_export void                     mi_option_set_enabled(mi_option_t option, bool enable);
mi_decl_export void                     mi_option_set_enabled_default(mi_option_t option, bool enable);

mi_decl_nodiscard mi_decl_export long   mi_option_get(mi_option_t option);
mi_decl_nodiscard mi_decl_export long   mi_option_get_clamp(mi_option_t option, long min, long max);
mi_decl_nodiscard mi_decl_export size_t mi_option_get_size(mi_option_t option);
mi_decl_export void                     mi_option_set(mi_option_t option, long value);
mi_decl_export void                     mi_option_set_default(mi_option_t option, long value);


// -------------------------------------------------------------------------------------------------------
// "mi" prefixed implementations of various posix, Unix, Windows, and C++ allocation functions.
// (This can be convenient when providing overrides of these functions as done in `mimalloc-override.h`.)
// note: we use `mi_cfree` as "checked free" and it checks if the pointer is in our heap before free-ing.
// ------------------------------------------------------------------------------------------------------- mi_decl_export void mi_cfree(void* p) mi_attr_noexcept; mi_decl_export void* mi__expand(void* p, size_t newsize) mi_attr_noexcept; mi_decl_nodiscard mi_decl_export size_t mi_malloc_size(const void* p) mi_attr_noexcept; mi_decl_nodiscard mi_decl_export size_t mi_malloc_good_size(size_t size) mi_attr_noexcept; mi_decl_nodiscard mi_decl_export size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept; mi_decl_export int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept; mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(1); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_valloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_pvalloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(1); mi_decl_nodiscard mi_decl_export void* mi_reallocarray(void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_alloc_size2(2,3); mi_decl_nodiscard mi_decl_export int mi_reallocarr(void* p, size_t count, size_t size) mi_attr_noexcept; mi_decl_nodiscard mi_decl_export void* mi_aligned_recalloc(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept; mi_decl_nodiscard mi_decl_export void* mi_aligned_offset_recalloc(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept; mi_decl_nodiscard mi_decl_export mi_decl_restrict unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept mi_attr_malloc; mi_decl_nodiscard mi_decl_export mi_decl_restrict unsigned char* mi_mbsdup(const unsigned char* s) 
mi_attr_noexcept mi_attr_malloc; mi_decl_export int mi_dupenv_s(char** buf, size_t* size, const char* name) mi_attr_noexcept; mi_decl_export int mi_wdupenv_s(unsigned short** buf, size_t* size, const unsigned short* name) mi_attr_noexcept; mi_decl_export void mi_free_size(void* p, size_t size) mi_attr_noexcept; mi_decl_export void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_attr_noexcept; mi_decl_export void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept; // The `mi_new` wrappers implement C++ semantics on out-of-memory instead of directly returning `NULL`. // (and call `std::get_new_handler` and potentially raise a `std::bad_alloc` exception). mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new(size_t size) mi_attr_malloc mi_attr_alloc_size(1); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_aligned(size_t size, size_t alignment) mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_nothrow(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_aligned_nothrow(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_n(size_t count, size_t size) mi_attr_malloc mi_attr_alloc_size2(1, 2); mi_decl_nodiscard mi_decl_export void* mi_new_realloc(void* p, size_t newsize) mi_attr_alloc_size(2); mi_decl_nodiscard mi_decl_export void* mi_new_reallocn(void* p, size_t newcount, size_t size) mi_attr_alloc_size2(2, 3); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_alloc_new(mi_heap_t* heap, size_t size) mi_attr_malloc mi_attr_alloc_size(2); mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_alloc_new_n(mi_heap_t* heap, size_t count, size_t size) mi_attr_malloc mi_attr_alloc_size2(2, 3); #ifdef __cplusplus } #endif // 
--------------------------------------------------------------------------------------------- // Implement the C++ std::allocator interface for use in STL containers. // (note: see `mimalloc-new-delete.h` for overriding the new/delete operators globally) // --------------------------------------------------------------------------------------------- #ifdef __cplusplus #include // std::size_t #include // PTRDIFF_MAX #if (__cplusplus >= 201103L) || (_MSC_VER > 1900) // C++11 #include // std::true_type #include // std::forward #endif template struct _mi_stl_allocator_common { typedef T value_type; typedef std::size_t size_type; typedef std::ptrdiff_t difference_type; typedef value_type& reference; typedef value_type const& const_reference; typedef value_type* pointer; typedef value_type const* const_pointer; #if ((__cplusplus >= 201103L) || (_MSC_VER > 1900)) // C++11 using propagate_on_container_copy_assignment = std::true_type; using propagate_on_container_move_assignment = std::true_type; using propagate_on_container_swap = std::true_type; template void construct(U* p, Args&& ...args) { ::new(p) U(std::forward(args)...); } template void destroy(U* p) mi_attr_noexcept { p->~U(); } #else void construct(pointer p, value_type const& val) { ::new(p) value_type(val); } void destroy(pointer p) { p->~value_type(); } #endif size_type max_size() const mi_attr_noexcept { return (PTRDIFF_MAX/sizeof(value_type)); } pointer address(reference x) const { return &x; } const_pointer address(const_reference x) const { return &x; } }; template struct mi_stl_allocator : public _mi_stl_allocator_common { using typename _mi_stl_allocator_common::size_type; using typename _mi_stl_allocator_common::value_type; using typename _mi_stl_allocator_common::pointer; template struct rebind { typedef mi_stl_allocator other; }; mi_stl_allocator() mi_attr_noexcept = default; mi_stl_allocator(const mi_stl_allocator&) mi_attr_noexcept = default; template mi_stl_allocator(const mi_stl_allocator&) 
mi_attr_noexcept { } mi_stl_allocator select_on_container_copy_construction() const { return *this; } void deallocate(T* p, size_type) { mi_free(p); } #if (__cplusplus >= 201703L) // C++17 mi_decl_nodiscard T* allocate(size_type count) { return static_cast(mi_new_n(count, sizeof(T))); } mi_decl_nodiscard T* allocate(size_type count, const void*) { return allocate(count); } #else mi_decl_nodiscard pointer allocate(size_type count, const void* = 0) { return static_cast(mi_new_n(count, sizeof(value_type))); } #endif #if ((__cplusplus >= 201103L) || (_MSC_VER > 1900)) // C++11 using is_always_equal = std::true_type; #endif }; template bool operator==(const mi_stl_allocator& , const mi_stl_allocator& ) mi_attr_noexcept { return true; } template bool operator!=(const mi_stl_allocator& , const mi_stl_allocator& ) mi_attr_noexcept { return false; } #if (__cplusplus >= 201103L) || (_MSC_VER >= 1900) // C++11 #define MI_HAS_HEAP_STL_ALLOCATOR 1 #include // std::shared_ptr // Common base class for STL allocators in a specific heap template struct _mi_heap_stl_allocator_common : public _mi_stl_allocator_common { using typename _mi_stl_allocator_common::size_type; using typename _mi_stl_allocator_common::value_type; using typename _mi_stl_allocator_common::pointer; _mi_heap_stl_allocator_common(mi_heap_t* hp) : heap(hp, [](mi_heap_t*) {}) {} /* will not delete nor destroy the passed in heap */ #if (__cplusplus >= 201703L) // C++17 mi_decl_nodiscard T* allocate(size_type count) { return static_cast(mi_heap_alloc_new_n(this->heap.get(), count, sizeof(T))); } mi_decl_nodiscard T* allocate(size_type count, const void*) { return allocate(count); } #else mi_decl_nodiscard pointer allocate(size_type count, const void* = 0) { return static_cast(mi_heap_alloc_new_n(this->heap.get(), count, sizeof(value_type))); } #endif #if ((__cplusplus >= 201103L) || (_MSC_VER > 1900)) // C++11 using is_always_equal = std::false_type; #endif void collect(bool force) { mi_heap_collect(this->heap.get(), 
force); } template bool is_equal(const _mi_heap_stl_allocator_common& x) const { return (this->heap == x.heap); } protected: std::shared_ptr heap; template friend struct _mi_heap_stl_allocator_common; _mi_heap_stl_allocator_common() { mi_heap_t* hp = mi_heap_new(); this->heap.reset(hp, (_mi_destroy ? &heap_destroy : &heap_delete)); /* calls heap_delete/destroy when the refcount drops to zero */ } _mi_heap_stl_allocator_common(const _mi_heap_stl_allocator_common& x) mi_attr_noexcept : heap(x.heap) { } template _mi_heap_stl_allocator_common(const _mi_heap_stl_allocator_common& x) mi_attr_noexcept : heap(x.heap) { } private: static void heap_delete(mi_heap_t* hp) { if (hp != NULL) { mi_heap_delete(hp); } } static void heap_destroy(mi_heap_t* hp) { if (hp != NULL) { mi_heap_destroy(hp); } } }; // STL allocator allocation in a specific heap template struct mi_heap_stl_allocator : public _mi_heap_stl_allocator_common { using typename _mi_heap_stl_allocator_common::size_type; mi_heap_stl_allocator() : _mi_heap_stl_allocator_common() { } // creates fresh heap that is deleted when the destructor is called mi_heap_stl_allocator(mi_heap_t* hp) : _mi_heap_stl_allocator_common(hp) { } // no delete nor destroy on the passed in heap template mi_heap_stl_allocator(const mi_heap_stl_allocator& x) mi_attr_noexcept : _mi_heap_stl_allocator_common(x) { } mi_heap_stl_allocator select_on_container_copy_construction() const { return *this; } void deallocate(T* p, size_type) { mi_free(p); } template struct rebind { typedef mi_heap_stl_allocator other; }; }; template bool operator==(const mi_heap_stl_allocator& x, const mi_heap_stl_allocator& y) mi_attr_noexcept { return (x.is_equal(y)); } template bool operator!=(const mi_heap_stl_allocator& x, const mi_heap_stl_allocator& y) mi_attr_noexcept { return (!x.is_equal(y)); } // STL allocator allocation in a specific heap, where `free` does nothing and // the heap is destroyed in one go on destruction -- use with care! 
template struct mi_heap_destroy_stl_allocator : public _mi_heap_stl_allocator_common { using typename _mi_heap_stl_allocator_common::size_type; mi_heap_destroy_stl_allocator() : _mi_heap_stl_allocator_common() { } // creates fresh heap that is destroyed when the destructor is called mi_heap_destroy_stl_allocator(mi_heap_t* hp) : _mi_heap_stl_allocator_common(hp) { } // no delete nor destroy on the passed in heap template mi_heap_destroy_stl_allocator(const mi_heap_destroy_stl_allocator& x) mi_attr_noexcept : _mi_heap_stl_allocator_common(x) { } mi_heap_destroy_stl_allocator select_on_container_copy_construction() const { return *this; } void deallocate(T*, size_type) { /* do nothing as we destroy the heap on destruct. */ } template struct rebind { typedef mi_heap_destroy_stl_allocator other; }; }; template bool operator==(const mi_heap_destroy_stl_allocator& x, const mi_heap_destroy_stl_allocator& y) mi_attr_noexcept { return (x.is_equal(y)); } template bool operator!=(const mi_heap_destroy_stl_allocator& x, const mi_heap_destroy_stl_allocator& y) mi_attr_noexcept { return (!x.is_equal(y)); } #endif // C++11 #endif // __cplusplus #endif ================================================ FILE: third-party/mimalloc/mimalloc.pc.in ================================================ prefix=@CMAKE_INSTALL_PREFIX@ libdir=@mi_pc_libdir@ includedir=@mi_pc_includedir@ Name: @PROJECT_NAME@ Description: A compact general purpose allocator with excellent performance Version: @PACKAGE_VERSION@ URL: https://github.com/microsoft/mimalloc/ Libs: -L${libdir} -l@mi_libname@ Libs.private: @mi_pc_libraries@ Cflags: -I${includedir} ================================================ FILE: third-party/mimalloc/readme.md ================================================ [](https://dev.azure.com/Daan0324/mimalloc/_build?definitionId=1&_a=summary) # mimalloc   mimalloc (pronounced "me-malloc") is a general purpose allocator with excellent [performance](#performance) characteristics. 
Initially developed by Daan Leijen for the runtime systems of the [Koka](https://koka-lang.github.io) and [Lean](https://github.com/leanprover/lean) languages. Latest release: `v3.0.2` (beta) (2025-03-06). Latest v2 release: `v2.2.2` (2025-03-06). Latest v1 release: `v1.9.2` (2025-03-06). mimalloc is a drop-in replacement for `malloc` and can be used in other programs without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) you can use it as: ``` > LD_PRELOAD=/usr/lib/libmimalloc.so myprogram ``` It also includes a way to dynamically override the default allocator in [Windows](#override_on_windows). Notable aspects of the design include: - __small and consistent__: the library is about 10k LOC using simple and consistent data structures. This makes it very suitable to integrate and adapt in other projects. For runtime systems it provides hooks for a monotonic _heartbeat_ and deferred freeing (for bounded worst-case times with reference counting). Partly due to its simplicity, mimalloc has been ported to many systems (Windows, macOS, Linux, WASM, various BSD's, Haiku, MUSL, etc) and has excellent support for dynamic overriding. At the same time, it is an industrial strength allocator that runs (very) large scale distributed services on thousands of machines with excellent worst case latencies. - __free list sharding__: instead of one big free list (per size class) we have many smaller lists per "mimalloc page" which reduces fragmentation and increases locality -- things that are allocated close in time get allocated close in memory. (A mimalloc page contains blocks of one size class and is usually 64KiB on a 64-bit system). - __free list multi-sharding__: the big idea! Not only do we shard the free list per mimalloc page, but for each page we have multiple free lists. In particular, there is one list for thread-local `free` operations, and another one for concurrent `free` operations.
Free-ing from another thread can now be a single CAS without needing sophisticated coordination between threads. Since there will be thousands of separate free lists, contention is naturally distributed over the heap, and the chance of contending on a single location will be low -- this is quite similar to randomized algorithms like skip lists where adding a random oracle removes the need for a more complex algorithm. - __eager page purging__: when a "page" becomes empty (with increased chance due to free list sharding) the memory is marked to the OS as unused (reset or decommitted) reducing (real) memory pressure and fragmentation, especially in long running programs. - __secure__: _mimalloc_ can be built in secure mode, adding guard pages, randomized allocation, encrypted free lists, etc. to protect against various heap vulnerabilities. The performance penalty is usually around 10% on average over our benchmarks. - __first-class heaps__: efficiently create and use multiple heaps to allocate across different regions. A heap can be destroyed at once instead of deallocating each object separately. - __bounded__: it does not suffer from _blowup_ \[1\], has bounded worst-case allocation times (_wcat_) (upto OS primitives), bounded space overhead (~0.2% meta-data, with low internal fragmentation), and has no internal points of contention using only atomic operations. - __fast__: In our benchmarks (see [below](#performance)), _mimalloc_ outperforms other leading allocators (_jemalloc_, _tcmalloc_, _Hoard_, etc), and often uses less memory. A nice property is that it does consistently well over a wide range of benchmarks. There is also good huge OS page support for larger server programs. The [documentation](https://microsoft.github.io/mimalloc) gives a full overview of the API. 
You can read more on the design of _mimalloc_ in the [technical report](https://www.microsoft.com/en-us/research/publication/mimalloc-free-list-sharding-in-action) which also has detailed benchmark results. Enjoy! ### Branches * `master`: latest stable release (still based on `dev2`). * `dev`: development branch for mimalloc v1. Use this branch for submitting PR's. * `dev2`: development branch for mimalloc v2. This branch is downstream of `dev` (and is essentially equal to `dev` except for `src/segment.c`). Uses larger sliced segments to manage mimalloc pages that can reduce fragmentation. * `dev3`: development branch for mimalloc v3-beta. This branch is downstream of `dev`. This version simplifies the lock-free ownership of previous versions, has no thread-local segments any more. This improves sharing of memory between threads, and on certain large workloads may use less memory with less fragmentation. ### Releases * 2025-03-06, `v1.9.2`, `v2.2.2`, `v3.0.2-beta`: Various small bug and build fixes. Add `mi_options_print`, `mi_arenas_print`, and the experimental `mi_stat_get` and `mi_stat_get_json`. Add `mi_thread_set_in_threadpool` and `mi_heap_set_numa_affinity` (v3 only). Add vcpkg portfile. Upgrade mimalloc-redirect to v1.3.2. `MI_OPT_ARCH` is off by default now but still assumes armv8.1-a on arm64 for fast atomic operations. Add QNX support. * 2025-01-03, `v1.8.9`, `v2.1.9`, `v3.0.1-alpha`: Interim release. Support Windows arm64. New [guarded](#guarded) build that can place OS guard pages behind objects to catch buffer overflows as they occur. 
Many small fixes: build on Windows arm64, cygwin, riscV, and dragonfly; fix Windows static library initialization to account for thread local destructors (in Rust/C++); macOS tag change; macOS TLS slot fix; improve stats; consistent `mimalloc.dll` on Windows (instead of `mimalloc-override.dll`); fix mimalloc-redirect on Win11 H2; add 0-byte to canary; upstream CPython fixes; reduce .bss size; allow fixed TLS slot on Windows for improved performance. * 2024-05-21, `v1.8.7`, `v2.1.7`: Fix build issues on less common platforms. Started upstreaming patches from the CPython [integration](https://github.com/python/cpython/issues/113141#issuecomment-2119255217). Upstream `vcpkg` patches. * 2024-05-13, `v1.8.6`, `v2.1.6`: Fix build errors on various (older) platforms. Refactored aligned allocation. * 2024-04-22, `v1.8.4`, `v2.1.4`: Fixes various bugs and build issues. Add `MI_LIBC_MUSL` cmake flag for musl builds. Free-ing code is refactored into a separate module (`free.c`). Mimalloc page info is simplified with the block size directly available (and new `block_size_shift` to improve aligned block free-ing). New approach to collection of abandoned segments: When a thread terminates the segments it owns are abandoned (containing still live objects) and these can be reclaimed by other threads. We no longer use a list of abandoned segments but this is now done using bitmaps in arena's which is more concurrent (and more aggressive). Abandoned memory can now also be reclaimed if a thread frees an object in an abandoned page (which can be disabled using `mi_option_abandoned_reclaim_on_free`). The option `mi_option_max_segment_reclaim` gives a maximum percentage of abandoned segments that can be reclaimed per try (=10%). * 2023-04-24, `v1.8.2`, `v2.1.2`: Fixes build issues on freeBSD, musl, and C17 (UE 5.1.1). 
Reduce code size/complexity by removing regions and segment-cache's and only use arenas with improved memory purging -- this may improve memory usage as well for larger services. Renamed options for consistency. Improved Valgrind and ASAN checking. * 2023-04-03, `v1.8.1`, `v2.1.1`: Fixes build issues on some platforms. * 2023-03-29, `v1.8.0`, `v2.1.0`: Improved support for dynamic overriding on Windows 11. Improved tracing precision with [asan](#asan) and [Valgrind](#valgrind), and added Windows event tracing [ETW](#ETW) (contributed by Xinglong He). Created an OS abstraction layer to make it easier to port and separate platform dependent code (in `src/prim`). Fixed C++ STL compilation on older Microsoft C++ compilers, and various small bug fixes. * 2022-12-23, `v1.7.9`, `v2.0.9`: Supports building with [asan](#asan) and improved [Valgrind](#valgrind) support. Support arbitrary large alignments (in particular for `std::pmr` pools). Added C++ STL allocators attached to a specific heap (thanks @vmarkovtsev). Heap walks now visit all objects (including huge objects). Support Windows nano server containers (by Johannes Schindelin, @dscho). Various small bug fixes. * 2022-11-03, `v1.7.7`, `v2.0.7`: Initial support for [Valgrind](#valgrind) for leak testing and heap block overflow detection. Initial support for attaching heaps to a specific memory area (only in v2). Fix `realloc` behavior for zero size blocks, remove restriction to integral multiple of the alignment in `alloc_align`, improved aligned allocation performance, reduced contention with many threads on few processors (thank you @dposluns!), vs2022 support, support `pkg-config`.
* 2022-04-14, `v1.7.6`, `v2.0.6`: fix fallback path for aligned OS allocation on Windows, improve Windows aligned allocation even when compiling with older SDK's, fix dynamic overriding on macOS Monterey, fix MSVC C++ dynamic overriding, fix warnings under Clang 14, improve performance if many OS threads are created and destroyed, fix statistics for large object allocations, using MIMALLOC_VERBOSE=1 has no maximum on the number of error messages, various small fixes. * 2022-02-14, `v1.7.5`, `v2.0.5` (alpha): fix malloc override on Windows 11, fix compilation with musl, potentially reduced committed memory, add `bin/minject` for Windows, improved wasm support, faster aligned allocation, various small fixes. * [Older release notes](#older-release-notes) Special thanks to: * [David Carlier](https://devnexen.blogspot.com/) (@devnexen) for his many contributions, and making mimalloc work better on many less common operating systems, like Haiku, Dragonfly, etc. * Mary Feofanova (@mary3000), Evgeniy Moiseenko, and Manuel Pöter (@mpoeter) for making mimalloc TSAN checkable, and finding memory model bugs using the [genMC] model checker. * Weipeng Liu (@pongba), Zhuowei Li, Junhua Wang, and Jakub Szymanski, for their early support of mimalloc and deployment at large scale services, leading to many improvements in the mimalloc algorithms for large workloads. * Jason Gibson (@jasongibson) for exhaustive testing on large scale workloads and server environments, and finding complex bugs in (early versions of) `mimalloc`. * Manuel Pöter (@mpoeter) and Sam Gross(@colesbury) for finding an ABA concurrency issue in abandoned segment reclamation. Sam also created the [no GIL](https://github.com/colesbury/nogil) Python fork which uses mimalloc internally. [genMC]: https://plv.mpi-sws.org/genmc/ ### Usage mimalloc is used in various large scale low-latency services and programs, for example: # Building ## Windows Open `ide/vs2022/mimalloc.sln` in Visual Studio 2022 and build. 
The `mimalloc-lib` project builds a static library (in `out/msvc-x64`), while the `mimalloc-override-dll` project builds DLL for overriding malloc in the entire program. ## Linux, macOS, BSD, etc. We use [`cmake`](https://cmake.org) as the build system: ``` > mkdir -p out/release > cd out/release > cmake ../.. > make ``` This builds the library as a shared (dynamic) library (`.so` or `.dylib`), a static library (`.a`), and as a single object file (`.o`). `> sudo make install` (install the library and header files in `/usr/local/lib` and `/usr/local/include`) You can build the debug version which does many internal checks and maintains detailed statistics as: ``` > mkdir -p out/debug > cd out/debug > cmake -DCMAKE_BUILD_TYPE=Debug ../.. > make ``` This will name the shared library as `libmimalloc-debug.so`. Finally, you can build a _secure_ version that uses guard pages, encrypted free lists, etc., as: ``` > mkdir -p out/secure > cd out/secure > cmake -DMI_SECURE=ON ../.. > make ``` This will name the shared library as `libmimalloc-secure.so`. Use `cmake ../.. -LH` to see all the available build options. The examples use the default compiler. If you like to use another, use: ``` > CC=clang CXX=clang++ cmake ../.. ``` ## Cmake with Visual Studio You can also use cmake on Windows. Open a Visual Studio 2022 development prompt and invoke `cmake` with the right [generator](https://cmake.org/cmake/help/latest/generator/Visual%20Studio%2017%202022.html) and architecture, like: ``` > cmake ..\.. -G "Visual Studio 17 2022" -A x64 -DMI_OVERRIDE=ON ``` The cmake build type is specified when actually building, for example: ``` > cmake --build . --config=Release ``` You can also install the [LLVM toolset](https://learn.microsoft.com/en-us/cpp/build/clang-support-msbuild?view=msvc-170#install-1) on Windows to build with the `clang-cl` compiler directly: ``` > cmake ../.. 
-G "Visual Studio 17 2022" -T ClangCl ``` ## Single Source You can also directly build the single `src/static.c` file as part of your project without needing `cmake` at all. Make sure to also add the mimalloc `include` directory to the include path. # Using the Library The preferred usage is including `<mimalloc.h>`, linking with the shared- or static library, and using the `mi_malloc` API exclusively for allocation. For example, ``` > gcc -o myprogram -lmimalloc myfile.c ``` mimalloc uses only safe OS calls (`mmap` and `VirtualAlloc`) and can co-exist with other allocators linked to the same program. If you use `cmake`, you can simply use: ``` find_package(mimalloc 1.8 REQUIRED) ``` in your `CMakeLists.txt` to find a locally installed mimalloc. Then use either: ``` target_link_libraries(myapp PUBLIC mimalloc) ``` to link with the shared (dynamic) library, or: ``` target_link_libraries(myapp PUBLIC mimalloc-static) ``` to link with the static library. See `test\CMakeLists.txt` for an example. For best performance in C++ programs, it is also recommended to override the global `new` and `delete` operators. For convenience, mimalloc provides [`mimalloc-new-delete.h`](include/mimalloc-new-delete.h) which does this for you -- just include it in a single(!) source file in your project. In C++, mimalloc also provides the `mi_stl_allocator` struct which implements the `std::allocator` interface.
You can pass environment variables to print verbose messages (`MIMALLOC_VERBOSE=1`) and statistics (`MIMALLOC_SHOW_STATS=1`) (in the debug version): ``` > env MIMALLOC_SHOW_STATS=1 ./cfrac 175451865205073170563711388363 175451865205073170563711388363 = 374456281610909315237213 * 468551 heap stats: peak total freed unit normal 2: 16.4 kb 17.5 mb 17.5 mb 16 b ok normal 3: 16.3 kb 15.2 mb 15.2 mb 24 b ok normal 4: 64 b 4.6 kb 4.6 kb 32 b ok normal 5: 80 b 118.4 kb 118.4 kb 40 b ok normal 6: 48 b 48 b 48 b 48 b ok normal 17: 960 b 960 b 960 b 320 b ok heap stats: peak total freed unit normal: 33.9 kb 32.8 mb 32.8 mb 1 b ok huge: 0 b 0 b 0 b 1 b ok total: 33.9 kb 32.8 mb 32.8 mb 1 b ok malloc requested: 32.8 mb committed: 58.2 kb 58.2 kb 58.2 kb 1 b ok reserved: 2.0 mb 2.0 mb 2.0 mb 1 b ok reset: 0 b 0 b 0 b 1 b ok segments: 1 1 1 -abandoned: 0 pages: 6 6 6 -abandoned: 0 mmaps: 3 mmap fast: 0 mmap slow: 1 threads: 0 elapsed: 2.022s process: user: 1.781s, system: 0.016s, faults: 756, reclaims: 0, rss: 2.7 mb ``` The above model of using the `mi_` prefixed API is not always possible though in existing programs that already use the standard malloc interface, and another option is to override the standard malloc interface completely and redirect all calls to the _mimalloc_ library instead . ## Environment Options You can set further options either programmatically (using [`mi_option_set`](https://microsoft.github.io/mimalloc/group__options.html)), or via environment variables: - `MIMALLOC_SHOW_STATS=1`: show statistics when the program terminates. - `MIMALLOC_VERBOSE=1`: show verbose messages. - `MIMALLOC_SHOW_ERRORS=1`: show error and warning messages. Advanced options: - `MIMALLOC_ARENA_EAGER_COMMIT=2`: turns on eager commit for the large arenas (usually 1GiB) from which mimalloc allocates segments and pages. Set this to 2 (default) to only enable this on overcommit systems (e.g. Linux). 
Set this to 1 to enable explicitly on other systems as well (like Windows or macOS) which may improve performance (as the whole arena is committed at once). Note that eager commit only increases the commit but not the actual peak resident set (rss) so it is generally ok to enable this. - `MIMALLOC_PURGE_DELAY=N`: the delay in `N` milli-seconds (by default `10`) after which mimalloc will purge OS pages that are not in use. This signals to the OS that the underlying physical memory can be reused which can reduce memory fragmentation especially in long running (server) programs. Setting `N` to `0` purges immediately when a page becomes unused which can improve memory usage but also decreases performance. Setting `N` to a higher value like `100` can improve performance (sometimes by a lot) at the cost of potentially using more memory at times. Setting it to `-1` disables purging completely. - `MIMALLOC_PURGE_DECOMMITS=1`: By default "purging" memory means unused memory is decommitted (`MEM_DECOMMIT` on Windows, `MADV_DONTNEED` (which decreases rss immediately) on `mmap` systems). Set this to 0 to instead "reset" unused memory on a purge (`MEM_RESET` on Windows, generally `MADV_FREE` (which does not decrease rss immediately) on `mmap` systems). Mimalloc generally does not "free" OS memory but only "purges" OS memory, in other words, it tries to keep virtual address ranges and decommits within those ranges (to make the underlying physical memory available to other processes). Further options for large workloads and services: - `MIMALLOC_USE_NUMA_NODES=N`: pretend there are at most `N` NUMA nodes. If not set, the actual NUMA nodes are detected at runtime. Setting `N` to 1 may avoid problems in some virtual environments.
Also, setting it to a lower number than the actual NUMA nodes is fine and will only cause threads to potentially allocate more memory across actual NUMA nodes (but this can happen in any case as NUMA local allocation is always a best effort but not guaranteed). - `MIMALLOC_ALLOW_LARGE_OS_PAGES=0`: Set to 1 to use large OS pages (2 or 4MiB) when available; for some workloads this can significantly improve performance. When this option is disabled (default), it also disables transparent huge pages (THP) for the process (on Linux and Android). On Linux the default setting is 2 -- this enables the use of large pages through THP only. Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs to explicitly give permissions for large OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]). However, sometimes the OS is very slow to reserve contiguous physical memory for large OS pages so use with care on systems that can have fragmented memory (for that reason, we generally recommend to use `MIMALLOC_RESERVE_HUGE_OS_PAGES` instead whenever possible). - `MIMALLOC_RESERVE_HUGE_OS_PAGES=N`: where `N` is the number of 1GiB _huge_ OS pages. This reserves the huge pages at startup and sometimes this can give a large (latency) performance improvement on big workloads. Usually it is better to not use `MIMALLOC_ALLOW_LARGE_OS_PAGES=1` in combination with this setting. Just like large OS pages, use with care as reserving contiguous physical memory can take a long time when memory is fragmented (but reserving the huge pages is done at startup only once). Note that we usually need to explicitly give permission for huge OS pages (as on [Windows][windows-huge] and [Linux][linux-huge])). 
With huge OS pages, it may be beneficial to set the setting `MIMALLOC_EAGER_COMMIT_DELAY=N` (`N` is 1 by default) to delay the initial `N` segments (of 4MiB) of a thread to not allocate in the huge OS pages; this prevents threads that are short lived and allocate just a little to take up space in the huge OS page area (which cannot be purged as huge OS pages are pinned to physical memory). The huge pages are usually allocated evenly among NUMA nodes. We can use `MIMALLOC_RESERVE_HUGE_OS_PAGES_AT=N` where `N` is the numa node (starting at 0) to allocate all the huge pages at a specific numa node instead. Use caution when using `fork` in combination with either large or huge OS pages: on a fork, the OS uses copy-on-write for all pages in the original process including the huge OS pages. When any memory is now written in that area, the OS will copy the entire 1GiB huge page (or 2MiB large page) which can cause the memory usage to grow in large increments. [linux-huge]: https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/5/html/tuning_and_optimizing_red_hat_enterprise_linux_for_oracle_9i_and_10g_databases/sect-oracle_9i_and_10g_tuning_guide-large_memory_optimization_big_pages_and_huge_pages-configuring_huge_pages_in_red_hat_enterprise_linux_4_or_5 [windows-huge]: https://docs.microsoft.com/en-us/sql/database-engine/configure-windows/enable-the-lock-pages-in-memory-option-windows?view=sql-server-2017 ## Secure Mode _mimalloc_ can be built in secure mode by using the `-DMI_SECURE=ON` flag in `cmake`. This build enables various mitigations to make mimalloc more robust against exploits. In particular: - All internal mimalloc pages are surrounded by guard pages and the heap metadata is behind a guard page as well (so a buffer overflow exploit cannot reach into the metadata).
- All free list pointers are [encoded](https://github.com/microsoft/mimalloc/blob/783e3377f79ee82af43a0793910a9f2d01ac7863/include/mimalloc-internal.h#L396) with per-page keys which are used both to prevent overwrites with a known pointer, as well as to detect heap corruption. - Double free's are detected (and ignored). - The free lists are initialized in a random order and allocation randomly chooses between extension and reuse within a page to mitigate against attacks that rely on a predictable allocation order. Similarly, the larger heap blocks allocated by mimalloc from the OS are also address randomized. As always, evaluate with care as part of an overall security strategy as all of the above are mitigations but not guarantees. ## Debug Mode When _mimalloc_ is built using debug mode (`-DCMAKE_BUILD_TYPE=Debug`), various checks are done at runtime to catch development errors. - Statistics are maintained in detail for each object size. They can be shown using `MIMALLOC_SHOW_STATS=1` at runtime. - All objects have padding at the end to detect (byte precise) heap block overflows. - Double free's, and freeing invalid heap pointers are detected. - Corrupted free-lists and some forms of use-after-free are detected. ## Guarded Mode _mimalloc_ can be built in guarded mode using the `-DMI_GUARDED=ON` flag in `cmake`. This enables placing OS guard pages behind certain object allocations to catch buffer overflows as they occur. This can be invaluable to catch buffer-overflow bugs in large programs. However, it also means that any object allocated with a guard page takes at least 8 KiB memory for the guard page and its alignment. As such, allocating a guard page for every allocation may be too expensive both in terms of memory, and in terms of performance with many system calls. Therefore, there are various environment variables (and options) to tune this: - `MIMALLOC_GUARDED_SAMPLE_RATE=N`: Set the sample rate to `N` (by default 4000).
This mode places a guard page behind every `N` suitable object allocations (per thread). Since the performance in guarded mode without placing guard pages is close to release mode, this can be used to enable guard pages even in production to catch latent buffer overflow bugs. Set the sample rate to `1` to guard every object, and to `0` to place no guard pages at all. - `MIMALLOC_GUARDED_SAMPLE_SEED=N`: Start sampling at `N` (by default random). Can be used to reproduce a buffer overflow if needed. - `MIMALLOC_GUARDED_MIN=N`, `MIMALLOC_GUARDED_MAX=N`: Minimal and maximal _rounded_ object sizes for which a guard page is considered (`0` and `1GiB` respectively). If you suspect a buffer overflow occurs with an object of size 141, set the minimum and maximum to `148` and the sample rate to `1` to have all of those guarded. - `MIMALLOC_GUARDED_PRECISE=1`: If we have an object of size 13, we would usually place it an aligned 16 bytes in front of the guard page. Using `MIMALLOC_GUARDED_PRECISE` places it exactly 13 bytes before a page so that even a 1 byte overflow is detected. This violates the C/C++ minimal alignment guarantees though so use with care. # Overriding Standard Malloc Overriding the standard `malloc` (and `new`) can be done either _dynamically_ or _statically_. ## Dynamic override This is the recommended way to override the standard malloc interface. ### Dynamic Override on Linux, BSD On these ELF-based systems we preload the mimalloc shared library so all calls to the standard `malloc` interface are resolved to the _mimalloc_ library. 
``` > env LD_PRELOAD=/usr/lib/libmimalloc.so myprogram ``` You can set extra environment variables to check that mimalloc is running, like: ``` > env MIMALLOC_VERBOSE=1 LD_PRELOAD=/usr/lib/libmimalloc.so myprogram ``` or run with the debug version to get detailed statistics: ``` > env MIMALLOC_SHOW_STATS=1 LD_PRELOAD=/usr/lib/libmimalloc-debug.so myprogram ``` ### Dynamic Override on MacOS On macOS we can also preload the mimalloc shared library so all calls to the standard `malloc` interface are resolved to the _mimalloc_ library. ``` > env DYLD_INSERT_LIBRARIES=/usr/lib/libmimalloc.dylib myprogram ``` Note that certain security restrictions may apply when doing this from the [shell](https://stackoverflow.com/questions/43941322/dyld-insert-libraries-ignored-when-calling-application-through-bash). ### Dynamic Override on Windows We use a separate redirection DLL to override mimalloc on Windows such that we redirect all malloc/free calls that go through the (dynamic) C runtime allocator, including those from other DLL's or libraries. As it intercepts all allocation calls on a low level, it can be used on large programs that include other 3rd party components. There are four requirements to make the overriding work well: 1. Use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch). 2. Link your program explicitly with the `mimalloc.dll.lib` export library for the `mimalloc.dll`. (which must be compiled with `-DMI_OVERRIDE=ON`, which is the default though). To ensure the `mimalloc.dll` is actually loaded at run-time it is easiest to insert some call to the mimalloc API in the `main` function, like `mi_version()` (or use the `/include:mi_version` switch on the linker command, or similarly, `#pragma comment(linker, "/include:mi_version")` in some source file). See the `mimalloc-test-override` project for an example on how to use this. 3. 
The `mimalloc-redirect.dll` must be put in the same directory as the main `mimalloc.dll` at runtime (as it is a dependency of that DLL). The redirection DLL ensures that all calls to the C runtime malloc API get redirected to mimalloc functions (which reside in `mimalloc.dll`). 4. Ensure the `mimalloc.dll` comes as early as possible in the import list of the final executable (so it can intercept all potential allocations). You can use `minject -l <exe>` to check this if needed. For best performance on Windows with C++, it is also recommended to override the `new`/`delete` operations (by including [`mimalloc-new-delete.h`](include/mimalloc-new-delete.h) in a single(!) source file in your project). The environment variable `MIMALLOC_DISABLE_REDIRECT=1` can be used to disable dynamic overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc was successfully redirected. For different platforms than x64, you may need a specific [redirection dll](bin). Furthermore, we cannot always re-link an executable or ensure `mimalloc.dll` comes first in the import table. In such cases the [`minject`](bin) tool can be used to patch the executable's import tables. ## Static override On Unix-like systems, you can also statically link with _mimalloc_ to override the standard malloc interface. The recommended way is to link the final program with the _mimalloc_ single object file (`mimalloc.o`). We use an object file instead of a library file as linkers give preference to that over archives to resolve symbols. To ensure that the standard malloc interface resolves to the _mimalloc_ library, link it as the first object file. For example: ``` > gcc -o myprogram mimalloc.o myfile1.c ... ``` Another way to override statically that works on all platforms, is to link statically to mimalloc (as shown in the introduction) and include a header file in each source file that re-defines `malloc` etc. to `mi_malloc`. This is provided by [`mimalloc-override.h`](include/mimalloc-override.h).
This only works reliably though if all sources are under your control or otherwise mixing of pointers from different heaps may occur! # Tools Generally, we recommend using the standard allocator with memory tracking tools, but mimalloc can also be built to support the [address sanitizer][asan] or the excellent [Valgrind] tool. Moreover, it can be built to support Windows event tracing ([ETW]). This has a small performance overhead but does allow detecting memory leaks and byte-precise buffer overflows directly on final executables. See also the `test/test-wrong.c` file to test with various tools. ## Valgrind To build with [valgrind] support, use the `MI_TRACK_VALGRIND=ON` cmake option: ``` > cmake ../.. -DMI_TRACK_VALGRIND=ON ``` This can also be combined with secure mode or debug mode. You can then run your programs directly under valgrind: ``` > valgrind <myprogram> ``` If you rely on overriding `malloc`/`free` by mimalloc (instead of using the `mi_malloc`/`mi_free` API directly), you also need to tell `valgrind` to not intercept those calls itself, and use: ``` > MIMALLOC_SHOW_STATS=1 valgrind --soname-synonyms=somalloc=*mimalloc* -- <myprogram> ``` By setting the `MIMALLOC_SHOW_STATS` environment variable you can check that mimalloc is indeed used and not the standard allocator. Even though the [Valgrind option][valgrind-soname] is called `--soname-synonyms`, this also works when overriding with a static library or object file. To dynamically override mimalloc using `LD_PRELOAD` together with `valgrind`, use: ``` > valgrind --trace-children=yes --soname-synonyms=somalloc=*mimalloc* /usr/bin/env LD_PRELOAD=/usr/lib/libmimalloc.so -- <myprogram> ``` See also the `test/test-wrong.c` file to test with `valgrind`. Valgrind support is in its initial development -- please report any issues.
[Valgrind]: https://valgrind.org/ [valgrind-soname]: https://valgrind.org/docs/manual/manual-core.html#opt.soname-synonyms ## ASAN To build with the address sanitizer, use the `-DMI_TRACK_ASAN=ON` cmake option: ``` > cmake ../.. -DMI_TRACK_ASAN=ON ``` This can also be combined with secure mode or debug mode. You can then run your programs as: ``` > ASAN_OPTIONS=verbosity=1 <myprogram> ``` When you link a program with an address sanitizer build of mimalloc, you should generally compile that program too with the address sanitizer enabled. For example, assuming you build mimalloc in `out/debug`: ``` clang -g -o test-wrong -Iinclude test/test-wrong.c out/debug/libmimalloc-asan-debug.a -lpthread -fsanitize=address -fsanitize-recover=address ``` Since the address sanitizer redirects the standard allocation functions, on some platforms (macOSX for example) it is required to compile mimalloc with `-DMI_OVERRIDE=OFF`. Address sanitizer support is in its initial development -- please report any issues. [asan]: https://github.com/google/sanitizers/wiki/AddressSanitizer ## ETW Event tracing for Windows ([ETW]) provides a high performance way to capture all allocations through mimalloc and analyze them later. To build with ETW support, use the `-DMI_TRACK_ETW=ON` cmake option. You can then capture an allocation trace using the Windows performance recorder (WPR), using the `src/prim/windows/etw-mimalloc.wprp` profile. In an admin prompt, you can use: ``` > wpr -start src\prim\windows\etw-mimalloc.wprp -filemode > <my_mimalloc_program> > wpr -stop <my_mimalloc_program>.etl ``` and then open `<my_mimalloc_program>.etl` in the Windows Performance Analyzer (WPA), or use a tool like [TraceControl] that is specialized for analyzing mimalloc traces.
[ETW]: https://learn.microsoft.com/en-us/windows-hardware/test/wpt/event-tracing-for-windows [TraceControl]: https://github.com/xinglonghe/TraceControl # Performance Last update: 2021-01-30 We tested _mimalloc_ against many other top allocators over a wide range of benchmarks, ranging from various real world programs to synthetic benchmarks that see how the allocator behaves under more extreme circumstances. In our benchmark suite, _mimalloc_ outperforms other leading allocators (_jemalloc_, _tcmalloc_, _Hoard_, etc), and has a similar memory footprint. A nice property is that it does consistently well over the wide range of benchmarks. General memory allocators are interesting as there exists no algorithm that is optimal -- for a given allocator one can usually construct a workload where it does not do so well. The goal is thus to find an allocation strategy that performs well over a wide range of benchmarks without suffering from (too much) underperformance in less common situations. As always, interpret these results with care since some benchmarks test synthetic or uncommon situations that may never apply to your workloads. For example, most allocators do not do well on `xmalloc-testN` but that includes even the best industrial allocators like _jemalloc_ and _tcmalloc_ that are used in some of the world's largest systems (like Chrome or FreeBSD). Also, the benchmarks here do not measure the behaviour on very large and long-running server workloads, or worst-case latencies of allocation. Much work has gone into `mimalloc` to work well on such workloads (for example, to reduce virtual memory fragmentation on long-running services) but such optimizations are not always reflected in the current benchmark suite. We show here only an overview -- for more specific details and further benchmarks we refer to the [technical report](https://www.microsoft.com/en-us/research/publication/mimalloc-free-list-sharding-in-action). 
The benchmark suite is automated and available separately as [mimalloc-bench](https://github.com/daanx/mimalloc-bench). ## Benchmark Results on a 16-core AMD 5950x (Zen3) Testing on the 16-core AMD 5950x processor at 3.4GHz (4.9GHz boost), with 32GiB memory at 3600MHz, running Ubuntu 20.04 with glibc 2.31 and GCC 9.3.0. We measure three versions of _mimalloc_: the main version `mi` (tag:v1.7.0), the new v2.0 beta version as `xmi` (tag:v2.0.0), and the main version in secure mode as `smi` (tag:v1.7.0). The other allocators are Google's [_tcmalloc_](https://github.com/gperftools/gperftools) (`tc`, tag:gperftools-2.8.1) used in Chrome, Facebook's [_jemalloc_](https://github.com/jemalloc/jemalloc) (`je`, tag:5.2.1) by Jason Evans used in Firefox and FreeBSD, the Intel thread building blocks [allocator](https://github.com/intel/tbb) (`tbb`, tag:v2020.3), [rpmalloc](https://github.com/mjansson/rpmalloc) (`rp`,tag:1.4.1) by Mattias Jansson, the original scalable [_Hoard_](https://github.com/emeryberger/Hoard) (git:d880f72) allocator by Emery Berger \[1], the memory compacting [_Mesh_](https://github.com/plasma-umass/Mesh) (git:67ff31a) allocator by Bobby Powers _et al_ \[8], and finally the default system allocator (`glibc`, 2.31) (based on _PtMalloc2_). Any benchmarks ending in `N` run on all 32 logical cores in parallel. Results are averaged over 10 runs and reported relative to mimalloc (where 1.2 means it took 1.2× longer to run). The legend also contains the _overall relative score_ between the allocators where 100 points is the maximum if an allocator is fastest on all benchmarks. The single threaded _cfrac_ benchmark by Dave Barrett is an implementation of continued fraction factorization which uses many small short-lived allocations. All allocators do well on such common usage, where _mimalloc_ is just a tad faster than _tcmalloc_ and _jemalloc_.
The _leanN_ program is interesting as a large realistic and concurrent workload of the [Lean](https://github.com/leanprover/lean) theorem prover compiling its own standard library, and there is a 13% speedup over _tcmalloc_. This is quite significant: if Lean spends 20% of its time in the allocator that means that _mimalloc_ is 1.6× faster than _tcmalloc_ here. (This is surprising as that is not measured in a pure allocation benchmark like _alloc-test_. We conjecture that we see this outsized improvement here because _mimalloc_ has better locality in the allocation which improves performance for the *other* computations in a program as well). The single threaded _redis_ benchmark again shows that most allocators do well on such workloads. The _larsonN_ server benchmark by Larson and Krishnan \[2] allocates and frees between threads. They observed this behavior (which they call _bleeding_) in actual server applications, and the benchmark simulates this. Here, _mimalloc_ is quite a bit faster than _tcmalloc_ and _jemalloc_ probably due to the object migration between different threads. The _mstressN_ workload performs many allocations and re-allocations, and migrates objects between threads (as in _larsonN_). However, it also creates and destroys the _N_ worker threads a few times keeping some objects alive beyond the lifetime of the allocating thread. We observed this behavior in many larger server applications. The [_rptestN_](https://github.com/mjansson/rpmalloc-benchmark) benchmark by Mattias Jansson is an allocator test originally designed for _rpmalloc_, and tries to simulate realistic allocation patterns over multiple threads. Here the differences between allocators become more apparent. The second benchmark set tests specific aspects of the allocators and shows even more extreme differences between them.
The _alloc-test_, by [OLogN Technologies AG](http://ithare.com/testing-memory-allocators-ptmalloc2-tcmalloc-hoard-jemalloc-while-trying-to-simulate-real-world-loads/), is a very allocation intensive benchmark doing millions of allocations in various size classes. The test is scaled such that when an allocator performs almost identically on _alloc-test1_ as on _alloc-testN_ it means that it scales linearly. The _sh6bench_ and _sh8bench_ benchmarks are developed by [MicroQuill](http://www.microquill.com/) as part of SmartHeap. In _sh6bench_ _mimalloc_ does much better than the others (more than 2.5× faster than _jemalloc_). We cannot explain this well but believe it is caused in part by the "reverse" freeing pattern in _sh6bench_. The _sh8bench_ is a variation with object migration between threads; whereas _tcmalloc_ did well on _sh6bench_, the addition of object migration causes it to be 10× slower than before. The _xmalloc-testN_ benchmark by Lever and Boreham \[5] and Christian Eder, simulates an asymmetric workload where some threads only allocate, and others only free -- they observed this pattern in larger server applications. Here we see that the _mimalloc_ technique of having non-contended sharded thread free lists pays off as it outperforms others by a very large margin. Only _rpmalloc_, _tbb_, and _glibc_ also scale well on this benchmark. The _cache-scratch_ benchmark is by Emery Berger \[1], and was introduced with the Hoard allocator to test for _passive-false_ sharing of cache lines. With a single thread they all perform the same, but when running with multiple threads the potential allocator induced false sharing of the cache lines can cause large run-time differences. Crundal \[6] describes in detail why the false cache line sharing occurs in the _tcmalloc_ design, and also discusses how this can be avoided with some small implementation changes.
Only the _tbb_, _rpmalloc_ and _mesh_ allocators also avoid the cache line sharing completely, while _Hoard_ and _glibc_ seem to mitigate the effects. Kukanov and Voss \[7] describe in detail how the design of _tbb_ avoids the false cache line sharing. ## On a 36-core Intel Xeon For completeness, here are the results on a big Amazon [c5.18xlarge](https://aws.amazon.com/ec2/instance-types/#Compute_Optimized) instance consisting of a 2×18-core Intel Xeon (Cascade Lake) at 3.4GHz (boost 3.5GHz) with 144GiB ECC memory, running Ubuntu 20.04 with glibc 2.31, GCC 9.3.0, and Clang 10.0.0. This time, the mimalloc allocators (mi, xmi, and smi) were compiled with the Clang compiler instead of GCC. The results are similar to the AMD results but it is interesting to see the differences in the _larsonN_, _mstressN_, and _xmalloc-testN_ benchmarks. ## Peak Working Set The following figure shows the peak working set (rss) of the allocators on the benchmarks (on the c5.18xlarge instance). Note that the _xmalloc-testN_ memory usage should be disregarded as it allocates more the faster the program runs. Similarly, memory usage of _larsonN_, _mstressN_, _rptestN_ and _sh8bench_ can vary depending on scheduling and speed. Nevertheless, we hope to improve the memory usage on _mstressN_ and _rptestN_ (just as _cfrac_, _larsonN_ and _sh8bench_ have a small working set which skews the results). # References - \[1] Emery D. Berger, Kathryn S. McKinley, Robert D. Blumofe, and Paul R. Wilson. _Hoard: A Scalable Memory Allocator for Multithreaded Applications_ the Ninth International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS-IX). Cambridge, MA, November 2000. [pdf](http://www.cs.utexas.edu/users/mckinley/papers/asplos-2000.pdf) - \[2] P. Larson and M. Krishnan. _Memory allocation for long-running server applications_. In ISMM, Vancouver, B.C., Canada, 1998. 
[pdf](http://citeseer.ist.psu.edu/viewdoc/download?doi=10.1.1.45.1947&rep=rep1&type=pdf) - \[3] D. Grunwald, B. Zorn, and R. Henderson. _Improving the cache locality of memory allocation_. In R. Cartwright, editor, Proceedings of the Conference on Programming Language Design and Implementation, pages 177–186, New York, NY, USA, June 1993. [pdf](http://citeseer.ist.psu.edu/viewdoc/download?doi=10.1.1.43.6621&rep=rep1&type=pdf) - \[4] J. Barnes and P. Hut. _A hierarchical O(n*log(n)) force-calculation algorithm_. Nature, 324:446-449, 1986. - \[5] C. Lever, and D. Boreham. _Malloc() Performance in a Multithreaded Linux Environment._ In USENIX Annual Technical Conference, Freenix Session. San Diego, CA. Jun. 2000. Available at - \[6] Timothy Crundal. _Reducing Active-False Sharing in TCMalloc_. 2016. CS16S1 project at the Australian National University. [pdf](http://courses.cecs.anu.edu.au/courses/CSPROJECTS/16S1/Reports/Timothy_Crundal_Report.pdf) - \[7] Alexey Kukanov, and Michael J Voss. _The Foundations for Scalable Multi-Core Software in Intel Threading Building Blocks._ Intel Technology Journal 11 (4). 2007 - \[8] Bobby Powers, David Tench, Emery D. Berger, and Andrew McGregor. _Mesh: Compacting Memory Management for C/C++_ In Proceedings of the 40th ACM SIGPLAN Conference on Programming Language Design and Implementation (PLDI'19), June 2019, pages 333-–346. # Contributing This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.microsoft.com. When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA. 
# Older Release Notes * 2021-11-14, `v1.7.3`, `v2.0.3` (beta): improved WASM support, improved macOS support and performance (including M1), improved performance for v2 for large objects, Python integration improvements, more standard installation directories, various small fixes. * 2021-06-17, `v1.7.2`, `v2.0.2` (beta): support M1, better installation layout on Linux, fix thread_id on Android, prefer 2-6TiB area for aligned allocation to work better on pre-windows 8, various small fixes. * 2021-04-06, `v1.7.1`, `v2.0.1` (beta): fix bug in arena allocation for huge pages, improved aslr on large allocations, initial M1 support (still experimental). * 2021-01-31, `v2.0.0`: beta release 2.0: new slice algorithm for managing internal mimalloc pages. * 2021-01-31, `v1.7.0`: stable release 1.7: support explicit user provided memory regions, more precise statistics, improve macOS overriding, initial support for Apple M1, improved DragonFly support, faster memcpy on Windows, various small fixes. * 2020-09-24, `v1.6.7`: stable release 1.6: using standard C atomics, passing tsan testing, improved handling of failing to commit on Windows, add [`mi_process_info`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc.h#L156) api call. * 2020-08-06, `v1.6.4`: stable release 1.6: improved error recovery in low-memory situations, support for IllumOS and Haiku, NUMA support for Vista/XP, improved NUMA detection for AMD Ryzen, ubsan support. * 2020-05-05, `v1.6.3`: stable release 1.6: improved behavior in out-of-memory situations, improved malloc zones on macOS, build PIC static libraries by default, add option to abort on out-of-memory, line buffered statistics. * 2020-04-20, `v1.6.2`: stable release 1.6: fix compilation on Android, MingW, Raspberry, and Conda, stability fix for Windows 7, fix multiple mimalloc instances in one executable, fix `strnlen` overload, fix aligned debug padding. 
* 2020-02-17, `v1.6.1`: stable release 1.6: minor updates (build with clang-cl, fix alignment issue for small objects). * 2020-02-09, `v1.6.0`: stable release 1.6: fixed potential memory leak, improved overriding and thread local support on FreeBSD, NetBSD, DragonFly, and macOSX. New byte-precise heap block overflow detection in debug mode (besides the double-free detection and free-list corruption detection). Add `nodiscard` attribute to most allocation functions. Enable `MIMALLOC_PAGE_RESET` by default. New reclamation strategy for abandoned heap pages for better memory footprint. * 2020-02-09, `v1.5.0`: stable release 1.5: improved free performance, small bug fixes. * 2020-01-22, `v1.4.0`: stable release 1.4: improved performance for delayed OS page reset, more eager concurrent free, addition of STL allocator, fixed potential memory leak. * 2020-01-15, `v1.3.0`: stable release 1.3: bug fixes, improved randomness and [stronger free list encoding](https://github.com/microsoft/mimalloc/blob/783e3377f79ee82af43a0793910a9f2d01ac7863/include/mimalloc-internal.h#L396) in secure mode. * 2019-12-22, `v1.2.2`: stable release 1.2: minor updates. * 2019-11-22, `v1.2.0`: stable release 1.2: bug fixes, improved secure mode (free list corruption checks, double free mitigation). Improved dynamic overriding on Windows. * 2019-10-07, `v1.1.0`: stable release 1.1. * 2019-09-01, `v1.0.8`: pre-release 8: more robust windows dynamic overriding, initial huge page support. * 2019-08-10, `v1.0.6`: pre-release 6: various performance improvements. ================================================ FILE: third-party/mimalloc/src/alloc-aligned.c ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2021, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. 
A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/prim.h" // mi_prim_get_default_heap #include // memset // ------------------------------------------------------ // Aligned Allocation // ------------------------------------------------------ static bool mi_malloc_is_naturally_aligned( size_t size, size_t alignment ) { // objects up to `MI_MAX_ALIGN_GUARANTEE` are allocated aligned to their size (see `segment.c:_mi_segment_page_start`). mi_assert_internal(_mi_is_power_of_two(alignment) && (alignment > 0)); if (alignment > size) return false; if (alignment <= MI_MAX_ALIGN_SIZE) return true; const size_t bsize = mi_good_size(size); return (bsize <= MI_MAX_ALIGN_GUARANTEE && (bsize & (alignment-1)) == 0); } #if MI_GUARDED static mi_decl_restrict void* mi_heap_malloc_guarded_aligned(mi_heap_t* heap, size_t size, size_t alignment, bool zero) mi_attr_noexcept { // use over allocation for guarded blocksl mi_assert_internal(alignment > 0 && alignment < MI_BLOCK_ALIGNMENT_MAX); const size_t oversize = size + alignment - 1; void* base = _mi_heap_malloc_guarded(heap, oversize, zero); void* p = mi_align_up_ptr(base, alignment); mi_track_align(base, p, (uint8_t*)p - (uint8_t*)base, size); mi_assert_internal(mi_usable_size(p) >= size); mi_assert_internal(_mi_is_aligned(p, alignment)); return p; } static void* mi_heap_malloc_zero_no_guarded(mi_heap_t* heap, size_t size, bool zero) { const size_t rate = heap->guarded_sample_rate; // only write if `rate!=0` so we don't write to the constant `_mi_heap_empty` if (rate != 0) { heap->guarded_sample_rate = 0; } void* p = _mi_heap_malloc_zero(heap, size, zero); if (rate != 0) { heap->guarded_sample_rate = rate; } return p; } #else static void* mi_heap_malloc_zero_no_guarded(mi_heap_t* heap, size_t size, bool zero) { return 
_mi_heap_malloc_zero(heap, size, zero); } #endif // Fallback aligned allocation that over-allocates -- split out for better codegen static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept { mi_assert_internal(size <= (MI_MAX_ALLOC_SIZE - MI_PADDING_SIZE)); mi_assert_internal(alignment != 0 && _mi_is_power_of_two(alignment)); void* p; size_t oversize; if mi_unlikely(alignment > MI_BLOCK_ALIGNMENT_MAX) { // use OS allocation for very large alignment and allocate inside a huge page (dedicated segment with 1 page) // This can support alignments >= MI_SEGMENT_SIZE by ensuring the object can be aligned at a point in the // first (and single) page such that the segment info is `MI_SEGMENT_SIZE` bytes before it (so it can be found by aligning the pointer down) if mi_unlikely(offset != 0) { // todo: cannot support offset alignment for very large alignments yet #if MI_DEBUG > 0 _mi_error_message(EOVERFLOW, "aligned allocation with a very large alignment cannot be used with an alignment offset (size %zu, alignment %zu, offset %zu)\n", size, alignment, offset); #endif return NULL; } oversize = (size <= MI_SMALL_SIZE_MAX ? MI_SMALL_SIZE_MAX + 1 /* ensure we use generic malloc path */ : size); // note: no guarded as alignment > 0 p = _mi_heap_malloc_zero_ex(heap, oversize, false, alignment); // the page block size should be large enough to align in the single huge page block // zero afterwards as only the area from the aligned_p may be committed! if (p == NULL) return NULL; } else { // otherwise over-allocate oversize = (size < MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : size) + alignment - 1; // adjust for size <= 16; with size 0 and aligment 64k, we would allocate a 64k block and pointing just beyond that. p = mi_heap_malloc_zero_no_guarded(heap, oversize, zero); if (p == NULL) return NULL; } mi_page_t* page = _mi_ptr_page(p); // .. 
and align within the allocation const uintptr_t align_mask = alignment - 1; // for any x, `(x & align_mask) == (x % alignment)` const uintptr_t poffset = ((uintptr_t)p + offset) & align_mask; const uintptr_t adjust = (poffset == 0 ? 0 : alignment - poffset); mi_assert_internal(adjust < alignment); void* aligned_p = (void*)((uintptr_t)p + adjust); if (aligned_p != p) { mi_page_set_has_aligned(page, true); #if MI_GUARDED // set tag to aligned so mi_usable_size works with guard pages if (adjust >= sizeof(mi_block_t)) { mi_block_t* const block = (mi_block_t*)p; block->next = MI_BLOCK_TAG_ALIGNED; } #endif _mi_padding_shrink(page, (mi_block_t*)p, adjust + size); } // todo: expand padding if overallocated ? mi_assert_internal(mi_page_usable_block_size(page) >= adjust + size); mi_assert_internal(((uintptr_t)aligned_p + offset) % alignment == 0); mi_assert_internal(mi_usable_size(aligned_p)>=size); mi_assert_internal(mi_usable_size(p) == mi_usable_size(aligned_p)+adjust); #if MI_DEBUG > 1 mi_page_t* const apage = _mi_ptr_page(aligned_p); void* unalign_p = _mi_page_ptr_unalign(apage, aligned_p); mi_assert_internal(p == unalign_p); #endif // now zero the block if needed if (alignment > MI_BLOCK_ALIGNMENT_MAX) { // for the tracker, on huge aligned allocations only the memory from the start of the large block is defined mi_track_mem_undefined(aligned_p, size); if (zero) { _mi_memzero_aligned(aligned_p, mi_usable_size(aligned_p)); } } if (p != aligned_p) { mi_track_align(p,aligned_p,adjust,mi_usable_size(aligned_p)); #if MI_GUARDED mi_track_mem_defined(p, sizeof(mi_block_t)); #endif } return aligned_p; } // Generic primitive aligned allocation -- split out for better codegen static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_generic(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept { mi_assert_internal(alignment != 0 && _mi_is_power_of_two(alignment)); // we don't allocate more than 
MI_MAX_ALLOC_SIZE (see ) if mi_unlikely(size > (MI_MAX_ALLOC_SIZE - MI_PADDING_SIZE)) { #if MI_DEBUG > 0 _mi_error_message(EOVERFLOW, "aligned allocation request is too large (size %zu, alignment %zu)\n", size, alignment); #endif return NULL; } // use regular allocation if it is guaranteed to fit the alignment constraints. // this is important to try as the fast path in `mi_heap_malloc_zero_aligned` only works when there exist // a page with the right block size, and if we always use the over-alloc fallback that would never happen. if (offset == 0 && mi_malloc_is_naturally_aligned(size,alignment)) { void* p = mi_heap_malloc_zero_no_guarded(heap, size, zero); mi_assert_internal(p == NULL || ((uintptr_t)p % alignment) == 0); const bool is_aligned_or_null = (((uintptr_t)p) & (alignment-1))==0; if mi_likely(is_aligned_or_null) { return p; } else { // this should never happen if the `mi_malloc_is_naturally_aligned` check is correct.. mi_assert(false); mi_free(p); } } // fall back to over-allocation return mi_heap_malloc_zero_aligned_at_overalloc(heap,size,alignment,offset,zero); } // Primitive aligned allocation static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept { // note: we don't require `size > offset`, we just guarantee that the address at offset is aligned regardless of the allocated size. 
if mi_unlikely(alignment == 0 || !_mi_is_power_of_two(alignment)) { // require power-of-two (see ) #if MI_DEBUG > 0 _mi_error_message(EOVERFLOW, "aligned allocation requires the alignment to be a power-of-two (size %zu, alignment %zu)\n", size, alignment); #endif return NULL; } #if MI_GUARDED if (offset==0 && alignment < MI_BLOCK_ALIGNMENT_MAX && mi_heap_malloc_use_guarded(heap,size)) { return mi_heap_malloc_guarded_aligned(heap, size, alignment, zero); } #endif // try first if there happens to be a small block available with just the right alignment if mi_likely(size <= MI_SMALL_SIZE_MAX && alignment <= size) { const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)` const size_t padsize = size + MI_PADDING_SIZE; mi_page_t* page = _mi_heap_get_free_small_page(heap, padsize); if mi_likely(page->free != NULL) { const bool is_aligned = (((uintptr_t)page->free + offset) & align_mask)==0; if mi_likely(is_aligned) { #if MI_STAT>1 mi_heap_stat_increase(heap, malloc_requested, size); #endif void* p = (zero ? 
_mi_page_malloc_zeroed(heap,page,padsize) : _mi_page_malloc(heap,page,padsize)); // call specific page malloc for better codegen mi_assert_internal(p != NULL); mi_assert_internal(((uintptr_t)p + offset) % alignment == 0); mi_track_malloc(p,size,zero); return p; } } } // fallback to generic aligned allocation return mi_heap_malloc_zero_aligned_at_generic(heap, size, alignment, offset, zero); } // ------------------------------------------------------ // Optimized mi_heap_malloc_aligned / mi_malloc_aligned // ------------------------------------------------------ mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, false); } mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept { return mi_heap_malloc_aligned_at(heap, size, alignment, 0); } // ensure a definition is emitted #if defined(__cplusplus) void* _mi_extern_heap_malloc_aligned = (void*)&mi_heap_malloc_aligned; #endif // ------------------------------------------------------ // Aligned Allocation // ------------------------------------------------------ mi_decl_nodiscard mi_decl_restrict void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, true); } mi_decl_nodiscard mi_decl_restrict void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept { return mi_heap_zalloc_aligned_at(heap, size, alignment, 0); } mi_decl_nodiscard mi_decl_restrict void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { size_t total; if (mi_count_size_overflow(count, size, &total)) return NULL; return mi_heap_zalloc_aligned_at(heap, total, alignment, offset); } 
// Same as `mi_heap_calloc_aligned_at` with an offset of 0.
mi_decl_nodiscard mi_decl_restrict void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept {
  return mi_heap_calloc_aligned_at(heap,count,size,alignment,0);
}

// The functions below forward to their `mi_heap_*` counterparts using the heap
// returned by `mi_prim_get_default_heap()`.

mi_decl_nodiscard mi_decl_restrict void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
  return mi_heap_malloc_aligned_at(mi_prim_get_default_heap(), size, alignment, offset);
}

mi_decl_nodiscard mi_decl_restrict void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
  return mi_heap_malloc_aligned(mi_prim_get_default_heap(), size, alignment);
}

mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
  return mi_heap_zalloc_aligned_at(mi_prim_get_default_heap(), size, alignment, offset);
}

mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
  return mi_heap_zalloc_aligned(mi_prim_get_default_heap(), size, alignment);
}

mi_decl_nodiscard mi_decl_restrict void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
  return mi_heap_calloc_aligned_at(mi_prim_get_default_heap(), count, size, alignment, offset);
}

mi_decl_nodiscard mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept {
  return mi_heap_calloc_aligned(mi_prim_get_default_heap(), count, size, alignment);
}


// ------------------------------------------------------
// Aligned re-allocation
// ------------------------------------------------------

// Re-allocate `p` to `newsize` bytes such that `(result + offset) % alignment == 0`.
//  - alignments up to `sizeof(uintptr_t)` are delegated to `_mi_heap_realloc_zero`;
//  - a NULL `p` becomes a fresh aligned allocation;
//  - `p` is returned unchanged when `newsize` still fits in its usable size, uses
//    at least half of it, and `p` already satisfies the alignment at `offset`;
//  - otherwise a new block is allocated, the contents are copied, and `p` is freed.
// Returns NULL (leaving `p` untouched) when the new allocation fails.
static void* mi_heap_realloc_zero_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset, bool zero) mi_attr_noexcept {
  mi_assert(alignment > 0);
  if (alignment <= sizeof(uintptr_t)) return _mi_heap_realloc_zero(heap,p,newsize,zero);
  if (p == NULL) return mi_heap_malloc_zero_aligned_at(heap,newsize,alignment,offset,zero);
  size_t size = mi_usable_size(p);
  if (newsize <= size && newsize >= (size - (size / 2))
      && (((uintptr_t)p + offset) % alignment) == 0) {
    return p;  // reallocation still fits, is aligned and not more than 50% waste
  }
  else {
    // note: we don't zero allocate upfront so we only zero initialize the expanded part
    void* newp = mi_heap_malloc_aligned_at(heap,newsize,alignment,offset);
    if (newp != NULL) {
      if (zero && newsize > size) {
        // also set last word in the previous allocation to zero to ensure any padding is zero-initialized
        size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0);
        _mi_memzero((uint8_t*)newp + start, newsize - start);
      }
      // copy the smaller of the old usable size and the new size
      _mi_memcpy_aligned(newp, p, (newsize > size ? size : newsize));
      mi_free(p); // only free if successful
    }
    return newp;
  }
}

// Aligned re-allocation without an explicit offset: the offset is derived from
// the current pointer as `p % alignment`, so the new block keeps the same
// residue modulo `alignment` as the old one.
static void* mi_heap_realloc_zero_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, bool zero) mi_attr_noexcept {
  mi_assert(alignment > 0);
  if (alignment <= sizeof(uintptr_t)) return _mi_heap_realloc_zero(heap,p,newsize,zero);
  size_t offset = ((uintptr_t)p % alignment); // use offset of previous allocation (p can be NULL)
  return mi_heap_realloc_zero_aligned_at(heap,p,newsize,alignment,offset,zero);
}

// Public heap-specific variants: `realloc` leaves any expanded part
// uninitialized (`zero == false`), `rezalloc` zero-initializes it
// (`zero == true`), and `recalloc` is `rezalloc` of `newcount * size` bytes
// with an overflow check that returns NULL.

mi_decl_nodiscard void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
  return mi_heap_realloc_zero_aligned_at(heap,p,newsize,alignment,offset,false);
}

mi_decl_nodiscard void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
  return mi_heap_realloc_zero_aligned(heap,p,newsize,alignment,false);
}

mi_decl_nodiscard void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
  return mi_heap_realloc_zero_aligned_at(heap, p, newsize, alignment, offset, true);
}

mi_decl_nodiscard void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
  return mi_heap_realloc_zero_aligned(heap, p, newsize, alignment, true);
}

mi_decl_nodiscard void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
  size_t total;
  if (mi_count_size_overflow(newcount, size, &total)) return NULL;
  return mi_heap_rezalloc_aligned_at(heap, p, total, alignment, offset);
}

mi_decl_nodiscard void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
  size_t total;
  if (mi_count_size_overflow(newcount, size, &total)) return NULL;
  return mi_heap_rezalloc_aligned(heap, p, total, alignment);
}

// Default-heap variants of the aligned re-allocation functions.

mi_decl_nodiscard void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
  return mi_heap_realloc_aligned_at(mi_prim_get_default_heap(), p, newsize, alignment, offset);
}

mi_decl_nodiscard void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
  return mi_heap_realloc_aligned(mi_prim_get_default_heap(), p, newsize, alignment);
}

mi_decl_nodiscard void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
  return mi_heap_rezalloc_aligned_at(mi_prim_get_default_heap(), p, newsize, alignment, offset);
}

mi_decl_nodiscard void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
  return mi_heap_rezalloc_aligned(mi_prim_get_default_heap(), p, newsize, alignment);
}

mi_decl_nodiscard void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
  return mi_heap_recalloc_aligned_at(mi_prim_get_default_heap(), p, newcount, size, alignment, offset);
}

mi_decl_nodiscard void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
  return mi_heap_recalloc_aligned(mi_prim_get_default_heap(), p, newcount, size, alignment);
}

================================================
FILE: third-party/mimalloc/src/alloc-override.c
================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2021, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #if !defined(MI_IN_ALLOC_C) #error "this file should be included from 'alloc.c' (so aliases can work)" #endif #if defined(MI_MALLOC_OVERRIDE) && defined(_WIN32) && !(defined(MI_SHARED_LIB) && defined(_DLL)) #error "It is only possible to override "malloc" on Windows when building as a DLL (and linking the C runtime as a DLL)" #endif #if defined(MI_MALLOC_OVERRIDE) && !(defined(_WIN32)) #if defined(__APPLE__) #include mi_decl_externc void vfree(void* p); mi_decl_externc size_t malloc_size(const void* p); mi_decl_externc size_t malloc_good_size(size_t size); #endif // helper definition for C override of C++ new typedef void* mi_nothrow_t; // ------------------------------------------------------ // Override system malloc // ------------------------------------------------------ #if (defined(__GNUC__) || defined(__clang__)) && !defined(__APPLE__) && !MI_TRACK_ENABLED // gcc, clang: use aliasing to alias the exported function to one of our `mi_` functions #if (defined(__GNUC__) && __GNUC__ >= 9) #pragma GCC diagnostic ignored "-Wattributes" // or we get warnings that nodiscard is ignored on a forward #define MI_FORWARD(fun) __attribute__((alias(#fun), used, visibility("default"), copy(fun))); #else #define MI_FORWARD(fun) __attribute__((alias(#fun), used, visibility("default"))); #endif #define MI_FORWARD1(fun,x) MI_FORWARD(fun) #define MI_FORWARD2(fun,x,y) MI_FORWARD(fun) #define MI_FORWARD3(fun,x,y,z) MI_FORWARD(fun) #define MI_FORWARD0(fun,x) MI_FORWARD(fun) #define MI_FORWARD02(fun,x,y) MI_FORWARD(fun) #else // 
otherwise use forwarding by calling our `mi_` function #define MI_FORWARD1(fun,x) { return fun(x); } #define MI_FORWARD2(fun,x,y) { return fun(x,y); } #define MI_FORWARD3(fun,x,y,z) { return fun(x,y,z); } #define MI_FORWARD0(fun,x) { fun(x); } #define MI_FORWARD02(fun,x,y) { fun(x,y); } #endif #if defined(__APPLE__) && defined(MI_SHARED_LIB_EXPORT) && defined(MI_OSX_INTERPOSE) // define MI_OSX_IS_INTERPOSED as we should not provide forwarding definitions for // functions that are interposed (or the interposing does not work) #define MI_OSX_IS_INTERPOSED mi_decl_externc size_t mi_malloc_size_checked(void *p) { if (!mi_is_in_heap_region(p)) return 0; return mi_usable_size(p); } // use interposing so `DYLD_INSERT_LIBRARIES` works without `DYLD_FORCE_FLAT_NAMESPACE=1` // See: struct mi_interpose_s { const void* replacement; const void* target; }; #define MI_INTERPOSE_FUN(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun } #define MI_INTERPOSE_MI(fun) MI_INTERPOSE_FUN(fun,mi_##fun) __attribute__((used)) static struct mi_interpose_s _mi_interposes[] __attribute__((section("__DATA, __interpose"))) = { MI_INTERPOSE_MI(malloc), MI_INTERPOSE_MI(calloc), MI_INTERPOSE_MI(realloc), MI_INTERPOSE_MI(strdup), #if defined(MAC_OS_X_VERSION_10_7) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7 MI_INTERPOSE_MI(strndup), #endif MI_INTERPOSE_MI(realpath), MI_INTERPOSE_MI(posix_memalign), MI_INTERPOSE_MI(reallocf), MI_INTERPOSE_MI(valloc), MI_INTERPOSE_FUN(malloc_size,mi_malloc_size_checked), MI_INTERPOSE_MI(malloc_good_size), #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15 MI_INTERPOSE_MI(aligned_alloc), #endif #ifdef MI_OSX_ZONE // we interpose malloc_default_zone in alloc-override-osx.c so we can use mi_free safely MI_INTERPOSE_MI(free), MI_INTERPOSE_FUN(vfree,mi_free), #else // sometimes code allocates from default zone but deallocates using plain free :-( (like NxHashResizeToCapacity ) MI_INTERPOSE_FUN(free,mi_cfree), 
// use safe free that checks if pointers are from us MI_INTERPOSE_FUN(vfree,mi_cfree), #endif }; #ifdef __cplusplus extern "C" { #endif void _ZdlPv(void* p); // delete void _ZdaPv(void* p); // delete[] void _ZdlPvm(void* p, size_t n); // delete void _ZdaPvm(void* p, size_t n); // delete[] void* _Znwm(size_t n); // new void* _Znam(size_t n); // new[] void* _ZnwmRKSt9nothrow_t(size_t n, mi_nothrow_t tag); // new nothrow void* _ZnamRKSt9nothrow_t(size_t n, mi_nothrow_t tag); // new[] nothrow #ifdef __cplusplus } #endif __attribute__((used)) static struct mi_interpose_s _mi_cxx_interposes[] __attribute__((section("__DATA, __interpose"))) = { MI_INTERPOSE_FUN(_ZdlPv,mi_free), MI_INTERPOSE_FUN(_ZdaPv,mi_free), MI_INTERPOSE_FUN(_ZdlPvm,mi_free_size), MI_INTERPOSE_FUN(_ZdaPvm,mi_free_size), MI_INTERPOSE_FUN(_Znwm,mi_new), MI_INTERPOSE_FUN(_Znam,mi_new), MI_INTERPOSE_FUN(_ZnwmRKSt9nothrow_t,mi_new_nothrow), MI_INTERPOSE_FUN(_ZnamRKSt9nothrow_t,mi_new_nothrow), }; #elif defined(_MSC_VER) // cannot override malloc unless using a dll. // we just override new/delete which does work in a static library. 
#else // On all other systems forward allocation primitives to our API mi_decl_export void* malloc(size_t size) MI_FORWARD1(mi_malloc, size) mi_decl_export void* calloc(size_t size, size_t n) MI_FORWARD2(mi_calloc, size, n) mi_decl_export void* realloc(void* p, size_t newsize) MI_FORWARD2(mi_realloc, p, newsize) mi_decl_export void free(void* p) MI_FORWARD0(mi_free, p) // In principle we do not need to forward `strdup`/`strndup` but on some systems these do not use `malloc` internally (but a more primitive call) // We only override if `strdup` is not a macro (as on some older libc's, see issue #885) #if !defined(strdup) mi_decl_export char* strdup(const char* str) MI_FORWARD1(mi_strdup, str) #endif #if !defined(strndup) && (!defined(__APPLE__) || (defined(MAC_OS_X_VERSION_10_7) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7)) mi_decl_export char* strndup(const char* str, size_t n) MI_FORWARD2(mi_strndup, str, n) #endif #endif #if (defined(__GNUC__) || defined(__clang__)) && !defined(__APPLE__) #pragma GCC visibility push(default) #endif // ------------------------------------------------------ // Override new/delete // This is not really necessary as they usually call // malloc/free anyway, but it improves performance. // ------------------------------------------------------ #ifdef __cplusplus // ------------------------------------------------------ // With a C++ compiler we override the new/delete operators. 
// see <https://en.cppreference.com/w/cpp/memory/new/operator_new>
// ------------------------------------------------------
#include <new>

#ifndef MI_OSX_IS_INTERPOSED
  void operator delete(void* p) noexcept              MI_FORWARD0(mi_free,p)
  void operator delete[](void* p) noexcept            MI_FORWARD0(mi_free,p)

  void* operator new(std::size_t n) noexcept(false)   MI_FORWARD1(mi_new,n)
  void* operator new[](std::size_t n) noexcept(false) MI_FORWARD1(mi_new,n)

  void* operator new  (std::size_t n, const std::nothrow_t& tag) noexcept { MI_UNUSED(tag); return mi_new_nothrow(n); }
  void* operator new[](std::size_t n, const std::nothrow_t& tag) noexcept { MI_UNUSED(tag); return mi_new_nothrow(n); }

  // sized delete
  #if (__cplusplus >= 201402L || _MSC_VER >= 1916)
  void operator delete  (void* p, std::size_t n) noexcept MI_FORWARD02(mi_free_size,p,n)
  void operator delete[](void* p, std::size_t n) noexcept MI_FORWARD02(mi_free_size,p,n)
  #endif
#endif

// aligned new/delete: the std::align_val_t argument is converted to size_t for the mi_ API
#if (__cplusplus > 201402L && defined(__cpp_aligned_new)) && (!defined(__GNUC__) || (__GNUC__ > 5))
  void operator delete  (void* p, std::align_val_t al) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
  void operator delete[](void* p, std::align_val_t al) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
  void operator delete  (void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast<size_t>(al)); }
  void operator delete[](void* p, std::size_t n, std::align_val_t al) noexcept { mi_free_size_aligned(p, n, static_cast<size_t>(al)); }
  void operator delete  (void* p, std::align_val_t al, const std::nothrow_t&) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }
  void operator delete[](void* p, std::align_val_t al, const std::nothrow_t&) noexcept { mi_free_aligned(p, static_cast<size_t>(al)); }

  void* operator new( std::size_t n, std::align_val_t al)   noexcept(false) { return mi_new_aligned(n, static_cast<size_t>(al)); }
  void* operator new[]( std::size_t n, std::align_val_t al) noexcept(false) { return mi_new_aligned(n, static_cast<size_t>(al)); }
  void* operator new  (std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept { return mi_new_aligned_nothrow(n, static_cast<size_t>(al)); }
  void* operator new[](std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept { return mi_new_aligned_nothrow(n, static_cast<size_t>(al)); }
#endif

#elif (defined(__GNUC__) || defined(__clang__))
// ------------------------------------------------------
// Override by defining the mangled C++ names of the operators (as
// used by GCC and CLang).
// See <https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling>
// ------------------------------------------------------

void _ZdlPv(void* p)            MI_FORWARD0(mi_free,p) // delete
void _ZdaPv(void* p)            MI_FORWARD0(mi_free,p) // delete[]
void _ZdlPvm(void* p, size_t n) MI_FORWARD02(mi_free_size,p,n)
void _ZdaPvm(void* p, size_t n) MI_FORWARD02(mi_free_size,p,n)

void _ZdlPvSt11align_val_t(void* p, size_t al)            { mi_free_aligned(p,al); }
void _ZdaPvSt11align_val_t(void* p, size_t al)            { mi_free_aligned(p,al); }
void _ZdlPvmSt11align_val_t(void* p, size_t n, size_t al) { mi_free_size_aligned(p,n,al); }
void _ZdaPvmSt11align_val_t(void* p, size_t n, size_t al) { mi_free_size_aligned(p,n,al); }

void _ZdlPvRKSt9nothrow_t(void* p, mi_nothrow_t tag)      { MI_UNUSED(tag); mi_free(p); }  // operator delete(void*, std::nothrow_t const&)
void _ZdaPvRKSt9nothrow_t(void* p, mi_nothrow_t tag)      { MI_UNUSED(tag); mi_free(p); }  // operator delete[](void*, std::nothrow_t const&)
void _ZdlPvSt11align_val_tRKSt9nothrow_t(void* p, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); mi_free_aligned(p,al); } // operator delete(void*, std::align_val_t, std::nothrow_t const&)
void _ZdaPvSt11align_val_tRKSt9nothrow_t(void* p, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); mi_free_aligned(p,al); } // operator delete[](void*, std::align_val_t, std::nothrow_t const&)

// The mangled name of operator new encodes the type of size_t, so it differs
// by pointer width (`m` on 64-bit, `j` on 32-bit).
#if (MI_INTPTR_SIZE==8)
  void* _Znwm(size_t n)                             MI_FORWARD1(mi_new,n)  // new 64-bit
  void* _Znam(size_t n)                             MI_FORWARD1(mi_new,n)  // new[] 64-bit
  void* _ZnwmRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); }
  void* _ZnamRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); }
  void* _ZnwmSt11align_val_t(size_t n, size_t al)   MI_FORWARD2(mi_new_aligned, n, al)
  void* _ZnamSt11align_val_t(size_t n, size_t al)   MI_FORWARD2(mi_new_aligned, n, al)
  void* _ZnwmSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); }
  void* _ZnamSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); }
#elif (MI_INTPTR_SIZE==4)
  void* _Znwj(size_t n)                             MI_FORWARD1(mi_new,n)  // new 32-bit
  void* _Znaj(size_t n)                             MI_FORWARD1(mi_new,n)  // new[] 32-bit
  void* _ZnwjRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); }
  void* _ZnajRKSt9nothrow_t(size_t n, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_nothrow(n); }
  void* _ZnwjSt11align_val_t(size_t n, size_t al)   MI_FORWARD2(mi_new_aligned, n, al)
  void* _ZnajSt11align_val_t(size_t n, size_t al)   MI_FORWARD2(mi_new_aligned, n, al)
  void* _ZnwjSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); }
  void* _ZnajSt11align_val_tRKSt9nothrow_t(size_t n, size_t al, mi_nothrow_t tag) { MI_UNUSED(tag); return mi_new_aligned_nothrow(n,al); }
#else
  #error "define overloads for new/delete for this platform (just for performance, can be skipped)"
#endif
#endif // __cplusplus

// ------------------------------------------------------
// Further Posix & Unix functions definitions
// ------------------------------------------------------

#ifdef __cplusplus
extern "C" {
#endif

#ifndef MI_OSX_IS_INTERPOSED
  // Forward Posix/Unix calls as well
  void*  reallocf(void* p, size_t newsize) MI_FORWARD2(mi_reallocf,p,newsize)
  size_t malloc_size(const void* p)        MI_FORWARD1(mi_usable_size,p)
  #if !defined(__ANDROID__) && !defined(__FreeBSD__) && !defined(__DragonFly__)
  size_t malloc_usable_size(void *p)       MI_FORWARD1(mi_usable_size,p)
  #else
  size_t malloc_usable_size(const void *p) MI_FORWARD1(mi_usable_size,p)
  #endif

  // No forwarding here due to aliasing/name mangling issues
  void*  valloc(size_t size)               { return mi_valloc(size); }
  void   vfree(void* p)                    { mi_free(p); }
  size_t malloc_good_size(size_t size)     { return mi_malloc_good_size(size); }
  int    posix_memalign(void** p, size_t alignment, size_t size) { return mi_posix_memalign(p, alignment, size); }

  // `aligned_alloc` is only available when __USE_ISOC11 is defined.
  // Note: it seems __USE_ISOC11 is not defined in musl (and perhaps other libc's) so we only check
  // for it if using glibc.
  // Note: Conda has a custom glibc where `aligned_alloc` is declared `static inline` and we cannot
  // override it, but both _ISOC11_SOURCE and __USE_ISOC11 are undefined in Conda GCC7 or GCC9.
  // Fortunately, in the case where `aligned_alloc` is declared as `static inline` it
  // uses internally `memalign`, `posix_memalign`, or `_aligned_malloc` so we can avoid overriding it ourselves.
  #if !defined(__GLIBC__) || __USE_ISOC11
  void* aligned_alloc(size_t alignment, size_t size) { return mi_aligned_alloc(alignment, size); }
  #endif
#endif

// no forwarding here due to aliasing/name mangling issues
void  cfree(void* p)                                    { mi_free(p); }
void* pvalloc(size_t size)                              { return mi_pvalloc(size); }
void* memalign(size_t alignment, size_t size)           { return mi_memalign(alignment, size); }
// NOTE(review): this takes (alignment, size) in that order and passes them on
// unchanged; callers assuming a (size, alignment) order would be swapped -- confirm intent.
void* _aligned_malloc(size_t alignment, size_t size)    { return mi_aligned_alloc(alignment, size); }
void* reallocarray(void* p, size_t count, size_t size)  { return mi_reallocarray(p, count, size); }
// some systems define reallocarr so mark it as a weak symbol (#751)
mi_decl_weak int reallocarr(void* p, size_t count, size_t size)    { return mi_reallocarr(p, count, size); }

#if defined(__wasi__)
// forward __libc interface (see PR #667)
void* __libc_malloc(size_t size)                      MI_FORWARD1(mi_malloc, size)
void* __libc_calloc(size_t count, size_t size)        MI_FORWARD2(mi_calloc, count, size)
void* __libc_realloc(void* p, size_t size)            MI_FORWARD2(mi_realloc, p, size)
void __libc_free(void* p) MI_FORWARD0(mi_free, p) void* __libc_memalign(size_t alignment, size_t size) { return mi_memalign(alignment, size); } #elif defined(__linux__) // forward __libc interface (needed for glibc-based and musl-based Linux distributions) void* __libc_malloc(size_t size) MI_FORWARD1(mi_malloc,size) void* __libc_calloc(size_t count, size_t size) MI_FORWARD2(mi_calloc,count,size) void* __libc_realloc(void* p, size_t size) MI_FORWARD2(mi_realloc,p,size) void __libc_free(void* p) MI_FORWARD0(mi_free,p) void __libc_cfree(void* p) MI_FORWARD0(mi_free,p) void* __libc_valloc(size_t size) { return mi_valloc(size); } void* __libc_pvalloc(size_t size) { return mi_pvalloc(size); } void* __libc_memalign(size_t alignment, size_t size) { return mi_memalign(alignment,size); } int __posix_memalign(void** p, size_t alignment, size_t size) { return mi_posix_memalign(p,alignment,size); } #endif #ifdef __cplusplus } #endif #if (defined(__GNUC__) || defined(__clang__)) && !defined(__APPLE__) #pragma GCC visibility pop #endif #endif // MI_MALLOC_OVERRIDE && !_WIN32 ================================================ FILE: third-party/mimalloc/src/alloc-posix.c ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2021, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ // ------------------------------------------------------------------------ // mi prefixed publi definitions of various Posix, Unix, and C++ functions // for convenience and used when overriding these functions. 
// ------------------------------------------------------------------------ #include "mimalloc.h" #include "mimalloc/internal.h" // ------------------------------------------------------ // Posix & Unix functions definitions // ------------------------------------------------------ #include #include // memset #include // getenv #ifdef _MSC_VER #pragma warning(disable:4996) // getenv _wgetenv #endif #ifndef EINVAL #define EINVAL 22 #endif #ifndef ENOMEM #define ENOMEM 12 #endif mi_decl_nodiscard size_t mi_malloc_size(const void* p) mi_attr_noexcept { // if (!mi_is_in_heap_region(p)) return 0; return mi_usable_size(p); } mi_decl_nodiscard size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept { // if (!mi_is_in_heap_region(p)) return 0; return mi_usable_size(p); } mi_decl_nodiscard size_t mi_malloc_good_size(size_t size) mi_attr_noexcept { return mi_good_size(size); } void mi_cfree(void* p) mi_attr_noexcept { if (mi_is_in_heap_region(p)) { mi_free(p); } } int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept { // Note: The spec dictates we should not modify `*p` on an error. 
(issue#27) // if (p == NULL) return EINVAL; if ((alignment % sizeof(void*)) != 0) return EINVAL; // natural alignment // it is also required that alignment is a power of 2 and > 0; this is checked in `mi_malloc_aligned` if (alignment==0 || !_mi_is_power_of_two(alignment)) return EINVAL; // not a power of 2 void* q = mi_malloc_aligned(size, alignment); if (q==NULL && size != 0) return ENOMEM; mi_assert_internal(((uintptr_t)q % alignment) == 0); *p = q; return 0; } mi_decl_nodiscard mi_decl_restrict void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept { void* p = mi_malloc_aligned(size, alignment); mi_assert_internal(((uintptr_t)p % alignment) == 0); return p; } mi_decl_nodiscard mi_decl_restrict void* mi_valloc(size_t size) mi_attr_noexcept { return mi_memalign( _mi_os_page_size(), size ); } mi_decl_nodiscard mi_decl_restrict void* mi_pvalloc(size_t size) mi_attr_noexcept { size_t psize = _mi_os_page_size(); if (size >= SIZE_MAX - psize) return NULL; // overflow size_t asize = _mi_align_up(size, psize); return mi_malloc_aligned(asize, psize); } mi_decl_nodiscard mi_decl_restrict void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept { // C11 requires the size to be an integral multiple of the alignment, see . // unfortunately, it turns out quite some programs pass a size that is not an integral multiple so skip this check.. 
/* if mi_unlikely((size & (alignment - 1)) != 0) { // C11 requires alignment>0 && integral multiple, see #if MI_DEBUG > 0 _mi_error_message(EOVERFLOW, "(mi_)aligned_alloc requires the size to be an integral multiple of the alignment (size %zu, alignment %zu)\n", size, alignment); #endif return NULL; } */ // C11 also requires alignment to be a power-of-two (and > 0) which is checked in mi_malloc_aligned void* p = mi_malloc_aligned(size, alignment); mi_assert_internal(((uintptr_t)p % alignment) == 0); return p; } mi_decl_nodiscard void* mi_reallocarray( void* p, size_t count, size_t size ) mi_attr_noexcept { // BSD void* newp = mi_reallocn(p,count,size); if (newp==NULL) { errno = ENOMEM; } return newp; } mi_decl_nodiscard int mi_reallocarr( void* p, size_t count, size_t size ) mi_attr_noexcept { // NetBSD mi_assert(p != NULL); if (p == NULL) { errno = EINVAL; return EINVAL; } void** op = (void**)p; void* newp = mi_reallocarray(*op, count, size); if mi_unlikely(newp == NULL) { return errno; } *op = newp; return 0; } void* mi__expand(void* p, size_t newsize) mi_attr_noexcept { // Microsoft void* res = mi_expand(p, newsize); if (res == NULL) { errno = ENOMEM; } return res; } mi_decl_nodiscard mi_decl_restrict unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept { if (s==NULL) return NULL; size_t len; for(len = 0; s[len] != 0; len++) { } size_t size = (len+1)*sizeof(unsigned short); unsigned short* p = (unsigned short*)mi_malloc(size); if (p != NULL) { _mi_memcpy(p,s,size); } return p; } mi_decl_nodiscard mi_decl_restrict unsigned char* mi_mbsdup(const unsigned char* s) mi_attr_noexcept { return (unsigned char*)mi_strdup((const char*)s); } int mi_dupenv_s(char** buf, size_t* size, const char* name) mi_attr_noexcept { if (buf==NULL || name==NULL) return EINVAL; if (size != NULL) *size = 0; char* p = getenv(name); // mscver warning 4996 if (p==NULL) { *buf = NULL; } else { *buf = mi_strdup(p); if (*buf==NULL) return ENOMEM; if (size != NULL) *size = 
_mi_strlen(p); } return 0; } int mi_wdupenv_s(unsigned short** buf, size_t* size, const unsigned short* name) mi_attr_noexcept { if (buf==NULL || name==NULL) return EINVAL; if (size != NULL) *size = 0; #if !defined(_WIN32) || (defined(WINAPI_FAMILY) && (WINAPI_FAMILY != WINAPI_FAMILY_DESKTOP_APP)) // not supported *buf = NULL; return EINVAL; #else unsigned short* p = (unsigned short*)_wgetenv((const wchar_t*)name); // msvc warning 4996 if (p==NULL) { *buf = NULL; } else { *buf = mi_wcsdup(p); if (*buf==NULL) return ENOMEM; if (size != NULL) *size = wcslen((const wchar_t*)p); } return 0; #endif } mi_decl_nodiscard void* mi_aligned_offset_recalloc(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { // Microsoft return mi_recalloc_aligned_at(p, newcount, size, alignment, offset); } mi_decl_nodiscard void* mi_aligned_recalloc(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { // Microsoft return mi_recalloc_aligned(p, newcount, size, alignment); } ================================================ FILE: third-party/mimalloc/src/alloc.c ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. 
-----------------------------------------------------------------------------*/ #ifndef _DEFAULT_SOURCE #define _DEFAULT_SOURCE // for realpath() on Linux #endif #include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/atomic.h" #include "mimalloc/prim.h" // _mi_prim_thread_id() #include // memset, strlen (for mi_strdup) #include // malloc, abort #define MI_IN_ALLOC_C #include "alloc-override.c" #include "free.c" #undef MI_IN_ALLOC_C // ------------------------------------------------------ // Allocation // ------------------------------------------------------ // Fast allocation in a page: just pop from the free list. // Fall back to generic allocation only if the list is empty. // Note: in release mode the (inlined) routine is about 7 instructions with a single test. extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept { mi_assert_internal(page->block_size == 0 /* empty heap */ || mi_page_block_size(page) >= size); // check the free list mi_block_t* const block = page->free; if mi_unlikely(block == NULL) { return _mi_malloc_generic(heap, size, zero, 0); } mi_assert_internal(block != NULL && _mi_ptr_page(block) == page); // pop from the free list page->free = mi_block_next(page, block); page->used++; mi_assert_internal(page->free == NULL || _mi_ptr_page(page->free) == page); mi_assert_internal(page->block_size < MI_MAX_ALIGN_SIZE || _mi_is_aligned(block, MI_MAX_ALIGN_SIZE)); #if MI_DEBUG>3 if (page->free_is_zero && size > sizeof(*block)) { mi_assert_expensive(mi_mem_is_zero(block+1,size - sizeof(*block))); } #endif // allow use of the block internally // note: when tracking we need to avoid ever touching the MI_PADDING since // that is tracked by valgrind etc. as non-accessible (through the red-zone, see `mimalloc/track.h`) mi_track_mem_undefined(block, mi_page_usable_block_size(page)); // zero the block? 
note: we need to zero the full block size (issue #63) if mi_unlikely(zero) { mi_assert_internal(page->block_size != 0); // do not call with zero'ing for huge blocks (see _mi_malloc_generic) mi_assert_internal(!mi_page_is_huge(page)); #if MI_PADDING mi_assert_internal(page->block_size >= MI_PADDING_SIZE); #endif if (page->free_is_zero) { block->next = 0; mi_track_mem_defined(block, page->block_size - MI_PADDING_SIZE); } else { _mi_memzero_aligned(block, page->block_size - MI_PADDING_SIZE); } } #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN if (!zero && !mi_page_is_huge(page)) { memset(block, MI_DEBUG_UNINIT, mi_page_usable_block_size(page)); } #elif (MI_SECURE!=0) if (!zero) { block->next = 0; } // don't leak internal data #endif #if (MI_STAT>0) const size_t bsize = mi_page_usable_block_size(page); if (bsize <= MI_MEDIUM_OBJ_SIZE_MAX) { mi_heap_stat_increase(heap, malloc_normal, bsize); mi_heap_stat_counter_increase(heap, malloc_normal_count, 1); #if (MI_STAT>1) const size_t bin = _mi_bin(bsize); mi_heap_stat_increase(heap, malloc_bins[bin], 1); #endif } #endif #if MI_PADDING // && !MI_TRACK_ENABLED mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + mi_page_usable_block_size(page)); ptrdiff_t delta = ((uint8_t*)padding - (uint8_t*)block - (size - MI_PADDING_SIZE)); #if (MI_DEBUG>=2) mi_assert_internal(delta >= 0 && mi_page_usable_block_size(page) >= (size - MI_PADDING_SIZE + delta)); #endif mi_track_mem_defined(padding,sizeof(mi_padding_t)); // note: re-enable since mi_page_usable_block_size may set noaccess padding->canary = mi_ptr_encode_canary(page,block,page->keys); padding->delta = (uint32_t)(delta); #if MI_PADDING_CHECK if (!mi_page_is_huge(page)) { uint8_t* fill = (uint8_t*)padding - delta; const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? 
MI_MAX_ALIGN_SIZE : delta); // set at most N initial padding bytes for (size_t i = 0; i < maxpad; i++) { fill[i] = MI_DEBUG_PADDING; } } #endif #endif return block; } // extra entries for improved efficiency in `alloc-aligned.c`. extern void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept { return _mi_page_malloc_zero(heap,page,size,false); } extern void* _mi_page_malloc_zeroed(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept { return _mi_page_malloc_zero(heap,page,size,true); } #if MI_GUARDED mi_decl_restrict void* _mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept; #endif static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept { mi_assert(heap != NULL); mi_assert(size <= MI_SMALL_SIZE_MAX); #if MI_DEBUG const uintptr_t tid = _mi_thread_id(); mi_assert(heap->thread_id == 0 || heap->thread_id == tid); // heaps are thread local #endif #if (MI_PADDING || MI_GUARDED) if (size == 0) { size = sizeof(void*); } #endif #if MI_GUARDED if (mi_heap_malloc_use_guarded(heap,size)) { return _mi_heap_malloc_guarded(heap, size, zero); } #endif // get page in constant time, and allocate from it mi_page_t* page = _mi_heap_get_free_small_page(heap, size + MI_PADDING_SIZE); void* const p = _mi_page_malloc_zero(heap, page, size + MI_PADDING_SIZE, zero); mi_track_malloc(p,size,zero); #if MI_STAT>1 if (p != NULL) { if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); } mi_heap_stat_increase(heap, malloc_requested, mi_usable_size(p)); } #endif #if MI_DEBUG>3 if (p != NULL && zero) { mi_assert_expensive(mi_mem_is_zero(p, size)); } #endif return p; } // allocate a small block mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept { return mi_heap_malloc_small_zero(heap, size, false); } mi_decl_nodiscard extern inline mi_decl_restrict void* mi_malloc_small(size_t 
size) mi_attr_noexcept { return mi_heap_malloc_small(mi_prim_get_default_heap(), size); }

// The main allocation function
// Dispatches on `size`: small requests (<= MI_SMALL_SIZE_MAX) take the small-object fast path,
// guarded allocation is used when MI_GUARDED is enabled and `mi_heap_malloc_use_guarded` says so,
// and everything else goes through `_mi_malloc_generic`.
// `zero` requests zero-initialized memory; `huge_alignment` is asserted to be 0 on the small path.
extern inline void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept {
  // fast path for small objects
  if mi_likely(size <= MI_SMALL_SIZE_MAX) {
    mi_assert_internal(huge_alignment == 0);
    return mi_heap_malloc_small_zero(heap, size, zero);
  }
  #if MI_GUARDED
  else if (huge_alignment==0 && mi_heap_malloc_use_guarded(heap,size)) {
    return _mi_heap_malloc_guarded(heap, size, zero);
  }
  #endif
  else {
    // regular allocation
    mi_assert(heap!=NULL);
    mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local
    void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE, zero, huge_alignment); // note: size can overflow but it is detected in malloc_generic
    mi_track_malloc(p,size,zero);
    #if MI_STAT>1
    if (p != NULL) {
      // statistics are accounted on the default heap if `heap` is not initialized
      if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); }
      mi_heap_stat_increase(heap, malloc_requested, mi_usable_size(p));
    }
    #endif
    #if MI_DEBUG>3
    if (p != NULL && zero) {
      mi_assert_expensive(mi_mem_is_zero(p, size));
    }
    #endif
    return p;
  }
}

// As `_mi_heap_malloc_zero_ex` with `huge_alignment` fixed to 0.
extern inline void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept {
  return _mi_heap_malloc_zero_ex(heap, size, zero, 0);
}

// Allocate `size` bytes from `heap` without zero-initialization.
mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept {
  return _mi_heap_malloc_zero(heap, size, false);
}

// Allocate `size` bytes from the default heap.
mi_decl_nodiscard extern inline mi_decl_restrict void* mi_malloc(size_t size) mi_attr_noexcept {
  return mi_heap_malloc(mi_prim_get_default_heap(), size);
}

// zero initialized small block
mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_small(size_t size) mi_attr_noexcept {
  return mi_heap_malloc_small_zero(mi_prim_get_default_heap(), size, true);
}

// Allocate `size` zero-initialized bytes from `heap`.
mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept {
  return _mi_heap_malloc_zero(heap, size, true);
}

// Allocate `size` zero-initialized bytes from the default heap.
mi_decl_nodiscard mi_decl_restrict void* mi_zalloc(size_t size) mi_attr_noexcept {
  return mi_heap_zalloc(mi_prim_get_default_heap(),size);
}

// `calloc` on `heap`: returns NULL if `count*size` overflows (as reported by `mi_count_size_overflow`).
mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
  size_t total;
  if (mi_count_size_overflow(count,size,&total)) return NULL;
  return mi_heap_zalloc(heap,total);
}

// `calloc` on the default heap.
mi_decl_nodiscard mi_decl_restrict void* mi_calloc(size_t count, size_t size) mi_attr_noexcept {
  return mi_heap_calloc(mi_prim_get_default_heap(),count,size);
}

// Uninitialized `calloc`
// Returns NULL if `count*size` overflows.
mi_decl_nodiscard extern mi_decl_restrict void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
  size_t total;
  if (mi_count_size_overflow(count, size, &total)) return NULL;
  return mi_heap_malloc(heap, total);
}

// Uninitialized `calloc` on the default heap.
mi_decl_nodiscard mi_decl_restrict void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept {
  return mi_heap_mallocn(mi_prim_get_default_heap(),count,size);
}

// Expand (or shrink) in place (or fail)
// Returns `p` if `newsize` fits in the current usable size, and NULL otherwise
// (always NULL when MI_PADDING is enabled or when `p` is NULL).
void* mi_expand(void* p, size_t newsize) mi_attr_noexcept {
  #if MI_PADDING
  // we do not shrink/expand with padding enabled
  MI_UNUSED(p); MI_UNUSED(newsize);
  return NULL;
  #else
  if (p == NULL) return NULL;
  const size_t size = _mi_usable_size(p,"mi_expand");
  if (newsize > size) return NULL;
  return p; // it fits
  #endif
}

// Core of the realloc family; `zero` requests that newly added bytes are zero-initialized.
void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept {
  // if p == NULL then behave as malloc.
  // else if size == 0 then reallocate to a zero-sized block (and don't return NULL, just as mi_malloc(0)).
  // (this means that returning NULL always indicates an error, and `p` will not have been freed in that case.)
  const size_t size = _mi_usable_size(p,"mi_realloc"); // also works if p == NULL (with size 0)
  if mi_unlikely(newsize <= size && newsize >= (size / 2) && newsize > 0) { // note: newsize must be > 0 or otherwise we return NULL for realloc(NULL,0)
    mi_assert_internal(p!=NULL);
    // todo: do not track as the usable size is still the same in the free; adjust potential padding?
    // mi_track_resize(p,size,newsize)
    // if (newsize < size) { mi_track_mem_noaccess((uint8_t*)p + newsize, size - newsize); }
    return p; // reallocation still fits and not more than 50% waste
  }
  void* newp = mi_heap_malloc(heap,newsize);
  if mi_likely(newp != NULL) {
    if (zero && newsize > size) {
      // also set last word in the previous allocation to zero to ensure any padding is zero-initialized
      const size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0);
      _mi_memzero((uint8_t*)newp + start, newsize - start);
    }
    else if (newsize == 0) {
      ((uint8_t*)newp)[0] = 0; // work around for applications that expect zero-reallocation to be zero initialized (issue #725)
    }
    if mi_likely(p != NULL) {
      const size_t copysize = (newsize > size ? size : newsize);
      mi_track_mem_defined(p,copysize); // _mi_useable_size may be too large for byte precise memory tracking..
      _mi_memcpy(newp, p, copysize);
      mi_free(p); // only free the original pointer if successful
    }
  }
  return newp;
}

// `realloc` on `heap` (new bytes are not zeroed).
mi_decl_nodiscard void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
  return _mi_heap_realloc_zero(heap, p, newsize, false);
}

// `realloc` to `count*size` bytes; returns NULL (leaving `p` untouched) if the product overflows.
mi_decl_nodiscard void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept {
  size_t total;
  if (mi_count_size_overflow(count, size, &total)) return NULL;
  return mi_heap_realloc(heap, p, total);
}

// Reallocate but free `p` on errors
mi_decl_nodiscard void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
  void* newp = mi_heap_realloc(heap, p, newsize);
  if (newp==NULL && p!=NULL) mi_free(p);
  return newp;
}

// `realloc` on `heap` where newly added bytes are zero-initialized.
mi_decl_nodiscard void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
  return _mi_heap_realloc_zero(heap, p, newsize, true);
}

// Zeroing `realloc` to `count*size` bytes; returns NULL if the product overflows.
mi_decl_nodiscard void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept {
  size_t total;
  if (mi_count_size_overflow(count, size, &total)) return NULL;
  return mi_heap_rezalloc(heap, p, total);
}

// The functions below are the default-heap variants of the `mi_heap_...` functions above.
mi_decl_nodiscard void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept {
  return mi_heap_realloc(mi_prim_get_default_heap(),p,newsize);
}

mi_decl_nodiscard void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept {
  return mi_heap_reallocn(mi_prim_get_default_heap(),p,count,size);
}

// Reallocate but free `p` on errors
mi_decl_nodiscard void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept {
  return mi_heap_reallocf(mi_prim_get_default_heap(),p,newsize);
}

mi_decl_nodiscard void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept {
  return mi_heap_rezalloc(mi_prim_get_default_heap(), p, newsize);
}

mi_decl_nodiscard void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_noexcept {
  return mi_heap_recalloc(mi_prim_get_default_heap(), p, count, size);
}

// ------------------------------------------------------
// strdup,
// strndup, and realpath
// ------------------------------------------------------

// `strdup` using mi_malloc
// Returns NULL if `s` is NULL or if the allocation fails.
mi_decl_nodiscard mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept {
  if (s == NULL) return NULL;
  size_t len = _mi_strlen(s);
  char* t = (char*)mi_heap_malloc(heap,len+1);
  if (t == NULL) return NULL;
  _mi_memcpy(t, s, len);
  t[len] = 0;  // terminate explicitly: only `len` bytes were copied
  return t;
}

// `strdup` on the default heap.
mi_decl_nodiscard mi_decl_restrict char* mi_strdup(const char* s) mi_attr_noexcept {
  return mi_heap_strdup(mi_prim_get_default_heap(), s);
}

// `strndup` using mi_malloc
// Copies at most `n` characters of `s` and always zero-terminates the result.
mi_decl_nodiscard mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept {
  if (s == NULL) return NULL;
  const size_t len = _mi_strnlen(s,n); // len <= n
  char* t = (char*)mi_heap_malloc(heap, len+1);
  if (t == NULL) return NULL;
  _mi_memcpy(t, s, len);
  t[len] = 0;
  return t;
}

// `strndup` on the default heap.
mi_decl_nodiscard mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_attr_noexcept {
  return mi_heap_strndup(mi_prim_get_default_heap(),s,n);
}

#ifndef __wasi__
// `realpath` using mi_malloc
#ifdef _WIN32
#ifndef PATH_MAX
#define PATH_MAX MAX_PATH
#endif

// Windows variant: resolves `fname` with `GetFullPathNameA`.
// If `resolved_name` is non-NULL the result is written there and that pointer is returned;
// otherwise the result is duplicated into memory allocated from `heap`.
// Returns NULL with `errno` set on failure.
mi_decl_nodiscard mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept {
  // todo: use GetFullPathNameW to allow longer file names
  char buf[PATH_MAX];
  DWORD res = GetFullPathNameA(fname, PATH_MAX, (resolved_name == NULL ? buf : resolved_name), NULL);
  if (res == 0) {
    errno = GetLastError(); return NULL;
  }
  else if (res > PATH_MAX) {
    errno = EINVAL; return NULL;
  }
  else if (resolved_name != NULL) {
    return resolved_name;
  }
  else {
    return mi_heap_strndup(heap, buf, PATH_MAX);
  }
}
#else
/*
#include <unistd.h>  // pathconf
static size_t mi_path_max(void) {
  static size_t path_max = 0;
  if (path_max <= 0) {
    long m = pathconf("/",_PC_PATH_MAX);
    if (m <= 0) path_max = 4096;      // guess
    else if (m < 256) path_max = 256; // at least 256
    else path_max = m;
  }
  return path_max;
}
*/

// Non-Windows variant: delegates to the C library `realpath`.
// If `resolved_name` is NULL, the buffer returned by `realpath` is copied into memory
// allocated from `heap` and then released with `mi_cfree`.
char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept {
  if (resolved_name != NULL) {
    return realpath(fname,resolved_name);
  }
  else {
    char* rname = realpath(fname, NULL);
    if (rname == NULL) return NULL;
    char* result = mi_heap_strdup(heap, rname);
    mi_cfree(rname); // use checked free (which may be redirected to our free but that's ok)
    // note: with ASAN realpath is intercepted and mi_cfree may leak the returned pointer :-(
    return result;
  }
  /*
    const size_t n  = mi_path_max();
    char* buf = (char*)mi_malloc(n+1);
    if (buf == NULL) {
      errno = ENOMEM;
      return NULL;
    }
    char* rname  = realpath(fname,buf);
    char* result = mi_heap_strndup(heap,rname,n); // ok if `rname==NULL`
    mi_free(buf);
    return result;
  }
  */
}
#endif

// `realpath` on the default heap.
mi_decl_nodiscard mi_decl_restrict char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept {
  return mi_heap_realpath(mi_prim_get_default_heap(),fname,resolved_name);
}
#endif

/*-------------------------------------------------------
C++ new and new_aligned
The standard requires calling into `get_new_handler` and
throwing the bad_alloc exception on failure. If we compile
with a C++ compiler we can implement this precisely. If we
use a C compiler we cannot throw a `bad_alloc` exception
but we call `abort` instead (i.e. not returning).
-------------------------------------------------------*/

#ifdef __cplusplus
#include <new>   // std::new_handler, std::get_new_handler, std::set_new_handler, std::bad_alloc

// Invoke the installed C++ new-handler after an allocation failure.
// Returns `true` if a handler was installed and has been called (the caller should retry the allocation).
// Returns `false` if no handler is installed; in that case an error message is emitted and,
// when exceptions are enabled and `nothrow` is false, `std::bad_alloc` is thrown instead of returning.
static bool mi_try_new_handler(bool nothrow) {
  #if defined(_MSC_VER) || (__cplusplus >= 201103L)
    std::new_handler h = std::get_new_handler();
  #else
    // pre-C++11 there is no `get_new_handler`: temporarily install a null handler to read
    // the current one (`set_new_handler` requires an argument), and restore it right after.
    std::new_handler h = std::set_new_handler(NULL);
    std::set_new_handler(h);
  #endif
  if (h==NULL) {
    _mi_error_message(ENOMEM, "out of memory in 'new'");
    #if defined(_CPPUNWIND) || defined(__cpp_exceptions)  // exceptions are not always enabled
    if (!nothrow) {
      throw std::bad_alloc();
    }
    #else
    MI_UNUSED(nothrow);
    #endif
    return false;
  }
  else {
    h();
    return true;
  }
}
#else
// Plain C: there is no `std::new_handler` type, use an equivalent function pointer type.
typedef void (*std_new_handler_t)(void);

#if (defined(__GNUC__) || (defined(__clang__) && !defined(_MSC_VER)))  // exclude clang-cl, see issue #631
// Weak definition under the mangled name of `std::get_new_handler`; this fallback returns NULL
// (i.e. "no handler") unless another definition of the symbol is linked in.
std_new_handler_t __attribute__((weak)) _ZSt15get_new_handlerv(void) {
  return NULL;
}
static std_new_handler_t mi_get_new_handler(void) {
  return _ZSt15get_new_handlerv();
}
#else
// note: on windows we could dynamically link to `?get_new_handler@std@@YAP6AXXZXZ`.
// Fallback for other compilers: no new-handler can be looked up, so report "none installed".
static std_new_handler_t mi_get_new_handler() {
  return NULL;
}
#endif

// C variant of the new-handler hook.
// Returns `true` after calling an installed handler (the caller should retry the allocation).
// Without a handler an error message is emitted and the process is aborted unless `nothrow`
// is set, in which case `false` is returned.
static bool mi_try_new_handler(bool nothrow) {
  std_new_handler_t h = mi_get_new_handler();
  if (h==NULL) {
    _mi_error_message(ENOMEM, "out of memory in 'new'");
    if (!nothrow) {
      abort(); // cannot throw in plain C, use abort
    }
    return false;
  }
  else {
    h();
    return true;
  }
}
#endif

// Slow path of `new`: keep invoking the new-handler and retrying the allocation on `heap`
// until the allocation succeeds or `mi_try_new_handler` returns false.
mi_decl_export mi_decl_noinline void* mi_heap_try_new(mi_heap_t* heap, size_t size, bool nothrow ) {
  void* p = NULL;
  while(p == NULL && mi_try_new_handler(nothrow)) {
    p = mi_heap_malloc(heap,size);
  }
  return p;
}

// `mi_heap_try_new` on the default heap.
static mi_decl_noinline void* mi_try_new(size_t size, bool nothrow) {
  return mi_heap_try_new(mi_prim_get_default_heap(), size, nothrow);
}

// `operator new` semantics on `heap`: on failure fall back to the new-handler retry loop.
mi_decl_nodiscard mi_decl_restrict void* mi_heap_alloc_new(mi_heap_t* heap, size_t size) {
  void* p = mi_heap_malloc(heap,size);
  if mi_unlikely(p == NULL) return mi_heap_try_new(heap, size, false);
  return p;
}

// `operator new` semantics on the default heap.
mi_decl_nodiscard mi_decl_restrict void* mi_new(size_t size) {
  return mi_heap_alloc_new(mi_prim_get_default_heap(), size);
}

// `new T[count]` semantics on `heap` with an overflow check on `count*size`.
mi_decl_nodiscard mi_decl_restrict void* mi_heap_alloc_new_n(mi_heap_t* heap, size_t count, size_t size) {
  size_t total;
  if mi_unlikely(mi_count_size_overflow(count, size, &total)) {
    mi_try_new_handler(false); // on overflow we invoke the try_new_handler once to potentially throw std::bad_alloc
    return NULL;
  }
  else {
    return mi_heap_alloc_new(heap,total);
  }
}

// `new T[count]` semantics on the default heap.
mi_decl_nodiscard mi_decl_restrict void* mi_new_n(size_t count, size_t size) {
  return mi_heap_alloc_new_n(mi_prim_get_default_heap(), count, size);
}

// `new (std::nothrow)` semantics: returns NULL instead of throwing/aborting when no handler is installed.
mi_decl_nodiscard mi_decl_restrict void* mi_new_nothrow(size_t size) mi_attr_noexcept {
  void* p = mi_malloc(size);
  if mi_unlikely(p == NULL) return mi_try_new(size, true);
  return p;
}

// Aligned `new`: retry the aligned allocation for as long as a new-handler is installed.
mi_decl_nodiscard mi_decl_restrict void* mi_new_aligned(size_t size, size_t alignment) {
  void* p;
  do {
    p = mi_malloc_aligned(size, alignment);
  }
  while(p == NULL && mi_try_new_handler(false));
  return p;
}

// Aligned `new (std::nothrow)`: as `mi_new_aligned` but passes `nothrow=true` to the handler hook.
mi_decl_nodiscard mi_decl_restrict void* mi_new_aligned_nothrow(size_t size, size_t alignment) mi_attr_noexcept {
  void* p;
  do {
    p = mi_malloc_aligned(size, alignment);
  }
  while(p == NULL && mi_try_new_handler(true));
  return p;
}

// `realloc` with `new` failure semantics (retry via the new-handler).
mi_decl_nodiscard void* mi_new_realloc(void* p, size_t newsize) {
  void* q;
  do {
    q = mi_realloc(p, newsize);
  } while (q == NULL && mi_try_new_handler(false));
  return q;
}

// `mi_new_realloc` to `newcount*size` bytes with an overflow check.
mi_decl_nodiscard void* mi_new_reallocn(void* p, size_t newcount, size_t size) {
  size_t total;
  if mi_unlikely(mi_count_size_overflow(newcount, size, &total)) {
    mi_try_new_handler(false); // on overflow we invoke the try_new_handler once to potentially throw std::bad_alloc
    return NULL;
  }
  else {
    return mi_new_realloc(p, total);
  }
}

#if MI_GUARDED
// We always allocate a guarded allocation at an offset (`mi_page_has_aligned` will be true).
// We then set the first word of the block to `0` for regular offset aligned allocations (in `alloc-aligned.c`)
// and the first word to `~0` for guarded allocations to have a correct `mi_usable_size`
//
// Turns a freshly allocated `block` into a guarded object of `obj_size` bytes: tags the block,
// protects the last OS page of the block as guard page (when possible), and returns the object
// pointer placed in front of the guard page. Returns NULL (after freeing `block`) if the block
// is unexpectedly too small.
static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) {
  // TODO: we can still make padding work by moving it out of the guard page area
  mi_page_t* const page = _mi_ptr_page(block);
  mi_page_set_has_aligned(page, true);
  block->next = MI_BLOCK_TAG_GUARDED;

  // set guard page at the end of the block
  mi_segment_t* const segment = _mi_page_segment(page);
  const size_t block_size = mi_page_block_size(page); // must use `block_size` to match `mi_free_local`
  const size_t os_page_size = _mi_os_page_size();
  mi_assert_internal(block_size >= obj_size + os_page_size + sizeof(mi_block_t));
  if (block_size < obj_size + os_page_size + sizeof(mi_block_t)) {
    // should never happen
    mi_free(block);
    return NULL;
  }
  uint8_t* guard_page = (uint8_t*)block + block_size - os_page_size;
  mi_assert_internal(_mi_is_aligned(guard_page, os_page_size));
  if (segment->allow_decommit && _mi_is_aligned(guard_page, os_page_size)) {
    _mi_os_protect(guard_page, os_page_size);
  }
  else {
    _mi_warning_message("unable to set a guard page behind an object due to pinned memory (large OS pages?) (object %p of size %zu)\n", block, block_size);
  }

  // align pointer just in front of the guard page
  size_t offset = block_size - os_page_size - obj_size;
  mi_assert_internal(offset > sizeof(mi_block_t));
  if (offset > MI_BLOCK_ALIGNMENT_MAX) {
    // give up to place it right in front of the guard page if the offset is too large for unalignment
    offset = MI_BLOCK_ALIGNMENT_MAX;
  }
  void* p = (uint8_t*)block + offset;
  mi_track_align(block, p, offset, obj_size);
  mi_track_mem_defined(block, sizeof(mi_block_t));
  return p;
}

// Allocate a guarded object of `size` bytes from `heap`: the request is enlarged so that the
// block ends in a full OS page, which `mi_block_ptr_set_guarded` then uses as the guard page.
mi_decl_restrict void* _mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept {
  #if defined(MI_PADDING_SIZE)
  mi_assert(MI_PADDING_SIZE==0);
  #endif
  // allocate multiple of page size ending in a guard page
  // ensure minimal alignment requirement?
  const size_t os_page_size = _mi_os_page_size();
  const size_t obj_size = (mi_option_is_enabled(mi_option_guarded_precise) ? size : _mi_align_up(size, MI_MAX_ALIGN_SIZE));
  const size_t bsize = _mi_align_up(_mi_align_up(obj_size, MI_MAX_ALIGN_SIZE) + sizeof(mi_block_t), MI_MAX_ALIGN_SIZE);
  const size_t req_size = _mi_align_up(bsize + os_page_size, os_page_size);
  mi_block_t* const block = (mi_block_t*)_mi_malloc_generic(heap, req_size, zero, 0 /* huge_alignment */);
  if (block==NULL) return NULL;
  void* const p = mi_block_ptr_set_guarded(block, obj_size);

  // stats
  mi_track_malloc(p, size, zero);
  if (p != NULL) {
    if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); }
    #if MI_STAT>1
    mi_heap_stat_increase(heap, malloc_requested, mi_usable_size(p));
    #endif
    _mi_stat_counter_increase(&heap->tld->stats.malloc_guarded_count, 1);
  }
  #if MI_DEBUG>3
  if (p != NULL && zero) {
    mi_assert_expensive(mi_mem_is_zero(p, size));
  }
  #endif
  return p;
}
#endif

// ------------------------------------------------------
// ensure explicit external inline definitions are emitted!
// ------------------------------------------------------ #ifdef __cplusplus void* _mi_externs[] = { (void*)&_mi_page_malloc, (void*)&_mi_page_malloc_zero, (void*)&_mi_heap_malloc_zero, (void*)&_mi_heap_malloc_zero_ex, (void*)&mi_malloc, (void*)&mi_malloc_small, (void*)&mi_zalloc_small, (void*)&mi_heap_malloc, (void*)&mi_heap_zalloc, (void*)&mi_heap_malloc_small, // (void*)&mi_heap_alloc_new, // (void*)&mi_heap_alloc_new_n }; #endif ================================================ FILE: third-party/mimalloc/src/arena-abandon.c ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2019-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #if !defined(MI_IN_ARENA_C) #error "this file should be included from 'arena.c' (so mi_arena_t is visible)" // add includes help an IDE #include "mimalloc.h" #include "mimalloc/internal.h" #include "bitmap.h" #endif // Minimal exports for arena-abandoned. size_t mi_arena_id_index(mi_arena_id_t id); mi_arena_t* mi_arena_from_index(size_t idx); size_t mi_arena_get_count(void); void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex); bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index); /* ----------------------------------------------------------- Abandoned blocks/segments: _mi_arena_segment_clear_abandoned _mi_arena_segment_mark_abandoned This is used to atomically abandon/reclaim segments (and crosses the arena API but it is convenient to have here). Abandoned segments still have live blocks; they get reclaimed when a thread frees a block in it, or when a thread needs a fresh segment. 
  Abandoned segments are atomically marked in the `block_abandoned`
  bitmap of arenas. Any segments allocated outside arenas are put
  in the sub-process `abandoned_os_list`. This list is accessed
  using locks but this should be uncommon and generally uncontended.
  Reclaim and visiting either scan through the `block_abandoned`
  bitmaps of the arena's, or visit the `abandoned_os_list`

  A potentially nicer design is to use arena's for everything
  and perhaps have virtual arena's to map OS allocated memory
  but this would lack the "density" of our current arena's. TBC.
----------------------------------------------------------- */

// reclaim a specific OS abandoned segment; `true` on success.
// sets the thread_id.
// With `take_lock == true` the `abandoned_os_lock` is try-acquired here (giving up with `false`
// when that fails) and the segment's `thread_id` is set to the current thread on success.
// With `take_lock == false` the caller must already hold the lock, and `thread_id` is left untouched.
static bool mi_arena_segment_os_clear_abandoned(mi_segment_t* segment, bool take_lock) {
  mi_assert(segment->memid.memkind != MI_MEM_ARENA);
  // not in an arena, remove from list of abandoned os segments
  mi_subproc_t* const subproc = segment->subproc;
  if (take_lock && !mi_lock_try_acquire(&subproc->abandoned_os_lock)) {
    return false; // failed to acquire the lock, we just give up
  }
  // remove atomically from the abandoned os list (if possible!)
  bool reclaimed = false;
  mi_segment_t* const next = segment->abandoned_os_next;
  mi_segment_t* const prev = segment->abandoned_os_prev;
  // the segment is in the list if it has a neighbour or if it is the (single) list head
  if (next != NULL || prev != NULL || subproc->abandoned_os_list == segment) {
    #if MI_DEBUG>3
    // find ourselves in the abandoned list (and check the count)
    bool found = false;
    size_t count = 0;
    for (mi_segment_t* current = subproc->abandoned_os_list; current != NULL; current = current->abandoned_os_next) {
      if (current == segment) { found = true; }
      count++;
    }
    mi_assert_internal(found);
    mi_assert_internal(count == mi_atomic_load_relaxed(&subproc->abandoned_os_list_count));
    #endif
    // remove (atomically) from the list and reclaim
    if (prev != NULL) { prev->abandoned_os_next = next; }
    else { subproc->abandoned_os_list = next; }
    if (next != NULL) { next->abandoned_os_prev = prev; }
    else { subproc->abandoned_os_list_tail = prev; }
    segment->abandoned_os_next = NULL;
    segment->abandoned_os_prev = NULL;
    mi_atomic_decrement_relaxed(&subproc->abandoned_count);
    mi_atomic_decrement_relaxed(&subproc->abandoned_os_list_count);
    if (take_lock) { // don't reset the thread_id when iterating
      mi_atomic_store_release(&segment->thread_id, _mi_thread_id());
    }
    reclaimed = true;
  }
  if (take_lock) { mi_lock_release(&segment->subproc->abandoned_os_lock); }
  return reclaimed;
}

// reclaim a specific abandoned segment; `true` on success.
// sets the thread_id.
// Segments outside an arena go through the OS abandoned list; arena segments are reclaimed by
// atomically clearing their bit in the arena's `blocks_abandoned` bitmap.
bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment) {
  if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) {
    return mi_arena_segment_os_clear_abandoned(segment, true /* take lock */);
  }
  // arena segment: use the blocks_abandoned bitmap.
  size_t arena_idx;
  size_t bitmap_idx;
  mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx);
  mi_arena_t* arena = mi_arena_from_index(arena_idx);
  mi_assert_internal(arena != NULL);
  // reclaim atomically
  bool was_marked = _mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx);
  if (was_marked) {
    mi_assert_internal(mi_atomic_load_acquire(&segment->thread_id) == 0);
    mi_atomic_decrement_relaxed(&segment->subproc->abandoned_count);
    mi_atomic_store_release(&segment->thread_id, _mi_thread_id());
  }
  // mi_assert_internal(was_marked);
  mi_assert_internal(!was_marked || _mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx));
  //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx));
  return was_marked;
}

// mark a specific OS segment as abandoned
// Appends the segment to the tail of the sub-process `abandoned_os_list` under the
// `abandoned_os_lock` and increments both abandoned counters.
static void mi_arena_segment_os_mark_abandoned(mi_segment_t* segment) {
  mi_assert(segment->memid.memkind != MI_MEM_ARENA);
  // not in an arena; we use a list of abandoned segments
  mi_subproc_t* const subproc = segment->subproc;
  mi_lock(&subproc->abandoned_os_lock) {
    // push on the tail of the list (important for the visitor)
    mi_segment_t* prev = subproc->abandoned_os_list_tail;
    mi_assert_internal(prev == NULL || prev->abandoned_os_next == NULL);
    mi_assert_internal(segment->abandoned_os_prev == NULL);
    mi_assert_internal(segment->abandoned_os_next == NULL);
    if (prev != NULL) { prev->abandoned_os_next = segment; }
    else { subproc->abandoned_os_list = segment; }
    subproc->abandoned_os_list_tail = segment;
    segment->abandoned_os_prev = prev;
    segment->abandoned_os_next = NULL;
    mi_atomic_increment_relaxed(&subproc->abandoned_os_list_count);
    mi_atomic_increment_relaxed(&subproc->abandoned_count);
    // and release the lock
  }
  return;
}

// mark a specific segment as abandoned
// clears the thread_id.
// The `thread_id` is cleared first; segments outside an arena are then pushed on the OS
// abandoned list, and arena segments are marked in the arena's `blocks_abandoned` bitmap.
void _mi_arena_segment_mark_abandoned(mi_segment_t* segment) {
  mi_assert_internal(segment->used == segment->abandoned);
  mi_atomic_store_release(&segment->thread_id, (uintptr_t)0); // mark as abandoned for multi-thread free's
  if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) {
    mi_arena_segment_os_mark_abandoned(segment);
    return;
  }
  // segment is in an arena, mark it in the arena `blocks_abandoned` bitmap
  size_t arena_idx;
  size_t bitmap_idx;
  mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx);
  mi_arena_t* arena = mi_arena_from_index(arena_idx);
  mi_assert_internal(arena != NULL);
  // set abandonment atomically
  // (`subproc` is read up front because the segment is not accessed after the claim below)
  mi_subproc_t* const subproc = segment->subproc; // don't access the segment after setting it abandoned
  const bool was_unmarked = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL);
  if (was_unmarked) { mi_atomic_increment_relaxed(&subproc->abandoned_count); }
  mi_assert_internal(was_unmarked);
  mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx));
}

/* -----------------------------------------------------------
  Iterate through the abandoned blocks/segments using a cursor.
  This is used for reclaiming and abandoned block visiting.
----------------------------------------------------------- */ // start a cursor at a randomized arena void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, bool visit_all, mi_arena_field_cursor_t* current) { mi_assert_internal(heap == NULL || heap->tld->segments.subproc == subproc); current->bitmap_idx = 0; current->subproc = subproc; current->visit_all = visit_all; current->hold_visit_lock = false; const size_t abandoned_count = mi_atomic_load_relaxed(&subproc->abandoned_count); const size_t abandoned_list_count = mi_atomic_load_relaxed(&subproc->abandoned_os_list_count); const size_t max_arena = mi_arena_get_count(); if (heap != NULL && heap->arena_id != _mi_arena_id_none()) { // for a heap that is bound to one arena, only visit that arena current->start = mi_arena_id_index(heap->arena_id); current->end = current->start + 1; current->os_list_count = 0; } else { // otherwise visit all starting at a random location if (abandoned_count > abandoned_list_count && max_arena > 0) { current->start = (heap == NULL || max_arena == 0 ? 
0 : (mi_arena_id_t)(_mi_heap_random_next(heap) % max_arena)); current->end = current->start + max_arena; } else { current->start = 0; current->end = 0; } current->os_list_count = abandoned_list_count; // max entries to visit in the os abandoned list } mi_assert_internal(current->start <= max_arena); } void _mi_arena_field_cursor_done(mi_arena_field_cursor_t* current) { if (current->hold_visit_lock) { mi_lock_release(¤t->subproc->abandoned_os_visit_lock); current->hold_visit_lock = false; } } static mi_segment_t* mi_arena_segment_clear_abandoned_at(mi_arena_t* arena, mi_subproc_t* subproc, mi_bitmap_index_t bitmap_idx) { // try to reclaim an abandoned segment in the arena atomically if (!_mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx)) return NULL; mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); // check that the segment belongs to our sub-process // note: this is the reason we need the `abandoned_visit` lock in the case abandoned visiting is enabled. // without the lock an abandoned visit may otherwise fail to visit all abandoned segments in the sub-process. // for regular reclaim it is fine to miss one sometimes so without abandoned visiting we don't need the `abandoned_visit` lock. 
if (segment->subproc != subproc) { // it is from another sub-process, re-mark it and continue searching const bool was_zero = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); mi_assert_internal(was_zero); MI_UNUSED(was_zero); return NULL; } else { // success, we unabandoned a segment in our sub-process mi_atomic_decrement_relaxed(&subproc->abandoned_count); return segment; } } static mi_segment_t* mi_arena_segment_clear_abandoned_next_field(mi_arena_field_cursor_t* previous) { const size_t max_arena = mi_arena_get_count(); size_t field_idx = mi_bitmap_index_field(previous->bitmap_idx); size_t bit_idx = mi_bitmap_index_bit_in_field(previous->bitmap_idx); // visit arena's (from the previous cursor) for (; previous->start < previous->end; previous->start++, field_idx = 0, bit_idx = 0) { // index wraps around size_t arena_idx = (previous->start >= max_arena ? previous->start % max_arena : previous->start); mi_arena_t* arena = mi_arena_from_index(arena_idx); if (arena != NULL) { bool has_lock = false; // visit the abandoned fields (starting at previous_idx) for (; field_idx < arena->field_count; field_idx++, bit_idx = 0) { size_t field = mi_atomic_load_relaxed(&arena->blocks_abandoned[field_idx]); if mi_unlikely(field != 0) { // skip zero fields quickly // we only take the arena lock if there are actually abandoned segments present if (!has_lock && mi_option_is_enabled(mi_option_visit_abandoned)) { has_lock = (previous->visit_all ? (mi_lock_acquire(&arena->abandoned_visit_lock),true) : mi_lock_try_acquire(&arena->abandoned_visit_lock)); if (!has_lock) { if (previous->visit_all) { _mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the visitor lock"); } // skip to next arena break; } } mi_assert_internal(has_lock || !mi_option_is_enabled(mi_option_visit_abandoned)); // visit each set bit in the field (todo: maybe use `ctz` here?) 
for (; bit_idx < MI_BITMAP_FIELD_BITS; bit_idx++) { // pre-check if the bit is set size_t mask = ((size_t)1 << bit_idx); if mi_unlikely((field & mask) == mask) { mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); mi_segment_t* const segment = mi_arena_segment_clear_abandoned_at(arena, previous->subproc, bitmap_idx); if (segment != NULL) { //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } previous->bitmap_idx = mi_bitmap_index_create_ex(field_idx, bit_idx + 1); // start at next one for the next iteration return segment; } } } } } if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } } } return NULL; } static mi_segment_t* mi_arena_segment_clear_abandoned_next_list(mi_arena_field_cursor_t* previous) { // go through the abandoned_os_list // we only allow one thread per sub-process to do to visit guarded by the `abandoned_os_visit_lock`. // The lock is released when the cursor is released. if (!previous->hold_visit_lock) { previous->hold_visit_lock = (previous->visit_all ? 
(mi_lock_acquire(&previous->subproc->abandoned_os_visit_lock),true) : mi_lock_try_acquire(&previous->subproc->abandoned_os_visit_lock)); if (!previous->hold_visit_lock) { if (previous->visit_all) { _mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the OS visitor lock"); } return NULL; // we cannot get the lock, give up } } // One list entry at a time while (previous->os_list_count > 0) { previous->os_list_count--; mi_lock_acquire(&previous->subproc->abandoned_os_lock); // this could contend with concurrent OS block abandonment and reclaim from `free` mi_segment_t* segment = previous->subproc->abandoned_os_list; // pop from head of the list, a subsequent mark will push at the end (and thus we iterate through os_list_count entries) if (segment == NULL || mi_arena_segment_os_clear_abandoned(segment, false /* we already have the lock */)) { mi_lock_release(&previous->subproc->abandoned_os_lock); return segment; } // already abandoned, try again mi_lock_release(&previous->subproc->abandoned_os_lock); } // done mi_assert_internal(previous->os_list_count == 0); return NULL; } // reclaim abandoned segments // this does not set the thread id (so it appears as still abandoned) mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous) { if (previous->start < previous->end) { // walk the arena mi_segment_t* segment = mi_arena_segment_clear_abandoned_next_field(previous); if (segment != NULL) { return segment; } } // no entries in the arena's anymore, walk the abandoned OS list mi_assert_internal(previous->start == previous->end); return mi_arena_segment_clear_abandoned_next_list(previous); } bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { // (unfortunately) the visit_abandoned option must be enabled from the start. 
// This is to avoid taking locks if abandoned list visiting is not required (as for most programs) if (!mi_option_is_enabled(mi_option_visit_abandoned)) { _mi_error_message(EFAULT, "internal error: can only visit abandoned blocks when MIMALLOC_VISIT_ABANDONED=ON"); return false; } mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(NULL, _mi_subproc_from_id(subproc_id), true /* visit all (blocking) */, ¤t); mi_segment_t* segment; bool ok = true; while (ok && (segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL) { ok = _mi_segment_visit_blocks(segment, heap_tag, visit_blocks, visitor, arg); _mi_arena_segment_mark_abandoned(segment); } _mi_arena_field_cursor_done(¤t); return ok; } ================================================ FILE: third-party/mimalloc/src/arena.c ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2019-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ /* ---------------------------------------------------------------------------- "Arenas" are fixed area's of OS memory from which we can allocate large blocks (>= MI_ARENA_MIN_BLOCK_SIZE, 4MiB). In contrast to the rest of mimalloc, the arenas are shared between threads and need to be accessed using atomic operations. Arenas are also used to for huge OS page (1GiB) reservations or for reserving OS memory upfront which can be improve performance or is sometimes needed on embedded devices. We can also employ this with WASI or `sbrk` systems to reserve large arenas upfront and be able to reuse the memory more effectively. The arena allocation needs to be thread safe and we use an atomic bitmap to allocate. 
-----------------------------------------------------------------------------*/ #include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/atomic.h" #include "bitmap.h" /* ----------------------------------------------------------- Arena allocation ----------------------------------------------------------- */ // A memory arena descriptor typedef struct mi_arena_s { mi_arena_id_t id; // arena id; 0 for non-specific mi_memid_t memid; // memid of the memory area _Atomic(uint8_t*) start; // the start of the memory area size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`) size_t meta_size; // size of the arena structure itself (including its bitmaps) mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) int numa_node; // associated NUMA node bool exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited _Atomic(size_t) search_idx; // optimization to start the search for free blocks _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be purged from `blocks_purge`. mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted) mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted) mi_bitmap_field_t* blocks_abandoned; // blocks that start with an abandoned segment. 
(This crosses API's but it is convenient to have here) mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`) // do not add further fields here as the dirty, committed, purged, and abandoned bitmaps follow the inuse bitmap fields. } mi_arena_t; #define MI_ARENA_BLOCK_SIZE (MI_SEGMENT_SIZE) // 64MiB (must be at least MI_SEGMENT_ALIGN) #define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 32MiB #define MI_MAX_ARENAS (132) // Limited as the reservation exponentially increases (and takes up .bss) // The available arenas static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0 static mi_decl_cache_align _Atomic(int64_t) mi_arenas_purge_expire; // set if there exist purgeable arenas #define MI_IN_ARENA_C #include "arena-abandon.c" #undef MI_IN_ARENA_C /* ----------------------------------------------------------- Arena id's id = arena_index + 1 ----------------------------------------------------------- */ size_t mi_arena_id_index(mi_arena_id_t id) { return (size_t)(id <= 0 ? 
MI_MAX_ARENAS : id - 1); } static mi_arena_id_t mi_arena_id_create(size_t arena_index) { mi_assert_internal(arena_index < MI_MAX_ARENAS); return (int)arena_index + 1; } mi_arena_id_t _mi_arena_id_none(void) { return 0; } static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) { return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) || (arena_id == req_arena_id)); } bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) { if (memid.memkind == MI_MEM_ARENA) { return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id); } else { return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); } } bool _mi_arena_memid_is_os_allocated(mi_memid_t memid) { return (memid.memkind == MI_MEM_OS); } size_t mi_arena_get_count(void) { return mi_atomic_load_relaxed(&mi_arena_count); } mi_arena_t* mi_arena_from_index(size_t idx) { mi_assert_internal(idx < mi_arena_get_count()); return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]); } /* ----------------------------------------------------------- Arena allocations get a (currently) 16-bit memory id where the lower 8 bits are the arena id, and the upper bits the block index. 
----------------------------------------------------------- */ static size_t mi_block_count_of_size(size_t size) { return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE); } static size_t mi_arena_block_size(size_t bcount) { return (bcount * MI_ARENA_BLOCK_SIZE); } static size_t mi_arena_size(mi_arena_t* arena) { return mi_arena_block_size(arena->block_count); } static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, mi_bitmap_index_t bitmap_index) { mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); memid.mem.arena.id = id; memid.mem.arena.block_index = bitmap_index; memid.mem.arena.is_exclusive = is_exclusive; return memid; } bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index) { mi_assert_internal(memid.memkind == MI_MEM_ARENA); *arena_index = mi_arena_id_index(memid.mem.arena.id); *bitmap_index = memid.mem.arena.block_index; return memid.mem.arena.is_exclusive; } /* ----------------------------------------------------------- Special static area for mimalloc internal structures to avoid OS calls (for example, for the arena metadata (~= 256b)) ----------------------------------------------------------- */ #define MI_ARENA_STATIC_MAX ((MI_INTPTR_SIZE/2)*MI_KiB) // 4 KiB on 64-bit static mi_decl_cache_align uint8_t mi_arena_static[MI_ARENA_STATIC_MAX]; // must be cache aligned, see issue #895 static mi_decl_cache_align _Atomic(size_t) mi_arena_static_top; static void* mi_arena_static_zalloc(size_t size, size_t alignment, mi_memid_t* memid) { *memid = _mi_memid_none(); if (size == 0 || size > MI_ARENA_STATIC_MAX) return NULL; const size_t toplow = mi_atomic_load_relaxed(&mi_arena_static_top); if ((toplow + size) > MI_ARENA_STATIC_MAX) return NULL; // try to claim space if (alignment < MI_MAX_ALIGN_SIZE) { alignment = MI_MAX_ALIGN_SIZE; } const size_t oversize = size + alignment - 1; if (toplow + oversize > MI_ARENA_STATIC_MAX) return NULL; const size_t oldtop = mi_atomic_add_acq_rel(&mi_arena_static_top, 
oversize); size_t top = oldtop + oversize; if (top > MI_ARENA_STATIC_MAX) { // try to roll back, ok if this fails mi_atomic_cas_strong_acq_rel(&mi_arena_static_top, &top, oldtop); return NULL; } // success *memid = _mi_memid_create(MI_MEM_STATIC); memid->initially_zero = true; const size_t start = _mi_align_up(oldtop, alignment); uint8_t* const p = &mi_arena_static[start]; _mi_memzero_aligned(p, size); return p; } void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid) { *memid = _mi_memid_none(); // try static void* p = mi_arena_static_zalloc(size, MI_MAX_ALIGN_SIZE, memid); if (p != NULL) return p; // or fall back to the OS p = _mi_os_alloc(size, memid); if (p == NULL) return NULL; // zero the OS memory if needed if (!memid->initially_zero) { _mi_memzero_aligned(p, size); memid->initially_zero = true; } return p; } void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size) { if (mi_memkind_is_os(memid.memkind)) { _mi_os_free(p, size, memid); } else { mi_assert(memid.memkind == MI_MEM_STATIC); } } void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) { return (arena->start + mi_arena_block_size(mi_bitmap_index_bit(bindex))); } /* ----------------------------------------------------------- Thread safe allocation in an arena ----------------------------------------------------------- */ // claim the `blocks_inuse` bits static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx) { size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx); // start from last search; ok to be relaxed as the exact start does not matter if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx)) { mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx)); // start search from found location next time around return true; }; return false; } /* ----------------------------------------------------------- Arena Allocation 
----------------------------------------------------------- */

// Try to claim `needed_bcount` consecutive blocks in `arena`.
// Returns the start of the claimed range, or NULL if no range could be claimed.
// On success `*memid` describes the range: its pinned state is copied from the
// arena, and `initially_zero` / `initially_committed` reflect the dirty and
// committed bitmaps (committing the range now when `commit` is true).
static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount,
                                                    bool commit, mi_memid_t* memid)
{
  MI_UNUSED(arena_index);
  mi_assert_internal(mi_arena_id_index(arena->id) == arena_index);

  mi_bitmap_index_t bitmap_index;
  if (!mi_arena_try_claim(arena, needed_bcount, &bitmap_index)) return NULL;

  // claimed it!
  void* p = mi_arena_block_start(arena, bitmap_index);
  *memid = mi_memid_create_arena(arena->id, arena->exclusive, bitmap_index);
  memid->is_pinned = arena->memid.is_pinned;

  // none of the claimed blocks should be scheduled for a decommit
  if (arena->blocks_purge != NULL) {
    // this is thread safe as a potential purge only decommits parts that are not yet claimed as used (in `blocks_inuse`).
    _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, needed_bcount, bitmap_index);
  }

  // set the dirty bits (todo: no need for an atomic op here?)
  if (arena->memid.initially_zero && arena->blocks_dirty != NULL) {
    memid->initially_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL);
  }

  // set commit state
  if (arena->blocks_committed == NULL) {
    // always committed
    memid->initially_committed = true;
  }
  else if (commit) {
    // commit requested, but the range may not be committed as a whole: ensure it is committed now
    memid->initially_committed = true;
    bool any_uncommitted;
    _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted);
    if (any_uncommitted) {
      bool commit_zero = false;
      if (!_mi_os_commit(p, mi_arena_block_size(needed_bcount), &commit_zero)) {
        // the commit failed: report the range as not committed
        memid->initially_committed = false;
      }
      else {
        if (commit_zero) { memid->initially_zero = true; }
      }
    }
  }
  else {
    // no need to commit, but check if already fully committed
    memid->initially_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index);
  }

  return p;
}

// allocate in a specific arena
// Returns NULL if the arena slot is empty, if the arena is unsuitable (large
// pages not allowed, exclusivity / requested id mismatch, or the NUMA check
// below), or if no block range could be claimed.
// With `match_numa_node` true only NUMA-suitable arenas are used; with false
// only the arenas that are NOT NUMA-suitable are used.
static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, size_t size, size_t alignment,
                                       bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid )
{
  MI_UNUSED_RELEASE(alignment);
  mi_assert(alignment <= MI_SEGMENT_ALIGN);
  const size_t bcount = mi_block_count_of_size(size);
  const size_t arena_index = mi_arena_id_index(arena_id);
  mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count));
  mi_assert_internal(size <= mi_arena_block_size(bcount));

  // Check arena suitability
  mi_arena_t* arena = mi_arena_from_index(arena_index);
  if (arena == NULL) return NULL;
  if (!allow_large && arena->is_large) return NULL;
  if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL;
  if (req_arena_id == _mi_arena_id_none()) { // if not specific, check numa affinity
    const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node);
    if (match_numa_node) { if (!numa_suitable) return NULL; }
                    else { if (numa_suitable) return NULL; }
  }

  // try to allocate
  void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, memid);
  mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment));
  return p;
}


// allocate from an arena with fallback to the OS
// (this function itself only tries the arenas and returns NULL when none can
//  serve the request; the caller decides what to do next)
static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment,
                                                  bool commit, bool allow_large,
                                                  mi_arena_id_t req_arena_id, mi_memid_t* memid )
{
  MI_UNUSED(alignment);
  mi_assert_internal(alignment <= MI_SEGMENT_ALIGN);
  const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
  if mi_likely(max_arena == 0) return NULL;

  if (req_arena_id != _mi_arena_id_none()) {
    // try a specific arena if requested
    if (mi_arena_id_index(req_arena_id) < max_arena) {
      void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid);
      if (p != NULL) return p;
    }
  }
  else {
    // try numa affine allocation
    for (size_t i = 0; i < max_arena; i++) {
      void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid);
      if (p != NULL) return p;
    }

    // try from another numa node instead..
    if (numa_node >= 0) {  // if numa_node was < 0 (no specific affinity requested), all arena's have been tried already
      for (size_t i = 0; i < max_arena; i++) {
        void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, memid);
        if (p != NULL) return p;
      }
    }
  }
  return NULL;
}

// try to reserve a fresh arena space
// Returns true if a new arena of at least `req_size` bytes was reserved; its id
// is passed back through `arena_id` by `mi_reserve_os_memory_ex`.
static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t *arena_id)
{
  if (_mi_preloading()) return false;  // use OS only while pre loading

  const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count);
  if (arena_count > (MI_MAX_ARENAS - 4)) return false;

  size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve);
  if (arena_reserve == 0) return false;

  if (!_mi_os_has_virtual_reserve()) {
    arena_reserve = arena_reserve/4;  // be conservative if virtual reserve is not supported (for WASM for example)
  }
  arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE);
  arena_reserve = _mi_align_up(arena_reserve, MI_SEGMENT_SIZE);
  if (arena_count >= 8 && arena_count <= 128) {
    // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB)
    const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16 );
    size_t reserve = 0;
    // only use the scaled size when the multiplication did not overflow
    if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) {
      arena_reserve = reserve;
    }
  }
  if (arena_reserve < req_size) return false;  // should be able to at least handle the current allocation size

  // commit eagerly?
  bool arena_commit = false;
  if (mi_option_get(mi_option_arena_eager_commit) == 2)      { arena_commit = _mi_os_has_overcommit(); }
  else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; }

  return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0);
}


// Allocate `size` bytes with the given alignment (and alignment offset).
// Order of attempts:
//   1. an existing arena (only when arena allocation is allowed, the size is
//      at least `MI_ARENA_MIN_OBJ_SIZE`, `alignment <= MI_SEGMENT_ALIGN` and
//      `align_offset == 0`),
//   2. a freshly reserved arena (only when no specific arena was requested),
//   3. the OS directly -- unless OS allocation is disallowed or a specific
//      arena was requested, in which case `errno` is set to ENOMEM and NULL
//      is returned.
// `*memid` is reset to none on entry and describes the memory on success.
void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large,
                              mi_arena_id_t req_arena_id, mi_memid_t* memid)
{
  mi_assert_internal(memid != NULL);
  mi_assert_internal(size > 0);
  *memid = _mi_memid_none();

  const int numa_node = _mi_os_numa_node(); // current numa node

  // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data)
  if (!mi_option_is_enabled(mi_option_disallow_arena_alloc)) {  // is arena allocation allowed?
    if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) {
      void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid);
      if (p != NULL) return p;

      // otherwise, try to first eagerly reserve a new arena
      if (req_arena_id == _mi_arena_id_none()) {
        mi_arena_id_t arena_id = 0;
        if (mi_arena_reserve(size, allow_large, &arena_id)) {
          // and try allocate in there
          mi_assert_internal(req_arena_id == _mi_arena_id_none());
          p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid);
          if (p != NULL) return p;
        }
      }
    }
  }

  // if we cannot use OS allocation, return NULL
  if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) {
    errno = ENOMEM;
    return NULL;
  }

  // finally, fall back to the OS
  if (align_offset > 0) {
    return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid);
  }
  else {
    return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid);
  }
}

// Convenience wrapper: `_mi_arena_alloc_aligned` with `MI_ARENA_BLOCK_SIZE`
// alignment and no alignment offset.
void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid)
{
  return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid);
}


// Return the start of the memory area of arena `arena_id`, and its size in
// bytes through `size` (if `size` is not NULL). Returns NULL (with `*size`
// set to 0) when the id maps to no slot or the slot is empty.
void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) {
  if (size != NULL) *size = 0;
  size_t arena_index = mi_arena_id_index(arena_id);
  if (arena_index >= MI_MAX_ARENAS) return NULL;
  mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]);
  if (arena == NULL) return NULL;
  if (size != NULL) { *size = mi_arena_block_size(arena->block_count); }
  return arena->start;
}


/* -----------------------------------------------------------
  Arena purge
----------------------------------------------------------- */

// The arena purge delay: the `purge_delay` option multiplied by the
// `arena_purge_mult` option.
static long mi_arena_purge_delay(void) {
  // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay
  return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult));
}

// reset or decommit in an arena and update the committed/decommit bitmaps
// assumes we own the area (i.e. blocks_in_use is claimed by us)
static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks) {
  mi_assert_internal(arena->blocks_committed != NULL);
  mi_assert_internal(arena->blocks_purge != NULL);
  mi_assert_internal(!arena->memid.is_pinned);
  const size_t size = mi_arena_block_size(blocks);
  void* const p = mi_arena_block_start(arena, bitmap_idx);
  bool needs_recommit;
  if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) {
    // all blocks are committed, we can purge freely
    needs_recommit = _mi_os_purge(p, size);
  }
  else {
    // some blocks are not committed -- this can happen when a partially committed block is freed
    // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge
    // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory),
    // and also undo the decommit stats (as it was already adjusted)
    mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits));
    needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, 0);
  }

  // clear the purged blocks
  _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx);
  // update committed bitmap
  if (needs_recommit) {
    _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx);
  }
}

// Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls.
// Note: assumes we (still) own the area as we may purge immediately
static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks) {
  mi_assert_internal(arena->blocks_purge != NULL);
  const long delay = mi_arena_purge_delay();
  if (delay < 0) return;  // is purging allowed at all?

  if (_mi_preloading() || delay == 0) {
    // decommit directly
    mi_arena_purge(arena, bitmap_idx, blocks);
  }
  else {
    // schedule purge
    const mi_msecs_t expire = _mi_clock_now() + delay;
    mi_msecs_t expire0 = 0;
    if (mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire0, expire)) {
      // expiration was not yet set
      // maybe set the global arenas expire as well (if it wasn't set already)
      mi_atomic_casi64_strong_acq_rel(&mi_arenas_purge_expire, &expire0, expire);
    }
    else {
      // already an expiration was set
    }
    // mark the blocks as purgeable; the actual purge happens later
    _mi_bitmap_claim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx, NULL);
  }
}

// purge a range of blocks
// return true if the full range was purged.
// assumes we own the area (i.e.
blocks_in_use is claimed by us) static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startidx, size_t bitlen, size_t purge) { const size_t endidx = startidx + bitlen; size_t bitidx = startidx; bool all_purged = false; while (bitidx < endidx) { // count consecutive ones in the purge mask size_t count = 0; while (bitidx + count < endidx && (purge & ((size_t)1 << (bitidx + count))) != 0) { count++; } if (count > 0) { // found range to be purged const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitidx); mi_arena_purge(arena, range_idx, count); if (count == bitlen) { all_purged = true; } } bitidx += (count+1); // +1 to skip the zero bit (or end) } return all_purged; } // returns true if anything was purged static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) { // check pre-conditions if (arena->memid.is_pinned) return false; // expired yet? mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); if (!force && (expire == 0 || expire > now)) return false; // reset expire (if not already set concurrently) mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0); _mi_stat_counter_increase(&_mi_stats_main.arena_purges, 1); // potential purges scheduled, walk through the bitmap bool any_purged = false; bool full_purge = true; for (size_t i = 0; i < arena->field_count; i++) { size_t purge = mi_atomic_load_relaxed(&arena->blocks_purge[i]); if (purge != 0) { size_t bitidx = 0; while (bitidx < MI_BITMAP_FIELD_BITS) { // find consecutive range of ones in the purge mask size_t bitlen = 0; while (bitidx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitidx + bitlen))) != 0) { bitlen++; } // temporarily claim the purge range as "in-use" to be thread-safe with allocation // try to claim the longest range of corresponding in_use bits const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitidx); while( bitlen > 0 ) { if (_mi_bitmap_try_claim(arena->blocks_inuse, 
arena->field_count, bitlen, bitmap_index)) { break; } bitlen--; } // actual claimed bits at `in_use` if (bitlen > 0) { // read purge again now that we have the in_use bits purge = mi_atomic_load_acquire(&arena->blocks_purge[i]); if (!mi_arena_purge_range(arena, i, bitidx, bitlen, purge)) { full_purge = false; } any_purged = true; // release the claimed `in_use` bits again _mi_bitmap_unclaim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index); } bitidx += (bitlen+1); // +1 to skip the zero (or end) } // while bitidx } // purge != 0 } // if not fully purged, make sure to purge again in the future if (!full_purge) { const long delay = mi_arena_purge_delay(); mi_msecs_t expected = 0; mi_atomic_casi64_strong_acq_rel(&arena->purge_expire,&expected,_mi_clock_now() + delay); } return any_purged; } static void mi_arenas_try_purge( bool force, bool visit_all ) { if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled // check if any arena needs purging? const mi_msecs_t now = _mi_clock_now(); mi_msecs_t arenas_expire = mi_atomic_loadi64_acquire(&mi_arenas_purge_expire); if (!force && (arenas_expire == 0 || arenas_expire < now)) return; const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count); if (max_arena == 0) return; // allow only one thread to purge at a time static mi_atomic_guard_t purge_guard; mi_atomic_guard(&purge_guard) { // increase global expire: at most one purge per delay cycle mi_atomic_storei64_release(&mi_arenas_purge_expire, now + mi_arena_purge_delay()); size_t max_purge_count = (visit_all ? 
max_arena : 2); bool all_visited = true; for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena != NULL) { if (mi_arena_try_purge(arena, now, force)) { if (max_purge_count <= 1) { all_visited = false; break; } max_purge_count--; } } } if (all_visited) { // all arena's were visited and purged: reset global expire mi_atomic_storei64_release(&mi_arenas_purge_expire, 0); } } } /* ----------------------------------------------------------- Arena free ----------------------------------------------------------- */ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid) { mi_assert_internal(size > 0); mi_assert_internal(committed_size <= size); if (p==NULL) return; if (size==0) return; const bool all_committed = (committed_size == size); // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) mi_track_mem_undefined(p,size); if (mi_memkind_is_os(memid.memkind)) { // was a direct OS allocation, pass through if (!all_committed && committed_size > 0) { // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size) _mi_stat_decrease(&_mi_stats_main.committed, committed_size); } _mi_os_free(p, size, memid); } else if (memid.memkind == MI_MEM_ARENA) { // allocated in an arena size_t arena_idx; size_t bitmap_idx; mi_arena_memid_indices(memid, &arena_idx, &bitmap_idx); mi_assert_internal(arena_idx < MI_MAX_ARENAS); mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t,&mi_arenas[arena_idx]); mi_assert_internal(arena != NULL); const size_t blocks = mi_block_count_of_size(size); // checks if (arena == NULL) { _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } mi_assert_internal(arena->field_count > mi_bitmap_index_field(bitmap_idx)); if (arena->field_count <= mi_bitmap_index_field(bitmap_idx)) { 
_mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } // potentially decommit if (arena->memid.is_pinned || arena->blocks_committed == NULL) { mi_assert_internal(all_committed); } else { mi_assert_internal(arena->blocks_committed != NULL); mi_assert_internal(arena->blocks_purge != NULL); if (!all_committed) { // mark the entire range as no longer committed (so we recommit the full range when re-using) _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx); mi_track_mem_noaccess(p,size); if (committed_size > 0) { // if partially committed, adjust the committed stats (is it will be recommitted when re-using) // in the delayed purge, we now need to not count a decommit if the range is not marked as committed. _mi_stat_decrease(&_mi_stats_main.committed, committed_size); } // note: if not all committed, it may be that the purge will reset/decommit the entire range // that contains already decommitted parts. Since purge consistently uses reset or decommit that // works (as we should never reset decommitted parts). } // (delay) purge the entire range mi_arena_schedule_purge(arena, bitmap_idx, blocks); } // and make it available to others again bool all_inuse = _mi_bitmap_unclaim_across(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx); if (!all_inuse) { _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", p, size); return; }; } else { // arena was none, external, or static; nothing to do mi_assert_internal(memid.memkind < MI_MEM_OS); } // purge expired decommits mi_arenas_try_purge(false, false); } // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. 
static void mi_arenas_unsafe_destroy(void) { const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); size_t new_max_arena = 0; for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena != NULL) { mi_lock_done(&arena->abandoned_visit_lock); if (arena->start != NULL && mi_memkind_is_os(arena->memid.memkind)) { mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); _mi_os_free(arena->start, mi_arena_size(arena), arena->memid); } else { new_max_arena = i; } _mi_arena_meta_free(arena, arena->meta_memid, arena->meta_size); } } // try to lower the max arena. size_t expected = max_arena; mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena); } // Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired void _mi_arenas_collect(bool force_purge) { mi_arenas_try_purge(force_purge, force_purge /* visit all? */); } // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. void _mi_arena_unsafe_destroy_all(void) { mi_arenas_unsafe_destroy(); _mi_arenas_collect(true /* force purge */); // purge non-owned arenas } // Is a pointer inside any of our arenas? bool _mi_arena_contains(const void* p) { const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_arena_block_size(arena->block_count) > (const uint8_t*)p) { return true; } } return false; } /* ----------------------------------------------------------- Add an arena. 
----------------------------------------------------------- */ static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { mi_assert_internal(arena != NULL); mi_assert_internal((uintptr_t)mi_atomic_load_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0); mi_assert_internal(arena->block_count > 0); if (arena_id != NULL) { *arena_id = -1; } size_t i = mi_atomic_increment_acq_rel(&mi_arena_count); if (i >= MI_MAX_ARENAS) { mi_atomic_decrement_acq_rel(&mi_arena_count); return false; } _mi_stat_counter_increase(&stats->arena_count,1); arena->id = mi_arena_id_create(i); mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); if (arena_id != NULL) { *arena_id = arena->id; } return true; } static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept { if (arena_id != NULL) *arena_id = _mi_arena_id_none(); if (size < MI_ARENA_BLOCK_SIZE) { _mi_warning_message("the arena size is too small (memory at %p with size %zu)\n", start, size); return false; } if (is_large) { mi_assert_internal(memid.initially_committed && memid.is_pinned); } if (!_mi_is_aligned(start, MI_SEGMENT_ALIGN)) { void* const aligned_start = mi_align_up_ptr(start, MI_SEGMENT_ALIGN); const size_t diff = (uint8_t*)aligned_start - (uint8_t*)start; if (diff >= size || (size - diff) < MI_ARENA_BLOCK_SIZE) { _mi_warning_message("after alignment, the size of the arena becomes too small (memory at %p with size %zu)\n", start, size); return false; } start = aligned_start; size = size - diff; } const size_t bcount = size / MI_ARENA_BLOCK_SIZE; const size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS); const size_t bitmaps = (memid.is_pinned ? 
3 : 5); const size_t asize = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t)); mi_memid_t meta_memid; mi_arena_t* arena = (mi_arena_t*)_mi_arena_meta_zalloc(asize, &meta_memid); if (arena == NULL) return false; // already zero'd due to zalloc // _mi_memzero(arena, asize); arena->id = _mi_arena_id_none(); arena->memid = memid; arena->exclusive = exclusive; arena->meta_size = asize; arena->meta_memid = meta_memid; arena->block_count = bcount; arena->field_count = fields; arena->start = (uint8_t*)start; arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) arena->is_large = is_large; arena->purge_expire = 0; arena->search_idx = 0; mi_lock_init(&arena->abandoned_visit_lock); // consecutive bitmaps arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap arena->blocks_abandoned = &arena->blocks_inuse[2 * fields]; // just after dirty bitmap arena->blocks_committed = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after abandoned bitmap arena->blocks_purge = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[4*fields]); // just after committed bitmap // initialize committed bitmap? 
if (arena->blocks_committed != NULL && arena->memid.initially_committed) { memset((void*)arena->blocks_committed, 0xFF, fields*sizeof(mi_bitmap_field_t)); // cast to void* to avoid atomic warning } // and claim leftover blocks if needed (so we never allocate there) ptrdiff_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount; mi_assert_internal(post >= 0); if (post > 0) { // don't use leftover bits at the end mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); _mi_bitmap_claim(arena->blocks_inuse, fields, post, postidx, NULL); } return mi_arena_add(arena, arena_id, &_mi_stats_main); } bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL); memid.initially_committed = is_committed; memid.initially_zero = is_zero; memid.is_pinned = is_large; return mi_manage_os_memory_ex2(start,size,is_large,numa_node,exclusive,memid, arena_id); } // Reserve a range of regular OS memory int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { if (arena_id != NULL) *arena_id = _mi_arena_id_none(); size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block mi_memid_t memid; void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, allow_large, &memid); if (start == NULL) return ENOMEM; const bool is_large = memid.is_pinned; // todo: use separate is_large field? if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { _mi_os_free_ex(start, size, commit, memid); _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); return ENOMEM; } _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? 
" (in large os pages)" : ""); return 0; } // Manage a range of regular OS memory bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept { return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL); } // Reserve a range of regular OS memory int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept { return mi_reserve_os_memory_ex(size, commit, allow_large, false, NULL); } /* ----------------------------------------------------------- Debugging ----------------------------------------------------------- */ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_field_t* fields, size_t field_count ) { _mi_message("%s%s:\n", prefix, header); size_t bcount = 0; size_t inuse_count = 0; for (size_t i = 0; i < field_count; i++) { char buf[MI_BITMAP_FIELD_BITS + 1]; uintptr_t field = mi_atomic_load_relaxed(&fields[i]); for (size_t bit = 0; bit < MI_BITMAP_FIELD_BITS; bit++, bcount++) { if (bcount < block_count) { bool inuse = ((((uintptr_t)1 << bit) & field) != 0); if (inuse) inuse_count++; buf[bit] = (inuse ? 'x' : '.'); } else { buf[bit] = ' '; } } buf[MI_BITMAP_FIELD_BITS] = 0; _mi_message("%s %s\n", prefix, buf); } _mi_message("%s total ('x'): %zu\n", prefix, inuse_count); return inuse_count; } void mi_debug_show_arenas(void) mi_attr_noexcept { const bool show_inuse = true; size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); size_t inuse_total = 0; //size_t abandoned_total = 0; //size_t purge_total = 0; for (size_t i = 0; i < max_arenas; i++) { mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); if (arena == NULL) break; _mi_message("arena %zu: %zu blocks of size %zuMiB (in %zu fields) %s\n", i, arena->block_count, MI_ARENA_BLOCK_SIZE / MI_MiB, arena->field_count, (arena->memid.is_pinned ? 
", pinned" : "")); if (show_inuse) { inuse_total += mi_debug_show_bitmap(" ", "inuse blocks", arena->block_count, arena->blocks_inuse, arena->field_count); } if (arena->blocks_committed != NULL) { mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, arena->blocks_committed, arena->field_count); } //if (show_abandoned) { // abandoned_total += mi_debug_show_bitmap(" ", "abandoned blocks", arena->block_count, arena->blocks_abandoned, arena->field_count); //} //if (show_purge && arena->blocks_purge != NULL) { // purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, arena->blocks_purge, arena->field_count); //} } if (show_inuse) _mi_message("total inuse blocks : %zu\n", inuse_total); //if (show_abandoned) _mi_message("total abandoned blocks: %zu\n", abandoned_total); //if (show_purge) _mi_message("total purgeable blocks: %zu\n", purge_total); } void mi_arenas_print(void) mi_attr_noexcept { mi_debug_show_arenas(); } /* ----------------------------------------------------------- Reserve a huge page arena. 
----------------------------------------------------------- */ // reserve at a specific numa node int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { if (arena_id != NULL) *arena_id = -1; if (pages==0) return 0; if (numa_node < -1) numa_node = -1; if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); size_t hsize = 0; size_t pages_reserved = 0; mi_memid_t memid; void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize, &memid); if (p==NULL || pages_reserved==0) { _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages); return ENOMEM; } _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) { _mi_os_free(p, hsize, memid); return ENOMEM; } return 0; } int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept { return mi_reserve_huge_os_pages_at_ex(pages, numa_node, timeout_msecs, false, NULL); } // reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected) int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept { if (pages == 0) return 0; // pages per numa node size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count()); if (numa_count <= 0) numa_count = 1; const size_t pages_per = pages / numa_count; const size_t pages_mod = pages % numa_count; const size_t timeout_per = (timeout_msecs==0 ? 
0 : (timeout_msecs / numa_count) + 50); // reserve evenly among numa nodes for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { size_t node_pages = pages_per; // can be 0 if (numa_node < pages_mod) node_pages++; int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per); if (err) return err; if (pages < node_pages) { pages = 0; } else { pages -= node_pages; } } return 0; } int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { MI_UNUSED(max_secs); _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n"); if (pages_reserved != NULL) *pages_reserved = 0; int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0)); if (err==0 && pages_reserved!=NULL) *pages_reserved = pages; return err; } ================================================ FILE: third-party/mimalloc/src/bitmap.c ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2019-2023 Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ /* ---------------------------------------------------------------------------- Concurrent bitmap that can set/reset sequences of bits atomically, represented as an array of fields where each field is a machine word (`size_t`) There are two api's; the standard one cannot have sequences that cross between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS). The `_across` postfixed functions do allow sequences that can cross over between the fields. 
(This is used in arena allocation)
---------------------------------------------------------------------------- */

#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "bitmap.h"

/* -----------------------------------------------------------
  Bitmap definition
----------------------------------------------------------- */

// The bit mask for a given number of blocks at a specified bit index.
// `count` bits are set, starting at bit `bitidx`; `count + bitidx` must not
// exceed MI_BITMAP_FIELD_BITS. A full-width `count` yields MI_BITMAP_FIELD_FULL
// (this avoids shifting by the full word width).
static inline size_t mi_bitmap_mask_(size_t count, size_t bitidx) {
  mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS);
  mi_assert_internal(count > 0);
  if (count >= MI_BITMAP_FIELD_BITS) return MI_BITMAP_FIELD_FULL;
  if (count == 0) return 0;
  return ((((size_t)1 << count) - 1) << bitidx);
}


/* -----------------------------------------------------------
  Claim a bit sequence atomically
----------------------------------------------------------- */

// Try to atomically claim a sequence of `count` bits in a single
// field at `idx` in `bitmap`. Returns `true` on success.
// On success `*bitmap_idx` holds the index of the first claimed bit.
// `count` must be in the range 1..MI_BITMAP_FIELD_BITS.
inline bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx)
{
  mi_assert_internal(bitmap_idx != NULL);
  mi_assert_internal(count <= MI_BITMAP_FIELD_BITS);
  mi_assert_internal(count > 0);
  mi_bitmap_field_t* field = &bitmap[idx];
  size_t map  = mi_atomic_load_relaxed(field);
  if (map==MI_BITMAP_FIELD_FULL) return false; // short cut

  // search for 0-bit sequence of length count
  const size_t mask = mi_bitmap_mask_(count, 0);
  const size_t bitidx_max = MI_BITMAP_FIELD_BITS - count;  // last start position at which `count` bits still fit

#ifdef MI_HAVE_FAST_BITSCAN
  size_t bitidx = mi_ctz(~map);    // quickly find the first zero bit if possible
#else
  size_t bitidx = 0;               // otherwise start at 0
#endif
  size_t m = (mask << bitidx);     // invariant: m == mask shifted by bitidx

  // scan linearly for a free range of zero bits
  while (bitidx <= bitidx_max) {
    const size_t mapm = (map & m);
    if (mapm == 0) {  // are the mask bits free at bitidx?
      mi_assert_internal((m >> bitidx) == mask); // no overflow?
      const size_t newmap = (map | m);
      mi_assert_internal((newmap^map) >> bitidx == mask);
      // on failure the CAS stores the current field value into `map`,
      // so the loop re-tests the same `bitidx` against fresh data
      if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) {  // TODO: use weak cas here?
        // no success, another thread claimed concurrently.. keep going (with updated `map`)
        continue;
      }
      else {
        // success, we claimed the bits!
        *bitmap_idx = mi_bitmap_index_create(idx, bitidx);
        return true;
      }
    }
    else {
      // on to the next bit range
#ifdef MI_HAVE_FAST_BITSCAN
      // skip past the highest set bit inside the current window
      mi_assert_internal(mapm != 0);
      const size_t shift = (count == 1 ? 1 : (MI_SIZE_BITS - mi_clz(mapm) - bitidx));
      mi_assert_internal(shift > 0 && shift <= count);
#else
      const size_t shift = 1;
#endif
      bitidx += shift;
      m <<= shift;
    }
  }
  // no bits found
  return false;
}


// Find `count` bits of 0 and set them to 1 atomically; returns `true` on success.
// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
// `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields.
bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) {
  size_t idx = start_field_idx;
  for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) {
    if (idx >= bitmap_fields) { idx = 0; } // wrap
    if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
      return true;
    }
  }
  return false;
}

// Like _mi_bitmap_try_find_from_claim but with an extra predicate that must be fulfilled
// (a NULL `pred_fun` accepts every claim). A claim rejected by the predicate
// is released again before the search continues with the next field.
bool _mi_bitmap_try_find_from_claim_pred(mi_bitmap_t bitmap, const size_t bitmap_fields,
            const size_t start_field_idx, const size_t count,
            mi_bitmap_pred_fun_t pred_fun, void* pred_arg,
            mi_bitmap_index_t* bitmap_idx) {
  size_t idx = start_field_idx;
  for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) {
    if (idx >= bitmap_fields) idx = 0; // wrap
    if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
      if (pred_fun == NULL || pred_fun(*bitmap_idx, pred_arg)) {
        return true;
      }
      // predicate returned false, unclaim and look further
      _mi_bitmap_unclaim(bitmap, bitmap_fields, count, *bitmap_idx);
    }
  }
  return false;
}

// Set `count` bits at `bitmap_idx` to 0 atomically
// Returns `true` if all `count` bits were 1 previously.
// The range must lie within a single field.
bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
  const size_t idx = mi_bitmap_index_field(bitmap_idx);
  const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
  const size_t mask = mi_bitmap_mask_(count, bitidx);
  mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
  // mi_assert_internal((bitmap[idx] & mask) == mask);
  const size_t prev = mi_atomic_and_acq_rel(&bitmap[idx], ~mask);
  return ((prev & mask) == mask);
}


// Set `count` bits at `bitmap_idx` to 1 atomically
// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
// The range must lie within a single field; `any_zero` may be NULL.
bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) {
  const size_t idx = mi_bitmap_index_field(bitmap_idx);
  const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
  const size_t mask = mi_bitmap_mask_(count, bitidx);
  mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
  //mi_assert_internal(any_zero != NULL || (bitmap[idx] & mask) == 0);
  size_t prev = mi_atomic_or_acq_rel(&bitmap[idx], mask);
  if (any_zero != NULL) { *any_zero = ((prev & mask) != mask); }
  return ((prev & mask) == 0);
}

// Returns `true` if all `count` bits were 1. `any_ones` is `true` if there was at least one bit set to one.
static bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_ones) {
  // Test (without modifying) `count` bits starting at `bitmap_idx`; the range
  // lies in one field. `any_ones` may be NULL.
  const size_t field_idx = mi_bitmap_index_field(bitmap_idx);
  const size_t mask = mi_bitmap_mask_(count, mi_bitmap_index_bit_in_field(bitmap_idx));
  mi_assert_internal(bitmap_fields > field_idx); MI_UNUSED(bitmap_fields);

  const size_t current = mi_atomic_load_relaxed(&bitmap[field_idx]);
  const size_t selected = (current & mask);
  if (any_ones != NULL) {
    *any_ones = (selected != 0);
  }
  return (selected == mask);
}

// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically.
// Returns `true` if successful when all previous `count` bits were 0.
bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
  const size_t field_idx = mi_bitmap_index_field(bitmap_idx);
  const size_t mask = mi_bitmap_mask_(count, mi_bitmap_index_bit_in_field(bitmap_idx));
  mi_assert_internal(bitmap_fields > field_idx); MI_UNUSED(bitmap_fields);

  size_t seen = mi_atomic_load_relaxed(&bitmap[field_idx]);
  for (;;) {
    // give up as soon as any of the wanted bits is already taken
    if ((seen & mask) != 0) return false;
    // a failed CAS refreshes `seen` with the current field value
    if (mi_atomic_cas_strong_acq_rel(&bitmap[field_idx], &seen, seen | mask)) break;
  }

  mi_assert_internal((seen & mask) == 0);
  return true;
}


// Are all `count` bits at `bitmap_idx` set?
bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
  return mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, NULL);
}

// Is at least one of the `count` bits at `bitmap_idx` set?
bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
  bool found_one;
  mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, &found_one);
  return found_one;
}


//--------------------------------------------------------------------------
// the `_across` functions work on bitmaps where sequences can cross over
// between the fields.
// This is used in arena allocation
//--------------------------------------------------------------------------

// Try to atomically claim a sequence of `count` bits starting from the field
// at `idx` in `bitmap` and crossing into subsequent fields. Returns `true` on success.
// Only needs to consider crossing into the next fields (see `mi_bitmap_try_find_from_claim_across`)
// The claim is made field by field; if a later field turns out to be taken, the
// fields claimed so far are released again and the attempt is retried a bounded
// number of times (`retries` counts the attempts made so far).
static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx)
{
  mi_assert_internal(bitmap_idx != NULL);

  // check initial trailing zeros
  // (`initial` free bits are the highest-order bits of field `idx`: the range
  //  claimed below starts at bit `MI_BITMAP_FIELD_BITS - initial`)
  mi_bitmap_field_t* field = &bitmap[idx];
  size_t map = mi_atomic_load_relaxed(field);
  const size_t initial = mi_clz(map);  // count of initial zeros starting at idx
  mi_assert_internal(initial <= MI_BITMAP_FIELD_BITS);
  if (initial == 0)     return false;
  if (initial >= count) return _mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx);    // no need to cross fields (this case won't happen for us)
  if (_mi_divide_up(count - initial, MI_BITMAP_FIELD_BITS) >= (bitmap_fields - idx)) return false; // not enough entries

  // scan ahead
  // (read-only pass: check that the following fields have enough free low bits)
  size_t found = initial;
  size_t mask = 0;     // mask bits for the final field
  while(found < count) {
    field++;
    map = mi_atomic_load_relaxed(field);
    const size_t mask_bits = (found + MI_BITMAP_FIELD_BITS <= count ? MI_BITMAP_FIELD_BITS : (count - found));
    mi_assert_internal(mask_bits > 0 && mask_bits <= MI_BITMAP_FIELD_BITS);
    mask = mi_bitmap_mask_(mask_bits, 0);
    if ((map & mask) != 0) return false;  // some part is already claimed
    found += mask_bits;
  }
  mi_assert_internal(field < &bitmap[bitmap_fields]);

  // we found a range of contiguous zeros up to the final field; mask contains mask in the final field
  // now try to claim the range atomically
  mi_bitmap_field_t* const final_field = field;
  const size_t final_mask = mask;
  mi_bitmap_field_t* const initial_field = &bitmap[idx];
  const size_t initial_idx = MI_BITMAP_FIELD_BITS - initial;
  const size_t initial_mask = mi_bitmap_mask_(initial, initial_idx);

  // initial field
  size_t newmap;
  field = initial_field;
  map = mi_atomic_load_relaxed(field);
  do {
    newmap = (map | initial_mask);
    if ((map & initial_mask) != 0) { goto rollback; };
  } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));

  // intermediate fields
  // (these must be entirely free: the CAS only succeeds from 0 to all-ones)
  while (++field < final_field) {
    newmap = MI_BITMAP_FIELD_FULL;
    map = 0;
    if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { goto rollback; }
  }

  // final field
  mi_assert_internal(field == final_field);
  map = mi_atomic_load_relaxed(field);
  do {
    newmap = (map | final_mask);
    if ((map & final_mask) != 0) { goto rollback; }
  } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));

  // claimed!
  *bitmap_idx = mi_bitmap_index_create(idx, initial_idx);
  return true;

rollback:
  // roll back intermediate fields
  // (we just failed to claim `field` so decrement first)
  while (--field > initial_field) {
    newmap = 0;
    map = MI_BITMAP_FIELD_FULL;
    mi_assert_internal(mi_atomic_load_relaxed(field) == map);
    mi_atomic_store_release(field, newmap);
  }
  if (field == initial_field) {               // (if we failed on the initial field, `field + 1 == initial_field`)
    // release only our own bits of the initial field; other bits may change concurrently
    map = mi_atomic_load_relaxed(field);
    do {
      mi_assert_internal((map & initial_mask) == initial_mask);
      newmap = (map & ~initial_mask);
    } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
  }
  mi_stat_counter_increase(_mi_stats_main.arena_rollback_count,1);
  // retry? (we make a recursive call instead of goto to be able to use const declarations)
  if (retries <= 2) {
    return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx);
  }
  else {
    return false;
  }
}


// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success.
// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) {
  // Counts of 1 or 2 are only searched within single fields; larger counts
  // are searched with the field-crossing claim, one start field at a time.
  mi_assert_internal(count > 0);
  if (count <= 2) {
    // we don't bother with crossover fields for small counts
    return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, start_field_idx, count, bitmap_idx);
  }

  // visit every field once, starting at `start_field_idx` and wrapping around
  size_t field_idx = start_field_idx;
  size_t remaining = bitmap_fields;
  while (remaining > 0) {
    if (field_idx >= bitmap_fields) field_idx = 0;  // wrap
    // (a claim inside a single field is deliberately not attempted first:)
    /*
    if (count <= MI_BITMAP_FIELD_BITS) {
      if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
        return true;
      }
    }
    */
    // try to claim across fields, starting in this field
    if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, field_idx, count, 0, bitmap_idx)) {
      return true;
    }
    remaining--;
    field_idx++;
  }
  return false;
}

// Helper for masks across fields; returns the mid count, post_mask may be 0
// `*pre_mask` covers the bits in the first field, `*mid_mask` applies to each
// of the returned number of completely covered fields, and `*post_mask` covers
// the remaining bits in the field after those.
static size_t mi_bitmap_mask_across(mi_bitmap_index_t bitmap_idx, size_t bitmap_fields, size_t count, size_t* pre_mask, size_t* mid_mask, size_t* post_mask) {
  MI_UNUSED(bitmap_fields);
  const size_t first_bit = mi_bitmap_index_bit_in_field(bitmap_idx);

  if mi_likely(first_bit + count <= MI_BITMAP_FIELD_BITS) {
    // the whole range fits in the first field
    mi_assert_internal(mi_bitmap_index_field(bitmap_idx) < bitmap_fields);
    *pre_mask = mi_bitmap_mask_(count, first_bit);
    *mid_mask = 0;
    *post_mask = 0;
    return 0;
  }

  // the range spills over: split it in head bits, full fields, and tail bits
  const size_t head_bits = MI_BITMAP_FIELD_BITS - first_bit;
  mi_assert_internal(head_bits < count);
  const size_t rest = count - head_bits;
  const size_t full_fields = rest / MI_BITMAP_FIELD_BITS;
  const size_t tail_bits = rest % MI_BITMAP_FIELD_BITS;

  *pre_mask = mi_bitmap_mask_(head_bits, first_bit);
  *mid_mask = MI_BITMAP_FIELD_FULL;
  if (tail_bits == 0) {
    *post_mask = 0;
  }
  else {
    *post_mask = mi_bitmap_mask_(tail_bits, 0);
  }
  mi_assert_internal(mi_bitmap_index_field(bitmap_idx) + full_fields + (tail_bits==0 ? 0 : 1) < bitmap_fields);
  return full_fields;
}

// Set `count` bits at `bitmap_idx` to 0 atomically
// Returns `true` if all `count` bits were 1 previously.
bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
  size_t pre_mask;
  size_t mid_mask;
  size_t post_mask;
  const size_t first_field = mi_bitmap_index_field(bitmap_idx);
  size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
  mi_bitmap_field_t* cursor = &bitmap[first_field];

  // clear first part
  size_t before = mi_atomic_and_acq_rel(cursor, ~pre_mask);
  cursor++;
  bool all_one = ((before & pre_mask) == pre_mask);

  // clear mid part
  for (; mid_count > 0; mid_count--) {
    before = mi_atomic_and_acq_rel(cursor, ~mid_mask);
    cursor++;
    if ((before & mid_mask) != mid_mask) all_one = false;
  }

  // clear end part
  if (post_mask != 0) {
    before = mi_atomic_and_acq_rel(cursor, ~post_mask);
    if ((before & post_mask) != post_mask) all_one = false;
  }
  return all_one;
}

// Set `count` bits at `bitmap_idx` to 1 atomically
// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero) {
  // The range may span several fields; each field is updated with its own
  // atomic OR. `pany_zero` may be NULL.
  size_t pre_mask;
  size_t mid_mask;
  size_t post_mask;
  const size_t first_field = mi_bitmap_index_field(bitmap_idx);
  size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
  mi_bitmap_field_t* cursor = &bitmap[first_field];

  // first (partial) field
  size_t before = mi_atomic_or_acq_rel(cursor, pre_mask);
  cursor++;
  bool all_zero = ((before & pre_mask) == 0);
  bool any_zero = ((before & pre_mask) != pre_mask);

  // completely covered fields
  for (; mid_count > 0; mid_count--) {
    before = mi_atomic_or_acq_rel(cursor, mid_mask);
    cursor++;
    if ((before & mid_mask) != 0) all_zero = false;
    if ((before & mid_mask) != mid_mask) any_zero = true;
  }

  // last (partial) field
  if (post_mask != 0) {
    before = mi_atomic_or_acq_rel(cursor, post_mask);
    if ((before & post_mask) != 0) all_zero = false;
    if ((before & post_mask) != post_mask) any_zero = true;
  }

  if (pany_zero != NULL) {
    *pany_zero = any_zero;
  }
  return all_zero;
}


// Returns `true` if all `count` bits were 1.
// `any_ones` is `true` if there was at least one bit set to one.
static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones) { size_t idx = mi_bitmap_index_field(bitmap_idx); size_t pre_mask; size_t mid_mask; size_t post_mask; size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); bool all_ones = true; bool any_ones = false; mi_bitmap_field_t* field = &bitmap[idx]; size_t prev = mi_atomic_load_relaxed(field++); if ((prev & pre_mask) != pre_mask) all_ones = false; if ((prev & pre_mask) != 0) any_ones = true; while (mid_count-- > 0) { prev = mi_atomic_load_relaxed(field++); if ((prev & mid_mask) != mid_mask) all_ones = false; if ((prev & mid_mask) != 0) any_ones = true; } if (post_mask!=0) { prev = mi_atomic_load_relaxed(field); if ((prev & post_mask) != post_mask) all_ones = false; if ((prev & post_mask) != 0) any_ones = true; } if (pany_ones != NULL) { *pany_ones = any_ones; } return all_ones; } bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { return mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, NULL); } bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { bool any_ones; mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, &any_ones); return any_ones; } ================================================ FILE: third-party/mimalloc/src/bitmap.h ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2019-2023 Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. 
-----------------------------------------------------------------------------*/

/* ----------------------------------------------------------------------------
Concurrent bitmap that can set/reset sequences of bits atomically,
represented as an array of fields where each field is a machine word (`size_t`)

There are two api's; the standard one cannot have sequences that cross
between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS).
(this is used in region allocation)

The `_across` postfixed functions do allow sequences that can cross over
between the fields. (This is used in arena allocation)
---------------------------------------------------------------------------- */
#pragma once
#ifndef MI_BITMAP_H
#define MI_BITMAP_H

/* -----------------------------------------------------------
  Bitmap definition
----------------------------------------------------------- */

// Number of bits in one bitmap field, and a field with every bit set.
#define MI_BITMAP_FIELD_BITS   (8*MI_SIZE_SIZE)
#define MI_BITMAP_FIELD_FULL   (~((size_t)0))   // all bits set

// An atomic bitmap of `size_t` fields
typedef _Atomic(size_t)  mi_bitmap_field_t;
typedef mi_bitmap_field_t*  mi_bitmap_t;

// A bitmap index is the index of the bit in a bitmap.
typedef size_t mi_bitmap_index_t;

// Create a bit index.
// Combines field `idx` and in-field bit `bitidx` into one flat bit index.
// This variant also accepts `bitidx == MI_BITMAP_FIELD_BITS` (i.e. bit 0 of
// the next field).
static inline mi_bitmap_index_t mi_bitmap_index_create_ex(size_t idx, size_t bitidx) {
  mi_assert_internal(bitidx <= MI_BITMAP_FIELD_BITS);
  return (idx*MI_BITMAP_FIELD_BITS) + bitidx;
}
// As `mi_bitmap_index_create_ex`, but `bitidx` must lie inside the field.
static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) {
  mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS);
  return mi_bitmap_index_create_ex(idx,bitidx);
}

// Create a bit index.
// (from a flat bit position in the whole bitmap)
static inline mi_bitmap_index_t mi_bitmap_index_create_from_bit(size_t full_bitidx) {
  return mi_bitmap_index_create(full_bitidx / MI_BITMAP_FIELD_BITS, full_bitidx % MI_BITMAP_FIELD_BITS);
}

// Get the field index from a bit index.
static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) {
  // integer division: which `MI_BITMAP_FIELD_BITS`-sized field holds the bit
  return (bitmap_idx / MI_BITMAP_FIELD_BITS);
}

// Get the bit index in a bitmap field
// (the remainder of the division by `MI_BITMAP_FIELD_BITS`).
static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) {
  return (bitmap_idx % MI_BITMAP_FIELD_BITS);
}

// Get the full bit index
// (a bitmap index already is the bit position over the whole bitmap, so this is the identity).
static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) {
  return bitmap_idx;
}

/* -----------------------------------------------------------
  Claim a bit sequence atomically
----------------------------------------------------------- */

// Try to atomically claim a sequence of `count` bits in a single
// field at `idx` in `bitmap`. Returns `true` on success.
// NOTE(review): presumably `*bitmap_idx` receives the claimed position on success -- confirm in bitmap.c.
bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx);

// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields.
bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx);

// Like _mi_bitmap_try_find_from_claim but with an extra predicate that must be fulfilled.
// The predicate is called with a candidate bitmap index and the caller supplied `pred_arg`.
typedef bool (mi_cdecl *mi_bitmap_pred_fun_t)(mi_bitmap_index_t bitmap_idx, void* pred_arg);
bool _mi_bitmap_try_find_from_claim_pred(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_pred_fun_t pred_fun, void* pred_arg, mi_bitmap_index_t* bitmap_idx);

// Set `count` bits at `bitmap_idx` to 0 atomically
// Returns `true` if all `count` bits were 1 previously.
bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);

// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically.
// Returns `true` if successful when all previous `count` bits were 0.
bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);

// Set `count` bits at `bitmap_idx` to 1 atomically
// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero);

// Query (without modifying) the `count` bits at `bitmap_idx`.
// NOTE(review): presumably `_is_claimed` is `true` when all bits are set and
// `_is_any_claimed` when at least one bit is set -- confirm in bitmap.c.
bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);


//--------------------------------------------------------------------------
// the `_across` functions work on bitmaps where sequences can cross over
// between the fields. This is used in arena allocation
//--------------------------------------------------------------------------

// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success.
// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx);

// Set `count` bits at `bitmap_idx` to 0 atomically
// Returns `true` if all `count` bits were 1 previously.
bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);

// Set `count` bits at `bitmap_idx` to 1 atomically
// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero);

// Query (without modifying) the `count` bits at `bitmap_idx`, where the sequence may cross fields:
// `_is_claimed_across` is `true` if all bits are set, `_is_any_claimed_across` if at least one bit is set.
bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);

#endif

================================================
FILE: third-party/mimalloc/src/free.c
================================================
/* ----------------------------------------------------------------------------
Copyright (c) 2018-2024, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#if !defined(MI_IN_ALLOC_C)
#error "this file should be included from 'alloc.c' (so aliases can work from alloc-override)"
// these includes are only here to help an IDE
#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "mimalloc/prim.h"   // _mi_prim_thread_id()
#endif

// forward declarations (all defined further down in this file)
static void   mi_check_padding(const mi_page_t* page, const mi_block_t* block);
static bool   mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block);
static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block);
static void   mi_stat_free(const mi_page_t* page, const mi_block_t* block);


// ------------------------------------------------------
// Free
// ------------------------------------------------------

// forward declaration of multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON)
static mi_decl_noinline void mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block);

// regular free of a (thread local) block pointer
// fast path written carefully to prevent spilling on the stack
static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool track_stats, bool check_full)
{
  // `track_stats`: also update the free statistics and memory tracking for this block.
  // `check_full` : also move the page out of the full state if it was marked as such.

  // checks
  if mi_unlikely(mi_check_is_double_free(page, block)) return;  // a detected double free is reported and then ignored
  mi_check_padding(page, block);
  if (track_stats) { mi_stat_free(page, block); }
  #if (MI_DEBUG>0) && !MI_TRACK_ENABLED  && !MI_TSAN && !MI_GUARDED
  if (!mi_page_is_huge(page)) {   // huge page content may be already decommitted
    memset(block, MI_DEBUG_FREED, mi_page_block_size(page));
  }
  #endif
  if (track_stats) { mi_track_free_size(block, mi_page_usable_size_of(page, block)); } // faster than mi_usable_size as we already know the page and that p is unaligned

  // actual free: push on the local free list
  mi_block_set_next(page, block, page->local_free);
  page->local_free = block;
  if mi_unlikely(--page->used == 0) {
    // that was the last used block in the page
    _mi_page_retire(page);
  }
  else if mi_unlikely(check_full && mi_page_is_in_full(page)) {
    _mi_page_unfull(page);
  }
}

// Adjust a block that was allocated aligned, to the actual start of the block in the page.
// note: this can be called from `mi_free_generic_mt` where a non-owning thread accesses the
// `page_start` and `block_size` fields; however these are constant and the page won't be
// deallocated (as the block we are freeing keeps it alive) and thus safe to read concurrently.
mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p) { mi_assert_internal(page!=NULL && p!=NULL); size_t diff = (uint8_t*)p - page->page_start; size_t adjust; if mi_likely(page->block_size_shift != 0) { adjust = diff & (((size_t)1 << page->block_size_shift) - 1); } else { adjust = diff % mi_page_block_size(page); } return (mi_block_t*)((uintptr_t)p - adjust); } // forward declaration for a MI_GUARDED build #if MI_GUARDED static void mi_block_unguard(mi_page_t* page, mi_block_t* block, void* p); // forward declaration static inline void mi_block_check_unguard(mi_page_t* page, mi_block_t* block, void* p) { if (mi_block_ptr_is_guarded(block, p)) { mi_block_unguard(page, block, p); } } #else static inline void mi_block_check_unguard(mi_page_t* page, mi_block_t* block, void* p) { MI_UNUSED(page); MI_UNUSED(block); MI_UNUSED(p); } #endif // free a local pointer (page parameter comes first for better codegen) static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { MI_UNUSED(segment); mi_block_t* const block = (mi_page_has_aligned(page) ? 
_mi_page_ptr_unalign(page, p) : (mi_block_t*)p); mi_block_check_unguard(page, block, p); mi_free_block_local(page, block, true /* track stats */, true /* check for a full page */); } // free a pointer owned by another thread (page parameter comes first for better codegen) static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { mi_block_t* const block = _mi_page_ptr_unalign(page, p); // don't check `has_aligned` flag to avoid a race (issue #865) mi_block_check_unguard(page, block, p); mi_free_block_mt(page, segment, block); } // generic free (for runtime integration) void mi_decl_noinline _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { if (is_local) mi_free_generic_local(page,segment,p); else mi_free_generic_mt(page,segment,p); } // Get the segment data belonging to a pointer // This is just a single `and` in release mode but does further checks in debug mode // (and secure mode) to see if this was a valid pointer. 
static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg) { MI_UNUSED(msg); #if (MI_DEBUG>0) if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0 && !mi_option_is_enabled(mi_option_guarded_precise)) { _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p); return NULL; } #endif mi_segment_t* const segment = _mi_ptr_segment(p); if mi_unlikely(segment==NULL) return segment; #if (MI_DEBUG>0) if mi_unlikely(!mi_is_in_heap_region(p)) { #if (MI_INTPTR_SIZE == 8 && defined(__linux__)) if (((uintptr_t)p >> 40) != 0x7F) { // linux tends to align large blocks above 0x7F000000000 (issue #640) #else { #endif _mi_warning_message("%s: pointer might not point to a valid heap region: %p\n" "(this may still be a valid very large allocation (over 64MiB))\n", msg, p); if mi_likely(_mi_ptr_cookie(segment) == segment->cookie) { _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p); } } } #endif #if (MI_DEBUG>0 || MI_SECURE>=4) if mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie) { _mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", msg, p); return NULL; } #endif return segment; } // Free a block // Fast path written carefully to prevent register spilling on the stack void mi_free(void* p) mi_attr_noexcept { mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free"); if mi_unlikely(segment==NULL) return; const bool is_local = (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); mi_page_t* const page = _mi_segment_page_of(segment, p); if mi_likely(is_local) { // thread-local free? 
if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) // thread-local, aligned, and not a full page mi_block_t* const block = (mi_block_t*)p; mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */); } else { // page is full or contains (inner) aligned blocks; use generic path mi_free_generic_local(page, segment, p); } } else { // not thread-local; use generic path mi_free_generic_mt(page, segment, p); } } // return true if successful bool _mi_free_delayed_block(mi_block_t* block) { // get segment and page mi_assert_internal(block!=NULL); const mi_segment_t* const segment = _mi_ptr_segment(block); mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); mi_assert_internal(_mi_thread_id() == segment->thread_id); mi_page_t* const page = _mi_segment_page_of(segment, block); // Clear the no-delayed flag so delayed freeing is used again for this page. // This must be done before collecting the free lists on this page -- otherwise // some blocks may end up in the page `thread_free` list with no blocks in the // heap `thread_delayed_free` list which may cause the page to be never freed! 
// (it would only be freed if we happen to scan it in `mi_page_queue_find_free_ex`) if (!_mi_page_try_use_delayed_free(page, MI_USE_DELAYED_FREE, false /* dont overwrite never delayed */)) { return false; } // collect all other non-local frees (move from `thread_free` to `free`) to ensure up-to-date `used` count _mi_page_free_collect(page, false); // and free the block (possibly freeing the page as well since `used` is updated) mi_free_block_local(page, block, false /* stats have already been adjusted */, true /* check for a full page */); return true; } // ------------------------------------------------------ // Multi-threaded Free (`_mt`) // ------------------------------------------------------ // Push a block that is owned by another thread on its page-local thread free // list or it's heap delayed free list. Such blocks are later collected by // the owning thread in `_mi_free_delayed_block`. static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block_t* block ) { // Try to put the block on either the page-local thread free list, // or the heap delayed free list (if this is the first non-local free in that page) mi_thread_free_t tfreex; bool use_delayed; mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); do { use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE); if mi_unlikely(use_delayed) { // unlikely: this only happens on the first concurrent free in a page that is in the full list tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING); } else { // usual: directly add to page thread_free list mi_block_set_next(page, block, mi_tf_block(tfree)); tfreex = mi_tf_set_block(tfree,block); } } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); // If this was the first non-local free, we need to push it on the heap delayed free list instead if mi_unlikely(use_delayed) { // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`) mi_heap_t* 
const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page); mi_assert_internal(heap != NULL); if (heap != NULL) { // add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity) mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); do { mi_block_set_nextx(heap,block,dfree, heap->keys); } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); } // and reset the MI_DELAYED_FREEING flag tfree = mi_atomic_load_relaxed(&page->xthread_free); do { tfreex = tfree; mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING); tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); } } // Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block) { // first see if the segment was abandoned and if we can reclaim it into our thread if (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 && #if MI_HUGE_PAGE_ABANDON segment->page_kind != MI_PAGE_HUGE && #endif mi_atomic_load_relaxed(&segment->thread_id) == 0 && // segment is abandoned? mi_prim_get_default_heap() != (mi_heap_t*)&_mi_heap_empty) // and we did not already exit this thread (without this check, a fresh heap will be initalized (issue #944)) { // the segment is abandoned, try to reclaim it into our heap if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) { mi_assert_internal(_mi_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); mi_assert_internal(mi_heap_get_default()->tld->segments.subproc == segment->subproc); mi_free(block); // recursively free as now it will be a local free in our heap return; } } // The padding check may access the non-thread-owned page for the key values. 
// that is safe as these are constant and the page won't be freed (as the block is not freed yet). mi_check_padding(page, block); // adjust stats (after padding check and potentially recursive `mi_free` above) mi_stat_free(page, block); // stat_free may access the padding mi_track_free_size(block, mi_page_usable_size_of(page,block)); // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection _mi_padding_shrink(page, block, sizeof(mi_block_t)); if (segment->kind == MI_SEGMENT_HUGE) { #if MI_HUGE_PAGE_ABANDON // huge page segments are always abandoned and can be freed immediately _mi_segment_huge_page_free(segment, page, block); return; #else // huge pages are special as they occupy the entire segment // as these are large we reset the memory occupied by the page so it is available to other threads // (as the owning thread needs to actually free the memory later). _mi_segment_huge_page_reset(segment, page, block); #endif } else { #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading memset(block, MI_DEBUG_FREED, mi_usable_size(block)); #endif } // and finally free the actual block by pushing it on the owning heap // thread_delayed free list (or heap delayed free list) mi_free_block_delayed_mt(page,block); } // ------------------------------------------------------ // Usable size // ------------------------------------------------------ // Bytes available in a block static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_page_t* page, const void* p) mi_attr_noexcept { const mi_block_t* block = _mi_page_ptr_unalign(page, p); const size_t size = mi_page_usable_size_of(page, block); const ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)block; mi_assert_internal(adjust >= 0 && (size_t)adjust <= size); const size_t aligned_size = (size - adjust); #if MI_GUARDED if (mi_block_ptr_is_guarded(block, p)) { return aligned_size - _mi_os_page_size(); } #endif 
return aligned_size; } static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept { const mi_segment_t* const segment = mi_checked_ptr_segment(p, msg); if mi_unlikely(segment==NULL) return 0; const mi_page_t* const page = _mi_segment_page_of(segment, p); if mi_likely(!mi_page_has_aligned(page)) { const mi_block_t* block = (const mi_block_t*)p; return mi_page_usable_size_of(page, block); } else { // split out to separate routine for improved code generation return mi_page_usable_aligned_size_of(page, p); } } mi_decl_nodiscard size_t mi_usable_size(const void* p) mi_attr_noexcept { return _mi_usable_size(p, "mi_usable_size"); } // ------------------------------------------------------ // Free variants // ------------------------------------------------------ void mi_free_size(void* p, size_t size) mi_attr_noexcept { MI_UNUSED_RELEASE(size); mi_assert(p == NULL || size <= _mi_usable_size(p,"mi_free_size")); mi_free(p); } void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_attr_noexcept { MI_UNUSED_RELEASE(alignment); mi_assert(((uintptr_t)p % alignment) == 0); mi_free_size(p,size); } void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept { MI_UNUSED_RELEASE(alignment); mi_assert(((uintptr_t)p % alignment) == 0); mi_free(p); } // ------------------------------------------------------ // Check for double free in secure and debug mode // This is somewhat expensive so only enabled for secure mode 4 // ------------------------------------------------------ #if (MI_ENCODE_FREELIST && (MI_SECURE>=4 || MI_DEBUG!=0)) // linear check if the free list contains a specific element static bool mi_list_contains(const mi_page_t* page, const mi_block_t* list, const mi_block_t* elem) { while (list != NULL) { if (elem==list) return true; list = mi_block_next(page, list); } return false; } static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, const mi_block_t* block) { // The decoded value is in the same page 
(or NULL). // Walk the free lists to verify positively if it is already freed if (mi_list_contains(page, page->free, block) || mi_list_contains(page, page->local_free, block) || mi_list_contains(page, mi_page_thread_free(page), block)) { _mi_error_message(EAGAIN, "double free detected of block %p with size %zu\n", block, mi_page_block_size(page)); return true; } return false; } #define mi_track_page(page,access) { size_t psize; void* pstart = _mi_page_start(_mi_page_segment(page),page,&psize); mi_track_mem_##access( pstart, psize); } static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { bool is_double_free = false; mi_block_t* n = mi_block_nextx(page, block, page->keys); // pretend it is freed, and get the decoded first field if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 && // quick check: aligned pointer? (n==NULL || mi_is_in_same_page(block, n))) // quick check: in same page or NULL? { // Suspicious: decoded value a in block is in the same page (or NULL) -- maybe a double free? 
// (continue in separate function to improve code generation) is_double_free = mi_check_is_double_freex(page, block); } return is_double_free; } #else static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) { MI_UNUSED(page); MI_UNUSED(block); return false; } #endif // --------------------------------------------------------------------------- // Check for heap block overflow by setting up padding at the end of the block // --------------------------------------------------------------------------- #if MI_PADDING // && !MI_TRACK_ENABLED static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* block, size_t* delta, size_t* bsize) { *bsize = mi_page_usable_block_size(page); const mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + *bsize); mi_track_mem_defined(padding,sizeof(mi_padding_t)); *delta = padding->delta; uint32_t canary = padding->canary; uintptr_t keys[2]; keys[0] = page->keys[0]; keys[1] = page->keys[1]; bool ok = (mi_ptr_encode_canary(page,block,keys) == canary && *delta <= *bsize); mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); return ok; } // Return the exact usable size of a block. static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { size_t bsize; size_t delta; bool ok = mi_page_decode_padding(page, block, &delta, &bsize); mi_assert_internal(ok); mi_assert_internal(delta <= bsize); return (ok ? bsize - delta : 0); } // When a non-thread-local block is freed, it becomes part of the thread delayed free // list that is freed later by the owning heap. If the exact usable size is too small to // contain the pointer for the delayed list, then shrink the padding (by decreasing delta) // so it will later not trigger an overflow error in `mi_free_block`. 
void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { size_t bsize; size_t delta; bool ok = mi_page_decode_padding(page, block, &delta, &bsize); mi_assert_internal(ok); if (!ok || (bsize - delta) >= min_size) return; // usually already enough space mi_assert_internal(bsize >= min_size); if (bsize < min_size) return; // should never happen size_t new_delta = (bsize - min_size); mi_assert_internal(new_delta < bsize); mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize); mi_track_mem_defined(padding,sizeof(mi_padding_t)); padding->delta = (uint32_t)new_delta; mi_track_mem_noaccess(padding,sizeof(mi_padding_t)); } #else static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) { MI_UNUSED(block); return mi_page_usable_block_size(page); } void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) { MI_UNUSED(page); MI_UNUSED(block); MI_UNUSED(min_size); } #endif #if MI_PADDING && MI_PADDING_CHECK static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) { size_t bsize; size_t delta; bool ok = mi_page_decode_padding(page, block, &delta, &bsize); *size = *wrong = bsize; if (!ok) return false; mi_assert_internal(bsize >= delta); *size = bsize - delta; if (!mi_page_is_huge(page)) { uint8_t* fill = (uint8_t*)block + bsize - delta; const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? 
MI_MAX_ALIGN_SIZE : delta); // check at most the first N padding bytes mi_track_mem_defined(fill, maxpad); for (size_t i = 0; i < maxpad; i++) { if (fill[i] != MI_DEBUG_PADDING) { *wrong = bsize - delta + i; ok = false; break; } } mi_track_mem_noaccess(fill, maxpad); } return ok; } static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { size_t size; size_t wrong; if (!mi_verify_padding(page,block,&size,&wrong)) { _mi_error_message(EFAULT, "buffer overflow in heap block %p of size %zu: write after %zu bytes\n", block, size, wrong ); } } #else static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { MI_UNUSED(page); MI_UNUSED(block); } #endif // only maintain stats for smaller objects if requested #if (MI_STAT>0) static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { #if (MI_STAT < 2) MI_UNUSED(block); #endif mi_heap_t* const heap = mi_heap_get_default(); const size_t bsize = mi_page_usable_block_size(page); #if (MI_STAT>1) const size_t usize = mi_page_usable_size_of(page, block); mi_heap_stat_decrease(heap, malloc_requested, usize); #endif if (bsize <= MI_MEDIUM_OBJ_SIZE_MAX) { mi_heap_stat_decrease(heap, malloc_normal, bsize); #if (MI_STAT > 1) mi_heap_stat_decrease(heap, malloc_bins[_mi_bin(bsize)], 1); #endif } //else if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { // mi_heap_stat_decrease(heap, malloc_large, bsize); //} else { mi_heap_stat_decrease(heap, malloc_huge, bsize); } } #else static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { MI_UNUSED(page); MI_UNUSED(block); } #endif // Remove guard page when building with MI_GUARDED #if MI_GUARDED static void mi_block_unguard(mi_page_t* page, mi_block_t* block, void* p) { MI_UNUSED(p); mi_assert_internal(mi_block_ptr_is_guarded(block, p)); mi_assert_internal(mi_page_has_aligned(page)); mi_assert_internal((uint8_t*)p - (uint8_t*)block >= (ptrdiff_t)sizeof(mi_block_t)); mi_assert_internal(block->next == MI_BLOCK_TAG_GUARDED); const size_t 
bsize = mi_page_block_size(page); const size_t psize = _mi_os_page_size(); mi_assert_internal(bsize > psize); mi_assert_internal(_mi_page_segment(page)->allow_decommit); void* gpage = (uint8_t*)block + bsize - psize; mi_assert_internal(_mi_is_aligned(gpage, psize)); _mi_os_unprotect(gpage, psize); } #endif ================================================ FILE: third-party/mimalloc/src/heap.c ================================================ /*---------------------------------------------------------------------------- Copyright (c) 2018-2021, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/atomic.h" #include "mimalloc/prim.h" // mi_prim_get_default_heap #include // memset, memcpy #if defined(_MSC_VER) && (_MSC_VER < 1920) #pragma warning(disable:4204) // non-constant aggregate initializer #endif /* ----------------------------------------------------------- Helpers ----------------------------------------------------------- */ // return `true` if ok, `false` to break typedef bool (heap_page_visitor_fun)(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2); // Visit all pages in a heap; returns `false` if break was called. 
static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void* arg1, void* arg2)
{
  // note: a NULL heap or a heap without pages also yields `false` (0)
  if (heap==NULL || heap->page_count==0) return 0;

  // visit all pages
  #if MI_DEBUG>1
  size_t total = heap->page_count;
  size_t count = 0;
  #endif

  // walk every page queue of the heap (bins 0 up to and including MI_BIN_FULL)
  for (size_t i = 0; i <= MI_BIN_FULL; i++) {
    mi_page_queue_t* pq = &heap->pages[i];
    mi_page_t* page = pq->first;
    while(page != NULL) {
      mi_page_t* next = page->next; // save next in case the page gets removed from the queue
      mi_assert_internal(mi_page_heap(page) == heap);
      #if MI_DEBUG>1
      count++;
      #endif
      if (!fn(heap, pq, page, arg1, arg2)) return false;
      page = next; // and continue
    }
  }
  // the number of visited pages must match the page count recorded at entry
  mi_assert_internal(count == total);
  return true;
}


#if MI_DEBUG>=2
// Page visitor that asserts the page belongs to `heap` and to the heap's thread; always continues.
static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
  MI_UNUSED(arg1);
  MI_UNUSED(arg2);
  MI_UNUSED(pq);
  mi_assert_internal(mi_page_heap(page) == heap);
  mi_segment_t* segment = _mi_page_segment(page);
  mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == heap->thread_id);
  mi_assert_expensive(_mi_page_is_valid(page));
  return true;
}
#endif
#if MI_DEBUG>=3
// Runs the validity assertions over all pages of the heap; always returns `true`.
static bool mi_heap_is_valid(mi_heap_t* heap) {
  mi_assert_internal(heap!=NULL);
  mi_heap_visit_pages(heap, &mi_heap_page_is_valid, NULL, NULL);
  return true;
}
#endif




/* -----------------------------------------------------------
  "Collect" pages by migrating `local_free` and `thread_free`
  lists and freeing empty pages. This is done when a thread
  stops (and in that case abandons pages if there are still
  blocks alive)
----------------------------------------------------------- */

// Collection modes; the code below relies on the order MI_NORMAL < MI_FORCE < MI_ABANDON.
typedef enum mi_collect_e {
  MI_NORMAL,
  MI_FORCE,
  MI_ABANDON
} mi_collect_t;


// Page visitor used by `mi_heap_collect_ex`; `arg_collect` points to the `mi_collect_t` mode.
static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg_collect, void* arg2 ) {
  MI_UNUSED(arg2);
  MI_UNUSED(heap);
  mi_assert_internal(mi_heap_page_is_valid(heap, pq, page, NULL, NULL));
  mi_collect_t collect = *((mi_collect_t*)arg_collect);
  _mi_page_free_collect(page, collect >= MI_FORCE);
  if (collect == MI_FORCE) {
    // note: call before a potential `_mi_page_free` as the segment may be freed if this was the last used page in that segment.
    mi_segment_t* segment = _mi_page_segment(page);
    _mi_segment_collect(segment, true /* force? */);
  }
  if (mi_page_all_free(page)) {
    // no more used blocks, free the page.
    // note: this will free retired pages as well.
    _mi_page_free(page, pq, collect >= MI_FORCE);
  }
  else if (collect == MI_ABANDON) {
    // still used blocks but the thread is done; abandon the page
    _mi_page_abandon(page, pq);
  }
  return true; // don't break
}

// Page visitor that switches a page to MI_NEVER_DELAYED_FREE.
static bool mi_heap_page_never_delayed_free(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
  MI_UNUSED(arg1);
  MI_UNUSED(arg2);
  MI_UNUSED(heap);
  MI_UNUSED(pq);
  _mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false);
  return true; // don't break
}

// Collect the heap according to `collect`; does nothing for a NULL or uninitialized heap.
static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
{
  if (heap==NULL || !mi_heap_is_initialized(heap)) return;

  // `force` holds for both MI_FORCE and MI_ABANDON
  const bool force = (collect >= MI_FORCE);
  _mi_deferred_free(heap, force);

  // python/cpython#112532: we may be called from a thread that is not the owner of the heap
  const bool is_main_thread = (_mi_is_main_thread() && heap->thread_id == _mi_thread_id());

  // note: never reclaim on collect but leave it to threads that need storage to reclaim
  // (with NDEBUG only MI_FORCE qualifies; otherwise MI_ABANDON qualifies as well)
  const bool force_main =
    #ifdef NDEBUG
      collect == MI_FORCE
    #else
      collect >= MI_FORCE
    #endif
      && is_main_thread && mi_heap_is_backing(heap) && !heap->no_reclaim;

  if (force_main) {
    // the main thread is abandoned (end-of-program), try to reclaim all abandoned segments.
    // if all memory is freed by now, all segments should be freed.
    // note: this only collects in the current subprocess
    _mi_abandoned_reclaim_all(heap, &heap->tld->segments);
  }

  // if abandoning, mark all pages to no longer add to delayed_free
  if (collect == MI_ABANDON) {
    mi_heap_visit_pages(heap, &mi_heap_page_never_delayed_free, NULL, NULL);
  }

  // free all current thread delayed blocks.
  // (if abandoning, after this there are no more thread-delayed references into the pages.)
  _mi_heap_delayed_free_all(heap);

  // collect retired pages
  _mi_heap_collect_retired(heap, force);

  // collect all pages owned by this thread
  mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL);
  mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL );

  // collect abandoned segments (in particular, purge expired parts of segments in the abandoned segment list)
  // note: forced purge can be quite expensive if many threads are created/destroyed so we do not force on abandonment
  _mi_abandoned_collect(heap, collect == MI_FORCE /* force? */, &heap->tld->segments);

  // if forced, collect thread data cache on program-exit (or shared library unload)
  if (force && is_main_thread && mi_heap_is_backing(heap)) {
    _mi_thread_data_collect();  // collect thread data cache
  }

  // collect arenas (this is program wide so don't force purges on abandonment of threads)
  _mi_arenas_collect(collect == MI_FORCE /* force purge? */);

  // merge statistics (not when abandoning)
  if (collect <= MI_FORCE) {
    mi_stats_merge();
  }
}

// Collect in MI_ABANDON mode.
void _mi_heap_collect_abandon(mi_heap_t* heap) {
  mi_heap_collect_ex(heap, MI_ABANDON);
}

// Collect `heap`: MI_FORCE mode if `force` is set, MI_NORMAL otherwise.
void mi_heap_collect(mi_heap_t* heap, bool force) mi_attr_noexcept {
  mi_heap_collect_ex(heap, (force ? MI_FORCE : MI_NORMAL));
}

// Collect the default heap of the current thread.
void mi_collect(bool force) mi_attr_noexcept {
  mi_heap_collect(mi_prim_get_default_heap(), force);
}


/* -----------------------------------------------------------
  Heap new
----------------------------------------------------------- */

// Return the default heap, calling `mi_thread_init` first.
mi_heap_t* mi_heap_get_default(void) {
  mi_thread_init();
  return mi_prim_get_default_heap();
}

// Is `heap` the current default heap?
static bool mi_heap_is_default(const mi_heap_t* heap) {
  return (heap == mi_prim_get_default_heap());
}


// Return the backing heap recorded in the thread local data of the default heap.
mi_heap_t* mi_heap_get_backing(void) {
  mi_heap_t* heap = mi_heap_get_default();
  mi_assert_internal(heap!=NULL);
  mi_heap_t* bheap = heap->tld->heap_backing;
  mi_assert_internal(bheap!=NULL);
  mi_assert_internal(bheap->thread_id == _mi_thread_id());
  return bheap;
}

// Initialize `heap` from the empty heap template and link it into the heap list of `tld`.
void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag) {
  _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t));
  heap->tld = tld;
  heap->thread_id  = _mi_thread_id();
  heap->arena_id   = arena_id;
  heap->no_reclaim = noreclaim;
  heap->tag        = tag;
  // the backing heap initializes its own random state; other heaps split theirs off the backing heap
  if (heap == tld->heap_backing) {
    _mi_random_init(&heap->random);
  }
  else {
    _mi_random_split(&tld->heap_backing->random, &heap->random);
  }
  heap->cookie  = _mi_heap_random_next(heap) | 1;   // `| 1` makes the cookie non-zero
  heap->keys[0] = _mi_heap_random_next(heap);
  heap->keys[1] = _mi_heap_random_next(heap);
  _mi_heap_guarded_init(heap);
  // push on the thread local heaps list
  heap->next = heap->tld->heaps;
  heap->tld->heaps = heap;
}

// Allocate a new heap from the backing heap and initialize it; returns NULL if the allocation fails.
// `allow_destroy` is passed to `_mi_heap_init` as its `noreclaim` argument.
mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id) {
  mi_heap_t* bheap = mi_heap_get_backing();
  mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t);  // todo: OS allocate in secure mode?
  if (heap == NULL) return NULL;
  mi_assert(heap_tag >= 0 && heap_tag < 256);
  _mi_heap_init(heap, bheap->tld, arena_id, allow_destroy /* no reclaim? */, (uint8_t)heap_tag /* heap tag */);
  return heap;
}

// New heap with the default tag for the given arena id, created without `allow_destroy`.
mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) {
  return mi_heap_new_ex(0 /* default heap tag */, false /* don't allow `mi_heap_destroy` */, arena_id);
}

// New heap with the default tag and no specific arena.
mi_decl_nodiscard mi_heap_t* mi_heap_new(void) {
  // don't reclaim abandoned memory or otherwise destroy is unsafe
  return mi_heap_new_ex(0 /* default heap tag */, true /* no reclaim */, _mi_arena_id_none());
}

// Is memory with the given `memid` suitable for the arena id of this heap?
bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid) {
  return _mi_arena_memid_is_suitable(memid, heap->arena_id);
}

// Next value from the heap's random state.
uintptr_t _mi_heap_random_next(mi_heap_t* heap) {
  return _mi_random_next(&heap->random);
}

// zero out the page queues
// (also clears the delayed free list and the page count)
static void mi_heap_reset_pages(mi_heap_t* heap) {
  mi_assert_internal(heap != NULL);
  mi_assert_internal(mi_heap_is_initialized(heap));
  // TODO: copy full empty heap instead?
  memset(&heap->pages_free_direct, 0, sizeof(heap->pages_free_direct));
  _mi_memcpy_aligned(&heap->pages, &_mi_heap_empty.pages, sizeof(heap->pages));
  heap->thread_delayed_free = NULL;
  heap->page_count = 0;
}

// called from `mi_heap_destroy` and `mi_heap_delete` to free the internal heap resources.
static void mi_heap_free(mi_heap_t* heap) { mi_assert(heap != NULL); mi_assert_internal(mi_heap_is_initialized(heap)); if (heap==NULL || !mi_heap_is_initialized(heap)) return; if (mi_heap_is_backing(heap)) return; // dont free the backing heap // reset default if (mi_heap_is_default(heap)) { _mi_heap_set_default_direct(heap->tld->heap_backing); } // remove ourselves from the thread local heaps list // linear search but we expect the number of heaps to be relatively small mi_heap_t* prev = NULL; mi_heap_t* curr = heap->tld->heaps; while (curr != heap && curr != NULL) { prev = curr; curr = curr->next; } mi_assert_internal(curr == heap); if (curr == heap) { if (prev != NULL) { prev->next = heap->next; } else { heap->tld->heaps = heap->next; } } mi_assert_internal(heap->tld->heaps != NULL); // and free the used memory mi_free(heap); } // return a heap on the same thread as `heap` specialized for the specified tag (if it exists) mi_heap_t* _mi_heap_by_tag(mi_heap_t* heap, uint8_t tag) { if (heap->tag == tag) { return heap; } for (mi_heap_t *curr = heap->tld->heaps; curr != NULL; curr = curr->next) { if (curr->tag == tag) { return curr; } } return NULL; } /* ----------------------------------------------------------- Heap destroy ----------------------------------------------------------- */ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) { MI_UNUSED(arg1); MI_UNUSED(arg2); MI_UNUSED(heap); MI_UNUSED(pq); // ensure no more thread_delayed_free will be added _mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false); // stats const size_t bsize = mi_page_block_size(page); if (bsize > MI_MEDIUM_OBJ_SIZE_MAX) { //if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { // mi_heap_stat_decrease(heap, malloc_large, bsize); //} //else { mi_heap_stat_decrease(heap, malloc_huge, bsize); } } #if (MI_STAT) _mi_page_free_collect(page, false); // update used count const size_t inuse = page->used; if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { 
mi_heap_stat_decrease(heap, malloc_normal, bsize * inuse); #if (MI_STAT>1) mi_heap_stat_decrease(heap, malloc_bins[_mi_bin(bsize)], inuse); #endif } mi_heap_stat_decrease(heap, malloc_requested, bsize * inuse); // todo: off for aligned blocks... #endif /// pretend it is all free now mi_assert_internal(mi_page_thread_free(page) == NULL); page->used = 0; // and free the page // mi_page_free(page,false); page->next = NULL; page->prev = NULL; _mi_segment_page_free(page,false /* no force? */, &heap->tld->segments); return true; // keep going } void _mi_heap_destroy_pages(mi_heap_t* heap) { mi_heap_visit_pages(heap, &_mi_heap_page_destroy, NULL, NULL); mi_heap_reset_pages(heap); } #if MI_TRACK_HEAP_DESTROY static bool mi_cdecl mi_heap_track_block_free(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg) { MI_UNUSED(heap); MI_UNUSED(area); MI_UNUSED(arg); MI_UNUSED(block_size); mi_track_free_size(block,mi_usable_size(block)); return true; } #endif void mi_heap_destroy(mi_heap_t* heap) { mi_assert(heap != NULL); mi_assert(mi_heap_is_initialized(heap)); mi_assert(heap->no_reclaim); mi_assert_expensive(mi_heap_is_valid(heap)); if (heap==NULL || !mi_heap_is_initialized(heap)) return; #if MI_GUARDED // _mi_warning_message("'mi_heap_destroy' called but MI_GUARDED is enabled -- using `mi_heap_delete` instead (heap at %p)\n", heap); mi_heap_delete(heap); return; #else if (!heap->no_reclaim) { _mi_warning_message("'mi_heap_destroy' called but ignored as the heap was not created with 'allow_destroy' (heap at %p)\n", heap); // don't free in case it may contain reclaimed pages mi_heap_delete(heap); } else { // track all blocks as freed #if MI_TRACK_HEAP_DESTROY mi_heap_visit_blocks(heap, true, mi_heap_track_block_free, NULL); #endif // free all pages _mi_heap_destroy_pages(heap); mi_heap_free(heap); } #endif } // forcefully destroy all heaps in the current thread void _mi_heap_unsafe_destroy_all(mi_heap_t* heap) { mi_assert_internal(heap != 
NULL); if (heap == NULL) return; mi_heap_t* curr = heap->tld->heaps; while (curr != NULL) { mi_heap_t* next = curr->next; if (curr->no_reclaim) { mi_heap_destroy(curr); } else { _mi_heap_destroy_pages(curr); } curr = next; } } /* ----------------------------------------------------------- Safe Heap delete ----------------------------------------------------------- */ // Transfer the pages from one heap to the other static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) { mi_assert_internal(heap!=NULL); if (from==NULL || from->page_count == 0) return; // reduce the size of the delayed frees _mi_heap_delayed_free_partial(from); // transfer all pages by appending the queues; this will set a new heap field // so threads may do delayed frees in either heap for a while. // note: appending waits for each page to not be in the `MI_DELAYED_FREEING` state // so after this only the new heap will get delayed frees for (size_t i = 0; i <= MI_BIN_FULL; i++) { mi_page_queue_t* pq = &heap->pages[i]; mi_page_queue_t* append = &from->pages[i]; size_t pcount = _mi_page_queue_append(heap, pq, append); heap->page_count += pcount; from->page_count -= pcount; } mi_assert_internal(from->page_count == 0); // and do outstanding delayed frees in the `from` heap // note: be careful here as the `heap` field in all those pages no longer point to `from`, // turns out to be ok as `_mi_heap_delayed_free` only visits the list and calls a // the regular `_mi_free_delayed_block` which is safe. _mi_heap_delayed_free_all(from); #if !defined(_MSC_VER) || (_MSC_VER > 1900) // somehow the following line gives an error in VS2015, issue #353 mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_block_t,&from->thread_delayed_free) == NULL); #endif // and reset the `from` heap mi_heap_reset_pages(from); } // are two heaps compatible with respect to heap-tag, exclusive arena etc. 
static bool mi_heaps_are_compatible(mi_heap_t* heap1, mi_heap_t* heap2) {
  if (heap1->tag != heap2->tag) return false;      // must store the same kind of objects
  return (heap1->arena_id == heap2->arena_id);     // and have the same arena preference
}

// Safely delete a heap without freeing any blocks that are still allocated in it:
// the remaining pages are either moved into the thread's backing heap (when the
// heaps are compatible) or abandoned.
void mi_heap_delete(mi_heap_t* heap) {
  mi_assert(heap != NULL);
  mi_assert(mi_heap_is_initialized(heap));
  mi_assert_expensive(mi_heap_is_valid(heap));
  if (heap == NULL || !mi_heap_is_initialized(heap)) return;

  mi_heap_t* const backing = heap->tld->heap_backing;
  const bool can_absorb = (backing != heap && mi_heaps_are_compatible(backing, heap));
  if (can_absorb) {
    // transfer the still used pages to the backing heap
    mi_heap_absorb(backing, heap);
  }
  else {
    // `heap` is the backing heap itself or is incompatible with it: abandon its pages
    _mi_heap_collect_abandon(heap);
  }
  mi_assert_internal(heap->page_count == 0);
  mi_heap_free(heap);
}

// Make `heap` the default heap of the current thread.
// Returns the previous default heap, or NULL when `heap` is NULL or uninitialized.
mi_heap_t* mi_heap_set_default(mi_heap_t* heap) {
  mi_assert(heap != NULL);
  mi_assert(mi_heap_is_initialized(heap));
  if (heap == NULL) return NULL;
  if (!mi_heap_is_initialized(heap)) return NULL;
  mi_assert_expensive(mi_heap_is_valid(heap));
  mi_heap_t* const previous = mi_prim_get_default_heap();
  _mi_heap_set_default_direct(heap);
  return previous;
}

/* -----------------------------------------------------------
  Analysis
----------------------------------------------------------- */

// static since it is not thread safe to access heaps from other threads.
static mi_heap_t* mi_heap_of_block(const void* p) { if (p == NULL) return NULL; mi_segment_t* segment = _mi_ptr_segment(p); bool valid = (_mi_ptr_cookie(segment) == segment->cookie); mi_assert_internal(valid); if mi_unlikely(!valid) return NULL; return mi_page_heap(_mi_segment_page_of(segment,p)); } bool mi_heap_contains_block(mi_heap_t* heap, const void* p) { mi_assert(heap != NULL); if (heap==NULL || !mi_heap_is_initialized(heap)) return false; return (heap == mi_heap_of_block(p)); } static bool mi_heap_page_check_owned(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* p, void* vfound) { MI_UNUSED(heap); MI_UNUSED(pq); bool* found = (bool*)vfound; void* start = mi_page_start(page); void* end = (uint8_t*)start + (page->capacity * mi_page_block_size(page)); *found = (p >= start && p < end); return (!*found); // continue if not found } bool mi_heap_check_owned(mi_heap_t* heap, const void* p) { mi_assert(heap != NULL); if (heap==NULL || !mi_heap_is_initialized(heap)) return false; if (((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) return false; // only aligned pointers bool found = false; mi_heap_visit_pages(heap, &mi_heap_page_check_owned, (void*)p, &found); return found; } bool mi_check_owned(const void* p) { return mi_heap_check_owned(mi_prim_get_default_heap(), p); } /* ----------------------------------------------------------- Visit all heap blocks and areas Todo: enable visiting abandoned pages, and enable visiting all blocks of all heaps across threads ----------------------------------------------------------- */ void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page) { const size_t bsize = mi_page_block_size(page); const size_t ubsize = mi_page_usable_block_size(page); area->reserved = page->reserved * bsize; area->committed = page->capacity * bsize; area->blocks = mi_page_start(page); area->used = page->used; // number of blocks in use (#553) area->block_size = ubsize; area->full_block_size = bsize; area->heap_tag = page->heap_tag; } static 
void mi_get_fast_divisor(size_t divisor, uint64_t* magic, size_t* shift) { mi_assert_internal(divisor > 0 && divisor <= UINT32_MAX); *shift = MI_SIZE_BITS - mi_clz(divisor - 1); *magic = ((((uint64_t)1 << 32) * (((uint64_t)1 << *shift) - divisor)) / divisor + 1); } static size_t mi_fast_divide(size_t n, uint64_t magic, size_t shift) { mi_assert_internal(n <= UINT32_MAX); const uint64_t hi = ((uint64_t)n * magic) >> 32; return (size_t)((hi + n) >> shift); } bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg) { mi_assert(area != NULL); if (area==NULL) return true; mi_assert(page != NULL); if (page == NULL) return true; _mi_page_free_collect(page,true); // collect both thread_delayed and local_free mi_assert_internal(page->local_free == NULL); if (page->used == 0) return true; size_t psize; uint8_t* const pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize); mi_heap_t* const heap = mi_page_heap(page); const size_t bsize = mi_page_block_size(page); const size_t ubsize = mi_page_usable_block_size(page); // without padding // optimize page with one block if (page->capacity == 1) { mi_assert_internal(page->used == 1 && page->free == NULL); return visitor(mi_page_heap(page), area, pstart, ubsize, arg); } mi_assert(bsize <= UINT32_MAX); // optimize full pages if (page->used == page->capacity) { uint8_t* block = pstart; for (size_t i = 0; i < page->capacity; i++) { if (!visitor(heap, area, block, ubsize, arg)) return false; block += bsize; } return true; } // create a bitmap of free blocks. 
#define MI_MAX_BLOCKS (MI_SMALL_PAGE_SIZE / sizeof(void*)) uintptr_t free_map[MI_MAX_BLOCKS / MI_INTPTR_BITS]; const uintptr_t bmapsize = _mi_divide_up(page->capacity, MI_INTPTR_BITS); memset(free_map, 0, bmapsize * sizeof(intptr_t)); if (page->capacity % MI_INTPTR_BITS != 0) { // mark left-over bits at the end as free size_t shift = (page->capacity % MI_INTPTR_BITS); uintptr_t mask = (UINTPTR_MAX << shift); free_map[bmapsize - 1] = mask; } // fast repeated division by the block size uint64_t magic; size_t shift; mi_get_fast_divisor(bsize, &magic, &shift); #if MI_DEBUG>1 size_t free_count = 0; #endif for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page, block)) { #if MI_DEBUG>1 free_count++; #endif mi_assert_internal((uint8_t*)block >= pstart && (uint8_t*)block < (pstart + psize)); size_t offset = (uint8_t*)block - pstart; mi_assert_internal(offset % bsize == 0); mi_assert_internal(offset <= UINT32_MAX); size_t blockidx = mi_fast_divide(offset, magic, shift); mi_assert_internal(blockidx == offset / bsize); mi_assert_internal(blockidx < MI_MAX_BLOCKS); size_t bitidx = (blockidx / MI_INTPTR_BITS); size_t bit = blockidx - (bitidx * MI_INTPTR_BITS); free_map[bitidx] |= ((uintptr_t)1 << bit); } mi_assert_internal(page->capacity == (free_count + page->used)); // walk through all blocks skipping the free ones #if MI_DEBUG>1 size_t used_count = 0; #endif uint8_t* block = pstart; for (size_t i = 0; i < bmapsize; i++) { if (free_map[i] == 0) { // every block is in use for (size_t j = 0; j < MI_INTPTR_BITS; j++) { #if MI_DEBUG>1 used_count++; #endif if (!visitor(heap, area, block, ubsize, arg)) return false; block += bsize; } } else { // visit the used blocks in the mask uintptr_t m = ~free_map[i]; while (m != 0) { #if MI_DEBUG>1 used_count++; #endif size_t bitidx = mi_ctz(m); if (!visitor(heap, area, block + (bitidx * bsize), ubsize, arg)) return false; m &= m - 1; // clear least significant bit } block += bsize * MI_INTPTR_BITS; } } 
mi_assert_internal(page->used == used_count); return true; } // Separate struct to keep `mi_page_t` out of the public interface typedef struct mi_heap_area_ex_s { mi_heap_area_t area; mi_page_t* page; } mi_heap_area_ex_t; typedef bool (mi_heap_area_visit_fun)(const mi_heap_t* heap, const mi_heap_area_ex_t* area, void* arg); static bool mi_heap_visit_areas_page(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* vfun, void* arg) { MI_UNUSED(heap); MI_UNUSED(pq); mi_heap_area_visit_fun* fun = (mi_heap_area_visit_fun*)vfun; mi_heap_area_ex_t xarea; xarea.page = page; _mi_heap_area_init(&xarea.area, page); return fun(heap, &xarea, arg); } // Visit all heap pages as areas static bool mi_heap_visit_areas(const mi_heap_t* heap, mi_heap_area_visit_fun* visitor, void* arg) { if (visitor == NULL) return false; return mi_heap_visit_pages((mi_heap_t*)heap, &mi_heap_visit_areas_page, (void*)(visitor), arg); // note: function pointer to void* :-{ } // Just to pass arguments typedef struct mi_visit_blocks_args_s { bool visit_blocks; mi_block_visit_fun* visitor; void* arg; } mi_visit_blocks_args_t; static bool mi_heap_area_visitor(const mi_heap_t* heap, const mi_heap_area_ex_t* xarea, void* arg) { mi_visit_blocks_args_t* args = (mi_visit_blocks_args_t*)arg; if (!args->visitor(heap, &xarea->area, NULL, xarea->area.block_size, args->arg)) return false; if (args->visit_blocks) { return _mi_heap_area_visit_blocks(&xarea->area, xarea->page, args->visitor, args->arg); } else { return true; } } // Visit all blocks in a heap bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { mi_visit_blocks_args_t args = { visit_blocks, visitor, arg }; return mi_heap_visit_areas(heap, &mi_heap_area_visitor, &args); } ================================================ FILE: third-party/mimalloc/src/init.c ================================================ /* ---------------------------------------------------------------------------- Copyright 
(c) 2018-2022, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/prim.h" #include // memcpy, memset #include // atexit // Empty page used to initialize the small free pages array const mi_page_t _mi_page_empty = { 0, false, false, false, false, 0, // capacity 0, // reserved capacity { 0 }, // flags false, // is_zero 0, // retire_expire NULL, // free NULL, // local_free 0, // used 0, // block size shift 0, // heap tag 0, // block_size NULL, // page_start #if (MI_PADDING || MI_ENCODE_FREELIST) { 0, 0 }, #endif MI_ATOMIC_VAR_INIT(0), // xthread_free MI_ATOMIC_VAR_INIT(0), // xheap NULL, NULL , { 0 } // padding }; #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty) #if (MI_SMALL_WSIZE_MAX==128) #if (MI_PADDING>0) && (MI_INTPTR_SIZE >= 8) #define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() } #elif (MI_PADDING>0) #define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() } #else #define MI_SMALL_PAGES_EMPTY { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY() } #endif #else #error "define right initialization sizes corresponding to MI_SMALL_WSIZE_MAX" #endif // Empty page queues for every bin #define QNULL(sz) { NULL, NULL, (sz)*sizeof(uintptr_t) } #define MI_PAGE_QUEUES_EMPTY \ { QNULL(1), \ QNULL( 1), QNULL( 2), QNULL( 3), QNULL( 4), QNULL( 5), QNULL( 6), QNULL( 7), QNULL( 8), /* 8 */ \ QNULL( 10), QNULL( 12), QNULL( 14), QNULL( 16), QNULL( 20), QNULL( 24), QNULL( 28), QNULL( 32), /* 16 */ \ QNULL( 40), QNULL( 48), QNULL( 56), QNULL( 64), QNULL( 80), QNULL( 96), QNULL( 112), QNULL( 128), /* 24 */ \ QNULL( 160), QNULL( 192), QNULL( 224), QNULL( 256), QNULL( 320), QNULL( 
384), QNULL( 448), QNULL( 512), /* 32 */ \ QNULL( 640), QNULL( 768), QNULL( 896), QNULL( 1024), QNULL( 1280), QNULL( 1536), QNULL( 1792), QNULL( 2048), /* 40 */ \ QNULL( 2560), QNULL( 3072), QNULL( 3584), QNULL( 4096), QNULL( 5120), QNULL( 6144), QNULL( 7168), QNULL( 8192), /* 48 */ \ QNULL( 10240), QNULL( 12288), QNULL( 14336), QNULL( 16384), QNULL( 20480), QNULL( 24576), QNULL( 28672), QNULL( 32768), /* 56 */ \ QNULL( 40960), QNULL( 49152), QNULL( 57344), QNULL( 65536), QNULL( 81920), QNULL( 98304), QNULL(114688), QNULL(131072), /* 64 */ \ QNULL(163840), QNULL(196608), QNULL(229376), QNULL(262144), QNULL(327680), QNULL(393216), QNULL(458752), QNULL(524288), /* 72 */ \ QNULL(MI_MEDIUM_OBJ_WSIZE_MAX + 1 /* 655360, Huge queue */), \ QNULL(MI_MEDIUM_OBJ_WSIZE_MAX + 2) /* Full queue */ } #define MI_STAT_COUNT_NULL() {0,0,0} // Empty statistics #define MI_STATS_NULL \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ { 0 }, { 0 }, { 0 }, { 0 }, \ { 0 }, { 0 }, { 0 }, { 0 }, \ \ { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, \ MI_INIT4(MI_STAT_COUNT_NULL), \ { 0 }, { 0 }, { 0 }, { 0 }, \ \ { MI_INIT4(MI_STAT_COUNT_NULL) }, \ { { 0 }, { 0 }, { 0 }, { 0 } }, \ \ { MI_INIT74(MI_STAT_COUNT_NULL) }, \ { MI_INIT74(MI_STAT_COUNT_NULL) } // Empty slice span queues for every bin #define SQNULL(sz) { NULL, NULL, sz } #define MI_SEGMENT_SPAN_QUEUES_EMPTY \ { SQNULL(1), \ SQNULL( 1), SQNULL( 2), SQNULL( 3), SQNULL( 4), SQNULL( 5), SQNULL( 6), SQNULL( 7), SQNULL( 10), /* 8 */ \ SQNULL( 12), SQNULL( 14), SQNULL( 16), SQNULL( 20), SQNULL( 24), SQNULL( 28), SQNULL( 32), SQNULL( 40), /* 16 */ \ SQNULL( 48), SQNULL( 56), SQNULL( 64), SQNULL( 80), SQNULL( 96), SQNULL( 112), SQNULL( 128), SQNULL( 160), /* 24 */ \ SQNULL( 192), SQNULL( 224), SQNULL( 256), SQNULL( 320), SQNULL( 384), SQNULL( 448), SQNULL( 
512), SQNULL( 640), /* 32 */ \ SQNULL( 768), SQNULL( 896), SQNULL( 1024) /* 35 */ } // -------------------------------------------------------- // Statically allocate an empty heap as the initial // thread local value for the default heap, // and statically allocate the backing heap for the main // thread so it can function without doing any allocation // itself (as accessing a thread local for the first time // may lead to allocation itself on some platforms) // -------------------------------------------------------- mi_decl_hidden mi_decl_cache_align const mi_heap_t _mi_heap_empty = { NULL, MI_ATOMIC_VAR_INIT(NULL), 0, // tid 0, // cookie 0, // arena id { 0, 0 }, // keys { {0}, {0}, 0, true }, // random 0, // page count MI_BIN_FULL, 0, // page retired min/max 0, 0, // generic count NULL, // next false, // can reclaim 0, // tag #if MI_GUARDED 0, 0, 0, 0, 1, // count is 1 so we never write to it (see `internal.h:mi_heap_malloc_use_guarded`) #endif MI_SMALL_PAGES_EMPTY, MI_PAGE_QUEUES_EMPTY }; static mi_decl_cache_align mi_subproc_t mi_subproc_default; #define tld_empty_stats ((mi_stats_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,stats))) mi_decl_cache_align static const mi_tld_t tld_empty = { 0, false, NULL, NULL, { MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, 0, &mi_subproc_default, tld_empty_stats }, // segments { MI_STAT_VERSION, MI_STATS_NULL } // stats }; mi_threadid_t _mi_thread_id(void) mi_attr_noexcept { return _mi_prim_thread_id(); } // the thread-local default heap for allocation mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; extern mi_decl_hidden mi_heap_t _mi_heap_main; static mi_decl_cache_align mi_tld_t tld_main = { 0, false, &_mi_heap_main, & _mi_heap_main, { MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, 0, &mi_subproc_default, &tld_main.stats }, // segments { MI_STAT_VERSION, MI_STATS_NULL } // stats }; mi_decl_cache_align mi_heap_t _mi_heap_main = { &tld_main, MI_ATOMIC_VAR_INIT(NULL), 0, // thread id 0, // initial cookie 0, 
// arena id { 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) { {0x846ca68b}, {0}, 0, true }, // random 0, // page count MI_BIN_FULL, 0, // page retired min/max 0, 0, // generic count NULL, // next heap false, // can reclaim 0, // tag #if MI_GUARDED 0, 0, 0, 0, 0, #endif MI_SMALL_PAGES_EMPTY, MI_PAGE_QUEUES_EMPTY }; bool _mi_process_is_initialized = false; // set to `true` in `mi_process_init`. mi_stats_t _mi_stats_main = { MI_STAT_VERSION, MI_STATS_NULL }; #if MI_GUARDED mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t sample_rate, size_t seed) { heap->guarded_sample_seed = seed; if (heap->guarded_sample_seed == 0) { heap->guarded_sample_seed = _mi_heap_random_next(heap); } heap->guarded_sample_rate = sample_rate; if (heap->guarded_sample_rate >= 1) { heap->guarded_sample_seed = heap->guarded_sample_seed % heap->guarded_sample_rate; } heap->guarded_sample_count = heap->guarded_sample_seed; // count down samples } mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, size_t max) { heap->guarded_size_min = min; heap->guarded_size_max = (min > max ? 
min : max); } void _mi_heap_guarded_init(mi_heap_t* heap) { mi_heap_guarded_set_sample_rate(heap, (size_t)mi_option_get_clamp(mi_option_guarded_sample_rate, 0, LONG_MAX), (size_t)mi_option_get(mi_option_guarded_sample_seed)); mi_heap_guarded_set_size_bound(heap, (size_t)mi_option_get_clamp(mi_option_guarded_min, 0, LONG_MAX), (size_t)mi_option_get_clamp(mi_option_guarded_max, 0, LONG_MAX) ); } #else mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t sample_rate, size_t seed) { MI_UNUSED(heap); MI_UNUSED(sample_rate); MI_UNUSED(seed); } mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, size_t max) { MI_UNUSED(heap); MI_UNUSED(min); MI_UNUSED(max); } void _mi_heap_guarded_init(mi_heap_t* heap) { MI_UNUSED(heap); } #endif static void mi_heap_main_init(void) { if (_mi_heap_main.cookie == 0) { _mi_heap_main.thread_id = _mi_thread_id(); _mi_heap_main.cookie = 1; #if defined(_WIN32) && !defined(MI_SHARED_LIB) _mi_random_init_weak(&_mi_heap_main.random); // prevent allocation failure during bcrypt dll initialization with static linking #else _mi_random_init(&_mi_heap_main.random); #endif _mi_heap_main.cookie = _mi_heap_random_next(&_mi_heap_main); _mi_heap_main.keys[0] = _mi_heap_random_next(&_mi_heap_main); _mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main); mi_lock_init(&mi_subproc_default.abandoned_os_lock); mi_lock_init(&mi_subproc_default.abandoned_os_visit_lock); _mi_heap_guarded_init(&_mi_heap_main); } } mi_heap_t* _mi_heap_main_get(void) { mi_heap_main_init(); return &_mi_heap_main; } /* ----------------------------------------------------------- Sub process ----------------------------------------------------------- */ mi_subproc_id_t mi_subproc_main(void) { return NULL; } mi_subproc_id_t mi_subproc_new(void) { mi_memid_t memid = _mi_memid_none(); mi_subproc_t* subproc = (mi_subproc_t*)_mi_arena_meta_zalloc(sizeof(mi_subproc_t), &memid); if (subproc == NULL) return NULL; subproc->memid = memid; 
subproc->abandoned_os_list = NULL; mi_lock_init(&subproc->abandoned_os_lock); mi_lock_init(&subproc->abandoned_os_visit_lock); return subproc; } mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id) { return (subproc_id == NULL ? &mi_subproc_default : (mi_subproc_t*)subproc_id); } void mi_subproc_delete(mi_subproc_id_t subproc_id) { if (subproc_id == NULL) return; mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id); // check if there are no abandoned segments still.. bool safe_to_delete = false; mi_lock(&subproc->abandoned_os_lock) { if (subproc->abandoned_os_list == NULL) { safe_to_delete = true; } } if (!safe_to_delete) return; // safe to release // todo: should we refcount subprocesses? mi_lock_done(&subproc->abandoned_os_lock); mi_lock_done(&subproc->abandoned_os_visit_lock); _mi_arena_meta_free(subproc, subproc->memid, sizeof(mi_subproc_t)); } void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { mi_heap_t* heap = mi_heap_get_default(); if (heap == NULL) return; mi_assert(heap->tld->segments.subproc == &mi_subproc_default); if (heap->tld->segments.subproc != &mi_subproc_default) return; heap->tld->segments.subproc = _mi_subproc_from_id(subproc_id); } /* ----------------------------------------------------------- Initialization and freeing of the thread local heaps ----------------------------------------------------------- */ // note: in x64 in release build `sizeof(mi_thread_data_t)` is under 4KiB (= OS page size). typedef struct mi_thread_data_s { mi_heap_t heap; // must come first due to cast in `_mi_heap_done` mi_tld_t tld; mi_memid_t memid; // must come last due to zero'ing } mi_thread_data_t; // Thread meta-data is allocated directly from the OS. For // some programs that do not use thread pools and allocate and // destroy many OS threads, this may causes too much overhead // per thread so we maintain a small cache of recently freed metadata. 
#define TD_CACHE_SIZE (32) static _Atomic(mi_thread_data_t*) td_cache[TD_CACHE_SIZE]; static mi_thread_data_t* mi_thread_data_zalloc(void) { // try to find thread metadata in the cache bool is_zero = false; mi_thread_data_t* td = NULL; for (int i = 0; i < TD_CACHE_SIZE; i++) { td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]); if (td != NULL) { // found cached allocation, try use it td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL); if (td != NULL) { break; } } } // if that fails, allocate as meta data if (td == NULL) { mi_memid_t memid; td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid); if (td == NULL) { // if this fails, try once more. (issue #257) td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid); if (td == NULL) { // really out of memory _mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t)); } } if (td != NULL) { td->memid = memid; is_zero = memid.initially_zero; } } if (td != NULL && !is_zero) { _mi_memzero_aligned(td, offsetof(mi_thread_data_t,memid)); } return td; } static void mi_thread_data_free( mi_thread_data_t* tdfree ) { // try to add the thread metadata to the cache for (int i = 0; i < TD_CACHE_SIZE; i++) { mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]); if (td == NULL) { mi_thread_data_t* expected = NULL; if (mi_atomic_cas_ptr_weak_acq_rel(mi_thread_data_t, &td_cache[i], &expected, tdfree)) { return; } } } // if that fails, just free it directly _mi_os_free(tdfree, sizeof(mi_thread_data_t), tdfree->memid); } void _mi_thread_data_collect(void) { // free all thread metadata from the cache for (int i = 0; i < TD_CACHE_SIZE; i++) { mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]); if (td != NULL) { td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL); if (td != NULL) { _mi_os_free(td, sizeof(mi_thread_data_t), td->memid); } } } 
} // Initialize the thread local default heap, called from `mi_thread_init` static bool _mi_thread_heap_init(void) { if (mi_heap_is_initialized(mi_prim_get_default_heap())) return true; if (_mi_is_main_thread()) { // mi_assert_internal(_mi_heap_main.thread_id != 0); // can happen on freeBSD where alloc is called before any initialization // the main heap is statically allocated mi_heap_main_init(); _mi_heap_set_default_direct(&_mi_heap_main); //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_prim_get_default_heap()); } else { // use `_mi_os_alloc` to allocate directly from the OS mi_thread_data_t* td = mi_thread_data_zalloc(); if (td == NULL) return false; mi_tld_t* tld = &td->tld; mi_heap_t* heap = &td->heap; _mi_tld_init(tld, heap); // must be before `_mi_heap_init` _mi_heap_init(heap, tld, _mi_arena_id_none(), false /* can reclaim */, 0 /* default tag */); _mi_heap_set_default_direct(heap); } return false; } // initialize thread local data void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { _mi_memcpy_aligned(tld, &tld_empty, sizeof(mi_tld_t)); tld->heap_backing = bheap; tld->heaps = NULL; tld->segments.subproc = &mi_subproc_default; tld->segments.stats = &tld->stats; } // Free the thread local default heap (called from `mi_thread_done`) static bool _mi_thread_heap_done(mi_heap_t* heap) { if (!mi_heap_is_initialized(heap)) return true; // reset default heap _mi_heap_set_default_direct(_mi_is_main_thread() ? 
&_mi_heap_main : (mi_heap_t*)&_mi_heap_empty); // switch to backing heap heap = heap->tld->heap_backing; if (!mi_heap_is_initialized(heap)) return false; // delete all non-backing heaps in this thread mi_heap_t* curr = heap->tld->heaps; while (curr != NULL) { mi_heap_t* next = curr->next; // save `next` as `curr` will be freed if (curr != heap) { mi_assert_internal(!mi_heap_is_backing(curr)); mi_heap_delete(curr); } curr = next; } mi_assert_internal(heap->tld->heaps == heap && heap->next == NULL); mi_assert_internal(mi_heap_is_backing(heap)); // collect if not the main thread if (heap != &_mi_heap_main) { _mi_heap_collect_abandon(heap); } // merge stats _mi_stats_done(&heap->tld->stats); // free if not the main thread if (heap != &_mi_heap_main) { // the following assertion does not always hold for huge segments as those are always treated // as abondened: one may allocate it in one thread, but deallocate in another in which case // the count can be too large or negative. todo: perhaps not count huge segments? see issue #363 // mi_assert_internal(heap->tld->segments.count == 0 || heap->thread_id != _mi_thread_id()); mi_thread_data_free((mi_thread_data_t*)heap); } else { #if 0 // never free the main thread even in debug mode; if a dll is linked statically with mimalloc, // there may still be delete/free calls after the mi_fls_done is called. Issue #207 _mi_heap_destroy_pages(heap); mi_assert_internal(heap->tld->heap_backing == &_mi_heap_main); #endif } return false; } // -------------------------------------------------------- // Try to run `mi_thread_done()` automatically so any memory // owned by the thread but not yet released can be abandoned // and re-owned by another thread. // // 1. windows dynamic library: // call from DllMain on DLL_THREAD_DETACH // 2. windows static library: // use `FlsAlloc` to call a destructor when the thread is done // 3. 
unix, pthreads: // use a pthread key to call a destructor when a pthread is done // // In the last two cases we also need to call `mi_process_init` // to set up the thread local keys. // -------------------------------------------------------- // Set up handlers so `mi_thread_done` is called automatically static void mi_process_setup_auto_thread_done(void) { static bool tls_initialized = false; // fine if it races if (tls_initialized) return; tls_initialized = true; _mi_prim_thread_init_auto_done(); _mi_heap_set_default_direct(&_mi_heap_main); } bool _mi_is_main_thread(void) { return (_mi_heap_main.thread_id==0 || _mi_heap_main.thread_id == _mi_thread_id()); } static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1); size_t _mi_current_thread_count(void) { return mi_atomic_load_relaxed(&thread_count); } // This is called from the `mi_malloc_generic` void mi_thread_init(void) mi_attr_noexcept { // ensure our process has started already mi_process_init(); // initialize the thread local default heap // (this will call `_mi_heap_set_default_direct` and thus set the // fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called) if (_mi_thread_heap_init()) return; // returns true if already initialized _mi_stat_increase(&_mi_stats_main.threads, 1); mi_atomic_increment_relaxed(&thread_count); //_mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id()); } void mi_thread_done(void) mi_attr_noexcept { _mi_thread_done(NULL); } void _mi_thread_done(mi_heap_t* heap) { // calling with NULL implies using the default heap if (heap == NULL) { heap = mi_prim_get_default_heap(); if (heap == NULL) return; } // prevent re-entrancy through heap_done/heap_set_default_direct (issue #699) if (!mi_heap_is_initialized(heap)) { return; } // adjust stats mi_atomic_decrement_relaxed(&thread_count); _mi_stat_decrease(&_mi_stats_main.threads, 1); // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps... 
if (heap->thread_id != _mi_thread_id()) return; // abandon the thread local heap if (_mi_thread_heap_done(heap)) return; // returns true if already ran } void _mi_heap_set_default_direct(mi_heap_t* heap) { mi_assert_internal(heap != NULL); #if defined(MI_TLS_SLOT) mi_prim_tls_slot_set(MI_TLS_SLOT,heap); #elif defined(MI_TLS_PTHREAD_SLOT_OFS) *mi_prim_tls_pthread_heap_slot() = heap; #elif defined(MI_TLS_PTHREAD) // we use _mi_heap_default_key #else _mi_heap_default = heap; #endif // ensure the default heap is passed to `_mi_thread_done` // setting to a non-NULL value also ensures `mi_thread_done` is called. _mi_prim_thread_associate_default_heap(heap); } void mi_thread_set_in_threadpool(void) mi_attr_noexcept { // nothing } // -------------------------------------------------------- // Run functions on process init/done, and thread init/done // -------------------------------------------------------- static bool os_preloading = true; // true until this module is initialized // Returns true if this module has not been initialized; Don't use C runtime routines until it returns false. 
bool mi_decl_noinline _mi_preloading(void) {
  return os_preloading;
}

// Returns true if mimalloc was redirected
mi_decl_nodiscard bool mi_is_redirected(void) mi_attr_noexcept {
  return _mi_is_redirected();
}

// Called once by the process loader from `src/prim/prim.c`.
// Initializes the main heap, clears `os_preloading`, initializes the options,
// sets up the automatic thread-done handling and runs `mi_process_init`;
// afterwards it reports redirection (verbose only), shows the redirector
// message if there is one, and reseeds the main heap's random state.
void _mi_process_load(void) {
  mi_heap_main_init();
  #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD)
  volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true;
  if (dummy == NULL) return;                    // use dummy or otherwise the access may get optimized away (issue #697)
  #endif
  os_preloading = false;
  mi_assert_internal(_mi_is_main_thread());
  _mi_options_init();
  mi_process_setup_auto_thread_done();
  mi_process_init();
  if (_mi_is_redirected()) _mi_verbose_message("malloc is redirected.\n");

  // show message from the redirector (if present)
  const char* msg = NULL;
  _mi_allocator_init(&msg);
  if (msg != NULL && (mi_option_is_enabled(mi_option_verbose) || mi_option_is_enabled(mi_option_show_errors))) {
    _mi_fputs(NULL,NULL,NULL,msg);
  }

  // reseed random
  _mi_random_reinit_if_weak(&_mi_heap_main.random);
}

#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
#include <intrin.h>   // declares `__cpuid`
mi_decl_cache_align bool _mi_cpu_has_fsrm = false;
mi_decl_cache_align bool _mi_cpu_has_erms = false;

// Query CPUID leaf 7 and record the FSRM and ERMS feature bits.
static void mi_detect_cpu_features(void) {
  // FSRM for fast short rep movsb/stosb support (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017))
  // ERMS for fast enhanced rep movsb/stosb support
  int32_t cpu_info[4];
  __cpuid(cpu_info, 7);
  _mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX (CPUID leaf 7)
  _mi_cpu_has_erms = ((cpu_info[1] & (1 << 9)) != 0); // bit 9 of EBX (CPUID leaf 7)
}
#else
// No feature detection on other targets.
static void mi_detect_cpu_features(void) {
  // nothing
}
#endif

// Initialize the process; called by thread_init or the process loader
void mi_process_init(void) mi_attr_noexcept {
  // ensure we are called once
  static mi_atomic_once_t process_init;
  #if _MSC_VER < 1920
  mi_heap_main_init(); // vs2017 can dynamically 
re-initialize _mi_heap_main #endif if (!mi_atomic_once(&process_init)) return; _mi_process_is_initialized = true; _mi_verbose_message("process init: 0x%zx\n", _mi_thread_id()); mi_process_setup_auto_thread_done(); mi_detect_cpu_features(); _mi_os_init(); mi_heap_main_init(); mi_thread_init(); #if defined(_WIN32) // On windows, when building as a static lib the FLS cleanup happens to early for the main thread. // To avoid this, set the FLS value for the main thread to NULL so the fls cleanup // will not call _mi_thread_done on the (still executing) main thread. See issue #508. _mi_prim_thread_associate_default_heap(NULL); #endif mi_stats_reset(); // only call stat reset *after* thread init (or the heap tld == NULL) mi_track_init(); if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) { size_t pages = mi_option_get_clamp(mi_option_reserve_huge_os_pages, 0, 128*1024); long reserve_at = mi_option_get(mi_option_reserve_huge_os_pages_at); if (reserve_at != -1) { mi_reserve_huge_os_pages_at(pages, reserve_at, pages*500); } else { mi_reserve_huge_os_pages_interleave(pages, 0, pages*500); } } if (mi_option_is_enabled(mi_option_reserve_os_memory)) { long ksize = mi_option_get(mi_option_reserve_os_memory); if (ksize > 0) { mi_reserve_os_memory((size_t)ksize*MI_KiB, true /* commit? */, true /* allow large pages? 
*/); } } } // Called when the process is done (through `at_exit`) void mi_cdecl _mi_process_done(void) { // only shutdown if we were initialized if (!_mi_process_is_initialized) return; // ensure we are called once static bool process_done = false; if (process_done) return; process_done = true; // get the default heap so we don't need to acces thread locals anymore mi_heap_t* heap = mi_prim_get_default_heap(); // use prim to not initialize any heap mi_assert_internal(heap != NULL); // release any thread specific resources and ensure _mi_thread_done is called on all but the main thread _mi_prim_thread_done_auto_done(); #ifndef MI_SKIP_COLLECT_ON_EXIT #if (MI_DEBUG || !defined(MI_SHARED_LIB)) // free all memory if possible on process exit. This is not needed for a stand-alone process // but should be done if mimalloc is statically linked into another shared library which // is repeatedly loaded/unloaded, see issue #281. mi_heap_collect(heap, true /* force */ ); #endif #endif // Forcefully release all retained memory; this can be dangerous in general if overriding regular malloc/free // since after process_done there might still be other code running that calls `free` (like at_exit routines, // or C-runtime termination code. if (mi_option_is_enabled(mi_option_destroy_on_exit)) { mi_heap_collect(heap, true /* force */); _mi_heap_unsafe_destroy_all(heap); // forcefully release all memory held by all heaps (of this thread only!) 
_mi_arena_unsafe_destroy_all(); _mi_segment_map_unsafe_destroy(); } if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) { mi_stats_print(NULL); } _mi_allocator_done(); _mi_verbose_message("process done: 0x%zx\n", _mi_heap_main.thread_id); os_preloading = true; // don't call the C runtime anymore } ================================================ FILE: third-party/mimalloc/src/libc.c ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2023, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ // -------------------------------------------------------- // This module defines various std libc functions to reduce // the dependency on libc, and also prevent errors caused // by some libc implementations when called before `main` // executes (due to malloc redirection) // -------------------------------------------------------- #include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/prim.h" // mi_prim_getenv char _mi_toupper(char c) { if (c >= 'a' && c <= 'z') return (c - 'a' + 'A'); else return c; } int _mi_strnicmp(const char* s, const char* t, size_t n) { if (n == 0) return 0; for (; *s != 0 && *t != 0 && n > 0; s++, t++, n--) { if (_mi_toupper(*s) != _mi_toupper(*t)) break; } return (n == 0 ? 
0 : *s - *t);
}

// Copy `src` into `dest`, which has room for `dest_size` bytes, truncating
// if needed. Does nothing when a pointer is NULL or `dest_size` is 0;
// otherwise `dest` is always zero terminated.
void _mi_strlcpy(char* dest, const char* src, size_t dest_size) {
  if (dest==NULL || src==NULL || dest_size == 0) return;
  // copy until end of src, or when dest is (almost) full
  while (*src != 0 && dest_size > 1) {
    *dest++ = *src++;
    dest_size--;
  }
  // always zero terminate
  *dest = 0;
}

// Append `src` to the string already in `dest` (total capacity `dest_size`
// bytes), truncating if needed. Does nothing when a pointer is NULL or
// `dest_size` is 0.
void _mi_strlcat(char* dest, const char* src, size_t dest_size) {
  if (dest==NULL || src==NULL || dest_size == 0) return;
  // find end of string in the dest buffer
  while (*dest != 0 && dest_size > 1) {
    dest++;
    dest_size--;
  }
  // and catenate
  _mi_strlcpy(dest, src, dest_size);
}

// Length of the zero-terminated string `s`; 0 for NULL.
size_t _mi_strlen(const char* s) {
  if (s==NULL) return 0;
  size_t len = 0;
  while(s[len] != 0) { len++; }
  return len;
}

// Length of `s`, but at most `max_len`; 0 for NULL.
// Never reads beyond the first `max_len` bytes of `s`.
size_t _mi_strnlen(const char* s, size_t max_len) {
  if (s==NULL) return 0;
  size_t len = 0;
  // test the bound before dereferencing so `s[max_len]` is never read
  while(len < max_len && s[len] != 0) { len++; }
  return len;
}

#ifdef MI_NO_GETENV
// Environment access is compiled out: always reports "not found".
bool _mi_getenv(const char* name, char* result, size_t result_size) {
  MI_UNUSED(name);
  MI_UNUSED(result);
  MI_UNUSED(result_size);
  return false;
}
#else
// Look up environment variable `name` into `result` via `_mi_prim_getenv`.
// Returns false without a lookup when a pointer is NULL or the result
// buffer is smaller than 64 bytes.
bool _mi_getenv(const char* name, char* result, size_t result_size) {
  if (name==NULL || result == NULL || result_size < 64) return false;
  return _mi_prim_getenv(name,result,result_size);
}
#endif

// --------------------------------------------------------
// Define our own limited `_mi_vsnprintf` and `_mi_snprintf`
// This is mostly to avoid calling these when libc is not yet
// initialized (and to reduce dependencies)
//
// format:  d i, p x u, s
// prec:    z l ll L
// width:   10
// align-left: -
// fill:    0
// plus:    +
// --------------------------------------------------------

// Write `c` at `*out` and advance `*out`, unless `*out` has reached `end`.
static void mi_outc(char c, char** out, char* end) {
  char* p = *out;
  if (p >= end) return;
  *p = c;
  *out = p + 1;
}

// Write the string `s` at `*out`, stopping at `end`, and advance `*out`.
// A NULL `s` writes nothing.
static void mi_outs(const char* s, char** out, char* end) {
  if (s == NULL) return;
  char* p = *out;
  while (*s != 0 && p < end) {
    *p++ = *s++;
  }
  *out = p;
}

static void mi_out_fill(char fill, size_t len, char** out, char* end) {
  char* p = *out;
  for (size_t i = 0; i < len && p < end; 
i++) { *p++ = fill; } *out = p; } static void mi_out_alignright(char fill, char* start, size_t len, size_t extra, char* end) { if (len == 0 || extra == 0) return; if (start + len + extra >= end) return; // move `len` characters to the right (in reverse since it can overlap) for (size_t i = 1; i <= len; i++) { start[len + extra - i] = start[len - i]; } // and fill the start for (size_t i = 0; i < extra; i++) { start[i] = fill; } } static void mi_out_num(uintmax_t x, size_t base, char prefix, char** out, char* end) { if (x == 0 || base == 0 || base > 16) { if (prefix != 0) { mi_outc(prefix, out, end); } mi_outc('0',out,end); } else { // output digits in reverse char* start = *out; while (x > 0) { char digit = (char)(x % base); mi_outc((digit <= 9 ? '0' + digit : 'A' + digit - 10),out,end); x = x / base; } if (prefix != 0) { mi_outc(prefix, out, end); } size_t len = *out - start; // and reverse in-place for (size_t i = 0; i < (len / 2); i++) { char c = start[len - i - 1]; start[len - i - 1] = start[i]; start[i] = c; } } } #define MI_NEXTC() c = *in; if (c==0) break; in++; int _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { if (buf == NULL || bufsize == 0 || fmt == NULL) return 0; buf[bufsize - 1] = 0; char* const end = buf + (bufsize - 1); const char* in = fmt; char* out = buf; while (true) { if (out >= end) break; char c; MI_NEXTC(); if (c != '%') { if ((c >= ' ' && c <= '~') || c=='\n' || c=='\r' || c=='\t') { // output visible ascii or standard control only mi_outc(c, &out, end); } } else { MI_NEXTC(); char fill = ' '; size_t width = 0; char numtype = 'd'; char numplus = 0; bool alignright = true; if (c == '+' || c == ' ') { numplus = c; MI_NEXTC(); } if (c == '-') { alignright = false; MI_NEXTC(); } if (c == '0') { fill = '0'; MI_NEXTC(); } if (c >= '1' && c <= '9') { width = (c - '0'); MI_NEXTC(); while (c >= '0' && c <= '9') { width = (10 * width) + (c - '0'); MI_NEXTC(); } if (c == 0) break; // extra check due to while } if (c == 'z' 
|| c == 't' || c == 'L') { numtype = c; MI_NEXTC(); } else if (c == 'l') { numtype = c; MI_NEXTC(); if (c == 'l') { numtype = 'L'; MI_NEXTC(); } } char* start = out; if (c == 's') { // string const char* s = va_arg(args, const char*); mi_outs(s, &out, end); } else if (c == 'p' || c == 'x' || c == 'u') { // unsigned uintmax_t x = 0; if (c == 'x' || c == 'u') { if (numtype == 'z') x = va_arg(args, size_t); else if (numtype == 't') x = va_arg(args, uintptr_t); // unsigned ptrdiff_t else if (numtype == 'L') x = va_arg(args, unsigned long long); else if (numtype == 'l') x = va_arg(args, unsigned long); else x = va_arg(args, unsigned int); } else if (c == 'p') { x = va_arg(args, uintptr_t); mi_outs("0x", &out, end); start = out; width = (width >= 2 ? width - 2 : 0); } if (width == 0 && (c == 'x' || c == 'p')) { if (c == 'p') { width = 2 * (x <= UINT32_MAX ? 4 : ((x >> 16) <= UINT32_MAX ? 6 : sizeof(void*))); } if (width == 0) { width = 2; } fill = '0'; } mi_out_num(x, (c == 'x' || c == 'p' ? 16 : 10), numplus, &out, end); } else if (c == 'i' || c == 'd') { // signed intmax_t x = 0; if (numtype == 'z') x = va_arg(args, intptr_t ); else if (numtype == 't') x = va_arg(args, ptrdiff_t); else if (numtype == 'L') x = va_arg(args, long long); else if (numtype == 'l') x = va_arg(args, long); else x = va_arg(args, int); char pre = 0; if (x < 0) { pre = '-'; if (x > INTMAX_MIN) { x = -x; } } else if (numplus != 0) { pre = numplus; } mi_out_num((uintmax_t)x, 10, pre, &out, end); } else if (c >= ' ' && c <= '~') { // unknown format mi_outc('%', &out, end); mi_outc(c, &out, end); } // fill & align mi_assert_internal(out <= end); mi_assert_internal(out >= start); const size_t len = out - start; if (len < width) { mi_out_fill(fill, width - len, &out, end); if (alignright && out <= end) { mi_out_alignright(fill, start, len, width - len, end); } } } } mi_assert_internal(out <= end); *out = 0; return (int)(out - buf); } int _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...) 
{ va_list args; va_start(args, fmt); const int written = _mi_vsnprintf(buf, buflen, fmt, args); va_end(args); return written; } ================================================ FILE: third-party/mimalloc/src/options.c ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2021, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/atomic.h" #include "mimalloc/prim.h" // mi_prim_out_stderr #include // stdin/stdout #include // abort static long mi_max_error_count = 16; // stop outputting errors after this (use < 0 for no limit) static long mi_max_warning_count = 16; // stop outputting warnings after this (use < 0 for no limit) static void mi_add_stderr_output(void); int mi_version(void) mi_attr_noexcept { return MI_MALLOC_VERSION; } // -------------------------------------------------------- // Options // These can be accessed by multiple threads and may be // concurrently initialized, but an initializing data race // is ok since they resolve to the same value. // -------------------------------------------------------- typedef enum mi_init_e { UNINIT, // not yet initialized DEFAULTED, // not found in the environment, use default value INITIALIZED // found in environment or set explicitly } mi_init_t; typedef struct mi_option_desc_s { long value; // the value mi_init_t init; // is it initialized yet? 
(from the environment) mi_option_t option; // for debugging: the option index should match the option const char* name; // option name without `mimalloc_` prefix const char* legacy_name; // potential legacy option name } mi_option_desc_t; #define MI_OPTION(opt) mi_option_##opt, #opt, NULL #define MI_OPTION_LEGACY(opt,legacy) mi_option_##opt, #opt, #legacy // Some options can be set at build time for statically linked libraries // (use `-DMI_EXTRA_CPPDEFS="opt1=val1;opt2=val2"`) // // This is useful if we cannot pass them as environment variables // (and setting them programmatically would be too late) #ifndef MI_DEFAULT_VERBOSE #define MI_DEFAULT_VERBOSE 0 #endif #ifndef MI_DEFAULT_EAGER_COMMIT #define MI_DEFAULT_EAGER_COMMIT 1 #endif #ifndef MI_DEFAULT_ARENA_EAGER_COMMIT #define MI_DEFAULT_ARENA_EAGER_COMMIT 2 #endif // in KiB #ifndef MI_DEFAULT_ARENA_RESERVE #if (MI_INTPTR_SIZE>4) #define MI_DEFAULT_ARENA_RESERVE 1024L*1024L #else #define MI_DEFAULT_ARENA_RESERVE 128L*1024L #endif #endif #ifndef MI_DEFAULT_DISALLOW_ARENA_ALLOC #define MI_DEFAULT_DISALLOW_ARENA_ALLOC 0 #endif #ifndef MI_DEFAULT_ALLOW_LARGE_OS_PAGES #if defined(__linux__) && !defined(__ANDROID__) #define MI_DEFAULT_ALLOW_LARGE_OS_PAGES 2 // enabled, but only use transparent huge pages through madvise #else #define MI_DEFAULT_ALLOW_LARGE_OS_PAGES 0 #endif #endif #ifndef MI_DEFAULT_RESERVE_HUGE_OS_PAGES #define MI_DEFAULT_RESERVE_HUGE_OS_PAGES 0 #endif #ifndef MI_DEFAULT_RESERVE_OS_MEMORY #define MI_DEFAULT_RESERVE_OS_MEMORY 0 #endif #ifndef MI_DEFAULT_GUARDED_SAMPLE_RATE #if MI_GUARDED #define MI_DEFAULT_GUARDED_SAMPLE_RATE 4000 #else #define MI_DEFAULT_GUARDED_SAMPLE_RATE 0 #endif #endif static mi_option_desc_t options[_mi_option_last] = { // stable options #if MI_DEBUG || defined(MI_SHOW_ERRORS) { 1, UNINIT, MI_OPTION(show_errors) }, #else { 0, UNINIT, MI_OPTION(show_errors) }, #endif { 0, UNINIT, MI_OPTION(show_stats) }, { MI_DEFAULT_VERBOSE, UNINIT, MI_OPTION(verbose) }, // some of the following 
options are experimental and not all combinations are allowed. { MI_DEFAULT_EAGER_COMMIT, UNINIT, MI_OPTION(eager_commit) }, // commit per segment directly (4MiB) (but see also `eager_commit_delay`) { MI_DEFAULT_ARENA_EAGER_COMMIT, UNINIT, MI_OPTION_LEGACY(arena_eager_commit,eager_region_commit) }, // eager commit arena's? 2 is used to enable this only on an OS that has overcommit (i.e. linux) { 1, UNINIT, MI_OPTION_LEGACY(purge_decommits,reset_decommits) }, // purge decommits memory (instead of reset) (note: on linux this uses MADV_DONTNEED for decommit) { MI_DEFAULT_ALLOW_LARGE_OS_PAGES, UNINIT, MI_OPTION_LEGACY(allow_large_os_pages,large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's { MI_DEFAULT_RESERVE_HUGE_OS_PAGES, UNINIT, MI_OPTION(reserve_huge_os_pages) }, // per 1GiB huge pages {-1, UNINIT, MI_OPTION(reserve_huge_os_pages_at) }, // reserve huge pages at node N { MI_DEFAULT_RESERVE_OS_MEMORY, UNINIT, MI_OPTION(reserve_os_memory) }, // reserve N KiB OS memory in advance (use `option_get_size`) { 0, UNINIT, MI_OPTION(deprecated_segment_cache) }, // cache N segments per thread { 0, UNINIT, MI_OPTION(deprecated_page_reset) }, // reset page memory on free { 0, UNINIT, MI_OPTION_LEGACY(abandoned_page_purge,abandoned_page_reset) }, // reset free page memory when a thread terminates { 0, UNINIT, MI_OPTION(deprecated_segment_reset) }, // reset segment memory on free (needs eager commit) #if defined(__NetBSD__) { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif { 10, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. 
{ 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose { 32, UNINIT, MI_OPTION(max_errors) }, // maximum errors that are output { 32, UNINIT, MI_OPTION(max_warnings) }, // maximum warnings that are output { 10, UNINIT, MI_OPTION(max_segment_reclaim)}, // max. percentage of the abandoned segments to be reclaimed per try. { 0, UNINIT, MI_OPTION(destroy_on_exit)}, // release all OS memory on process exit; careful with dangling pointer or after-exit frees! { MI_DEFAULT_ARENA_RESERVE, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time (=1GiB) (use `option_get_size`) { 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's { 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) }, { 0, UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free { MI_DEFAULT_DISALLOW_ARENA_ALLOC, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's) { 400, UNINIT, MI_OPTION(retry_on_oom) }, // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. #if defined(MI_VISIT_ABANDONED) { 1, INITIALIZED, MI_OPTION(visit_abandoned) }, // allow visiting heap blocks in abandoned segments; requires taking locks during reclaim. 
#else { 0, UNINIT, MI_OPTION(visit_abandoned) }, #endif { 0, UNINIT, MI_OPTION(guarded_min) }, // only used when building with MI_GUARDED: minimal rounded object size for guarded objects { MI_GiB, UNINIT, MI_OPTION(guarded_max) }, // only used when building with MI_GUARDED: maximal rounded object size for guarded objects { 0, UNINIT, MI_OPTION(guarded_precise) }, // disregard minimal alignment requirement to always place guarded blocks exactly in front of a guard page (=0) { MI_DEFAULT_GUARDED_SAMPLE_RATE, UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, { 0, UNINIT, MI_OPTION(target_segments_per_thread) }, // abandon segments beyond this point, or 0 to disable. { 10000, UNINIT, MI_OPTION(generic_collect) }, // collect heaps every N (=10000) generic allocation calls }; static void mi_option_init(mi_option_desc_t* desc); static bool mi_option_has_size_in_kib(mi_option_t option) { return (option == mi_option_reserve_os_memory || option == mi_option_arena_reserve); } void _mi_options_init(void) { // called on process load mi_add_stderr_output(); // now it safe to use stderr for output for(int i = 0; i < _mi_option_last; i++ ) { mi_option_t option = (mi_option_t)i; long l = mi_option_get(option); MI_UNUSED(l); // initialize } mi_max_error_count = mi_option_get(mi_option_max_errors); mi_max_warning_count = mi_option_get(mi_option_max_warnings); #if MI_GUARDED if (mi_option_get(mi_option_guarded_sample_rate) > 0) { if (mi_option_is_enabled(mi_option_allow_large_os_pages)) { mi_option_disable(mi_option_allow_large_os_pages); _mi_warning_message("option 'allow_large_os_pages' is disabled to allow for guarded objects\n"); } } #endif if (mi_option_is_enabled(mi_option_verbose)) { mi_options_print(); } } #define mi_stringifyx(str) #str // and stringify #define mi_stringify(str) mi_stringifyx(str) // expand void mi_options_print(void) mi_attr_noexcept { // show version 
const int vermajor = MI_MALLOC_VERSION/100; const int verminor = (MI_MALLOC_VERSION%100)/10; const int verpatch = (MI_MALLOC_VERSION%10); _mi_message("v%i.%i.%i%s%s (built on %s, %s)\n", vermajor, verminor, verpatch, #if defined(MI_CMAKE_BUILD_TYPE) ", " mi_stringify(MI_CMAKE_BUILD_TYPE) #else "" #endif , #if defined(MI_GIT_DESCRIBE) ", git " mi_stringify(MI_GIT_DESCRIBE) #else "" #endif , __DATE__, __TIME__); // show options for (int i = 0; i < _mi_option_last; i++) { mi_option_t option = (mi_option_t)i; long l = mi_option_get(option); MI_UNUSED(l); // possibly initialize mi_option_desc_t* desc = &options[option]; _mi_message("option '%s': %ld %s\n", desc->name, desc->value, (mi_option_has_size_in_kib(option) ? "KiB" : "")); } // show build configuration _mi_message("debug level : %d\n", MI_DEBUG ); _mi_message("secure level: %d\n", MI_SECURE ); _mi_message("mem tracking: %s\n", MI_TRACK_TOOL); #if MI_GUARDED _mi_message("guarded build: %s\n", mi_option_get(mi_option_guarded_sample_rate) != 0 ? "enabled" : "disabled"); #endif #if MI_TSAN _mi_message("thread santizer enabled\n"); #endif } long _mi_option_get_fast(mi_option_t option) { mi_assert(option >= 0 && option < _mi_option_last); mi_option_desc_t* desc = &options[option]; mi_assert(desc->option == option); // index should match the option //mi_assert(desc->init != UNINIT); return desc->value; } mi_decl_nodiscard long mi_option_get(mi_option_t option) { mi_assert(option >= 0 && option < _mi_option_last); if (option < 0 || option >= _mi_option_last) return 0; mi_option_desc_t* desc = &options[option]; mi_assert(desc->option == option); // index should match the option if mi_unlikely(desc->init == UNINIT) { mi_option_init(desc); } return desc->value; } mi_decl_nodiscard long mi_option_get_clamp(mi_option_t option, long min, long max) { long x = mi_option_get(option); return (x < min ? min : (x > max ? 
max : x)); } mi_decl_nodiscard size_t mi_option_get_size(mi_option_t option) { const long x = mi_option_get(option); size_t size = (x < 0 ? 0 : (size_t)x); if (mi_option_has_size_in_kib(option)) { size *= MI_KiB; } return size; } void mi_option_set(mi_option_t option, long value) { mi_assert(option >= 0 && option < _mi_option_last); if (option < 0 || option >= _mi_option_last) return; mi_option_desc_t* desc = &options[option]; mi_assert(desc->option == option); // index should match the option desc->value = value; desc->init = INITIALIZED; // ensure min/max range; be careful to not recurse. if (desc->option == mi_option_guarded_min && _mi_option_get_fast(mi_option_guarded_max) < value) { mi_option_set(mi_option_guarded_max, value); } else if (desc->option == mi_option_guarded_max && _mi_option_get_fast(mi_option_guarded_min) > value) { mi_option_set(mi_option_guarded_min, value); } } void mi_option_set_default(mi_option_t option, long value) { mi_assert(option >= 0 && option < _mi_option_last); if (option < 0 || option >= _mi_option_last) return; mi_option_desc_t* desc = &options[option]; if (desc->init != INITIALIZED) { desc->value = value; } } mi_decl_nodiscard bool mi_option_is_enabled(mi_option_t option) { return (mi_option_get(option) != 0); } void mi_option_set_enabled(mi_option_t option, bool enable) { mi_option_set(option, (enable ? 1 : 0)); } void mi_option_set_enabled_default(mi_option_t option, bool enable) { mi_option_set_default(option, (enable ? 1 : 0)); } void mi_option_enable(mi_option_t option) { mi_option_set_enabled(option,true); } void mi_option_disable(mi_option_t option) { mi_option_set_enabled(option,false); } static void mi_cdecl mi_out_stderr(const char* msg, void* arg) { MI_UNUSED(arg); if (msg != NULL && msg[0] != 0) { _mi_prim_out_stderr(msg); } } // Since an output function can be registered earliest in the `main` // function we also buffer output that happens earlier. 
When
// an output function is registered it is called immediately with
// the output up to that point.
#ifndef MI_MAX_DELAY_OUTPUT
#define MI_MAX_DELAY_OUTPUT ((size_t)(16*1024))
#endif
// The delayed-output buffer (one byte larger than MI_MAX_DELAY_OUTPUT) and
// the number of bytes claimed in it so far.
static char out_buf[MI_MAX_DELAY_OUTPUT+1];
static _Atomic(size_t) out_len;

// Output handler that appends `msg` to `out_buf`; `arg` is unused.
// Space is claimed with an atomic add on `out_len`. A message that does not
// fit completely is truncated, and nothing is stored once the claimed
// length has reached MI_MAX_DELAY_OUTPUT.
static void mi_cdecl mi_out_buf(const char* msg, void* arg) {
  MI_UNUSED(arg);
  if (msg==NULL) return;
  if (mi_atomic_load_relaxed(&out_len)>=MI_MAX_DELAY_OUTPUT) return;
  size_t n = _mi_strlen(msg);
  if (n==0) return;
  // claim space
  size_t start = mi_atomic_add_acq_rel(&out_len, n);
  if (start >= MI_MAX_DELAY_OUTPUT) return;
  // check bound
  if (start+n >= MI_MAX_DELAY_OUTPUT) {
    n = MI_MAX_DELAY_OUTPUT-start-1;
  }
  _mi_memcpy(&out_buf[start], msg, n);
}

// Pass the buffered text to `out` (no-op when `out` is NULL).
// `out_len` is advanced by MI_MAX_DELAY_OUTPUT when `no_more_buf` is set and
// by 1 otherwise. The buffer is zero terminated at `count` for the call; if
// buffering continues, that terminator is replaced by a newline afterwards.
static void mi_out_buf_flush(mi_output_fun* out, bool no_more_buf, void* arg) {
  if (out==NULL) return;
  // claim (if `no_more_buf == true`, no more output will be added after this point)
  size_t count = mi_atomic_add_acq_rel(&out_len, (no_more_buf ? MI_MAX_DELAY_OUTPUT : 1));
  // and output the current contents
  if (count>MI_MAX_DELAY_OUTPUT) count = MI_MAX_DELAY_OUTPUT;
  out_buf[count] = 0;
  out(out_buf,arg);
  if (!no_more_buf) {
    out_buf[count] = '\n'; // if continue with the buffer, insert a newline
  }
}

// Once this module is loaded, switch to this routine
// which outputs to stderr and the delayed output buffer.
static void mi_cdecl mi_out_buf_stderr(const char* msg, void* arg) {
  mi_out_stderr(msg,arg);
  mi_out_buf(msg,arg);
}

// --------------------------------------------------------
// Default output handler
// --------------------------------------------------------

// Should be atomic but gives errors on many platforms as generally we cannot cast a function pointer to a uintptr_t.
// For now, don't register output from multiple threads.
static mi_output_fun* volatile mi_out_default; // = NULL
static _Atomic(void*) mi_out_arg; // = NULL

// Fetch the active output handler. When `parg` is non-NULL it receives the
// registered handler argument. If no handler has been set yet, the
// delayed-output buffer routine is returned instead.
static mi_output_fun* mi_out_get_default(void** parg) {
  if (parg != NULL) {
    *parg = mi_atomic_load_ptr_acquire(void,&mi_out_arg);
  }
  mi_output_fun* const handler = mi_out_default;
  if (handler != NULL) return handler;
  return &mi_out_buf;
}

// Register `out` (with its argument `arg`) as the output handler.
// A NULL `out` selects plain stderr output; a non-NULL `out` is immediately
// handed everything that was collected in the delayed-output buffer.
void mi_register_output(mi_output_fun* out, void* arg) mi_attr_noexcept {
  if (out == NULL) {
    // stop using the delayed output buffer; nothing is flushed in this case
    mi_out_default = &mi_out_stderr;
    mi_atomic_store_ptr_release(void,&mi_out_arg, arg);
    return;
  }
  // stop using the delayed output buffer
  mi_out_default = out;
  mi_atomic_store_ptr_release(void,&mi_out_arg, arg);
  // output all the delayed output now
  mi_out_buf_flush(out,true,arg);
}

// Add stderr to the delayed output after the module is loaded:
// first flush what has been buffered so far to stderr, then route further
// output to both stderr and the buffer.
static void mi_add_stderr_output(void) {
  mi_assert_internal(mi_out_default == NULL);
  mi_output_fun* const to_stderr = &mi_out_stderr;
  mi_out_buf_flush(to_stderr, false, NULL);
  mi_out_default = &mi_out_buf_stderr;
}

// --------------------------------------------------------
// Messages, all end up calling `_mi_fputs`.
// --------------------------------------------------------
static _Atomic(size_t) error_count;   // = 0;  // when >= max_error_count stop emitting errors
static _Atomic(size_t) warning_count; // = 0;  // when >= max_warning_count stop emitting warnings

// When overriding malloc, we may recurse into mi_vfprintf if an allocation
// inside the C runtime causes another message.
// In some cases (like on macOS) the loader already allocates which
// calls into mimalloc; if we then access thread locals (like `recurse`)
// this may crash as the access may call _tlv_bootstrap that tries to
// (recursively) invoke malloc again to allocate space for the thread local
// variables on demand. This is why we use a _mi_preloading test on such
// platforms. However, C code generator may move the initial thread local address
// load before the `if` and we therefore split it out in a separate function.
static mi_decl_thread bool recurse = false; static mi_decl_noinline bool mi_recurse_enter_prim(void) { if (recurse) return false; recurse = true; return true; } static mi_decl_noinline void mi_recurse_exit_prim(void) { recurse = false; } static bool mi_recurse_enter(void) { #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD) if (_mi_preloading()) return false; #endif return mi_recurse_enter_prim(); } static void mi_recurse_exit(void) { #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD) if (_mi_preloading()) return; #endif mi_recurse_exit_prim(); } void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message) { if (out==NULL || (void*)out==(void*)stdout || (void*)out==(void*)stderr) { // TODO: use mi_out_stderr for stderr? if (!mi_recurse_enter()) return; out = mi_out_get_default(&arg); if (prefix != NULL) out(prefix, arg); out(message, arg); mi_recurse_exit(); } else { if (prefix != NULL) out(prefix, arg); out(message, arg); } } // Define our own limited `fprintf` that avoids memory allocation. // We do this using `_mi_vsnprintf` with a limited buffer. static void mi_vfprintf( mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args ) { char buf[512]; if (fmt==NULL) return; if (!mi_recurse_enter()) return; _mi_vsnprintf(buf, sizeof(buf)-1, fmt, args); mi_recurse_exit(); _mi_fputs(out,arg,prefix,buf); } void _mi_fprintf( mi_output_fun* out, void* arg, const char* fmt, ... ) { va_list args; va_start(args,fmt); mi_vfprintf(out,arg,NULL,fmt,args); va_end(args); } static void mi_vfprintf_thread(mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args) { if (prefix != NULL && _mi_strnlen(prefix,33) <= 32 && !_mi_is_main_thread()) { char tprefix[64]; _mi_snprintf(tprefix, sizeof(tprefix), "%sthread 0x%tx: ", prefix, (uintptr_t)_mi_thread_id()); mi_vfprintf(out, arg, tprefix, fmt, args); } else { mi_vfprintf(out, arg, prefix, fmt, args); } } void _mi_message(const char* fmt, ...) 
{ va_list args; va_start(args, fmt); mi_vfprintf_thread(NULL, NULL, "mimalloc: ", fmt, args); va_end(args); } void _mi_trace_message(const char* fmt, ...) { if (mi_option_get(mi_option_verbose) <= 1) return; // only with verbose level 2 or higher va_list args; va_start(args, fmt); mi_vfprintf_thread(NULL, NULL, "mimalloc: ", fmt, args); va_end(args); } void _mi_verbose_message(const char* fmt, ...) { if (!mi_option_is_enabled(mi_option_verbose)) return; va_list args; va_start(args,fmt); mi_vfprintf(NULL, NULL, "mimalloc: ", fmt, args); va_end(args); } static void mi_show_error_message(const char* fmt, va_list args) { if (!mi_option_is_enabled(mi_option_verbose)) { if (!mi_option_is_enabled(mi_option_show_errors)) return; if (mi_max_error_count >= 0 && (long)mi_atomic_increment_acq_rel(&error_count) > mi_max_error_count) return; } mi_vfprintf_thread(NULL, NULL, "mimalloc: error: ", fmt, args); } void _mi_warning_message(const char* fmt, ...) { if (!mi_option_is_enabled(mi_option_verbose)) { if (!mi_option_is_enabled(mi_option_show_errors)) return; if (mi_max_warning_count >= 0 && (long)mi_atomic_increment_acq_rel(&warning_count) > mi_max_warning_count) return; } va_list args; va_start(args,fmt); mi_vfprintf_thread(NULL, NULL, "mimalloc: warning: ", fmt, args); va_end(args); } #if MI_DEBUG void _mi_assert_fail(const char* assertion, const char* fname, unsigned line, const char* func ) { _mi_fprintf(NULL, NULL, "mimalloc: assertion failed: at \"%s\":%u, %s\n assertion: \"%s\"\n", fname, line, (func==NULL?"":func), assertion); abort(); } #endif // -------------------------------------------------------- // Errors // -------------------------------------------------------- static mi_error_fun* volatile mi_error_handler; // = NULL static _Atomic(void*) mi_error_arg; // = NULL static void mi_error_default(int err) { MI_UNUSED(err); #if (MI_DEBUG>0) if (err==EFAULT) { #ifdef _MSC_VER __debugbreak(); #endif abort(); } #endif #if (MI_SECURE>0) if (err==EFAULT) { // abort on 
serious errors in secure mode (corrupted meta-data) abort(); } #endif #if defined(MI_XMALLOC) if (err==ENOMEM || err==EOVERFLOW) { // abort on memory allocation fails in xmalloc mode abort(); } #endif } void mi_register_error(mi_error_fun* fun, void* arg) { mi_error_handler = fun; // can be NULL mi_atomic_store_ptr_release(void,&mi_error_arg, arg); } void _mi_error_message(int err, const char* fmt, ...) { // show detailed error message va_list args; va_start(args, fmt); mi_show_error_message(fmt, args); va_end(args); // and call the error handler which may abort (or return normally) if (mi_error_handler != NULL) { mi_error_handler(err, mi_atomic_load_ptr_acquire(void,&mi_error_arg)); } else { mi_error_default(err); } } // -------------------------------------------------------- // Initialize options by checking the environment // -------------------------------------------------------- // TODO: implement ourselves to reduce dependencies on the C runtime #include // strtol #include // strstr static void mi_option_init(mi_option_desc_t* desc) { // Read option value from the environment char s[64 + 1]; char buf[64+1]; _mi_strlcpy(buf, "mimalloc_", sizeof(buf)); _mi_strlcat(buf, desc->name, sizeof(buf)); bool found = _mi_getenv(buf, s, sizeof(s)); if (!found && desc->legacy_name != NULL) { _mi_strlcpy(buf, "mimalloc_", sizeof(buf)); _mi_strlcat(buf, desc->legacy_name, sizeof(buf)); found = _mi_getenv(buf, s, sizeof(s)); if (found) { _mi_warning_message("environment option \"mimalloc_%s\" is deprecated -- use \"mimalloc_%s\" instead.\n", desc->legacy_name, desc->name); } } if (found) { size_t len = _mi_strnlen(s, sizeof(buf) - 1); for (size_t i = 0; i < len; i++) { buf[i] = _mi_toupper(s[i]); } buf[len] = 0; if (buf[0] == 0 || strstr("1;TRUE;YES;ON", buf) != NULL) { desc->value = 1; desc->init = INITIALIZED; } else if (strstr("0;FALSE;NO;OFF", buf) != NULL) { desc->value = 0; desc->init = INITIALIZED; } else { char* end = buf; long value = strtol(buf, &end, 10); if 
(mi_option_has_size_in_kib(desc->option)) { // this option is interpreted in KiB to prevent overflow of `long` for large allocations // (long is 32-bit on 64-bit windows, which allows for 4TiB max.) size_t size = (value < 0 ? 0 : (size_t)value); bool overflow = false; if (*end == 'K') { end++; } else if (*end == 'M') { overflow = mi_mul_overflow(size,MI_KiB,&size); end++; } else if (*end == 'G') { overflow = mi_mul_overflow(size,MI_MiB,&size); end++; } else if (*end == 'T') { overflow = mi_mul_overflow(size,MI_GiB,&size); end++; } else { size = (size + MI_KiB - 1) / MI_KiB; } if (end[0] == 'I' && end[1] == 'B') { end += 2; } // KiB, MiB, GiB, TiB else if (*end == 'B') { end++; } // Kb, Mb, Gb, Tb if (overflow || size > MI_MAX_ALLOC_SIZE) { size = (MI_MAX_ALLOC_SIZE / MI_KiB); } value = (size > LONG_MAX ? LONG_MAX : (long)size); } if (*end == 0) { mi_option_set(desc->option, value); } else { // set `init` first to avoid recursion through _mi_warning_message on mimalloc_verbose. desc->init = DEFAULTED; if (desc->option == mi_option_verbose && desc->value == 0) { // if the 'mimalloc_verbose' env var has a bogus value we'd never know // (since the value defaults to 'off') so in that case briefly enable verbose desc->value = 1; _mi_warning_message("environment option mimalloc_%s has an invalid value.\n", desc->name); desc->value = 0; } else { _mi_warning_message("environment option mimalloc_%s has an invalid value.\n", desc->name); } } } mi_assert_internal(desc->init != UNINIT); } else if (!_mi_preloading()) { desc->init = DEFAULTED; } } ================================================ FILE: third-party/mimalloc/src/os.c ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2023, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. 
A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/atomic.h" #include "mimalloc/prim.h" #define mi_os_stat_increase(stat,amount) _mi_stat_increase(&_mi_stats_main.stat, amount) #define mi_os_stat_decrease(stat,amount) _mi_stat_decrease(&_mi_stats_main.stat, amount) #define mi_os_stat_counter_increase(stat,inc) _mi_stat_counter_increase(&_mi_stats_main.stat, inc) /* ----------------------------------------------------------- Initialization. ----------------------------------------------------------- */ #ifndef MI_DEFAULT_VIRTUAL_ADDRESS_BITS #if MI_INTPTR_SIZE < 8 #define MI_DEFAULT_VIRTUAL_ADDRESS_BITS 32 #else #define MI_DEFAULT_VIRTUAL_ADDRESS_BITS 48 #endif #endif #ifndef MI_DEFAULT_PHYSICAL_MEMORY_IN_KIB #if MI_INTPTR_SIZE < 8 #define MI_DEFAULT_PHYSICAL_MEMORY_IN_KIB 4*MI_MiB // 4 GiB #else #define MI_DEFAULT_PHYSICAL_MEMORY_IN_KIB 32*MI_MiB // 32 GiB #endif #endif static mi_os_mem_config_t mi_os_mem_config = { 4096, // page size 0, // large page size (usually 2MiB) 4096, // allocation granularity MI_DEFAULT_PHYSICAL_MEMORY_IN_KIB, MI_DEFAULT_VIRTUAL_ADDRESS_BITS, true, // has overcommit? (if true we use MAP_NORESERVE on mmap systems) false, // can we partially free allocated blocks? (on mmap systems we can free anywhere in a mapped range, but on Windows we must free the entire span) true // has virtual reserve? 
(if true we can reserve virtual address space without using commit or physical memory) }; bool _mi_os_has_overcommit(void) { return mi_os_mem_config.has_overcommit; } bool _mi_os_has_virtual_reserve(void) { return mi_os_mem_config.has_virtual_reserve; } // OS (small) page size size_t _mi_os_page_size(void) { return mi_os_mem_config.page_size; } // if large OS pages are supported (2 or 4MiB), then return the size, otherwise return the small page size (4KiB) size_t _mi_os_large_page_size(void) { return (mi_os_mem_config.large_page_size != 0 ? mi_os_mem_config.large_page_size : _mi_os_page_size()); } bool _mi_os_use_large_page(size_t size, size_t alignment) { // if we have access, check the size and alignment requirements if (mi_os_mem_config.large_page_size == 0 || !mi_option_is_enabled(mi_option_allow_large_os_pages)) return false; return ((size % mi_os_mem_config.large_page_size) == 0 && (alignment % mi_os_mem_config.large_page_size) == 0); } // round to a good OS allocation size (bounded by max 12.5% waste) size_t _mi_os_good_alloc_size(size_t size) { size_t align_size; if (size < 512*MI_KiB) align_size = _mi_os_page_size(); else if (size < 2*MI_MiB) align_size = 64*MI_KiB; else if (size < 8*MI_MiB) align_size = 256*MI_KiB; else if (size < 32*MI_MiB) align_size = 1*MI_MiB; else align_size = 4*MI_MiB; if mi_unlikely(size >= (SIZE_MAX - align_size)) return size; // possible overflow? 
return _mi_align_up(size, align_size); } void _mi_os_init(void) { _mi_prim_mem_init(&mi_os_mem_config); } /* ----------------------------------------------------------- Util -------------------------------------------------------------- */ bool _mi_os_decommit(void* addr, size_t size); bool _mi_os_commit(void* addr, size_t size, bool* is_zero); /* ----------------------------------------------------------- aligned hinting -------------------------------------------------------------- */ // On systems with enough virtual address bits, we can do efficient aligned allocation by using // the 2TiB to 30TiB area to allocate those. If we have at least 46 bits of virtual address // space (64TiB) we use this technique. (but see issue #939) #if (MI_INTPTR_SIZE >= 8) && !defined(MI_NO_ALIGNED_HINT) static mi_decl_cache_align _Atomic(uintptr_t)aligned_base; // Return a MI_SEGMENT_SIZE aligned address that is probably available. // If this returns NULL, the OS will determine the address but on some OS's that may not be // properly aligned which can be more costly as it needs to be adjusted afterwards. 
// For a size > 1GiB this always returns NULL in order to guarantee good ASLR randomization; // (otherwise an initial large allocation of say 2TiB has a 50% chance to include (known) addresses // in the middle of the 2TiB - 6TiB address range (see issue #372)) #define MI_HINT_BASE ((uintptr_t)2 << 40) // 2TiB start #define MI_HINT_AREA ((uintptr_t)4 << 40) // upto 6TiB (since before win8 there is "only" 8TiB available to processes) #define MI_HINT_MAX ((uintptr_t)30 << 40) // wrap after 30TiB (area after 32TiB is used for huge OS pages) void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) { if (try_alignment <= 1 || try_alignment > MI_SEGMENT_SIZE) return NULL; if (mi_os_mem_config.virtual_address_bits < 46) return NULL; // < 64TiB virtual address space size = _mi_align_up(size, MI_SEGMENT_SIZE); if (size > 1*MI_GiB) return NULL; // guarantee the chance of fixed valid address is at most 1/(MI_HINT_AREA / 1<<30) = 1/4096. #if (MI_SECURE>0) size += MI_SEGMENT_SIZE; // put in `MI_SEGMENT_SIZE` virtual gaps between hinted blocks; this splits VLA's but increases guarded areas. 
#endif uintptr_t hint = mi_atomic_add_acq_rel(&aligned_base, size); if (hint == 0 || hint > MI_HINT_MAX) { // wrap or initialize uintptr_t init = MI_HINT_BASE; #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of aligned allocations unless in debug mode uintptr_t r = _mi_heap_random_next(mi_prim_get_default_heap()); init = init + ((MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)) % MI_HINT_AREA); // (randomly 20 bits)*4MiB == 0 to 4TiB #endif uintptr_t expected = hint + size; mi_atomic_cas_strong_acq_rel(&aligned_base, &expected, init); hint = mi_atomic_add_acq_rel(&aligned_base, size); // this may still give 0 or > MI_HINT_MAX but that is ok, it is a hint after all } if (hint%try_alignment != 0) return NULL; return (void*)hint; } #else void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) { MI_UNUSED(try_alignment); MI_UNUSED(size); return NULL; } #endif /* ----------------------------------------------------------- Free memory -------------------------------------------------------------- */ static void mi_os_free_huge_os_pages(void* p, size_t size); static void mi_os_prim_free(void* addr, size_t size, size_t commit_size) { mi_assert_internal((size % _mi_os_page_size()) == 0); if (addr == NULL || size == 0) return; // || _mi_os_is_huge_reserved(addr) int err = _mi_prim_free(addr, size); if (err != 0) { _mi_warning_message("unable to free OS memory (error: %d (0x%x), size: 0x%zx bytes, address: %p)\n", err, err, size, addr); } if (commit_size > 0) { mi_os_stat_decrease(committed, commit_size); } mi_os_stat_decrease(reserved, size); } void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid) { if (mi_memkind_is_os(memid.memkind)) { size_t csize = memid.mem.os.size; if (csize==0) { _mi_os_good_alloc_size(size); } size_t commit_size = (still_committed ? csize : 0); void* base = addr; // different base? 
(due to alignment) if (memid.mem.os.base != base) { mi_assert(memid.mem.os.base <= addr); base = memid.mem.os.base; const size_t diff = (uint8_t*)addr - (uint8_t*)memid.mem.os.base; if (memid.mem.os.size==0) { csize += diff; } if (still_committed) { commit_size -= diff; // the (addr-base) part was already un-committed } } // free it if (memid.memkind == MI_MEM_OS_HUGE) { mi_assert(memid.is_pinned); mi_os_free_huge_os_pages(base, csize); } else { mi_os_prim_free(base, csize, (still_committed ? commit_size : 0)); } } else { // nothing to do mi_assert(memid.memkind < MI_MEM_OS); } } void _mi_os_free(void* p, size_t size, mi_memid_t memid) { _mi_os_free_ex(p, size, true, memid); } /* ----------------------------------------------------------- Primitive allocation from the OS. -------------------------------------------------------------- */ // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. // Also `hint_addr` is a hint and may be ignored. static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero) { mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); mi_assert_internal(is_zero != NULL); mi_assert_internal(is_large != NULL); if (size == 0) return NULL; if (!commit) { allow_large = false; } if (try_alignment == 0) { try_alignment = 1; } // avoid 0 to ensure there will be no divide by zero when aligning *is_zero = false; void* p = NULL; int err = _mi_prim_alloc(hint_addr, size, try_alignment, commit, allow_large, is_large, is_zero, &p); if (err != 0) { _mi_warning_message("unable to allocate OS memory (error: %d (0x%x), addr: %p, size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, hint_addr, size, try_alignment, commit, allow_large); } mi_os_stat_counter_increase(mmap_calls, 1); if (p != NULL) { mi_os_stat_increase(reserved, size); if (commit) { mi_os_stat_increase(committed, size); // seems needed 
for asan (or `mimalloc-test-api` fails) #ifdef MI_TRACK_ASAN if (*is_zero) { mi_track_mem_defined(p,size); } else { mi_track_mem_undefined(p,size); } #endif } } return p; } static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero) { return mi_os_prim_alloc_at(NULL, size, try_alignment, commit, allow_large, is_large, is_zero); } // Primitive aligned allocation from the OS. // This function guarantees the allocated memory is aligned. static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** base) { mi_assert_internal(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0)); mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); mi_assert_internal(is_large != NULL); mi_assert_internal(is_zero != NULL); mi_assert_internal(base != NULL); if (!commit) allow_large = false; if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL; size = _mi_align_up(size, _mi_os_page_size()); // try first with a requested alignment hint (this will usually be aligned directly on Win 10+ or BSD) void* p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero); if (p == NULL) return NULL; // aligned already? if (((uintptr_t)p % alignment) == 0) { *base = p; } else { // if not aligned, free it, overallocate, and unmap around it #if !MI_TRACK_ASAN _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit); #endif if (p != NULL) { mi_os_prim_free(p, size, (commit ? 
size : 0)); } if (size >= (SIZE_MAX - alignment)) return NULL; // overflow const size_t over_size = size + alignment; if (!mi_os_mem_config.has_partial_free) { // win32 virtualAlloc cannot free parts of an allocated block // over-allocate uncommitted (virtual) memory p = mi_os_prim_alloc(over_size, 1 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, is_zero); if (p == NULL) return NULL; // set p to the aligned part in the full region // note: this is dangerous on Windows as VirtualFree needs the actual base pointer // this is handled though by having the `base` field in the memid's *base = p; // remember the base p = mi_align_up_ptr(p, alignment); // explicitly commit only the aligned part if (commit) { _mi_os_commit(p, size, NULL); } } else { // mmap can free inside an allocation // overallocate... p = mi_os_prim_alloc(over_size, 1, commit, false, is_large, is_zero); if (p == NULL) return NULL; // and selectively unmap parts around the over-allocated area. void* aligned_p = mi_align_up_ptr(p, alignment); size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p; size_t mid_size = _mi_align_up(size, _mi_os_page_size()); size_t post_size = over_size - pre_size - mid_size; mi_assert_internal(pre_size < over_size&& post_size < over_size&& mid_size >= size); if (pre_size > 0) { mi_os_prim_free(p, pre_size, (commit ? pre_size : 0)); } if (post_size > 0) { mi_os_prim_free((uint8_t*)aligned_p + mid_size, post_size, (commit ? post_size : 0)); } // we can return the aligned pointer on `mmap` systems p = aligned_p; *base = aligned_p; // since we freed the pre part, `*base == p`. 
} } mi_assert_internal(p == NULL || (p != NULL && *base != NULL && ((uintptr_t)p % alignment) == 0)); return p; } /* ----------------------------------------------------------- OS API: alloc and alloc_aligned ----------------------------------------------------------- */ void* _mi_os_alloc(size_t size, mi_memid_t* memid) { *memid = _mi_memid_none(); if (size == 0) return NULL; size = _mi_os_good_alloc_size(size); bool os_is_large = false; bool os_is_zero = false; void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero); if (p != NULL) { *memid = _mi_memid_create_os(true, os_is_zero, os_is_large); } return p; } void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid) { MI_UNUSED(&_mi_os_get_aligned_hint); // suppress unused warnings *memid = _mi_memid_none(); if (size == 0) return NULL; size = _mi_os_good_alloc_size(size); alignment = _mi_align_up(alignment, _mi_os_page_size()); bool os_is_large = false; bool os_is_zero = false; void* os_base = NULL; void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base ); if (p != NULL) { *memid = _mi_memid_create_os(commit, os_is_zero, os_is_large); memid->mem.os.base = os_base; // memid->mem.os.alignment = alignment; memid->mem.os.size += ((uint8_t*)p - (uint8_t*)os_base); // todo: return from prim_alloc_aligned } return p; } /* ----------------------------------------------------------- OS aligned allocation with an offset. This is used for large alignments > MI_BLOCK_ALIGNMENT_MAX. We use a large mimalloc page where the object can be aligned at an offset from the start of the segment. As we may need to overallocate, we need to free such pointers using `mi_free_aligned` to use the actual start of the memory region. 
----------------------------------------------------------- */

// Allocate `size` bytes such that `(result + offset)` is aligned to `alignment`.
// With `offset == 0` this is a regular aligned allocation; otherwise the block
// is over-allocated by `extra` bytes and the returned pointer lies `extra`
// bytes past the start of the OS allocation. Returns NULL on failure or when
// `offset > MI_SEGMENT_SIZE`.
void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid) {
  mi_assert(offset <= MI_SEGMENT_SIZE);
  mi_assert(offset <= size);
  mi_assert((alignment % _mi_os_page_size()) == 0);
  *memid = _mi_memid_none();
  if (offset > MI_SEGMENT_SIZE) return NULL;
  if (offset == 0) {
    // regular aligned allocation
    return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid);
  }
  else {
    // overallocate to align at an offset
    const size_t extra = _mi_align_up(offset, alignment) - offset;
    const size_t oversize = size + extra;
    void* const start = _mi_os_alloc_aligned(oversize, alignment, commit, allow_large, memid);
    if (start == NULL) return NULL;

    void* const p = (uint8_t*)start + extra;
    mi_assert(_mi_is_aligned((uint8_t*)p + offset, alignment));
    // decommit the overallocation at the start
    if (commit && extra > _mi_os_page_size()) {
      _mi_os_decommit(start, extra);
    }
    return p;
  }
}

/* -----------------------------------------------------------
  OS memory API: reset, commit, decommit, protect, unprotect.
----------------------------------------------------------- */

// OS page align within a given area, either conservative (pages inside the area only),
// or not (straddling pages outside the area is possible)
// Returns the aligned start and stores the aligned size in `*newsize` (if not
// NULL); returns NULL with `*newsize == 0` when the aligned area is empty or
// the arguments are NULL/zero.
static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size, size_t* newsize) {
  mi_assert(addr != NULL && size > 0);
  if (newsize != NULL) *newsize = 0;
  if (size == 0 || addr == NULL) return NULL;

  // page align conservatively within the range
  void* start = (conservative ? mi_align_up_ptr(addr, _mi_os_page_size())
    : mi_align_down_ptr(addr, _mi_os_page_size()));
  void* end = (conservative ? mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size())
    : mi_align_up_ptr((uint8_t*)addr + size, _mi_os_page_size()));
  ptrdiff_t diff = (uint8_t*)end - (uint8_t*)start;
  if (diff <= 0) return NULL;

  mi_assert_internal((conservative && (size_t)diff <= size) || (!conservative && (size_t)diff >= size));
  if (newsize != NULL) *newsize = (size_t)diff;
  return start;
}

// Page align to the pages that lie fully inside `[addr, addr+size)`.
static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t* newsize) {
  return mi_os_page_align_areax(true, addr, size, newsize);
}

// Commit the pages that cover `[addr, addr+size)`. `stat_size` is the amount
// added to the `committed` statistic (done up front, before the OS call).
// `*is_zero` (if not NULL) is set to true only when the OS reports zeroed
// memory. Returns false if the OS commit fails.
bool _mi_os_commit_ex(void* addr, size_t size, bool* is_zero, size_t stat_size) {
  if (is_zero != NULL) { *is_zero = false; }
  mi_os_stat_increase(committed, stat_size);  // use size for precise commit vs. decommit
  mi_os_stat_counter_increase(commit_calls, 1);

  // page align range
  size_t csize;
  void* start = mi_os_page_align_areax(false /* conservative? */, addr, size, &csize);
  if (csize == 0) return true;

  // commit
  bool os_is_zero = false;
  int err = _mi_prim_commit(start, csize, &os_is_zero);
  if (err != 0) {
    _mi_warning_message("cannot commit OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize);
    return false;
  }
  if (os_is_zero && is_zero != NULL) {
    *is_zero = true;
    mi_assert_expensive(mi_mem_is_zero(start, csize));
  }
  // note: the following seems required for asan (otherwise `mimalloc-test-stress` fails)
  #ifdef MI_TRACK_ASAN
  if (os_is_zero) { mi_track_mem_defined(start,csize); }
             else { mi_track_mem_undefined(start,csize); }
  #endif
  return true;
}

// Commit `[addr, addr+size)`, counting `size` in the statistics.
bool _mi_os_commit(void* addr, size_t size, bool* is_zero) {
  return _mi_os_commit_ex(addr, size, is_zero, size);
}

// Decommit the pages that lie fully inside `[addr, addr+size)`. `stat_size`
// is subtracted from the `committed` statistic before the OS call.
// `*needs_recommit` is passed to the OS primitive (preset to true) and is left
// untouched when the aligned area is empty. Returns true on success.
static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit, size_t stat_size) {
  mi_assert_internal(needs_recommit!=NULL);
  mi_os_stat_decrease(committed, stat_size);

  // page align
  size_t csize;
  void* start = mi_os_page_align_area_conservative(addr, size, &csize);
  if (csize == 0) return true;

  // decommit
  *needs_recommit = true;
  int err = _mi_prim_decommit(start,csize,needs_recommit);
  if (err != 0) {
    _mi_warning_message("cannot decommit OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize);
  }
  mi_assert_internal(err == 0);
  return (err == 0);
}

// Decommit `[addr, addr+size)`, counting `size` in the statistics.
bool _mi_os_decommit(void* addr, size_t size) {
  bool needs_recommit;
  return mi_os_decommit_ex(addr, size, &needs_recommit, size);
}


// Signal to the OS that the address range is no longer in use
// but may be used later again. This will release physical memory
// pages and reduce swapping while keeping the memory committed.
// We page align to a conservative area inside the range to reset.
bool _mi_os_reset(void* addr, size_t size) {
  // page align conservatively within the range
  size_t csize;
  void* start = mi_os_page_align_area_conservative(addr, size, &csize);
  if (csize == 0) return true;  // || _mi_os_is_huge_reserved(addr)
  mi_os_stat_increase(reset, csize);
  mi_os_stat_counter_increase(reset_calls, 1);

  #if (MI_DEBUG>1) && !MI_SECURE && !MI_TRACK_ENABLED // && !MI_TSAN
  memset(start, 0, csize); // pretend it is eagerly reset
  #endif

  int err = _mi_prim_reset(start, csize);
  if (err != 0) {
    _mi_warning_message("cannot reset OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", err, err, start, csize);
  }
  return (err == 0);
}


// either resets or decommits memory, returns true if the memory needs
// to be recommitted if it is to be re-used later on.
// Nothing is done (and false is returned) when the `purge_delay` option is
// negative. `stat_size` is the amount used for the decommit statistics.
bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, size_t stat_size)
{
  if (mi_option_get(mi_option_purge_delay) < 0) return false;  // is purging allowed?
  mi_os_stat_counter_increase(purge_calls, 1);
  mi_os_stat_increase(purged, size);

  if (mi_option_is_enabled(mi_option_purge_decommits) &&   // should decommit?
    !_mi_preloading())                                     // don't decommit during preloading (unsafe)
  {
    bool needs_recommit = true;
    mi_os_decommit_ex(p, size, &needs_recommit, stat_size);
    return needs_recommit;
  }
  else {
    if (allow_reset) {  // this can sometimes be not allowed if the range is not fully committed
      _mi_os_reset(p, size);
    }
    return false;  // needs no recommit
  }
}

// either resets or decommits memory, returns true if the memory needs
// to be recommitted if it is to be re-used later on.
bool _mi_os_purge(void* p, size_t size) {
  return _mi_os_purge_ex(p, size, true, size);
}

// Protect a region in memory to be not accessible.
// With `protect == false` the region is unprotected again. Only pages fully
// inside the range are affected; returns false if that area is empty or the
// OS call fails.
static bool mi_os_protectx(void* addr, size_t size, bool protect) {
  // page align conservatively within the range
  size_t csize = 0;
  void* start = mi_os_page_align_area_conservative(addr, size, &csize);
  if (csize == 0) return false;
  /*
  if (_mi_os_is_huge_reserved(addr)) {
	  _mi_warning_message("cannot mprotect memory allocated in huge OS pages\n");
  }
  */
  int err = _mi_prim_protect(start,csize,protect);
  if (err != 0) {
    _mi_warning_message("cannot %s OS memory (error: %d (0x%x), address: %p, size: 0x%zx bytes)\n", (protect ? "protect" : "unprotect"), err, err, start, csize);
  }
  return (err == 0);
}

// Make the region inaccessible.
bool _mi_os_protect(void* addr, size_t size) {
  return mi_os_protectx(addr, size, true);
}

// Make the region accessible again.
bool _mi_os_unprotect(void* addr, size_t size) {
  return mi_os_protectx(addr, size, false);
}



/* ----------------------------------------------------------------------------
Support for allocating huge OS pages (1Gib) that are reserved up-front
and possibly associated with a specific NUMA node.
(use `numa_node>=0`) -----------------------------------------------------------------------------*/ #define MI_HUGE_OS_PAGE_SIZE (MI_GiB) #if (MI_INTPTR_SIZE >= 8) // To ensure proper alignment, use our own area for huge OS pages static mi_decl_cache_align _Atomic(uintptr_t) mi_huge_start; // = 0 // Claim an aligned address range for huge pages static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { if (total_size != NULL) *total_size = 0; const size_t size = pages * MI_HUGE_OS_PAGE_SIZE; uintptr_t start = 0; uintptr_t end = 0; uintptr_t huge_start = mi_atomic_load_relaxed(&mi_huge_start); do { start = huge_start; if (start == 0) { // Initialize the start address after the 32TiB area start = ((uintptr_t)32 << 40); // 32TiB virtual start address #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode uintptr_t r = _mi_heap_random_next(mi_prim_get_default_heap()); start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x0FFF)); // (randomly 12bits)*1GiB == between 0 to 4TiB #endif } end = start + size; mi_assert_internal(end % MI_SEGMENT_SIZE == 0); } while (!mi_atomic_cas_strong_acq_rel(&mi_huge_start, &huge_start, end)); if (total_size != NULL) *total_size = size; return (uint8_t*)start; } #else static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { MI_UNUSED(pages); if (total_size != NULL) *total_size = 0; return NULL; } #endif // Allocate MI_SEGMENT_SIZE aligned huge pages void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_msecs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid) { *memid = _mi_memid_none(); if (psize != NULL) *psize = 0; if (pages_reserved != NULL) *pages_reserved = 0; size_t size = 0; uint8_t* start = mi_os_claim_huge_pages(pages, &size); if (start == NULL) return NULL; // or 32-bit systems // Allocate one page at the time but try to place them contiguously // We allocate one page at the time to be able to abort if it takes 
too long // or to at least allocate as many as available on the system. mi_msecs_t start_t = _mi_clock_start(); size_t page = 0; bool all_zero = true; while (page < pages) { // allocate a page bool is_zero = false; void* addr = start + (page * MI_HUGE_OS_PAGE_SIZE); void* p = NULL; int err = _mi_prim_alloc_huge_os_pages(addr, MI_HUGE_OS_PAGE_SIZE, numa_node, &is_zero, &p); if (!is_zero) { all_zero = false; } if (err != 0) { _mi_warning_message("unable to allocate huge OS page (error: %d (0x%x), address: %p, size: %zx bytes)\n", err, err, addr, MI_HUGE_OS_PAGE_SIZE); break; } // Did we succeed at a contiguous address? if (p != addr) { // no success, issue a warning and break if (p != NULL) { _mi_warning_message("could not allocate contiguous huge OS page %zu at %p\n", page, addr); mi_os_prim_free(p, MI_HUGE_OS_PAGE_SIZE, MI_HUGE_OS_PAGE_SIZE); } break; } // success, record it page++; // increase before timeout check (see issue #711) mi_os_stat_increase(committed, MI_HUGE_OS_PAGE_SIZE); mi_os_stat_increase(reserved, MI_HUGE_OS_PAGE_SIZE); // check for timeout if (max_msecs > 0) { mi_msecs_t elapsed = _mi_clock_end(start_t); if (page >= 1) { mi_msecs_t estimate = ((elapsed / (page+1)) * pages); if (estimate > 2*max_msecs) { // seems like we are going to timeout, break elapsed = max_msecs + 1; } } if (elapsed > max_msecs) { _mi_warning_message("huge OS page allocation timed out (after allocating %zu page(s))\n", page); break; } } } mi_assert_internal(page*MI_HUGE_OS_PAGE_SIZE <= size); if (pages_reserved != NULL) { *pages_reserved = page; } if (psize != NULL) { *psize = page * MI_HUGE_OS_PAGE_SIZE; } if (page != 0) { mi_assert(start != NULL); *memid = _mi_memid_create_os(true /* is committed */, all_zero, true /* is_large */); memid->memkind = MI_MEM_OS_HUGE; mi_assert(memid->is_pinned); #ifdef MI_TRACK_ASAN if (all_zero) { mi_track_mem_defined(start,size); } #endif } return (page == 0 ? 
NULL : start); } // free every huge page in a range individually (as we allocated per page) // note: needed with VirtualAlloc but could potentially be done in one go on mmap'd systems. static void mi_os_free_huge_os_pages(void* p, size_t size) { if (p==NULL || size==0) return; uint8_t* base = (uint8_t*)p; while (size >= MI_HUGE_OS_PAGE_SIZE) { mi_os_prim_free(base, MI_HUGE_OS_PAGE_SIZE, MI_HUGE_OS_PAGE_SIZE); size -= MI_HUGE_OS_PAGE_SIZE; base += MI_HUGE_OS_PAGE_SIZE; } } /* ---------------------------------------------------------------------------- Support NUMA aware allocation -----------------------------------------------------------------------------*/ _Atomic(size_t) _mi_numa_node_count; // = 0 // cache the node count size_t _mi_os_numa_node_count_get(void) { size_t count = mi_atomic_load_acquire(&_mi_numa_node_count); if (count <= 0) { long ncount = mi_option_get(mi_option_use_numa_nodes); // given explicitly? if (ncount > 0) { count = (size_t)ncount; } else { count = _mi_prim_numa_node_count(); // or detect dynamically if (count == 0) count = 1; } mi_atomic_store_release(&_mi_numa_node_count, count); // save it _mi_verbose_message("using %zd numa regions\n", count); } return count; } int _mi_os_numa_node_get(void) { size_t numa_count = _mi_os_numa_node_count(); if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0 // never more than the node count and >= 0 size_t numa_node = _mi_prim_numa_node(); if (numa_node >= numa_count) { numa_node = numa_node % numa_count; } return (int)numa_node; } ================================================ FILE: third-party/mimalloc/src/page-queue.c ================================================ /*---------------------------------------------------------------------------- Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. 
A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ /* ----------------------------------------------------------- Definition of page queues for each block size ----------------------------------------------------------- */ #ifndef MI_IN_PAGE_C #error "this file should be included from 'page.c'" // include to help an IDE #include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/atomic.h" #endif /* ----------------------------------------------------------- Minimal alignment in machine words (i.e. `sizeof(void*)`) ----------------------------------------------------------- */ #if (MI_MAX_ALIGN_SIZE > 4*MI_INTPTR_SIZE) #error "define alignment for more than 4x word size for this platform" #elif (MI_MAX_ALIGN_SIZE > 2*MI_INTPTR_SIZE) #define MI_ALIGN4W // 4 machine words minimal alignment #elif (MI_MAX_ALIGN_SIZE > MI_INTPTR_SIZE) #define MI_ALIGN2W // 2 machine words minimal alignment #else // ok, default alignment is 1 word #endif /* ----------------------------------------------------------- Queue query ----------------------------------------------------------- */ static inline bool mi_page_queue_is_huge(const mi_page_queue_t* pq) { return (pq->block_size == (MI_MEDIUM_OBJ_SIZE_MAX+sizeof(uintptr_t))); } static inline bool mi_page_queue_is_full(const mi_page_queue_t* pq) { return (pq->block_size == (MI_MEDIUM_OBJ_SIZE_MAX+(2*sizeof(uintptr_t)))); } static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) { return (pq->block_size > MI_MEDIUM_OBJ_SIZE_MAX); } /* ----------------------------------------------------------- Bins ----------------------------------------------------------- */ // Return the bin for a given field size. // Returns MI_BIN_HUGE if the size is too large. // We use `wsize` for the size in "machine word sizes", // i.e. byte size == `wsize*sizeof(void*)`. 
static inline size_t mi_bin(size_t size) { size_t wsize = _mi_wsize_from_size(size); #if defined(MI_ALIGN4W) if mi_likely(wsize <= 4) { return (wsize <= 1 ? 1 : (wsize+1)&~1); // round to double word sizes } #elif defined(MI_ALIGN2W) if mi_likely(wsize <= 8) { return (wsize <= 1 ? 1 : (wsize+1)&~1); // round to double word sizes } #else if mi_likely(wsize <= 8) { return (wsize == 0 ? 1 : wsize); } #endif else if mi_unlikely(wsize > MI_MEDIUM_OBJ_WSIZE_MAX) { return MI_BIN_HUGE; } else { #if defined(MI_ALIGN4W) if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes #endif wsize--; // find the highest bit const size_t b = (MI_SIZE_BITS - 1 - mi_clz(wsize)); // note: wsize != 0 // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation). // - adjust with 3 because we use do not round the first 8 sizes // which each get an exact bin const size_t bin = ((b << 2) + ((wsize >> (b - 2)) & 0x03)) - 3; mi_assert_internal(bin > 0 && bin < MI_BIN_HUGE); return bin; } } /* ----------------------------------------------------------- Queue of pages with free blocks ----------------------------------------------------------- */ size_t _mi_bin(size_t size) { return mi_bin(size); } size_t _mi_bin_size(size_t bin) { return _mi_heap_empty.pages[bin].block_size; } // Good size for allocation size_t mi_good_size(size_t size) mi_attr_noexcept { if (size <= MI_MEDIUM_OBJ_SIZE_MAX) { return _mi_bin_size(mi_bin(size + MI_PADDING_SIZE)); } else { return _mi_align_up(size + MI_PADDING_SIZE,_mi_os_page_size()); } } #if (MI_DEBUG>1) static bool mi_page_queue_contains(mi_page_queue_t* queue, const mi_page_t* page) { mi_assert_internal(page != NULL); mi_page_t* list = queue->first; while (list != NULL) { mi_assert_internal(list->next == NULL || list->next->prev == list); mi_assert_internal(list->prev == NULL || list->prev->next == list); if (list == page) break; list = list->next; } return (list == page); } #endif #if (MI_DEBUG>1) static bool 
mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t* pq) { return (pq >= &heap->pages[0] && pq <= &heap->pages[MI_BIN_FULL]); } #endif static inline bool mi_page_is_large_or_huge(const mi_page_t* page) { return (mi_page_block_size(page) > MI_MEDIUM_OBJ_SIZE_MAX || mi_page_is_huge(page)); } static size_t mi_page_bin(const mi_page_t* page) { const size_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : (mi_page_is_huge(page) ? MI_BIN_HUGE : mi_bin(mi_page_block_size(page)))); mi_assert_internal(bin <= MI_BIN_FULL); return bin; } static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) { mi_assert_internal(heap!=NULL); const size_t bin = mi_page_bin(page); mi_page_queue_t* pq = &heap->pages[bin]; mi_assert_internal((mi_page_block_size(page) == pq->block_size) || (mi_page_is_large_or_huge(page) && mi_page_queue_is_huge(pq)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(pq))); return pq; } static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) { mi_heap_t* heap = mi_page_heap(page); mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page); mi_assert_expensive(mi_page_queue_contains(pq, page)); return pq; } // The current small page array is for efficiency and for each // small size (up to 256) it points directly to the page for that // size without having to compute the bin. This means when the // current free page queue is updated for a small bin, we need to update a // range of entries in `_mi_page_small_free`. 
static inline void mi_heap_queue_first_update(mi_heap_t* heap, const mi_page_queue_t* pq) { mi_assert_internal(mi_heap_contains_queue(heap,pq)); size_t size = pq->block_size; if (size > MI_SMALL_SIZE_MAX) return; mi_page_t* page = pq->first; if (pq->first == NULL) page = (mi_page_t*)&_mi_page_empty; // find index in the right direct page array size_t start; size_t idx = _mi_wsize_from_size(size); mi_page_t** pages_free = heap->pages_free_direct; if (pages_free[idx] == page) return; // already set // find start slot if (idx<=1) { start = 0; } else { // find previous size; due to minimal alignment upto 3 previous bins may need to be skipped size_t bin = mi_bin(size); const mi_page_queue_t* prev = pq - 1; while( bin == mi_bin(prev->block_size) && prev > &heap->pages[0]) { prev--; } start = 1 + _mi_wsize_from_size(prev->block_size); if (start > idx) start = idx; } // set size range to the right page mi_assert(start <= idx); for (size_t sz = start; sz <= idx; sz++) { pages_free[sz] = page; } } /* static bool mi_page_queue_is_empty(mi_page_queue_t* queue) { return (queue->first == NULL); } */ static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(page != NULL); mi_assert_expensive(mi_page_queue_contains(queue, page)); mi_assert_internal(mi_page_block_size(page) == queue->block_size || (mi_page_is_large_or_huge(page) && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); mi_heap_t* heap = mi_page_heap(page); if (page->prev != NULL) page->prev->next = page->next; if (page->next != NULL) page->next->prev = page->prev; if (page == queue->last) queue->last = page->prev; if (page == queue->first) { queue->first = page->next; // update first mi_assert_internal(mi_heap_contains_queue(heap, queue)); mi_heap_queue_first_update(heap,queue); } heap->page_count--; page->next = NULL; page->prev = NULL; // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), NULL); 
mi_page_set_in_full(page,false); } static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(mi_page_heap(page) == heap); mi_assert_internal(!mi_page_queue_contains(queue, page)); #if MI_HUGE_PAGE_ABANDON mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE); #endif mi_assert_internal(mi_page_block_size(page) == queue->block_size || (mi_page_is_large_or_huge(page) && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); mi_page_set_in_full(page, mi_page_queue_is_full(queue)); // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), heap); page->next = queue->first; page->prev = NULL; if (queue->first != NULL) { mi_assert_internal(queue->first->prev == NULL); queue->first->prev = page; queue->first = page; } else { queue->first = queue->last = page; } // update direct mi_heap_queue_first_update(heap, queue); heap->page_count++; } static void mi_page_queue_move_to_front(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(mi_page_heap(page) == heap); mi_assert_internal(mi_page_queue_contains(queue, page)); if (queue->first == page) return; mi_page_queue_remove(queue, page); mi_page_queue_push(heap, queue, page); mi_assert_internal(queue->first == page); } static void mi_page_queue_enqueue_from_ex(mi_page_queue_t* to, mi_page_queue_t* from, bool enqueue_at_end, mi_page_t* page) { mi_assert_internal(page != NULL); mi_assert_expensive(mi_page_queue_contains(from, page)); mi_assert_expensive(!mi_page_queue_contains(to, page)); const size_t bsize = mi_page_block_size(page); MI_UNUSED(bsize); mi_assert_internal((bsize == to->block_size && bsize == from->block_size) || (bsize == to->block_size && mi_page_queue_is_full(from)) || (bsize == from->block_size && mi_page_queue_is_full(to)) || (mi_page_is_large_or_huge(page) && mi_page_queue_is_huge(to)) || (mi_page_is_large_or_huge(page) && mi_page_queue_is_full(to))); mi_heap_t* heap = 
mi_page_heap(page); // delete from `from` if (page->prev != NULL) page->prev->next = page->next; if (page->next != NULL) page->next->prev = page->prev; if (page == from->last) from->last = page->prev; if (page == from->first) { from->first = page->next; // update first mi_assert_internal(mi_heap_contains_queue(heap, from)); mi_heap_queue_first_update(heap, from); } // insert into `to` if (enqueue_at_end) { // enqueue at the end page->prev = to->last; page->next = NULL; if (to->last != NULL) { mi_assert_internal(heap == mi_page_heap(to->last)); to->last->next = page; to->last = page; } else { to->first = page; to->last = page; mi_heap_queue_first_update(heap, to); } } else { if (to->first != NULL) { // enqueue at 2nd place mi_assert_internal(heap == mi_page_heap(to->first)); mi_page_t* next = to->first->next; page->prev = to->first; page->next = next; to->first->next = page; if (next != NULL) { next->prev = page; } else { to->last = page; } } else { // enqueue at the head (singleton list) page->prev = NULL; page->next = NULL; to->first = page; to->last = page; mi_heap_queue_first_update(heap, to); } } mi_page_set_in_full(page, mi_page_queue_is_full(to)); } static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* from, mi_page_t* page) { mi_page_queue_enqueue_from_ex(to, from, true /* enqueue at the end */, page); } static void mi_page_queue_enqueue_from_full(mi_page_queue_t* to, mi_page_queue_t* from, mi_page_t* page) { // note: we could insert at the front to increase reuse, but it slows down certain benchmarks (like `alloc-test`) mi_page_queue_enqueue_from_ex(to, from, true /* enqueue at the end of the `to` queue? */, page); } // Only called from `mi_heap_absorb`. 
// Append all pages of queue `append` to queue `pq` of `heap` and return the
// number of pages that were moved (0 when `append` is empty).
// Each moved page gets `heap` as its new heap and is switched to delayed free.
// Note: the `first`/`last` fields of `append` itself are not reset here.
size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append) {
  mi_assert_internal(mi_heap_contains_queue(heap,pq));
  mi_assert_internal(pq->block_size == append->block_size);

  if (append->first==NULL) return 0;

  // set append pages to new heap and count
  size_t count = 0;
  for (mi_page_t* page = append->first; page != NULL; page = page->next) {
    // inline `mi_page_set_heap` to avoid wrong assertion during absorption;
    // in this case it is ok to be delayed freeing since both "to" and "from" heap are still alive.
    mi_atomic_store_release(&page->xheap, (uintptr_t)heap);
    // set the flag to delayed free (not overriding NEVER_DELAYED_FREE) which has as a
    // side effect that it spins until any DELAYED_FREEING is finished. This ensures
    // that after appending only the new heap will be used for delayed free operations.
    _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, false);
    count++;
  }

  if (pq->last==NULL) {
    // take over afresh
    mi_assert_internal(pq->first==NULL);
    pq->first = append->first;
    pq->last = append->last;
    // the head of `pq` changed, so the direct page array must be refreshed
    mi_heap_queue_first_update(heap, pq);
  }
  else {
    // append to end
    // (the head of `pq` is unchanged here, so no direct page array update is done)
    mi_assert_internal(pq->last!=NULL);
    mi_assert_internal(append->first!=NULL);
    pq->last->next = append->first;
    append->first->prev = pq->last;
    pq->last = append->last;
  }
  return count;
}


================================================
FILE: third-party/mimalloc/src/page.c
================================================
/*----------------------------------------------------------------------------
Copyright (c) 2018-2024, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/

/* -----------------------------------------------------------
  The core of the allocator. Every segment contains
  pages of a certain block size. The main function
  exported is `mi_malloc_generic`.
----------------------------------------------------------- */

#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "mimalloc/atomic.h"

/* -----------------------------------------------------------
  Definition of page queues for each block size
----------------------------------------------------------- */

// `page-queue.c` refuses to compile unless MI_IN_PAGE_C is defined, so it is
// textually included here rather than built as a separate translation unit.
#define MI_IN_PAGE_C
#include "page-queue.c"
#undef MI_IN_PAGE_C


/* -----------------------------------------------------------
  Page helpers
----------------------------------------------------------- */

// Index a block in a page: returns the address `i` blocks of `block_size`
// bytes past `page_start`. `page` is only used for the assertions.
static inline mi_block_t* mi_page_block_at(const mi_page_t* page, void* page_start, size_t block_size, size_t i) {
  MI_UNUSED(page);
  mi_assert_internal(page != NULL);
  mi_assert_internal(i <= page->reserved);
  return (mi_block_t*)((uint8_t*)page_start + (i * block_size));
}

// forward declarations (defined further down in this file)
static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_tld_t* tld);
static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld);

#if (MI_DEBUG>=3)
// Debug helper: number of blocks in the free list starting at `head`,
// asserting that every block belongs to `page`.
static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) {
  size_t count = 0;
  while (head != NULL) {
    mi_assert_internal(page == _mi_ptr_page(head));
    count++;
    head = mi_block_next(page, head);
  }
  return count;
}

/*
// Start of the page available memory
static inline uint8_t* mi_page_area(const mi_page_t* page) {
  return _mi_page_start(_mi_page_segment(page), page, NULL);
}
*/

// Debug helper: check that every block in the list starting at `p` lies inside
// the memory area of `page`. Returns false on the first block outside the area.
static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) {
  size_t psize;
  uint8_t* page_area = _mi_segment_page_start(_mi_page_segment(page), page, &psize);
  mi_block_t* start = (mi_block_t*)page_area;
  mi_block_t* end   = (mi_block_t*)(page_area + psize);
  while(p != NULL) {
    if (p < start || p >= end) return false;
    p = mi_block_next(page, p);
  }
#if MI_DEBUG>3 // generally too expensive to check this
  // when the page claims its free blocks are zeroed, verify the contents of
  // each block on `page->free` after its leading `mi_block_t`
  if (page->free_is_zero) {
    const size_t ubsize = mi_page_usable_block_size(page);
    for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page, block)) {
      mi_assert_expensive(mi_mem_is_zero(block + 1, ubsize - sizeof(mi_block_t)));
    }
  }
#endif
  return true;
}

// Debug helper: assert the basic invariants of an initialized page (counters,
// start address, huge flag, free list validity). Always returns true so that
// it can itself be wrapped in an assertion; failures surface via the inner asserts.
static bool mi_page_is_valid_init(mi_page_t* page) {
  mi_assert_internal(mi_page_block_size(page) > 0);
  mi_assert_internal(page->used <= page->capacity);
  mi_assert_internal(page->capacity <= page->reserved);

  uint8_t* start = mi_page_start(page);
  mi_assert_internal(start == _mi_segment_page_start(_mi_page_segment(page), page, NULL));
  mi_assert_internal(page->is_huge == (_mi_page_segment(page)->kind == MI_SEGMENT_HUGE));
  //mi_assert_internal(start + page->capacity*page->block_size == page->top);

  mi_assert_internal(mi_page_list_is_valid(page,page->free));
  mi_assert_internal(mi_page_list_is_valid(page,page->local_free));

  #if MI_DEBUG>3 // generally too expensive to check this
  if (page->free_is_zero) {
    const size_t ubsize = mi_page_usable_block_size(page);
    for(mi_block_t* block = page->free; block != NULL; block = mi_block_next(page,block)) {
      mi_assert_expensive(mi_mem_is_zero(block + 1, ubsize - sizeof(mi_block_t)));
    }
  }
  #endif

  #if !MI_TRACK_ENABLED && !MI_TSAN
  // the thread-free list is only walked when no tracking tool and no TSAN is enabled
  mi_block_t* tfree = mi_page_thread_free(page);
  mi_assert_internal(mi_page_list_is_valid(page, tfree));
  //size_t tfree_count = mi_page_list_count(page, tfree);
  //mi_assert_internal(tfree_count <= page->thread_freed + 1);
  #endif

  // every block up to `capacity` is either in use or on one of the two local free lists
  size_t free_count = mi_page_list_count(page, page->free) + mi_page_list_count(page, page->local_free);
  mi_assert_internal(page->used + free_count == page->capacity);

  return true;
}

extern bool _mi_process_is_initialized;             // has mi_process_init been called?
// Debug helper: assert the invariants of a page, including (when the page has
// a heap) that it is in the queue it is expected to be in.
// Always returns true; failures surface through the inner assertions.
bool _mi_page_is_valid(mi_page_t* page) {
  mi_assert_internal(mi_page_is_valid_init(page));
  #if MI_SECURE
  mi_assert_internal(page->keys[0] != 0);
  #endif
  if (mi_page_heap(page)!=NULL) {
    mi_segment_t* segment = _mi_page_segment(page);

    mi_assert_internal(!_mi_process_is_initialized || segment->thread_id==0 || segment->thread_id == mi_page_heap(page)->thread_id);
    #if MI_HUGE_PAGE_ABANDON
    if (segment->kind != MI_SEGMENT_HUGE)
    #endif
    {
      mi_page_queue_t* pq = mi_page_queue_of(page);
      mi_assert_internal(mi_page_queue_contains(pq, page));
      mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_MEDIUM_OBJ_SIZE_MAX || mi_page_is_in_full(page));
      mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq));
    }
  }
  return true;
}
#endif

// Set the delayed-free mode of `page` to `delay`, retrying (with a yield in
// between) until `_mi_page_try_use_delayed_free` succeeds.
void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) {
  while (!_mi_page_try_use_delayed_free(page, delay, override_never)) {
    mi_atomic_yield();
  }
}

// Try to set the delayed-free mode stored in `page->xthread_free` to `delay`.
// - While another thread is in MI_DELAYED_FREEING we yield and re-read; after
//   4 such yields we give up and return false.
// - If the mode already equals `delay`, or the mode is MI_NEVER_DELAYED_FREE
//   and `override_never` is false, nothing is written and true is returned.
// - Otherwise the new mode is installed with a CAS (retried on failure) and
//   true is returned.
bool _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) {
  mi_thread_free_t tfreex;
  mi_delayed_t     old_delay;
  mi_thread_free_t tfree;
  size_t yield_count = 0;
  do {
    tfree = mi_atomic_load_acquire(&page->xthread_free); // note: must acquire as we can break/repeat this loop and not do a CAS;
    tfreex = mi_tf_set_delayed(tfree, delay);
    old_delay = mi_tf_delayed(tfree);
    if mi_unlikely(old_delay == MI_DELAYED_FREEING) {
      if (yield_count >= 4) return false;  // give up after 4 tries
      yield_count++;
      mi_atomic_yield(); // delay until outstanding MI_DELAYED_FREEING are done.
      // tfree = mi_tf_set_delayed(tfree, MI_NO_DELAYED_FREE); // will cause CAS to busy fail
    }
    else if (delay == old_delay) {
      break; // avoid atomic operation if already equal
    }
    else if (!override_never && old_delay == MI_NEVER_DELAYED_FREE) {
      break; // leave never-delayed flag set
    }
    // the first operand short-circuits: no CAS is attempted while in MI_DELAYED_FREEING
  } while ((old_delay == MI_DELAYED_FREEING) ||
           !mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));

  return true; // success
}

/* -----------------------------------------------------------
  Page collect the `local_free` and `thread_free` lists
----------------------------------------------------------- */

// Collect the local `thread_free` list using an atomic exchange.
// Note: The exchange must be done atomically as this is used right after
// moving to the full list in `mi_page_collect_ex` and we need to
// ensure that there was no race where the page became unfull just before the move.
// The collected blocks are prepended to `page->local_free` and `page->used`
// is decreased by their number.
static void _mi_page_thread_free_collect(mi_page_t* page)
{
  mi_block_t* head;
  mi_thread_free_t tfreex;
  mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free);
  do {
    // detach the whole list: store a NULL block pointer via `mi_tf_set_block`
    head = mi_tf_block(tfree);
    tfreex = mi_tf_set_block(tfree,NULL);
  } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex));

  // return if the list is empty
  if (head == NULL) return;

  // find the tail -- also to get a proper count (without data races)
  size_t max_count = page->capacity; // cannot collect more than capacity
  size_t count = 1;
  mi_block_t* tail = head;
  mi_block_t* next;
  while ((next = mi_block_next(page,tail)) != NULL && count <= max_count) {
    count++;
    tail = next;
  }
  // if `count > max_count` there was a memory corruption (possibly infinite list due to double multi-threaded free)
  if (count > max_count) {
    _mi_error_message(EFAULT, "corrupted thread-free list\n");
    return; // the thread-free items cannot be freed
  }

  // and append the current local free list
  mi_block_set_next(page,tail, page->local_free);
  page->local_free = head;

  // update counts now
  page->used -= (uint16_t)count;
}

// Collect freed blocks of `page` into its `free` list.
// First the thread-free list is moved into `local_free`; then, if `free` is
// empty, `local_free` becomes the new `free` list. With `force`, `local_free`
// is also merged when `free` is not empty (a linear walk).
void _mi_page_free_collect(mi_page_t* page, bool force) {
  mi_assert_internal(page!=NULL);

  // collect the thread free list
  if (force || mi_page_thread_free(page) != NULL) {  // quick test to avoid an atomic operation
    _mi_page_thread_free_collect(page);
  }

  // and the local free list
  if (page->local_free != NULL) {
    if mi_likely(page->free == NULL) {
      // usual case
      page->free = page->local_free;
      page->local_free = NULL;
      page->free_is_zero = false;   // blocks on the free list are no longer assumed to be zeroed
    }
    else if (force) {
      // append -- only on shutdown (force) as this is a linear operation
      mi_block_t* tail = page->local_free;
      mi_block_t* next;
      while ((next = mi_block_next(page, tail)) != NULL) {
        tail = next;
      }
      // link the old `free` list behind `local_free` and make that the new `free` list
      mi_block_set_next(page, tail, page->free);
      page->free = page->local_free;
      page->local_free = NULL;
      page->free_is_zero = false;
    }
  }

  mi_assert_internal(!force || page->local_free == NULL);
}



/* -----------------------------------------------------------
  Page fresh and retire
----------------------------------------------------------- */

// called from segments when reclaiming abandoned pages
// Pushes `page` at the front of the queue in `heap` that matches its block size.
void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) {
  mi_assert_expensive(mi_page_is_valid_init(page));

  mi_assert_internal(mi_page_heap(page) == heap);
  mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE);
  #if MI_HUGE_PAGE_ABANDON
  mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
  #endif

  // TODO: push on full queue immediately if it is full?
  mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page));
  mi_page_queue_push(heap, pq, page);
  mi_assert_expensive(_mi_page_is_valid(page));
}

// allocate a fresh page from a segment
// Returns NULL when `_mi_segment_page_alloc` yields no page. Otherwise the page
// is initialized, counted in the heap statistics and, when `pq` is non-NULL,
// pushed at the front of `pq`.
static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size_t block_size, size_t page_alignment) {
  #if !MI_HUGE_PAGE_ABANDON
  mi_assert_internal(pq != NULL);
  mi_assert_internal(mi_heap_contains_queue(heap, pq));
  mi_assert_internal(page_alignment > 0 || block_size > MI_MEDIUM_OBJ_SIZE_MAX || block_size == pq->block_size);
  #endif
  mi_page_t* page = _mi_segment_page_alloc(heap, block_size, page_alignment, &heap->tld->segments);
  if (page == NULL) {
    // this may be out-of-memory, or an abandoned page was reclaimed (and in our queue)
    return NULL;
  }
  #if MI_HUGE_PAGE_ABANDON
  mi_assert_internal(pq==NULL || _mi_page_segment(page)->page_kind != MI_PAGE_HUGE);
  #endif
  mi_assert_internal(page_alignment >0 || block_size > MI_MEDIUM_OBJ_SIZE_MAX || _mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
  mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size);
  // a fresh page was found, initialize it
  // (without a queue, or for a huge page, the block size already recorded in the page is used)
  const size_t full_block_size = (pq == NULL || mi_page_is_huge(page) ? mi_page_block_size(page) : block_size); // see also: mi_segment_huge_page_alloc
  mi_assert_internal(full_block_size >= block_size);
  mi_page_init(heap, page, full_block_size, heap->tld);
  mi_heap_stat_increase(heap, pages, 1);
  mi_heap_stat_increase(heap, page_bins[mi_page_bin(page)], 1);
  if (pq != NULL) { mi_page_queue_push(heap, pq, page); }
  mi_assert_expensive(_mi_page_is_valid(page));
  return page;
}

// Get a fresh page to use
// (for the block size of queue `pq`; returns NULL when no page could be allocated)
static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) {
  mi_assert_internal(mi_heap_contains_queue(heap, pq));
  mi_page_t* page = mi_page_fresh_alloc(heap, pq, pq->block_size, 0);
  if (page==NULL) return NULL;
  mi_assert_internal(pq->block_size==mi_page_block_size(page));
  mi_assert_internal(pq==mi_page_queue(heap, mi_page_block_size(page)));
  return page;
}

/* -----------------------------------------------------------
   Do any delayed frees
   (put there by other threads if they deallocated in a full page)
----------------------------------------------------------- */
// Process the delayed free list of `heap` until a pass frees every block,
// yielding between passes.
void _mi_heap_delayed_free_all(mi_heap_t* heap) {
  while (!_mi_heap_delayed_free_partial(heap)) {
    mi_atomic_yield();
  }
}

// returns true if all delayed frees were processed
// Blocks that could not be freed yet are pushed back onto
// `heap->thread_delayed_free` for a later pass.
bool _mi_heap_delayed_free_partial(mi_heap_t* heap) {
  // take over the list (note: no atomic exchange since it is often NULL)
  mi_block_t* block = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
  while (block != NULL && !mi_atomic_cas_ptr_weak_acq_rel(mi_block_t, &heap->thread_delayed_free, &block, NULL)) { /* nothing */ };
  bool all_freed = true;

  // and free them all
  while(block != NULL) {
    // read the successor first: the block is either freed or relinked below
    mi_block_t* next = mi_block_nextx(heap,block, heap->keys);
    // use internal free instead of regular one to keep stats etc correct
    if (!_mi_free_delayed_block(block)) {
      // we might already start delayed freeing while another thread has not yet
      // reset the delayed_freeing flag; in that case delay it further by reinserting the current block
      // into the delayed free list
      all_freed = false;
      mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
      do {
        mi_block_set_nextx(heap, block, dfree, heap->keys);
      } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block));
    }
    block = next;
  }
  return all_freed;
}

/* -----------------------------------------------------------
  Unfull, abandon, free and retire
----------------------------------------------------------- */

// Move a page from the full list back to a regular list
void _mi_page_unfull(mi_page_t* page) {
  mi_assert_internal(page != NULL);
  mi_assert_expensive(_mi_page_is_valid(page));
  mi_assert_internal(mi_page_is_in_full(page));
  if (!mi_page_is_in_full(page)) return;

  mi_heap_t* heap = mi_page_heap(page);
  mi_page_queue_t* pqfull = &heap->pages[MI_BIN_FULL];
  // the queue lookup keys on the in-full flag, so clear it temporarily to find
  // the regular queue and restore it before the actual move
  mi_page_set_in_full(page, false); // to get the right queue
  mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page);
  mi_page_set_in_full(page, true);
  mi_page_queue_enqueue_from_full(pq, pqfull, page);
}

// Move `page` from its regular queue `pq` to the full queue of its heap
// (no-op when the page is already flagged in-full).
static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) {
  mi_assert_internal(pq == mi_page_queue_of(page));
  mi_assert_internal(!mi_page_immediate_available(page));
  mi_assert_internal(!mi_page_is_in_full(page));

  if (mi_page_is_in_full(page)) return;
  mi_page_queue_enqueue_from(&mi_page_heap(page)->pages[MI_BIN_FULL], pq, page);
  _mi_page_free_collect(page,false);  // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set
}


// Abandon a page with used blocks at the end of a thread.
// Note: only call if it is ensured that no references exist from
// the `page->heap->thread_delayed_free` into this page.
// Currently only called through `mi_heap_collect_ex` which ensures this.
void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { mi_assert_internal(page != NULL); mi_assert_expensive(_mi_page_is_valid(page)); mi_assert_internal(pq == mi_page_queue_of(page)); mi_assert_internal(mi_page_heap(page) != NULL); mi_heap_t* pheap = mi_page_heap(page); // remove from our page list mi_segments_tld_t* segments_tld = &pheap->tld->segments; mi_page_queue_remove(pq, page); // page is no longer associated with our heap mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE); mi_page_set_heap(page, NULL); #if (MI_DEBUG>1) && !MI_TRACK_ENABLED // check there are no references left.. for (mi_block_t* block = (mi_block_t*)pheap->thread_delayed_free; block != NULL; block = mi_block_nextx(pheap, block, pheap->keys)) { mi_assert_internal(_mi_ptr_page(block) != page); } #endif // and abandon it mi_assert_internal(mi_page_heap(page) == NULL); _mi_segment_page_abandon(page,segments_tld); } // force abandon a page void _mi_page_force_abandon(mi_page_t* page) { mi_heap_t* heap = mi_page_heap(page); // mark page as not using delayed free _mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false); // ensure this page is no longer in the heap delayed free list _mi_heap_delayed_free_all(heap); // We can still access the page meta-info even if it is freed as we ensure // in `mi_segment_force_abandon` that the segment is not freed (yet) if (page->capacity == 0) return; // it may have been freed now // and now unlink it from the page queue and abandon (or free) mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page); if (mi_page_all_free(page)) { _mi_page_free(page, pq, false); } else { _mi_page_abandon(page, pq); } } // Free a page with no more free blocks void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { mi_assert_internal(page != NULL); mi_assert_expensive(_mi_page_is_valid(page)); mi_assert_internal(pq == mi_page_queue_of(page)); mi_assert_internal(mi_page_all_free(page)); 
mi_assert_internal(mi_page_thread_free_flag(page)!=MI_DELAYED_FREEING); // no more aligned blocks in here mi_page_set_has_aligned(page, false); // remove from the page list // (no need to do _mi_heap_delayed_free first as all blocks are already free) mi_heap_t* heap = mi_page_heap(page); mi_segments_tld_t* segments_tld = &heap->tld->segments; mi_page_queue_remove(pq, page); // and free it mi_heap_stat_decrease(heap, page_bins[mi_page_bin(page)], 1); mi_page_set_heap(page,NULL); _mi_segment_page_free(page, force, segments_tld); } #define MI_MAX_RETIRE_SIZE MI_MEDIUM_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE #define MI_RETIRE_CYCLES (16) // Retire a page with no more used blocks // Important to not retire too quickly though as new // allocations might coming. // Note: called from `mi_free` and benchmarks often // trigger this due to freeing everything and then // allocating again so careful when changing this. void _mi_page_retire(mi_page_t* page) mi_attr_noexcept { mi_assert_internal(page != NULL); mi_assert_expensive(_mi_page_is_valid(page)); mi_assert_internal(mi_page_all_free(page)); mi_page_set_has_aligned(page, false); // don't retire too often.. // (or we end up retiring and re-allocating most of the time) // NOTE: refine this more: we should not retire if this // is the only page left with free blocks. It is not clear // how to check this efficiently though... // for now, we don't retire if it is the only page left of this size class. mi_page_queue_t* pq = mi_page_queue_of(page); #if MI_RETIRE_CYCLES > 0 const size_t bsize = mi_page_block_size(page); if mi_likely( /* bsize < MI_MAX_RETIRE_SIZE && */ !mi_page_queue_is_special(pq)) { // not full or huge queue? if (pq->last==page && pq->first==page) { // the only page in the queue? mi_stat_counter_increase(_mi_stats_main.pages_retire,1); page->retire_expire = (bsize <= MI_SMALL_OBJ_SIZE_MAX ? 
MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); mi_heap_t* heap = mi_page_heap(page); mi_assert_internal(pq >= heap->pages); const size_t index = pq - heap->pages; mi_assert_internal(index < MI_BIN_FULL && index < MI_BIN_HUGE); if (index < heap->page_retired_min) heap->page_retired_min = index; if (index > heap->page_retired_max) heap->page_retired_max = index; mi_assert_internal(mi_page_all_free(page)); return; // don't free after all } } #endif _mi_page_free(page, pq, false); } // free retired pages: we don't need to look at the entire queues // since we only retire pages that are at the head position in a queue. void _mi_heap_collect_retired(mi_heap_t* heap, bool force) { size_t min = MI_BIN_FULL; size_t max = 0; for(size_t bin = heap->page_retired_min; bin <= heap->page_retired_max; bin++) { mi_page_queue_t* pq = &heap->pages[bin]; mi_page_t* page = pq->first; if (page != NULL && page->retire_expire != 0) { if (mi_page_all_free(page)) { page->retire_expire--; if (force || page->retire_expire == 0) { _mi_page_free(pq->first, pq, force); } else { // keep retired, update min/max if (bin < min) min = bin; if (bin > max) max = bin; } } else { page->retire_expire = 0; } } } heap->page_retired_min = min; heap->page_retired_max = max; } /* ----------------------------------------------------------- Initialize the initial free list in a page. In secure mode we initialize a randomized list by alternating between slices. 
----------------------------------------------------------- */

#define MI_MAX_SLICE_SHIFT  (6)   // at most 64 slices
#define MI_MAX_SLICES       (1UL << MI_MAX_SLICE_SHIFT)
#define MI_MIN_SLICES       (2)

// Extend the free list of `page` with `extend` blocks of `bsize` bytes, linked
// in a randomized order: the new area is divided into a power-of-two number of
// slices and the list is threaded through them by hopping to a randomly chosen
// slice for each next block. The result is prepended to `page->free`.
// `stats` is unused. `page->capacity` is not updated here.
static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats) {
  MI_UNUSED(stats);
  #if (MI_SECURE<=2)
  mi_assert_internal(page->free == NULL);
  mi_assert_internal(page->local_free == NULL);
  #endif
  mi_assert_internal(page->capacity + extend <= page->reserved);
  mi_assert_internal(bsize == mi_page_block_size(page));
  void* const page_area = mi_page_start(page);

  // initialize a randomized free list
  // set up `slice_count` slices to alternate between
  // (the largest power of two that is <= `extend`, capped at MI_MAX_SLICES;
  //  note: the loop relies on `extend` being non-zero)
  size_t shift = MI_MAX_SLICE_SHIFT;
  while ((extend >> shift) == 0) {
    shift--;
  }
  const size_t slice_count = (size_t)1U << shift;
  const size_t slice_extend = extend / slice_count;
  mi_assert_internal(slice_extend >= 1);
  mi_block_t* blocks[MI_MAX_SLICES];   // current start of the slice
  size_t      counts[MI_MAX_SLICES];   // available objects in the slice
  for (size_t i = 0; i < slice_count; i++) {
    blocks[i] = mi_page_block_at(page, page_area, bsize, page->capacity + i*slice_extend);
    counts[i] = slice_extend;
  }
  counts[slice_count-1] += (extend % slice_count);  // final slice holds the modulus too (todo: distribute evenly?)

  // and initialize the free list by randomly threading through them
  // set up first element
  const uintptr_t r = _mi_heap_random_next(heap);
  size_t current = r % slice_count;
  counts[current]--;
  mi_block_t* const free_start = blocks[current];
  // and iterate through the rest; use `random_shuffle` for performance
  uintptr_t rnd = _mi_random_shuffle(r|1); // ensure not 0
  for (size_t i = 1; i < extend; i++) {
    // call random_shuffle only every INTPTR_SIZE rounds
    // (in between, successive bytes of `rnd` are consumed)
    const size_t round = i%MI_INTPTR_SIZE;
    if (round == 0) rnd = _mi_random_shuffle(rnd);
    // select a random next slice index
    size_t next = ((rnd >> 8*round) & (slice_count-1));
    while (counts[next]==0) {                            // ensure it still has space
      next++;
      if (next==slice_count) next = 0;
    }
    // and link the current block to it
    counts[next]--;
    mi_block_t* const block = blocks[current];
    blocks[current] = (mi_block_t*)((uint8_t*)block + bsize);  // bump to the following block
    mi_block_set_next(page, block, blocks[next]);   // and set next; note: we may have `current == next`
    current = next;
  }
  // prepend to the free list (usually NULL)
  mi_block_set_next(page, blocks[current], page->free);  // end of the list
  page->free = free_start;
}

// Extend the free list of `page` with `extend` blocks of `bsize` bytes, linked
// sequentially in address order, and prepend the result to `page->free`.
// `stats` is unused. `page->capacity` is not updated here.
static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats)
{
  MI_UNUSED(stats);
  #if (MI_SECURE <= 2)
  mi_assert_internal(page->free == NULL);
  mi_assert_internal(page->local_free == NULL);
  #endif
  mi_assert_internal(page->capacity + extend <= page->reserved);
  mi_assert_internal(bsize == mi_page_block_size(page));
  void* const page_area = mi_page_start(page);

  mi_block_t* const start = mi_page_block_at(page, page_area, bsize, page->capacity);

  // initialize a sequential free list
  mi_block_t* const last = mi_page_block_at(page, page_area, bsize, page->capacity + extend - 1);
  mi_block_t* block = start;
  // note: this also links `last` to the address just past it; that link is
  // overwritten right after the loop
  while(block <= last) {
    mi_block_t* next = (mi_block_t*)((uint8_t*)block + bsize);
    mi_block_set_next(page,block,next);
    block = next;
  }
  // prepend to free list (usually `NULL`)
  mi_block_set_next(page, last, page->free);
  page->free = start;
}

/* -----------------------------------------------------------
  Page initialize and extend the capacity
----------------------------------------------------------- */

#define MI_MAX_EXTEND_SIZE    (4*1024)      // heuristic, one OS page seems to work well.
#if (MI_SECURE>0)
#define MI_MIN_EXTEND         (8*MI_SECURE) // extend at least by this many
#else
#define MI_MIN_EXTEND         (4)
#endif

// Extend the capacity (up to reserved) by initializing a free list
// We do at most `MI_MAX_EXTEND` to avoid touching too much memory
// Note: we also experimented with "bump" allocation on the first
// allocations but this did not speed up any benchmark (due to an
// extra test in malloc? or cache effects?)
//
// The number of blocks added is limited to about MI_MAX_EXTEND_SIZE bytes worth
// of blocks, but at least MI_MIN_EXTEND blocks, and never beyond `page->reserved`.
// Returns without doing anything when the page is already at its reserved
// capacity (or, for MI_SECURE<=2, when `page->free` is non-empty).
static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) {
  MI_UNUSED(tld);
  mi_assert_expensive(mi_page_is_valid_init(page));
  #if (MI_SECURE<=2)
  mi_assert(page->free == NULL);
  mi_assert(page->local_free == NULL);
  if (page->free != NULL) return;
  #endif
  if (page->capacity >= page->reserved) return;

  mi_stat_counter_increase(tld->stats.pages_extended, 1);

  // calculate the extend count
  const size_t bsize = mi_page_block_size(page);
  size_t extend = page->reserved - page->capacity;
  mi_assert_internal(extend > 0);

  size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/bsize);
  if (max_extend < MI_MIN_EXTEND) { max_extend = MI_MIN_EXTEND; }
  mi_assert_internal(max_extend > 0);

  if (extend > max_extend) {
    // ensure we don't touch memory beyond the page to reduce page commit.
    // the `lean` benchmark tests this. Going from 1 to 8 increases rss by 50%.
    extend = max_extend;
  }

  mi_assert_internal(extend > 0 && extend + page->capacity <= page->reserved);
  mi_assert_internal(extend < (1UL<<16));

  // and append the extension to the free list
  // (sequential when MI_SECURE is 0 or there are too few blocks to slice; randomized otherwise)
  if (extend < MI_MIN_SLICES || MI_SECURE==0) { //!mi_option_is_enabled(mi_option_secure)) {
    mi_page_free_list_extend(page, bsize, extend, &tld->stats );
  }
  else {
    mi_page_free_list_extend_secure(heap, page, bsize, extend, &tld->stats);
  }
  // enable the new free list
  page->capacity += (uint16_t)extend;
  mi_stat_increase(tld->stats.page_committed, extend * bsize);
  mi_assert_expensive(mi_page_is_valid_init(page));
}

// Initialize a fresh page: associate it with `heap`, set its block size, start
// address, reserved block count, keys (when padding or free-list encoding is
// enabled) and block-size shift, and then build an initial free list.
static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi_tld_t* tld) {
  mi_assert(page != NULL);
  mi_segment_t* segment = _mi_page_segment(page);
  mi_assert(segment != NULL);
  mi_assert_internal(block_size > 0);
  // set fields
  mi_page_set_heap(page, heap);
  page->block_size = block_size;
  size_t page_size;
  page->page_start = _mi_segment_page_start(segment, page, &page_size);
  mi_track_mem_noaccess(page->page_start,page_size);
  mi_assert_internal(mi_page_block_size(page) <= page_size);
  mi_assert_internal(page_size <= page->slice_count*MI_SEGMENT_SLICE_SIZE);
  mi_assert_internal(page_size / block_size < (1L<<16));
  // number of blocks that fit in the page (stored as a 16-bit count)
  page->reserved = (uint16_t)(page_size / block_size);
  mi_assert_internal(page->reserved > 0);
  #if (MI_PADDING || MI_ENCODE_FREELIST)
  page->keys[0] = _mi_heap_random_next(heap);
  page->keys[1] = _mi_heap_random_next(heap);
  #endif
  page->free_is_zero = page->is_zero_init;
  #if MI_DEBUG>2
  if (page->is_zero_init) {
    mi_track_mem_defined(page->page_start, page_size);
    mi_assert_expensive(mi_mem_is_zero(page->page_start, page_size));
  }
  #endif
  mi_assert_internal(page->is_committed);
  // record log2(block_size) for power-of-two sizes; 0 otherwise
  if (block_size > 0 && _mi_is_power_of_two(block_size)) {
    page->block_size_shift = (uint8_t)(mi_ctz((uintptr_t)block_size));
  }
  else {
    page->block_size_shift = 0;
  }

  // a fresh page is expected to be otherwise in its zero state
  mi_assert_internal(page->capacity == 0);
  mi_assert_internal(page->free == NULL);
  mi_assert_internal(page->used == 0);
  mi_assert_internal(page->xthread_free == 0);
  mi_assert_internal(page->next == NULL);
  mi_assert_internal(page->prev == NULL);
  mi_assert_internal(page->retire_expire == 0);
  mi_assert_internal(!mi_page_has_aligned(page));
  #if (MI_PADDING || MI_ENCODE_FREELIST)
  mi_assert_internal(page->keys[0] != 0);
  mi_assert_internal(page->keys[1] != 0);
  #endif
  mi_assert_internal(page->block_size_shift == 0 || (block_size == ((size_t)1 << page->block_size_shift)));
  mi_assert_expensive(mi_page_is_valid_init(page));

  // initialize an initial free list
  mi_page_extend_free(heap,page,tld);
  mi_assert(mi_page_immediate_available(page));
}


/* -----------------------------------------------------------
  Find pages with free blocks
-------------------------------------------------------------*/

// search for a best next page to use for at most N pages (often cut short if immediate blocks are available)
#define MI_MAX_CANDIDATE_SEARCH  (4)

// is the page not yet used up to its reserved space?
static bool mi_page_is_expandable(const mi_page_t* page) {
  mi_assert_internal(page != NULL);
  mi_assert_internal(page->capacity <= page->reserved);
  return (page->capacity < page->reserved);
}


// Find a page with free blocks of `page->block_size`.
// Walks queue `pq`, collecting freed blocks in each page and moving pages that
// are completely full to the full queue. If no usable page is found, retired
// pages are collected and a fresh page is allocated; when that fails and
// `first_try` is set, the search is repeated once.
// Returns a page with immediately available blocks, or NULL.
static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try)
{
  // search through the pages in "next fit" order
  #if MI_STAT
  size_t count = 0;
  #endif
  size_t candidate_count = 0;        // we reset this on the first candidate to limit the search
  mi_page_t* page_candidate = NULL;  // a page with free space
  mi_page_t* page = pq->first;

  while (page != NULL)
  {
    mi_page_t* next = page->next; // remember next (the page may be moved to the full queue below)
    #if MI_STAT
    count++;
    #endif
    candidate_count++;

    // collect freed blocks by us and other threads
    _mi_page_free_collect(page, false);

  #if MI_MAX_CANDIDATE_SEARCH > 1
    // search up to N pages for a best candidate

    // is the local free list non-empty?
    const bool immediate_available = mi_page_immediate_available(page);

    // if the page is completely full, move it to the `mi_pages_full`
    // queue so we don't visit long-lived pages too often.
    if (!immediate_available && !mi_page_is_expandable(page)) {
      mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page));
      mi_page_to_full(page, pq);
    }
    else {
      // the page has free space, make it a candidate
      // we prefer non-expandable pages with high usage as candidates (to reduce commit, and increase chances of free-ing up pages)
      if (page_candidate == NULL) {
        page_candidate = page;
        candidate_count = 0;
      }
      // prefer to reuse fuller pages (in the hope the less used page gets freed)
      else if (page->used >= page_candidate->used && !mi_page_is_mostly_used(page) && !mi_page_is_expandable(page)) {
        page_candidate = page;
      }
      // stop if this page has blocks immediately available, or if we searched more
      // than N pages since the first candidate; continue with the best candidate
      if (immediate_available || candidate_count > MI_MAX_CANDIDATE_SEARCH) {
        mi_assert_internal(page_candidate!=NULL);
        break;
      }
    }
  #else
    // first-fit algorithm
    // If the page contains free blocks, we are done
    if (mi_page_immediate_available(page) || mi_page_is_expandable(page)) {
      break;  // pick this one
    }

    // If the page is completely full, move it to the `mi_pages_full`
    // queue so we don't visit long-lived pages too often.
    mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page));
    mi_page_to_full(page, pq);
  #endif

    page = next;
  } // for each page

  mi_heap_stat_counter_increase(heap, page_searches, count);

  // set the page to the best candidate
  if (page_candidate != NULL) {
    page = page_candidate;
  }
  // the chosen page may have no free blocks yet but still be expandable: extend it now
  if (page != NULL && !mi_page_immediate_available(page)) {
    mi_assert_internal(mi_page_is_expandable(page));
    mi_page_extend_free(heap, page, heap->tld);
  }

  if (page == NULL) {
    _mi_heap_collect_retired(heap, false); // perhaps make a page available?
    page = mi_page_fresh(heap, pq);
    if (page == NULL && first_try) {
      // out-of-memory _or_ an abandoned page with free blocks was reclaimed, try once again
      page = mi_page_queue_find_free_ex(heap, pq, false);
    }
  }
  else {
    // move the page to the front of the queue
    mi_page_queue_move_to_front(heap, pq, page);
    page->retire_expire = 0;  // the page is in use again: cancel any pending retirement
    // _mi_heap_collect_retired(heap, false); // update retire counts; note: increases rss on MemoryLoad bench so don't do this
  }
  mi_assert_internal(page == NULL || mi_page_immediate_available(page));

  return page;
}


// Find a page with free blocks of `size`.
// Fast path: use the first page of the matching queue if it has (or, after
// collecting freed blocks, gets) immediately available blocks; otherwise fall
// back to the full queue search.
static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) {
  mi_page_queue_t* pq = mi_page_queue(heap, size);

  // check the first page: we even do this with candidate search or otherwise we re-search every time
  mi_page_t* page = pq->first;
  if (page != NULL) {
   #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness
    if (page->capacity < page->reserved && ((_mi_heap_random_next(heap) & 1) == 1)) {
      mi_page_extend_free(heap, page, heap->tld);
      mi_assert_internal(mi_page_immediate_available(page));
    }
    else
   #endif
    {
      _mi_page_free_collect(page,false);
    }

    if (mi_page_immediate_available(page)) {
      page->retire_expire = 0;
      return page; // fast path
    }
  }

  return mi_page_queue_find_free_ex(heap, pq, true);
}


/* -----------------------------------------------------------
  Users can register a deferred free function called
  when the `free` list is empty. Since the `local_free`
  is separate this is deterministically called after
  a certain number of allocations.
----------------------------------------------------------- */ static mi_deferred_free_fun* volatile deferred_free = NULL; static _Atomic(void*) deferred_arg; // = NULL void _mi_deferred_free(mi_heap_t* heap, bool force) { heap->tld->heartbeat++; if (deferred_free != NULL && !heap->tld->recurse) { heap->tld->recurse = true; deferred_free(force, heap->tld->heartbeat, mi_atomic_load_ptr_relaxed(void,&deferred_arg)); heap->tld->recurse = false; } } void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noexcept { deferred_free = fn; mi_atomic_store_ptr_release(void,&deferred_arg, arg); } /* ----------------------------------------------------------- General allocation ----------------------------------------------------------- */ // Large and huge page allocation. // Huge pages contain just one block, and the segment contains just that page (as `MI_SEGMENT_HUGE`). // Huge pages are also use if the requested alignment is very large (> MI_BLOCK_ALIGNMENT_MAX) // so their size is not always `> MI_LARGE_OBJ_SIZE_MAX`. static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment) { size_t block_size = _mi_os_good_alloc_size(size); mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0); bool is_huge = (block_size > MI_LARGE_OBJ_SIZE_MAX || page_alignment > 0); #if MI_HUGE_PAGE_ABANDON mi_page_queue_t* pq = (is_huge ? NULL : mi_page_queue(heap, block_size)); #else mi_page_queue_t* pq = mi_page_queue(heap, is_huge ? 
MI_LARGE_OBJ_SIZE_MAX+1 : block_size); mi_assert_internal(!is_huge || mi_page_queue_is_huge(pq)); #endif mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size, page_alignment); if (page != NULL) { mi_assert_internal(mi_page_immediate_available(page)); if (is_huge) { mi_assert_internal(mi_page_is_huge(page)); mi_assert_internal(_mi_page_segment(page)->kind == MI_SEGMENT_HUGE); mi_assert_internal(_mi_page_segment(page)->used==1); #if MI_HUGE_PAGE_ABANDON mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue mi_page_set_heap(page, NULL); #endif } else { mi_assert_internal(!mi_page_is_huge(page)); } const size_t bsize = mi_page_usable_block_size(page); // note: not `mi_page_block_size` to account for padding /*if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { mi_heap_stat_increase(heap, malloc_large, bsize); mi_heap_stat_counter_increase(heap, malloc_large_count, 1); } else */ { _mi_stat_increase(&heap->tld->stats.malloc_huge, bsize); _mi_stat_counter_increase(&heap->tld->stats.malloc_huge_count, 1); } } return page; } // Allocate a page // Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed. static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignment) mi_attr_noexcept { // huge allocation? const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` if mi_unlikely(req_size > (MI_MEDIUM_OBJ_SIZE_MAX - MI_PADDING_SIZE) || huge_alignment > 0) { if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) { _mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size); return NULL; } else { return mi_large_huge_page_alloc(heap,size,huge_alignment); } } else { // otherwise find a page with free blocks in our size segregated queues #if MI_PADDING mi_assert_internal(size >= MI_PADDING_SIZE); #endif return mi_find_free_page(heap, size); } } // Generic allocation routine if the fast path (`alloc.c:mi_page_malloc`) does not succeed. 
// Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed. // The `huge_alignment` is normally 0 but is set to a multiple of MI_SEGMENT_SIZE for // very large requested alignments in which case we use a huge segment. void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept { mi_assert_internal(heap != NULL); // initialize if necessary if mi_unlikely(!mi_heap_is_initialized(heap)) { heap = mi_heap_get_default(); // calls mi_thread_init if mi_unlikely(!mi_heap_is_initialized(heap)) { return NULL; } } mi_assert_internal(mi_heap_is_initialized(heap)); // do administrative tasks every N generic mallocs if mi_unlikely(++heap->generic_count >= 100) { heap->generic_collect_count += heap->generic_count; heap->generic_count = 0; // call potential deferred free routines _mi_deferred_free(heap, false); // free delayed frees from other threads (but skip contended ones) _mi_heap_delayed_free_partial(heap); // collect every once in a while (10000 by default) const long generic_collect = mi_option_get_clamp(mi_option_generic_collect, 1, 1000000L); if (heap->generic_collect_count >= generic_collect) { heap->generic_collect_count = 0; mi_heap_collect(heap, false /* force? */); } } // find (or allocate) a page of the right size mi_page_t* page = mi_find_page(heap, size, huge_alignment); if mi_unlikely(page == NULL) { // first time out of memory, try to collect and retry the allocation once more mi_heap_collect(heap, true /* force */); page = mi_find_page(heap, size, huge_alignment); } if mi_unlikely(page == NULL) { // out of memory const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` _mi_error_message(ENOMEM, "unable to allocate memory (%zu bytes)\n", req_size); return NULL; } mi_assert_internal(mi_page_immediate_available(page)); mi_assert_internal(mi_page_block_size(page) >= size); // and try again, this time succeeding! (i.e. 
this should never recurse through _mi_page_malloc) void* p; if mi_unlikely(zero && mi_page_is_huge(page)) { // note: we cannot call _mi_page_malloc with zeroing for huge blocks; we zero it afterwards in that case. p = _mi_page_malloc(heap, page, size); mi_assert_internal(p != NULL); _mi_memzero_aligned(p, mi_page_usable_block_size(page)); } else { p = _mi_page_malloc_zero(heap, page, size, zero); mi_assert_internal(p != NULL); } // move singleton pages to the full queue if (page->reserved == page->used) { mi_page_to_full(page, mi_page_queue_of(page)); } return p; } ================================================ FILE: third-party/mimalloc/src/prim/emscripten/prim.c ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2023, Microsoft Research, Daan Leijen, Alon Zakai This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ // This file is included in `src/prim/prim.c` #include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/atomic.h" #include "mimalloc/prim.h" // Design // ====== // // mimalloc is built on top of emmalloc. emmalloc is a minimal allocator on top // of sbrk. The reason for having three layers here is that we want mimalloc to // be able to allocate and release system memory properly, the same way it would // when using VirtualAlloc on Windows or mmap on POSIX, and sbrk is too limited. // Specifically, sbrk can only go up and down, and not "skip" over regions, and // so we end up either never freeing memory to the system, or we can get stuck // with holes. // // Atm wasm generally does *not* free memory back the system: once grown, we do // not shrink back down (https://github.com/WebAssembly/design/issues/1397). 
// However, that is expected to improve // (https://github.com/WebAssembly/memory-control/blob/main/proposals/memory-control/Overview.md) // and so we do not want to bake those limitations in here. // // Even without that issue, we want our system allocator to handle holes, that // is, it should merge freed regions and allow allocating new content there of // the full size, etc., so that we do not waste space. That means that the // system allocator really does need to handle the general problem of allocating // and freeing variable-sized chunks of memory in a random order, like malloc/ // free do. And so it makes sense to layer mimalloc on top of such an // implementation. // // emmalloc makes sense for the lower level because it is small and simple while // still fully handling merging of holes etc. It is not the most efficient // allocator, but our assumption is that mimalloc needs to be fast while the // system allocator underneath it is called much less frequently. // //--------------------------------------------- // init //--------------------------------------------- void _mi_prim_mem_init( mi_os_mem_config_t* config) { config->page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB config->alloc_granularity = 16; config->has_overcommit = false; config->has_partial_free = false; config->has_virtual_reserve = false; } extern void emmalloc_free(void*); int _mi_prim_free(void* addr, size_t size) { MI_UNUSED(size); emmalloc_free(addr); return 0; } //--------------------------------------------- // Allocation //--------------------------------------------- extern void* emmalloc_memalign(size_t alignment, size_t size); // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. 
int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { MI_UNUSED(try_alignment); MI_UNUSED(allow_large); MI_UNUSED(commit); MI_UNUSED(hint_addr); *is_large = false; // TODO: Track the highest address ever seen; first uses of it are zeroes. // That assumes no one else uses sbrk but us (they could go up, // scribble, and then down), but we could assert on that perhaps. *is_zero = false; // emmalloc has a minimum alignment size. #define MIN_EMMALLOC_ALIGN 8 if (try_alignment < MIN_EMMALLOC_ALIGN) { try_alignment = MIN_EMMALLOC_ALIGN; } void* p = emmalloc_memalign(try_alignment, size); *addr = p; if (p == 0) { return ENOMEM; } return 0; } //--------------------------------------------- // Commit/Reset //--------------------------------------------- int _mi_prim_commit(void* addr, size_t size, bool* is_zero) { MI_UNUSED(addr); MI_UNUSED(size); // See TODO above. *is_zero = false; return 0; } int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) { MI_UNUSED(addr); MI_UNUSED(size); *needs_recommit = false; return 0; } int _mi_prim_reset(void* addr, size_t size) { MI_UNUSED(addr); MI_UNUSED(size); return 0; } int _mi_prim_protect(void* addr, size_t size, bool protect) { MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(protect); return 0; } //--------------------------------------------- // Huge pages and NUMA nodes //--------------------------------------------- int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) { MI_UNUSED(hint_addr); MI_UNUSED(size); MI_UNUSED(numa_node); *is_zero = true; *addr = NULL; return ENOSYS; } size_t _mi_prim_numa_node(void) { return 0; } size_t _mi_prim_numa_node_count(void) { return 1; } //---------------------------------------------------------------- // Clock //---------------------------------------------------------------- #include mi_msecs_t _mi_prim_clock_now(void) { return 
emscripten_date_now(); } //---------------------------------------------------------------- // Process info //---------------------------------------------------------------- void _mi_prim_process_info(mi_process_info_t* pinfo) { // use defaults MI_UNUSED(pinfo); } //---------------------------------------------------------------- // Output //---------------------------------------------------------------- #include void _mi_prim_out_stderr( const char* msg) { emscripten_console_error(msg); } //---------------------------------------------------------------- // Environment //---------------------------------------------------------------- bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { // For code size reasons, do not support environ customization for now. MI_UNUSED(name); MI_UNUSED(result); MI_UNUSED(result_size); return false; } //---------------------------------------------------------------- // Random //---------------------------------------------------------------- bool _mi_prim_random_buf(void* buf, size_t buf_len) { int err = getentropy(buf, buf_len); return !err; } //---------------------------------------------------------------- // Thread init/done //---------------------------------------------------------------- #if defined(MI_USE_PTHREADS) // use pthread local storage keys to detect thread ending // (and used with MI_TLS_PTHREADS for the default heap) pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1); static void mi_pthread_done(void* value) { if (value!=NULL) { _mi_thread_done((mi_heap_t*)value); } } void _mi_prim_thread_init_auto_done(void) { mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1)); pthread_key_create(&_mi_heap_default_key, &mi_pthread_done); } void _mi_prim_thread_done_auto_done(void) { // nothing to do } void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { if (_mi_heap_default_key != (pthread_key_t)(-1)) { // can happen during recursive invocation on freeBSD 
pthread_setspecific(_mi_heap_default_key, heap); } } #else void _mi_prim_thread_init_auto_done(void) { // nothing } void _mi_prim_thread_done_auto_done(void) { // nothing } void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { MI_UNUSED(heap); } #endif ================================================ FILE: third-party/mimalloc/src/prim/osx/alloc-override-zone.c ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2022, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #include "mimalloc.h" #include "mimalloc/internal.h" #if defined(MI_MALLOC_OVERRIDE) #if !defined(__APPLE__) #error "this file should only be included on macOS" #endif /* ------------------------------------------------------ Override system malloc on macOS This is done through the malloc zone interface. It seems to be most robust in combination with interposing though or otherwise we may get zone errors as there are could be allocations done by the time we take over the zone. 
------------------------------------------------------ */

// NOTE(review): the header names of the four includes below are missing in this
// copy of the file (only the trailing "memset" comment survived) -- restore them
// from the upstream mimalloc sources.
#include
#include
#include // memset
#include

#ifdef __cplusplus
extern "C" {
#endif

#if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6)
// only available from OSX 10.6
extern malloc_zone_t* malloc_default_purgeable_zone(void) __attribute__((weak_import));
#endif

/* ------------------------------------------------------
   malloc zone members
------------------------------------------------------ */

// Usable size of `p`, or 0 when `p` is not inside a mimalloc heap region.
static size_t zone_size(malloc_zone_t* zone, const void* p) {
  MI_UNUSED(zone);
  if (!mi_is_in_heap_region(p)){ return 0; } // not our pointer, bail out
  return mi_usable_size(p);
}

// The allocation members below ignore `zone` and forward to the matching mi_ function.
static void* zone_malloc(malloc_zone_t* zone, size_t size) {
  MI_UNUSED(zone);
  return mi_malloc(size);
}

static void* zone_calloc(malloc_zone_t* zone, size_t count, size_t size) {
  MI_UNUSED(zone);
  return mi_calloc(count, size);
}

// Allocation aligned to the OS page size.
static void* zone_valloc(malloc_zone_t* zone, size_t size) {
  MI_UNUSED(zone);
  return mi_malloc_aligned(size, _mi_os_page_size());
}

static void zone_free(malloc_zone_t* zone, void* p) {
  MI_UNUSED(zone);
  mi_cfree(p);
}

static void* zone_realloc(malloc_zone_t* zone, void* p, size_t newsize) {
  MI_UNUSED(zone);
  return mi_realloc(p, newsize);
}

static void* zone_memalign(malloc_zone_t* zone, size_t alignment, size_t size) {
  MI_UNUSED(zone);
  return mi_malloc_aligned(size,alignment);
}

static void zone_destroy(malloc_zone_t* zone) {
  MI_UNUSED(zone);
  // todo: ignore for now?
}

// Allocate up to `count` blocks of `size` bytes into `ps`, stopping at the first
// failed allocation; returns the number of blocks that were allocated.
static unsigned zone_batch_malloc(malloc_zone_t* zone, size_t size, void** ps, unsigned count) {
  size_t i;
  for (i = 0; i < count; i++) {
    ps[i] = zone_malloc(zone, size);
    if (ps[i] == NULL) break;
  }
  return i;
}

// Free the `count` pointers in `ps` and reset each slot to NULL.
static void zone_batch_free(malloc_zone_t* zone, void** ps, unsigned count) {
  for(size_t i = 0; i < count; i++) {
    zone_free(zone, ps[i]);
    ps[i] = NULL;
  }
}

// Runs a (non-forced) `mi_collect`; always reports 0.
static size_t zone_pressure_relief(malloc_zone_t* zone, size_t size) {
  MI_UNUSED(zone); MI_UNUSED(size);
  mi_collect(false);
  return 0;
}

// Same as `zone_free`; the `size` argument is ignored.
static void zone_free_definite_size(malloc_zone_t* zone, void* p, size_t size) {
  MI_UNUSED(size);
  zone_free(zone,p);
}

// True when `p` lies inside a mimalloc heap region.
static boolean_t zone_claimed_address(malloc_zone_t* zone, void* p) {
  MI_UNUSED(zone);
  return mi_is_in_heap_region(p);
}


/* ------------------------------------------------------
   Introspection members
------------------------------------------------------ */

// Enumeration is not implemented: nothing is recorded and KERN_SUCCESS is returned.
static kern_return_t intro_enumerator(task_t task, void* p,
                            unsigned type_mask, vm_address_t zone_address,
                            memory_reader_t reader,
                            vm_range_recorder_t recorder)
{
  // todo: enumerate all memory
  MI_UNUSED(task); MI_UNUSED(p); MI_UNUSED(type_mask); MI_UNUSED(zone_address);
  MI_UNUSED(reader); MI_UNUSED(recorder);
  return KERN_SUCCESS;
}

static size_t intro_good_size(malloc_zone_t* zone, size_t size) {
  MI_UNUSED(zone);
  return mi_good_size(size);
}

// No consistency check is performed; always reports true.
static boolean_t intro_check(malloc_zone_t* zone) {
  MI_UNUSED(zone);
  return true;
}

// Prints the mimalloc statistics; `verbose` is ignored.
static void intro_print(malloc_zone_t* zone, boolean_t verbose) {
  MI_UNUSED(zone); MI_UNUSED(verbose);
  mi_stats_print(NULL);
}

static void intro_log(malloc_zone_t* zone, void* p) {
  MI_UNUSED(zone); MI_UNUSED(p);
  // todo?
}

static void intro_force_lock(malloc_zone_t* zone) {
  MI_UNUSED(zone);
  // todo?
}

static void intro_force_unlock(malloc_zone_t* zone) {
  MI_UNUSED(zone);
  // todo?
}

// Statistics are not tracked: all reported fields are zeroed.
static void intro_statistics(malloc_zone_t* zone, malloc_statistics_t* stats) {
  MI_UNUSED(zone);
  // todo...
stats->blocks_in_use = 0;
  stats->size_in_use = 0;
  stats->max_size_in_use = 0;
  stats->size_allocated = 0;
}

// The zone is never reported as locked.
static boolean_t intro_zone_locked(malloc_zone_t* zone) {
  MI_UNUSED(zone);
  return false;
}


/* ------------------------------------------------------
  At process start, override the default allocator
------------------------------------------------------ */

#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic ignored "-Wmissing-field-initializers"
#endif

#if defined(__clang__)
#pragma clang diagnostic ignored "-Wc99-extensions"
#endif

// Introspection table of the mimalloc zone; `statistics` and `zone_locked`
// are only set when building against the 10.6+ SDK on non-ppc targets.
static malloc_introspection_t mi_introspect = {
  .enumerator = &intro_enumerator,
  .good_size = &intro_good_size,
  .check = &intro_check,
  .print = &intro_print,
  .log = &intro_log,
  .force_lock = &intro_force_lock,
  .force_unlock = &intro_force_unlock,
#if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6) && !defined(__ppc__)
  .statistics = &intro_statistics,
  .zone_locked = &intro_zone_locked,
#endif
};

// The mimalloc malloc zone. The `version` field is 10, 9 or 4 depending on the
// SDK version macros, and the newer members are only set when available.
static malloc_zone_t mi_malloc_zone = {
  // note: even with designators, the order is important for C++ compilation
  //.reserved1 = NULL,
  //.reserved2 = NULL,
  .size = &zone_size,
  .malloc = &zone_malloc,
  .calloc = &zone_calloc,
  .valloc = &zone_valloc,
  .free = &zone_free,
  .realloc = &zone_realloc,
  .destroy = &zone_destroy,
  .zone_name = "mimalloc",
  .batch_malloc = &zone_batch_malloc,
  .batch_free = &zone_batch_free,
  .introspect = &mi_introspect,
#if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6) && !defined(__ppc__)
  #if defined(MAC_OS_X_VERSION_10_14) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_14)
  .version = 10,
  #else
  .version = 9,
  #endif
  // switch to version 9+ on OSX 10.6 to support memalign.
  .memalign = &zone_memalign,
  .free_definite_size = &zone_free_definite_size,
  #if defined(MAC_OS_X_VERSION_10_7) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7)
  .pressure_relief = &zone_pressure_relief,
  #endif
  #if defined(MAC_OS_X_VERSION_10_14) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_14)
  .claimed_address = &zone_claimed_address,
  #endif
#else
  .version = 4,
#endif
};

#ifdef __cplusplus
}
#endif


#if defined(MI_OSX_INTERPOSE) && defined(MI_SHARED_LIB_EXPORT)

// ------------------------------------------------------
// Override malloc_xxx and malloc_zone_xxx api's to use only
// our mimalloc zone. Since even the loader uses malloc
// on macOS, this ensures that all allocations go through
// mimalloc (as all calls are interposed).
// The main `malloc`, `free`, etc calls are interposed in `alloc-override.c`,
// Here, we also override macOS specific API's like
// `malloc_zone_calloc` etc. see
// NOTE(review): the reference that followed "see" is missing in this copy.
// ------------------------------------------------------

// Return the mimalloc zone, registering it with `malloc_zone_register` on the
// first call (guarded by a plain static flag).
static inline malloc_zone_t* mi_get_default_zone(void)
{
  static bool init;
  if mi_unlikely(!init) {
    init = true;
    malloc_zone_register(&mi_malloc_zone); // by calling register we avoid a zone error on free (see )
  }
  return &mi_malloc_zone;
}

mi_decl_externc int malloc_jumpstart(uintptr_t cookie);
mi_decl_externc void _malloc_fork_prepare(void);
mi_decl_externc void _malloc_fork_parent(void);
mi_decl_externc void _malloc_fork_child(void);


// The replacements below all hand out the single mimalloc zone.
static malloc_zone_t* mi_malloc_create_zone(vm_size_t size, unsigned flags) {
  MI_UNUSED(size); MI_UNUSED(flags);
  return mi_get_default_zone();
}

static malloc_zone_t* mi_malloc_default_zone (void) {
  return mi_get_default_zone();
}

static malloc_zone_t* mi_malloc_default_purgeable_zone(void) {
  return mi_get_default_zone();
}

static void mi_malloc_destroy_zone(malloc_zone_t* zone) {
  MI_UNUSED(zone);
  // nothing.
}

// Reports an empty zone list: `*addresses` is set to NULL and `*count` to 0
// (each only when the corresponding out-pointer is non-NULL).
static kern_return_t mi_malloc_get_all_zones (task_t task, memory_reader_t mr, vm_address_t** addresses, unsigned* count) {
  MI_UNUSED(task); MI_UNUSED(mr);
  if (addresses != NULL) *addresses = NULL;
  if (count != NULL) *count = 0;
  return KERN_SUCCESS;
}

// Name of `zone`, or of the mimalloc zone when `zone` is NULL.
static const char* mi_malloc_get_zone_name(malloc_zone_t* zone) {
  return (zone == NULL ? mi_malloc_zone.zone_name : zone->zone_name);
}

// Renaming a zone is ignored.
static void mi_malloc_set_zone_name(malloc_zone_t* zone, const char* name) {
  MI_UNUSED(zone); MI_UNUSED(name);
}

static int mi_malloc_jumpstart(uintptr_t cookie) {
  MI_UNUSED(cookie);
  return 1; // or 0 for no error?
}

static void mi__malloc_fork_prepare(void) {
  // nothing
}
static void mi__malloc_fork_parent(void) {
  // nothing
}
static void mi__malloc_fork_child(void) {
  // nothing
}

// Output is discarded.
static void mi_malloc_printf(const char* fmt, ...) {
  MI_UNUSED(fmt);
}

static bool zone_check(malloc_zone_t* zone) {
  MI_UNUSED(zone);
  return true;
}

// Every pointer is attributed to the mimalloc zone.
static malloc_zone_t* zone_from_ptr(const void* p) {
  MI_UNUSED(p);
  return mi_get_default_zone();
}

static void zone_log(malloc_zone_t* zone, void* p) {
  MI_UNUSED(zone); MI_UNUSED(p);
}

static void zone_print(malloc_zone_t* zone, bool b) {
  MI_UNUSED(zone); MI_UNUSED(b);
}

static void zone_print_ptr_info(void* p) {
  MI_UNUSED(p);
}

// Registering and unregistering other zones is ignored.
static void zone_register(malloc_zone_t* zone) {
  MI_UNUSED(zone);
}

static void zone_unregister(malloc_zone_t* zone) {
  MI_UNUSED(zone);
}

// use interposing so `DYLD_INSERT_LIBRARIES` works without `DYLD_FORCE_FLAT_NAMESPACE=1`
// See:
// NOTE(review): the reference that followed "See:" is missing in this copy.
struct mi_interpose_s {
  const void* replacement;
  const void* target;
};
// Build one table entry { replacement, target }:
//   MI_INTERPOSE_MI(fun)   replaces `fun` by `mi_fun`
//   MI_INTERPOSE_ZONE(fun) replaces `malloc_fun` by `fun`
#define MI_INTERPOSE_FUN(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun }
#define MI_INTERPOSE_MI(fun)            MI_INTERPOSE_FUN(fun,mi_##fun)
#define MI_INTERPOSE_ZONE(fun)          MI_INTERPOSE_FUN(malloc_##fun,fun)
// Interpose table, placed in the `__DATA, __interpose` section.
__attribute__((used)) static const struct mi_interpose_s _mi_zone_interposes[] __attribute__((section("__DATA, __interpose"))) =
{
  MI_INTERPOSE_MI(malloc_create_zone),
  MI_INTERPOSE_MI(malloc_default_purgeable_zone),
  MI_INTERPOSE_MI(malloc_default_zone),
  MI_INTERPOSE_MI(malloc_destroy_zone),
  MI_INTERPOSE_MI(malloc_get_all_zones),
  MI_INTERPOSE_MI(malloc_get_zone_name),
  MI_INTERPOSE_MI(malloc_jumpstart),
  MI_INTERPOSE_MI(malloc_printf),
  MI_INTERPOSE_MI(malloc_set_zone_name),
  MI_INTERPOSE_MI(_malloc_fork_child),
  MI_INTERPOSE_MI(_malloc_fork_parent),
  MI_INTERPOSE_MI(_malloc_fork_prepare),

  MI_INTERPOSE_ZONE(zone_batch_free),
  MI_INTERPOSE_ZONE(zone_batch_malloc),
  MI_INTERPOSE_ZONE(zone_calloc),
  MI_INTERPOSE_ZONE(zone_check),
  MI_INTERPOSE_ZONE(zone_free),
  MI_INTERPOSE_ZONE(zone_from_ptr),
  MI_INTERPOSE_ZONE(zone_log),
  MI_INTERPOSE_ZONE(zone_malloc),
  MI_INTERPOSE_ZONE(zone_memalign),
  MI_INTERPOSE_ZONE(zone_print),
  MI_INTERPOSE_ZONE(zone_print_ptr_info),
  MI_INTERPOSE_ZONE(zone_realloc),
  MI_INTERPOSE_ZONE(zone_register),
  MI_INTERPOSE_ZONE(zone_unregister),
  MI_INTERPOSE_ZONE(zone_valloc)
};


#else

// ------------------------------------------------------
// hook into the zone api's without interposing
// This is the official way of adding an allocator but
// it seems less robust than using interpose.
// ------------------------------------------------------

// Return the first zone reported by `malloc_get_all_zones`, falling back to
// `malloc_default_zone()` when that call fails or reports no zones.
static inline malloc_zone_t* mi_get_default_zone(void)
{
  // The first returned zone is the real default
  malloc_zone_t** zones = NULL;
  unsigned count = 0;
  kern_return_t ret = malloc_get_all_zones(0, NULL, (vm_address_t**)&zones, &count);
  if (ret == KERN_SUCCESS && count > 0) {
    return zones[0];
  }
  else {
    // fallback
    return malloc_default_zone();
  }
}

// Constructor that registers the mimalloc zone and then unregisters and
// re-registers the previous default zone (and the purgeable zone, if any).
#if defined(__clang__)
__attribute__((constructor(101))) // highest priority
#else
__attribute__((constructor))      // priority level is not supported by gcc
#endif
__attribute__((used))
static void _mi_macos_override_malloc(void) {
  malloc_zone_t* purgeable_zone = NULL;

  #if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6)
  // force the purgeable zone to exist to avoid strange bugs
  if (malloc_default_purgeable_zone) {
    purgeable_zone = malloc_default_purgeable_zone();
  }
  #endif

  // Register our zone.
  // thomcc: I think this is still needed to put us in the zone list.
  malloc_zone_register(&mi_malloc_zone);
  // Unregister the default zone, this makes our zone the new default
  // as that was the last registered.
  malloc_zone_t *default_zone = mi_get_default_zone();
  // thomcc: Unsure if the next test is *always* false or just false in the
  // cases I've tried. I'm also unsure if the code inside is needed at all.
  if (default_zone != &mi_malloc_zone) {
    malloc_zone_unregister(default_zone);

    // Reregister the default zone so free and realloc in that zone keep working.
    malloc_zone_register(default_zone);
  }

  // Unregister, and re-register the purgeable_zone to avoid bugs if it occurs
  // earlier than the default zone.
  if (purgeable_zone != NULL) {
    malloc_zone_unregister(purgeable_zone);
    malloc_zone_register(purgeable_zone);
  }

}
#endif  // MI_OSX_INTERPOSE

#endif // MI_MALLOC_OVERRIDE

================================================
FILE: third-party/mimalloc/src/prim/osx/prim.c
================================================
/* ----------------------------------------------------------------------------
Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/

// We use the unix/prim.c with the mmap API on macOSX
#include "../unix/prim.c"

================================================
FILE: third-party/mimalloc/src/prim/prim.c
================================================
/* ----------------------------------------------------------------------------
Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/

// Select the implementation of the primitives
// depending on the OS.

#if defined(_WIN32)
#include "windows/prim.c"  // VirtualAlloc (Windows)

#elif defined(__APPLE__)
#include "osx/prim.c"      // macOSX (actually defers to mmap in unix/prim.c)

#elif defined(__wasi__)
#define MI_USE_SBRK
#include "wasi/prim.c"     // memory-grow or sbrk (Wasm)

#elif defined(__EMSCRIPTEN__)
#include "emscripten/prim.c" // emmalloc_*, + pthread support

#else
#include "unix/prim.c"     // mmap() (Linux, macOSX, BSD, Illumos, Haiku, DragonFly, etc.)
#endif

// Generic process initialization
// (skipped when the selected primitive file defines MI_PRIM_HAS_PROCESS_ATTACH)
#ifndef MI_PRIM_HAS_PROCESS_ATTACH
  #if defined(__GNUC__) || defined(__clang__)
    // gcc,clang: use the constructor/destructor attribute
    // which for both seem to run before regular constructors/destructors
    #if defined(__clang__)
      #define mi_attr_constructor __attribute__((constructor(101)))
      #define mi_attr_destructor  __attribute__((destructor(101)))
    #else
      #define mi_attr_constructor __attribute__((constructor))
      #define mi_attr_destructor  __attribute__((destructor))
    #endif
    // Calls `_mi_process_load` from a constructor function.
    static void mi_attr_constructor mi_process_attach(void) {
      _mi_process_load();
    }
    // Calls `_mi_process_done` from a destructor function.
    static void mi_attr_destructor mi_process_detach(void) {
      _mi_process_done();
    }
  #elif defined(__cplusplus)
    // C++: use static initialization to detect process start/end
    // This is not guaranteed to be first/last but the best we can generally do?
    struct mi_init_done_t {
      mi_init_done_t() {
        _mi_process_load();
      }
      ~mi_init_done_t() {
        _mi_process_done();
      }
    };
    static mi_init_done_t mi_init_done;
  #else
    #pragma message("define a way to call _mi_process_load/done on your platform")
  #endif
#endif

// Generic allocator init/done callback
// (skipped when the selected primitive file defines MI_PRIM_HAS_ALLOCATOR_INIT)
#ifndef MI_PRIM_HAS_ALLOCATOR_INIT
// The generic implementation never reports redirection.
bool _mi_is_redirected(void) {
  return false;
}
// Always succeeds; clears `*message` when a message pointer is supplied.
bool _mi_allocator_init(const char** message) {
  if (message != NULL) { *message = NULL; }
  return true;
}
void _mi_allocator_done(void) {
  // nothing to do
}
#endif

================================================
FILE: third-party/mimalloc/src/prim/readme.md
================================================
## Portability Primitives

This is the portability layer where all primitives needed from the OS are defined.

- `include/mimalloc/prim.h`: primitive portability API definition.
- `prim.c`: Selects one of `unix/prim.c`, `wasi/prim.c`, or `windows/prim.c` depending on the host platform
  (and on macOS, `osx/prim.c` defers to `unix/prim.c`).

Note: still work in progress, there may still be places in the sources that still depend on OS ifdef's.
================================================ FILE: third-party/mimalloc/src/prim/unix/prim.c ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2023, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ // This file is included in `src/prim/prim.c` #ifndef _DEFAULT_SOURCE #define _DEFAULT_SOURCE // ensure mmap flags and syscall are defined #endif #if defined(__sun) // illumos provides new mman.h api when any of these are defined // otherwise the old api based on caddr_t which predates the void pointers one. // stock solaris provides only the former, chose to atomically to discard those // flags only here rather than project wide tough. #undef _XOPEN_SOURCE #undef _POSIX_C_SOURCE #endif #include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/prim.h" #include // mmap #include // sysconf #include // open, close, read, access #include // getenv, arc4random_buf #if defined(__linux__) #include //#if defined(MI_NO_THP) #include // THP disable //#endif #if defined(__GLIBC__) #include // linux mmap flags #else #include #endif #elif defined(__APPLE__) #include #include #if !defined(TARGET_OS_OSX) || TARGET_OS_OSX // see issue #879, used to be (!TARGET_IOS_IPHONE && !TARGET_IOS_SIMULATOR) #include // VM_MAKE_TAG, VM_FLAGS_SUPERPAGE_SIZE_2MB, etc. 
#endif
  #if !defined(MAC_OS_X_VERSION_10_7)
  #define MAC_OS_X_VERSION_10_7   1070
  #endif
#elif defined(__FreeBSD__) || defined(__DragonFly__)
  // NOTE(review): the header names of the bare includes in this section are
  // missing in this copy of the file -- restore them from the upstream sources.
  #include
  #if __FreeBSD_version >= 1200000
  #include
  #include
  #endif
  #include
#endif

#if (defined(__linux__) && !defined(__ANDROID__)) || defined(__FreeBSD__)
  #define MI_HAS_SYSCALL_H
  #include
#endif

// QNX: map the MADV_ names onto their POSIX_MADV_ counterparts when only the latter exist.
#if !defined(MADV_DONTNEED) && defined(POSIX_MADV_DONTNEED)  // QNX
#define MADV_DONTNEED  POSIX_MADV_DONTNEED
#endif
#if !defined(MADV_FREE) && defined(POSIX_MADV_FREE)  // QNX
#define MADV_FREE  POSIX_MADV_FREE
#endif

//------------------------------------------------------------------------------------
// Use syscalls for some primitives to allow for libraries that override open/read/close etc.
// and do allocation themselves; using syscalls prevents recursion when mimalloc is
// still initializing (issue #713)
// Declare inline to avoid unused function warnings.
//------------------------------------------------------------------------------------

#if defined(MI_HAS_SYSCALL_H) && defined(SYS_open) && defined(SYS_close) && defined(SYS_read) && defined(SYS_access)

// Raw-syscall variants (used when all four syscall numbers are available).
static inline int mi_prim_open(const char* fpath, int open_flags) {
  return syscall(SYS_open,fpath,open_flags,0);
}
static inline ssize_t mi_prim_read(int fd, void* buf, size_t bufsize) {
  return syscall(SYS_read,fd,buf,bufsize);
}
static inline int mi_prim_close(int fd) {
  return syscall(SYS_close,fd);
}
static inline int mi_prim_access(const char *fpath, int mode) {
  return syscall(SYS_access,fpath,mode);
}

#else

// Fallback variants that call the regular libc functions.
static inline int mi_prim_open(const char* fpath, int open_flags) {
  return open(fpath,open_flags);
}
static inline ssize_t mi_prim_read(int fd, void* buf, size_t bufsize) {
  return read(fd,buf,bufsize);
}
static inline int mi_prim_close(int fd) {
  return close(fd);
}
static inline int mi_prim_access(const char *fpath, int mode) {
  return access(fpath,mode);
}

#endif



//---------------------------------------------
// init
//---------------------------------------------
static bool unix_detect_overcommit(void) { bool os_overcommit = true; #if defined(__linux__) int fd = mi_prim_open("/proc/sys/vm/overcommit_memory", O_RDONLY); if (fd >= 0) { char buf[32]; ssize_t nread = mi_prim_read(fd, &buf, sizeof(buf)); mi_prim_close(fd); // // 0: heuristic overcommit, 1: always overcommit, 2: never overcommit (ignore NORESERVE) if (nread >= 1) { os_overcommit = (buf[0] == '0' || buf[0] == '1'); } } #elif defined(__FreeBSD__) int val = 0; size_t olen = sizeof(val); if (sysctlbyname("vm.overcommit", &val, &olen, NULL, 0) == 0) { os_overcommit = (val != 0); } #else // default: overcommit is true #endif return os_overcommit; } void _mi_prim_mem_init( mi_os_mem_config_t* config ) { long psize = sysconf(_SC_PAGESIZE); if (psize > 0) { config->page_size = (size_t)psize; config->alloc_granularity = (size_t)psize; #if defined(_SC_PHYS_PAGES) long pphys = sysconf(_SC_PHYS_PAGES); const size_t psize_in_kib = (size_t)psize / MI_KiB; if (psize_in_kib > 0 && pphys > 0 && (size_t)pphys <= (SIZE_MAX/psize_in_kib)) { config->physical_memory_in_kib = (size_t)pphys * psize_in_kib; } #endif } config->large_page_size = 2*MI_MiB; // TODO: can we query the OS for this? config->has_overcommit = unix_detect_overcommit(); config->has_partial_free = true; // mmap can free in parts config->has_virtual_reserve = true; // todo: check if this true for NetBSD? (for anonymous mmap with PROT_NONE) // disable transparent huge pages for this process? #if (defined(__linux__) || defined(__ANDROID__)) && defined(PR_GET_THP_DISABLE) #if defined(MI_NO_THP) if (true) #else if (!mi_option_is_enabled(mi_option_allow_large_os_pages)) // disable THP also if large OS pages are not allowed in the options #endif { int val = 0; if (prctl(PR_GET_THP_DISABLE, &val, 0, 0, 0) != 0) { // Most likely since distros often come with always/madvise settings. 
val = 1; // Disabling only for mimalloc process rather than touching system wide settings (void)prctl(PR_SET_THP_DISABLE, &val, 0, 0, 0); } } #endif } //--------------------------------------------- // free //--------------------------------------------- int _mi_prim_free(void* addr, size_t size ) { bool err = (munmap(addr, size) == -1); return (err ? errno : 0); } //--------------------------------------------- // mmap //--------------------------------------------- static int unix_madvise(void* addr, size_t size, int advice) { #if defined(__sun) int res = madvise((caddr_t)addr, size, advice); // Solaris needs cast (issue #520) #elif defined(__QNX__) int res = posix_madvise(addr, size, advice); #else int res = madvise(addr, size, advice); #endif return (res==0 ? 0 : errno); } static void* unix_mmap_prim(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) { MI_UNUSED(try_alignment); void* p = NULL; #if defined(MAP_ALIGNED) // BSD if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) { size_t n = mi_bsr(try_alignment); if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) { // alignment is a power of 2 and 4096 <= alignment <= 1GiB p = mmap(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd, 0); if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) { int err = errno; _mi_trace_message("unable to directly request aligned OS memory (error: %d (0x%x), size: 0x%zx bytes, alignment: 0x%zx, hint address: %p)\n", err, err, size, try_alignment, addr); } if (p!=MAP_FAILED) return p; // fall back to regular mmap } } #elif defined(MAP_ALIGN) // Solaris if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) { p = mmap((void*)try_alignment, size, protect_flags, flags | MAP_ALIGN, fd, 0); // addr parameter is the required alignment if (p!=MAP_FAILED) return p; // fall back to regular mmap } #endif #if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED) // on 64-bit systems, use the 
// virtual address area after 2TiB for 4MiB aligned allocations
  if (addr == NULL) {
    void* hint = _mi_os_get_aligned_hint(try_alignment, size);
    if (hint != NULL) {
      p = mmap(hint, size, protect_flags, flags, fd, 0);
      if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) {
        #if MI_TRACK_ENABLED  // asan sometimes does not instrument errno correctly?
        int err = 0;
        #else
        int err = errno;
        #endif
        _mi_trace_message("unable to directly request hinted aligned OS memory (error: %d (0x%x), size: 0x%zx bytes, alignment: 0x%zx, hint address: %p)\n", err, err, size, try_alignment, hint);
      }
      if (p!=MAP_FAILED) return p;
      // fall back to regular mmap
    }
  }
  #endif
  // regular mmap
  p = mmap(addr, size, protect_flags, flags, fd, 0);
  if (p!=MAP_FAILED) return p;
  // failed to allocate
  return NULL;
}

// Value passed as the `fd` argument of `mmap`: a VM tag built from the
// `mi_option_os_tag` option when VM_MAKE_TAG is defined (values outside
// 100..255 are replaced by 254), and -1 otherwise.
static int unix_mmap_fd(void) {
  #if defined(VM_MAKE_TAG)
  // macOS: tracking anonymous page with a specific ID. (All up to 98 are taken officially but LLVM sanitizers had taken 99)
  int os_tag = (int)mi_option_get(mi_option_os_tag);
  if (os_tag < 100 || os_tag > 255) { os_tag = 254; }
  return VM_MAKE_TAG(os_tag);
  #else
  return -1;
  #endif
}

// Map `size` bytes of private anonymous memory.
// When `allow_large` is set (and either `large_only` is set or the size/alignment
// and options qualify) a large/huge page mapping is tried first; `*is_large`
// records which kind of mapping was made. With `large_only` the result of the
// large page attempt is returned as-is. Returns NULL on failure.
static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protect_flags, bool large_only, bool allow_large, bool* is_large) {
  #if !defined(MAP_ANONYMOUS)
  #define MAP_ANONYMOUS  MAP_ANON
  #endif
  #if !defined(MAP_NORESERVE)
  #define MAP_NORESERVE  0
  #endif
  void* p = NULL;
  const int fd = unix_mmap_fd();
  int flags = MAP_PRIVATE | MAP_ANONYMOUS;
  if (_mi_os_has_overcommit()) {
    flags |= MAP_NORESERVE;
  }
  #if defined(PROT_MAX)
  protect_flags |= PROT_MAX(PROT_READ | PROT_WRITE); // BSD
  #endif
  // huge page allocation
  if (allow_large && (large_only || (_mi_os_use_large_page(size, try_alignment) && mi_option_get(mi_option_allow_large_os_pages) == 1))) {
    static _Atomic(size_t) large_page_try_ok; // = 0;
    size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
    if (!large_only && try_ok > 0) {
      // If the OS is not configured for large OS pages, or the user does not have
      // enough permission, the `mmap` will always fail (but it might also fail for other reasons).
      // Therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times
      // to avoid too many failing calls to mmap.
      mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1);
    }
    else {
      int lflags = flags & ~MAP_NORESERVE;  // using NORESERVE on huge pages seems to fail on Linux
      int lfd = fd;
      #ifdef MAP_ALIGNED_SUPER
      lflags |= MAP_ALIGNED_SUPER;
      #endif
      #ifdef MAP_HUGETLB
      lflags |= MAP_HUGETLB;
      #endif
      #ifdef MAP_HUGE_1GB
      static bool mi_huge_pages_available = true;
      if (large_only && (size % MI_GiB) == 0 && mi_huge_pages_available) {
        lflags |= MAP_HUGE_1GB;
      }
      else
      #endif
      {
        #ifdef MAP_HUGE_2MB
        lflags |= MAP_HUGE_2MB;
        #endif
      }
      #ifdef VM_FLAGS_SUPERPAGE_SIZE_2MB
      lfd |= VM_FLAGS_SUPERPAGE_SIZE_2MB;
      #endif
      if (large_only || lflags != flags) {
        // try large OS page allocation
        *is_large = true;
        p = unix_mmap_prim(addr, size, try_alignment, protect_flags, lflags, lfd);
        #ifdef MAP_HUGE_1GB
        if (p == NULL && (lflags & MAP_HUGE_1GB) == MAP_HUGE_1GB) {
          mi_huge_pages_available = false; // don't try huge 1GiB pages again
          if (large_only) {
            _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (errno: %i)\n", errno);
          }
          lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB);
          p = unix_mmap_prim(addr, size, try_alignment, protect_flags, lflags, lfd);
        }
        #endif
        if (large_only) return p;
        if (p == NULL) {
          mi_atomic_store_release(&large_page_try_ok, (size_t)8);  // on error, don't try again for the next N allocations
        }
      }
    }
  }
  // regular allocation
  if (p == NULL) {
    *is_large = false;
    p = unix_mmap_prim(addr, size, try_alignment, protect_flags, flags, fd);
    if (p != NULL) {
      #if defined(MADV_HUGEPAGE)
      // Many Linux systems don't allow MAP_HUGETLB but they support instead
      // transparent huge pages (THP). Generally, it is not required to call `madvise` with MADV_HUGEPAGE
      // though since properly aligned allocations will already use large pages if available
      // in that case -- in particular for our large regions (in `memory.c`).
      // However, some systems only allow THP if called with explicit `madvise`, so
      // when large OS pages are enabled for mimalloc, we call `madvise` anyways.
      if (allow_large && _mi_os_use_large_page(size, try_alignment)) {
        if (unix_madvise(p, size, MADV_HUGEPAGE) == 0) {
          // *is_large = true; // possibly
        };
      }
      #elif defined(__sun)
      if (allow_large && _mi_os_use_large_page(size, try_alignment)) {
        struct memcntl_mha cmd = {0};
        cmd.mha_pagesize = _mi_os_large_page_size();
        cmd.mha_cmd = MHA_MAPSIZE_VA;
        if (memcntl((caddr_t)p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) {
          // *is_large = true; // possibly
        }
      }
      #endif
    }
  }
  return p;
}

// Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned.
// Allocate `size` bytes through `unix_mmap`, mapped read/write when `commit` is set
// and PROT_NONE otherwise. `*is_zero` is always set to true. `*addr` receives the
// mapping; returns 0 on success or `errno` when `*addr` is NULL.
int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) {
  mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
  mi_assert_internal(commit || !allow_large);
  mi_assert_internal(try_alignment > 0);

  *is_zero = true;
  int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE);
  *addr = unix_mmap(hint_addr, size, try_alignment, protect_flags, false, allow_large, is_large);
  return (*addr != NULL ? 0 : errno);
}


//---------------------------------------------
// Commit/Reset
//---------------------------------------------

// On Linux with MI_SECURE>=2, print a hint about `vm.max_map_count` when `err`
// is ENOMEM; does nothing in other configurations.
static void unix_mprotect_hint(int err) {
  #if defined(__linux__) && (MI_SECURE>=2) // guard page around every mimalloc page
  if (err == ENOMEM) {
    _mi_warning_message("The next warning may be caused by a low memory map limit.\n"
                        " On Linux this is controlled by the vm.max_map_count -- maybe increase it?\n"
                        " For example: sudo sysctl -w vm.max_map_count=262144\n");
  }
  #else
  MI_UNUSED(err);
  #endif
}

// Make the range readable and writable with `mprotect`.
// `*is_zero` is always set to false. Returns 0 on success or `errno`.
int _mi_prim_commit(void* start, size_t size, bool* is_zero) {
  // commit: ensure we can access the area
  // note: we may think that *is_zero can be true since the memory
  // was either from mmap PROT_NONE, or from decommit MADV_DONTNEED, but
  // we sometimes call commit on a range with still partially committed
  // memory and `mprotect` does not zero the range.
  *is_zero = false;
  int err = mprotect(start, size, (PROT_READ | PROT_WRITE));
  if (err != 0) {
    err = errno;
    unix_mprotect_hint(err);
  }
  return err;
}

// Decommit the range with `madvise(MADV_DONTNEED)` and return its result.
// In debug or secure builds the range is additionally protected with PROT_NONE
// (the `mprotect` result is not checked) and `*needs_recommit` is set to true;
// otherwise `*needs_recommit` is false.
int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) {
  int err = 0;
  // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE)
  err = unix_madvise(start, size, MADV_DONTNEED);
  #if !MI_DEBUG && !MI_SECURE
    *needs_recommit = false;
  #else
    *needs_recommit = true;
    mprotect(start, size, PROT_NONE);
  #endif
  /*
  // decommit: use mmap with MAP_FIXED and PROT_NONE to discard the existing memory (and reduce rss)
  *needs_recommit = true;
  const int fd = unix_mmap_fd();
  void* p = mmap(start, size, PROT_NONE, (MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE), fd, 0);
  if (p != start) { err = errno; }
  */
  return err;
}

int _mi_prim_reset(void* start, size_t size) {
  // We try to use `MADV_FREE` as that is the fastest. A drawback though is that it
  // will not reduce the `rss` stats in tools like `top` even though the memory is available
  // to other processes.
With the default `MIMALLOC_PURGE_DECOMMITS=1` we ensure that by // default `MADV_DONTNEED` is used though. #if defined(MADV_FREE) static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE); int oadvice = (int)mi_atomic_load_relaxed(&advice); int err; while ((err = unix_madvise(start, size, oadvice)) != 0 && errno == EAGAIN) { errno = 0; }; if (err != 0 && errno == EINVAL && oadvice == MADV_FREE) { // if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on mi_atomic_store_release(&advice, (size_t)MADV_DONTNEED); err = unix_madvise(start, size, MADV_DONTNEED); } #else int err = unix_madvise(start, size, MADV_DONTNEED); #endif return err; } int _mi_prim_protect(void* start, size_t size, bool protect) { int err = mprotect(start, size, protect ? PROT_NONE : (PROT_READ | PROT_WRITE)); if (err != 0) { err = errno; } unix_mprotect_hint(err); return err; } //--------------------------------------------- // Huge page allocation //--------------------------------------------- #if (MI_INTPTR_SIZE >= 8) && !defined(__HAIKU__) && !defined(__CYGWIN__) #ifndef MPOL_PREFERRED #define MPOL_PREFERRED 1 #endif #if defined(MI_HAS_SYSCALL_H) && defined(SYS_mbind) static long mi_prim_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) { return syscall(SYS_mbind, start, len, mode, nmask, maxnode, flags); } #else static long mi_prim_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) { MI_UNUSED(start); MI_UNUSED(len); MI_UNUSED(mode); MI_UNUSED(nmask); MI_UNUSED(maxnode); MI_UNUSED(flags); return 0; } #endif int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) { bool is_large = true; *is_zero = true; *addr = unix_mmap(hint_addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large); if (*addr != NULL && numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at 
// Linux: return the NUMA node reported by the `getcpu` system call.
// Yields 0 when the system call is not available at compile time or when it fails.
size_t _mi_prim_numa_node(void) {
  #if !defined(MI_HAS_SYSCALL_H) || !defined(SYS_getcpu)
  return 0;
  #else
  unsigned long cpu_index = 0;
  unsigned long node_index = 0;
  const long rc = syscall(SYS_getcpu, &cpu_index, &node_index, NULL);
  return (rc == 0 ? node_index : 0);
  #endif
}
(but ensure there is no allocation) _mi_snprintf(buf, 127, "/sys/devices/system/node/node%u", node + 1); if (mi_prim_access(buf,R_OK) != 0) break; } return (node+1); } #elif defined(__FreeBSD__) && __FreeBSD_version >= 1200000 size_t _mi_prim_numa_node(void) { domainset_t dom; size_t node; int policy; if (cpuset_getdomain(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, sizeof(dom), &dom, &policy) == -1) return 0ul; for (node = 0; node < MAXMEMDOM; node++) { if (DOMAINSET_ISSET(node, &dom)) return node; } return 0ul; } size_t _mi_prim_numa_node_count(void) { size_t ndomains = 0; size_t len = sizeof(ndomains); if (sysctlbyname("vm.ndomains", &ndomains, &len, NULL, 0) == -1) return 0ul; return ndomains; } #elif defined(__DragonFly__) size_t _mi_prim_numa_node(void) { // TODO: DragonFly does not seem to provide any userland means to get this information. return 0ul; } size_t _mi_prim_numa_node_count(void) { size_t ncpus = 0, nvirtcoresperphys = 0; size_t len = sizeof(size_t); if (sysctlbyname("hw.ncpu", &ncpus, &len, NULL, 0) == -1) return 0ul; if (sysctlbyname("hw.cpu_topology_ht_ids", &nvirtcoresperphys, &len, NULL, 0) == -1) return 0ul; return nvirtcoresperphys * ncpus; } #else size_t _mi_prim_numa_node(void) { return 0; } size_t _mi_prim_numa_node_count(void) { return 1; } #endif // ---------------------------------------------------------------- // Clock // ---------------------------------------------------------------- #include #if defined(CLOCK_REALTIME) || defined(CLOCK_MONOTONIC) mi_msecs_t _mi_prim_clock_now(void) { struct timespec t; #ifdef CLOCK_MONOTONIC clock_gettime(CLOCK_MONOTONIC, &t); #else clock_gettime(CLOCK_REALTIME, &t); #endif return ((mi_msecs_t)t.tv_sec * 1000) + ((mi_msecs_t)t.tv_nsec / 1000000); } #else // low resolution timer mi_msecs_t _mi_prim_clock_now(void) { #if !defined(CLOCKS_PER_SEC) || (CLOCKS_PER_SEC == 1000) || (CLOCKS_PER_SEC == 0) return (mi_msecs_t)clock(); #elif (CLOCKS_PER_SEC < 1000) return (mi_msecs_t)clock() * (1000 / 
(mi_msecs_t)CLOCKS_PER_SEC); #else return (mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000); #endif } #endif //---------------------------------------------------------------- // Process info //---------------------------------------------------------------- #if defined(__unix__) || defined(__unix) || defined(unix) || defined(__APPLE__) || defined(__HAIKU__) #include #include #include #if defined(__APPLE__) #include #endif #if defined(__HAIKU__) #include #endif static mi_msecs_t timeval_secs(const struct timeval* tv) { return ((mi_msecs_t)tv->tv_sec * 1000L) + ((mi_msecs_t)tv->tv_usec / 1000L); } void _mi_prim_process_info(mi_process_info_t* pinfo) { struct rusage rusage; getrusage(RUSAGE_SELF, &rusage); pinfo->utime = timeval_secs(&rusage.ru_utime); pinfo->stime = timeval_secs(&rusage.ru_stime); #if !defined(__HAIKU__) pinfo->page_faults = rusage.ru_majflt; #endif #if defined(__HAIKU__) // Haiku does not have (yet?) a way to // get these stats per process thread_info tid; area_info mem; ssize_t c; get_thread_info(find_thread(0), &tid); while (get_next_area_info(tid.team, &c, &mem) == B_OK) { pinfo->peak_rss += mem.ram_size; } pinfo->page_faults = 0; #elif defined(__APPLE__) pinfo->peak_rss = rusage.ru_maxrss; // macos reports in bytes #ifdef MACH_TASK_BASIC_INFO struct mach_task_basic_info info; mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT; if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&info, &infoCount) == KERN_SUCCESS) { pinfo->current_rss = (size_t)info.resident_size; } #else struct task_basic_info info; mach_msg_type_number_t infoCount = TASK_BASIC_INFO_COUNT; if (task_info(mach_task_self(), TASK_BASIC_INFO, (task_info_t)&info, &infoCount) == KERN_SUCCESS) { pinfo->current_rss = (size_t)info.resident_size; } #endif #else pinfo->peak_rss = rusage.ru_maxrss * 1024; // Linux/BSD report in KiB #endif // use defaults for commit } #else #ifndef __wasi__ // WebAssembly instances are not processes #pragma message("define a 
// Look up `name` directly in the process environment block (no C runtime needed).
// The name comparison is case insensitive and at most 10000 entries are inspected.
// On a match the text after the '=' is copied into `result` with `_mi_strlcpy`
// (bounded by `result_size`) and `true` is returned; otherwise `false`.
bool _mi_prim_getenv(const char* name, char* result, size_t result_size) {
  const size_t name_len = (name == NULL ? 0 : _mi_strlen(name));
  if (name_len == 0) return false;
  char** entry = mi_get_environ();
  if (entry == NULL) return false;
  for (int remaining = 10000; remaining > 0 && *entry != NULL; remaining--, entry++) {
    const char* kv = *entry;
    // an entry matches when it starts with `name` (ignoring case) directly followed by '='
    const bool matches = (_mi_strnicmp(name, kv, name_len) == 0 && kv[name_len] == '=');
    if (!matches) continue;
    _mi_strlcpy(result, kv + name_len + 1, result_size);
    return true;
  }
  return false;
}
char buf[64+1]; size_t len = _mi_strnlen(name,sizeof(buf)-1); for (size_t i = 0; i < len; i++) { buf[i] = _mi_toupper(name[i]); } buf[len] = 0; s = getenv(buf); } if (s == NULL || _mi_strnlen(s,result_size) >= result_size) return false; _mi_strlcpy(result, s, result_size); return true; } #endif // !MI_USE_ENVIRON //---------------------------------------------------------------- // Random //---------------------------------------------------------------- #if defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_15) && (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_15) #include #include bool _mi_prim_random_buf(void* buf, size_t buf_len) { // We prefer CCRandomGenerateBytes as it returns an error code while arc4random_buf // may fail silently on macOS. See PR #390, and return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess); } #elif defined(__ANDROID__) || defined(__DragonFly__) || \ defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ defined(__sun) || \ (defined(__APPLE__) && (MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_7)) bool _mi_prim_random_buf(void* buf, size_t buf_len) { arc4random_buf(buf, buf_len); return true; } #elif defined(__APPLE__) || defined(__linux__) || defined(__HAIKU__) // also for old apple versions < 10.7 (issue #829) #include #include #include bool _mi_prim_random_buf(void* buf, size_t buf_len) { // Modern Linux provides `getrandom` but different distributions either use `sys/random.h` or `linux/random.h` // and for the latter the actual `getrandom` call is not always defined. // (see ) // We therefore use a syscall directly and fall back dynamically to /dev/urandom when needed. 
#if defined(MI_HAS_SYSCALL_H) && defined(SYS_getrandom) #ifndef GRND_NONBLOCK #define GRND_NONBLOCK (1) #endif static _Atomic(uintptr_t) no_getrandom; // = 0 if (mi_atomic_load_acquire(&no_getrandom)==0) { ssize_t ret = syscall(SYS_getrandom, buf, buf_len, GRND_NONBLOCK); if (ret >= 0) return (buf_len == (size_t)ret); if (errno != ENOSYS) return false; mi_atomic_store_release(&no_getrandom, (uintptr_t)1); // don't call again, and fall back to /dev/urandom } #endif int flags = O_RDONLY; #if defined(O_CLOEXEC) flags |= O_CLOEXEC; #endif int fd = mi_prim_open("/dev/urandom", flags); if (fd < 0) return false; size_t count = 0; while(count < buf_len) { ssize_t ret = mi_prim_read(fd, (char*)buf + count, buf_len - count); if (ret<=0) { if (errno!=EAGAIN && errno!=EINTR) break; } else { count += ret; } } mi_prim_close(fd); return (count==buf_len); } #else bool _mi_prim_random_buf(void* buf, size_t buf_len) { return false; } #endif //---------------------------------------------------------------- // Thread init/done //---------------------------------------------------------------- #if defined(MI_USE_PTHREADS) // use pthread local storage keys to detect thread ending // (and used with MI_TLS_PTHREADS for the default heap) pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1); static void mi_pthread_done(void* value) { if (value!=NULL) { _mi_thread_done((mi_heap_t*)value); } } void _mi_prim_thread_init_auto_done(void) { mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1)); pthread_key_create(&_mi_heap_default_key, &mi_pthread_done); } void _mi_prim_thread_done_auto_done(void) { if (_mi_heap_default_key != (pthread_key_t)(-1)) { // do not leak the key, see issue #809 pthread_key_delete(_mi_heap_default_key); } } void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { if (_mi_heap_default_key != (pthread_key_t)(-1)) { // can happen during recursive invocation on freeBSD pthread_setspecific(_mi_heap_default_key, heap); } } #else void 
_mi_prim_thread_init_auto_done(void) { // nothing } void _mi_prim_thread_done_auto_done(void) { // nothing } void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { MI_UNUSED(heap); } #endif ================================================ FILE: third-party/mimalloc/src/prim/wasi/prim.c ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2023, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ // This file is included in `src/prim/prim.c` #include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/prim.h" #include // fputs #include // getenv //--------------------------------------------- // Initialize //--------------------------------------------- void _mi_prim_mem_init( mi_os_mem_config_t* config ) { config->page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB config->alloc_granularity = 16; config->has_overcommit = false; config->has_partial_free = false; config->has_virtual_reserve = false; } //--------------------------------------------- // Free //--------------------------------------------- int _mi_prim_free(void* addr, size_t size ) { MI_UNUSED(addr); MI_UNUSED(size); // wasi heap cannot be shrunk return 0; } //--------------------------------------------- // Allocation: sbrk or memory_grow //--------------------------------------------- #if defined(MI_USE_SBRK) #include // for sbrk static void* mi_memory_grow( size_t size ) { void* p = sbrk(size); if (p == (void*)(-1)) return NULL; #if !defined(__wasi__) // on wasi this is always zero initialized already (?) 
memset(p,0,size); #endif return p; } #elif defined(__wasi__) static void* mi_memory_grow( size_t size ) { size_t base = (size > 0 ? __builtin_wasm_memory_grow(0,_mi_divide_up(size, _mi_os_page_size())) : __builtin_wasm_memory_size(0)); if (base == SIZE_MAX) return NULL; return (void*)(base * _mi_os_page_size()); } #endif #if defined(MI_USE_PTHREADS) static pthread_mutex_t mi_heap_grow_mutex = PTHREAD_MUTEX_INITIALIZER; #endif static void* mi_prim_mem_grow(size_t size, size_t try_alignment) { void* p = NULL; if (try_alignment <= 1) { // `sbrk` is not thread safe in general so try to protect it (we could skip this on WASM but leave it in for now) #if defined(MI_USE_PTHREADS) pthread_mutex_lock(&mi_heap_grow_mutex); #endif p = mi_memory_grow(size); #if defined(MI_USE_PTHREADS) pthread_mutex_unlock(&mi_heap_grow_mutex); #endif } else { void* base = NULL; size_t alloc_size = 0; // to allocate aligned use a lock to try to avoid thread interaction // between getting the current size and actual allocation // (also, `sbrk` is not thread safe in general) #if defined(MI_USE_PTHREADS) pthread_mutex_lock(&mi_heap_grow_mutex); #endif { void* current = mi_memory_grow(0); // get current size if (current != NULL) { void* aligned_current = mi_align_up_ptr(current, try_alignment); // and align from there to minimize wasted space alloc_size = _mi_align_up( ((uint8_t*)aligned_current - (uint8_t*)current) + size, _mi_os_page_size()); base = mi_memory_grow(alloc_size); } } #if defined(MI_USE_PTHREADS) pthread_mutex_unlock(&mi_heap_grow_mutex); #endif if (base != NULL) { p = mi_align_up_ptr(base, try_alignment); if ((uint8_t*)p + size > (uint8_t*)base + alloc_size) { // another thread used wasm_memory_grow/sbrk in-between and we do not have enough // space after alignment. 
Give up (and waste the space as we cannot shrink :-( ) // (in `mi_os_mem_alloc_aligned` this will fall back to overallocation to align) p = NULL; } } } /* if (p == NULL) { _mi_warning_message("unable to allocate sbrk/wasm_memory_grow OS memory (%zu bytes, %zu alignment)\n", size, try_alignment); errno = ENOMEM; return NULL; } */ mi_assert_internal( p == NULL || try_alignment == 0 || (uintptr_t)p % try_alignment == 0 ); return p; } // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { MI_UNUSED(allow_large); MI_UNUSED(commit); MI_UNUSED(hint_addr); *is_large = false; *is_zero = false; *addr = mi_prim_mem_grow(size, try_alignment); return (*addr != NULL ? 0 : ENOMEM); } //--------------------------------------------- // Commit/Reset/Protect //--------------------------------------------- int _mi_prim_commit(void* addr, size_t size, bool* is_zero) { MI_UNUSED(addr); MI_UNUSED(size); *is_zero = false; return 0; } int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) { MI_UNUSED(addr); MI_UNUSED(size); *needs_recommit = false; return 0; } int _mi_prim_reset(void* addr, size_t size) { MI_UNUSED(addr); MI_UNUSED(size); return 0; } int _mi_prim_protect(void* addr, size_t size, bool protect) { MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(protect); return 0; } //--------------------------------------------- // Huge pages and NUMA nodes //--------------------------------------------- int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) { MI_UNUSED(hint_addr); MI_UNUSED(size); MI_UNUSED(numa_node); *is_zero = true; *addr = NULL; return ENOSYS; } size_t _mi_prim_numa_node(void) { return 0; } size_t _mi_prim_numa_node_count(void) { return 1; } //---------------------------------------------------------------- // Clock 
//---------------------------------------------------------------- #include #if defined(CLOCK_REALTIME) || defined(CLOCK_MONOTONIC) mi_msecs_t _mi_prim_clock_now(void) { struct timespec t; #ifdef CLOCK_MONOTONIC clock_gettime(CLOCK_MONOTONIC, &t); #else clock_gettime(CLOCK_REALTIME, &t); #endif return ((mi_msecs_t)t.tv_sec * 1000) + ((mi_msecs_t)t.tv_nsec / 1000000); } #else // low resolution timer mi_msecs_t _mi_prim_clock_now(void) { #if !defined(CLOCKS_PER_SEC) || (CLOCKS_PER_SEC == 1000) || (CLOCKS_PER_SEC == 0) return (mi_msecs_t)clock(); #elif (CLOCKS_PER_SEC < 1000) return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC); #else return (mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000); #endif } #endif //---------------------------------------------------------------- // Process info //---------------------------------------------------------------- void _mi_prim_process_info(mi_process_info_t* pinfo) { // use defaults MI_UNUSED(pinfo); } //---------------------------------------------------------------- // Output //---------------------------------------------------------------- void _mi_prim_out_stderr( const char* msg ) { fputs(msg,stderr); } //---------------------------------------------------------------- // Environment //---------------------------------------------------------------- bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { // cannot call getenv() when still initializing the C runtime. if (_mi_preloading()) return false; const char* s = getenv(name); if (s == NULL) { // we check the upper case name too. 
char buf[64+1]; size_t len = _mi_strnlen(name,sizeof(buf)-1); for (size_t i = 0; i < len; i++) { buf[i] = _mi_toupper(name[i]); } buf[len] = 0; s = getenv(buf); } if (s == NULL || _mi_strnlen(s,result_size) >= result_size) return false; _mi_strlcpy(result, s, result_size); return true; } //---------------------------------------------------------------- // Random //---------------------------------------------------------------- bool _mi_prim_random_buf(void* buf, size_t buf_len) { return false; } //---------------------------------------------------------------- // Thread init/done //---------------------------------------------------------------- void _mi_prim_thread_init_auto_done(void) { // nothing } void _mi_prim_thread_done_auto_done(void) { // nothing } void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { MI_UNUSED(heap); } ================================================ FILE: third-party/mimalloc/src/prim/windows/etw-mimalloc.wprp ================================================ ================================================ FILE: third-party/mimalloc/src/prim/windows/etw.h ================================================ //**********************************************************************` //* This is an include file generated by Message Compiler. *` //* *` //* Copyright (c) Microsoft Corporation. All Rights Reserved. *` //**********************************************************************` #pragma once //***************************************************************************** // // Notes on the ETW event code generated by MC: // // - Structures and arrays of structures are treated as an opaque binary blob. // The caller is responsible for packing the data for the structure into a // single region of memory, with no padding between values. The macro will // have an extra parameter for the length of the blob. 
// - Arrays of nul-terminated strings must be packed by the caller into a // single binary blob containing the correct number of strings, with a nul // after each string. The size of the blob is specified in characters, and // includes the final nul. // - Arrays of SID are treated as a single binary blob. The caller is // responsible for packing the SID values into a single region of memory with // no padding. // - The length attribute on the data element in the manifest is significant // for values with intype win:UnicodeString, win:AnsiString, or win:Binary. // The length attribute must be specified for win:Binary, and is optional for // win:UnicodeString and win:AnsiString (if no length is given, the strings // are assumed to be nul-terminated). For win:UnicodeString, the length is // measured in characters, not bytes. // - For an array of win:UnicodeString, win:AnsiString, or win:Binary, the // length attribute applies to every value in the array, so every value in // the array must have the same length. The values in the array are provided // to the macro via a single pointer -- the caller is responsible for packing // all of the values into a single region of memory with no padding between // values. // - Values of type win:CountedUnicodeString, win:CountedAnsiString, and // win:CountedBinary can be generated and collected on Vista or later. // However, they may not decode properly without the Windows 10 2018 Fall // Update. // - Arrays of type win:CountedUnicodeString, win:CountedAnsiString, and // win:CountedBinary must be packed by the caller into a single region of // memory. The format for each item is a UINT16 byte-count followed by that // many bytes of data. When providing the array to the generated macro, you // must provide the total size of the packed array data, including the UINT16 // sizes for each item. In the case of win:CountedUnicodeString, the data // size is specified in WCHAR (16-bit) units. 
In the case of // win:CountedAnsiString and win:CountedBinary, the data size is specified in // bytes. // //***************************************************************************** #include #include #include #ifndef ETW_INLINE #ifdef _ETW_KM_ // In kernel mode, save stack space by never inlining templates. #define ETW_INLINE DECLSPEC_NOINLINE __inline #else // In user mode, save code size by inlining templates as appropriate. #define ETW_INLINE __inline #endif #endif // ETW_INLINE #if defined(__cplusplus) extern "C" { #endif // // MCGEN_DISABLE_PROVIDER_CODE_GENERATION macro: // Define this macro to have the compiler skip the generated functions in this // header. // #ifndef MCGEN_DISABLE_PROVIDER_CODE_GENERATION // // MCGEN_USE_KERNEL_MODE_APIS macro: // Controls whether the generated code uses kernel-mode or user-mode APIs. // - Set to 0 to use Windows user-mode APIs such as EventRegister. // - Set to 1 to use Windows kernel-mode APIs such as EtwRegister. // Default is based on whether the _ETW_KM_ macro is defined (i.e. by wdm.h). // Note that the APIs can also be overridden directly, e.g. by setting the // MCGEN_EVENTWRITETRANSFER or MCGEN_EVENTREGISTER macros. // #ifndef MCGEN_USE_KERNEL_MODE_APIS #ifdef _ETW_KM_ #define MCGEN_USE_KERNEL_MODE_APIS 1 #else #define MCGEN_USE_KERNEL_MODE_APIS 0 #endif #endif // MCGEN_USE_KERNEL_MODE_APIS // // MCGEN_HAVE_EVENTSETINFORMATION macro: // Controls how McGenEventSetInformation uses the EventSetInformation API. // - Set to 0 to disable the use of EventSetInformation // (McGenEventSetInformation will always return an error). // - Set to 1 to directly invoke MCGEN_EVENTSETINFORMATION. // - Set to 2 to locate EventSetInformation at runtime via GetProcAddress // (user-mode) or MmGetSystemRoutineAddress (kernel-mode). // Default is determined as follows: // - If MCGEN_EVENTSETINFORMATION has been customized, set to 1 // (i.e. use MCGEN_EVENTSETINFORMATION).
// - Else if the target OS version has EventSetInformation, set to 1 // (i.e. use MCGEN_EVENTSETINFORMATION). // - Else set to 2 (i.e. try to dynamically locate EventSetInformation). // Note that an McGenEventSetInformation function will only be generated if one // or more providers in a manifest have provider traits. // #ifndef MCGEN_HAVE_EVENTSETINFORMATION #ifdef MCGEN_EVENTSETINFORMATION // if MCGEN_EVENTSETINFORMATION has been customized, #define MCGEN_HAVE_EVENTSETINFORMATION 1 // directly invoke MCGEN_EVENTSETINFORMATION(...). #elif MCGEN_USE_KERNEL_MODE_APIS // else if using kernel-mode APIs, #if NTDDI_VERSION >= 0x06040000 // if target OS is Windows 10 or later, #define MCGEN_HAVE_EVENTSETINFORMATION 1 // directly invoke MCGEN_EVENTSETINFORMATION(...). #else // else #define MCGEN_HAVE_EVENTSETINFORMATION 2 // find "EtwSetInformation" via MmGetSystemRoutineAddress. #endif // else (using user-mode APIs) #else // if target OS and SDK is Windows 8 or later, #if WINVER >= 0x0602 && defined(EVENT_FILTER_TYPE_SCHEMATIZED) #define MCGEN_HAVE_EVENTSETINFORMATION 1 // directly invoke MCGEN_EVENTSETINFORMATION(...). #else // else #define MCGEN_HAVE_EVENTSETINFORMATION 2 // find "EventSetInformation" via GetModuleHandleExW/GetProcAddress. #endif #endif #endif // MCGEN_HAVE_EVENTSETINFORMATION // // MCGEN Override Macros // // The following override macros may be defined before including this header // to control the APIs used by this header: // // - MCGEN_EVENTREGISTER // - MCGEN_EVENTUNREGISTER // - MCGEN_EVENTSETINFORMATION // - MCGEN_EVENTWRITETRANSFER // // If the macro is undefined, the MC implementation will default to the // corresponding ETW APIs. For example, if the MCGEN_EVENTREGISTER macro is // undefined, the EventRegister[MyProviderName] macro will use EventRegister // in user mode and will use EtwRegister in kernel mode.
// // To prevent issues from conflicting definitions of these macros, the value // of the override macro will be used as a suffix in certain internal function // names. Because of this, the override macros must follow certain rules: // // - The macro must be defined before any MC-generated header is included and // must not be undefined or redefined after any MC-generated header is // included. Different translation units (i.e. different .c or .cpp files) // may set the macros to different values, but within a translation unit // (within a single .c or .cpp file), the macro must be set once and not // changed. // - The override must be an object-like macro, not a function-like macro // (i.e. the override macro must not have a parameter list). // - The override macro's value must be a simple identifier, i.e. must be // something that starts with a letter or '_' and contains only letters, // numbers, and '_' characters. // - If the override macro's value is the name of a second object-like macro, // the second object-like macro must follow the same rules. (The override // macro's value can also be the name of a function-like macro, in which // case the function-like macro does not need to follow the same rules.) // // For example, the following will cause compile errors: // // #define MCGEN_EVENTWRITETRANSFER MyNamespace::MyClass::MyFunction // Value has non-identifier characters (colon). // #define MCGEN_EVENTWRITETRANSFER GetEventWriteFunctionPointer(7) // Value has non-identifier characters (parentheses). // #define MCGEN_EVENTWRITETRANSFER(h,e,a,r,c,d) EventWrite(h,e,c,d) // Override is defined as a function-like macro. // #define MY_OBJECT_LIKE_MACRO MyNamespace::MyClass::MyEventWriteFunction // #define MCGEN_EVENTWRITETRANSFER MY_OBJECT_LIKE_MACRO // Evaluates to something with non-identifier characters (colon). // // The following would be ok: // // #define MCGEN_EVENTWRITETRANSFER MyEventWriteFunction1 // OK, suffix will be "MyEventWriteFunction1". 
// #define MY_OBJECT_LIKE_MACRO MyEventWriteFunction2 // #define MCGEN_EVENTWRITETRANSFER MY_OBJECT_LIKE_MACRO // OK, suffix will be "MyEventWriteFunction2". // #define MY_FUNCTION_LIKE_MACRO(h,e,a,r,c,d) MyNamespace::MyClass::MyEventWriteFunction3(h,e,c,d) // #define MCGEN_EVENTWRITETRANSFER MY_FUNCTION_LIKE_MACRO // OK, suffix will be "MY_FUNCTION_LIKE_MACRO". // #ifndef MCGEN_EVENTREGISTER #if MCGEN_USE_KERNEL_MODE_APIS #define MCGEN_EVENTREGISTER EtwRegister #else #define MCGEN_EVENTREGISTER EventRegister #endif #endif // MCGEN_EVENTREGISTER #ifndef MCGEN_EVENTUNREGISTER #if MCGEN_USE_KERNEL_MODE_APIS #define MCGEN_EVENTUNREGISTER EtwUnregister #else #define MCGEN_EVENTUNREGISTER EventUnregister #endif #endif // MCGEN_EVENTUNREGISTER #ifndef MCGEN_EVENTSETINFORMATION #if MCGEN_USE_KERNEL_MODE_APIS #define MCGEN_EVENTSETINFORMATION EtwSetInformation #else #define MCGEN_EVENTSETINFORMATION EventSetInformation #endif #endif // MCGEN_EVENTSETINFORMATION #ifndef MCGEN_EVENTWRITETRANSFER #if MCGEN_USE_KERNEL_MODE_APIS #define MCGEN_EVENTWRITETRANSFER EtwWriteTransfer #else #define MCGEN_EVENTWRITETRANSFER EventWriteTransfer #endif #endif // MCGEN_EVENTWRITETRANSFER // // MCGEN_EVENT_ENABLED macro: // Override to control how the EventWrite[EventName] macros determine whether // an event is enabled. The default behavior is for EventWrite[EventName] to // use the EventEnabled[EventName] macros. // #ifndef MCGEN_EVENT_ENABLED #define MCGEN_EVENT_ENABLED(EventName) EventEnabled##EventName() #endif // // MCGEN_EVENT_ENABLED_FORCONTEXT macro: // Override to control how the EventWrite[EventName]_ForContext macros // determine whether an event is enabled. The default behavior is for // EventWrite[EventName]_ForContext to use the // EventEnabled[EventName]_ForContext macros. 
// #ifndef MCGEN_EVENT_ENABLED_FORCONTEXT #define MCGEN_EVENT_ENABLED_FORCONTEXT(pContext, EventName) EventEnabled##EventName##_ForContext(pContext) #endif // // MCGEN_ENABLE_CHECK macro: // Determines whether the specified event would be considered as enabled // based on the state of the specified context. Slightly faster than calling // McGenEventEnabled directly. // #ifndef MCGEN_ENABLE_CHECK #define MCGEN_ENABLE_CHECK(Context, Descriptor) (Context.IsEnabled && McGenEventEnabled(&Context, &Descriptor)) #endif #if !defined(MCGEN_TRACE_CONTEXT_DEF) #define MCGEN_TRACE_CONTEXT_DEF // This structure is for use by MC-generated code and should not be used directly. typedef struct _MCGEN_TRACE_CONTEXT { TRACEHANDLE RegistrationHandle; TRACEHANDLE Logger; // Used as pointer to provider traits. ULONGLONG MatchAnyKeyword; ULONGLONG MatchAllKeyword; ULONG Flags; ULONG IsEnabled; UCHAR Level; UCHAR Reserve; USHORT EnableBitsCount; PULONG EnableBitMask; const ULONGLONG* EnableKeyWords; const UCHAR* EnableLevel; } MCGEN_TRACE_CONTEXT, *PMCGEN_TRACE_CONTEXT; #endif // MCGEN_TRACE_CONTEXT_DEF #if !defined(MCGEN_LEVEL_KEYWORD_ENABLED_DEF) #define MCGEN_LEVEL_KEYWORD_ENABLED_DEF // // Determines whether an event with a given Level and Keyword would be // considered as enabled based on the state of the specified context. // Note that you may want to use MCGEN_ENABLE_CHECK instead of calling this // function directly. // FORCEINLINE BOOLEAN McGenLevelKeywordEnabled( _In_ PMCGEN_TRACE_CONTEXT EnableInfo, _In_ UCHAR Level, _In_ ULONGLONG Keyword ) { // // Check if the event Level is lower than the level at which // the channel is enabled. // If the event Level is 0 or the channel is enabled at level 0, // all levels are enabled. // if ((Level <= EnableInfo->Level) || // This also covers the case of Level == 0. 
(EnableInfo->Level == 0)) { // // Check if Keyword is enabled // if ((Keyword == (ULONGLONG)0) || ((Keyword & EnableInfo->MatchAnyKeyword) && ((Keyword & EnableInfo->MatchAllKeyword) == EnableInfo->MatchAllKeyword))) { return TRUE; } } return FALSE; } #endif // MCGEN_LEVEL_KEYWORD_ENABLED_DEF #if !defined(MCGEN_EVENT_ENABLED_DEF) #define MCGEN_EVENT_ENABLED_DEF // // Determines whether the specified event would be considered as enabled based // on the state of the specified context. Note that you may want to use // MCGEN_ENABLE_CHECK instead of calling this function directly. // FORCEINLINE BOOLEAN McGenEventEnabled( _In_ PMCGEN_TRACE_CONTEXT EnableInfo, _In_ PCEVENT_DESCRIPTOR EventDescriptor ) { return McGenLevelKeywordEnabled(EnableInfo, EventDescriptor->Level, EventDescriptor->Keyword); } #endif // MCGEN_EVENT_ENABLED_DEF #if !defined(MCGEN_CONTROL_CALLBACK) #define MCGEN_CONTROL_CALLBACK // This function is for use by MC-generated code and should not be used directly. DECLSPEC_NOINLINE __inline VOID __stdcall McGenControlCallbackV2( _In_ LPCGUID SourceId, _In_ ULONG ControlCode, _In_ UCHAR Level, _In_ ULONGLONG MatchAnyKeyword, _In_ ULONGLONG MatchAllKeyword, _In_opt_ PEVENT_FILTER_DESCRIPTOR FilterData, _Inout_opt_ PVOID CallbackContext ) /*++ Routine Description: This is the notification callback for Windows Vista and later. Arguments: SourceId - The GUID that identifies the session that enabled the provider. ControlCode - The parameter indicates whether the provider is being enabled or disabled. Level - The level at which the event is enabled. MatchAnyKeyword - The bitmask of keywords that the provider uses to determine the category of events that it writes. MatchAllKeyword - This bitmask additionally restricts the category of events that the provider writes. FilterData - The provider-defined data. CallbackContext - The context of the callback that is defined when the provider called EtwRegister to register itself. 
Remarks: ETW calls this function to notify provider of enable/disable --*/ { PMCGEN_TRACE_CONTEXT Ctx = (PMCGEN_TRACE_CONTEXT)CallbackContext; ULONG Ix; #ifndef MCGEN_PRIVATE_ENABLE_CALLBACK_V2 UNREFERENCED_PARAMETER(SourceId); UNREFERENCED_PARAMETER(FilterData); #endif if (Ctx == NULL) { return; } switch (ControlCode) { case EVENT_CONTROL_CODE_ENABLE_PROVIDER: Ctx->Level = Level; Ctx->MatchAnyKeyword = MatchAnyKeyword; Ctx->MatchAllKeyword = MatchAllKeyword; Ctx->IsEnabled = EVENT_CONTROL_CODE_ENABLE_PROVIDER; for (Ix = 0; Ix < Ctx->EnableBitsCount; Ix += 1) { if (McGenLevelKeywordEnabled(Ctx, Ctx->EnableLevel[Ix], Ctx->EnableKeyWords[Ix]) != FALSE) { Ctx->EnableBitMask[Ix >> 5] |= (1 << (Ix % 32)); } else { Ctx->EnableBitMask[Ix >> 5] &= ~(1 << (Ix % 32)); } } break; case EVENT_CONTROL_CODE_DISABLE_PROVIDER: Ctx->IsEnabled = EVENT_CONTROL_CODE_DISABLE_PROVIDER; Ctx->Level = 0; Ctx->MatchAnyKeyword = 0; Ctx->MatchAllKeyword = 0; if (Ctx->EnableBitsCount > 0) { #pragma warning(suppress: 26451) // Arithmetic overflow cannot occur, no matter the value of EnableBitCount RtlZeroMemory(Ctx->EnableBitMask, (((Ctx->EnableBitsCount - 1) / 32) + 1) * sizeof(ULONG)); } break; default: break; } #ifdef MCGEN_PRIVATE_ENABLE_CALLBACK_V2 // // Call user defined callback // MCGEN_PRIVATE_ENABLE_CALLBACK_V2( SourceId, ControlCode, Level, MatchAnyKeyword, MatchAllKeyword, FilterData, CallbackContext ); #endif // MCGEN_PRIVATE_ENABLE_CALLBACK_V2 return; } #endif // MCGEN_CONTROL_CALLBACK #ifndef _mcgen_PENABLECALLBACK #if MCGEN_USE_KERNEL_MODE_APIS #define _mcgen_PENABLECALLBACK PETWENABLECALLBACK #else #define _mcgen_PENABLECALLBACK PENABLECALLBACK #endif #endif // _mcgen_PENABLECALLBACK #if !defined(_mcgen_PASTE2) // This macro is for use by MC-generated code and should not be used directly. 
#define _mcgen_PASTE2(a, b) _mcgen_PASTE2_imp(a, b) #define _mcgen_PASTE2_imp(a, b) a##b #endif // _mcgen_PASTE2 #if !defined(_mcgen_PASTE3) // This macro is for use by MC-generated code and should not be used directly. #define _mcgen_PASTE3(a, b, c) _mcgen_PASTE3_imp(a, b, c) #define _mcgen_PASTE3_imp(a, b, c) a##b##_##c #endif // _mcgen_PASTE3 // // Macro validation // // Validate MCGEN_EVENTREGISTER: // Trigger an error if MCGEN_EVENTREGISTER is not an unqualified (simple) identifier: struct _mcgen_PASTE2(MCGEN_EVENTREGISTER_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTREGISTER); // Trigger an error if MCGEN_EVENTREGISTER is redefined: typedef struct _mcgen_PASTE2(MCGEN_EVENTREGISTER_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTREGISTER) MCGEN_EVENTREGISTER_must_not_be_redefined_between_headers; // Trigger an error if MCGEN_EVENTREGISTER is defined as a function-like macro: typedef void MCGEN_EVENTREGISTER_must_not_be_a_functionLike_macro_MCGEN_EVENTREGISTER; typedef int _mcgen_PASTE2(MCGEN_EVENTREGISTER_must_not_be_a_functionLike_macro_, MCGEN_EVENTREGISTER); // Validate MCGEN_EVENTUNREGISTER: // Trigger an error if MCGEN_EVENTUNREGISTER is not an unqualified (simple) identifier: struct _mcgen_PASTE2(MCGEN_EVENTUNREGISTER_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTUNREGISTER); // Trigger an error if MCGEN_EVENTUNREGISTER is redefined: typedef struct _mcgen_PASTE2(MCGEN_EVENTUNREGISTER_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTUNREGISTER) MCGEN_EVENTUNREGISTER_must_not_be_redefined_between_headers; // Trigger an error if MCGEN_EVENTUNREGISTER is defined as a function-like macro: typedef void MCGEN_EVENTUNREGISTER_must_not_be_a_functionLike_macro_MCGEN_EVENTUNREGISTER; typedef int _mcgen_PASTE2(MCGEN_EVENTUNREGISTER_must_not_be_a_functionLike_macro_, MCGEN_EVENTUNREGISTER); // Validate MCGEN_EVENTSETINFORMATION: // Trigger an error if MCGEN_EVENTSETINFORMATION is not an unqualified (simple) identifier: 
struct _mcgen_PASTE2(MCGEN_EVENTSETINFORMATION_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTSETINFORMATION); // Trigger an error if MCGEN_EVENTSETINFORMATION is redefined: typedef struct _mcgen_PASTE2(MCGEN_EVENTSETINFORMATION_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTSETINFORMATION) MCGEN_EVENTSETINFORMATION_must_not_be_redefined_between_headers; // Trigger an error if MCGEN_EVENTSETINFORMATION is defined as a function-like macro: typedef void MCGEN_EVENTSETINFORMATION_must_not_be_a_functionLike_macro_MCGEN_EVENTSETINFORMATION; typedef int _mcgen_PASTE2(MCGEN_EVENTSETINFORMATION_must_not_be_a_functionLike_macro_, MCGEN_EVENTSETINFORMATION); // Validate MCGEN_EVENTWRITETRANSFER: // Trigger an error if MCGEN_EVENTWRITETRANSFER is not an unqualified (simple) identifier: struct _mcgen_PASTE2(MCGEN_EVENTWRITETRANSFER_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTWRITETRANSFER); // Trigger an error if MCGEN_EVENTWRITETRANSFER is redefined: typedef struct _mcgen_PASTE2(MCGEN_EVENTWRITETRANSFER_definition_must_be_an_unqualified_identifier_, MCGEN_EVENTWRITETRANSFER) MCGEN_EVENTWRITETRANSFER_must_not_be_redefined_between_headers;; // Trigger an error if MCGEN_EVENTWRITETRANSFER is defined as a function-like macro: typedef void MCGEN_EVENTWRITETRANSFER_must_not_be_a_functionLike_macro_MCGEN_EVENTWRITETRANSFER; typedef int _mcgen_PASTE2(MCGEN_EVENTWRITETRANSFER_must_not_be_a_functionLike_macro_, MCGEN_EVENTWRITETRANSFER); #ifndef McGenEventWrite_def #define McGenEventWrite_def // This macro is for use by MC-generated code and should not be used directly. #define McGenEventWrite _mcgen_PASTE2(McGenEventWrite_, MCGEN_EVENTWRITETRANSFER) // This function is for use by MC-generated code and should not be used directly. 
DECLSPEC_NOINLINE __inline ULONG __stdcall McGenEventWrite( _In_ PMCGEN_TRACE_CONTEXT Context, _In_ PCEVENT_DESCRIPTOR Descriptor, _In_opt_ LPCGUID ActivityId, _In_range_(1, 128) ULONG EventDataCount, _Pre_cap_(EventDataCount) EVENT_DATA_DESCRIPTOR* EventData ) { const USHORT UNALIGNED* Traits; // Some customized MCGEN_EVENTWRITETRANSFER macros might ignore ActivityId. UNREFERENCED_PARAMETER(ActivityId); Traits = (const USHORT UNALIGNED*)(UINT_PTR)Context->Logger; if (Traits == NULL) { EventData[0].Ptr = 0; EventData[0].Size = 0; EventData[0].Reserved = 0; } else { EventData[0].Ptr = (ULONG_PTR)Traits; EventData[0].Size = *Traits; EventData[0].Reserved = 2; // EVENT_DATA_DESCRIPTOR_TYPE_PROVIDER_METADATA } return MCGEN_EVENTWRITETRANSFER( Context->RegistrationHandle, Descriptor, ActivityId, NULL, EventDataCount, EventData); } #endif // McGenEventWrite_def #if !defined(McGenEventRegisterUnregister) #define McGenEventRegisterUnregister // This macro is for use by MC-generated code and should not be used directly. #define McGenEventRegister _mcgen_PASTE2(McGenEventRegister_, MCGEN_EVENTREGISTER) #pragma warning(push) #pragma warning(disable:6103) // This function is for use by MC-generated code and should not be used directly. DECLSPEC_NOINLINE __inline ULONG __stdcall McGenEventRegister( _In_ LPCGUID ProviderId, _In_opt_ _mcgen_PENABLECALLBACK EnableCallback, _In_opt_ PVOID CallbackContext, _Inout_ PREGHANDLE RegHandle ) /*++ Routine Description: This function registers the provider with ETW. Arguments: ProviderId - Provider ID to register with ETW. EnableCallback - Callback to be used. CallbackContext - Context for the callback. RegHandle - Pointer to registration handle. Remarks: Should not be called if the provider is already registered (i.e. should not be called if *RegHandle != 0). Repeatedly registering a provider is a bug and may indicate a race condition. However, for compatibility with previous behavior, this function will return SUCCESS in this case. 
--*/ { ULONG Error; if (*RegHandle != 0) { Error = 0; // ERROR_SUCCESS } else { Error = MCGEN_EVENTREGISTER(ProviderId, EnableCallback, CallbackContext, RegHandle); } return Error; } #pragma warning(pop) // This macro is for use by MC-generated code and should not be used directly. #define McGenEventUnregister _mcgen_PASTE2(McGenEventUnregister_, MCGEN_EVENTUNREGISTER) // This function is for use by MC-generated code and should not be used directly. DECLSPEC_NOINLINE __inline ULONG __stdcall McGenEventUnregister(_Inout_ PREGHANDLE RegHandle) /*++ Routine Description: Unregister from ETW and set *RegHandle = 0. Arguments: RegHandle - the pointer to the provider registration handle Remarks: If provider has not been registered (i.e. if *RegHandle == 0), return SUCCESS. It is safe to call McGenEventUnregister even if the call to McGenEventRegister returned an error. --*/ { ULONG Error; if(*RegHandle == 0) { Error = 0; // ERROR_SUCCESS } else { Error = MCGEN_EVENTUNREGISTER(*RegHandle); *RegHandle = (REGHANDLE)0; } return Error; } #endif // McGenEventRegisterUnregister #ifndef _mcgen_EVENT_BIT_SET #if defined(_M_IX86) || defined(_M_X64) // This macro is for use by MC-generated code and should not be used directly. #define _mcgen_EVENT_BIT_SET(EnableBits, BitPosition) ((((const unsigned char*)EnableBits)[BitPosition >> 3] & (1u << (BitPosition & 7))) != 0) #else // CPU type // This macro is for use by MC-generated code and should not be used directly. 
#define _mcgen_EVENT_BIT_SET(EnableBits, BitPosition) ((EnableBits[BitPosition >> 5] & (1u << (BitPosition & 31))) != 0) #endif // CPU type #endif // _mcgen_EVENT_BIT_SET #endif // MCGEN_DISABLE_PROVIDER_CODE_GENERATION //+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ // Provider "microsoft-windows-mimalloc" event count 2 //+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ // Provider GUID = 138f4dbb-ee04-4899-aa0a-572ad4475779 EXTERN_C __declspec(selectany) const GUID ETW_MI_Provider = {0x138f4dbb, 0xee04, 0x4899, {0xaa, 0x0a, 0x57, 0x2a, 0xd4, 0x47, 0x57, 0x79}}; #ifndef ETW_MI_Provider_Traits #define ETW_MI_Provider_Traits NULL #endif // ETW_MI_Provider_Traits // // Event Descriptors // EXTERN_C __declspec(selectany) const EVENT_DESCRIPTOR ETW_MI_ALLOC = {0x64, 0x1, 0x0, 0x4, 0x0, 0x0, 0x0}; #define ETW_MI_ALLOC_value 0x64 EXTERN_C __declspec(selectany) const EVENT_DESCRIPTOR ETW_MI_FREE = {0x65, 0x1, 0x0, 0x4, 0x0, 0x0, 0x0}; #define ETW_MI_FREE_value 0x65 // // MCGEN_DISABLE_PROVIDER_CODE_GENERATION macro: // Define this macro to have the compiler skip the generated functions in this // header. // #ifndef MCGEN_DISABLE_PROVIDER_CODE_GENERATION // // Event Enablement Bits // These variables are for use by MC-generated code and should not be used directly. 
// EXTERN_C __declspec(selectany) DECLSPEC_CACHEALIGN ULONG microsoft_windows_mimallocEnableBits[1]; EXTERN_C __declspec(selectany) const ULONGLONG microsoft_windows_mimallocKeywords[1] = {0x0}; EXTERN_C __declspec(selectany) const unsigned char microsoft_windows_mimallocLevels[1] = {4}; // // Provider context // EXTERN_C __declspec(selectany) MCGEN_TRACE_CONTEXT ETW_MI_Provider_Context = {0, (ULONG_PTR)ETW_MI_Provider_Traits, 0, 0, 0, 0, 0, 0, 1, microsoft_windows_mimallocEnableBits, microsoft_windows_mimallocKeywords, microsoft_windows_mimallocLevels}; // // Provider REGHANDLE // #define microsoft_windows_mimallocHandle (ETW_MI_Provider_Context.RegistrationHandle) // // This macro is set to 0, indicating that the EventWrite[Name] macros do not // have an Activity parameter. This is controlled by the -km and -um options. // #define ETW_MI_Provider_EventWriteActivity 0 // // Register with ETW using the control GUID specified in the manifest. // Invoke this macro during module initialization (i.e. program startup, // DLL process attach, or driver load) to initialize the provider. // Note that if this function returns an error, the error means that // will not work, but no action needs to be taken -- even if EventRegister // returns an error, it is generally safe to use EventWrite and // EventUnregister macros (they will be no-ops if EventRegister failed). // #ifndef EventRegistermicrosoft_windows_mimalloc #define EventRegistermicrosoft_windows_mimalloc() McGenEventRegister(&ETW_MI_Provider, McGenControlCallbackV2, &ETW_MI_Provider_Context, µsoft_windows_mimallocHandle) #endif // // Register with ETW using a specific control GUID (i.e. a GUID other than what // is specified in the manifest). Advanced scenarios only. 
// #ifndef EventRegisterByGuidmicrosoft_windows_mimalloc #define EventRegisterByGuidmicrosoft_windows_mimalloc(Guid) McGenEventRegister(&(Guid), McGenControlCallbackV2, &ETW_MI_Provider_Context, µsoft_windows_mimallocHandle) #endif // // Unregister with ETW and close the provider. // Invoke this macro during module shutdown (i.e. program exit, DLL process // detach, or driver unload) to unregister the provider. // Note that you MUST call EventUnregister before DLL or driver unload // (not optional): failure to unregister a provider before DLL or driver unload // will result in crashes. // #ifndef EventUnregistermicrosoft_windows_mimalloc #define EventUnregistermicrosoft_windows_mimalloc() McGenEventUnregister(µsoft_windows_mimallocHandle) #endif // // MCGEN_ENABLE_FORCONTEXT_CODE_GENERATION macro: // Define this macro to enable support for caller-allocated provider context. // #ifdef MCGEN_ENABLE_FORCONTEXT_CODE_GENERATION // // Advanced scenarios: Caller-allocated provider context. // Use when multiple differently-configured provider handles are needed, // e.g. for container-aware drivers, one context per container. // // Usage: // // - Caller enables the feature before including this header, e.g. // #define MCGEN_ENABLE_FORCONTEXT_CODE_GENERATION 1 // - Caller allocates memory, e.g. pContext = malloc(sizeof(McGenContext_microsoft_windows_mimalloc)); // - Caller registers the provider, e.g. EventRegistermicrosoft_windows_mimalloc_ForContext(pContext); // - Caller writes events, e.g. EventWriteMyEvent_ForContext(pContext, ...); // - Caller unregisters, e.g. EventUnregistermicrosoft_windows_mimalloc_ForContext(pContext); // - Caller frees memory, e.g. free(pContext); // typedef struct tagMcGenContext_microsoft_windows_mimalloc { // The fields of this structure are subject to change and should // not be accessed directly. To access the provider's REGHANDLE, // use microsoft_windows_mimallocHandle_ForContext(pContext). 
MCGEN_TRACE_CONTEXT Context; ULONG EnableBits[1]; } McGenContext_microsoft_windows_mimalloc; #define EventRegistermicrosoft_windows_mimalloc_ForContext(pContext) _mcgen_PASTE2(_mcgen_RegisterForContext_microsoft_windows_mimalloc_, MCGEN_EVENTREGISTER)(&ETW_MI_Provider, pContext) #define EventRegisterByGuidmicrosoft_windows_mimalloc_ForContext(Guid, pContext) _mcgen_PASTE2(_mcgen_RegisterForContext_microsoft_windows_mimalloc_, MCGEN_EVENTREGISTER)(&(Guid), pContext) #define EventUnregistermicrosoft_windows_mimalloc_ForContext(pContext) McGenEventUnregister(&(pContext)->Context.RegistrationHandle) // // Provider REGHANDLE for caller-allocated context. // #define microsoft_windows_mimallocHandle_ForContext(pContext) ((pContext)->Context.RegistrationHandle) // This function is for use by MC-generated code and should not be used directly. // Initialize and register the caller-allocated context. __inline ULONG __stdcall _mcgen_PASTE2(_mcgen_RegisterForContext_microsoft_windows_mimalloc_, MCGEN_EVENTREGISTER)( _In_ LPCGUID pProviderId, _Out_ McGenContext_microsoft_windows_mimalloc* pContext) { RtlZeroMemory(pContext, sizeof(*pContext)); pContext->Context.Logger = (ULONG_PTR)ETW_MI_Provider_Traits; pContext->Context.EnableBitsCount = 1; pContext->Context.EnableBitMask = pContext->EnableBits; pContext->Context.EnableKeyWords = microsoft_windows_mimallocKeywords; pContext->Context.EnableLevel = microsoft_windows_mimallocLevels; return McGenEventRegister( pProviderId, McGenControlCallbackV2, &pContext->Context, &pContext->Context.RegistrationHandle); } // This function is for use by MC-generated code and should not be used directly. // Trigger a compile error if called with the wrong parameter type. 
// Identity function: returns pContext unchanged. Its only job is to make the
// compiler check the argument type where the _ForContext macros use it.
FORCEINLINE
_Ret_ McGenContext_microsoft_windows_mimalloc*
_mcgen_CheckContextType_microsoft_windows_mimalloc(_In_ McGenContext_microsoft_windows_mimalloc* pContext)
{
    return pContext;
}

#endif // MCGEN_ENABLE_FORCONTEXT_CODE_GENERATION

//
// Enablement check macro for event "ETW_MI_ALLOC"
//
// Tests enable bit 0 (the same bit position is used for "ETW_MI_FREE" below).
//
#define EventEnabledETW_MI_ALLOC() _mcgen_EVENT_BIT_SET(microsoft_windows_mimallocEnableBits, 0)
#define EventEnabledETW_MI_ALLOC_ForContext(pContext) _mcgen_EVENT_BIT_SET(_mcgen_CheckContextType_microsoft_windows_mimalloc(pContext)->EnableBits, 0)

//
// Event write macros for event "ETW_MI_ALLOC"
//
// The plain and _ForContext forms expand to "enabled ? write : 0"; the
// _AssumeEnabled forms skip the enablement test and always call the template.
// NOTE(review): the conditional expansions are not wrapped in parentheses, so
// combining them with other operators at the call site needs explicit
// parentheses -- confirm callers only use them as full expressions.
//
#define EventWriteETW_MI_ALLOC(Address, Size) \
        MCGEN_EVENT_ENABLED(ETW_MI_ALLOC) \
        ? _mcgen_TEMPLATE_FOR_ETW_MI_ALLOC(&ETW_MI_Provider_Context, &ETW_MI_ALLOC, Address, Size) : 0
#define EventWriteETW_MI_ALLOC_AssumeEnabled(Address, Size) \
        _mcgen_TEMPLATE_FOR_ETW_MI_ALLOC(&ETW_MI_Provider_Context, &ETW_MI_ALLOC, Address, Size)
#define EventWriteETW_MI_ALLOC_ForContext(pContext, Address, Size) \
        MCGEN_EVENT_ENABLED_FORCONTEXT(pContext, ETW_MI_ALLOC) \
        ? _mcgen_TEMPLATE_FOR_ETW_MI_ALLOC(&(pContext)->Context, &ETW_MI_ALLOC, Address, Size) : 0
#define EventWriteETW_MI_ALLOC_ForContextAssumeEnabled(pContext, Address, Size) \
        _mcgen_TEMPLATE_FOR_ETW_MI_ALLOC(&_mcgen_CheckContextType_microsoft_windows_mimalloc(pContext)->Context, &ETW_MI_ALLOC, Address, Size)

// This macro is for use by MC-generated code and should not be used directly.
#define _mcgen_TEMPLATE_FOR_ETW_MI_ALLOC _mcgen_PASTE2(McTemplateU0xx_, MCGEN_EVENTWRITETRANSFER)

//
// Enablement check macro for event "ETW_MI_FREE"
//
#define EventEnabledETW_MI_FREE() _mcgen_EVENT_BIT_SET(microsoft_windows_mimallocEnableBits, 0)
#define EventEnabledETW_MI_FREE_ForContext(pContext) _mcgen_EVENT_BIT_SET(_mcgen_CheckContextType_microsoft_windows_mimalloc(pContext)->EnableBits, 0)

//
// Event write macros for event "ETW_MI_FREE"
//
// Same four variants as for "ETW_MI_ALLOC", using the ETW_MI_FREE descriptor.
//
#define EventWriteETW_MI_FREE(Address, Size) \
        MCGEN_EVENT_ENABLED(ETW_MI_FREE) \
        ? _mcgen_TEMPLATE_FOR_ETW_MI_FREE(&ETW_MI_Provider_Context, &ETW_MI_FREE, Address, Size) : 0
#define EventWriteETW_MI_FREE_AssumeEnabled(Address, Size) \
        _mcgen_TEMPLATE_FOR_ETW_MI_FREE(&ETW_MI_Provider_Context, &ETW_MI_FREE, Address, Size)
#define EventWriteETW_MI_FREE_ForContext(pContext, Address, Size) \
        MCGEN_EVENT_ENABLED_FORCONTEXT(pContext, ETW_MI_FREE) \
        ? _mcgen_TEMPLATE_FOR_ETW_MI_FREE(&(pContext)->Context, &ETW_MI_FREE, Address, Size) : 0
#define EventWriteETW_MI_FREE_ForContextAssumeEnabled(pContext, Address, Size) \
        _mcgen_TEMPLATE_FOR_ETW_MI_FREE(&_mcgen_CheckContextType_microsoft_windows_mimalloc(pContext)->Context, &ETW_MI_FREE, Address, Size)

// This macro is for use by MC-generated code and should not be used directly.
#define _mcgen_TEMPLATE_FOR_ETW_MI_FREE _mcgen_PASTE2(McTemplateU0xx_, MCGEN_EVENTWRITETRANSFER)

#endif // MCGEN_DISABLE_PROVIDER_CODE_GENERATION

//
// MCGEN_DISABLE_PROVIDER_CODE_GENERATION macro:
// Define this macro to have the compiler skip the generated functions in this
// header.
//
#ifndef MCGEN_DISABLE_PROVIDER_CODE_GENERATION

//
// Template Functions
//

//
// Function for template "ETW_CUSTOM_HEAP_ALLOC_DATA" (and possibly others).
// This function is for use by MC-generated code and should not be used directly.
// #ifndef McTemplateU0xx_def #define McTemplateU0xx_def ETW_INLINE ULONG _mcgen_PASTE2(McTemplateU0xx_, MCGEN_EVENTWRITETRANSFER)( _In_ PMCGEN_TRACE_CONTEXT Context, _In_ PCEVENT_DESCRIPTOR Descriptor, _In_ const unsigned __int64 _Arg0, _In_ const unsigned __int64 _Arg1 ) { #define McTemplateU0xx_ARGCOUNT 2 EVENT_DATA_DESCRIPTOR EventData[McTemplateU0xx_ARGCOUNT + 1]; EventDataDescCreate(&EventData[1],&_Arg0, sizeof(const unsigned __int64) ); EventDataDescCreate(&EventData[2],&_Arg1, sizeof(const unsigned __int64) ); return McGenEventWrite(Context, Descriptor, NULL, McTemplateU0xx_ARGCOUNT + 1, EventData); } #endif // McTemplateU0xx_def #endif // MCGEN_DISABLE_PROVIDER_CODE_GENERATION #if defined(__cplusplus) } #endif ================================================ FILE: third-party/mimalloc/src/prim/windows/prim.c ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2023, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ // This file is included in `src/prim/prim.c` #include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/prim.h" #include // fputs, stderr //--------------------------------------------- // Dynamically bind Windows API points for portability //--------------------------------------------- // We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016. // So, we need to look it up dynamically to run on older systems. (use __stdcall for 32-bit compatibility) // NtAllocateVirtualAllocEx is used for huge OS page allocation (1GiB) // We define a minimal MEM_EXTENDED_PARAMETER ourselves in order to be able to compile with older SDK's. 
typedef enum MI_MEM_EXTENDED_PARAMETER_TYPE_E { MiMemExtendedParameterInvalidType = 0, MiMemExtendedParameterAddressRequirements, MiMemExtendedParameterNumaNode, MiMemExtendedParameterPartitionHandle, MiMemExtendedParameterUserPhysicalHandle, MiMemExtendedParameterAttributeFlags, MiMemExtendedParameterMax } MI_MEM_EXTENDED_PARAMETER_TYPE; typedef struct DECLSPEC_ALIGN(8) MI_MEM_EXTENDED_PARAMETER_S { struct { DWORD64 Type : 8; DWORD64 Reserved : 56; } Type; union { DWORD64 ULong64; PVOID Pointer; SIZE_T Size; HANDLE Handle; DWORD ULong; } Arg; } MI_MEM_EXTENDED_PARAMETER; typedef struct MI_MEM_ADDRESS_REQUIREMENTS_S { PVOID LowestStartingAddress; PVOID HighestEndingAddress; SIZE_T Alignment; } MI_MEM_ADDRESS_REQUIREMENTS; #define MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE 0x00000010 #include typedef PVOID (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG); typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG); static PVirtualAlloc2 pVirtualAlloc2 = NULL; static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL; // Similarly, GetNumaProcessorNodeEx is only supported since Windows 7 typedef struct MI_PROCESSOR_NUMBER_S { WORD Group; BYTE Number; BYTE Reserved; } MI_PROCESSOR_NUMBER; typedef VOID (__stdcall *PGetCurrentProcessorNumberEx)(MI_PROCESSOR_NUMBER* ProcNumber); typedef BOOL (__stdcall *PGetNumaProcessorNodeEx)(MI_PROCESSOR_NUMBER* Processor, PUSHORT NodeNumber); typedef BOOL (__stdcall* PGetNumaNodeProcessorMaskEx)(USHORT Node, PGROUP_AFFINITY ProcessorMask); typedef BOOL (__stdcall *PGetNumaProcessorNode)(UCHAR Processor, PUCHAR NodeNumber); static PGetCurrentProcessorNumberEx pGetCurrentProcessorNumberEx = NULL; static PGetNumaProcessorNodeEx pGetNumaProcessorNodeEx = NULL; static PGetNumaNodeProcessorMaskEx pGetNumaNodeProcessorMaskEx = NULL; static PGetNumaProcessorNode pGetNumaProcessorNode = NULL; // Available after 
Windows XP typedef BOOL (__stdcall *PGetPhysicallyInstalledSystemMemory)( PULONGLONG TotalMemoryInKilobytes ); //--------------------------------------------- // Enable large page support dynamically (if possible) //--------------------------------------------- static bool win_enable_large_os_pages(size_t* large_page_size) { static bool large_initialized = false; if (large_initialized) return (_mi_os_large_page_size() > 0); large_initialized = true; // Try to see if large OS pages are supported // To use large pages on Windows, we first need access permission // Set "Lock pages in memory" permission in the group policy editor // unsigned long err = 0; HANDLE token = NULL; BOOL ok = OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token); if (ok) { TOKEN_PRIVILEGES tp; ok = LookupPrivilegeValue(NULL, TEXT("SeLockMemoryPrivilege"), &tp.Privileges[0].Luid); if (ok) { tp.PrivilegeCount = 1; tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; ok = AdjustTokenPrivileges(token, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0); if (ok) { err = GetLastError(); ok = (err == ERROR_SUCCESS); if (ok && large_page_size != NULL) { *large_page_size = GetLargePageMinimum(); } } } CloseHandle(token); } if (!ok) { if (err == 0) err = GetLastError(); _mi_warning_message("cannot enable large OS page support, error %lu\n", err); } return (ok!=0); } //--------------------------------------------- // Initialize //--------------------------------------------- void _mi_prim_mem_init( mi_os_mem_config_t* config ) { config->has_overcommit = false; config->has_partial_free = false; config->has_virtual_reserve = true; // get the page size SYSTEM_INFO si; GetSystemInfo(&si); if (si.dwPageSize > 0) { config->page_size = si.dwPageSize; } if (si.dwAllocationGranularity > 0) { config->alloc_granularity = si.dwAllocationGranularity; } // get virtual address bits if ((uintptr_t)si.lpMaximumApplicationAddress > 0) { const size_t vbits = MI_SIZE_BITS - 
mi_clz((uintptr_t)si.lpMaximumApplicationAddress); config->virtual_address_bits = vbits; } // get the VirtualAlloc2 function HINSTANCE hDll; hDll = LoadLibrary(TEXT("kernelbase.dll")); if (hDll != NULL) { // use VirtualAlloc2FromApp if possible as it is available to Windows store apps pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2FromApp"); if (pVirtualAlloc2==NULL) pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2"); FreeLibrary(hDll); } // NtAllocateVirtualMemoryEx is used for huge page allocation hDll = LoadLibrary(TEXT("ntdll.dll")); if (hDll != NULL) { pNtAllocateVirtualMemoryEx = (PNtAllocateVirtualMemoryEx)(void (*)(void))GetProcAddress(hDll, "NtAllocateVirtualMemoryEx"); FreeLibrary(hDll); } // Try to use Win7+ numa API hDll = LoadLibrary(TEXT("kernel32.dll")); if (hDll != NULL) { pGetCurrentProcessorNumberEx = (PGetCurrentProcessorNumberEx)(void (*)(void))GetProcAddress(hDll, "GetCurrentProcessorNumberEx"); pGetNumaProcessorNodeEx = (PGetNumaProcessorNodeEx)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNodeEx"); pGetNumaNodeProcessorMaskEx = (PGetNumaNodeProcessorMaskEx)(void (*)(void))GetProcAddress(hDll, "GetNumaNodeProcessorMaskEx"); pGetNumaProcessorNode = (PGetNumaProcessorNode)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNode"); // Get physical memory (not available on XP, so check dynamically) PGetPhysicallyInstalledSystemMemory pGetPhysicallyInstalledSystemMemory = (PGetPhysicallyInstalledSystemMemory)(void (*)(void))GetProcAddress(hDll,"GetPhysicallyInstalledSystemMemory"); if (pGetPhysicallyInstalledSystemMemory != NULL) { ULONGLONG memInKiB = 0; if ((*pGetPhysicallyInstalledSystemMemory)(&memInKiB)) { if (memInKiB > 0 && memInKiB <= SIZE_MAX) { config->physical_memory_in_kib = (size_t)memInKiB; } } } FreeLibrary(hDll); } // Enable large/huge OS page support? 
if (mi_option_is_enabled(mi_option_allow_large_os_pages) || mi_option_is_enabled(mi_option_reserve_huge_os_pages)) { win_enable_large_os_pages(&config->large_page_size); } } //--------------------------------------------- // Free //--------------------------------------------- int _mi_prim_free(void* addr, size_t size ) { MI_UNUSED(size); DWORD errcode = 0; bool err = (VirtualFree(addr, 0, MEM_RELEASE) == 0); if (err) { errcode = GetLastError(); } if (errcode == ERROR_INVALID_ADDRESS) { // In mi_os_mem_alloc_aligned the fallback path may have returned a pointer inside // the memory region returned by VirtualAlloc; in that case we need to free using // the start of the region. MEMORY_BASIC_INFORMATION info; _mi_memzero_var(info); VirtualQuery(addr, &info, sizeof(info)); if (info.AllocationBase < addr && ((uint8_t*)addr - (uint8_t*)info.AllocationBase) < (ptrdiff_t)MI_SEGMENT_SIZE) { errcode = 0; err = (VirtualFree(info.AllocationBase, 0, MEM_RELEASE) == 0); if (err) { errcode = GetLastError(); } } } return (int)errcode; } //--------------------------------------------- // VirtualAlloc //--------------------------------------------- static void* win_virtual_alloc_prim_once(void* addr, size_t size, size_t try_alignment, DWORD flags) { #if (MI_INTPTR_SIZE >= 8) // on 64-bit systems, try to use the virtual address area after 2TiB for 4MiB aligned allocations if (addr == NULL) { void* hint = _mi_os_get_aligned_hint(try_alignment,size); if (hint != NULL) { void* p = VirtualAlloc(hint, size, flags, PAGE_READWRITE); if (p != NULL) return p; _mi_verbose_message("warning: unable to allocate hinted aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), hint, try_alignment, flags); // fall through on error } } #endif // on modern Windows try use VirtualAlloc2 for aligned allocation if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) { 
MI_MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 }; reqs.Alignment = try_alignment; MI_MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} }; param.Type.Type = MiMemExtendedParameterAddressRequirements; param.Arg.Pointer = &reqs; void* p = (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, ¶m, 1); if (p != NULL) return p; _mi_warning_message("unable to allocate aligned OS memory (0x%zx bytes, error code: 0x%x, address: %p, alignment: 0x%zx, flags: 0x%x)\n", size, GetLastError(), addr, try_alignment, flags); // fall through on error } // last resort return VirtualAlloc(addr, size, flags, PAGE_READWRITE); } static bool win_is_out_of_memory_error(DWORD err) { switch (err) { case ERROR_COMMITMENT_MINIMUM: case ERROR_COMMITMENT_LIMIT: case ERROR_PAGEFILE_QUOTA: case ERROR_NOT_ENOUGH_MEMORY: return true; default: return false; } } static void* win_virtual_alloc_prim(void* addr, size_t size, size_t try_alignment, DWORD flags) { long max_retry_msecs = mi_option_get_clamp(mi_option_retry_on_oom, 0, 2000); // at most 2 seconds if (max_retry_msecs == 1) { max_retry_msecs = 100; } // if one sets the option to "true" for (long tries = 1; tries <= 10; tries++) { // try at most 10 times (=2200ms) void* p = win_virtual_alloc_prim_once(addr, size, try_alignment, flags); if (p != NULL) { // success, return the address return p; } else if (max_retry_msecs > 0 && (try_alignment <= 2*MI_SEGMENT_ALIGN) && (flags&MEM_COMMIT) != 0 && (flags&MEM_LARGE_PAGES) == 0 && win_is_out_of_memory_error(GetLastError())) { // if committing regular memory and being out-of-memory, // keep trying for a bit in case memory frees up after all. See issue #894 _mi_warning_message("out-of-memory on OS allocation, try again... 
(attempt %lu, 0x%zx bytes, error code: 0x%x, address: %p, alignment: 0x%zx, flags: 0x%x)\n", tries, size, GetLastError(), addr, try_alignment, flags); long sleep_msecs = tries*40; // increasing waits if (sleep_msecs > max_retry_msecs) { sleep_msecs = max_retry_msecs; } max_retry_msecs -= sleep_msecs; Sleep(sleep_msecs); } else { // otherwise return with an error break; } } return NULL; } static void* win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DWORD flags, bool large_only, bool allow_large, bool* is_large) { mi_assert_internal(!(large_only && !allow_large)); static _Atomic(size_t) large_page_try_ok; // = 0; void* p = NULL; // Try to allocate large OS pages (2MiB) if allowed or required. if ((large_only || _mi_os_use_large_page(size, try_alignment)) && allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) { size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok); if (!large_only && try_ok > 0) { // if a large page allocation fails, it seems the calls to VirtualAlloc get very expensive. // therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times. mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1); } else { // large OS pages must always reserve and commit. *is_large = true; p = win_virtual_alloc_prim(addr, size, try_alignment, flags | MEM_LARGE_PAGES); if (large_only) return p; // fall back to non-large page allocation on error (`p == NULL`). 
if (p == NULL) { mi_atomic_store_release(&large_page_try_ok,10UL); // on error, don't try again for the next N allocations } } } // Fall back to regular page allocation if (p == NULL) { *is_large = ((flags&MEM_LARGE_PAGES) != 0); p = win_virtual_alloc_prim(addr, size, try_alignment, flags); } //if (p == NULL) { _mi_warning_message("unable to allocate OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x, large only: %d, allow large: %d)\n", size, GetLastError(), addr, try_alignment, flags, large_only, allow_large); } return p; } int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** addr) { mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); mi_assert_internal(commit || !allow_large); mi_assert_internal(try_alignment > 0); *is_zero = true; int flags = MEM_RESERVE; if (commit) { flags |= MEM_COMMIT; } *addr = win_virtual_alloc(hint_addr, size, try_alignment, flags, false, allow_large, is_large); return (*addr != NULL ? 0 : (int)GetLastError()); } //--------------------------------------------- // Commit/Reset/Protect //--------------------------------------------- #ifdef _MSC_VER #pragma warning(disable:6250) // suppress warning calling VirtualFree without MEM_RELEASE (for decommit) #endif int _mi_prim_commit(void* addr, size_t size, bool* is_zero) { *is_zero = false; /* // zero'ing only happens on an initial commit... but checking upfront seems expensive.. 
_MEMORY_BASIC_INFORMATION meminfo; _mi_memzero_var(meminfo); if (VirtualQuery(addr, &meminfo, size) > 0) { if ((meminfo.State & MEM_COMMIT) == 0) { *is_zero = true; } } */ // commit void* p = VirtualAlloc(addr, size, MEM_COMMIT, PAGE_READWRITE); if (p == NULL) return (int)GetLastError(); return 0; } int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) { BOOL ok = VirtualFree(addr, size, MEM_DECOMMIT); *needs_recommit = true; // for safety, assume always decommitted even in the case of an error. return (ok ? 0 : (int)GetLastError()); } int _mi_prim_reset(void* addr, size_t size) { void* p = VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE); mi_assert_internal(p == addr); #if 0 if (p != NULL) { VirtualUnlock(addr,size); // VirtualUnlock after MEM_RESET removes the memory directly from the working set } #endif return (p != NULL ? 0 : (int)GetLastError()); } int _mi_prim_protect(void* addr, size_t size, bool protect) { DWORD oldprotect = 0; BOOL ok = VirtualProtect(addr, size, protect ? PAGE_NOACCESS : PAGE_READWRITE, &oldprotect); return (ok ? 
0 : (int)GetLastError()); } //--------------------------------------------- // Huge page allocation //--------------------------------------------- static void* _mi_prim_alloc_huge_os_pagesx(void* hint_addr, size_t size, int numa_node) { const DWORD flags = MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE; win_enable_large_os_pages(NULL); MI_MEM_EXTENDED_PARAMETER params[3] = { {{0,0},{0}},{{0,0},{0}},{{0,0},{0}} }; // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages static bool mi_huge_pages_available = true; if (pNtAllocateVirtualMemoryEx != NULL && mi_huge_pages_available) { params[0].Type.Type = MiMemExtendedParameterAttributeFlags; params[0].Arg.ULong64 = MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE; ULONG param_count = 1; if (numa_node >= 0) { param_count++; params[1].Type.Type = MiMemExtendedParameterNumaNode; params[1].Arg.ULong = (unsigned)numa_node; } SIZE_T psize = size; void* base = hint_addr; NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count); if (err == 0 && base != NULL) { return base; } else { // fall back to regular large pages mi_huge_pages_available = false; // don't try further huge pages _mi_warning_message("unable to allocate using huge (1GiB) pages, trying large (2MiB) pages instead (status 0x%lx)\n", err); } } // on modern Windows try use VirtualAlloc2 for numa aware large OS page allocation if (pVirtualAlloc2 != NULL && numa_node >= 0) { params[0].Type.Type = MiMemExtendedParameterNumaNode; params[0].Arg.ULong = (unsigned)numa_node; return (*pVirtualAlloc2)(GetCurrentProcess(), hint_addr, size, flags, PAGE_READWRITE, params, 1); } // otherwise use regular virtual alloc on older windows return VirtualAlloc(hint_addr, size, flags, PAGE_READWRITE); } int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) { *is_zero = true; *addr = _mi_prim_alloc_huge_os_pagesx(hint_addr,size,numa_node); return (*addr != NULL 
? 0 : (int)GetLastError()); } //--------------------------------------------- // Numa nodes //--------------------------------------------- size_t _mi_prim_numa_node(void) { USHORT numa_node = 0; if (pGetCurrentProcessorNumberEx != NULL && pGetNumaProcessorNodeEx != NULL) { // Extended API is supported MI_PROCESSOR_NUMBER pnum; (*pGetCurrentProcessorNumberEx)(&pnum); USHORT nnode = 0; BOOL ok = (*pGetNumaProcessorNodeEx)(&pnum, &nnode); if (ok) { numa_node = nnode; } } else if (pGetNumaProcessorNode != NULL) { // Vista or earlier, use older API that is limited to 64 processors. Issue #277 DWORD pnum = GetCurrentProcessorNumber(); UCHAR nnode = 0; BOOL ok = pGetNumaProcessorNode((UCHAR)pnum, &nnode); if (ok) { numa_node = nnode; } } return numa_node; } size_t _mi_prim_numa_node_count(void) { ULONG numa_max = 0; GetNumaHighestNodeNumber(&numa_max); // find the highest node number that has actual processors assigned to it. Issue #282 while(numa_max > 0) { if (pGetNumaNodeProcessorMaskEx != NULL) { // Extended API is supported GROUP_AFFINITY affinity; if ((*pGetNumaNodeProcessorMaskEx)((USHORT)numa_max, &affinity)) { if (affinity.Mask != 0) break; // found the maximum non-empty node } } else { // Vista or earlier, use older API that is limited to 64 processors. 
ULONGLONG mask; if (GetNumaNodeProcessorMask((UCHAR)numa_max, &mask)) { if (mask != 0) break; // found the maximum non-empty node }; } // max node was invalid or had no processor assigned, try again numa_max--; } return ((size_t)numa_max + 1); } //---------------------------------------------------------------- // Clock //---------------------------------------------------------------- static mi_msecs_t mi_to_msecs(LARGE_INTEGER t) { static LARGE_INTEGER mfreq; // = 0 if (mfreq.QuadPart == 0LL) { LARGE_INTEGER f; QueryPerformanceFrequency(&f); mfreq.QuadPart = f.QuadPart/1000LL; if (mfreq.QuadPart == 0) mfreq.QuadPart = 1; } return (mi_msecs_t)(t.QuadPart / mfreq.QuadPart); } mi_msecs_t _mi_prim_clock_now(void) { LARGE_INTEGER t; QueryPerformanceCounter(&t); return mi_to_msecs(t); } //---------------------------------------------------------------- // Process Info //---------------------------------------------------------------- #include static mi_msecs_t filetime_msecs(const FILETIME* ftime) { ULARGE_INTEGER i; i.LowPart = ftime->dwLowDateTime; i.HighPart = ftime->dwHighDateTime; mi_msecs_t msecs = (i.QuadPart / 10000); // FILETIME is in 100 nano seconds return msecs; } typedef BOOL (WINAPI *PGetProcessMemoryInfo)(HANDLE, PPROCESS_MEMORY_COUNTERS, DWORD); static PGetProcessMemoryInfo pGetProcessMemoryInfo = NULL; void _mi_prim_process_info(mi_process_info_t* pinfo) { FILETIME ct; FILETIME ut; FILETIME st; FILETIME et; GetProcessTimes(GetCurrentProcess(), &ct, &et, &st, &ut); pinfo->utime = filetime_msecs(&ut); pinfo->stime = filetime_msecs(&st); // load psapi on demand if (pGetProcessMemoryInfo == NULL) { HINSTANCE hDll = LoadLibrary(TEXT("psapi.dll")); if (hDll != NULL) { pGetProcessMemoryInfo = (PGetProcessMemoryInfo)(void (*)(void))GetProcAddress(hDll, "GetProcessMemoryInfo"); } } // get process info PROCESS_MEMORY_COUNTERS info; _mi_memzero_var(info); if (pGetProcessMemoryInfo != NULL) { pGetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info)); } 
pinfo->current_rss = (size_t)info.WorkingSetSize; pinfo->peak_rss = (size_t)info.PeakWorkingSetSize; pinfo->current_commit = (size_t)info.PagefileUsage; pinfo->peak_commit = (size_t)info.PeakPagefileUsage; pinfo->page_faults = (size_t)info.PageFaultCount; } //---------------------------------------------------------------- // Output //---------------------------------------------------------------- void _mi_prim_out_stderr( const char* msg ) { // on windows with redirection, the C runtime cannot handle locale dependent output // after the main thread closes so we use direct console output. if (!_mi_preloading()) { // _cputs(msg); // _cputs cannot be used as it aborts when failing to lock the console static HANDLE hcon = INVALID_HANDLE_VALUE; static bool hconIsConsole; if (hcon == INVALID_HANDLE_VALUE) { CONSOLE_SCREEN_BUFFER_INFO sbi; hcon = GetStdHandle(STD_ERROR_HANDLE); hconIsConsole = ((hcon != INVALID_HANDLE_VALUE) && GetConsoleScreenBufferInfo(hcon, &sbi)); } const size_t len = _mi_strlen(msg); if (len > 0 && len < UINT32_MAX) { DWORD written = 0; if (hconIsConsole) { WriteConsoleA(hcon, msg, (DWORD)len, &written, NULL); } else if (hcon != INVALID_HANDLE_VALUE) { // use direct write if stderr was redirected WriteFile(hcon, msg, (DWORD)len, &written, NULL); } else { // finally fall back to fputs after all fputs(msg, stderr); } } } } //---------------------------------------------------------------- // Environment //---------------------------------------------------------------- // On Windows use GetEnvironmentVariable instead of getenv to work // reliably even when this is invoked before the C runtime is initialized. // i.e. when `_mi_preloading() == true`. // Note: on windows, environment names are not case sensitive. 
bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { result[0] = 0; size_t len = GetEnvironmentVariableA(name, result, (DWORD)result_size); return (len > 0 && len < result_size); } //---------------------------------------------------------------- // Random //---------------------------------------------------------------- #if defined(MI_USE_RTLGENRANDOM) // || defined(__cplusplus) // We prefer to use BCryptGenRandom instead of (the unofficial) RtlGenRandom but when using // dynamic overriding, we observed it can raise an exception when compiled with C++, and // sometimes deadlocks when also running under the VS debugger. // In contrast, issue #623 implies that on Windows Server 2019 we need to use BCryptGenRandom. // To be continued.. #pragma comment (lib,"advapi32.lib") #define RtlGenRandom SystemFunction036 mi_decl_externc BOOLEAN NTAPI RtlGenRandom(PVOID RandomBuffer, ULONG RandomBufferLength); bool _mi_prim_random_buf(void* buf, size_t buf_len) { return (RtlGenRandom(buf, (ULONG)buf_len) != 0); } #else #ifndef BCRYPT_USE_SYSTEM_PREFERRED_RNG #define BCRYPT_USE_SYSTEM_PREFERRED_RNG 0x00000002 #endif typedef LONG (NTAPI *PBCryptGenRandom)(HANDLE, PUCHAR, ULONG, ULONG); static PBCryptGenRandom pBCryptGenRandom = NULL; bool _mi_prim_random_buf(void* buf, size_t buf_len) { if (pBCryptGenRandom == NULL) { HINSTANCE hDll = LoadLibrary(TEXT("bcrypt.dll")); if (hDll != NULL) { pBCryptGenRandom = (PBCryptGenRandom)(void (*)(void))GetProcAddress(hDll, "BCryptGenRandom"); } if (pBCryptGenRandom == NULL) return false; } return (pBCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0); } #endif // MI_USE_RTLGENRANDOM //---------------------------------------------------------------- // Process & Thread Init/Done //---------------------------------------------------------------- static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) { MI_UNUSED(reserved); MI_UNUSED(module); #if MI_TLS_SLOT >= 2 if 
((reason==DLL_PROCESS_ATTACH || reason==DLL_THREAD_ATTACH) && mi_prim_get_default_heap() == NULL) { _mi_heap_set_default_direct((mi_heap_t*)&_mi_heap_empty); } #endif if (reason==DLL_PROCESS_ATTACH) { _mi_process_load(); } else if (reason==DLL_PROCESS_DETACH) { _mi_process_done(); } else if (reason==DLL_THREAD_DETACH && !_mi_is_redirected()) { _mi_thread_done(NULL); } } #if defined(MI_SHARED_LIB) #define MI_PRIM_HAS_PROCESS_ATTACH 1 // Windows DLL: easy to hook into process_init and thread_done BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) { mi_win_main((PVOID)inst,reason,reserved); return TRUE; } // nothing to do since `_mi_thread_done` is handled through the DLL_THREAD_DETACH event. void _mi_prim_thread_init_auto_done(void) { } void _mi_prim_thread_done_auto_done(void) { } void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { MI_UNUSED(heap); } #elif !defined(MI_WIN_USE_FLS) #define MI_PRIM_HAS_PROCESS_ATTACH 1 static void NTAPI mi_win_main_attach(PVOID module, DWORD reason, LPVOID reserved) { if (reason == DLL_PROCESS_ATTACH || reason == DLL_THREAD_ATTACH) { mi_win_main(module, reason, reserved); } } static void NTAPI mi_win_main_detach(PVOID module, DWORD reason, LPVOID reserved) { if (reason == DLL_PROCESS_DETACH || reason == DLL_THREAD_DETACH) { mi_win_main(module, reason, reserved); } } // Set up TLS callbacks in a statically linked library by using special data sections. // See // We use 2 entries to ensure we call attach events before constructors // are called, and detach events after destructors are called. 
// Register `mi_win_main_attach` and `mi_win_main_detach` as TLS callbacks by
// placing one pointer to each in the `.CRT$XLB` and `.CRT$XLY` sections.
// The `/INCLUDE:` linker directives name the symbols that must be retained.
#if defined(__cplusplus)
extern "C" {
#endif

#if defined(_WIN64)
  // 64-bit: the callback arrays are `const` and placed with `const_seg`
  #pragma comment(linker, "/INCLUDE:_tls_used")
  #pragma comment(linker, "/INCLUDE:_mi_tls_callback_pre")
  #pragma comment(linker, "/INCLUDE:_mi_tls_callback_post")
  #pragma const_seg(".CRT$XLB")
  extern const PIMAGE_TLS_CALLBACK _mi_tls_callback_pre[];
  const PIMAGE_TLS_CALLBACK _mi_tls_callback_pre[] = { &mi_win_main_attach };
  #pragma const_seg()
  #pragma const_seg(".CRT$XLY")
  extern const PIMAGE_TLS_CALLBACK _mi_tls_callback_post[];
  const PIMAGE_TLS_CALLBACK _mi_tls_callback_post[] = { &mi_win_main_detach };
  #pragma const_seg()
#else
  // otherwise: the linker-level names carry an extra leading underscore and
  // the (non-const) arrays are placed with `data_seg`
  #pragma comment(linker, "/INCLUDE:__tls_used")
  #pragma comment(linker, "/INCLUDE:__mi_tls_callback_pre")
  #pragma comment(linker, "/INCLUDE:__mi_tls_callback_post")
  #pragma data_seg(".CRT$XLB")
  PIMAGE_TLS_CALLBACK _mi_tls_callback_pre[] = { &mi_win_main_attach };
  #pragma data_seg()
  #pragma data_seg(".CRT$XLY")
  PIMAGE_TLS_CALLBACK _mi_tls_callback_post[] = { &mi_win_main_detach };
  #pragma data_seg()
#endif

#if defined(__cplusplus)
}
#endif

// nothing to do since `_mi_thread_done` is handled through the DLL_THREAD_DETACH event.
void _mi_prim_thread_init_auto_done(void) { } void _mi_prim_thread_done_auto_done(void) { } void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { MI_UNUSED(heap); } #else // deprecated: statically linked, use fiber api #if defined(_MSC_VER) // on clang/gcc use the constructor attribute (in `src/prim/prim.c`) // MSVC: use data section magic for static libraries // See #define MI_PRIM_HAS_PROCESS_ATTACH 1 static int mi_process_attach(void) { mi_win_main(NULL,DLL_PROCESS_ATTACH,NULL); atexit(&_mi_process_done); return 0; } typedef int(*mi_crt_callback_t)(void); #if defined(_WIN64) #pragma comment(linker, "/INCLUDE:_mi_tls_callback") #pragma section(".CRT$XIU", long, read) #else #pragma comment(linker, "/INCLUDE:__mi_tls_callback") #endif #pragma data_seg(".CRT$XIU") mi_decl_externc mi_crt_callback_t _mi_tls_callback[] = { &mi_process_attach }; #pragma data_seg() #endif // use the fiber api for calling `_mi_thread_done`. #include #if (_WIN32_WINNT < 0x600) // before Windows Vista WINBASEAPI DWORD WINAPI FlsAlloc( _In_opt_ PFLS_CALLBACK_FUNCTION lpCallback ); WINBASEAPI PVOID WINAPI FlsGetValue( _In_ DWORD dwFlsIndex ); WINBASEAPI BOOL WINAPI FlsSetValue( _In_ DWORD dwFlsIndex, _In_opt_ PVOID lpFlsData ); WINBASEAPI BOOL WINAPI FlsFree(_In_ DWORD dwFlsIndex); #endif static DWORD mi_fls_key = (DWORD)(-1); static void NTAPI mi_fls_done(PVOID value) { mi_heap_t* heap = (mi_heap_t*)value; if (heap != NULL) { _mi_thread_done(heap); FlsSetValue(mi_fls_key, NULL); // prevent recursion as _mi_thread_done may set it back to the main heap, issue #672 } } void _mi_prim_thread_init_auto_done(void) { mi_fls_key = FlsAlloc(&mi_fls_done); } void _mi_prim_thread_done_auto_done(void) { // call thread-done on all threads (except the main thread) to prevent // dangling callback pointer if statically linked with a DLL; Issue #208 FlsFree(mi_fls_key); } void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { mi_assert_internal(mi_fls_key != (DWORD)(-1)); 
FlsSetValue(mi_fls_key, heap); } #endif // ---------------------------------------------------- // Communicate with the redirection module on Windows // ---------------------------------------------------- #if defined(MI_SHARED_LIB) && !defined(MI_WIN_NOREDIRECT) #define MI_PRIM_HAS_ALLOCATOR_INIT 1 static bool mi_redirected = false; // true if malloc redirects to mi_malloc bool _mi_is_redirected(void) { return mi_redirected; } #ifdef __cplusplus extern "C" { #endif mi_decl_export void _mi_redirect_entry(DWORD reason) { // called on redirection; careful as this may be called before DllMain #if MI_TLS_SLOT >= 2 if ((reason==DLL_PROCESS_ATTACH || reason==DLL_THREAD_ATTACH) && mi_prim_get_default_heap() == NULL) { _mi_heap_set_default_direct((mi_heap_t*)&_mi_heap_empty); } #endif if (reason == DLL_PROCESS_ATTACH) { mi_redirected = true; } else if (reason == DLL_PROCESS_DETACH) { mi_redirected = false; } else if (reason == DLL_THREAD_DETACH) { _mi_thread_done(NULL); } } __declspec(dllimport) bool mi_cdecl mi_allocator_init(const char** message); __declspec(dllimport) void mi_cdecl mi_allocator_done(void); #ifdef __cplusplus } #endif bool _mi_allocator_init(const char** message) { return mi_allocator_init(message); } void _mi_allocator_done(void) { mi_allocator_done(); } #endif ================================================ FILE: third-party/mimalloc/src/prim/windows/readme.md ================================================ ## Primitives: - `prim.c` contains Windows primitives for OS allocation. ## Event Tracing for Windows (ETW) - `etw.h` is generated from `etw.man` which contains the manifest for mimalloc events. (100 is an allocation, 101 is for a free) - `etw-mimalloc.wprp` is a profile for the Windows Performance Recorder (WPR). In an admin prompt, you can use: ``` > wpr -start src\prim\windows\etw-mimalloc.wprp -filemode > > wpr -stop test.etl ``` and then open `test.etl` in the Windows Performance Analyzer (WPA). 
================================================ FILE: third-party/mimalloc/src/random.c ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2019-2021, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/prim.h" // _mi_prim_random_buf #include // memset /* ---------------------------------------------------------------------------- We use our own PRNG to keep predictable performance of random number generation and to avoid implementations that use a lock. We only use the OS provided random source to initialize the initial seeds. Since we do not need ultimate performance but we do rely on the security (for secret cookies in secure mode) we use a cryptographically secure generator (chacha20). -----------------------------------------------------------------------------*/ #define MI_CHACHA_ROUNDS (20) // perhaps use 12 for better performance? /* ---------------------------------------------------------------------------- Chacha20 implementation as the original algorithm with a 64-bit nonce and counter: https://en.wikipedia.org/wiki/Salsa20 The input matrix has sixteen 32-bit values: Position 0 to 3: constant key Position 4 to 11: the key Position 12 to 13: the counter. Position 14 to 15: the nonce. The implementation uses regular C code which compiles very well on modern compilers. 
(gcc x64 has no register spills, and clang 6+ uses SSE instructions) -----------------------------------------------------------------------------*/ static inline uint32_t rotl(uint32_t x, uint32_t shift) { return (x << shift) | (x >> (32 - shift)); } static inline void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d) { x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 16); x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 12); x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 8); x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 7); } static void chacha_block(mi_random_ctx_t* ctx) { // scramble into `x` uint32_t x[16]; for (size_t i = 0; i < 16; i++) { x[i] = ctx->input[i]; } for (size_t i = 0; i < MI_CHACHA_ROUNDS; i += 2) { qround(x, 0, 4, 8, 12); qround(x, 1, 5, 9, 13); qround(x, 2, 6, 10, 14); qround(x, 3, 7, 11, 15); qround(x, 0, 5, 10, 15); qround(x, 1, 6, 11, 12); qround(x, 2, 7, 8, 13); qround(x, 3, 4, 9, 14); } // add scrambled data to the initial state for (size_t i = 0; i < 16; i++) { ctx->output[i] = x[i] + ctx->input[i]; } ctx->output_available = 16; // increment the counter for the next round ctx->input[12] += 1; if (ctx->input[12] == 0) { ctx->input[13] += 1; if (ctx->input[13] == 0) { // and keep increasing into the nonce ctx->input[14] += 1; } } } static uint32_t chacha_next32(mi_random_ctx_t* ctx) { if (ctx->output_available <= 0) { chacha_block(ctx); ctx->output_available = 16; // (assign again to suppress static analysis warning) } const uint32_t x = ctx->output[16 - ctx->output_available]; ctx->output[16 - ctx->output_available] = 0; // reset once the data is handed out ctx->output_available--; return x; } static inline uint32_t read32(const uint8_t* p, size_t idx32) { const size_t i = 4*idx32; return ((uint32_t)p[i+0] | (uint32_t)p[i+1] << 8 | (uint32_t)p[i+2] << 16 | (uint32_t)p[i+3] << 24); } static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t nonce) { // since we only use chacha for randomness (and not encryption) we // do not _need_ to read 
32-bit values as little endian but we do anyways // just for being compatible :-) memset(ctx, 0, sizeof(*ctx)); for (size_t i = 0; i < 4; i++) { const uint8_t* sigma = (uint8_t*)"expand 32-byte k"; ctx->input[i] = read32(sigma,i); } for (size_t i = 0; i < 8; i++) { ctx->input[i + 4] = read32(key,i); } ctx->input[12] = 0; ctx->input[13] = 0; ctx->input[14] = (uint32_t)nonce; ctx->input[15] = (uint32_t)(nonce >> 32); } static void chacha_split(mi_random_ctx_t* ctx, uint64_t nonce, mi_random_ctx_t* ctx_new) { memset(ctx_new, 0, sizeof(*ctx_new)); _mi_memcpy(ctx_new->input, ctx->input, sizeof(ctx_new->input)); ctx_new->input[12] = 0; ctx_new->input[13] = 0; ctx_new->input[14] = (uint32_t)nonce; ctx_new->input[15] = (uint32_t)(nonce >> 32); mi_assert_internal(ctx->input[14] != ctx_new->input[14] || ctx->input[15] != ctx_new->input[15]); // do not reuse nonces! chacha_block(ctx_new); } /* ---------------------------------------------------------------------------- Random interface -----------------------------------------------------------------------------*/ #if MI_DEBUG>1 static bool mi_random_is_initialized(mi_random_ctx_t* ctx) { return (ctx != NULL && ctx->input[0] != 0); } #endif void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* ctx_new) { mi_assert_internal(mi_random_is_initialized(ctx)); mi_assert_internal(ctx != ctx_new); chacha_split(ctx, (uintptr_t)ctx_new /*nonce*/, ctx_new); } uintptr_t _mi_random_next(mi_random_ctx_t* ctx) { mi_assert_internal(mi_random_is_initialized(ctx)); #if MI_INTPTR_SIZE <= 4 return chacha_next32(ctx); #elif MI_INTPTR_SIZE == 8 return (((uintptr_t)chacha_next32(ctx) << 32) | chacha_next32(ctx)); #else # error "define mi_random_next for this platform" #endif } /* ---------------------------------------------------------------------------- To initialize a fresh random context. If we cannot get good randomness, we fall back to weak randomness based on a timer and ASLR. 
-----------------------------------------------------------------------------*/ uintptr_t _mi_os_random_weak(uintptr_t extra_seed) { uintptr_t x = (uintptr_t)&_mi_os_random_weak ^ extra_seed; // ASLR makes the address random x ^= _mi_prim_clock_now(); // and do a few randomization steps uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1; for (uintptr_t i = 0; i < max; i++) { x = _mi_random_shuffle(x); } mi_assert_internal(x != 0); return x; } static void mi_random_init_ex(mi_random_ctx_t* ctx, bool use_weak) { uint8_t key[32]; if (use_weak || !_mi_prim_random_buf(key, sizeof(key))) { // if we fail to get random data from the OS, we fall back to a // weak random source based on the current time #if !defined(__wasi__) if (!use_weak) { _mi_warning_message("unable to use secure randomness\n"); } #endif uintptr_t x = _mi_os_random_weak(0); for (size_t i = 0; i < 8; i++) { // key is eight 32-bit words. x = _mi_random_shuffle(x); ((uint32_t*)key)[i] = (uint32_t)x; } ctx->weak = true; } else { ctx->weak = false; } chacha_init(ctx, key, (uintptr_t)ctx /*nonce*/ ); } void _mi_random_init(mi_random_ctx_t* ctx) { mi_random_init_ex(ctx, false); } void _mi_random_init_weak(mi_random_ctx_t * ctx) { mi_random_init_ex(ctx, true); } void _mi_random_reinit_if_weak(mi_random_ctx_t * ctx) { if (ctx->weak) { _mi_random_init(ctx); } } /* -------------------------------------------------------- test vectors from ----------------------------------------------------------- */ /* static bool array_equals(uint32_t* x, uint32_t* y, size_t n) { for (size_t i = 0; i < n; i++) { if (x[i] != y[i]) return false; } return true; } static void chacha_test(void) { uint32_t x[4] = { 0x11111111, 0x01020304, 0x9b8d6f43, 0x01234567 }; uint32_t x_out[4] = { 0xea2a92f4, 0xcb1cf8ce, 0x4581472e, 0x5881c4bb }; qround(x, 0, 1, 2, 3); mi_assert_internal(array_equals(x, x_out, 4)); uint32_t y[16] = { 0x879531e0, 0xc5ecf37d, 0x516461b1, 0xc9a62f8a, 0x44c20ef3, 0x3390af7f, 0xd9fc690b, 0x2a5f714c, 0x53372767, 0xb00a5631, 
0x974c541a, 0x359e9963, 0x5c971061, 0x3d631689, 0x2098d9d6, 0x91dbd320 }; uint32_t y_out[16] = { 0x879531e0, 0xc5ecf37d, 0xbdb886dc, 0xc9a62f8a, 0x44c20ef3, 0x3390af7f, 0xd9fc690b, 0xcfacafd2, 0xe46bea80, 0xb00a5631, 0x974c541a, 0x359e9963, 0x5c971061, 0xccc07c79, 0x2098d9d6, 0x91dbd320 }; qround(y, 2, 7, 8, 13); mi_assert_internal(array_equals(y, y_out, 16)); mi_random_ctx_t r = { { 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c, 0x13121110, 0x17161514, 0x1b1a1918, 0x1f1e1d1c, 0x00000001, 0x09000000, 0x4a000000, 0x00000000 }, {0}, 0 }; uint32_t r_out[16] = { 0xe4e7f110, 0x15593bd1, 0x1fdd0f50, 0xc47120a3, 0xc7f4d1c7, 0x0368c033, 0x9aaa2204, 0x4e6cd4c3, 0x466482d2, 0x09aa9f07, 0x05d7c214, 0xa2028bd9, 0xd19c12b5, 0xb94e16de, 0xe883d0cb, 0x4e3c50a2 }; chacha_block(&r); mi_assert_internal(array_equals(r.output, r_out, 16)); } */ ================================================ FILE: third-party/mimalloc/src/segment-map.c ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2019-2023, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ /* ----------------------------------------------------------- The following functions are to reliably find the segment or block that encompasses any pointer p (or NULL if it is not in any of our segments). We maintain a bitmap of all memory with 1 bit per MI_SEGMENT_SIZE (64MiB) set to 1 if it contains the segment meta data. 
----------------------------------------------------------- */ #include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/atomic.h" // Reduce total address space to reduce .bss (due to the `mi_segment_map`) #if (MI_INTPTR_SIZE > 4) && MI_TRACK_ASAN #define MI_SEGMENT_MAP_MAX_ADDRESS (128*1024ULL*MI_GiB) // 128 TiB (see issue #881) #elif (MI_INTPTR_SIZE > 4) #define MI_SEGMENT_MAP_MAX_ADDRESS (48*1024ULL*MI_GiB) // 48 TiB #else #define MI_SEGMENT_MAP_MAX_ADDRESS (UINT32_MAX) #endif #define MI_SEGMENT_MAP_PART_SIZE (MI_INTPTR_SIZE*MI_KiB - 128) // 128 > sizeof(mi_memid_t) ! #define MI_SEGMENT_MAP_PART_BITS (8*MI_SEGMENT_MAP_PART_SIZE) #define MI_SEGMENT_MAP_PART_ENTRIES (MI_SEGMENT_MAP_PART_SIZE / MI_INTPTR_SIZE) #define MI_SEGMENT_MAP_PART_BIT_SPAN (MI_SEGMENT_ALIGN) // memory area covered by 1 bit #if (MI_SEGMENT_MAP_PART_BITS < (MI_SEGMENT_MAP_MAX_ADDRESS / MI_SEGMENT_MAP_PART_BIT_SPAN)) // prevent overflow on 32-bit (issue #1017) #define MI_SEGMENT_MAP_PART_SPAN (MI_SEGMENT_MAP_PART_BITS * MI_SEGMENT_MAP_PART_BIT_SPAN) #else #define MI_SEGMENT_MAP_PART_SPAN MI_SEGMENT_MAP_MAX_ADDRESS #endif #define MI_SEGMENT_MAP_MAX_PARTS ((MI_SEGMENT_MAP_MAX_ADDRESS / MI_SEGMENT_MAP_PART_SPAN) + 1) // A part of the segment map. typedef struct mi_segmap_part_s { mi_memid_t memid; _Atomic(uintptr_t) map[MI_SEGMENT_MAP_PART_ENTRIES]; } mi_segmap_part_t; // Allocate parts on-demand to reduce .bss footprint static _Atomic(mi_segmap_part_t*) mi_segment_map[MI_SEGMENT_MAP_MAX_PARTS]; // = { NULL, .. } static mi_segmap_part_t* mi_segment_map_index_of(const mi_segment_t* segment, bool create_on_demand, size_t* idx, size_t* bitidx) { // note: segment can be invalid or NULL. mi_assert_internal(_mi_ptr_segment(segment + 1) == segment); // is it aligned on MI_SEGMENT_SIZE? 
*idx = 0; *bitidx = 0; if ((uintptr_t)segment >= MI_SEGMENT_MAP_MAX_ADDRESS) return NULL; const uintptr_t segindex = ((uintptr_t)segment) / MI_SEGMENT_MAP_PART_SPAN; if (segindex >= MI_SEGMENT_MAP_MAX_PARTS) return NULL; mi_segmap_part_t* part = mi_atomic_load_ptr_relaxed(mi_segmap_part_t, &mi_segment_map[segindex]); // allocate on demand to reduce .bss footprint if mi_unlikely(part == NULL) { if (!create_on_demand) return NULL; mi_memid_t memid; part = (mi_segmap_part_t*)_mi_os_alloc(sizeof(mi_segmap_part_t), &memid); if (part == NULL) return NULL; part->memid = memid; mi_segmap_part_t* expected = NULL; if (!mi_atomic_cas_ptr_strong_release(mi_segmap_part_t, &mi_segment_map[segindex], &expected, part)) { _mi_os_free(part, sizeof(mi_segmap_part_t), memid); part = expected; if (part == NULL) return NULL; } } mi_assert(part != NULL); const uintptr_t offset = ((uintptr_t)segment) % MI_SEGMENT_MAP_PART_SPAN; const uintptr_t bitofs = offset / MI_SEGMENT_MAP_PART_BIT_SPAN; *idx = bitofs / MI_INTPTR_BITS; *bitidx = bitofs % MI_INTPTR_BITS; return part; } void _mi_segment_map_allocated_at(const mi_segment_t* segment) { if (segment->memid.memkind == MI_MEM_ARENA) return; // we lookup segments first in the arena's and don't need the segment map size_t index; size_t bitidx; mi_segmap_part_t* part = mi_segment_map_index_of(segment, true /* alloc map if needed */, &index, &bitidx); if (part == NULL) return; // outside our address range.. uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]); uintptr_t newmask; do { newmask = (mask | ((uintptr_t)1 << bitidx)); } while (!mi_atomic_cas_weak_release(&part->map[index], &mask, newmask)); } void _mi_segment_map_freed_at(const mi_segment_t* segment) { if (segment->memid.memkind == MI_MEM_ARENA) return; size_t index; size_t bitidx; mi_segmap_part_t* part = mi_segment_map_index_of(segment, false /* don't alloc if not present */, &index, &bitidx); if (part == NULL) return; // outside our address range.. 
uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]); uintptr_t newmask; do { newmask = (mask & ~((uintptr_t)1 << bitidx)); } while (!mi_atomic_cas_weak_release(&part->map[index], &mask, newmask)); } // Determine the segment belonging to a pointer or NULL if it is not in a valid segment. static mi_segment_t* _mi_segment_of(const void* p) { if (p == NULL) return NULL; mi_segment_t* segment = _mi_ptr_segment(p); // segment can be NULL size_t index; size_t bitidx; mi_segmap_part_t* part = mi_segment_map_index_of(segment, false /* dont alloc if not present */, &index, &bitidx); if (part == NULL) return NULL; const uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]); if mi_likely((mask & ((uintptr_t)1 << bitidx)) != 0) { bool cookie_ok = (_mi_ptr_cookie(segment) == segment->cookie); mi_assert_internal(cookie_ok); MI_UNUSED(cookie_ok); return segment; // yes, allocated by us } return NULL; } // Is this a valid pointer in our heap? static bool mi_is_valid_pointer(const void* p) { // first check if it is in an arena, then check if it is OS allocated return (_mi_arena_contains(p) || _mi_segment_of(p) != NULL); } mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { return mi_is_valid_pointer(p); } void _mi_segment_map_unsafe_destroy(void) { for (size_t i = 0; i < MI_SEGMENT_MAP_MAX_PARTS; i++) { mi_segmap_part_t* part = mi_atomic_exchange_ptr_relaxed(mi_segmap_part_t, &mi_segment_map[i], NULL); if (part != NULL) { _mi_os_free(part, sizeof(mi_segmap_part_t), part->memid); } } } ================================================ FILE: third-party/mimalloc/src/segment.c ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. 
A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/atomic.h" #include // memset #include // ------------------------------------------------------------------- // Segments // mimalloc pages reside in segments. See `mi_segment_valid` for invariants. // ------------------------------------------------------------------- static void mi_segment_try_purge(mi_segment_t* segment, bool force); // ------------------------------------------------------------------- // commit mask // ------------------------------------------------------------------- static bool mi_commit_mask_all_set(const mi_commit_mask_t* commit, const mi_commit_mask_t* cm) { for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { if ((commit->mask[i] & cm->mask[i]) != cm->mask[i]) return false; } return true; } static bool mi_commit_mask_any_set(const mi_commit_mask_t* commit, const mi_commit_mask_t* cm) { for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { if ((commit->mask[i] & cm->mask[i]) != 0) return true; } return false; } static void mi_commit_mask_create_intersect(const mi_commit_mask_t* commit, const mi_commit_mask_t* cm, mi_commit_mask_t* res) { for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { res->mask[i] = (commit->mask[i] & cm->mask[i]); } } static void mi_commit_mask_clear(mi_commit_mask_t* res, const mi_commit_mask_t* cm) { for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { res->mask[i] &= ~(cm->mask[i]); } } static void mi_commit_mask_set(mi_commit_mask_t* res, const mi_commit_mask_t* cm) { for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { res->mask[i] |= cm->mask[i]; } } static void mi_commit_mask_create(size_t bitidx, size_t bitcount, mi_commit_mask_t* cm) { mi_assert_internal(bitidx < MI_COMMIT_MASK_BITS); mi_assert_internal((bitidx + bitcount) <= 
MI_COMMIT_MASK_BITS); if (bitcount == MI_COMMIT_MASK_BITS) { mi_assert_internal(bitidx==0); mi_commit_mask_create_full(cm); } else if (bitcount == 0) { mi_commit_mask_create_empty(cm); } else { mi_commit_mask_create_empty(cm); size_t i = bitidx / MI_COMMIT_MASK_FIELD_BITS; size_t ofs = bitidx % MI_COMMIT_MASK_FIELD_BITS; while (bitcount > 0) { mi_assert_internal(i < MI_COMMIT_MASK_FIELD_COUNT); size_t avail = MI_COMMIT_MASK_FIELD_BITS - ofs; size_t count = (bitcount > avail ? avail : bitcount); size_t mask = (count >= MI_COMMIT_MASK_FIELD_BITS ? ~((size_t)0) : (((size_t)1 << count) - 1) << ofs); cm->mask[i] = mask; bitcount -= count; ofs = 0; i++; } } } size_t _mi_commit_mask_committed_size(const mi_commit_mask_t* cm, size_t total) { mi_assert_internal((total%MI_COMMIT_MASK_BITS)==0); size_t count = 0; for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) { size_t mask = cm->mask[i]; if (~mask == 0) { count += MI_COMMIT_MASK_FIELD_BITS; } else { for (; mask != 0; mask >>= 1) { // todo: use popcount if ((mask&1)!=0) count++; } } } // we use total since for huge segments each commit bit may represent a larger size return ((total / MI_COMMIT_MASK_BITS) * count); } size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx) { size_t i = (*idx) / MI_COMMIT_MASK_FIELD_BITS; size_t ofs = (*idx) % MI_COMMIT_MASK_FIELD_BITS; size_t mask = 0; // find first ones while (i < MI_COMMIT_MASK_FIELD_COUNT) { mask = cm->mask[i]; mask >>= ofs; if (mask != 0) { while ((mask&1) == 0) { mask >>= 1; ofs++; } break; } i++; ofs = 0; } if (i >= MI_COMMIT_MASK_FIELD_COUNT) { // not found *idx = MI_COMMIT_MASK_BITS; return 0; } else { // found, count ones size_t count = 0; *idx = (i*MI_COMMIT_MASK_FIELD_BITS) + ofs; do { mi_assert_internal(ofs < MI_COMMIT_MASK_FIELD_BITS && (mask&1) == 1); do { count++; mask >>= 1; } while ((mask&1) == 1); if ((((*idx + count) % MI_COMMIT_MASK_FIELD_BITS) == 0)) { i++; if (i >= MI_COMMIT_MASK_FIELD_COUNT) break; mask = cm->mask[i]; ofs = 0; } } 
while ((mask&1) == 1); mi_assert_internal(count > 0); return count; } } /* -------------------------------------------------------------------------------- Segment allocation We allocate pages inside bigger "segments" (32 MiB on 64-bit). This is to avoid splitting VMA's on Linux and reduce fragmentation on other OS's. Each thread owns its own segments. Currently we have: - small pages (64KiB) - medium pages (512KiB) - large pages (4MiB), - huge segments have 1 page in one segment that can be larger than `MI_SEGMENT_SIZE`. it is used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or with alignment `> MI_BLOCK_ALIGNMENT_MAX`. The memory for a segment is usually committed on demand. (i.e. we are careful to not touch the memory until we actually allocate a block there) If a thread ends, it "abandons" pages that still contain live blocks. Such segments are abandoned and these can be reclaimed by still running threads, (much like work-stealing). -------------------------------------------------------------------------------- */ /* ----------------------------------------------------------- Slices ----------------------------------------------------------- */ static const mi_slice_t* mi_segment_slices_end(const mi_segment_t* segment) { return &segment->slices[segment->slice_entries]; } static uint8_t* mi_slice_start(const mi_slice_t* slice) { mi_segment_t* segment = _mi_ptr_segment(slice); mi_assert_internal(slice >= segment->slices && slice < mi_segment_slices_end(segment)); return ((uint8_t*)segment + ((slice - segment->slices)*MI_SEGMENT_SLICE_SIZE)); } /* ----------------------------------------------------------- Bins ----------------------------------------------------------- */ // Use bit scan forward to quickly find the first zero bit if it is available static inline size_t mi_slice_bin8(size_t slice_count) { if (slice_count<=1) return slice_count; mi_assert_internal(slice_count <= MI_SLICES_PER_SEGMENT); slice_count--; size_t s = mi_bsr(slice_count); // slice_count > 1 if 
(s <= 2) return slice_count + 1; size_t bin = ((s << 2) | ((slice_count >> (s - 2))&0x03)) - 4; return bin; } static inline size_t mi_slice_bin(size_t slice_count) { mi_assert_internal(slice_count*MI_SEGMENT_SLICE_SIZE <= MI_SEGMENT_SIZE); mi_assert_internal(mi_slice_bin8(MI_SLICES_PER_SEGMENT) <= MI_SEGMENT_BIN_MAX); size_t bin = mi_slice_bin8(slice_count); mi_assert_internal(bin <= MI_SEGMENT_BIN_MAX); return bin; } static inline size_t mi_slice_index(const mi_slice_t* slice) { mi_segment_t* segment = _mi_ptr_segment(slice); ptrdiff_t index = slice - segment->slices; mi_assert_internal(index >= 0 && index < (ptrdiff_t)segment->slice_entries); return index; } /* ----------------------------------------------------------- Slice span queues ----------------------------------------------------------- */ static void mi_span_queue_push(mi_span_queue_t* sq, mi_slice_t* slice) { // todo: or push to the end? mi_assert_internal(slice->prev == NULL && slice->next==NULL); slice->prev = NULL; // paranoia slice->next = sq->first; sq->first = slice; if (slice->next != NULL) slice->next->prev = slice; else sq->last = slice; slice->block_size = 0; // free } static mi_span_queue_t* mi_span_queue_for(size_t slice_count, mi_segments_tld_t* tld) { size_t bin = mi_slice_bin(slice_count); mi_span_queue_t* sq = &tld->spans[bin]; mi_assert_internal(sq->slice_count >= slice_count); return sq; } static void mi_span_queue_delete(mi_span_queue_t* sq, mi_slice_t* slice) { mi_assert_internal(slice->block_size==0 && slice->slice_count>0 && slice->slice_offset==0); // should work too if the queue does not contain slice (which can happen during reclaim) if (slice->prev != NULL) slice->prev->next = slice->next; if (slice == sq->first) sq->first = slice->next; if (slice->next != NULL) slice->next->prev = slice->prev; if (slice == sq->last) sq->last = slice->prev; slice->prev = NULL; slice->next = NULL; slice->block_size = 1; // no more free } /* 
----------------------------------------------------------- Invariant checking ----------------------------------------------------------- */ static bool mi_slice_is_used(const mi_slice_t* slice) { return (slice->block_size > 0); } #if (MI_DEBUG>=3) static bool mi_span_queue_contains(mi_span_queue_t* sq, mi_slice_t* slice) { for (mi_slice_t* s = sq->first; s != NULL; s = s->next) { if (s==slice) return true; } return false; } static bool mi_segment_is_valid(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_assert_internal(segment != NULL); mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); mi_assert_internal(segment->abandoned <= segment->used); mi_assert_internal(segment->thread_id == 0 || segment->thread_id == _mi_thread_id()); mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->purge_mask)); // can only decommit committed blocks //mi_assert_internal(segment->segment_info_size % MI_SEGMENT_SLICE_SIZE == 0); mi_slice_t* slice = &segment->slices[0]; const mi_slice_t* end = mi_segment_slices_end(segment); size_t used_count = 0; mi_span_queue_t* sq; while(slice < end) { mi_assert_internal(slice->slice_count > 0); mi_assert_internal(slice->slice_offset == 0); size_t index = mi_slice_index(slice); size_t maxindex = (index + slice->slice_count >= segment->slice_entries ? 
segment->slice_entries : index + slice->slice_count) - 1; if (mi_slice_is_used(slice)) { // a page in use, we need at least MAX_SLICE_OFFSET_COUNT valid back offsets used_count++; mi_assert_internal(slice->is_huge == (segment->kind == MI_SEGMENT_HUGE)); for (size_t i = 0; i <= MI_MAX_SLICE_OFFSET_COUNT && index + i <= maxindex; i++) { mi_assert_internal(segment->slices[index + i].slice_offset == i*sizeof(mi_slice_t)); mi_assert_internal(i==0 || segment->slices[index + i].slice_count == 0); mi_assert_internal(i==0 || segment->slices[index + i].block_size == 1); } // and the last entry as well (for coalescing) const mi_slice_t* last = slice + slice->slice_count - 1; if (last > slice && last < mi_segment_slices_end(segment)) { mi_assert_internal(last->slice_offset == (slice->slice_count-1)*sizeof(mi_slice_t)); mi_assert_internal(last->slice_count == 0); mi_assert_internal(last->block_size == 1); } } else { // free range of slices; only last slice needs a valid back offset mi_slice_t* last = &segment->slices[maxindex]; if (segment->kind != MI_SEGMENT_HUGE || slice->slice_count <= (segment->slice_entries - segment->segment_info_slices)) { mi_assert_internal((uint8_t*)slice == (uint8_t*)last - last->slice_offset); } mi_assert_internal(slice == last || last->slice_count == 0 ); mi_assert_internal(last->block_size == 0 || (segment->kind==MI_SEGMENT_HUGE && last->block_size==1)); if (segment->kind != MI_SEGMENT_HUGE && segment->thread_id != 0) { // segment is not huge or abandoned sq = mi_span_queue_for(slice->slice_count,tld); mi_assert_internal(mi_span_queue_contains(sq,slice)); } } slice = &segment->slices[maxindex+1]; } mi_assert_internal(slice == end); mi_assert_internal(used_count == segment->used + 1); return true; } #endif /* ----------------------------------------------------------- Segment size calculations ----------------------------------------------------------- */ static size_t mi_segment_info_size(mi_segment_t* segment) { return segment->segment_info_slices 
* MI_SEGMENT_SLICE_SIZE; } static uint8_t* _mi_segment_page_start_from_slice(const mi_segment_t* segment, const mi_slice_t* slice, size_t block_size, size_t* page_size) { const ptrdiff_t idx = slice - segment->slices; const size_t psize = (size_t)slice->slice_count * MI_SEGMENT_SLICE_SIZE; uint8_t* const pstart = (uint8_t*)segment + (idx*MI_SEGMENT_SLICE_SIZE); // make the start not OS page aligned for smaller blocks to avoid page/cache effects // note: the offset must always be a block_size multiple since we assume small allocations // are aligned (see `mi_heap_malloc_aligned`). size_t start_offset = 0; if (block_size > 0 && block_size <= MI_MAX_ALIGN_GUARANTEE) { // for small objects, ensure the page start is aligned with the block size (PR#66 by kickunderscore) const size_t adjust = block_size - ((uintptr_t)pstart % block_size); if (adjust < block_size && psize >= block_size + adjust) { start_offset += adjust; } } if (block_size >= MI_INTPTR_SIZE) { if (block_size <= 64) { start_offset += 3*block_size; } else if (block_size <= 512) { start_offset += block_size; } } start_offset = _mi_align_up(start_offset, MI_MAX_ALIGN_SIZE); mi_assert_internal(_mi_is_aligned(pstart + start_offset, MI_MAX_ALIGN_SIZE)); mi_assert_internal(block_size == 0 || block_size > MI_MAX_ALIGN_GUARANTEE || _mi_is_aligned(pstart + start_offset,block_size)); if (page_size != NULL) { *page_size = psize - start_offset; } return (pstart + start_offset); } // Start of the page available memory; can be used on uninitialized pages uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) { const mi_slice_t* slice = mi_page_to_slice((mi_page_t*)page); uint8_t* p = _mi_segment_page_start_from_slice(segment, slice, mi_page_block_size(page), page_size); mi_assert_internal(mi_page_block_size(page) > 0 || _mi_ptr_page(p) == page); mi_assert_internal(_mi_ptr_segment(p) == segment); return p; } static size_t mi_segment_calculate_slices(size_t required, size_t* 
info_slices) { size_t page_size = _mi_os_page_size(); size_t isize = _mi_align_up(sizeof(mi_segment_t), page_size); size_t guardsize = 0; if (MI_SECURE>0) { // in secure mode, we set up a protected page in between the segment info // and the page data (and one at the end of the segment) guardsize = page_size; if (required > 0) { required = _mi_align_up(required, MI_SEGMENT_SLICE_SIZE) + page_size; } } isize = _mi_align_up(isize + guardsize, MI_SEGMENT_SLICE_SIZE); if (info_slices != NULL) *info_slices = isize / MI_SEGMENT_SLICE_SIZE; size_t segment_size = (required==0 ? MI_SEGMENT_SIZE : _mi_align_up( required + isize + guardsize, MI_SEGMENT_SLICE_SIZE) ); mi_assert_internal(segment_size % MI_SEGMENT_SLICE_SIZE == 0); return (segment_size / MI_SEGMENT_SLICE_SIZE); } /* ---------------------------------------------------------------------------- Segment caches We keep a small segment cache per thread to increase local reuse and avoid setting/clearing guard pages in secure mode. ------------------------------------------------------------------------------- */ static void mi_segments_track_size(long segment_size, mi_segments_tld_t* tld) { if (segment_size>=0) _mi_stat_increase(&tld->stats->segments,1); else _mi_stat_decrease(&tld->stats->segments,1); tld->count += (segment_size >= 0 ? 
1 : -1); if (tld->count > tld->peak_count) tld->peak_count = tld->count; tld->current_size += segment_size; if (tld->current_size > tld->peak_size) tld->peak_size = tld->current_size; } static void mi_segment_os_free(mi_segment_t* segment, mi_segments_tld_t* tld) { segment->thread_id = 0; _mi_segment_map_freed_at(segment); mi_segments_track_size(-((long)mi_segment_size(segment)),tld); if (segment->was_reclaimed) { tld->reclaim_count--; segment->was_reclaimed = false; } if (MI_SECURE>0) { // _mi_os_unprotect(segment, mi_segment_size(segment)); // ensure no more guard pages are set // unprotect the guard pages; we cannot just unprotect the whole segment size as part may be decommitted size_t os_pagesize = _mi_os_page_size(); _mi_os_unprotect((uint8_t*)segment + mi_segment_info_size(segment) - os_pagesize, os_pagesize); uint8_t* end = (uint8_t*)segment + mi_segment_size(segment) - os_pagesize; _mi_os_unprotect(end, os_pagesize); } // purge delayed decommits now? (no, leave it to the arena) // mi_segment_try_purge(segment,true,tld->stats); const size_t size = mi_segment_size(segment); const size_t csize = _mi_commit_mask_committed_size(&segment->commit_mask, size); _mi_arena_free(segment, mi_segment_size(segment), csize, segment->memid); } /* ----------------------------------------------------------- Commit/Decommit ranges ----------------------------------------------------------- */ static void mi_segment_commit_mask(mi_segment_t* segment, bool conservative, uint8_t* p, size_t size, uint8_t** start_p, size_t* full_size, mi_commit_mask_t* cm) { mi_assert_internal(_mi_ptr_segment(p + 1) == segment); mi_assert_internal(segment->kind != MI_SEGMENT_HUGE); mi_commit_mask_create_empty(cm); if (size == 0 || size > MI_SEGMENT_SIZE || segment->kind == MI_SEGMENT_HUGE) return; const size_t segstart = mi_segment_info_size(segment); const size_t segsize = mi_segment_size(segment); if (p >= (uint8_t*)segment + segsize) return; size_t pstart = (p - (uint8_t*)segment); 
mi_assert_internal(pstart + size <= segsize); size_t start; size_t end; if (conservative) { // decommit conservative start = _mi_align_up(pstart, MI_COMMIT_SIZE); end = _mi_align_down(pstart + size, MI_COMMIT_SIZE); mi_assert_internal(start >= segstart); mi_assert_internal(end <= segsize); } else { // commit liberal start = _mi_align_down(pstart, MI_MINIMAL_COMMIT_SIZE); end = _mi_align_up(pstart + size, MI_MINIMAL_COMMIT_SIZE); } if (pstart >= segstart && start < segstart) { // note: the mask is also calculated for an initial commit of the info area start = segstart; } if (end > segsize) { end = segsize; } mi_assert_internal(start <= pstart && (pstart + size) <= end); mi_assert_internal(start % MI_COMMIT_SIZE==0 && end % MI_COMMIT_SIZE == 0); *start_p = (uint8_t*)segment + start; *full_size = (end > start ? end - start : 0); if (*full_size == 0) return; size_t bitidx = start / MI_COMMIT_SIZE; mi_assert_internal(bitidx < MI_COMMIT_MASK_BITS); size_t bitcount = *full_size / MI_COMMIT_SIZE; // can be 0 if (bitidx + bitcount > MI_COMMIT_MASK_BITS) { _mi_warning_message("commit mask overflow: idx=%zu count=%zu start=%zx end=%zx p=0x%p size=%zu fullsize=%zu\n", bitidx, bitcount, start, end, p, size, *full_size); } mi_assert_internal((bitidx + bitcount) <= MI_COMMIT_MASK_BITS); mi_commit_mask_create(bitidx, bitcount, cm); } static bool mi_segment_commit(mi_segment_t* segment, uint8_t* p, size_t size) { mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->purge_mask)); // commit liberal uint8_t* start = NULL; size_t full_size = 0; mi_commit_mask_t mask; mi_segment_commit_mask(segment, false /* conservative? 
*/, p, size, &start, &full_size, &mask); if (mi_commit_mask_is_empty(&mask) || full_size == 0) return true; if (!mi_commit_mask_all_set(&segment->commit_mask, &mask)) { // committing bool is_zero = false; mi_commit_mask_t cmask; mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask); _mi_stat_decrease(&_mi_stats_main.committed, _mi_commit_mask_committed_size(&cmask, MI_SEGMENT_SIZE)); // adjust for overlap if (!_mi_os_commit(start, full_size, &is_zero)) return false; mi_commit_mask_set(&segment->commit_mask, &mask); } // increase purge expiration when using part of delayed purges -- we assume more allocations are coming soon. if (mi_commit_mask_any_set(&segment->purge_mask, &mask)) { segment->purge_expire = _mi_clock_now() + mi_option_get(mi_option_purge_delay); } // always clear any delayed purges in our range (as they are either committed now) mi_commit_mask_clear(&segment->purge_mask, &mask); return true; } static bool mi_segment_ensure_committed(mi_segment_t* segment, uint8_t* p, size_t size) { mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->purge_mask)); // note: assumes commit_mask is always full for huge segments as otherwise the commit mask bits can overflow if (mi_commit_mask_is_full(&segment->commit_mask) && mi_commit_mask_is_empty(&segment->purge_mask)) return true; // fully committed mi_assert_internal(segment->kind != MI_SEGMENT_HUGE); return mi_segment_commit(segment, p, size); } static bool mi_segment_purge(mi_segment_t* segment, uint8_t* p, size_t size) { mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->purge_mask)); if (!segment->allow_purge) return true; // purge conservative uint8_t* start = NULL; size_t full_size = 0; mi_commit_mask_t mask; mi_segment_commit_mask(segment, true /* conservative? 
*/, p, size, &start, &full_size, &mask); if (mi_commit_mask_is_empty(&mask) || full_size==0) return true; if (mi_commit_mask_any_set(&segment->commit_mask, &mask)) { // purging mi_assert_internal((void*)start != (void*)segment); mi_assert_internal(segment->allow_decommit); const bool decommitted = _mi_os_purge(start, full_size); // reset or decommit if (decommitted) { mi_commit_mask_t cmask; mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask); _mi_stat_increase(&_mi_stats_main.committed, full_size - _mi_commit_mask_committed_size(&cmask, MI_SEGMENT_SIZE)); // adjust for double counting mi_commit_mask_clear(&segment->commit_mask, &mask); } } // always clear any scheduled purges in our range mi_commit_mask_clear(&segment->purge_mask, &mask); return true; } static void mi_segment_schedule_purge(mi_segment_t* segment, uint8_t* p, size_t size) { if (!segment->allow_purge) return; if (mi_option_get(mi_option_purge_delay) == 0) { mi_segment_purge(segment, p, size); } else { // register for future purge in the purge mask uint8_t* start = NULL; size_t full_size = 0; mi_commit_mask_t mask; mi_segment_commit_mask(segment, true /*conservative*/, p, size, &start, &full_size, &mask); if (mi_commit_mask_is_empty(&mask) || full_size==0) return; // update delayed commit mi_assert_internal(segment->purge_expire > 0 || mi_commit_mask_is_empty(&segment->purge_mask)); mi_commit_mask_t cmask; mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask); // only purge what is committed; span_free may try to decommit more mi_commit_mask_set(&segment->purge_mask, &cmask); mi_msecs_t now = _mi_clock_now(); if (segment->purge_expire == 0) { // no previous purgess, initialize now segment->purge_expire = now + mi_option_get(mi_option_purge_delay); } else if (segment->purge_expire <= now) { // previous purge mask already expired if (segment->purge_expire + mi_option_get(mi_option_purge_extend_delay) <= now) { mi_segment_try_purge(segment, true); } else { 
segment->purge_expire = now + mi_option_get(mi_option_purge_extend_delay); // (mi_option_get(mi_option_purge_delay) / 8); // wait a tiny bit longer in case there is a series of free's } } else { // previous purge mask is not yet expired, increase the expiration by a bit. segment->purge_expire += mi_option_get(mi_option_purge_extend_delay); } } } static void mi_segment_try_purge(mi_segment_t* segment, bool force) { if (!segment->allow_purge || segment->purge_expire == 0 || mi_commit_mask_is_empty(&segment->purge_mask)) return; mi_msecs_t now = _mi_clock_now(); if (!force && now < segment->purge_expire) return; mi_commit_mask_t mask = segment->purge_mask; segment->purge_expire = 0; mi_commit_mask_create_empty(&segment->purge_mask); size_t idx; size_t count; mi_commit_mask_foreach(&mask, idx, count) { // if found, decommit that sequence if (count > 0) { uint8_t* p = (uint8_t*)segment + (idx*MI_COMMIT_SIZE); size_t size = count * MI_COMMIT_SIZE; mi_segment_purge(segment, p, size); } } mi_commit_mask_foreach_end() mi_assert_internal(mi_commit_mask_is_empty(&segment->purge_mask)); } // called from `mi_heap_collect_ex` // this can be called per-page so it is important that try_purge has fast exit path void _mi_segment_collect(mi_segment_t* segment, bool force) { mi_segment_try_purge(segment, force); } /* ----------------------------------------------------------- Span free ----------------------------------------------------------- */ static bool mi_segment_is_abandoned(mi_segment_t* segment) { return (mi_atomic_load_relaxed(&segment->thread_id) == 0); } // note: can be called on abandoned segments static void mi_segment_span_free(mi_segment_t* segment, size_t slice_index, size_t slice_count, bool allow_purge, mi_segments_tld_t* tld) { mi_assert_internal(slice_index < segment->slice_entries); mi_span_queue_t* sq = (segment->kind == MI_SEGMENT_HUGE || mi_segment_is_abandoned(segment) ? 
NULL : mi_span_queue_for(slice_count,tld)); if (slice_count==0) slice_count = 1; mi_assert_internal(slice_index + slice_count - 1 < segment->slice_entries); // set first and last slice (the intermediates can be undetermined) mi_slice_t* slice = &segment->slices[slice_index]; slice->slice_count = (uint32_t)slice_count; mi_assert_internal(slice->slice_count == slice_count); // no overflow? slice->slice_offset = 0; if (slice_count > 1) { mi_slice_t* last = slice + slice_count - 1; mi_slice_t* end = (mi_slice_t*)mi_segment_slices_end(segment); if (last > end) { last = end; } last->slice_count = 0; last->slice_offset = (uint32_t)(sizeof(mi_page_t)*(slice_count - 1)); last->block_size = 0; } // perhaps decommit if (allow_purge) { mi_segment_schedule_purge(segment, mi_slice_start(slice), slice_count * MI_SEGMENT_SLICE_SIZE); } // and push it on the free page queue (if it was not a huge page) if (sq != NULL) mi_span_queue_push( sq, slice ); else slice->block_size = 0; // mark huge page as free anyways } /* // called from reclaim to add existing free spans static void mi_segment_span_add_free(mi_slice_t* slice, mi_segments_tld_t* tld) { mi_segment_t* segment = _mi_ptr_segment(slice); mi_assert_internal(slice->xblock_size==0 && slice->slice_count>0 && slice->slice_offset==0); size_t slice_index = mi_slice_index(slice); mi_segment_span_free(segment,slice_index,slice->slice_count,tld); } */ static void mi_segment_span_remove_from_queue(mi_slice_t* slice, mi_segments_tld_t* tld) { mi_assert_internal(slice->slice_count > 0 && slice->slice_offset==0 && slice->block_size==0); mi_assert_internal(_mi_ptr_segment(slice)->kind != MI_SEGMENT_HUGE); mi_span_queue_t* sq = mi_span_queue_for(slice->slice_count, tld); mi_span_queue_delete(sq, slice); } // note: can be called on abandoned segments static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_tld_t* tld) { mi_assert_internal(slice != NULL && slice->slice_count > 0 && slice->slice_offset == 0); mi_segment_t* 
const segment = _mi_ptr_segment(slice); // for huge pages, just mark as free but don't add to the queues if (segment->kind == MI_SEGMENT_HUGE) { // issue #691: segment->used can be 0 if the huge page block was freed while abandoned (reclaim will get here in that case) mi_assert_internal((segment->used==0 && slice->block_size==0) || segment->used == 1); // decreased right after this call in `mi_segment_page_clear` slice->block_size = 0; // mark as free anyways // we should mark the last slice `xblock_size=0` now to maintain invariants but we skip it to // avoid a possible cache miss (and the segment is about to be freed) return slice; } // otherwise coalesce the span and add to the free span queues const bool is_abandoned = (segment->thread_id == 0); // mi_segment_is_abandoned(segment); size_t slice_count = slice->slice_count; mi_slice_t* next = slice + slice->slice_count; mi_assert_internal(next <= mi_segment_slices_end(segment)); if (next < mi_segment_slices_end(segment) && next->block_size==0) { // free next block -- remove it from free and merge mi_assert_internal(next->slice_count > 0 && next->slice_offset==0); slice_count += next->slice_count; // extend if (!is_abandoned) { mi_segment_span_remove_from_queue(next, tld); } } if (slice > segment->slices) { mi_slice_t* prev = mi_slice_first(slice - 1); mi_assert_internal(prev >= segment->slices); if (prev->block_size==0) { // free previous slice -- remove it from free and merge mi_assert_internal(prev->slice_count > 0 && prev->slice_offset==0); slice_count += prev->slice_count; slice->slice_count = 0; slice->slice_offset = (uint32_t)((uint8_t*)slice - (uint8_t*)prev); // set the slice offset for `segment_force_abandon` (in case the previous free block is very large). 
if (!is_abandoned) { mi_segment_span_remove_from_queue(prev, tld); } slice = prev; } } // and add the new free page mi_segment_span_free(segment, mi_slice_index(slice), slice_count, true, tld); return slice; } /* ----------------------------------------------------------- Page allocation ----------------------------------------------------------- */ // Note: may still return NULL if committing the memory failed static mi_page_t* mi_segment_span_allocate(mi_segment_t* segment, size_t slice_index, size_t slice_count) { mi_assert_internal(slice_index < segment->slice_entries); mi_slice_t* const slice = &segment->slices[slice_index]; mi_assert_internal(slice->block_size==0 || slice->block_size==1); // commit before changing the slice data if (!mi_segment_ensure_committed(segment, _mi_segment_page_start_from_slice(segment, slice, 0, NULL), slice_count * MI_SEGMENT_SLICE_SIZE)) { return NULL; // commit failed! } // convert the slices to a page slice->slice_offset = 0; slice->slice_count = (uint32_t)slice_count; mi_assert_internal(slice->slice_count == slice_count); const size_t bsize = slice_count * MI_SEGMENT_SLICE_SIZE; slice->block_size = bsize; mi_page_t* page = mi_slice_to_page(slice); mi_assert_internal(mi_page_block_size(page) == bsize); // set slice back pointers for the first MI_MAX_SLICE_OFFSET_COUNT entries size_t extra = slice_count-1; if (extra > MI_MAX_SLICE_OFFSET_COUNT) extra = MI_MAX_SLICE_OFFSET_COUNT; if (slice_index + extra >= segment->slice_entries) extra = segment->slice_entries - slice_index - 1; // huge objects may have more slices than avaiable entries in the segment->slices mi_slice_t* slice_next = slice + 1; for (size_t i = 1; i <= extra; i++, slice_next++) { slice_next->slice_offset = (uint32_t)(sizeof(mi_slice_t)*i); slice_next->slice_count = 0; slice_next->block_size = 1; } // and also for the last one (if not set already) (the last one is needed for coalescing and for large alignments) // note: the cast is needed for ubsan since the index 
can be larger than MI_SLICES_PER_SEGMENT for huge allocations (see #543) mi_slice_t* last = slice + slice_count - 1; mi_slice_t* end = (mi_slice_t*)mi_segment_slices_end(segment); if (last > end) last = end; if (last > slice) { last->slice_offset = (uint32_t)(sizeof(mi_slice_t) * (last - slice)); last->slice_count = 0; last->block_size = 1; } // and initialize the page page->is_committed = true; page->is_huge = (segment->kind == MI_SEGMENT_HUGE); segment->used++; return page; } static void mi_segment_slice_split(mi_segment_t* segment, mi_slice_t* slice, size_t slice_count, mi_segments_tld_t* tld) { mi_assert_internal(_mi_ptr_segment(slice) == segment); mi_assert_internal(slice->slice_count >= slice_count); mi_assert_internal(slice->block_size > 0); // no more in free queue if (slice->slice_count <= slice_count) return; mi_assert_internal(segment->kind != MI_SEGMENT_HUGE); size_t next_index = mi_slice_index(slice) + slice_count; size_t next_count = slice->slice_count - slice_count; mi_segment_span_free(segment, next_index, next_count, false /* don't purge left-over part */, tld); slice->slice_count = (uint32_t)slice_count; } static mi_page_t* mi_segments_page_find_and_allocate(size_t slice_count, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld) { mi_assert_internal(slice_count*MI_SEGMENT_SLICE_SIZE <= MI_LARGE_OBJ_SIZE_MAX); // search from best fit up mi_span_queue_t* sq = mi_span_queue_for(slice_count, tld); if (slice_count == 0) slice_count = 1; while (sq <= &tld->spans[MI_SEGMENT_BIN_MAX]) { for (mi_slice_t* slice = sq->first; slice != NULL; slice = slice->next) { if (slice->slice_count >= slice_count) { // found one mi_segment_t* segment = _mi_ptr_segment(slice); if (_mi_arena_memid_is_suitable(segment->memid, req_arena_id)) { // found a suitable page span mi_span_queue_delete(sq, slice); if (slice->slice_count > slice_count) { mi_segment_slice_split(segment, slice, slice_count, tld); } mi_assert_internal(slice != NULL && slice->slice_count == slice_count && 
slice->block_size > 0); mi_page_t* page = mi_segment_span_allocate(segment, mi_slice_index(slice), slice->slice_count); if (page == NULL) { // commit failed; return NULL but first restore the slice mi_segment_span_free_coalesce(slice, tld); return NULL; } return page; } } } sq++; } // could not find a page.. return NULL; } /* ----------------------------------------------------------- Segment allocation ----------------------------------------------------------- */ static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment, bool eager_delayed, mi_arena_id_t req_arena_id, size_t* psegment_slices, size_t* pinfo_slices, bool commit, mi_segments_tld_t* tld) { mi_memid_t memid; bool allow_large = (!eager_delayed && (MI_SECURE == 0)); // only allow large OS pages once we are no longer lazy size_t align_offset = 0; size_t alignment = MI_SEGMENT_ALIGN; if (page_alignment > 0) { // mi_assert_internal(huge_page != NULL); mi_assert_internal(page_alignment >= MI_SEGMENT_ALIGN); alignment = page_alignment; const size_t info_size = (*pinfo_slices) * MI_SEGMENT_SLICE_SIZE; align_offset = _mi_align_up( info_size, MI_SEGMENT_ALIGN ); const size_t extra = align_offset - info_size; // recalculate due to potential guard pages *psegment_slices = mi_segment_calculate_slices(required + extra, pinfo_slices); mi_assert_internal(*psegment_slices > 0 && *psegment_slices <= UINT32_MAX); } const size_t segment_size = (*psegment_slices) * MI_SEGMENT_SLICE_SIZE; mi_segment_t* segment = (mi_segment_t*)_mi_arena_alloc_aligned(segment_size, alignment, align_offset, commit, allow_large, req_arena_id, &memid); if (segment == NULL) { return NULL; // failed to allocate } // ensure metadata part of the segment is committed mi_commit_mask_t commit_mask; if (memid.initially_committed) { mi_commit_mask_create_full(&commit_mask); } else { // at least commit the info slices const size_t commit_needed = _mi_divide_up((*pinfo_slices)*MI_SEGMENT_SLICE_SIZE, MI_COMMIT_SIZE); 
mi_assert_internal(commit_needed>0); mi_commit_mask_create(0, commit_needed, &commit_mask); mi_assert_internal(commit_needed*MI_COMMIT_SIZE >= (*pinfo_slices)*MI_SEGMENT_SLICE_SIZE); if (!_mi_os_commit(segment, commit_needed*MI_COMMIT_SIZE, NULL)) { _mi_arena_free(segment,segment_size,0,memid); return NULL; } } mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0); segment->memid = memid; segment->allow_decommit = !memid.is_pinned; segment->allow_purge = segment->allow_decommit && (mi_option_get(mi_option_purge_delay) >= 0); segment->segment_size = segment_size; segment->subproc = tld->subproc; segment->commit_mask = commit_mask; segment->purge_expire = 0; mi_commit_mask_create_empty(&segment->purge_mask); mi_segments_track_size((long)(segment_size), tld); _mi_segment_map_allocated_at(segment); return segment; } // Allocate a segment from the OS aligned to `MI_SEGMENT_SIZE` . static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld, mi_page_t** huge_page) { mi_assert_internal((required==0 && huge_page==NULL) || (required>0 && huge_page != NULL)); // calculate needed sizes first size_t info_slices; size_t segment_slices = mi_segment_calculate_slices(required, &info_slices); mi_assert_internal(segment_slices > 0 && segment_slices <= UINT32_MAX); // Commit eagerly only if not the first N lazy segments (to reduce impact of many threads that allocate just a little) const bool eager_delay = (// !_mi_os_has_overcommit() && // never delay on overcommit systems _mi_current_thread_count() > 1 && // do not delay for the first N threads tld->peak_count < (size_t)mi_option_get(mi_option_eager_commit_delay)); const bool eager = !eager_delay && mi_option_is_enabled(mi_option_eager_commit); bool commit = eager || (required > 0); // Allocate the segment from the OS mi_segment_t* segment = mi_segment_os_alloc(required, page_alignment, eager_delay, req_arena_id, &segment_slices, 
&info_slices, commit, tld); if (segment == NULL) return NULL; // zero the segment info? -- not always needed as it may be zero initialized from the OS if (!segment->memid.initially_zero) { ptrdiff_t ofs = offsetof(mi_segment_t, next); size_t prefix = offsetof(mi_segment_t, slices) - ofs; size_t zsize = prefix + (sizeof(mi_slice_t) * (segment_slices + 1)); // one more _mi_memzero((uint8_t*)segment + ofs, zsize); } // initialize the rest of the segment info const size_t slice_entries = (segment_slices > MI_SLICES_PER_SEGMENT ? MI_SLICES_PER_SEGMENT : segment_slices); segment->segment_slices = segment_slices; segment->segment_info_slices = info_slices; segment->thread_id = _mi_thread_id(); segment->cookie = _mi_ptr_cookie(segment); segment->slice_entries = slice_entries; segment->kind = (required == 0 ? MI_SEGMENT_NORMAL : MI_SEGMENT_HUGE); // _mi_memzero(segment->slices, sizeof(mi_slice_t)*(info_slices+1)); _mi_stat_increase(&tld->stats->page_committed, mi_segment_info_size(segment)); // set up guard pages size_t guard_slices = 0; if (MI_SECURE>0) { // in secure mode, we set up a protected page in between the segment info // and the page data, and at the end of the segment. 
size_t os_pagesize = _mi_os_page_size(); _mi_os_protect((uint8_t*)segment + mi_segment_info_size(segment) - os_pagesize, os_pagesize); uint8_t* end = (uint8_t*)segment + mi_segment_size(segment) - os_pagesize; mi_segment_ensure_committed(segment, end, os_pagesize); _mi_os_protect(end, os_pagesize); if (slice_entries == segment_slices) segment->slice_entries--; // don't use the last slice :-( guard_slices = 1; } // reserve first slices for segment info mi_page_t* page0 = mi_segment_span_allocate(segment, 0, info_slices); mi_assert_internal(page0!=NULL); if (page0==NULL) return NULL; // cannot fail as we always commit in advance mi_assert_internal(segment->used == 1); segment->used = 0; // don't count our internal slices towards usage // initialize initial free pages if (segment->kind == MI_SEGMENT_NORMAL) { // not a huge page mi_assert_internal(huge_page==NULL); mi_segment_span_free(segment, info_slices, segment->slice_entries - info_slices, false /* don't purge */, tld); } else { mi_assert_internal(huge_page!=NULL); mi_assert_internal(mi_commit_mask_is_empty(&segment->purge_mask)); mi_assert_internal(mi_commit_mask_is_full(&segment->commit_mask)); *huge_page = mi_segment_span_allocate(segment, info_slices, segment_slices - info_slices - guard_slices); mi_assert_internal(*huge_page != NULL); // cannot fail as we commit in advance } mi_assert_expensive(mi_segment_is_valid(segment,tld)); return segment; } static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) { MI_UNUSED(force); mi_assert_internal(segment != NULL); mi_assert_internal(segment->next == NULL); mi_assert_internal(segment->used == 0); // in `mi_segment_force_abandon` we set this to true to ensure the segment's memory stays valid if (segment->dont_free) return; // Remove the free pages mi_slice_t* slice = &segment->slices[0]; const mi_slice_t* end = mi_segment_slices_end(segment); #if MI_DEBUG>1 size_t page_count = 0; #endif while (slice < end) { 
mi_assert_internal(slice->slice_count > 0); mi_assert_internal(slice->slice_offset == 0); mi_assert_internal(mi_slice_index(slice)==0 || slice->block_size == 0); // no more used pages .. if (slice->block_size == 0 && segment->kind != MI_SEGMENT_HUGE) { mi_segment_span_remove_from_queue(slice, tld); } #if MI_DEBUG>1 page_count++; #endif slice = slice + slice->slice_count; } mi_assert_internal(page_count == 2); // first page is allocated by the segment itself // stats // _mi_stat_decrease(&tld->stats->page_committed, mi_segment_info_size(segment)); // return it to the OS mi_segment_os_free(segment, tld); } /* ----------------------------------------------------------- Page Free ----------------------------------------------------------- */ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld); // note: can be called on abandoned pages static mi_slice_t* mi_segment_page_clear(mi_page_t* page, mi_segments_tld_t* tld) { mi_assert_internal(page->block_size > 0); mi_assert_internal(mi_page_all_free(page)); mi_segment_t* segment = _mi_ptr_segment(page); mi_assert_internal(segment->used > 0); size_t inuse = page->capacity * mi_page_block_size(page); _mi_stat_decrease(&tld->stats->page_committed, inuse); _mi_stat_decrease(&tld->stats->pages, 1); // reset the page memory to reduce memory pressure? 
if (segment->allow_decommit && mi_option_is_enabled(mi_option_deprecated_page_reset)) { size_t psize; uint8_t* start = _mi_segment_page_start(segment, page, &psize); _mi_os_reset(start, psize); } // zero the page data, but not the segment fields and heap tag page->is_zero_init = false; uint8_t heap_tag = page->heap_tag; ptrdiff_t ofs = offsetof(mi_page_t, capacity); _mi_memzero((uint8_t*)page + ofs, sizeof(*page) - ofs); page->block_size = 1; page->heap_tag = heap_tag; // and free it mi_slice_t* slice = mi_segment_span_free_coalesce(mi_page_to_slice(page), tld); segment->used--; // cannot assert segment valid as it is called during reclaim // mi_assert_expensive(mi_segment_is_valid(segment, tld)); return slice; } void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld) { mi_assert(page != NULL); mi_segment_t* segment = _mi_page_segment(page); mi_assert_expensive(mi_segment_is_valid(segment,tld)); // mark it as free now mi_segment_page_clear(page, tld); mi_assert_expensive(mi_segment_is_valid(segment, tld)); if (segment->used == 0) { // no more used pages; remove from the free list and free the segment mi_segment_free(segment, force, tld); } else if (segment->used == segment->abandoned) { // only abandoned pages; remove from free list and abandon mi_segment_abandon(segment,tld); } else { // perform delayed purges mi_segment_try_purge(segment, false /* force? */); } } /* ----------------------------------------------------------- Abandonment When threads terminate, they can leave segments with live blocks (reachable through other threads). Such segments are "abandoned" and will be reclaimed by other threads to reuse their pages and/or free them eventually. The `thread_id` of such segments is 0. When a block is freed in an abandoned segment, the segment is reclaimed into that thread. 
Moreover, if threads are looking for a fresh segment, they will first consider abandoned segments -- these can be found by scanning the arena memory (segments outside arena memoryare only reclaimed by a free). ----------------------------------------------------------- */ /* ----------------------------------------------------------- Abandon segment/page ----------------------------------------------------------- */ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_assert_internal(segment->used == segment->abandoned); mi_assert_internal(segment->used > 0); mi_assert_internal(segment->abandoned_visits == 0); mi_assert_expensive(mi_segment_is_valid(segment,tld)); // remove the free pages from the free page queues mi_slice_t* slice = &segment->slices[0]; const mi_slice_t* end = mi_segment_slices_end(segment); while (slice < end) { mi_assert_internal(slice->slice_count > 0); mi_assert_internal(slice->slice_offset == 0); if (slice->block_size == 0) { // a free page mi_segment_span_remove_from_queue(slice,tld); slice->block_size = 0; // but keep it free } slice = slice + slice->slice_count; } // perform delayed decommits (forcing is much slower on mstress) // Only abandoned segments in arena memory can be reclaimed without a free // so if a segment is not from an arena we force purge here to be conservative. 
const bool force_purge = (segment->memid.memkind != MI_MEM_ARENA) || mi_option_is_enabled(mi_option_abandoned_page_purge); mi_segment_try_purge(segment, force_purge); // all pages in the segment are abandoned; add it to the abandoned list _mi_stat_increase(&tld->stats->segments_abandoned, 1); mi_segments_track_size(-((long)mi_segment_size(segment)), tld); segment->thread_id = 0; segment->abandoned_visits = 1; // from 0 to 1 to signify it is abandoned if (segment->was_reclaimed) { tld->reclaim_count--; segment->was_reclaimed = false; } _mi_arena_segment_mark_abandoned(segment); } void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) { mi_assert(page != NULL); mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE); mi_assert_internal(mi_page_heap(page) == NULL); mi_segment_t* segment = _mi_page_segment(page); mi_assert_expensive(mi_segment_is_valid(segment,tld)); segment->abandoned++; _mi_stat_increase(&tld->stats->pages_abandoned, 1); mi_assert_internal(segment->abandoned <= segment->used); if (segment->used == segment->abandoned) { // all pages are abandoned, abandon the entire segment mi_segment_abandon(segment, tld); } } /* ----------------------------------------------------------- Reclaim abandoned pages ----------------------------------------------------------- */ static mi_slice_t* mi_slices_start_iterate(mi_segment_t* segment, const mi_slice_t** end) { mi_slice_t* slice = &segment->slices[0]; *end = mi_segment_slices_end(segment); mi_assert_internal(slice->slice_count>0 && slice->block_size>0); // segment allocated page slice = slice + slice->slice_count; // skip the first segment allocated page return slice; } // Possibly free pages and check if free space is available static bool mi_segment_check_free(mi_segment_t* segment, size_t slices_needed, size_t block_size, mi_segments_tld_t* tld) { mi_assert_internal(mi_segment_is_abandoned(segment)); bool has_page = false; // for all slices const mi_slice_t* end; mi_slice_t* 
slice = mi_slices_start_iterate(segment, &end); while (slice < end) { mi_assert_internal(slice->slice_count > 0); mi_assert_internal(slice->slice_offset == 0); if (mi_slice_is_used(slice)) { // used page // ensure used count is up to date and collect potential concurrent frees mi_page_t* const page = mi_slice_to_page(slice); _mi_page_free_collect(page, false); if (mi_page_all_free(page)) { // if this page is all free now, free it without adding to any queues (yet) mi_assert_internal(page->next == NULL && page->prev==NULL); _mi_stat_decrease(&tld->stats->pages_abandoned, 1); segment->abandoned--; slice = mi_segment_page_clear(page, tld); // re-assign slice due to coalesce! mi_assert_internal(!mi_slice_is_used(slice)); if (slice->slice_count >= slices_needed) { has_page = true; } } else if (mi_page_block_size(page) == block_size && mi_page_has_any_available(page)) { // a page has available free blocks of the right size has_page = true; } } else { // empty span if (slice->slice_count >= slices_needed) { has_page = true; } } slice = slice + slice->slice_count; } return has_page; } // Reclaim an abandoned segment; returns NULL if the segment was freed // set `right_page_reclaimed` to `true` if it reclaimed a page of the right `block_size` that was not full. static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, size_t requested_block_size, bool* right_page_reclaimed, mi_segments_tld_t* tld) { if (right_page_reclaimed != NULL) { *right_page_reclaimed = false; } // can be 0 still with abandoned_next, or already a thread id for segments outside an arena that are reclaimed on a free. 
mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0 || mi_atomic_load_relaxed(&segment->thread_id) == _mi_thread_id()); mi_assert_internal(segment->subproc == heap->tld->segments.subproc); // only reclaim within the same subprocess mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); segment->abandoned_visits = 0; segment->was_reclaimed = true; tld->reclaim_count++; mi_segments_track_size((long)mi_segment_size(segment), tld); mi_assert_internal(segment->next == NULL); _mi_stat_decrease(&tld->stats->segments_abandoned, 1); // for all slices const mi_slice_t* end; mi_slice_t* slice = mi_slices_start_iterate(segment, &end); while (slice < end) { mi_assert_internal(slice->slice_count > 0); mi_assert_internal(slice->slice_offset == 0); if (mi_slice_is_used(slice)) { // in use: reclaim the page in our heap mi_page_t* page = mi_slice_to_page(slice); mi_assert_internal(page->is_committed); mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE); mi_assert_internal(mi_page_heap(page) == NULL); mi_assert_internal(page->next == NULL && page->prev==NULL); _mi_stat_decrease(&tld->stats->pages_abandoned, 1); segment->abandoned--; // get the target heap for this thread which has a matching heap tag (so we reclaim into a matching heap) mi_heap_t* target_heap = _mi_heap_by_tag(heap, page->heap_tag); // allow custom heaps to separate objects if (target_heap == NULL) { target_heap = heap; _mi_error_message(EFAULT, "page with tag %u cannot be reclaimed by a heap with the same tag (using heap tag %u instead)\n", page->heap_tag, heap->tag ); } // associate the heap with this page, and allow heap thread delayed free again. 
mi_page_set_heap(page, target_heap); _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set) _mi_page_free_collect(page, false); // ensure used count is up to date if (mi_page_all_free(page)) { // if everything free by now, free the page slice = mi_segment_page_clear(page, tld); // set slice again due to coalesceing } else { // otherwise reclaim it into the heap _mi_page_reclaim(target_heap, page); if (requested_block_size == mi_page_block_size(page) && mi_page_has_any_available(page) && heap == target_heap) { if (right_page_reclaimed != NULL) { *right_page_reclaimed = true; } } } } else { // the span is free, add it to our page queues slice = mi_segment_span_free_coalesce(slice, tld); // set slice again due to coalesceing } mi_assert_internal(slice->slice_count>0 && slice->slice_offset==0); slice = slice + slice->slice_count; } mi_assert(segment->abandoned == 0); mi_assert_expensive(mi_segment_is_valid(segment, tld)); if (segment->used == 0) { // due to page_clear mi_assert_internal(right_page_reclaimed == NULL || !(*right_page_reclaimed)); mi_segment_free(segment, false, tld); return NULL; } else { return segment; } } // attempt to reclaim a particular segment (called from multi threaded free `alloc.c:mi_free_block_mt`) bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { if (mi_atomic_load_relaxed(&segment->thread_id) != 0) return false; // it is not abandoned if (segment->subproc != heap->tld->segments.subproc) return false; // only reclaim within the same subprocess if (!_mi_heap_memid_is_suitable(heap,segment->memid)) return false; // don't reclaim between exclusive and non-exclusive arena's const long target = _mi_option_get_fast(mi_option_target_segments_per_thread); if (target > 0 && (size_t)target <= heap->tld->segments.count) return false; // don't reclaim if going above the target count // don't reclaim more from a `free` call than half the current segments // this is to prevent a pure 
free-ing thread to start owning too many segments // (but not for out-of-arena segments as that is the main way to be reclaimed for those) if (segment->memid.memkind == MI_MEM_ARENA && heap->tld->segments.reclaim_count * 2 > heap->tld->segments.count) { return false; } if (_mi_arena_segment_clear_abandoned(segment)) { // atomically unabandon mi_segment_t* res = mi_segment_reclaim(segment, heap, 0, NULL, &heap->tld->segments); mi_assert_internal(res == segment); return (res != NULL); } return false; } void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) { mi_segment_t* segment; mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, tld->subproc, true /* visit all, blocking */, ¤t); while ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL) { mi_segment_reclaim(segment, heap, 0, NULL, tld); } _mi_arena_field_cursor_done(¤t); } static bool segment_count_is_within_target(mi_segments_tld_t* tld, size_t* ptarget) { const size_t target = (size_t)mi_option_get_clamp(mi_option_target_segments_per_thread, 0, 1024); if (ptarget != NULL) { *ptarget = target; } return (target == 0 || tld->count < target); } static long mi_segment_get_reclaim_tries(mi_segments_tld_t* tld) { // limit the tries to 10% (default) of the abandoned segments with at least 8 and at most 1024 tries. const size_t perc = (size_t)mi_option_get_clamp(mi_option_max_segment_reclaim, 0, 100); if (perc <= 0) return 0; const size_t total_count = mi_atomic_load_relaxed(&tld->subproc->abandoned_count); if (total_count == 0) return 0; const size_t relative_count = (total_count > 10000 ? (total_count / 100) * perc : (total_count * perc) / 100); // avoid overflow long max_tries = (long)(relative_count <= 1 ? 1 : (relative_count > 1024 ? 
1024 : relative_count)); if (max_tries < 8 && total_count > 8) { max_tries = 8; } return max_tries; } static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slices, size_t block_size, bool* reclaimed, mi_segments_tld_t* tld) { *reclaimed = false; long max_tries = mi_segment_get_reclaim_tries(tld); if (max_tries <= 0) return NULL; mi_segment_t* result = NULL; mi_segment_t* segment = NULL; mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, tld->subproc, false /* non-blocking */, ¤t); while (segment_count_is_within_target(tld,NULL) && (max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL)) { mi_assert(segment->subproc == heap->tld->segments.subproc); // cursor only visits segments in our sub-process segment->abandoned_visits++; // todo: should we respect numa affinity for abandoned reclaim? perhaps only for the first visit? // todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments and use many tries // Perhaps we can skip non-suitable ones in a better way? bool is_suitable = _mi_heap_memid_is_suitable(heap, segment->memid); bool has_page = mi_segment_check_free(segment,needed_slices,block_size,tld); // try to free up pages (due to concurrent frees) if (segment->used == 0) { // free the segment (by forced reclaim) to make it available to other threads. // note1: we prefer to free a segment as that might lead to reclaiming another // segment that is still partially used. // note2: we could in principle optimize this by skipping reclaim and directly // freeing but that would violate some invariants temporarily) mi_segment_reclaim(segment, heap, 0, NULL, tld); } else if (has_page && is_suitable) { // found a large enough free span, or a page of the right block_size with free space // we return the result of reclaim (which is usually `segment`) as it might free // the segment due to concurrent frees (in which case `NULL` is returned). 
result = mi_segment_reclaim(segment, heap, block_size, reclaimed, tld); break; } else if (segment->abandoned_visits > 3 && is_suitable) { // always reclaim on 3rd visit to limit the abandoned segment count. mi_segment_reclaim(segment, heap, 0, NULL, tld); } else { // otherwise, push on the visited list so it gets not looked at too quickly again max_tries++; // don't count this as a try since it was not suitable mi_segment_try_purge(segment, false /* true force? */); // force purge if needed as we may not visit soon again _mi_arena_segment_mark_abandoned(segment); } } _mi_arena_field_cursor_done(¤t); return result; } // collect abandoned segments void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld) { mi_segment_t* segment; mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, tld->subproc, force /* blocking? */, ¤t); long max_tries = (force ? (long)mi_atomic_load_relaxed(&tld->subproc->abandoned_count) : 1024); // limit latency while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL)) { mi_segment_check_free(segment,0,0,tld); // try to free up pages (due to concurrent frees) if (segment->used == 0) { // free the segment (by forced reclaim) to make it available to other threads. // note: we could in principle optimize this by skipping reclaim and directly // freeing but that would violate some invariants temporarily) mi_segment_reclaim(segment, heap, 0, NULL, tld); } else { // otherwise, purge if needed and push on the visited list // note: forced purge can be expensive if many threads are destroyed/created as in mstress. 
mi_segment_try_purge(segment, force); _mi_arena_segment_mark_abandoned(segment); } } _mi_arena_field_cursor_done(¤t); } /* ----------------------------------------------------------- Force abandon a segment that is in use by our thread ----------------------------------------------------------- */ // force abandon a segment static void mi_segment_force_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_assert_internal(!mi_segment_is_abandoned(segment)); mi_assert_internal(!segment->dont_free); // ensure the segment does not get free'd underneath us (so we can check if a page has been freed in `mi_page_force_abandon`) segment->dont_free = true; // for all slices const mi_slice_t* end; mi_slice_t* slice = mi_slices_start_iterate(segment, &end); while (slice < end) { mi_assert_internal(slice->slice_count > 0); mi_assert_internal(slice->slice_offset == 0); if (mi_slice_is_used(slice)) { // ensure used count is up to date and collect potential concurrent frees mi_page_t* const page = mi_slice_to_page(slice); _mi_page_free_collect(page, false); { // abandon the page if it is still in-use (this will free it if possible as well) mi_assert_internal(segment->used > 0); if (segment->used == segment->abandoned+1) { // the last page.. abandon and return as the segment will be abandoned after this // and we should no longer access it. segment->dont_free = false; _mi_page_force_abandon(page); return; } else { // abandon and continue _mi_page_force_abandon(page); // it might be freed, reset the slice (note: relies on coalesce setting the slice_offset) slice = mi_slice_first(slice); } } } slice = slice + slice->slice_count; } segment->dont_free = false; mi_assert(segment->used == segment->abandoned); mi_assert(segment->used == 0); if (segment->used == 0) { // paranoia // all free now mi_segment_free(segment, false, tld); } else { // perform delayed purges mi_segment_try_purge(segment, false /* force? */); } } // try abandon segments. 
// this should be called from `reclaim_or_alloc` so we know all segments are (about) fully in use.
// Force-abandons segments that hold pages from the heap's full-page queue while
// `tld->count` is at least 75% of `target` (the full `target` when `target <= 4`).
// At most 64 segments are abandoned per call; a `target` of 0 or 1 is a no-op.
static void mi_segments_try_abandon_to_target(mi_heap_t* heap, size_t target, mi_segments_tld_t* tld) {
  if (target <= 1) return;
  const size_t min_target = (target > 4 ? (target*3)/4 : target);  // 75%
  // todo: we should maintain a list of segments per thread; for now, only consider segments from the heap full pages
  for (int i = 0; i < 64 && tld->count >= min_target; i++) {
    mi_page_t* page = heap->pages[MI_BIN_FULL].first;
    // skip pages whose block size exceeds the large-object maximum
    while (page != NULL && mi_page_block_size(page) > MI_LARGE_OBJ_SIZE_MAX) {
      page = page->next;
    }
    if (page==NULL) {
      break;
    }
    mi_segment_t* segment = _mi_page_segment(page);
    mi_segment_force_abandon(segment, tld);
    mi_assert_internal(page != heap->pages[MI_BIN_FULL].first); // as it is just abandoned
  }
}

// try abandon segments.
// this should be called from `reclaim_or_alloc` so we know all segments are (about) fully in use.
// Does nothing when `segment_count_is_within_target` reports that the thread is
// within its target; otherwise abandons toward the target that call stored.
static void mi_segments_try_abandon(mi_heap_t* heap, mi_segments_tld_t* tld) {
  // we call this when we are about to add a fresh segment so we should be under our target segment count.
  size_t target = 0;
  if (segment_count_is_within_target(tld, &target)) return;
  mi_segments_try_abandon_to_target(heap, target, tld);
}

// Runs a forced collect and then abandons segments of the default heap toward
// `target_size / MI_SEGMENT_SIZE` segments. When that quotient is 0 the target is
// taken from the `mi_option_target_segments_per_thread` option, clamped to 1..1024.
void mi_collect_reduce(size_t target_size) mi_attr_noexcept {
  mi_collect(true);
  mi_heap_t* heap = mi_heap_get_default();
  mi_segments_tld_t* tld = &heap->tld->segments;
  size_t target = target_size / MI_SEGMENT_SIZE;
  if (target == 0) {
    target = (size_t)mi_option_get_clamp(mi_option_target_segments_per_thread, 1, 1024);
  }
  mi_segments_try_abandon_to_target(heap, target, tld);
}

/* -----------------------------------------------------------
   Reclaim or allocate
----------------------------------------------------------- */

// Obtain a segment for a page of `needed_slices` slices, either by reclaiming an
// abandoned segment or by allocating a fresh one.
// Returns NULL both when allocation fails and when a suitable page was reclaimed
// directly into the heap (the caller then finds that page in the heap's queues).
static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t needed_slices, size_t block_size, mi_segments_tld_t* tld)
{
  mi_assert_internal(block_size <= MI_LARGE_OBJ_SIZE_MAX);

  // try to abandon some segments to increase reuse between threads
  mi_segments_try_abandon(heap,tld);

  // 1. try to reclaim an abandoned segment
  bool reclaimed;
  mi_segment_t* segment = mi_segment_try_reclaim(heap, needed_slices, block_size, &reclaimed, tld);
  if (reclaimed) {
    // reclaimed the right page right into the heap
    mi_assert_internal(segment != NULL);
    return NULL; // pretend out-of-memory as the page will be in the page queue of the heap with available blocks
  }
  else if (segment != NULL) {
    // reclaimed a segment with a large enough empty span in it
    return segment;
  }
  // 2. otherwise allocate a fresh segment
  return mi_segment_alloc(0, 0, heap->arena_id, tld, NULL);
}


/* -----------------------------------------------------------
   Page allocation
----------------------------------------------------------- */

// Allocate a page of at least `required` bytes for `heap`.
// The size is rounded up to a multiple of MI_MEDIUM_PAGE_SIZE when `required`
// exceeds that size, and to a multiple of MI_SEGMENT_SLICE_SIZE otherwise.
// When no free span is found, a segment is reclaimed or allocated and the
// function calls itself again. Returns NULL when that step returns NULL.
static mi_page_t* mi_segments_page_alloc(mi_heap_t* heap, mi_page_kind_t page_kind, size_t required, size_t block_size, mi_segments_tld_t* tld)
{
  mi_assert_internal(required <= MI_LARGE_OBJ_SIZE_MAX && page_kind <= MI_PAGE_LARGE);

  // find a free page
  size_t page_size = _mi_align_up(required, (required > MI_MEDIUM_PAGE_SIZE ? MI_MEDIUM_PAGE_SIZE : MI_SEGMENT_SLICE_SIZE));
  size_t slices_needed = page_size / MI_SEGMENT_SLICE_SIZE;
  mi_assert_internal(slices_needed * MI_SEGMENT_SLICE_SIZE == page_size);
  mi_page_t* page = mi_segments_page_find_and_allocate(slices_needed, heap->arena_id, tld); //(required <= MI_SMALL_SIZE_MAX ? 0 : slices_needed), tld);
  if (page==NULL) {
    // no free page, allocate a new segment and try again
    if (mi_segment_reclaim_or_alloc(heap, slices_needed, block_size, tld) == NULL) {
      // OOM or reclaimed a good page in the heap
      return NULL;
    }
    else {
      // otherwise try again
      return mi_segments_page_alloc(heap, page_kind, required, block_size, tld);
    }
  }
  mi_assert_internal(page != NULL && page->slice_count*MI_SEGMENT_SLICE_SIZE == page_size);
  mi_assert_internal(_mi_ptr_segment(page)->thread_id == _mi_thread_id());
  mi_segment_try_purge(_mi_ptr_segment(page), false);
  return page;
}



/* -----------------------------------------------------------
   Huge page allocation
----------------------------------------------------------- */

// Allocate a dedicated segment holding a single huge page of at least `size` bytes.
// With a non-zero `page_alignment` and a decommit-capable segment, the unused
// prefix between the page start and the first aligned address is reset.
// Returns NULL when the segment (or its page) could not be allocated.
static mi_page_t* mi_segment_huge_page_alloc(size_t size, size_t page_alignment, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld)
{
  mi_page_t* page = NULL;
  mi_segment_t* segment = mi_segment_alloc(size,page_alignment,req_arena_id,tld,&page);
  if (segment == NULL || page==NULL) return NULL;
  mi_assert_internal(segment->used==1);
  mi_assert_internal(mi_page_block_size(page) >= size);
  #if MI_HUGE_PAGE_ABANDON
  segment->thread_id = 0; // huge segments are immediately abandoned
  #endif

  // for huge pages we initialize the block_size as we may
  // overallocate to accommodate large alignments.
  size_t psize;
  uint8_t* start = _mi_segment_page_start(segment, page, &psize);
  page->block_size = psize;
  mi_assert_internal(page->is_huge);

  // decommit the part of the prefix of a page that will not be used; this can be quite large (close to MI_SEGMENT_SIZE)
  if (page_alignment > 0 && segment->allow_decommit) {
    uint8_t* aligned_p = (uint8_t*)_mi_align_up((uintptr_t)start, page_alignment);
    mi_assert_internal(_mi_is_aligned(aligned_p, page_alignment));
    mi_assert_internal(psize - (aligned_p - start) >= size);
    uint8_t* decommit_start = start + sizeof(mi_block_t);              // for the free list
    // NOTE(review): this difference is negative when `aligned_p` is below
    // `decommit_start` (e.g. `start` already aligned); presumably the segment
    // layout rules that out -- confirm against `mi_segment_alloc`.
    ptrdiff_t decommit_size = aligned_p - decommit_start;
    _mi_os_reset(decommit_start, decommit_size);   // note: cannot use segment_decommit on huge segments
  }

  return page;
}

#if MI_HUGE_PAGE_ABANDON
// free huge block from another thread
// Claims the (abandoned) huge segment for the calling thread with a CAS on
// `thread_id`, pushes `block` on the page free list and frees the page.
void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) {
  // huge page segments are always abandoned and can be freed immediately by any thread
  mi_assert_internal(segment->kind==MI_SEGMENT_HUGE);
  mi_assert_internal(segment == _mi_page_segment(page));
  mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id)==0);

  // claim it and free
  mi_heap_t* heap = mi_heap_get_default(); // issue #221; don't use the internal get_default_heap as we need to ensure the thread is initialized.
  // paranoia: if this is the last reference, the cas should always succeed
  size_t expected_tid = 0;
  if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, &expected_tid, heap->thread_id)) {
    mi_block_set_next(page, block, page->free);
    page->free = block;
    page->used--;
    page->is_zero_init = false;
    mi_assert(page->used == 0);
    mi_tld_t* tld = heap->tld;
    _mi_segment_page_free(page, true, &tld->segments);
  }
#if (MI_DEBUG!=0)
  else {
    mi_assert_internal(false);
  }
#endif
}

#else
// reset memory of a huge block from another thread
// Resets the block's memory except its first `sizeof(mi_block_t)` bytes, and only
// when the segment allows decommit.
void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) {
  MI_UNUSED(page);
  mi_assert_internal(segment->kind == MI_SEGMENT_HUGE);
  mi_assert_internal(segment == _mi_page_segment(page));
  mi_assert_internal(page->used == 1); // this is called just before the free
  mi_assert_internal(page->free == NULL);
  if (segment->allow_decommit) {
    size_t csize = mi_usable_size(block);
    if (csize > sizeof(mi_block_t)) {
      csize = csize - sizeof(mi_block_t);
      uint8_t* p = (uint8_t*)block + sizeof(mi_block_t);
      _mi_os_reset(p, csize);  // note: cannot use segment_decommit on huge segments
    }
  }
}
#endif

/* -----------------------------------------------------------
   Page allocation and free
----------------------------------------------------------- */

// Allocate a page for blocks of `block_size` bytes.
// Dispatches on size: small, medium and large block sizes go through
// `mi_segments_page_alloc`; larger sizes, and any request whose `page_alignment`
// exceeds MI_BLOCK_ALIGNMENT_MAX, get a huge page in a segment of their own.
// Returns NULL when the chosen allocation path returns NULL.
mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld) {
  mi_page_t* page;
  if mi_unlikely(page_alignment > MI_BLOCK_ALIGNMENT_MAX) {
    mi_assert_internal(_mi_is_power_of_two(page_alignment));
    mi_assert_internal(page_alignment >= MI_SEGMENT_SIZE);
    // release builds: raise a too-small alignment to the segment size
    if (page_alignment < MI_SEGMENT_SIZE) { page_alignment = MI_SEGMENT_SIZE; }
    page = mi_segment_huge_page_alloc(block_size,page_alignment,heap->arena_id,tld);
  }
  else if (block_size <= MI_SMALL_OBJ_SIZE_MAX) {
    page = mi_segments_page_alloc(heap,MI_PAGE_SMALL,block_size,block_size,tld);
  }
  else if (block_size <= MI_MEDIUM_OBJ_SIZE_MAX) {
    page = mi_segments_page_alloc(heap,MI_PAGE_MEDIUM,MI_MEDIUM_PAGE_SIZE,block_size,tld);
  }
  else if (block_size <= MI_LARGE_OBJ_SIZE_MAX) {
    page = mi_segments_page_alloc(heap,MI_PAGE_LARGE,block_size,block_size,tld);
  }
  else {
    page = mi_segment_huge_page_alloc(block_size,page_alignment,heap->arena_id,tld);
  }
  mi_assert_internal(page == NULL || _mi_heap_memid_is_suitable(heap, _mi_page_segment(page)->memid));
  mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld));
  mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc);
  return page;
}


/* -----------------------------------------------------------
   Visit blocks in a segment (only used for abandoned segments)
----------------------------------------------------------- */

// Visit one page: the visitor is first called once for the area (with NULL heap
// and block), and then for every block when `visit_blocks` is set.
// Returns false as soon as the visitor returns false.
static bool mi_segment_visit_page(mi_page_t* page, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) {
  mi_heap_area_t area;
  _mi_heap_area_init(&area, page);
  if (!visitor(NULL, &area, NULL, area.block_size, arg)) return false;
  if (visit_blocks) {
    return _mi_heap_area_visit_blocks(&area, page, visitor, arg);
  }
  else {
    return true;
  }
}

// Visit every used page of `segment`; a negative `heap_tag` matches all pages,
// otherwise only pages whose `heap_tag` equals it are visited.
// Returns false when a visit was cut short by the visitor, true otherwise.
bool _mi_segment_visit_blocks(mi_segment_t* segment, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) {
  const mi_slice_t* end;
  mi_slice_t* slice = mi_slices_start_iterate(segment, &end);
  while (slice < end) {
    if (mi_slice_is_used(slice)) {
      mi_page_t* const page = mi_slice_to_page(slice);
      if (heap_tag < 0 || (int)page->heap_tag == heap_tag) {
        if (!mi_segment_visit_page(page, visit_blocks, visitor, arg)) return false;
      }
    }
    slice = slice + slice->slice_count;
  }
  return true;
}


================================================
FILE: third-party/mimalloc/src/static.c
================================================
/* ----------------------------------------------------------------------------
Copyright (c) 2018-2020, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license.
A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #ifndef _DEFAULT_SOURCE #define _DEFAULT_SOURCE #endif #if defined(__sun) // same remarks as os.c for the static's context. #undef _XOPEN_SOURCE #undef _POSIX_C_SOURCE #endif #include "mimalloc.h" #include "mimalloc/internal.h" // For a static override we create a single object file // containing the whole library. If it is linked first // it will override all the standard library allocation // functions (on Unix's). #include "alloc.c" // includes alloc-override.c #include "alloc-aligned.c" #include "alloc-posix.c" #include "arena.c" #include "bitmap.c" #include "heap.c" #include "init.c" #include "libc.c" #include "options.c" #include "os.c" #include "page.c" // includes page-queue.c #include "random.c" #include "segment.c" #include "segment-map.c" #include "stats.c" #include "prim/prim.c" #if MI_OSX_ZONE #include "prim/osx/alloc-override-zone.c" #endif ================================================ FILE: third-party/mimalloc/src/stats.c ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2021, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. 
-----------------------------------------------------------------------------*/ #include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/atomic.h" #include "mimalloc/prim.h" #include // memset #if defined(_MSC_VER) && (_MSC_VER < 1920) #pragma warning(disable:4204) // non-constant aggregate initializer #endif /* ----------------------------------------------------------- Statistics operations ----------------------------------------------------------- */ static bool mi_is_in_main(void* stat) { return ((uint8_t*)stat >= (uint8_t*)&_mi_stats_main && (uint8_t*)stat < ((uint8_t*)&_mi_stats_main + sizeof(mi_stats_t))); } static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) { if (amount == 0) return; if mi_unlikely(mi_is_in_main(stat)) { // add atomically (for abandoned pages) int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount); mi_atomic_maxi64_relaxed(&stat->peak, current + amount); if (amount > 0) { mi_atomic_addi64_relaxed(&stat->total,amount); } } else { // add thread local stat->current += amount; if (stat->current > stat->peak) { stat->peak = stat->current; } if (amount > 0) { stat->total += amount; } } } void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) { if (mi_is_in_main(stat)) { mi_atomic_addi64_relaxed( &stat->total, (int64_t)amount ); } else { stat->total += amount; } } void _mi_stat_increase(mi_stat_count_t* stat, size_t amount) { mi_stat_update(stat, (int64_t)amount); } void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) { mi_stat_update(stat, -((int64_t)amount)); } // must be thread safe as it is called from stats_merge static void mi_stat_count_add_mt(mi_stat_count_t* stat, const mi_stat_count_t* src) { if (stat==src) return; mi_atomic_void_addi64_relaxed(&stat->total, &src->total); mi_atomic_void_addi64_relaxed(&stat->current, &src->current); // peak scores do really not work across threads .. 
we just add them mi_atomic_void_addi64_relaxed( &stat->peak, &src->peak); // or, take the max? // mi_atomic_maxi64_relaxed(&stat->peak, src->peak); } static void mi_stat_counter_add_mt(mi_stat_counter_t* stat, const mi_stat_counter_t* src) { if (stat==src) return; mi_atomic_void_addi64_relaxed(&stat->total, &src->total); } #define MI_STAT_COUNT(stat) mi_stat_count_add_mt(&stats->stat, &src->stat); #define MI_STAT_COUNTER(stat) mi_stat_counter_add_mt(&stats->stat, &src->stat); // must be thread safe as it is called from stats_merge static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { if (stats==src) return; // copy all fields MI_STAT_FIELDS() #if MI_STAT>1 for (size_t i = 0; i <= MI_BIN_HUGE; i++) { mi_stat_count_add_mt(&stats->malloc_bins[i], &src->malloc_bins[i]); } #endif for (size_t i = 0; i <= MI_BIN_HUGE; i++) { mi_stat_count_add_mt(&stats->page_bins[i], &src->page_bins[i]); } } #undef MI_STAT_COUNT #undef MI_STAT_COUNTER /* ----------------------------------------------------------- Display statistics ----------------------------------------------------------- */ // unit > 0 : size in binary bytes // unit == 0: count as decimal // unit < 0 : count in binary static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void* arg, const char* fmt) { char buf[32]; buf[0] = 0; int len = 32; const char* suffix = (unit <= 0 ? " " : "B"); const int64_t base = (unit == 0 ? 1000 : 1024); if (unit>0) n *= unit; const int64_t pos = (n < 0 ? -n : n); if (pos < base) { if (n!=1 || suffix[0] != 'B') { // skip printing 1 B for the unit column _mi_snprintf(buf, len, "%lld %-3s", (long long)n, (n==0 ? 
"" : suffix)); } } else { int64_t divider = base; const char* magnitude = "K"; if (pos >= divider*base) { divider *= base; magnitude = "M"; } if (pos >= divider*base) { divider *= base; magnitude = "G"; } const int64_t tens = (n / (divider/10)); const long whole = (long)(tens/10); const long frac1 = (long)(tens%10); char unitdesc[8]; _mi_snprintf(unitdesc, 8, "%s%s%s", magnitude, (base==1024 ? "i" : ""), suffix); _mi_snprintf(buf, len, "%ld.%ld %-3s", whole, (frac1 < 0 ? -frac1 : frac1), unitdesc); } _mi_fprintf(out, arg, (fmt==NULL ? "%12s" : fmt), buf); } static void mi_print_amount(int64_t n, int64_t unit, mi_output_fun* out, void* arg) { mi_printf_amount(n,unit,out,arg,NULL); } static void mi_print_count(int64_t n, int64_t unit, mi_output_fun* out, void* arg) { if (unit==1) _mi_fprintf(out, arg, "%12s"," "); else mi_print_amount(n,0,out,arg); } static void mi_stat_print_ex(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg, const char* notok ) { _mi_fprintf(out, arg,"%10s:", msg); if (unit != 0) { if (unit > 0) { mi_print_amount(stat->peak, unit, out, arg); mi_print_amount(stat->total, unit, out, arg); // mi_print_amount(stat->freed, unit, out, arg); mi_print_amount(stat->current, unit, out, arg); mi_print_amount(unit, 1, out, arg); mi_print_count(stat->total, unit, out, arg); } else { mi_print_amount(stat->peak, -1, out, arg); mi_print_amount(stat->total, -1, out, arg); // mi_print_amount(stat->freed, -1, out, arg); mi_print_amount(stat->current, -1, out, arg); if (unit == -1) { _mi_fprintf(out, arg, "%24s", ""); } else { mi_print_amount(-unit, 1, out, arg); mi_print_count((stat->total / -unit), 0, out, arg); } } if (stat->current != 0) { _mi_fprintf(out, arg, " "); _mi_fprintf(out, arg, (notok == NULL ? 
"not all freed" : notok)); _mi_fprintf(out, arg, "\n"); } else { _mi_fprintf(out, arg, " ok\n"); } } else { mi_print_amount(stat->peak, 1, out, arg); mi_print_amount(stat->total, 1, out, arg); _mi_fprintf(out, arg, "%11s", " "); // no freed mi_print_amount(stat->current, 1, out, arg); _mi_fprintf(out, arg, "\n"); } } static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg) { mi_stat_print_ex(stat, msg, unit, out, arg, NULL); } static void mi_stat_peak_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg) { _mi_fprintf(out, arg, "%10s:", msg); mi_print_amount(stat->peak, unit, out, arg); _mi_fprintf(out, arg, "\n"); } static void mi_stat_counter_print(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg ) { _mi_fprintf(out, arg, "%10s:", msg); mi_print_amount(stat->total, -1, out, arg); _mi_fprintf(out, arg, "\n"); } static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg) { const int64_t avg_tens = (stat->total == 0 ? 
0 : (stat->total*10 / stat->total)); const long avg_whole = (long)(avg_tens/10); const long avg_frac1 = (long)(avg_tens%10); _mi_fprintf(out, arg, "%10s: %5ld.%ld avg\n", msg, avg_whole, avg_frac1); } static void mi_print_header(mi_output_fun* out, void* arg ) { _mi_fprintf(out, arg, "%10s: %11s %11s %11s %11s %11s\n", "heap stats", "peak ", "total ", "current ", "unit ", "total# "); } #if MI_STAT>1 static void mi_stats_print_bins(const mi_stat_count_t* bins, size_t max, const char* fmt, mi_output_fun* out, void* arg) { bool found = false; char buf[64]; for (size_t i = 0; i <= max; i++) { if (bins[i].total > 0) { found = true; int64_t unit = _mi_bin_size((uint8_t)i); _mi_snprintf(buf, 64, "%s %3lu", fmt, (long)i); mi_stat_print(&bins[i], buf, unit, out, arg); } } if (found) { _mi_fprintf(out, arg, "\n"); mi_print_header(out, arg); } } #endif //------------------------------------------------------------ // Use an output wrapper for line-buffered output // (which is nice when using loggers etc.) 
//------------------------------------------------------------ typedef struct buffered_s { mi_output_fun* out; // original output function void* arg; // and state char* buf; // local buffer of at least size `count+1` size_t used; // currently used chars `used <= count` size_t count; // total chars available for output } buffered_t; static void mi_buffered_flush(buffered_t* buf) { buf->buf[buf->used] = 0; _mi_fputs(buf->out, buf->arg, NULL, buf->buf); buf->used = 0; } static void mi_cdecl mi_buffered_out(const char* msg, void* arg) { buffered_t* buf = (buffered_t*)arg; if (msg==NULL || buf==NULL) return; for (const char* src = msg; *src != 0; src++) { char c = *src; if (buf->used >= buf->count) mi_buffered_flush(buf); mi_assert_internal(buf->used < buf->count); buf->buf[buf->used++] = c; if (c == '\n') mi_buffered_flush(buf); } } //------------------------------------------------------------ // Print statistics //------------------------------------------------------------ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_attr_noexcept { // wrap the output function to be line buffered char buf[256]; buffered_t buffer = { out0, arg0, NULL, 0, 255 }; buffer.buf = buf; mi_output_fun* out = &mi_buffered_out; void* arg = &buffer; // and print using that mi_print_header(out,arg); #if MI_STAT>1 mi_stats_print_bins(stats->malloc_bins, MI_BIN_HUGE, "normal",out,arg); #endif #if MI_STAT mi_stat_print(&stats->malloc_normal, "normal", (stats->malloc_normal_count.total == 0 ? 1 : -1), out, arg); // mi_stat_print(&stats->malloc_large, "large", (stats->malloc_large_count.total == 0 ? 1 : -1), out, arg); mi_stat_print(&stats->malloc_huge, "huge", (stats->malloc_huge_count.total == 0 ? 
1 : -1), out, arg); mi_stat_count_t total = { 0,0,0 }; mi_stat_count_add_mt(&total, &stats->malloc_normal); // mi_stat_count_add(&total, &stats->malloc_large); mi_stat_count_add_mt(&total, &stats->malloc_huge); mi_stat_print_ex(&total, "total", 1, out, arg, ""); #endif #if MI_STAT>1 mi_stat_print_ex(&stats->malloc_requested, "malloc req", 1, out, arg, ""); _mi_fprintf(out, arg, "\n"); #endif mi_stat_print_ex(&stats->reserved, "reserved", 1, out, arg, ""); mi_stat_print_ex(&stats->committed, "committed", 1, out, arg, ""); mi_stat_peak_print(&stats->reset, "reset", 1, out, arg ); mi_stat_peak_print(&stats->purged, "purged", 1, out, arg ); mi_stat_print_ex(&stats->page_committed, "touched", 1, out, arg, ""); mi_stat_print(&stats->segments, "segments", -1, out, arg); mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out, arg); mi_stat_print(&stats->segments_cache, "-cached", -1, out, arg); mi_stat_print(&stats->pages, "pages", -1, out, arg); mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out, arg); mi_stat_counter_print(&stats->pages_extended, "-extended", out, arg); mi_stat_counter_print(&stats->pages_retire, "-retire", out, arg); mi_stat_counter_print(&stats->arena_count, "arenas", out, arg); // mi_stat_counter_print(&stats->arena_crossover_count, "-crossover", out, arg); mi_stat_counter_print(&stats->arena_rollback_count, "-rollback", out, arg); mi_stat_counter_print(&stats->mmap_calls, "mmaps", out, arg); mi_stat_counter_print(&stats->commit_calls, "commits", out, arg); mi_stat_counter_print(&stats->reset_calls, "resets", out, arg); mi_stat_counter_print(&stats->purge_calls, "purges", out, arg); mi_stat_counter_print(&stats->malloc_guarded_count, "guarded", out, arg); mi_stat_print(&stats->threads, "threads", -1, out, arg); mi_stat_counter_print_avg(&stats->page_searches, "searches", out, arg); _mi_fprintf(out, arg, "%10s: %5zu\n", "numa nodes", _mi_os_numa_node_count()); size_t elapsed; size_t user_time; size_t sys_time; size_t current_rss; 
size_t peak_rss; size_t current_commit; size_t peak_commit; size_t page_faults; mi_process_info(&elapsed, &user_time, &sys_time, ¤t_rss, &peak_rss, ¤t_commit, &peak_commit, &page_faults); _mi_fprintf(out, arg, "%10s: %5ld.%03ld s\n", "elapsed", elapsed/1000, elapsed%1000); _mi_fprintf(out, arg, "%10s: user: %ld.%03ld s, system: %ld.%03ld s, faults: %lu, rss: ", "process", user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, (unsigned long)page_faults ); mi_printf_amount((int64_t)peak_rss, 1, out, arg, "%s"); if (peak_commit > 0) { _mi_fprintf(out, arg, ", commit: "); mi_printf_amount((int64_t)peak_commit, 1, out, arg, "%s"); } _mi_fprintf(out, arg, "\n"); } static mi_msecs_t mi_process_start; // = 0 static mi_stats_t* mi_stats_get_default(void) { mi_heap_t* heap = mi_heap_get_default(); return &heap->tld->stats; } static void mi_stats_merge_from(mi_stats_t* stats) { if (stats != &_mi_stats_main) { mi_stats_add(&_mi_stats_main, stats); memset(stats, 0, sizeof(mi_stats_t)); } } void mi_stats_reset(void) mi_attr_noexcept { mi_stats_t* stats = mi_stats_get_default(); if (stats != &_mi_stats_main) { memset(stats, 0, sizeof(mi_stats_t)); } memset(&_mi_stats_main, 0, sizeof(mi_stats_t)); if (mi_process_start == 0) { mi_process_start = _mi_clock_start(); }; } void mi_stats_merge(void) mi_attr_noexcept { mi_stats_merge_from( mi_stats_get_default() ); } void _mi_stats_done(mi_stats_t* stats) { // called from `mi_thread_done` mi_stats_merge_from(stats); } void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept { mi_stats_merge_from(mi_stats_get_default()); _mi_stats_print(&_mi_stats_main, out, arg); } void mi_stats_print(void* out) mi_attr_noexcept { // for compatibility there is an `out` parameter (which can be `stdout` or `stderr`) mi_stats_print_out((mi_output_fun*)out, NULL); } void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept { _mi_stats_print(mi_stats_get_default(), out, arg); } // 
---------------------------------------------------------------- // Basic timer for convenience; use milli-seconds to avoid doubles // ---------------------------------------------------------------- static mi_msecs_t mi_clock_diff; mi_msecs_t _mi_clock_now(void) { return _mi_prim_clock_now(); } mi_msecs_t _mi_clock_start(void) { if (mi_clock_diff == 0.0) { mi_msecs_t t0 = _mi_clock_now(); mi_clock_diff = _mi_clock_now() - t0; } return _mi_clock_now(); } mi_msecs_t _mi_clock_end(mi_msecs_t start) { mi_msecs_t end = _mi_clock_now(); return (end - start - mi_clock_diff); } // -------------------------------------------------------- // Basic process statistics // -------------------------------------------------------- mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept { mi_process_info_t pinfo; _mi_memzero_var(pinfo); pinfo.elapsed = _mi_clock_end(mi_process_start); pinfo.current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current)); pinfo.peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak)); pinfo.current_rss = pinfo.current_commit; pinfo.peak_rss = pinfo.peak_commit; pinfo.utime = 0; pinfo.stime = 0; pinfo.page_faults = 0; _mi_prim_process_info(&pinfo); if (elapsed_msecs!=NULL) *elapsed_msecs = (pinfo.elapsed < 0 ? 0 : (pinfo.elapsed < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.elapsed : PTRDIFF_MAX)); if (user_msecs!=NULL) *user_msecs = (pinfo.utime < 0 ? 0 : (pinfo.utime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.utime : PTRDIFF_MAX)); if (system_msecs!=NULL) *system_msecs = (pinfo.stime < 0 ? 0 : (pinfo.stime < (mi_msecs_t)PTRDIFF_MAX ? 
(size_t)pinfo.stime : PTRDIFF_MAX)); if (current_rss!=NULL) *current_rss = pinfo.current_rss; if (peak_rss!=NULL) *peak_rss = pinfo.peak_rss; if (current_commit!=NULL) *current_commit = pinfo.current_commit; if (peak_commit!=NULL) *peak_commit = pinfo.peak_commit; if (page_faults!=NULL) *page_faults = pinfo.page_faults; } // -------------------------------------------------------- // Return statistics // -------------------------------------------------------- void mi_stats_get(size_t stats_size, mi_stats_t* stats) mi_attr_noexcept { if (stats == NULL || stats_size == 0) return; _mi_memzero(stats, stats_size); const size_t size = (stats_size > sizeof(mi_stats_t) ? sizeof(mi_stats_t) : stats_size); _mi_memcpy(stats, &_mi_stats_main, size); stats->version = MI_STAT_VERSION; } // -------------------------------------------------------- // Statics in json format // -------------------------------------------------------- typedef struct mi_heap_buf_s { char* buf; size_t size; size_t used; bool can_realloc; } mi_heap_buf_t; static bool mi_heap_buf_expand(mi_heap_buf_t* hbuf) { if (hbuf==NULL) return false; if (hbuf->buf != NULL && hbuf->size>0) { hbuf->buf[hbuf->size-1] = 0; } if (hbuf->size > SIZE_MAX/2 || !hbuf->can_realloc) return false; const size_t newsize = (hbuf->size == 0 ? 
2*MI_KiB : 2*hbuf->size); char* const newbuf = (char*)mi_rezalloc(hbuf->buf, newsize); if (newbuf == NULL) return false; hbuf->buf = newbuf; hbuf->size = newsize; return true; } static void mi_heap_buf_print(mi_heap_buf_t* hbuf, const char* msg) { if (msg==NULL || hbuf==NULL) return; if (hbuf->used + 1 >= hbuf->size && !hbuf->can_realloc) return; for (const char* src = msg; *src != 0; src++) { char c = *src; if (hbuf->used + 1 >= hbuf->size) { if (!mi_heap_buf_expand(hbuf)) return; } mi_assert_internal(hbuf->used < hbuf->size); hbuf->buf[hbuf->used++] = c; } mi_assert_internal(hbuf->used < hbuf->size); hbuf->buf[hbuf->used] = 0; } static void mi_heap_buf_print_count_bin(mi_heap_buf_t* hbuf, const char* prefix, mi_stat_count_t* stat, size_t bin, bool add_comma) { const size_t binsize = _mi_bin_size(bin); const size_t pagesize = (binsize <= MI_SMALL_OBJ_SIZE_MAX ? MI_SMALL_PAGE_SIZE : (binsize <= MI_MEDIUM_OBJ_SIZE_MAX ? MI_MEDIUM_PAGE_SIZE : #if MI_LARGE_PAGE_SIZE (binsize <= MI_LARGE_OBJ_SIZE_MAX ? MI_LARGE_PAGE_SIZE : 0) #else 0 #endif )); char buf[128]; _mi_snprintf(buf, 128, "%s{ \"total\": %lld, \"peak\": %lld, \"current\": %lld, \"block_size\": %zu, \"page_size\": %zu }%s\n", prefix, stat->total, stat->peak, stat->current, binsize, pagesize, (add_comma ? "," : "")); buf[127] = 0; mi_heap_buf_print(hbuf, buf); } static void mi_heap_buf_print_count(mi_heap_buf_t* hbuf, const char* prefix, mi_stat_count_t* stat, bool add_comma) { char buf[128]; _mi_snprintf(buf, 128, "%s{ \"total\": %lld, \"peak\": %lld, \"current\": %lld }%s\n", prefix, stat->total, stat->peak, stat->current, (add_comma ? 
"," : "")); buf[127] = 0; mi_heap_buf_print(hbuf, buf); } static void mi_heap_buf_print_count_value(mi_heap_buf_t* hbuf, const char* name, mi_stat_count_t* stat) { char buf[128]; _mi_snprintf(buf, 128, " \"%s\": ", name); buf[127] = 0; mi_heap_buf_print(hbuf, buf); mi_heap_buf_print_count(hbuf, "", stat, true); } static void mi_heap_buf_print_value(mi_heap_buf_t* hbuf, const char* name, int64_t val) { char buf[128]; _mi_snprintf(buf, 128, " \"%s\": %lld,\n", name, val); buf[127] = 0; mi_heap_buf_print(hbuf, buf); } static void mi_heap_buf_print_size(mi_heap_buf_t* hbuf, const char* name, size_t val, bool add_comma) { char buf[128]; _mi_snprintf(buf, 128, " \"%s\": %zu%s\n", name, val, (add_comma ? "," : "")); buf[127] = 0; mi_heap_buf_print(hbuf, buf); } static void mi_heap_buf_print_counter_value(mi_heap_buf_t* hbuf, const char* name, mi_stat_counter_t* stat) { mi_heap_buf_print_value(hbuf, name, stat->total); } #define MI_STAT_COUNT(stat) mi_heap_buf_print_count_value(&hbuf, #stat, &stats->stat); #define MI_STAT_COUNTER(stat) mi_heap_buf_print_counter_value(&hbuf, #stat, &stats->stat); char* mi_stats_get_json(size_t output_size, char* output_buf) mi_attr_noexcept { mi_heap_buf_t hbuf = { NULL, 0, 0, true }; if (output_size > 0 && output_buf != NULL) { _mi_memzero(output_buf, output_size); hbuf.buf = output_buf; hbuf.size = output_size; hbuf.can_realloc = false; } else { if (!mi_heap_buf_expand(&hbuf)) return NULL; } mi_heap_buf_print(&hbuf, "{\n"); mi_heap_buf_print_value(&hbuf, "version", MI_STAT_VERSION); mi_heap_buf_print_value(&hbuf, "mimalloc_version", MI_MALLOC_VERSION); // process info mi_heap_buf_print(&hbuf, " \"process\": {\n"); size_t elapsed; size_t user_time; size_t sys_time; size_t current_rss; size_t peak_rss; size_t current_commit; size_t peak_commit; size_t page_faults; mi_process_info(&elapsed, &user_time, &sys_time, ¤t_rss, &peak_rss, ¤t_commit, &peak_commit, &page_faults); mi_heap_buf_print_size(&hbuf, "elapsed_msecs", elapsed, true); 
mi_heap_buf_print_size(&hbuf, "user_msecs", user_time, true); mi_heap_buf_print_size(&hbuf, "system_msecs", sys_time, true); mi_heap_buf_print_size(&hbuf, "page_faults", page_faults, true); mi_heap_buf_print_size(&hbuf, "rss_current", current_rss, true); mi_heap_buf_print_size(&hbuf, "rss_peak", peak_rss, true); mi_heap_buf_print_size(&hbuf, "commit_current", current_commit, true); mi_heap_buf_print_size(&hbuf, "commit_peak", peak_commit, false); mi_heap_buf_print(&hbuf, " },\n"); // statistics mi_stats_t* stats = &_mi_stats_main; MI_STAT_FIELDS() // size bins mi_heap_buf_print(&hbuf, " \"malloc_bins\": [\n"); for (size_t i = 0; i <= MI_BIN_HUGE; i++) { mi_heap_buf_print_count_bin(&hbuf, " ", &stats->malloc_bins[i], i, i!=MI_BIN_HUGE); } mi_heap_buf_print(&hbuf, " ],\n"); mi_heap_buf_print(&hbuf, " \"page_bins\": [\n"); for (size_t i = 0; i <= MI_BIN_HUGE; i++) { mi_heap_buf_print_count_bin(&hbuf, " ", &stats->page_bins[i], i, i!=MI_BIN_HUGE); } mi_heap_buf_print(&hbuf, " ]\n"); mi_heap_buf_print(&hbuf, "}\n"); return hbuf.buf; } ================================================ FILE: third-party/mimalloc/test/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.18) project(mimalloc-test C CXX) set(CMAKE_C_STANDARD 11) set(CMAKE_CXX_STANDARD 17) # Set default build type if (NOT CMAKE_BUILD_TYPE) if ("${CMAKE_BINARY_DIR}" MATCHES ".*(D|d)ebug$") message(STATUS "No build type selected, default to *** Debug ***") set(CMAKE_BUILD_TYPE "Debug") else() message(STATUS "No build type selected, default to *** Release ***") set(CMAKE_BUILD_TYPE "Release") endif() endif() # Import mimalloc (if installed) find_package(mimalloc 2.2 CONFIG REQUIRED) message(STATUS "Found mimalloc installed at: ${MIMALLOC_LIBRARY_DIR} (${MIMALLOC_VERSION_DIR})") # link with a dynamic shared library # use `LD_PRELOAD` to actually override malloc/free at runtime with mimalloc add_executable(dynamic-override main-override.c) 
target_link_libraries(dynamic-override PUBLIC mimalloc) add_executable(dynamic-override-cxx main-override.cpp) target_link_libraries(dynamic-override-cxx PUBLIC mimalloc) # overriding with a static object file works reliable as the symbols in the # object file have priority over those in library files add_executable(static-override-obj main-override.c ${MIMALLOC_OBJECT_DIR}/mimalloc${CMAKE_C_OUTPUT_EXTENSION}) target_include_directories(static-override-obj PUBLIC ${MIMALLOC_INCLUDE_DIR}) target_link_libraries(static-override-obj PUBLIC mimalloc-static) # overriding with a static library works too if using the `mimalloc-override.h` # header to redefine malloc/free. (the library already overrides new/delete) add_executable(static-override-static main-override-static.c) target_link_libraries(static-override-static PUBLIC mimalloc-static) # overriding with a static library: this may not work if the library is linked too late # on the command line after the C runtime library; but we cannot control that well in CMake add_executable(static-override main-override.c) target_link_libraries(static-override PUBLIC mimalloc-static) add_executable(static-override-cxx main-override.cpp) target_link_libraries(static-override-cxx PUBLIC mimalloc-static) ## test memory errors add_executable(test-wrong test-wrong.c) target_link_libraries(test-wrong PUBLIC mimalloc) ================================================ FILE: third-party/mimalloc/test/main-override-dep.cpp ================================================ // Issue #981: test overriding allocation in a DLL that is compiled independent of mimalloc. // This is imported by the `mimalloc-test-override` project. 
#include #include "main-override-dep.h" std::string TestAllocInDll::GetString() { char* test = new char[128]; memset(test, 0, 128); const char* t = "test"; memcpy(test, t, 4); std::string r = test; delete[] test; return r; } ================================================ FILE: third-party/mimalloc/test/main-override-dep.h ================================================ #pragma once // Issue #981: test overriding allocation in a DLL that is compiled independent of mimalloc. // This is imported by the `mimalloc-test-override` project. #include class TestAllocInDll { public: __declspec(dllexport) std::string GetString(); }; ================================================ FILE: third-party/mimalloc/test/main-override-static.c ================================================ #if _WIN32 #include #endif #include #include #include #include #include #include #include // redefines malloc etc. static void double_free1(); static void double_free2(); static void corrupt_free(); static void block_overflow1(); static void block_overflow2(); static void invalid_free(); static void test_aslr(void); static void test_process_info(void); static void test_reserved(void); static void negative_stat(void); static void alloc_huge(void); static void test_heap_walk(void); static void test_heap_arena(void); static void test_align(void); static void test_canary_leak(void); static void test_manage_os_memory(void); // static void test_large_pages(void); int main() { mi_version(); mi_stats_reset(); // mi_bins(); // test_manage_os_memory(); // test_large_pages(); // detect double frees and heap corruption // double_free1(); // double_free2(); // corrupt_free(); // block_overflow1(); // block_overflow2(); // test_canary_leak(); // test_aslr(); // invalid_free(); // test_reserved(); // negative_stat(); // test_heap_walk(); // alloc_huge(); // test_heap_walk(); // test_heap_arena(); // test_align(); void* p1 = malloc(78); void* p2 = malloc(24); free(p1); p1 = mi_malloc(8); char* s = 
strdup("hello\n"); free(p2); mi_heap_t* h = mi_heap_new(); mi_heap_set_default(h); p2 = malloc(16); p1 = realloc(p1, 32); free(p1); free(p2); free(s); /* now test if override worked by allocating/freeing across the api's*/ //p1 = mi_malloc(32); //free(p1); //p2 = malloc(32); //mi_free(p2); //mi_collect(true); //mi_stats_print(NULL); // test_process_info(); return 0; } static void test_align() { void* p = mi_malloc_aligned(256, 256); if (((uintptr_t)p % 256) != 0) { fprintf(stderr, "%p is not 256 alignend!\n", p); } } static void invalid_free() { free((void*)0xBADBEEF); realloc((void*)0xBADBEEF,10); } static void block_overflow1() { uint8_t* p = (uint8_t*)mi_malloc(17); p[18] = 0; free(p); } static void block_overflow2() { uint8_t* p = (uint8_t*)mi_malloc(16); p[17] = 0; free(p); } // The double free samples come ArcHeap [1] by Insu Yun (issue #161) // [1]: https://arxiv.org/pdf/1903.00503.pdf static void double_free1() { void* p[256]; //uintptr_t buf[256]; p[0] = mi_malloc(622616); p[1] = mi_malloc(655362); p[2] = mi_malloc(786432); mi_free(p[2]); // [VULN] Double free mi_free(p[2]); p[3] = mi_malloc(786456); // [BUG] Found overlap // p[3]=0x429b2ea2000 (size=917504), p[1]=0x429b2e42000 (size=786432) fprintf(stderr, "p3: %p-%p, p1: %p-%p, p2: %p\n", p[3], (uint8_t*)(p[3]) + 786456, p[1], (uint8_t*)(p[1]) + 655362, p[2]); } static void double_free2() { void* p[256]; //uintptr_t buf[256]; // [INFO] Command buffer: 0x327b2000 // [INFO] Input size: 182 p[0] = malloc(712352); p[1] = malloc(786432); free(p[0]); // [VULN] Double free free(p[0]); p[2] = malloc(786440); p[3] = malloc(917504); p[4] = malloc(786440); // [BUG] Found overlap // p[4]=0x433f1402000 (size=917504), p[1]=0x433f14c2000 (size=786432) fprintf(stderr, "p1: %p-%p, p2: %p-%p\n", p[4], (uint8_t*)(p[4]) + 917504, p[1], (uint8_t*)(p[1]) + 786432); } // Try to corrupt the heap through buffer overflow #define N 256 #define SZ 64 static void corrupt_free() { void* p[N]; // allocate for (int i = 0; i < N; i++) { 
p[i] = malloc(SZ); } // free some for (int i = 0; i < N; i += (N/10)) { free(p[i]); p[i] = NULL; } // try to corrupt the free list for (int i = 0; i < N; i++) { if (p[i] != NULL) { memset(p[i], 0, SZ+8); } } // allocate more.. trying to trigger an allocation from a corrupted entry // this may need many allocations to get there (if at all) for (int i = 0; i < 4096; i++) { malloc(SZ); } } static void test_aslr(void) { void* p[256]; p[0] = malloc(378200); p[1] = malloc(1134626); printf("p1: %p, p2: %p\n", p[0], p[1]); } static void test_process_info(void) { size_t elapsed = 0; size_t user_msecs = 0; size_t system_msecs = 0; size_t current_rss = 0; size_t peak_rss = 0; size_t current_commit = 0; size_t peak_commit = 0; size_t page_faults = 0; for (int i = 0; i < 100000; i++) { void* p = calloc(100,10); free(p); } mi_process_info(&elapsed, &user_msecs, &system_msecs, ¤t_rss, &peak_rss, ¤t_commit, &peak_commit, &page_faults); printf("\n\n*** process info: elapsed %3zd.%03zd s, user: %3zd.%03zd s, rss: %zd b, commit: %zd b\n\n", elapsed/1000, elapsed%1000, user_msecs/1000, user_msecs%1000, peak_rss, peak_commit); } static void test_reserved(void) { #define KiB 1024ULL #define MiB (KiB*KiB) #define GiB (MiB*KiB) mi_reserve_os_memory(4*GiB, false, true); void* p1 = malloc(100); void* p2 = malloc(100000); void* p3 = malloc(2*GiB); void* p4 = malloc(1*GiB + 100000); free(p1); free(p2); free(p3); p3 = malloc(1*GiB); free(p4); } static void negative_stat(void) { int* p = mi_malloc(60000); mi_stats_print_out(NULL, NULL); *p = 100; mi_free(p); mi_stats_print_out(NULL, NULL); } static void alloc_huge(void) { void* p = mi_malloc(67108872); mi_free(p); } static bool test_visit(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg) { if (block == NULL) { printf("visiting an area with blocks of size %zu (including padding)\n", area->full_block_size); } else { printf(" block of size %zu (allocated size is %zu)\n", block_size, 
mi_usable_size(block)); } return true; } static void test_heap_walk(void) { mi_heap_t* heap = mi_heap_new(); mi_heap_malloc(heap, 16*2097152); mi_heap_malloc(heap, 2067152); mi_heap_malloc(heap, 2097160); mi_heap_malloc(heap, 24576); mi_heap_visit_blocks(heap, true, &test_visit, NULL); } static void test_heap_arena(void) { mi_arena_id_t arena_id; int err = mi_reserve_os_memory_ex(100 * 1024 * 1024, false /* commit */, false /* allow large */, true /* exclusive */, &arena_id); if (err) abort(); mi_heap_t* heap = mi_heap_new_in_arena(arena_id); for (int i = 0; i < 500000; i++) { void* p = mi_heap_malloc(heap, 1024); if (p == NULL) { printf("out of memory after %d kb (expecting about 100_000kb)\n", i); break; } } } static void test_canary_leak(void) { char* p = mi_mallocn_tp(char,23); for(int i = 0; i < 23; i++) { p[i] = '0'+i; } puts(p); free(p); } #if _WIN32 static void test_manage_os_memory(void) { size_t size = 256 * 1024 * 1024; void* ptr = VirtualAlloc(NULL, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); mi_arena_id_t arena_id; mi_manage_os_memory_ex(ptr, size, true /* committed */, true /* pinned */, false /* is zero */, -1 /* numa node */, true /* exclusive */, &arena_id); mi_heap_t* cuda_heap = mi_heap_new_in_arena(arena_id); // you can do this in any thread // now allocate only in the cuda arena void* p1 = mi_heap_malloc(cuda_heap, 8); int* p2 = mi_heap_malloc_tp(cuda_heap, int); *p2 = 42; // and maybe set the cuda heap as the default heap? 
(but careful as now `malloc` will allocate in the cuda heap as well) { mi_heap_t* prev_default_heap = mi_heap_set_default(cuda_heap); void* p3 = mi_malloc(8); // allocate in the cuda heap mi_free(p3); } mi_free(p1); mi_free(p2); } #else static void test_manage_os_memory(void) { // empty } #endif // Experiment with huge OS pages #if 0 #include #include #include #include static void test_large_pages(void) { mi_memid_t memid; #if 0 size_t pages_reserved; size_t page_size; uint8_t* p = (uint8_t*)_mi_os_alloc_huge_os_pages(1, -1, 30000, &pages_reserved, &page_size, &memid); const size_t req_size = pages_reserved * page_size; #else const size_t req_size = 64*MI_MiB; uint8_t* p = (uint8_t*)_mi_os_alloc(req_size,&memid,NULL); #endif p[0] = 1; //_mi_os_protect(p, _mi_os_page_size()); //_mi_os_unprotect(p, _mi_os_page_size()); //_mi_os_decommit(p, _mi_os_page_size(), NULL); if (madvise(p, req_size, MADV_HUGEPAGE) == 0) { printf("advised huge pages\n"); _mi_os_decommit(p, _mi_os_page_size(), NULL); }; _mi_os_free(p, req_size, memid, NULL); } #endif // ---------------------------- // bin size experiments // ------------------------------ #if 0 #include #include #define MI_INTPTR_SIZE 8 #define MI_LARGE_WSIZE_MAX (4*1024*1024 / MI_INTPTR_SIZE) #define MI_BIN_HUGE 100 //#define MI_ALIGN2W // Bit scan reverse: return the index of the highest bit. 
static inline uint8_t mi_bsr32(uint32_t x); #if defined(_MSC_VER) //#include #include static inline uint8_t mi_bsr32(uint32_t x) { uint32_t idx; _BitScanReverse(&idx, x); return idx; } #elif defined(__GNUC__) || defined(__clang__) static inline uint8_t mi_bsr32(uint32_t x) { return (31 - __builtin_clz(x)); } #else static inline uint8_t mi_bsr32(uint32_t x) { // de Bruijn multiplication, see static const uint8_t debruijn[32] = { 31, 0, 22, 1, 28, 23, 18, 2, 29, 26, 24, 10, 19, 7, 3, 12, 30, 21, 27, 17, 25, 9, 6, 11, 20, 16, 8, 5, 15, 4, 14, 13, }; x |= x >> 1; x |= x >> 2; x |= x >> 4; x |= x >> 8; x |= x >> 16; x++; return debruijn[(x*0x076be629) >> 27]; } #endif // Bit scan reverse: return the index of the highest bit. uint8_t _mi_bsr(uintptr_t x) { if (x == 0) return 0; #if MI_INTPTR_SIZE==8 uint32_t hi = (x >> 32); return (hi == 0 ? mi_bsr32((uint32_t)x) : 32 + mi_bsr32(hi)); #elif MI_INTPTR_SIZE==4 return mi_bsr32(x); #else # error "define bsr for non-32 or 64-bit platforms" #endif } static inline size_t _mi_wsize_from_size(size_t size) { return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t); } // #define MI_ALIGN2W // Return the bin for a given field size. // Returns MI_BIN_HUGE if the size is too large. // We use `wsize` for the size in "machine word sizes", // i.e. byte size == `wsize*sizeof(void*)`. static inline size_t mi_bin(size_t wsize) { // size_t wsize = _mi_wsize_from_size(size); // size_t bin; /*if (wsize <= 1) { bin = 1; } */ #if defined(MI_ALIGN4W) if (wsize <= 4) { return (wsize <= 1 ? 1 : (wsize+1)&~1); // round to double word sizes } #elif defined(MI_ALIGN2W) if (wsize <= 8) { return (wsize <= 1 ? 1 : (wsize+1)&~1); // round to double word sizes } #else if (wsize <= 8) { return (wsize == 0 ? 
1 : wsize); } #endif else if (wsize > MI_LARGE_WSIZE_MAX) { return MI_BIN_HUGE; } else { #if defined(MI_ALIGN4W) if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes #endif wsize--; // find the highest bit const size_t b = _mi_bsr(wsize); // note: wsize != 0 // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation). // - adjust with 3 because we use do not round the first 8 sizes // which each get an exact bin const size_t bin = ((b << 2) + ((wsize >> (b - 2)) & 0x03)) - 3; assert(bin > 0 && bin < MI_BIN_HUGE); return bin; } } static inline uint8_t _mi_bin4(size_t size) { size_t wsize = _mi_wsize_from_size(size); uint8_t bin; if (wsize <= 1) { bin = 1; } #if defined(MI_ALIGN4W) else if (wsize <= 4) { bin = (uint8_t)((wsize+1)&~1); // round to double word sizes } #elif defined(MI_ALIGN2W) else if (wsize <= 8) { bin = (uint8_t)((wsize+1)&~1); // round to double word sizes } #else else if (wsize <= 8) { bin = (uint8_t)wsize; } #endif else if (wsize > MI_LARGE_WSIZE_MAX) { bin = MI_BIN_HUGE; } else { uint8_t b = mi_bsr32((uint32_t)wsize); bin = ((b << 1) + (uint8_t)((wsize >> (b - 1)) & 0x01)) + 3; } return bin; } static size_t _mi_binx4(size_t wsize) { size_t bin; if (wsize <= 1) { bin = 1; } else if (wsize <= 8) { // bin = (wsize+1)&~1; // round to double word sizes bin = (uint8_t)wsize; } else { uint8_t b = mi_bsr32((uint32_t)wsize); if (b <= 1) return wsize; bin = ((b << 1) | (wsize >> (b - 1))&0x01) + 3; } return bin; } static size_t _mi_binx8(size_t bsize) { if (bsize<=1) return bsize; uint8_t b = mi_bsr32((uint32_t)bsize); if (b <= 2) return bsize; size_t bin = ((b << 2) | (bsize >> (b - 2))&0x03) - 5; return bin; } static inline size_t mi_binx(size_t wsize) { uint8_t bin; if (wsize <= 1) { bin = 1; } else if (wsize <= 8) { // bin = (wsize+1)&~1; // round to double word sizes bin = (uint8_t)wsize; } else { wsize--; // find the highest bit uint8_t b = (uint8_t)mi_bsr32((uint32_t)wsize); // note: wsize != 0 // and use the 
top 3 bits to determine the bin (~12.5% worst internal fragmentation). // - adjust with 3 because we use do not round the first 8 sizes // which each get an exact bin bin = ((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03)) - 3; } return bin; } static void mi_bins(void) { //printf(" QNULL(1), /* 0 */ \\\n "); size_t last_bin = 0; for (size_t wsize = 1; wsize <= (4*1024*1024) / 8 + 1024; wsize++) { size_t bin = mi_bin(wsize); if (bin != last_bin) { //printf("min bsize: %6zd, max bsize: %6zd, bin: %6zd\n", min_wsize, last_wsize, last_bin); printf("QNULL(%6zd), ", wsize-1); if (last_bin%8 == 0) printf("/* %zu */ \\\n ", last_bin); last_bin = bin; } } } #endif ================================================ FILE: third-party/mimalloc/test/main-override.c ================================================ #include #include #include #include #include int main() { mi_version(); // ensure mimalloc library is linked void* p1 = malloc(78); void* p2 = malloc(24); free(p1); p1 = malloc(8); //char* s = strdup("hello\n"); free(p2); p2 = malloc(16); p1 = realloc(p1, 32); free(p1); free(p2); //free(s); //mi_collect(true); /* now test if override worked by allocating/freeing across the api's*/ //p1 = mi_malloc(32); //free(p1); //p2 = malloc(32); //mi_free(p2); p1 = malloc(24); p2 = reallocarray(p1, 16, 16); free(p2); p1 = malloc(24); assert(reallocarr(&p1, 16, 16) == 0); free(p1); mi_stats_print(NULL); return 0; } ================================================ FILE: third-party/mimalloc/test/main-override.cpp ================================================ #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _WIN32 #include #include static void msleep(unsigned long msecs) { Sleep(msecs); } #else #include static void msleep(unsigned long msecs) { usleep(msecs * 1000UL); } #endif static void heap_thread_free_large(); // issue #221 static void heap_no_delete(); // issue #202 static void heap_late_free(); // issue #204 
static void padding_shrink(); // issue #209 static void various_tests(); static void test_mt_shutdown(); static void large_alloc(void); // issue #363 static void fail_aslr(); // issue #372 static void tsan_numa_test(); // issue #414 static void strdup_test(); // issue #445 static void bench_alloc_large(void); // issue #xxx //static void test_large_migrate(void); // issue #691 static void heap_thread_free_huge(); static void test_std_string(); // issue #697 static void test_thread_local(); // issue #944 // static void test_mixed0(); // issue #942 static void test_mixed1(); // issue #942 static void test_stl_allocators(); #if x_WIN32 #include "main-override-dep.h" static void test_dep(); // issue #981: test overriding in another DLL #else static void test_dep() { }; #endif int main() { mi_stats_reset(); // ignore earlier allocations various_tests(); test_mixed1(); test_dep(); //test_std_string(); //test_thread_local(); // heap_thread_free_huge(); /* heap_thread_free_huge(); heap_thread_free_large(); heap_no_delete(); heap_late_free(); padding_shrink(); various_tests(); large_alloc(); tsan_numa_test(); strdup_test(); */ // test_stl_allocators(); // test_mt_shutdown(); // test_large_migrate(); //fail_aslr(); mi_stats_print(NULL); return 0; } static void* p = malloc(8); void free_p() { free(p); return; } class Test { private: int i; public: Test(int x) { i = x; } ~Test() { } }; static void various_tests() { atexit(free_p); void* p1 = malloc(78); void* p2 = mi_malloc_aligned(24, 16); free(p1); p1 = malloc(8); char* s = mi_strdup("hello\n"); mi_free(p2); p2 = malloc(16); p1 = realloc(p1, 32); free(p1); free(p2); mi_free(s); Test* t = new Test(42); delete t; t = new (std::nothrow) Test(42); delete t; auto tbuf = new unsigned char[sizeof(Test)]; t = new (tbuf) Test(42); t->~Test(); delete[] tbuf; #if _WIN32 const char* ptr = ::_Getdays(); // test _base overrid free((void*)ptr); #endif } class Static { private: void* p; public: Static() { p = malloc(64); return; } ~Static() 
{ free(p); return; } }; static Static s = Static(); static bool test_stl_allocator1() { std::vector > vec; vec.push_back(1); vec.pop_back(); return vec.size() == 0; } struct some_struct { int i; int j; double z; }; #if x_WIN32 static void test_dep() { TestAllocInDll t; std::string s = t.GetString(); } #endif static bool test_stl_allocator2() { std::vector > vec; vec.push_back(some_struct()); vec.pop_back(); return vec.size() == 0; } #if MI_HAS_HEAP_STL_ALLOCATOR static bool test_stl_allocator3() { std::vector > vec; vec.push_back(1); vec.pop_back(); return vec.size() == 0; } static bool test_stl_allocator4() { std::vector > vec; vec.push_back(some_struct()); vec.pop_back(); return vec.size() == 0; } static bool test_stl_allocator5() { std::vector > vec; vec.push_back(1); vec.pop_back(); return vec.size() == 0; } static bool test_stl_allocator6() { std::vector > vec; vec.push_back(some_struct()); vec.pop_back(); return vec.size() == 0; } #endif static void test_stl_allocators() { test_stl_allocator1(); test_stl_allocator2(); #if MI_HAS_HEAP_STL_ALLOCATOR test_stl_allocator3(); test_stl_allocator4(); test_stl_allocator5(); test_stl_allocator6(); #endif } #if 0 #include #include #include #include #include #include static void test_mixed0() { std::vector> numbers(1024 * 1024 * 100); std::vector threads(1); std::atomic index{}; auto start = std::chrono::system_clock::now(); for (auto& thread : threads) { thread = std::thread{[&index, &numbers]() { while (true) { auto i = index.fetch_add(1, std::memory_order_relaxed); if (i >= numbers.size()) return; numbers[i] = std::make_unique(i); } }}; } for (auto& thread : threads) thread.join(); auto end = std::chrono::system_clock::now(); auto duration = std::chrono::duration_cast(end - start); std::cout << "Running on " << threads.size() << " threads took " << duration << std::endl; } #endif void asd() { void* p = malloc(128); free(p); } static void test_mixed1() { std::thread thread(asd); thread.join(); } #if 0 // issue #691 
static char* cptr; static void* thread1_allocate() { cptr = mi_calloc_tp(char,22085632); return NULL; } static void* thread2_free() { assert(cptr); mi_free(cptr); cptr = NULL; return NULL; } static void test_large_migrate(void) { auto t1 = std::thread(thread1_allocate); t1.join(); auto t2 = std::thread(thread2_free); t2.join(); /* pthread_t thread1, thread2; pthread_create(&thread1, NULL, &thread1_allocate, NULL); pthread_join(thread1, NULL); pthread_create(&thread2, NULL, &thread2_free, NULL); pthread_join(thread2, NULL); */ return; } #endif // issue 445 static void strdup_test() { #ifdef _MSC_VER char* s = _strdup("hello\n"); char* buf = NULL; size_t len; _dupenv_s(&buf, &len, "MIMALLOC_VERBOSE"); mi_free(buf); mi_free(s); #endif } // Issue #202 static void heap_no_delete_worker() { mi_heap_t* heap = mi_heap_new(); void* q = mi_heap_malloc(heap, 1024); (void)(q); // mi_heap_delete(heap); // uncomment to prevent assertion } static void heap_no_delete() { auto t1 = std::thread(heap_no_delete_worker); t1.join(); } // Issue #697 static void test_std_string() { std::string path = "/Users/xxxx/Library/Developer/Xcode/DerivedData/xxxxxxxxxx/Build/Intermediates.noindex/xxxxxxxxxxx/arm64/XX_lto.o/0.arm64.lto.o"; std::string path1 = "/Users/xxxx/Library/Developer/Xcode/DerivedData/xxxxxxxxxx/Build/Intermediates.noindex/xxxxxxxxxxx/arm64/XX_lto.o/1.arm64.lto.o"; std::cout << path + "\n>>> " + path1 + "\n>>> " << std::endl; } // Issue #204 static volatile void* global_p; static void t1main() { mi_heap_t* heap = mi_heap_new(); global_p = mi_heap_malloc(heap, 1024); mi_heap_delete(heap); } static void heap_late_free() { auto t1 = std::thread(t1main); msleep(2000); assert(global_p); mi_free((void*)global_p); t1.join(); } // issue #209 static void* shared_p; static void alloc0(/* void* arg */) { shared_p = mi_malloc(8); } static void padding_shrink(void) { auto t1 = std::thread(alloc0); t1.join(); mi_free(shared_p); } // Issue #221 static void heap_thread_free_large_worker() { 
mi_free(shared_p); } static void heap_thread_free_large() { for (int i = 0; i < 100; i++) { shared_p = mi_malloc_aligned(2 * 1024 * 1024 + 1, 8); auto t1 = std::thread(heap_thread_free_large_worker); t1.join(); } } static void heap_thread_free_huge_worker() { mi_free(shared_p); } static void heap_thread_free_huge() { for (int i = 0; i < 100; i++) { shared_p = mi_malloc(1024 * 1024 * 1024); auto t1 = std::thread(heap_thread_free_huge_worker); t1.join(); } } static void test_mt_shutdown() { const int threads = 5; std::vector< std::future< std::vector< char* > > > ts; auto fn = [&]() { std::vector< char* > ps; ps.reserve(1000); for (int i = 0; i < 1000; i++) ps.emplace_back(new char[1]); return ps; }; for (int i = 0; i < threads; i++) ts.emplace_back(std::async(std::launch::async, fn)); for (auto& f : ts) for (auto& p : f.get()) delete[] p; std::cout << "done" << std::endl; } // issue #363 using namespace std; void large_alloc(void) { char* a = new char[1ull << 25]; thread th([&] { delete[] a; }); th.join(); } // issue #372 static void fail_aslr() { size_t sz = (size_t)(4ULL << 40); // 4TiB void* p = malloc(sz); printf("pointer p: %p: area up to %p\n", p, (uint8_t*)p + sz); *(int*)0x5FFFFFFF000 = 0; // should segfault } // issues #414 static void dummy_worker() { void* p = mi_malloc(0); mi_free(p); } static void tsan_numa_test() { auto t1 = std::thread(dummy_worker); dummy_worker(); t1.join(); } // issue #? 
#include #include #include static void bench_alloc_large(void) { static constexpr int kNumBuffers = 20; static constexpr size_t kMinBufferSize = 5 * 1024 * 1024; static constexpr size_t kMaxBufferSize = 25 * 1024 * 1024; std::unique_ptr buffers[kNumBuffers]; std::random_device rd; (void)rd; std::mt19937 gen(42); //rd()); std::uniform_int_distribution<> size_distribution(kMinBufferSize, kMaxBufferSize); std::uniform_int_distribution<> buf_number_distribution(0, kNumBuffers - 1); static constexpr int kNumIterations = 2000; const auto start = std::chrono::steady_clock::now(); for (int i = 0; i < kNumIterations; ++i) { int buffer_idx = buf_number_distribution(gen); size_t new_size = size_distribution(gen); buffers[buffer_idx] = std::make_unique(new_size); } const auto end = std::chrono::steady_clock::now(); const auto num_ms = std::chrono::duration_cast(end - start).count(); const auto us_per_allocation = std::chrono::duration_cast(end - start).count() / kNumIterations; std::cout << kNumIterations << " allocations Done in " << num_ms << "ms." 
<< std::endl; std::cout << "Avg " << us_per_allocation << " us per allocation" << std::endl; } class MTest { char *data; public: MTest() { data = (char*)malloc(1024); } ~MTest() { free(data); }; }; thread_local MTest tlVariable; void threadFun( int i ) { printf( "Thread %d\n", i ); std::this_thread::sleep_for( std::chrono::milliseconds(100) ); } void test_thread_local() { for( int i=1; i < 100; ++i ) { std::thread t( threadFun, i ); t.join(); mi_stats_print(NULL); } return; } ================================================ FILE: third-party/mimalloc/test/main.c ================================================ #include #include #include void test_heap(void* p_out) { mi_heap_t* heap = mi_heap_new(); void* p1 = mi_heap_malloc(heap,32); void* p2 = mi_heap_malloc(heap,48); mi_free(p_out); mi_heap_destroy(heap); //mi_heap_delete(heap); mi_free(p1); mi_free(p2); } void test_large() { const size_t N = 1000; for (size_t i = 0; i < N; ++i) { size_t sz = 1ull << 21; char* a = mi_mallocn_tp(char,sz); for (size_t k = 0; k < sz; k++) { a[k] = 'x'; } mi_free(a); } } int main() { void* p1 = mi_malloc(16); void* p2 = mi_malloc(1000000); mi_free(p1); mi_free(p2); p1 = mi_malloc(16); p2 = mi_malloc(16); mi_free(p1); mi_free(p2); test_heap(mi_malloc(32)); p1 = mi_malloc_aligned(64, 16); p2 = mi_malloc_aligned(160,24); mi_free(p2); mi_free(p1); //test_large(); mi_collect(true); mi_stats_print(NULL); return 0; } ================================================ FILE: third-party/mimalloc/test/readme.md ================================================ Testing allocators is difficult as bugs may only surface after particular allocation patterns. The main approach to testing _mimalloc_ is therefore to have extensive internal invariant checking (see `page_is_valid` in `page.c` for example), which is enabled in debug mode with `-DMI_DEBUG_FULL=ON`. 
The main testing strategy is then to run [`mimalloc-bench`][bench] using full invariant checking to catch any potential problems over a wide range of intensive allocation benchmarks and programs. However, this does not exercise the entire API surface, which is instead covered by `test-api.c` when running `make test` (from `out/debug` etc.). (This is not complete yet; please add to it.) The `main.c` and `main-override.c` programs are there to test whether building and overriding from a local install works, and are therefore built through a separate `test/CMakeLists.txt`. [bench]: https://github.com/daanx/mimalloc-bench ================================================ FILE: third-party/mimalloc/test/test-api-fill.c ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2020, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/ #include "mimalloc.h" #include "mimalloc/types.h" #include "testhelper.h" // --------------------------------------------------------------------------- // Helper functions // --------------------------------------------------------------------------- bool check_zero_init(uint8_t* p, size_t size); #if MI_DEBUG >= 2 bool check_debug_fill_uninit(uint8_t* p, size_t size); bool check_debug_fill_freed(uint8_t* p, size_t size); #endif // --------------------------------------------------------------------------- // Main testing // --------------------------------------------------------------------------- int main(void) { mi_option_disable(mi_option_verbose); // --------------------------------------------------- // Zeroing allocation // --------------------------------------------------- CHECK_BODY("zeroinit-zalloc-small") { size_t zalloc_size = MI_SMALL_SIZE_MAX / 2; uint8_t* p = (uint8_t*)mi_zalloc(zalloc_size); result = check_zero_init(p, zalloc_size); mi_free(p); }; CHECK_BODY("zeroinit-zalloc-large") { size_t zalloc_size = MI_SMALL_SIZE_MAX * 2; uint8_t* p = (uint8_t*)mi_zalloc(zalloc_size); result = check_zero_init(p, zalloc_size); mi_free(p); }; CHECK_BODY("zeroinit-zalloc_small") { size_t zalloc_size = MI_SMALL_SIZE_MAX / 2; uint8_t* p = (uint8_t*)mi_zalloc_small(zalloc_size); result = check_zero_init(p, zalloc_size); mi_free(p); }; CHECK_BODY("zeroinit-calloc-small") { size_t calloc_size = MI_SMALL_SIZE_MAX / 2; uint8_t* p = (uint8_t*)mi_calloc(calloc_size, 1); result = check_zero_init(p, calloc_size); mi_free(p); }; CHECK_BODY("zeroinit-calloc-large") { size_t calloc_size = MI_SMALL_SIZE_MAX * 2; uint8_t* p = (uint8_t*)mi_calloc(calloc_size, 1); result = check_zero_init(p, calloc_size); mi_free(p); }; CHECK_BODY("zeroinit-rezalloc-small") { size_t zalloc_size = MI_SMALL_SIZE_MAX / 2; uint8_t* p = (uint8_t*)mi_zalloc(zalloc_size); result = check_zero_init(p, zalloc_size); 
zalloc_size *= 3; p = (uint8_t*)mi_rezalloc(p, zalloc_size); result &= check_zero_init(p, zalloc_size); mi_free(p); }; CHECK_BODY("zeroinit-rezalloc-large") { size_t zalloc_size = MI_SMALL_SIZE_MAX * 2; uint8_t* p = (uint8_t*)mi_zalloc(zalloc_size); result = check_zero_init(p, zalloc_size); zalloc_size *= 3; p = (uint8_t*)mi_rezalloc(p, zalloc_size); result &= check_zero_init(p, zalloc_size); mi_free(p); }; CHECK_BODY("zeroinit-recalloc-small") { size_t calloc_size = MI_SMALL_SIZE_MAX / 2; uint8_t* p = (uint8_t*)mi_calloc(calloc_size, 1); result = check_zero_init(p, calloc_size); calloc_size *= 3; p = (uint8_t*)mi_recalloc(p, calloc_size, 1); result &= check_zero_init(p, calloc_size); mi_free(p); }; CHECK_BODY("zeroinit-recalloc-large") { size_t calloc_size = MI_SMALL_SIZE_MAX * 2; uint8_t* p = (uint8_t*)mi_calloc(calloc_size, 1); result = check_zero_init(p, calloc_size); calloc_size *= 3; p = (uint8_t*)mi_recalloc(p, calloc_size, 1); result &= check_zero_init(p, calloc_size); mi_free(p); }; // --------------------------------------------------- // Zeroing in aligned API // --------------------------------------------------- CHECK_BODY("zeroinit-zalloc_aligned-small") { size_t zalloc_size = MI_SMALL_SIZE_MAX / 2; uint8_t* p = (uint8_t*)mi_zalloc_aligned(zalloc_size, MI_MAX_ALIGN_SIZE * 2); result = check_zero_init(p, zalloc_size); mi_free(p); }; CHECK_BODY("zeroinit-zalloc_aligned-large") { size_t zalloc_size = MI_SMALL_SIZE_MAX * 2; uint8_t* p = (uint8_t*)mi_zalloc_aligned(zalloc_size, MI_MAX_ALIGN_SIZE * 2); result = check_zero_init(p, zalloc_size); mi_free(p); }; CHECK_BODY("zeroinit-calloc_aligned-small") { size_t calloc_size = MI_SMALL_SIZE_MAX / 2; uint8_t* p = (uint8_t*)mi_calloc_aligned(calloc_size, 1, MI_MAX_ALIGN_SIZE * 2); result = check_zero_init(p, calloc_size); mi_free(p); }; CHECK_BODY("zeroinit-calloc_aligned-large") { size_t calloc_size = MI_SMALL_SIZE_MAX * 2; uint8_t* p = (uint8_t*)mi_calloc_aligned(calloc_size, 1, MI_MAX_ALIGN_SIZE * 2); result 
= check_zero_init(p, calloc_size); mi_free(p); }; CHECK_BODY("zeroinit-rezalloc_aligned-small") { size_t zalloc_size = MI_SMALL_SIZE_MAX / 2; uint8_t* p = (uint8_t*)mi_zalloc_aligned(zalloc_size, MI_MAX_ALIGN_SIZE * 2); result = check_zero_init(p, zalloc_size); zalloc_size *= 3; p = (uint8_t*)mi_rezalloc_aligned(p, zalloc_size, MI_MAX_ALIGN_SIZE * 2); result &= check_zero_init(p, zalloc_size); mi_free(p); }; CHECK_BODY("zeroinit-rezalloc_aligned-large") { size_t zalloc_size = MI_SMALL_SIZE_MAX * 2; uint8_t* p = (uint8_t*)mi_zalloc_aligned(zalloc_size, MI_MAX_ALIGN_SIZE * 2); result = check_zero_init(p, zalloc_size); zalloc_size *= 3; p = (uint8_t*)mi_rezalloc_aligned(p, zalloc_size, MI_MAX_ALIGN_SIZE * 2); result &= check_zero_init(p, zalloc_size); mi_free(p); }; CHECK_BODY("zeroinit-recalloc_aligned-small") { size_t calloc_size = MI_SMALL_SIZE_MAX / 2; uint8_t* p = (uint8_t*)mi_calloc_aligned(calloc_size, 1, MI_MAX_ALIGN_SIZE * 2); result = check_zero_init(p, calloc_size); calloc_size *= 3; p = (uint8_t*)mi_recalloc_aligned(p, calloc_size, 1, MI_MAX_ALIGN_SIZE * 2); result &= check_zero_init(p, calloc_size); mi_free(p); }; CHECK_BODY("zeroinit-recalloc_aligned-large") { size_t calloc_size = MI_SMALL_SIZE_MAX * 2; uint8_t* p = (uint8_t*)mi_calloc_aligned(calloc_size, 1, MI_MAX_ALIGN_SIZE * 2); result = check_zero_init(p, calloc_size); calloc_size *= 3; p = (uint8_t*)mi_recalloc_aligned(p, calloc_size, 1, MI_MAX_ALIGN_SIZE * 2); result &= check_zero_init(p, calloc_size); mi_free(p); }; #if (MI_DEBUG >= 2) && !MI_TSAN // --------------------------------------------------- // Debug filling // --------------------------------------------------- CHECK_BODY("uninit-malloc-small") { size_t malloc_size = MI_SMALL_SIZE_MAX / 2; uint8_t* p = (uint8_t*)mi_malloc(malloc_size); result = check_debug_fill_uninit(p, malloc_size); mi_free(p); }; CHECK_BODY("uninit-malloc-large") { size_t malloc_size = MI_SMALL_SIZE_MAX * 2; uint8_t* p = (uint8_t*)mi_malloc(malloc_size); result = 
check_debug_fill_uninit(p, malloc_size); mi_free(p); }; CHECK_BODY("uninit-malloc_small") { size_t malloc_size = MI_SMALL_SIZE_MAX / 2; uint8_t* p = (uint8_t*)mi_malloc_small(malloc_size); result = check_debug_fill_uninit(p, malloc_size); mi_free(p); }; CHECK_BODY("uninit-realloc-small") { size_t malloc_size = MI_SMALL_SIZE_MAX / 2; uint8_t* p = (uint8_t*)mi_malloc(malloc_size); result = check_debug_fill_uninit(p, malloc_size); malloc_size *= 3; p = (uint8_t*)mi_realloc(p, malloc_size); result &= check_debug_fill_uninit(p, malloc_size); mi_free(p); }; CHECK_BODY("uninit-realloc-large") { size_t malloc_size = MI_SMALL_SIZE_MAX * 2; uint8_t* p = (uint8_t*)mi_malloc(malloc_size); result = check_debug_fill_uninit(p, malloc_size); malloc_size *= 3; p = (uint8_t*)mi_realloc(p, malloc_size); result &= check_debug_fill_uninit(p, malloc_size); mi_free(p); }; CHECK_BODY("uninit-mallocn-small") { size_t malloc_size = MI_SMALL_SIZE_MAX / 2; uint8_t* p = (uint8_t*)mi_mallocn(malloc_size, 1); result = check_debug_fill_uninit(p, malloc_size); mi_free(p); }; CHECK_BODY("uninit-mallocn-large") { size_t malloc_size = MI_SMALL_SIZE_MAX * 2; uint8_t* p = (uint8_t*)mi_mallocn(malloc_size, 1); result = check_debug_fill_uninit(p, malloc_size); mi_free(p); }; CHECK_BODY("uninit-reallocn-small") { size_t malloc_size = MI_SMALL_SIZE_MAX / 2; uint8_t* p = (uint8_t*)mi_mallocn(malloc_size, 1); result = check_debug_fill_uninit(p, malloc_size); malloc_size *= 3; p = (uint8_t*)mi_reallocn(p, malloc_size, 1); result &= check_debug_fill_uninit(p, malloc_size); mi_free(p); }; CHECK_BODY("uninit-reallocn-large") { size_t malloc_size = MI_SMALL_SIZE_MAX * 2; uint8_t* p = (uint8_t*)mi_mallocn(malloc_size, 1); result = check_debug_fill_uninit(p, malloc_size); malloc_size *= 3; p = (uint8_t*)mi_reallocn(p, malloc_size, 1); result &= check_debug_fill_uninit(p, malloc_size); mi_free(p); }; CHECK_BODY("uninit-malloc_aligned-small") { size_t malloc_size = MI_SMALL_SIZE_MAX / 2; uint8_t* p = 
(uint8_t*)mi_malloc_aligned(malloc_size, MI_MAX_ALIGN_SIZE * 2); result = check_debug_fill_uninit(p, malloc_size); mi_free(p); }; CHECK_BODY("uninit-malloc_aligned-large") { size_t malloc_size = MI_SMALL_SIZE_MAX * 2; uint8_t* p = (uint8_t*)mi_malloc_aligned(malloc_size, MI_MAX_ALIGN_SIZE * 2); result = check_debug_fill_uninit(p, malloc_size); mi_free(p); }; CHECK_BODY("uninit-realloc_aligned-small") { size_t malloc_size = MI_SMALL_SIZE_MAX / 2; uint8_t* p = (uint8_t*)mi_malloc_aligned(malloc_size, MI_MAX_ALIGN_SIZE * 2); result = check_debug_fill_uninit(p, malloc_size); malloc_size *= 3; p = (uint8_t*)mi_realloc_aligned(p, malloc_size, MI_MAX_ALIGN_SIZE * 2); result &= check_debug_fill_uninit(p, malloc_size); mi_free(p); }; CHECK_BODY("uninit-realloc_aligned-large") { size_t malloc_size = MI_SMALL_SIZE_MAX * 2; uint8_t* p = (uint8_t*)mi_malloc_aligned(malloc_size, MI_MAX_ALIGN_SIZE * 2); result = check_debug_fill_uninit(p, malloc_size); malloc_size *= 3; p = (uint8_t*)mi_realloc_aligned(p, malloc_size, MI_MAX_ALIGN_SIZE * 2); result &= check_debug_fill_uninit(p, malloc_size); mi_free(p); }; #if !(MI_TRACK_VALGRIND || MI_TRACK_ASAN || MI_GUARDED) CHECK_BODY("fill-freed-small") { size_t malloc_size = MI_SMALL_SIZE_MAX / 2; uint8_t* p = (uint8_t*)mi_malloc(malloc_size); mi_free(p); // First sizeof(void*) bytes will contain housekeeping data, skip these result = check_debug_fill_freed(p + sizeof(void*), malloc_size - sizeof(void*)); }; CHECK_BODY("fill-freed-large") { size_t malloc_size = MI_SMALL_SIZE_MAX * 2; uint8_t* p = (uint8_t*)mi_malloc(malloc_size); mi_free(p); // First sizeof(void*) bytes will contain housekeeping data, skip these result = check_debug_fill_freed(p + sizeof(void*), malloc_size - sizeof(void*)); }; #endif #endif // --------------------------------------------------- // Done // ---------------------------------------------------[] return print_test_summary(); } // --------------------------------------------------------------------------- // 
Helper functions // --------------------------------------------------------------------------- bool check_zero_init(uint8_t* p, size_t size) { if(!p) return false; bool result = true; for (size_t i = 0; i < size; ++i) { result &= p[i] == 0; } return result; } #if MI_DEBUG >= 2 bool check_debug_fill_uninit(uint8_t* p, size_t size) { #if MI_TRACK_VALGRIND || MI_TRACK_ASAN (void)p; (void)size; return true; // when compiled with valgrind we don't init on purpose #else if(!p) return false; bool result = true; for (size_t i = 0; i < size; ++i) { result &= p[i] == MI_DEBUG_UNINIT; } return result; #endif } bool check_debug_fill_freed(uint8_t* p, size_t size) { #if MI_TRACK_VALGRIND (void)p; (void)size; return true; // when compiled with valgrind we don't fill on purpose #else if(!p) return false; bool result = true; for (size_t i = 0; i < size; ++i) { result &= p[i] == MI_DEBUG_FREED; } return result; #endif } #endif ================================================ FILE: third-party/mimalloc/test/test-api.c ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2020, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic ignored "-Walloc-size-larger-than=" #endif /* Testing allocators is difficult as bugs may only surface after particular allocation patterns. The main approach to testing _mimalloc_ is therefore to have extensive internal invariant checking (see `page_is_valid` in `page.c` for example), which is enabled in debug mode with `-DMI_DEBUG_FULL=ON`. 
The main testing is then to run `mimalloc-bench` [1] using full invariant checking to catch any potential problems over a wide range of intensive allocation bench marks. However, this does not test well for the entire API surface. In this test file we therefore test the API over various inputs. Please add more tests :-) [1] https://github.com/daanx/mimalloc-bench */ #include #include #include #include #ifdef __cplusplus #include #endif #include "mimalloc.h" // #include "mimalloc/internal.h" #include "mimalloc/types.h" // for MI_DEBUG and MI_BLOCK_ALIGNMENT_MAX #include "testhelper.h" // --------------------------------------------------------------------------- // Test functions // --------------------------------------------------------------------------- bool test_heap1(void); bool test_heap2(void); bool test_stl_allocator1(void); bool test_stl_allocator2(void); bool test_stl_heap_allocator1(void); bool test_stl_heap_allocator2(void); bool test_stl_heap_allocator3(void); bool test_stl_heap_allocator4(void); bool mem_is_zero(uint8_t* p, size_t size) { if (p==NULL) return false; for (size_t i = 0; i < size; ++i) { if (p[i] != 0) return false; } return true; } // --------------------------------------------------------------------------- // Main testing // --------------------------------------------------------------------------- int main(void) { mi_option_disable(mi_option_verbose); CHECK_BODY("malloc-aligned9a") { // test large alignments void* p = mi_zalloc_aligned(1024 * 1024, 2); mi_free(p); p = mi_zalloc_aligned(1024 * 1024, 2); mi_free(p); result = true; }; // --------------------------------------------------- // Malloc // --------------------------------------------------- CHECK_BODY("malloc-zero") { void* p = mi_malloc(0); result = (p != NULL); mi_free(p); }; CHECK_BODY("malloc-nomem1") { result = (mi_malloc((size_t)PTRDIFF_MAX + (size_t)1) == NULL); }; CHECK_BODY("malloc-null") { mi_free(NULL); }; CHECK_BODY("calloc-overflow") { // use (size_t)&mi_calloc 
to get some number without triggering compiler warnings result = (mi_calloc((size_t)&mi_calloc,SIZE_MAX/1000) == NULL); }; CHECK_BODY("calloc0") { void* p = mi_calloc(0,1000); result = (mi_usable_size(p) <= 16); mi_free(p); }; CHECK_BODY("malloc-large") { // see PR #544. void* p = mi_malloc(67108872); mi_free(p); }; // --------------------------------------------------- // Extended // --------------------------------------------------- CHECK_BODY("posix_memalign1") { void* p = &p; int err = mi_posix_memalign(&p, sizeof(void*), 32); result = ((err==0 && (uintptr_t)p % sizeof(void*) == 0) || p==&p); mi_free(p); }; CHECK_BODY("posix_memalign_no_align") { void* p = &p; int err = mi_posix_memalign(&p, 3, 32); result = (err==EINVAL && p==&p); }; CHECK_BODY("posix_memalign_zero") { void* p = &p; int err = mi_posix_memalign(&p, sizeof(void*), 0); mi_free(p); result = (err==0); }; CHECK_BODY("posix_memalign_nopow2") { void* p = &p; int err = mi_posix_memalign(&p, 3*sizeof(void*), 32); result = (err==EINVAL && p==&p); }; CHECK_BODY("posix_memalign_nomem") { void* p = &p; int err = mi_posix_memalign(&p, sizeof(void*), SIZE_MAX); result = (err==ENOMEM && p==&p); }; // --------------------------------------------------- // Aligned API // --------------------------------------------------- CHECK_BODY("malloc-aligned1") { void* p = mi_malloc_aligned(32,32); result = (p != NULL && (uintptr_t)(p) % 32 == 0); mi_free(p); }; CHECK_BODY("malloc-aligned2") { void* p = mi_malloc_aligned(48,32); result = (p != NULL && (uintptr_t)(p) % 32 == 0); mi_free(p); }; CHECK_BODY("malloc-aligned3") { void* p1 = mi_malloc_aligned(48,32); bool result1 = (p1 != NULL && (uintptr_t)(p1) % 32 == 0); void* p2 = mi_malloc_aligned(48,32); bool result2 = (p2 != NULL && (uintptr_t)(p2) % 32 == 0); mi_free(p2); mi_free(p1); result = (result1&&result2); }; CHECK_BODY("malloc-aligned4") { void* p; bool ok = true; for (int i = 0; i < 8 && ok; i++) { p = mi_malloc_aligned(8, 16); ok = (p != NULL && (uintptr_t)(p) 
% 16 == 0); mi_free(p); } result = ok; }; CHECK_BODY("malloc-aligned5") { void* p = mi_malloc_aligned(4097,4096); size_t usable = mi_usable_size(p); result = (usable >= 4097 && usable < 16000); printf("malloc_aligned5: usable size: %zi\n", usable); mi_free(p); }; /* CHECK_BODY("malloc-aligned6") { bool ok = true; for (size_t align = 1; align <= MI_BLOCK_ALIGNMENT_MAX && ok; align *= 2) { void* ps[8]; for (int i = 0; i < 8 && ok; i++) { ps[i] = mi_malloc_aligned(align*13 // size , align); if (ps[i] == NULL || (uintptr_t)(ps[i]) % align != 0) { ok = false; } } for (int i = 0; i < 8 && ok; i++) { mi_free(ps[i]); } } result = ok; }; */ CHECK_BODY("malloc-aligned7") { void* p = mi_malloc_aligned(1024,MI_BLOCK_ALIGNMENT_MAX); mi_free(p); result = ((uintptr_t)p % MI_BLOCK_ALIGNMENT_MAX) == 0; }; CHECK_BODY("malloc-aligned8") { bool ok = true; for (int i = 0; i < 5 && ok; i++) { int n = (1 << i); void* p = mi_malloc_aligned(1024, n * MI_BLOCK_ALIGNMENT_MAX); ok = ((uintptr_t)p % (n*MI_BLOCK_ALIGNMENT_MAX)) == 0; mi_free(p); } result = ok; }; CHECK_BODY("malloc-aligned9") { // test large alignments bool ok = true; void* p[8]; size_t sizes[8] = { 8, 512, 1024 * 1024, MI_BLOCK_ALIGNMENT_MAX, MI_BLOCK_ALIGNMENT_MAX + 1, 2 * MI_BLOCK_ALIGNMENT_MAX, 8 * MI_BLOCK_ALIGNMENT_MAX, 0 }; for (int i = 0; i < 28 && ok; i++) { int align = (1 << i); for (int j = 0; j < 8 && ok; j++) { p[j] = mi_zalloc_aligned(sizes[j], align); ok = ((uintptr_t)p[j] % align) == 0; } for (int j = 0; j < 8; j++) { mi_free(p[j]); } } result = ok; }; CHECK_BODY("malloc-aligned10") { bool ok = true; void* p[10+1]; int align; int j; for(j = 0, align = 1; j <= 10 && ok; align *= 2, j++ ) { p[j] = mi_malloc_aligned(43 + align, align); ok = ((uintptr_t)p[j] % align) == 0; } for ( ; j > 0; j--) { mi_free(p[j-1]); } result = ok; } CHECK_BODY("malloc_aligned11") { mi_heap_t* heap = mi_heap_new(); void* p = mi_heap_malloc_aligned(heap, 33554426, 8); result = mi_heap_contains_block(heap, p); mi_heap_destroy(heap); } 
CHECK_BODY("mimalloc-aligned12") { void* p = mi_malloc_aligned(0x100, 0x100); result = (((uintptr_t)p % 0x100) == 0); // #602 mi_free(p); } CHECK_BODY("mimalloc-aligned13") { bool ok = true; for( size_t size = 1; size <= (MI_SMALL_SIZE_MAX * 2) && ok; size++ ) { for(size_t align = 1; align <= size && ok; align *= 2 ) { void* p[10]; for(int i = 0; i < 10 && ok; i++) { p[i] = mi_malloc_aligned(size,align);; ok = (p[i] != NULL && ((uintptr_t)(p[i]) % align) == 0); } for(int i = 0; i < 10 && ok; i++) { mi_free(p[i]); } /* if (ok && align <= size && ((size + MI_PADDING_SIZE) & (align-1)) == 0) { size_t bsize = mi_good_size(size); ok = (align <= bsize && (bsize & (align-1)) == 0); } */ } } result = ok; } CHECK_BODY("malloc-aligned-at1") { void* p = mi_malloc_aligned_at(48,32,0); result = (p != NULL && ((uintptr_t)(p) + 0) % 32 == 0); mi_free(p); }; CHECK_BODY("malloc-aligned-at2") { void* p = mi_malloc_aligned_at(50,32,8); result = (p != NULL && ((uintptr_t)(p) + 8) % 32 == 0); mi_free(p); }; CHECK_BODY("memalign1") { void* p; bool ok = true; for (int i = 0; i < 8 && ok; i++) { p = mi_memalign(16,8); ok = (p != NULL && (uintptr_t)(p) % 16 == 0); mi_free(p); } result = ok; }; CHECK_BODY("zalloc-aligned-small1") { size_t zalloc_size = MI_SMALL_SIZE_MAX / 2; uint8_t* p = (uint8_t*)mi_zalloc_aligned(zalloc_size, MI_MAX_ALIGN_SIZE * 2); result = mem_is_zero(p, zalloc_size); mi_free(p); }; CHECK_BODY("rezalloc_aligned-small1") { size_t zalloc_size = MI_SMALL_SIZE_MAX / 2; uint8_t* p = (uint8_t*)mi_zalloc_aligned(zalloc_size, MI_MAX_ALIGN_SIZE * 2); result = mem_is_zero(p, zalloc_size); zalloc_size *= 3; p = (uint8_t*)mi_rezalloc_aligned(p, zalloc_size, MI_MAX_ALIGN_SIZE * 2); result = result && mem_is_zero(p, zalloc_size); mi_free(p); }; // --------------------------------------------------- // Reallocation // --------------------------------------------------- CHECK_BODY("realloc-null") { void* p = mi_realloc(NULL,4); result = (p != NULL); mi_free(p); }; 
CHECK_BODY("realloc-null-sizezero") { void* p = mi_realloc(NULL,0); // "If ptr is NULL, the behavior is the same as calling malloc(new_size)." result = (p != NULL); mi_free(p); }; CHECK_BODY("realloc-sizezero") { void* p = mi_malloc(4); void* q = mi_realloc(p, 0); result = (q != NULL); mi_free(q); }; CHECK_BODY("reallocarray-null-sizezero") { void* p = mi_reallocarray(NULL,0,16); // issue #574 result = (p != NULL && errno == 0); mi_free(p); }; // --------------------------------------------------- // Heaps // --------------------------------------------------- CHECK("heap_destroy", test_heap1()); CHECK("heap_delete", test_heap2()); //mi_stats_print(NULL); // --------------------------------------------------- // various // --------------------------------------------------- #if !defined(MI_TRACK_ASAN) // realpath may leak with ASAN enabled (as the ASAN allocator intercepts it) CHECK_BODY("realpath") { char* s = mi_realpath( ".", NULL ); // printf("realpath: %s\n",s); mi_free(s); }; #endif CHECK("stl_allocator1", test_stl_allocator1()); CHECK("stl_allocator2", test_stl_allocator2()); CHECK("stl_heap_allocator1", test_stl_heap_allocator1()); CHECK("stl_heap_allocator2", test_stl_heap_allocator2()); CHECK("stl_heap_allocator3", test_stl_heap_allocator3()); CHECK("stl_heap_allocator4", test_stl_heap_allocator4()); // --------------------------------------------------- // Done // ---------------------------------------------------[] return print_test_summary(); } // --------------------------------------------------- // Larger test functions // --------------------------------------------------- bool test_heap1(void) { mi_heap_t* heap = mi_heap_new(); int* p1 = mi_heap_malloc_tp(heap,int); int* p2 = mi_heap_malloc_tp(heap,int); *p1 = *p2 = 43; mi_heap_destroy(heap); return true; } bool test_heap2(void) { mi_heap_t* heap = mi_heap_new(); int* p1 = mi_heap_malloc_tp(heap,int); int* p2 = mi_heap_malloc_tp(heap,int); mi_heap_delete(heap); *p1 = 42; mi_free(p1); mi_free(p2); 
return true; } bool test_stl_allocator1(void) { #ifdef __cplusplus std::vector > vec; vec.push_back(1); vec.pop_back(); return vec.size() == 0; #else return true; #endif } struct some_struct { int i; int j; double z; }; bool test_stl_allocator2(void) { #ifdef __cplusplus std::vector > vec; vec.push_back(some_struct()); vec.pop_back(); return vec.size() == 0; #else return true; #endif } bool test_stl_heap_allocator1(void) { #ifdef __cplusplus std::vector > vec; vec.push_back(some_struct()); vec.pop_back(); return vec.size() == 0; #else return true; #endif } bool test_stl_heap_allocator2(void) { #ifdef __cplusplus std::vector > vec; vec.push_back(some_struct()); vec.pop_back(); return vec.size() == 0; #else return true; #endif } bool test_stl_heap_allocator3(void) { #ifdef __cplusplus mi_heap_t* heap = mi_heap_new(); bool good = false; { mi_heap_stl_allocator myAlloc(heap); std::vector > vec(myAlloc); vec.push_back(some_struct()); vec.pop_back(); good = vec.size() == 0; } mi_heap_delete(heap); return good; #else return true; #endif } bool test_stl_heap_allocator4(void) { #ifdef __cplusplus mi_heap_t* heap = mi_heap_new(); bool good = false; { mi_heap_destroy_stl_allocator myAlloc(heap); std::vector > vec(myAlloc); vec.push_back(some_struct()); vec.pop_back(); good = vec.size() == 0; } mi_heap_destroy(heap); return good; #else return true; #endif } ================================================ FILE: third-party/mimalloc/test/test-stress.c ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2020 Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. -----------------------------------------------------------------------------*/ /* This is a stress test for the allocator, using multiple threads and transferring objects between threads. 
It tries to reflect real-world workloads: - allocation size is distributed linearly in powers of two - with some fraction extra large (and some very large) - the allocations are initialized and read again at free - pointers transfer between threads - threads are terminated and recreated with some objects surviving in between - uses deterministic "randomness", but execution can still depend on (random) thread scheduling. Do not use this test as a benchmark! */ #include #include #include #include #include #include // #define MI_GUARDED // #define USE_STD_MALLOC // > mimalloc-test-stress [THREADS] [SCALE] [ITER] // // argument defaults #if defined(MI_TSAN) // with thread-sanitizer reduce the threads to test within the azure pipeline limits static int THREADS = 8; static int SCALE = 25; static int ITER = 400; #elif defined(MI_UBSAN) // with undefined behavious sanitizer reduce parameters to stay within the azure pipeline limits static int THREADS = 8; static int SCALE = 25; static int ITER = 20; #elif defined(MI_GUARDED) // with debug guard pages reduce parameters to stay within the azure pipeline limits static int THREADS = 8; static int SCALE = 10; static int ITER = 10; #else static int THREADS = 32; // more repeatable if THREADS <= #processors static int SCALE = 50; // scaling factor static int ITER = 50; // N full iterations destructing and re-creating all threads #endif #define STRESS // undefine for leak test static bool allow_large_objects = false; // allow very large objects? (set to `true` if SCALE>100) static size_t use_one_size = 0; // use single object size of `N * sizeof(uintptr_t)`? 
static bool main_participates = false; // main thread participates as a worker too #ifdef USE_STD_MALLOC #define custom_calloc(n,s) calloc(n,s) #define custom_realloc(p,s) realloc(p,s) #define custom_free(p) free(p) #else #include #include #define custom_calloc(n,s) mi_calloc(n,s) #define custom_realloc(p,s) mi_realloc(p,s) #define custom_free(p) mi_free(p) #ifndef NDEBUG #define HEAP_WALK // walk the heap objects? #endif #endif // transfer pointer between threads #define TRANSFERS (1000) static volatile void* transfer[TRANSFERS]; #if (UINTPTR_MAX != UINT32_MAX) const uintptr_t cookie = 0xbf58476d1ce4e5b9UL; #else const uintptr_t cookie = 0x1ce4e5b9UL; #endif static void* atomic_exchange_ptr(volatile void** p, void* newval); typedef uintptr_t* random_t; static uintptr_t pick(random_t r) { uintptr_t x = *r; #if (UINTPTR_MAX > UINT32_MAX) // by Sebastiano Vigna, see: x ^= x >> 30; x *= 0xbf58476d1ce4e5b9UL; x ^= x >> 27; x *= 0x94d049bb133111ebUL; x ^= x >> 31; #else // by Chris Wellons, see: x ^= x >> 16; x *= 0x7feb352dUL; x ^= x >> 15; x *= 0x846ca68bUL; x ^= x >> 16; #endif *r = x; return x; } static bool chance(size_t perc, random_t r) { return (pick(r) % 100 <= perc); } static void* alloc_items(size_t items, random_t r) { if (chance(1, r)) { if (chance(1, r) && allow_large_objects) items *= 10000; // 0.01% giant else if (chance(10, r) && allow_large_objects) items *= 1000; // 0.1% huge else items *= 100; // 1% large objects; } if (items>=32 && items<=40) items*=2; // pthreads uses 320b allocations (this shows that more clearly in the stats) if (use_one_size > 0) items = (use_one_size / sizeof(uintptr_t)); if (items==0) items = 1; uintptr_t* p = (uintptr_t*)custom_calloc(items,sizeof(uintptr_t)); if (p != NULL) { for (uintptr_t i = 0; i < items; i++) { assert(p[i] == 0); p[i] = (items - i) ^ cookie; } } return p; } static void free_items(void* p) { if (p != NULL) { uintptr_t* q = (uintptr_t*)p; uintptr_t items = (q[0] ^ cookie); for (uintptr_t i = 0; i < items; 
i++) { if ((q[i] ^ cookie) != items - i) { fprintf(stderr, "memory corruption at block %p at %zu\n", p, i); abort(); } } } custom_free(p); } #ifdef HEAP_WALK static bool visit_blocks(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg) { (void)(heap); (void)(area); size_t* total = (size_t*)arg; if (block != NULL) { *total += block_size; } return true; } #endif static void stress(intptr_t tid) { //bench_start_thread(); uintptr_t r = ((tid + 1) * 43); // rand(); const size_t max_item_shift = 5; // 128 const size_t max_item_retained_shift = max_item_shift + 2; size_t allocs = 100 * ((size_t)SCALE) * (tid % 8 + 1); // some threads do more size_t retain = allocs / 2; void** data = NULL; size_t data_size = 0; size_t data_top = 0; void** retained = (void**)custom_calloc(retain,sizeof(void*)); size_t retain_top = 0; while (allocs > 0 || retain > 0) { if (retain == 0 || (chance(50, &r) && allocs > 0)) { // 50%+ alloc allocs--; if (data_top >= data_size) { data_size += 100000; data = (void**)custom_realloc(data, data_size * sizeof(void*)); } data[data_top++] = alloc_items(1ULL << (pick(&r) % max_item_shift), &r); } else { // 25% retain retained[retain_top++] = alloc_items( 1ULL << (pick(&r) % max_item_retained_shift), &r); retain--; } if (chance(66, &r) && data_top > 0) { // 66% free previous alloc size_t idx = pick(&r) % data_top; free_items(data[idx]); data[idx] = NULL; } if (chance(25, &r) && data_top > 0) { // 25% exchange a local pointer with the (shared) transfer buffer. 
size_t data_idx = pick(&r) % data_top; size_t transfer_idx = pick(&r) % TRANSFERS; void* p = data[data_idx]; void* q = atomic_exchange_ptr(&transfer[transfer_idx], p); data[data_idx] = q; } } #ifdef HEAP_WALK // walk the heap size_t total = 0; mi_heap_visit_blocks(mi_heap_get_default(), true, visit_blocks, &total); #endif // free everything that is left for (size_t i = 0; i < retain_top; i++) { free_items(retained[i]); } for (size_t i = 0; i < data_top; i++) { free_items(data[i]); } custom_free(retained); custom_free(data); //bench_end_thread(); } static void run_os_threads(size_t nthreads, void (*entry)(intptr_t tid)); static void test_stress(void) { uintptr_t r = rand(); for (int n = 0; n < ITER; n++) { run_os_threads(THREADS, &stress); #if !defined(NDEBUG) && !defined(USE_STD_MALLOC) // switch between arena and OS allocation for testing // mi_option_set_enabled(mi_option_disallow_arena_alloc, (n%2)==1); #endif #ifdef HEAP_WALK size_t total = 0; mi_abandoned_visit_blocks(mi_subproc_main(), -1, true, visit_blocks, &total); #endif for (int i = 0; i < TRANSFERS; i++) { if (chance(50, &r) || n + 1 == ITER) { // free all on last run, otherwise free half of the transfers void* p = atomic_exchange_ptr(&transfer[i], NULL); free_items(p); } } #ifndef NDEBUG //mi_collect(false); //mi_debug_show_arenas(true); #endif #if !defined(NDEBUG) || defined(MI_TSAN) if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); } #endif } } #ifndef STRESS static void leak(intptr_t tid) { uintptr_t r = rand(); void* p = alloc_items(1 /*pick(&r)%128*/, &r); if (chance(50, &r)) { intptr_t i = (pick(&r) % TRANSFERS); void* q = atomic_exchange_ptr(&transfer[i], p); free_items(q); } } static void test_leak(void) { for (int n = 0; n < ITER; n++) { run_os_threads(THREADS, &leak); mi_collect(false); #ifndef NDEBUG if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); } #endif } } #endif #if defined(USE_STD_MALLOC) && defined(MI_LINK_VERSION) #ifdef 
__cplusplus extern "C" #endif int mi_version(void); #endif int main(int argc, char** argv) { #ifdef MI_LINK_VERSION mi_version(); #endif #ifdef HEAP_WALK mi_option_enable(mi_option_visit_abandoned); #endif #if !defined(NDEBUG) && !defined(USE_STD_MALLOC) mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); #endif // > mimalloc-test-stress [THREADS] [SCALE] [ITER] if (argc >= 2) { char* end; long n = strtol(argv[1], &end, 10); if (n > 0) THREADS = n; } if (argc >= 3) { char* end; long n = (strtol(argv[2], &end, 10)); if (n > 0) SCALE = n; } if (argc >= 4) { char* end; long n = (strtol(argv[3], &end, 10)); if (n > 0) ITER = n; } if (SCALE > 100) { allow_large_objects = true; } printf("Using %d threads with a %d%% load-per-thread and %d iterations %s\n", THREADS, SCALE, ITER, (allow_large_objects ? "(allow large objects)" : "")); #if !defined(NDEBUG) && !defined(USE_STD_MALLOC) mi_stats_reset(); #endif //mi_reserve_os_memory(1024*1024*1024ULL, false, true); //int res = mi_reserve_huge_os_pages(4,1); //printf("(reserve huge: %i\n)", res); //bench_start_program(); // Run ITER full iterations where half the objects in the transfer buffer survive to the next round. 
srand(0x7feb352d); //mi_reserve_os_memory(512ULL << 20, true, true); #if !defined(NDEBUG) && !defined(USE_STD_MALLOC) mi_stats_reset(); #endif #ifdef STRESS test_stress(); #else test_leak(); #endif #ifndef USE_STD_MALLOC #ifndef NDEBUG mi_debug_show_arenas(); mi_collect(true); char* json = mi_stats_get_json(0, NULL); if (json != NULL) { fputs(json,stderr); mi_free(json); } #endif mi_stats_print(NULL); #endif //bench_end_program(); return 0; } static void (*thread_entry_fun)(intptr_t) = &stress; #ifdef _WIN32 #include static DWORD WINAPI thread_entry(LPVOID param) { thread_entry_fun((intptr_t)param); return 0; } static void run_os_threads(size_t nthreads, void (*fun)(intptr_t)) { thread_entry_fun = fun; DWORD* tids = (DWORD*)custom_calloc(nthreads,sizeof(DWORD)); HANDLE* thandles = (HANDLE*)custom_calloc(nthreads,sizeof(HANDLE)); const size_t start = (main_participates ? 1 : 0); for (size_t i = start; i < nthreads; i++) { thandles[i] = CreateThread(0, 8*1024, &thread_entry, (void*)(i), 0, &tids[i]); } if (main_participates) fun(0); // run the main thread as well for (size_t i = start; i < nthreads; i++) { WaitForSingleObject(thandles[i], INFINITE); } for (size_t i = start; i < nthreads; i++) { CloseHandle(thandles[i]); } custom_free(tids); custom_free(thandles); } static void* atomic_exchange_ptr(volatile void** p, void* newval) { #if (INTPTR_MAX == INT32_MAX) return (void*)InterlockedExchange((volatile LONG*)p, (LONG)newval); #else return (void*)InterlockedExchange64((volatile LONG64*)p, (LONG64)newval); #endif } #else #include static void* thread_entry(void* param) { thread_entry_fun((uintptr_t)param); return NULL; } static void run_os_threads(size_t nthreads, void (*fun)(intptr_t)) { thread_entry_fun = fun; pthread_t* threads = (pthread_t*)custom_calloc(nthreads,sizeof(pthread_t)); memset(threads, 0, sizeof(pthread_t) * nthreads); const size_t start = (main_participates ? 
1 : 0); //pthread_setconcurrency(nthreads); for (size_t i = start; i < nthreads; i++) { pthread_create(&threads[i], NULL, &thread_entry, (void*)i); } if (main_participates) fun(0); // run the main thread as well for (size_t i = start; i < nthreads; i++) { pthread_join(threads[i], NULL); } custom_free(threads); } #ifdef __cplusplus #include static void* atomic_exchange_ptr(volatile void** p, void* newval) { return std::atomic_exchange((volatile std::atomic*)p, newval); } #else #include static void* atomic_exchange_ptr(volatile void** p, void* newval) { return atomic_exchange((volatile _Atomic(void*)*)p, newval); } #endif #endif ================================================ FILE: third-party/mimalloc/test/test-wrong.c ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2020, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ /* test file for valgrind/asan support. VALGRIND: ---------- Compile in an "out/debug" folder: > cd out/debug > cmake ../.. -DMI_TRACK_VALGRIND=1 > make -j8 and then compile this file as: > gcc -g -o test-wrong -I../../include ../../test/test-wrong.c libmimalloc-valgrind-debug.a -lpthread and test as: > valgrind ./test-wrong ASAN ---------- Compile in an "out/debug" folder: > cd out/debug > cmake ../.. 
-DMI_TRACK_ASAN=1 > make -j8 and then compile this file as: > clang -g -o test-wrong -I../../include ../../test/test-wrong.c libmimalloc-asan-debug.a -lpthread -fsanitize=address -fsanitize-recover=address and test as: > ASAN_OPTIONS=verbosity=1:halt_on_error=0 ./test-wrong */ #include #include #include "mimalloc.h" #ifdef USE_STD_MALLOC # define mi(x) x #else # define mi(x) mi_##x #endif int main(int argc, char** argv) { int* p = (int*)mi(malloc)(3*sizeof(int)); int* r = (int*)mi_malloc_aligned(8,16); mi_free(r); // illegal byte wise read char* c = (char*)mi(malloc)(3); printf("invalid byte: over: %d, under: %d\n", c[4], c[-1]); mi(free)(c); // undefined access int* q = (int*)mi(malloc)(sizeof(int)); printf("undefined: %d\n", *q); // illegal int read printf("invalid: over: %d, under: %d\n", q[1], q[-1]); *q = 42; // buffer overflow q[1] = 43; // buffer underflow q[-1] = 44; mi(free)(q); // double free mi(free)(q); // use after free printf("use-after-free: %d\n", *q); // leak p // mi_free(p) return 0; } ================================================ FILE: third-party/mimalloc/test/testhelper.h ================================================ /* ---------------------------------------------------------------------------- Copyright (c) 2018-2020, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. 
-----------------------------------------------------------------------------*/ #ifndef TESTHELPER_H_ #define TESTHELPER_H_ #include #include #include // --------------------------------------------------------------------------- // Test macros: CHECK(name,predicate) and CHECK_BODY(name,body) // --------------------------------------------------------------------------- static int ok = 0; static int failed = 0; static bool check_result(bool result, const char* testname, const char* fname, long lineno) { if (!(result)) { failed++; fprintf(stderr,"\n FAILED: %s: %s:%ld\n", testname, fname, lineno); /* exit(1); */ } else { ok++; fprintf(stderr, "ok.\n"); } return true; } #define CHECK_BODY(name) \ fprintf(stderr,"test: %s... ", name ); \ errno = 0; \ for(bool done = false, result = true; !done; done = check_result(result,name,__FILE__,__LINE__)) #define CHECK(name,expr) CHECK_BODY(name){ result = (expr); } // Print summary of test. Return value can be directly use as a return value for main(). static inline int print_test_summary(void) { fprintf(stderr,"\n\n---------------------------------------------\n" "succeeded: %i\n" "failed : %i\n\n", ok, failed); return failed; } #endif // TESTHELPER_H_ ================================================ FILE: third-party/rust-demangle/.clang-format ================================================ # Attempt to mimic the Rust official style. 
BasedOnStyle: LLVM IndentWidth: 4 AlignAfterOpenBracket: BlockIndent ================================================ FILE: third-party/rust-demangle/.gitignore ================================================ target/ ================================================ FILE: third-party/rust-demangle/Cargo.toml ================================================ [workspace] members = ["test-harness"] default-members = ["test-harness"] ================================================ FILE: third-party/rust-demangle/LICENSE-APACHE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. 
"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 
END OF TERMS AND CONDITIONS ================================================ FILE: third-party/rust-demangle/LICENSE-MIT ================================================ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: third-party/rust-demangle/README.md ================================================ # `rust-demangle.c` This is a single-file C99 port of the official Rust symbol demangler ([`rustc-demangle`](https://github.com/rust-lang/rustc-demangle), a Rust library). ## Usecases This C port is intended for situations in which a Rust dependency is hard to justify, or effectively impossible (e.g. platform toolchains, that would be used *while building Rust*, not the other way around). If a Rust dependency is acceptable, [using `rustc-demangle` from C](https://github.com/rust-lang/rustc-demangle#usage-from-non-rust-languages) (or other languages via FFI) is possible, and may be preferred over this C port. 
## Status As this C port was originally (see the [History](#history) section) *only* for the `rustc-demangle` code demangling the (new at the time) [Rust RFC2603 (aka "`v0`") mangling scheme](https://rust-lang.github.io/rfcs/2603-rust-symbol-name-mangling-v0.html), it may lag behind `rustc-demangle` in functionality, for now. The current port status by category is: * **ported** `legacy` (pre-RFC2603 Rust symbols) demangling * `v0` ([RFC2603](https://rust-lang.github.io/rfcs/2603-rust-symbol-name-mangling-v0.html) Rust symbols) demangling * **ported** PRs: * [[#23] Support demangling the new Rust mangling scheme (v0).](https://github.com/rust-lang/rustc-demangle/pull/23) * [[#26] v0: allow identifiers to start with a digit.](https://github.com/rust-lang/rustc-demangle/pull/26) * [[#53] v0: replace `skip_*` methods with `print_*` methods in a "skip printing" mode.](https://github.com/rust-lang/rustc-demangle/pull/53) * arguably backported to Rust, as the C port always took this approach * symbol prefix flexibility (`__R` and `R`, instead of `_R`) * [[#39] Add support for `min_const_generics` constants](https://github.com/rust-lang/rustc-demangle/pull/39) * [[#40] Elide the type when the const value is a placeholder `p`](https://github.com/rust-lang/rustc-demangle/pull/40) * [[#55] v0: demangle structural constants and &str.](https://github.com/rust-lang/rustc-demangle/pull/55) (only usable in `const` generics on unstable Rust) * **(UNPORTED)** recursion limits * miscellaneous * **ported** PRs: * [[#30] v0: also support preserving extra suffixes found after mangled symbol.](https://github.com/rust-lang/rustc-demangle/pull/30) * **(UNPORTED)** output size limits Notable differences (intentionally) introduced by porting: * `rustc-demangle` can't use the heap (as it's `#![no_std]`), but the C port does * this is mainly dictated by the ergonomics of the `rust_demangle` API, which requires `malloc`/`realloc` to return a new C string allocation * if there is demand for it, 
`rust_demangle` support could be made optional, forcing heap-less users to always use `rust_demangle_with_callback` instead * a subtler consequence is that `rustc-demangle` uses a fixed-size buffer on the stack for punycode decoding, while the C port can allocate it on the heap * Unicode support is always handrolled in the C port, and often simplified ## Usage Get `rust-demangle.c` and `rust-demangle.h` (via `git submodule`, vendoring, etc.), add them to your project's build system (as C source, and include path, respectively), then you can call `rust_demangle` with a symbol and some flags, e.g.: ```c #include <rust-demangle.h> #include <stdio.h> int main() { const char *sym = "_RNvNtCsbmNqQUJIY6D_4core3foo3bar"; printf("demangle(%s) = %s\n", sym, rust_demangle(sym, 0)); printf( "demangle(%s, VERBOSE) = %s\n", sym, rust_demangle(sym, RUST_DEMANGLE_FLAG_VERBOSE) ); } ``` which prints out, when run: ``` demangle(_RNvNtCsbmNqQUJIY6D_4core3foo3bar) = core::foo::bar demangle(_RNvNtCsbmNqQUJIY6D_4core3foo3bar, VERBOSE) = core[846817f741e54dfd]::foo::bar ``` Note that the example leaks the returned C strings; ideally you would `free` them. ### Advanced usage (callback-based API) If you want to avoid the cost of allocating the output in memory, you can also use `rust_demangle_with_callback`, which takes a "printing" callback instead, e.g.: ```c #include <rust-demangle.h> #include <stdio.h> static void fwrite_callback(const char *data, size_t len, void *opaque) { fwrite(data, len, 1, (FILE *)opaque); } int main() { const char *sym = "_RNvNtCsbmNqQUJIY6D_4core3foo3bar"; printf("demangle(%s) = ", sym); rust_demangle_with_callback(sym, 0, fwrite_callback, stdout); printf("\n"); printf("demangle(%s, VERBOSE) = ", sym); rust_demangle_with_callback( sym, RUST_DEMANGLE_FLAG_VERBOSE, fwrite_callback, stdout ); printf("\n"); } ``` (with identical output to the simpler example) ## Testing `cargo test` will run built-in tests - it's implemented in Rust (in `test-harness`) so that it can depend on `rustc-demangle` itself for comparisons.
Additionally, `cargo run -q --release --example check-csv-dataset path/to/syms/*.csv` can be used to provide CSV files with additional mangled symbols test data, but such datasets aren't trivial to obtain (existing ones required building `rust-lang/rust` with a compiler patch that reacts to a custom environment variable). They're also quite large (~1GiB uncompressed) so none have been published anywhere yet. ## History This C port was started while the [Rust RFC2603 (aka "`v0`") mangling scheme](https://rust-lang.github.io/rfcs/2603-rust-symbol-name-mangling-v0.html) was still being developed, with the intent of upstreaming it into `libiberty` (which provides demangling for `binutils`, `gdb`, Linux `perf`, etc.) and other projects (e.g. `valgrind`) - you can see some of that upstreaming history [on the `v0` tracking issue](https://github.com/rust-lang/rust/issues/60705). At the time, the expectation was that most projects could either depend on `libiberty`, or vendor a copy of its code, so the C port focused on upstreaming to it, rather than producing an independent reusable C codebase. That meant that instead of a `git` repository, the [code revisions were only tracked by a gist](https://gist.github.com/eddyb/c41a69378750a433767cf53fe2316768/revisions), and the GNU code style was followed (including C89 comments and variable declarations). However, the LGPL license of `libiberty` turned out to be a problem for adoption, compared to the typical MIT/Apache-2.0 dual licensing of Rust projects. 
### The `rust-demangle.c` fork This repository started out as a fork of [the original gist](https://gist.github.com/eddyb/c41a69378750a433767cf53fe2316768/revisions), at commit [`e2c30407516a87c0f8c3820cf152640bd08805dd`](https://github.com/LykenSol/rust-demangle.c/commit/e2c30407516a87c0f8c3820cf152640bd08805dd), *just before `libiberty` integration* (which was in commit [`0e6f57b0e86ccec4395f8850f4885b1e391a9f4b`](https://gist.github.com/eddyb/c41a69378750a433767cf53fe2316768/0e6f57b0e86ccec4395f8850f4885b1e391a9f4b)). Any changes since that gist are either fresh C ports of the Rust `rustc-demangle` code, or completely new code, in order to maintain the [MIT/Apache-2.0 dual licensing](#license). While this has the disadvantage of starting behind `libiberty` (which kept its Rust `legacy` demangler, and also got a few more features during/since upstreaming), the relationship may reverse eventually, where this port could get new features that would then have to be upstreamed into `libiberty`. ## License [Like `rustc-demangle`](https://github.com/rust-lang/rustc-demangle#license), this project is licensed under either of * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) at your option. ### Contribution Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in `rust-demangle.c` by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions. ================================================ FILE: third-party/rust-demangle/rust-demangle.c ================================================ // FIXME(eddyb) should this use `<rust-demangle.h>`?
#include "rust-demangle.h"

#include <inttypes.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/// State shared by every parsing, printing and demangling routine below.
struct rust_demangler {
    // The mangled symbol being read, and its length in bytes.
    const char *sym;
    size_t sym_len;

    // Output sink: `callback` receives every printed fragment together with
    // the caller-supplied `callback_opaque` pointer.
    void *callback_opaque;
    void (*callback)(const char *data, size_t len, void *opaque);

    // Position of the next character to read from the symbol.
    size_t next;

    // `true` if any error occurred.
    bool errored;

    // `true` if nothing should be printed.
    bool skipping_printing;

    // `true` if printing should be verbose (e.g. include hashes).
    bool verbose;

    // Rust mangling version, with legacy mangling being -1.
    int version;

    // Number of late-bound lifetimes currently in scope
    // (incremented by `demangle_binder`, restored by its callers).
    uint64_t bound_lifetime_depth;
};

// Flag the demangler as failed, then run `x` (typically a `return` or `goto`).
// Requires a `struct rust_demangler *rdm` in scope.
#define ERROR_AND(x)                                                           \
    do {                                                                       \
        rdm->errored = true;                                                   \
        x;                                                                     \
    } while (0)

// Like `ERROR_AND(x)`, but only when `cond` does not hold.
#define CHECK_OR(cond, x)                                                      \
    do {                                                                       \
        if (!(cond))                                                           \
            ERROR_AND(x);                                                      \
    } while (0)

// FIXME(eddyb) consider renaming these to not start with `IS` (UB?).
#define IS_DIGIT(c) ((c) >= '0' && (c) <= '9')
#define IS_UPPER(c) ((c) >= 'A' && (c) <= 'Z')
#define IS_LOWER(c) ((c) >= 'a' && (c) <= 'z')

// Parsing functions.

/// Return the next unread character without consuming it, or 0 at the end
/// of the symbol.
static char peek(const struct rust_demangler *rdm) {
    if (rdm->next < rdm->sym_len)
        return rdm->sym[rdm->next];
    return 0;
}

/// Consume the next character if (and only if) it equals `c`.
static bool eat(struct rust_demangler *rdm, char c) {
    if (peek(rdm) == c) {
        rdm->next++;
        return true;
    } else
        return false;
}

/// Consume and return the next character. At the end of the symbol this
/// flags an error and returns 0 without advancing.
static char next(struct rust_demangler *rdm) {
    char c = peek(rdm);
    CHECK_OR(c, return 0);
    rdm->next++;
    return c;
}

/// A run of lowercase hex digits inside the symbol (not NUL-terminated).
struct hex_nibbles {
    const char *nibbles;
    size_t nibbles_len;
};

/// Parse lowercase hex digits up to (and including) the terminating `_`.
/// On error the returned run is `{NULL, 0}` and `rdm->errored` is set.
static struct hex_nibbles parse_hex_nibbles(struct rust_demangler *rdm) {
    struct hex_nibbles hex;
    hex.nibbles = NULL;
    hex.nibbles_len = 0;

    size_t start = rdm->next, hex_len = 0;
    while (!eat(rdm, '_')) {
        char c = next(rdm);
        CHECK_OR(IS_DIGIT(c) || (c >= 'a' && c <= 'f'), return hex);
        hex_len++;
    }

    hex.nibbles = rdm->sym + start;
    hex.nibbles_len = hex_len;
    return hex;
}

/// Like `parse_hex_nibbles`, with leading `0` digits removed from the result.
static struct hex_nibbles
parse_hex_nibbles_for_const_uint(struct rust_demangler *rdm) {
    struct hex_nibbles hex = parse_hex_nibbles(rdm);
    CHECK_OR(!rdm->errored, return hex);

    // Trim leading `0`s.
    while (hex.nibbles_len > 0 && *hex.nibbles == '0') {
        hex.nibbles++;
        hex.nibbles_len--;
    }

    return hex;
}

/// Like `parse_hex_nibbles`, but flags an error unless the digit count is
/// even (i.e. the digits form whole bytes).
static struct hex_nibbles
parse_hex_nibbles_for_const_bytes(struct rust_demangler *rdm) {
    struct hex_nibbles hex = parse_hex_nibbles(rdm);
    CHECK_OR(!rdm->errored && (hex.nibbles_len % 2 == 0), return hex);
    return hex;
}

/// Value of one hex digit, which must be in `0-9` or `a-f`.
static uint8_t decode_hex_nibble(char nibble) {
    return nibble >= 'a' ? 10 + (nibble - 'a') : nibble - '0';
}

/// Parse a `_`-terminated base-62 number (digits `0-9`, `a-z`, `A-Z`).
/// A lone `_` encodes 0; otherwise the result is the digit value plus one.
/// Flags an error (and returns 0) on an invalid digit, on running out of
/// input, or when the value does not fit in 64 bits.
static uint64_t parse_integer_62(struct rust_demangler *rdm) {
    if (eat(rdm, '_'))
        return 0;

    uint64_t x = 0;
    while (!eat(rdm, '_')) {
        char c = next(rdm);
        uint64_t digit;
        if (IS_DIGIT(c))
            digit = c - '0';
        else if (IS_LOWER(c))
            digit = 10 + (c - 'a');
        else if (IS_UPPER(c))
            digit = 10 + 26 + (c - 'A');
        else
            ERROR_AND(return 0);

        // Reject overlong numbers instead of silently wrapping around.
        CHECK_OR(x <= (UINT64_MAX - digit) / 62, return 0);
        x = x * 62 + digit;
    }
    CHECK_OR(x < UINT64_MAX, return 0);
    return x + 1;
}

/// Parse an optional base-62 number introduced by `tag`.
/// Returns 0 when the tag is absent, otherwise the parsed number plus one.
static uint64_t parse_opt_integer_62(struct rust_demangler *rdm, char tag) {
    if (!eat(rdm, tag))
        return 0;

    uint64_t x = parse_integer_62(rdm);
    // Same overflow policy as `parse_integer_62`.
    CHECK_OR(x < UINT64_MAX, return 0);
    return x + 1;
}

/// Parse an optional `s`-tagged disambiguator (0 when absent).
static uint64_t parse_disambiguator(struct rust_demangler *rdm) {
    return parse_opt_integer_62(rdm, 's');
}

struct rust_mangled_ident {
    // ASCII part of the identifier.
    const char *ascii;
    size_t ascii_len;

    // Punycode insertion codes for Unicode codepoints, if any.
    const char *punycode;
    size_t punycode_len;
};

/// Parse a length-prefixed identifier: an optional `u` (punycode marker, v0
/// only), a decimal byte length, an optional `_` separator (v0 only), then
/// that many bytes. The returned pointers borrow from `rdm->sym`.
/// On error `rdm->errored` is set and the partially filled ident is returned.
static struct rust_mangled_ident parse_ident(struct rust_demangler *rdm) {
    struct rust_mangled_ident ident;
    ident.ascii = NULL;
    ident.ascii_len = 0;
    ident.punycode = NULL;
    ident.punycode_len = 0;

    bool is_punycode = false;
    if (rdm->version != -1) {
        is_punycode = eat(rdm, 'u');
    }

    char c = next(rdm);
    CHECK_OR(IS_DIGIT(c), return ident);
    size_t len = c - '0';

    // A leading `0` is a complete length on its own.
    if (c != '0')
        while (IS_DIGIT(peek(rdm))) {
            size_t digit = (size_t)(next(rdm) - '0');
            // Reject lengths that would wrap around `size_t`.
            CHECK_OR(len <= (SIZE_MAX - digit) / 10, return ident);
            len = len * 10 + digit;
        }

    if (rdm->version != -1) {
        // Skip past the optional `_` separator.
        eat(rdm, '_');
    }

    size_t start = rdm->next;
    rdm->next += len;
    // Check for overflows.
    CHECK_OR((start <= rdm->next) && (rdm->next <= rdm->sym_len), return ident);

    ident.ascii = rdm->sym + start;
    ident.ascii_len = len;

    if (is_punycode) {
        // Walk backwards to split `<ascii>_<punycode>`; without any `_` the
        // whole identifier is punycode.
        ident.punycode_len = 0;
        while (ident.ascii_len > 0) {
            ident.ascii_len--;

            // The last '_' is a separator between ascii & punycode.
            if (ident.ascii[ident.ascii_len] == '_')
                break;

            ident.punycode_len++;
        }
        CHECK_OR(ident.punycode_len > 0, return ident);
        ident.punycode = ident.ascii + (len - ident.punycode_len);
    }

    if (ident.ascii_len == 0)
        ident.ascii = NULL;

    return ident;
}

// Printing functions.

/// Forward `len` bytes of `data` to the output callback, unless an error
/// occurred or printing is being skipped.
static void
print_str(struct rust_demangler *rdm, const char *data, size_t len) {
    if (!rdm->errored && !rdm->skipping_printing)
        rdm->callback(data, len, rdm->callback_opaque);
}

// Print a NUL-terminated string; requires `rdm` in scope.
#define PRINT(s) print_str(rdm, s, strlen(s))

/// Print `x` in decimal.
static void print_uint64(struct rust_demangler *rdm, uint64_t x) {
    char s[21]; // 20 digits for UINT64_MAX, plus the terminator.
    snprintf(s, sizeof(s), "%" PRIu64, x);
    PRINT(s);
}

/// Print `x` in lowercase hexadecimal, without a prefix.
static void print_uint64_hex(struct rust_demangler *rdm, uint64_t x) {
    char s[17]; // 16 digits for UINT64_MAX, plus the terminator.
    snprintf(s, sizeof(s), "%" PRIx64, x);
    PRINT(s);
}

/// Print the code point `c` as it would appear inside a literal delimited by
/// `quote`: common escapes use their short form, the other quote character
/// and printable ASCII are printed as-is, everything else becomes `\u{...}`.
/// Flags an error if `c` is not a Unicode scalar value.
static void
print_quoted_escaped_char(struct rust_demangler *rdm, char quote, uint32_t c) {
    // Scalar values are U+0000..=U+10FFFF, minus the surrogate range.
    CHECK_OR(c < 0xd800 || (c > 0xdfff && c <= 0x10ffff), return);

    switch (c) {
    case '\0':
        PRINT("\\0");
        break;

    case '\t':
        PRINT("\\t");
        break;

    case '\r':
        PRINT("\\r");
        break;

    case '\n':
        PRINT("\\n");
        break;

    case '\\':
        PRINT("\\\\");
        break;

    case '"':
        if (quote == '"') {
            PRINT("\\\"");
        } else {
            PRINT("\"");
        }
        break;

    case '\'':
        if (quote == '\'') {
            PRINT("\\'");
        } else {
            PRINT("'");
        }
        break;

    default:
        if (c >= 0x20 && c <= 0x7e) {
            // Printable ASCII
            char v = (char)c;
            print_str(rdm, &v, 1);
        } else {
            // FIXME show printable unicode characters without hex encoding
            PRINT("\\u{");
            char s[9] = {0};
            snprintf(s, sizeof(s), "%" PRIx32, c);
            PRINT(s);
            PRINT("}");
        }
    }
}

/// Print an identifier, decoding its punycode part (if any) to UTF-8.
/// Flags an error on malformed punycode, on a decoded value that is not a
/// Unicode scalar value, or when memory cannot be allocated.
static void
print_ident(struct rust_demangler *rdm, struct rust_mangled_ident ident) {
    if (rdm->errored || rdm->skipping_printing)
        return;

    if (!ident.punycode) {
        print_str(rdm, ident.ascii, ident.ascii_len);
        return;
    }

    size_t len = 0;
    size_t cap = 4;
    while (cap < ident.ascii_len) {
        cap *= 2;
        // Check for overflows.
        CHECK_OR((cap * 4) / 4 == cap, return);
    }

    // Store the output codepoints as groups of 4 UTF-8 bytes.
    uint8_t *out = (uint8_t *)malloc(cap * 4);
    CHECK_OR(out, return);

    // Populate initial output from ASCII fragment.
    for (len = 0; len < ident.ascii_len; len++) {
        uint8_t *p = out + 4 * len;
        p[0] = 0;
        p[1] = 0;
        p[2] = 0;
        p[3] = ident.ascii[len];
    }

    // Punycode parameters and initial state.
    size_t base = 36;
    size_t t_min = 1;
    size_t t_max = 26;
    size_t skew = 38;
    size_t damp = 700;
    size_t bias = 72;
    size_t i = 0;
    uint32_t c = 0x80;

    size_t punycode_pos = 0;
    while (punycode_pos < ident.punycode_len) {
        // Read one delta value.
        size_t delta = 0;
        size_t w = 1;
        size_t k = 0;
        size_t t;
        uint8_t d;
        do {
            k += base;
            t = k < bias ? 0 : (k - bias);
            if (t < t_min)
                t = t_min;
            if (t > t_max)
                t = t_max;

            CHECK_OR(punycode_pos < ident.punycode_len, goto cleanup);
            d = ident.punycode[punycode_pos++];

            if (IS_LOWER(d))
                d = d - 'a';
            else if (IS_DIGIT(d))
                d = 26 + (d - '0');
            else
                ERROR_AND(goto cleanup);

            delta += d * w;
            w *= base - t;
        } while (d >= t);

        // Compute the new insert position and character.
        len++;
        i += delta;
        size_t advance = i / len;
        i %= len;

        // Only Unicode scalar values can be encoded as UTF-8 below.
        CHECK_OR(advance <= 0x10ffff - c, goto cleanup);
        c += (uint32_t)advance;
        CHECK_OR(c < 0xd800 || c > 0xdfff, goto cleanup);

        // Ensure enough space is available.
        if (cap < len) {
            cap *= 2;
            // Check for overflows.
            CHECK_OR((cap * 4) / 4 == cap, goto cleanup);
            CHECK_OR(cap >= len, goto cleanup);

            // Only reallocate when the capacity actually changed.
            uint8_t *grown = (uint8_t *)realloc(out, cap * 4);
            CHECK_OR(grown, goto cleanup);
            out = grown;
        }

        // Move the characters after the insert position.
        uint8_t *p = out + i * 4;
        memmove(p + 4, p, (len - i - 1) * 4);

        // Insert the new character, as UTF-8 bytes.
        p[0] = c >= 0x10000 ? 0xf0 | (c >> 18) : 0;
        p[1] = c >= 0x800 ? (c < 0x10000 ? 0xe0 : 0x80) | ((c >> 12) & 0x3f) : 0;
        p[2] = (c < 0x800 ? 0xc0 : 0x80) | ((c >> 6) & 0x3f);
        p[3] = 0x80 | (c & 0x3f);

        // If there are no more deltas, decoding is complete.
        if (punycode_pos == ident.punycode_len)
            break;

        i++;

        // Perform bias adaptation.
        delta /= damp;
        damp = 2;

        delta += delta / len;
        k = 0;
        while (delta > ((base - t_min) * t_max) / 2) {
            delta /= base - t_min;
            k += base;
        }
        bias = k + ((base - t_min + 1) * delta) / (delta + skew);
    }

    // Remove all the 0 bytes to leave behind an UTF-8 string.
    size_t j;
    for (i = 0, j = 0; i < len * 4; i++)
        if (out[i] != 0)
            out[j++] = out[i];

    print_str(rdm, (const char *)out, j);

cleanup:
    free(out);
}

/// Print the lifetime according to the previously decoded index.
/// An index of `0` always refers to `'_`, but starting with `1`,
/// indices refer to late-bound lifetimes introduced by a binder.
/// While printing, an index deeper than the binders currently in scope is
/// flagged as an error.
static void print_lifetime_from_index(struct rust_demangler *rdm, uint64_t lt) {
    PRINT("'");
    if (lt == 0) {
        PRINT("_");
        return;
    }

    // Nothing is printed in these states, so there is nothing to validate.
    if (rdm->errored || rdm->skipping_printing)
        return;

    // The subtraction below would wrap around for an out-of-scope index.
    CHECK_OR(lt <= rdm->bound_lifetime_depth, return);
    uint64_t depth = rdm->bound_lifetime_depth - lt;

    // Try to print lifetimes alphabetically first.
    if (depth < 26) {
        char c = (char)('a' + depth);
        print_str(rdm, &c, 1);
    } else {
        // Use `'_123` after running out of letters.
        PRINT("_");
        print_uint64(rdm, depth);
    }
}

// Demangling functions.

static void demangle_binder(struct rust_demangler *rdm);
static void demangle_path(struct rust_demangler *rdm, bool in_value);
static void demangle_generic_arg(struct rust_demangler *rdm);
static void demangle_type(struct rust_demangler *rdm);
static bool demangle_path_maybe_open_generics(struct rust_demangler *rdm);
static void demangle_dyn_trait(struct rust_demangler *rdm);
static void demangle_const(struct rust_demangler *rdm, bool in_value);
static void demangle_const_uint(struct rust_demangler *rdm, char ty_tag);
static void demangle_const_str_literal(struct rust_demangler *rdm);

/// Optionally enter a binder ('G') for late-bound lifetimes,
/// printing e.g. `for<'a, 'b> `, and make those lifetimes visible
/// to the caller (via depth level, which the caller should reset).
static void demangle_binder(struct rust_demangler *rdm) { CHECK_OR(!rdm->errored, return); uint64_t bound_lifetimes = parse_opt_integer_62(rdm, 'G'); if (bound_lifetimes > 0) { PRINT("for<"); for (uint64_t i = 0; i < bound_lifetimes; i++) { if (i > 0) PRINT(", "); rdm->bound_lifetime_depth++; print_lifetime_from_index(rdm, 1); } PRINT("> "); } } static void demangle_path(struct rust_demangler *rdm, bool in_value) { CHECK_OR(!rdm->errored, return); char tag = next(rdm); switch (tag) { case 'C': { uint64_t dis = parse_disambiguator(rdm); struct rust_mangled_ident name = parse_ident(rdm); print_ident(rdm, name); if (rdm->verbose) { PRINT("["); print_uint64_hex(rdm, dis); PRINT("]"); } break; } case 'N': { char ns = next(rdm); CHECK_OR(IS_LOWER(ns) || IS_UPPER(ns), return); demangle_path(rdm, in_value); uint64_t dis = parse_disambiguator(rdm); struct rust_mangled_ident name = parse_ident(rdm); if (IS_UPPER(ns)) { // Special namespaces, like closures and shims. PRINT("::{"); switch (ns) { case 'C': PRINT("closure"); break; case 'S': PRINT("shim"); break; default: print_str(rdm, &ns, 1); } if (name.ascii || name.punycode) { PRINT(":"); print_ident(rdm, name); } PRINT("#"); print_uint64(rdm, dis); PRINT("}"); } else { // Implementation-specific/unspecified namespaces. if (name.ascii || name.punycode) { PRINT("::"); print_ident(rdm, name); } } break; } case 'M': case 'X': // Ignore the `impl`'s own path. 
parse_disambiguator(rdm); bool was_skipping_printing = rdm->skipping_printing; rdm->skipping_printing = true; demangle_path(rdm, in_value); rdm->skipping_printing = was_skipping_printing; __attribute__((fallthrough)); case 'Y': PRINT("<"); demangle_type(rdm); if (tag != 'M') { PRINT(" as "); demangle_path(rdm, false); } PRINT(">"); break; case 'I': demangle_path(rdm, in_value); if (in_value) PRINT("::"); PRINT("<"); for (size_t i = 0; !rdm->errored && !eat(rdm, 'E'); i++) { if (i > 0) PRINT(", "); demangle_generic_arg(rdm); } PRINT(">"); break; case 'B': { size_t backref = parse_integer_62(rdm); if (!rdm->skipping_printing) { size_t old_next = rdm->next; rdm->next = backref; demangle_path(rdm, in_value); rdm->next = old_next; } break; } default: ERROR_AND(return); } } static void demangle_generic_arg(struct rust_demangler *rdm) { if (eat(rdm, 'L')) { uint64_t lt = parse_integer_62(rdm); print_lifetime_from_index(rdm, lt); } else if (eat(rdm, 'K')) demangle_const(rdm, false); else demangle_type(rdm); } static const char *basic_type(char tag) { switch (tag) { case 'b': return "bool"; case 'c': return "char"; case 'e': return "str"; case 'u': return "()"; case 'a': return "i8"; case 's': return "i16"; case 'l': return "i32"; case 'x': return "i64"; case 'n': return "i128"; case 'i': return "isize"; case 'h': return "u8"; case 't': return "u16"; case 'm': return "u32"; case 'y': return "u64"; case 'o': return "u128"; case 'j': return "usize"; case 'f': return "f32"; case 'd': return "f64"; case 'z': return "!"; case 'p': return "_"; case 'v': return "..."; default: return NULL; } } static void demangle_type(struct rust_demangler *rdm) { CHECK_OR(!rdm->errored, return); char tag = next(rdm); const char *basic = basic_type(tag); if (basic) { PRINT(basic); return; } switch (tag) { case 'R': case 'Q': PRINT("&"); if (eat(rdm, 'L')) { uint64_t lt = parse_integer_62(rdm); if (lt) { print_lifetime_from_index(rdm, lt); PRINT(" "); } } if (tag != 'R') PRINT("mut "); 
demangle_type(rdm); break; case 'P': case 'O': PRINT("*"); if (tag != 'P') PRINT("mut "); else PRINT("const "); demangle_type(rdm); break; case 'A': case 'S': PRINT("["); demangle_type(rdm); if (tag == 'A') { PRINT("; "); demangle_const(rdm, true); } PRINT("]"); break; case 'T': { PRINT("("); size_t i; for (i = 0; !rdm->errored && !eat(rdm, 'E'); i++) { if (i > 0) PRINT(", "); demangle_type(rdm); } if (i == 1) PRINT(","); PRINT(")"); break; } case 'F': { uint64_t old_bound_lifetime_depth = rdm->bound_lifetime_depth; demangle_binder(rdm); if (eat(rdm, 'U')) PRINT("unsafe "); if (eat(rdm, 'K')) { struct rust_mangled_ident abi; if (eat(rdm, 'C')) { abi.ascii = "C"; abi.ascii_len = 1; } else { abi = parse_ident(rdm); CHECK_OR(abi.ascii && !abi.punycode, goto restore); } PRINT("extern \""); // If the ABI had any `-`, they were replaced with `_`, // so the parts between `_` have to be re-joined with `-`. for (size_t i = 0; i < abi.ascii_len; i++) { if (abi.ascii[i] == '_') { print_str(rdm, abi.ascii, i); PRINT("-"); abi.ascii += i + 1; abi.ascii_len -= i + 1; i = 0; } } print_str(rdm, abi.ascii, abi.ascii_len); PRINT("\" "); } PRINT("fn("); for (size_t i = 0; !rdm->errored && !eat(rdm, 'E'); i++) { if (i > 0) PRINT(", "); demangle_type(rdm); } PRINT(")"); if (eat(rdm, 'u')) { // Skip printing the return type if it's 'u', i.e. `()`. } else { PRINT(" -> "); demangle_type(rdm); } // Restore `bound_lifetime_depth` to outside the binder. restore: rdm->bound_lifetime_depth = old_bound_lifetime_depth; break; } case 'D': PRINT("dyn "); uint64_t old_bound_lifetime_depth = rdm->bound_lifetime_depth; demangle_binder(rdm); for (size_t i = 0; !rdm->errored && !eat(rdm, 'E'); i++) { if (i > 0) PRINT(" + "); demangle_dyn_trait(rdm); } // Restore `bound_lifetime_depth` to outside the binder. 
rdm->bound_lifetime_depth = old_bound_lifetime_depth; CHECK_OR(eat(rdm, 'L'), return); uint64_t lt = parse_integer_62(rdm); if (lt) { PRINT(" + "); print_lifetime_from_index(rdm, lt); } break; case 'B': { size_t backref = parse_integer_62(rdm); if (!rdm->skipping_printing) { size_t old_next = rdm->next; rdm->next = backref; demangle_type(rdm); rdm->next = old_next; } break; } default: // Go back to the tag, so `demangle_path` also sees it. rdm->next--; demangle_path(rdm, false); } } /// A trait in a trait object may have some "existential projections" /// (i.e. associated type bindings) after it, which should be printed /// in the `<...>` of the trait, e.g. `dyn Trait`. /// To this end, this method will keep the `<...>` of an 'I' path /// open, by omitting the `>`, and return `Ok(true)` in that case. static bool demangle_path_maybe_open_generics(struct rust_demangler *rdm) { bool open = false; CHECK_OR(!rdm->errored, return open); if (eat(rdm, 'B')) { size_t backref = parse_integer_62(rdm); if (!rdm->skipping_printing) { size_t old_next = rdm->next; rdm->next = backref; open = demangle_path_maybe_open_generics(rdm); rdm->next = old_next; } } else if (eat(rdm, 'I')) { demangle_path(rdm, false); PRINT("<"); open = true; for (size_t i = 0; !rdm->errored && !eat(rdm, 'E'); i++) { if (i > 0) PRINT(", "); demangle_generic_arg(rdm); } } else demangle_path(rdm, false); return open; } static void demangle_dyn_trait(struct rust_demangler *rdm) { CHECK_OR(!rdm->errored, return); bool open = demangle_path_maybe_open_generics(rdm); while (eat(rdm, 'p')) { if (!open) PRINT("<"); else PRINT(", "); open = true; struct rust_mangled_ident name = parse_ident(rdm); print_ident(rdm, name); PRINT(" = "); demangle_type(rdm); } if (open) PRINT(">"); } static void demangle_const(struct rust_demangler *rdm, bool in_value) { CHECK_OR(!rdm->errored, return); bool opened_brace = false; char ty_tag = next(rdm); switch (ty_tag) { case 'p': PRINT("_"); break; // Unsigned integer types. 
case 'h': case 't': case 'm': case 'y': case 'o': case 'j': demangle_const_uint(rdm, ty_tag); break; case 'a': case 's': case 'l': case 'x': case 'n': case 'i': if (eat(rdm, 'n')) { PRINT("-"); } demangle_const_uint(rdm, ty_tag); break; case 'b': { struct hex_nibbles hex = parse_hex_nibbles_for_const_uint(rdm); CHECK_OR(!rdm->errored && hex.nibbles_len <= 1, return); uint8_t v = hex.nibbles_len > 0 ? decode_hex_nibble(hex.nibbles[0]) : 0; CHECK_OR(v <= 1, return); PRINT(v == 1 ? "true" : "false"); break; } case 'c': { struct hex_nibbles hex = parse_hex_nibbles_for_const_uint(rdm); CHECK_OR(!rdm->errored && hex.nibbles_len <= 6, return); uint32_t c = 0; for (size_t i = 0; i < hex.nibbles_len; i++) c = (c << 4) | decode_hex_nibble(hex.nibbles[i]); PRINT("'"); print_quoted_escaped_char(rdm, '\'', c); PRINT("'"); break; } case 'e': // NOTE(eddyb) a string literal `"..."` has type `&str`, so // to get back the type `str`, `*"..."` syntax is needed // (even if that may not be valid in Rust itself). if (!in_value) { opened_brace = true; PRINT("{"); } PRINT("*"); demangle_const_str_literal(rdm); break; case 'R': case 'Q': if (ty_tag == 'R' && eat(rdm, 'e')) { // NOTE(eddyb) this prints `"..."` instead of `&*"..."`, which // is what `Re..._` would imply (see comment for `str` above). 
demangle_const_str_literal(rdm); break; } if (!in_value) { opened_brace = true; PRINT("{"); } PRINT("&"); if (ty_tag != 'R') { PRINT("mut "); } demangle_const(rdm, true); break; case 'A': { if (!in_value) { opened_brace = true; PRINT("{"); } PRINT("["); size_t i = 0; while (!eat(rdm, 'E')) { CHECK_OR(!rdm->errored, return); if (i > 0) PRINT(", "); demangle_const(rdm, true); i += 1; } PRINT("]"); break; } case 'T': { if (!in_value) { opened_brace = true; PRINT("{"); } PRINT("("); size_t i = 0; while (!eat(rdm, 'E')) { CHECK_OR(!rdm->errored, return); if (i > 0) PRINT(", "); demangle_const(rdm, true); i += 1; } if (i == 1) PRINT(","); PRINT(")"); break; } case 'V': if (!in_value) { opened_brace = true; PRINT("{"); } demangle_path(rdm, true); switch (next(rdm)) { case 'U': break; case 'T': { PRINT("("); size_t i = 0; while (!eat(rdm, 'E')) { CHECK_OR(!rdm->errored, return); if (i > 0) PRINT(", "); demangle_const(rdm, true); i += 1; } PRINT(")"); break; } case 'S': { PRINT(" { "); size_t i = 0; while (!eat(rdm, 'E')) { CHECK_OR(!rdm->errored, return); if (i > 0) PRINT(", "); parse_disambiguator(rdm); struct rust_mangled_ident name = parse_ident(rdm); print_ident(rdm, name); PRINT(": "); demangle_const(rdm, true); i += 1; } PRINT(" }"); break; } default: ERROR_AND(return); } break; case 'B': { size_t backref = parse_integer_62(rdm); if (!rdm->skipping_printing) { size_t old_next = rdm->next; rdm->next = backref; demangle_const(rdm, in_value); rdm->next = old_next; } break; } default: ERROR_AND(return); } if (opened_brace) { PRINT("}"); } } static void demangle_const_uint(struct rust_demangler *rdm, char ty_tag) { CHECK_OR(!rdm->errored, return); struct hex_nibbles hex = parse_hex_nibbles_for_const_uint(rdm); CHECK_OR(!rdm->errored, return); // Print anything that doesn't fit in `uint64_t` verbatim. 
if (hex.nibbles_len > 16) { PRINT("0x"); print_str(rdm, hex.nibbles, hex.nibbles_len); } else { uint64_t v = 0; for (size_t i = 0; i < hex.nibbles_len; i++) v = (v << 4) | decode_hex_nibble(hex.nibbles[i]); print_uint64(rdm, v); } if (rdm->verbose) PRINT(basic_type(ty_tag)); } // UTF-8 uses an unary encoding for its "length" field (`1`s followed by a `0`). struct utf8_byte { // Decoded "length" field of an UTF-8 byte, including the special cases: // - `0` indicates this is a lone ASCII byte // - `1` indicates a continuation byte (cannot start an UTF-8 sequence) size_t seq_len; // Remaining (`payload_width`) bits in the UTF-8 byte, contributing to // the Unicode scalar value being encoded in the UTF-8 sequence. uint8_t payload; size_t payload_width; }; static struct utf8_byte utf8_decode(uint8_t byte) { struct utf8_byte utf8; utf8.seq_len = 0; utf8.payload = byte; utf8.payload_width = 8; // FIXME(eddyb) figure out if using "count leading ones/zeros" is an option. while (utf8.seq_len <= 6) { uint8_t msb = 0x80 >> utf8.seq_len; utf8.payload &= ~msb; utf8.payload_width--; if ((byte & msb) == 0) break; utf8.seq_len++; } return utf8; } static void demangle_const_str_literal(struct rust_demangler *rdm) { CHECK_OR(!rdm->errored, return); struct hex_nibbles hex = parse_hex_nibbles_for_const_bytes(rdm); CHECK_OR(!rdm->errored, return); PRINT("\""); for (size_t i = 0; i < hex.nibbles_len; i += 2) { struct utf8_byte utf8 = utf8_decode( (decode_hex_nibble(hex.nibbles[i]) << 4) | decode_hex_nibble(hex.nibbles[i + 1]) ); uint32_t c = utf8.payload; if (utf8.seq_len > 0) { CHECK_OR(utf8.seq_len >= 2 && utf8.seq_len <= 4, return); for (size_t extra = utf8.seq_len - 1; extra > 0; extra--) { i += 2; utf8 = utf8_decode( (decode_hex_nibble(hex.nibbles[i]) << 4) | decode_hex_nibble(hex.nibbles[i + 1]) ); CHECK_OR(utf8.seq_len == 1, return); c = (c << utf8.payload_width) | utf8.payload; } } print_quoted_escaped_char(rdm, '"', c); } PRINT("\""); } static bool is_rust_hash(struct 
rust_mangled_ident name) { if (name.ascii[0] != 'h') { return false; } for (size_t i = 1; i < name.ascii_len; i++) { if (!IS_DIGIT(name.ascii[i]) && !(name.ascii[i] >= 'a' && name.ascii[i] <= 'f')) { return false; } } return true; } static void print_legacy_ident( struct rust_demangler *rdm, struct rust_mangled_ident ident ) { if (rdm->errored || rdm->skipping_printing) return; CHECK_OR(!ident.punycode, return); if (ident.ascii[0] == '_' && ident.ascii[1] == '$') { ident.ascii += 1; ident.ascii_len -= 1; } while (1) { if (ident.ascii_len == 0) { break; } else if (ident.ascii[0] == '.') { if (ident.ascii_len >= 2 && ident.ascii[1] == '.') { PRINT("::"); ident.ascii += 2; ident.ascii_len -= 2; } else { PRINT("."); ident.ascii += 1; ident.ascii_len -= 1; } } else if (ident.ascii[0] == '$') { const char *end_ptr = (const char *)memchr(&ident.ascii[1], '$', ident.ascii_len - 1); if (!end_ptr) break; const char *escape = &ident.ascii[1]; size_t escape_len = end_ptr - escape; if (strncmp(escape, "SP", 2) == 0) { PRINT("@"); } else if (strncmp(escape, "BP", 2) == 0) { PRINT("*"); } else if (strncmp(escape, "RF", 2) == 0) { PRINT("&"); } else if (strncmp(escape, "LT", 2) == 0) { PRINT("<"); } else if (strncmp(escape, "GT", 2) == 0) { PRINT(">"); } else if (strncmp(escape, "LP", 2) == 0) { PRINT("("); } else if (strncmp(escape, "RP", 2) == 0) { PRINT(")"); } else if (strncmp(escape, "C", 1) == 0) { PRINT(","); } else { if (escape[0] != 'u') { break; } const char *digits = &escape[1]; size_t digits_len = escape_len - 1; bool invalid = false; for (size_t i = 1; i < digits_len; i++) { if (!IS_DIGIT(digits[i]) && !(digits[i] >= 'a' && digits[i] <= 'f')) { invalid = true; break; } } if (invalid) break; struct hex_nibbles hex; hex.nibbles = digits; hex.nibbles_len = digits_len; uint32_t c = 0; for (size_t i = 0; i < hex.nibbles_len; i++) c = (c << 4) | decode_hex_nibble(hex.nibbles[i]); if (!(c < 0xd800 || (c > 0xdfff && c < 0x10ffff))) { break; // Not a valid unicode scalar } if 
(c >= 0x20 && c <= 0x7e) { // Printable ASCII char v = (char)c; print_str(rdm, &v, 1); } else { // FIXME show printable unicode characters without hex // encoding PRINT("\\u{"); char s[9] = {0}; sprintf(s, "%" PRIx32, c); PRINT(s); PRINT("}"); } } ident.ascii += escape_len + 2; ident.ascii_len -= escape_len + 2; } else { bool found = false; for (size_t i = 0; i < ident.ascii_len; i++) { if (ident.ascii[i] == '$' || ident.ascii[i] == '.') { print_str(rdm, ident.ascii, i); ident.ascii += i; ident.ascii_len -= i; found = true; break; } } if (!found) { break; } } } print_str(rdm, ident.ascii, ident.ascii_len); } static void demangle_legacy_path(struct rust_demangler *rdm) { bool first = true; while (1) { if (eat(rdm, 'E')) { // FIXME Maybe check if at end of symbol? return; } struct rust_mangled_ident name = parse_ident(rdm); if (!rdm->verbose && peek(rdm) == 'E' && is_rust_hash(name)) { // Skip printing the hash if verbose mode is disabled. eat(rdm, 'E'); break; } if (!first) { PRINT("::"); } first = false; print_legacy_ident(rdm, name); CHECK_OR(!rdm->errored, return); } } bool rust_demangle_with_callback( const char *whole_mangled_symbol, int flags, void (*callback)(const char *data, size_t len, void *opaque), void *opaque ) { struct rust_demangler rdm; rdm.sym = whole_mangled_symbol; rdm.sym_len = 0; rdm.callback_opaque = opaque; rdm.callback = callback; rdm.next = 0; rdm.errored = false; rdm.skipping_printing = false; rdm.verbose = (flags & RUST_DEMANGLE_FLAG_VERBOSE) != 0; rdm.version = -2; // Invalid version rdm.bound_lifetime_depth = 0; // Rust symbols always start with R, _R or __R for the v0 scheme or ZN, _ZN // or __ZN for the legacy scheme. if (strncmp(rdm.sym, "_R", 2) == 0) { rdm.sym += 2; rdm.version = 0; // v0 } else if (rdm.sym[0] == 'R') { // On Windows, dbghelp strips leading underscores, so we accept "R..." // form too. 
rdm.sym += 1; rdm.version = 0; // v0 } else if (strncmp(rdm.sym, "__R", 3) == 0) { // On OSX, symbols are prefixed with an extra _ rdm.sym += 3; rdm.version = 0; // v0 } else if (strncmp(rdm.sym, "_ZN", 3) == 0) { rdm.sym += 3; rdm.version = -1; // legacy } else if (strncmp(rdm.sym, "ZN", 2) == 0) { // On Windows, dbghelp strips leading underscores, so we accept "R..." // form too. rdm.sym += 2; rdm.version = -1; // legacy } else if (strncmp(rdm.sym, "__ZN", 4) == 0) { // On OSX, symbols are prefixed with an extra _ rdm.sym += 4; rdm.version = -1; // legacy } else { return false; } if (rdm.version != -1) { // Paths always start with uppercase characters. if (!IS_UPPER(rdm.sym[0])) return false; } // Rust symbols only use ASCII characters. for (const char *p = rdm.sym; *p; p++) { if ((*p & 0x80) != 0) return false; if (*p == '.' && strncmp(p, ".llvm.", 6) == 0) { // Ignore .llvm. suffixes break; } rdm.sym_len++; } if (rdm.version == -1) { demangle_legacy_path(&rdm); } else { demangle_path(&rdm, true); // Skip instantiating crate. if (!rdm.errored && rdm.next < rdm.sym_len && peek(&rdm) >= 'A' && peek(&rdm) <= 'Z') { rdm.skipping_printing = true; demangle_path(&rdm, false); } } if (!rdm.errored && (rdm.sym_len - rdm.next > 0)) { for (const char *p = rdm.sym + rdm.next; *p; p++) { // FIXME match is_symbol_like from rustc-demangle if (!((*p >= 'a' && *p <= 'z') || (*p >= 'A' && *p <= 'Z') || (*p >= '0' && *p <= '9') || *p == '.')) { // Suffix is not a symbol like string return false; } } // Print LLVM produced suffix print_str(&rdm, rdm.sym + rdm.next, rdm.sym_len - rdm.next); } return !rdm.errored; } // Growable string buffers. struct str_buf { char *ptr; size_t len; size_t cap; bool errored; }; static void str_buf_reserve(struct str_buf *buf, size_t extra) { // Allocation failed before. if (buf->errored) return; size_t available = buf->cap - buf->len; if (extra <= available) return; size_t min_new_cap = buf->cap + (extra - available); // Check for overflows. 
if (min_new_cap < buf->cap) { buf->errored = true; return; } size_t new_cap = buf->cap; if (new_cap == 0) new_cap = 4; // Double capacity until sufficiently large. while (new_cap < min_new_cap) { new_cap *= 2; // Check for overflows. if (new_cap < buf->cap) { buf->errored = true; return; } } char *new_ptr = (char *)realloc(buf->ptr, new_cap); if (new_ptr == NULL) { free(buf->ptr); buf->ptr = NULL; buf->len = 0; buf->cap = 0; buf->errored = true; } else { buf->ptr = new_ptr; buf->cap = new_cap; } } static void str_buf_append(struct str_buf *buf, const char *data, size_t len) { str_buf_reserve(buf, len); if (buf->errored) return; memcpy(buf->ptr + buf->len, data, len); buf->len += len; } static void str_buf_demangle_callback(const char *data, size_t len, void *opaque) { str_buf_append(opaque, data, len); } char *rust_demangle(const char *mangled, int flags) { struct str_buf out; out.ptr = NULL; out.len = 0; out.cap = 0; out.errored = false; bool success = rust_demangle_with_callback( mangled, flags, str_buf_demangle_callback, &out ); if (!success) { free(out.ptr); return NULL; } str_buf_append(&out, "\0", 1); return out.ptr; } ================================================ FILE: third-party/rust-demangle/rust-demangle.h ================================================ #include #include #define RUST_DEMANGLE_FLAG_VERBOSE 1 #ifdef __cplusplus extern "C" { #endif bool rust_demangle_with_callback( const char *mangled, int flags, void (*callback)(const char *data, size_t len, void *opaque), void *opaque ); char *rust_demangle(const char *mangled, int flags); #ifdef __cplusplus } #endif ================================================ FILE: third-party/rust-demangle/test-harness/Cargo.toml ================================================ [package] name = "rust-demangle-c-test-harness" version = "0.0.0" license = "MIT OR Apache-2.0" edition = "2021" publish = false # Tests go into `tests/`, not in `src/`. 
[lib] test = false doctest = false [dependencies] rustc-demangle = "0.1.21" [build-dependencies] cc = "1.0" ================================================ FILE: third-party/rust-demangle/test-harness/build.rs ================================================ fn main() { let src = "../rust-demangle.c"; let header = "../rust-demangle.h"; println!("cargo:rerun-if-changed={}", src); println!("cargo:rerun-if-changed={}", header); cc::Build::new() .file("../rust-demangle.c") .flag_if_supported("-std=c99") .flag_if_supported("-pedantic") .warnings(true) .warnings_into_errors(true) .flag_if_supported("-Werror=uninitialized") .compile("rust-demangle"); } ================================================ FILE: third-party/rust-demangle/test-harness/examples/check-csv-dataset.rs ================================================ use std::env; use std::fs::File; use std::io::prelude::*; use std::io::BufReader; use std::path::PathBuf; use rust_demangle_c_test_harness::demangle; // HACK(eddyb) this is only an `example` so that `cargo run` doesn't do anything. // FIXME(eddyb) document this better and provide datasets for it. fn main() { let header = "legacy+generics,legacy,mw,mw+compression,v0,v0+compression"; for path in env::args_os().skip(1).map(PathBuf::from) { let mut lines = BufReader::new(File::open(path).unwrap()) .lines() .map(|l| l.unwrap()); assert_eq!(lines.next().unwrap(), header); for line in lines { for mangling in line.split(',').skip(4) { for verbose in [false, true] { demangle(mangling).to_string_maybe_verbose(verbose); } } } } } ================================================ FILE: third-party/rust-demangle/test-harness/src/lib.rs ================================================ use std::fmt; // HACK(eddyb) helper macros for tests. #[macro_export] macro_rules! assert_contains { ($s:expr, $needle:expr) => {{ let (s, needle) = ($s, $needle); assert!( s.contains(needle), "{:?} should've contained {:?}", s, needle ); }}; } #[macro_export] macro_rules! 
assert_ends_with { ($s:expr, $suffix:expr) => {{ let (s, suffix) = ($s, $suffix); assert!( s.ends_with(suffix), "{:?} should've ended in {:?}", s, suffix ); }}; } /// `rustc_demangle::Demangle` wrapper that will also attempt demanging with /// `rust-demangle.c`'s `rust_demangle` when formatted, and assert equality. /// /// The reason this mimics `rustc_demangle`'s API is to allow its tests to be /// reused without rewriting them (which could risk introducing bugs). pub struct Demangle<'a> { // NOTE(eddyb) we don't trust `rustc_demangle` to keep the original string // unmodified, as if it e.g. strips suffixes early, it could hide the fact // that the C port doesn't have that sort of thing supported yet. original: &'a str, rustc_demangle: rustc_demangle::Demangle<'a>, } pub fn demangle(s: &str) -> Demangle { Demangle { original: s, rustc_demangle: rustc_demangle::demangle(s), } } pub fn try_demangle(s: &str) -> Result, rustc_demangle::TryDemangleError> { match rustc_demangle::try_demangle(s) { Ok(d) => Ok(Demangle { original: s, rustc_demangle: d, }), Err(e) => { assert!(demangle_via_c(s, false).is_err()); Err(e) } } } impl fmt::Display for Demangle<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_str(&self.to_string_maybe_verbose(!f.alternate())) } } // HACK(eddyb) the C port doesn't have the "is printable Unicode" heuristic, // to avoid having to include the non-trivial amount of data that requires, // so instead we allow mismatches when `b` has more `\u{...}` escapes than `a`. fn equal_modulo_unicode_escapes(a: &str, b: &str) -> bool { let mut a_chars = a.chars(); let mut a_active_escape: Option = None; let mut b_chars = b.chars(); loop { let a_ch = a_active_escape .as_mut() .and_then(|escape| escape.next()) .or_else(|| { a_active_escape = None; a_chars.next() }); let b_ch = b_chars.next(); match (a_ch, b_ch) { (Some(a_ch), Some(b_ch)) if a_ch == b_ch => {} // Compare with the `\u{...}` escape instead, if possible. 
(Some(a_ch), Some('\\')) if a_active_escape.is_none() => { let mut escape = a_ch.escape_unicode(); assert_eq!(escape.next(), Some('\\')); a_active_escape = Some(escape); } (None, None) => return true, _ => return false, } } } impl Demangle<'_> { pub fn to_string_maybe_verbose(&self, verbose: bool) -> String { let rust = if verbose { format!("{}", self.rustc_demangle) } else { format!("{:#}", self.rustc_demangle) }; let c = demangle_via_c(self.original, verbose).unwrap_or_else(|_| self.original.to_owned()); if rust != c && !equal_modulo_unicode_escapes(&rust, &c) { panic!( "Rust vs C demangling difference:\ \n mangled: {mangled:?}\ \n rust: {rust:?}\ \n c: {c:?}\ \n", mangled = self.original ); } rust } } fn demangle_via_c(mangled: &str, verbose: bool) -> Result { use std::ffi::{CStr, CString}; use std::os::raw::c_char; extern "C" { fn rust_demangle(mangled: *const c_char, flags: i32) -> *mut c_char; fn free(ptr: *mut c_char); } let flags = if verbose { 1 } else { 0 }; let Ok(mangled) = CString::new(mangled) else { // C can't handle strings containing nul bytes return Err(()); }; let out = unsafe { rust_demangle(mangled.as_ptr(), flags) }; if out.is_null() { Err(()) } else { unsafe { let s = CStr::from_ptr(out).to_string_lossy().into_owned(); free(out); Ok(s) } } } ================================================ FILE: third-party/rust-demangle/test-harness/tests/legacy.rs ================================================ //! Tests copied from `https://github.com/rust-lang/rustc-demangle`'s //! `src/legacy.rs` at `fd906f850f90f6d4845c7b8219d218293e0ab3ed`. //! //! These are the only changes made to the tests: //! * `::` absolute paths -> `rust_demangle_c_test_harness::` //! * `#[should_panic]` was added to tests that don't pass yet macro_rules! t { ($a:expr, $b:expr) => { assert!(ok($a, $b)) }; } macro_rules! t_err { ($a:expr) => { assert!(ok_err($a)) }; } macro_rules! 
t_nohash { ($a:expr, $b:expr) => {{ assert_eq!( format!("{:#}", rust_demangle_c_test_harness::demangle($a)), $b ); }}; } fn ok(sym: &str, expected: &str) -> bool { match rust_demangle_c_test_harness::try_demangle(sym) { Ok(s) => { if s.to_string() == expected { true } else { println!("\n{}\n!=\n{}\n", s, expected); false } } Err(_) => { println!("error demangling"); false } } } fn ok_err(sym: &str) -> bool { match rust_demangle_c_test_harness::try_demangle(sym) { Ok(_) => { println!("succeeded in demangling"); false } Err(_) => rust_demangle_c_test_harness::demangle(sym).to_string() == sym, } } #[test] fn demangle() { t_err!("test"); t!("_ZN4testE", "test"); t_err!("_ZN4test"); t!("_ZN4test1a2bcE", "test::a::bc"); } #[test] fn demangle_dollars() { t!("_ZN4$RP$E", ")"); t!("_ZN8$RF$testE", "&test"); t!("_ZN8$BP$test4foobE", "*test::foob"); t!("_ZN9$u20$test4foobE", " test::foob"); t!("_ZN35Bar$LT$$u5b$u32$u3b$$u20$4$u5d$$GT$E", "Bar<[u32; 4]>"); } #[test] fn demangle_many_dollars() { t!("_ZN13test$u20$test4foobE", "test test::foob"); t!("_ZN12test$BP$test4foobE", "test*test::foob"); } #[test] fn demangle_osx() { t!( "__ZN5alloc9allocator6Layout9for_value17h02a996811f781011E", "alloc::allocator::Layout::for_value::h02a996811f781011" ); t!("__ZN38_$LT$core..option..Option$LT$T$GT$$GT$6unwrap18_MSG_FILE_LINE_COL17haf7cb8d5824ee659E", ">::unwrap::_MSG_FILE_LINE_COL::haf7cb8d5824ee659"); t!("__ZN4core5slice89_$LT$impl$u20$core..iter..traits..IntoIterator$u20$for$u20$$RF$$u27$a$u20$$u5b$T$u5d$$GT$9into_iter17h450e234d27262170E", "core::slice::::into_iter::h450e234d27262170"); } #[test] fn demangle_windows() { t!("ZN4testE", "test"); t!("ZN13test$u20$test4foobE", "test test::foob"); t!("ZN12test$RF$test4foobE", "test&test::foob"); } #[test] fn demangle_elements_beginning_with_underscore() { t!("_ZN13_$LT$test$GT$E", ""); t!("_ZN28_$u7b$$u7b$closure$u7d$$u7d$E", "{{closure}}"); t!("_ZN15__STATIC_FMTSTRE", "__STATIC_FMTSTR"); } #[test] fn demangle_trait_impls() { t!( 
"_ZN71_$LT$Test$u20$$u2b$$u20$$u27$static$u20$as$u20$foo..Bar$LT$Test$GT$$GT$3barE", ">::bar" ); } #[test] fn demangle_without_hash() { let s = "_ZN3foo17h05af221e174051e9E"; t!(s, "foo::h05af221e174051e9"); t_nohash!(s, "foo"); } #[test] fn demangle_without_hash_edgecases() { // One element, no hash. t_nohash!("_ZN3fooE", "foo"); // Two elements, no hash. t_nohash!("_ZN3foo3barE", "foo::bar"); // Longer-than-normal hash. t_nohash!("_ZN3foo20h05af221e174051e9abcE", "foo"); // Shorter-than-normal hash. t_nohash!("_ZN3foo5h05afE", "foo"); // Valid hash, but not at the end. t_nohash!("_ZN17h05af221e174051e93fooE", "h05af221e174051e9::foo"); // Not a valid hash, missing the 'h'. t_nohash!("_ZN3foo16ffaf221e174051e9E", "foo::ffaf221e174051e9"); // Not a valid hash, has a non-hex-digit. t_nohash!("_ZN3foo17hg5af221e174051e9E", "foo::hg5af221e174051e9"); } #[test] fn demangle_thinlto() { // One element, no hash. t!("_ZN3fooE.llvm.9D1C9369", "foo"); t!("_ZN3fooE.llvm.9D1C9369@@16", "foo"); t_nohash!( "_ZN9backtrace3foo17hbb467fcdaea5d79bE.llvm.A5310EB9", "backtrace::foo" ); } #[test] fn demangle_llvm_ir_branch_labels() { t!("_ZN4core5slice77_$LT$impl$u20$core..ops..index..IndexMut$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$9index_mut17haf9727c2edfbc47bE.exit.i.i", "core::slice:: for [T]>::index_mut::haf9727c2edfbc47b.exit.i.i"); t_nohash!("_ZN4core5slice77_$LT$impl$u20$core..ops..index..IndexMut$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$9index_mut17haf9727c2edfbc47bE.exit.i.i", "core::slice:: for [T]>::index_mut.exit.i.i"); } #[test] fn demangle_ignores_suffix_that_doesnt_look_like_a_symbol() { t_err!("_ZN3fooE.llvm moocow"); } #[test] fn dont_panic() { rust_demangle_c_test_harness::demangle("_ZN2222222222222222222222EE").to_string(); rust_demangle_c_test_harness::demangle("_ZN5*70527e27.ll34csaғE").to_string(); rust_demangle_c_test_harness::demangle("_ZN5*70527a54.ll34_$b.1E").to_string(); rust_demangle_c_test_harness::demangle( "\ _ZN5~saäb4e\n\ 2734cOsbE\n\ 
5usage20h)3\0\0\0\0\0\0\07e2734cOsbE\ ", ) .to_string(); } #[test] fn invalid_no_chop() { t_err!("_ZNfooE"); } #[test] fn handle_assoc_types() { t!("_ZN151_$LT$alloc..boxed..Box$LT$alloc..boxed..FnBox$LT$A$C$$u20$Output$u3d$R$GT$$u20$$u2b$$u20$$u27$a$GT$$u20$as$u20$core..ops..function..FnOnce$LT$A$GT$$GT$9call_once17h69e8f44b3723e1caE", " + 'a> as core::ops::function::FnOnce>::call_once::h69e8f44b3723e1ca"); } #[test] fn handle_bang() { t!( "_ZN88_$LT$core..result..Result$LT$$u21$$C$$u20$E$GT$$u20$as$u20$std..process..Termination$GT$6report17hfc41d0da4a40b3e8E", " as std::process::Termination>::report::hfc41d0da4a40b3e8" ); } #[test] fn demangle_utf8_idents() { t_nohash!( "_ZN11utf8_idents157_$u10e1$$u10d0$$u10ed$$u10db$$u10d4$$u10da$$u10d0$$u10d3$_$u10d2$$u10d4$$u10db$$u10e0$$u10d8$$u10d4$$u10da$$u10d8$_$u10e1$$u10d0$$u10d3$$u10d8$$u10da$$u10d8$17h21634fd5714000aaE", "utf8_idents::საჭმელად_გემრიელი_სადილი" ); } #[test] fn demangle_issue_60925() { t_nohash!( "_ZN11issue_609253foo37Foo$LT$issue_60925..llv$u6d$..Foo$GT$3foo17h059a991a004536adE", "issue_60925::foo::Foo::foo" ); } ================================================ FILE: third-party/rust-demangle/test-harness/tests/top_level.rs ================================================ //! Tests copied from `https://github.com/rust-lang/rustc-demangle`'s //! `src/lib.rs` at `fd906f850f90f6d4845c7b8219d218293e0ab3ed`. //! //! These are the only changes made to the tests: //! * `super::` paths -> `rust_demangle_c_test_harness::` //! * `#[ignore = "stack overflow"]` was added to tests that overflow the stack //! * `#[should_panic]` was added to tests that don't pass yet use rust_demangle_c_test_harness::{assert_contains, assert_ends_with}; macro_rules! t { ($a:expr, $b:expr) => { assert!(ok($a, $b)) }; } macro_rules! t_err { ($a:expr) => { assert!(ok_err($a)) }; } macro_rules! 
t_nohash { ($a:expr, $b:expr) => {{ assert_eq!( format!("{:#}", rust_demangle_c_test_harness::demangle($a)), $b ); }}; } fn ok(sym: &str, expected: &str) -> bool { match rust_demangle_c_test_harness::try_demangle(sym) { Ok(s) => { if s.to_string() == expected { true } else { println!("\n{}\n!=\n{}\n", s, expected); false } } Err(_) => { println!("error demangling"); false } } } fn ok_err(sym: &str) -> bool { match rust_demangle_c_test_harness::try_demangle(sym) { Ok(_) => { println!("succeeded in demangling"); false } Err(_) => rust_demangle_c_test_harness::demangle(sym).to_string() == sym, } } #[test] fn demangle() { t_err!("test"); t!("_ZN4testE", "test"); t_err!("_ZN4test"); t!("_ZN4test1a2bcE", "test::a::bc"); } #[test] fn demangle_dollars() { t!("_ZN4$RP$E", ")"); t!("_ZN8$RF$testE", "&test"); t!("_ZN8$BP$test4foobE", "*test::foob"); t!("_ZN9$u20$test4foobE", " test::foob"); t!("_ZN35Bar$LT$$u5b$u32$u3b$$u20$4$u5d$$GT$E", "Bar<[u32; 4]>"); } #[test] fn demangle_many_dollars() { t!("_ZN13test$u20$test4foobE", "test test::foob"); t!("_ZN12test$BP$test4foobE", "test*test::foob"); } #[test] fn demangle_osx() { t!( "__ZN5alloc9allocator6Layout9for_value17h02a996811f781011E", "alloc::allocator::Layout::for_value::h02a996811f781011" ); t!("__ZN38_$LT$core..option..Option$LT$T$GT$$GT$6unwrap18_MSG_FILE_LINE_COL17haf7cb8d5824ee659E", ">::unwrap::_MSG_FILE_LINE_COL::haf7cb8d5824ee659"); t!("__ZN4core5slice89_$LT$impl$u20$core..iter..traits..IntoIterator$u20$for$u20$$RF$$u27$a$u20$$u5b$T$u5d$$GT$9into_iter17h450e234d27262170E", "core::slice::::into_iter::h450e234d27262170"); } #[test] fn demangle_windows() { t!("ZN4testE", "test"); t!("ZN13test$u20$test4foobE", "test test::foob"); t!("ZN12test$RF$test4foobE", "test&test::foob"); } #[test] fn demangle_elements_beginning_with_underscore() { t!("_ZN13_$LT$test$GT$E", ""); t!("_ZN28_$u7b$$u7b$closure$u7d$$u7d$E", "{{closure}}"); t!("_ZN15__STATIC_FMTSTRE", "__STATIC_FMTSTR"); } #[test] fn demangle_trait_impls() { t!( 
"_ZN71_$LT$Test$u20$$u2b$$u20$$u27$static$u20$as$u20$foo..Bar$LT$Test$GT$$GT$3barE", ">::bar" ); } #[test] fn demangle_without_hash() { let s = "_ZN3foo17h05af221e174051e9E"; t!(s, "foo::h05af221e174051e9"); t_nohash!(s, "foo"); } #[test] fn demangle_without_hash_edgecases() { // One element, no hash. t_nohash!("_ZN3fooE", "foo"); // Two elements, no hash. t_nohash!("_ZN3foo3barE", "foo::bar"); // Longer-than-normal hash. t_nohash!("_ZN3foo20h05af221e174051e9abcE", "foo"); // Shorter-than-normal hash. t_nohash!("_ZN3foo5h05afE", "foo"); // Valid hash, but not at the end. t_nohash!("_ZN17h05af221e174051e93fooE", "h05af221e174051e9::foo"); // Not a valid hash, missing the 'h'. t_nohash!("_ZN3foo16ffaf221e174051e9E", "foo::ffaf221e174051e9"); // Not a valid hash, has a non-hex-digit. t_nohash!("_ZN3foo17hg5af221e174051e9E", "foo::hg5af221e174051e9"); } #[test] fn demangle_thinlto() { // One element, no hash. t!("_ZN3fooE.llvm.9D1C9369", "foo"); t!("_ZN3fooE.llvm.9D1C9369@@16", "foo"); t_nohash!( "_ZN9backtrace3foo17hbb467fcdaea5d79bE.llvm.A5310EB9", "backtrace::foo" ); } #[test] fn demangle_llvm_ir_branch_labels() { t!("_ZN4core5slice77_$LT$impl$u20$core..ops..index..IndexMut$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$9index_mut17haf9727c2edfbc47bE.exit.i.i", "core::slice:: for [T]>::index_mut::haf9727c2edfbc47b.exit.i.i"); t_nohash!("_ZN4core5slice77_$LT$impl$u20$core..ops..index..IndexMut$LT$I$GT$$u20$for$u20$$u5b$T$u5d$$GT$9index_mut17haf9727c2edfbc47bE.exit.i.i", "core::slice:: for [T]>::index_mut.exit.i.i"); } #[test] fn demangle_ignores_suffix_that_doesnt_look_like_a_symbol() { t_err!("_ZN3fooE.llvm moocow"); } #[test] fn dont_panic() { rust_demangle_c_test_harness::demangle("_ZN2222222222222222222222EE").to_string(); rust_demangle_c_test_harness::demangle("_ZN5*70527e27.ll34csaғE").to_string(); rust_demangle_c_test_harness::demangle("_ZN5*70527a54.ll34_$b.1E").to_string(); rust_demangle_c_test_harness::demangle( "\ _ZN5~saäb4e\n\ 2734cOsbE\n\ 
5usage20h)3\0\0\0\0\0\0\07e2734cOsbE\ ", ) .to_string(); } #[test] fn invalid_no_chop() { t_err!("_ZNfooE"); } #[test] fn handle_assoc_types() { t!("_ZN151_$LT$alloc..boxed..Box$LT$alloc..boxed..FnBox$LT$A$C$$u20$Output$u3d$R$GT$$u20$$u2b$$u20$$u27$a$GT$$u20$as$u20$core..ops..function..FnOnce$LT$A$GT$$GT$9call_once17h69e8f44b3723e1caE", " + 'a> as core::ops::function::FnOnce>::call_once::h69e8f44b3723e1ca"); } #[test] fn handle_bang() { t!( "_ZN88_$LT$core..result..Result$LT$$u21$$C$$u20$E$GT$$u20$as$u20$std..process..Termination$GT$6report17hfc41d0da4a40b3e8E", " as std::process::Termination>::report::hfc41d0da4a40b3e8" ); } // FIXME(eddyb) port recursion limits to C. #[ignore = "stack overflow"] #[test] fn limit_recursion() { assert_contains!( rust_demangle_c_test_harness::demangle("_RNvB_1a").to_string(), "{recursion limit reached}" ); assert_contains!( rust_demangle_c_test_harness::demangle("_RMC0RB2_").to_string(), "{recursion limit reached}" ); } // FIXME(eddyb) port the relevant functionality to C. #[ignore = "would slowly use up all RAM before being OOM-killed"] #[test] fn limit_output_oom_hazard() { assert_ends_with!( rust_demangle_c_test_harness::demangle("RYFG_FGyyEvRYFF_EvRYFFEvERLB_B_B_ERLRjB_B_B_") .to_string(), "{size limit reached}" ); } // FIXME(eddyb) port the relevant functionality to C. #[should_panic] #[test] fn limit_output() { // NOTE(eddyb) somewhat reduced version of the above, effectively // ` fn()>` with a larger number of lifetimes in `...`. assert_ends_with!( rust_demangle_c_test_harness::demangle("_RMC0FGZZZ_Eu").to_string(), "{size limit reached}" ); } ================================================ FILE: third-party/rust-demangle/test-harness/tests/v0.rs ================================================ //! Tests copied from `https://github.com/rust-lang/rustc-demangle`'s //! `src/v0.rs` at `fd906f850f90f6d4845c7b8219d218293e0ab3ed`. //! //! These are the only changes made to the tests: //! 
* `::` absolute paths -> `rust_demangle_c_test_harness::` //! * `#[cfg(unsupported_tests)]` was added to tests that couldn't compile //! * `#[ignore = "stack overflow"]` was added to tests that overflow the stack //! * `#[should_panic]` was added to tests that don't pass yet use rust_demangle_c_test_harness::assert_contains; macro_rules! t { ($a:expr, $b:expr) => {{ assert_eq!( format!("{}", rust_demangle_c_test_harness::demangle($a)), $b ); }}; } macro_rules! t_nohash { ($a:expr, $b:expr) => {{ assert_eq!( format!("{:#}", rust_demangle_c_test_harness::demangle($a)), $b ); }}; } macro_rules! t_nohash_type { ($a:expr, $b:expr) => { t_nohash!(concat!("_RMC0", $a), concat!("<", $b, ">")) }; } macro_rules! t_const { ($mangled:expr, $value:expr) => { t_nohash!( concat!("_RIC0K", $mangled, "E"), concat!("::<", $value, ">") ) }; } macro_rules! t_const_suffixed { ($mangled:expr, $value:expr, $value_ty_suffix:expr) => {{ t_const!($mangled, $value); t!( concat!("_RIC0K", $mangled, "E"), concat!("[0]::<", $value, $value_ty_suffix, ">") ); }}; } #[test] fn demangle_crate_with_leading_digit() { t_nohash!("_RNvC6_123foo3bar", "123foo::bar"); } #[test] fn demangle_utf8_idents() { t_nohash!( "_RNqCs4fqI2P2rA04_11utf8_identsu30____7hkackfecea1cbdathfdh9hlq6y", "utf8_idents::საჭმელად_გემრიელი_სადილი" ); } #[test] fn demangle_closure() { t_nohash!( "_RNCNCNgCs6DXkGYLi8lr_2cc5spawn00B5_", "cc::spawn::{closure#0}::{closure#0}" ); t_nohash!( "_RNCINkXs25_NgCsbmNqQUJIY6D_4core5sliceINyB9_4IterhENuNgNoBb_4iter8iterator8Iterator9rpositionNCNgNpB9_6memchr7memrchrs_0E0Bb_", " as core::iter::iterator::Iterator>::rposition::::{closure#0}" ); } #[test] fn demangle_dyn_trait() { t_nohash!( "_RINbNbCskIICzLVDPPb_5alloc5alloc8box_freeDINbNiB4_5boxed5FnBoxuEp6OutputuEL_ECs1iopQbuBiw2_3std", "alloc::alloc::box_free::>" ); } #[test] fn demangle_const_generics_preview() { // NOTE(eddyb) this was hand-written, before rustc had working // const generics support (but the mangling format did include 
them). t_nohash_type!( "INtC8arrayvec8ArrayVechKj7b_E", "arrayvec::ArrayVec" ); t_const_suffixed!("j7b_", "123", "usize"); } #[test] fn demangle_min_const_generics() { t_const!("p", "_"); t_const_suffixed!("hb_", "11", "u8"); t_const_suffixed!("off00ff00ff00ff00ff_", "0xff00ff00ff00ff00ff", "u128"); t_const_suffixed!("s98_", "152", "i16"); t_const_suffixed!("anb_", "-11", "i8"); t_const!("b0_", "false"); t_const!("b1_", "true"); t_const!("c76_", "'v'"); t_const!("c22_", r#"'"'"#); t_const!("ca_", "'\\n'"); t_const!("c2202_", "'∂'"); } #[test] fn demangle_const_str() { t_const!("e616263_", "{*\"abc\"}"); t_const!("e27_", r#"{*"'"}"#); t_const!("e090a_", "{*\"\\t\\n\"}"); t_const!("ee28882c3bc_", "{*\"∂ü\"}"); t_const!( "ee183a1e18390e183ade1839be18394e1839ae18390e183935fe18392e18394e1839b\ e183a0e18398e18394e1839ae183985fe183a1e18390e18393e18398e1839ae18398_", "{*\"საჭმელად_გემრიელი_სადილი\"}" ); t_const!( "ef09f908af09fa688f09fa686f09f90ae20c2a720f09f90b6f09f9192e298\ 95f09f94a520c2a720f09fa7a1f09f929bf09f929af09f9299f09f929c_", "{*\"🐊🦈🦆🐮 § 🐶👒☕🔥 § 🧡💛💚💙💜\"}" ); } // NOTE(eddyb) this uses the same strings as `demangle_const_str` and should // be kept in sync with it - while a macro could be used to generate both // `str` and `&str` tests, from a single list of strings, this seems clearer. 
#[test] fn demangle_const_ref_str() { t_const!("Re616263_", "\"abc\""); t_const!("Re27_", r#""'""#); t_const!("Re090a_", "\"\\t\\n\""); t_const!("Ree28882c3bc_", "\"∂ü\""); t_const!( "Ree183a1e18390e183ade1839be18394e1839ae18390e183935fe18392e18394e1839b\ e183a0e18398e18394e1839ae183985fe183a1e18390e18393e18398e1839ae18398_", "\"საჭმელად_გემრიელი_სადილი\"" ); t_const!( "Ref09f908af09fa688f09fa686f09f90ae20c2a720f09f90b6f09f9192e298\ 95f09f94a520c2a720f09fa7a1f09f929bf09f929af09f9299f09f929c_", "\"🐊🦈🦆🐮 § 🐶👒☕🔥 § 🧡💛💚💙💜\"" ); } #[test] fn demangle_const_ref() { t_const!("Rp", "{&_}"); t_const!("Rh7b_", "{&123}"); t_const!("Rb0_", "{&false}"); t_const!("Rc58_", "{&'X'}"); t_const!("RRRh0_", "{&&&0}"); t_const!("RRRe_", "{&&\"\"}"); t_const!("QAE", "{&mut []}"); } #[test] fn demangle_const_array() { t_const!("AE", "{[]}"); t_const!("Aj0_E", "{[0]}"); t_const!("Ah1_h2_h3_E", "{[1, 2, 3]}"); t_const!("ARe61_Re62_Re63_E", "{[\"a\", \"b\", \"c\"]}"); t_const!("AAh1_h2_EAh3_h4_EE", "{[[1, 2], [3, 4]]}"); } #[test] fn demangle_const_tuple() { t_const!("TE", "{()}"); t_const!("Tj0_E", "{(0,)}"); t_const!("Th1_b0_E", "{(1, false)}"); t_const!( "TRe616263_c78_RAh1_h2_h3_EE", "{(\"abc\", 'x', &[1, 2, 3])}" ); } #[test] fn demangle_const_adt() { t_const!( "VNvINtNtC4core6option6OptionjE4NoneU", "{core::option::Option::::None}" ); t_const!( "VNvINtNtC4core6option6OptionjE4SomeTj0_E", "{core::option::Option::::Some(0)}" ); t_const!( "VNtC3foo3BarS1sRe616263_2chc78_5sliceRAh1_h2_h3_EE", "{foo::Bar { s: \"abc\", ch: 'x', slice: &[1, 2, 3] }}" ); } #[test] fn demangle_exponential_explosion() { // NOTE(eddyb) because of the prefix added by `t_nohash_type!` is // 3 bytes long, `B2_` refers to the start of the type, not `B_`. // 6 backrefs (`B8_E` through `B3_E`) result in 2^6 = 64 copies of `_`. // Also, because the `p` (`_`) type is after all of the starts of the // backrefs, it can be replaced with any other type, independently. 
t_nohash_type!( concat!("TTTTTT", "p", "B8_E", "B7_E", "B6_E", "B5_E", "B4_E", "B3_E"), "((((((_, _), (_, _)), ((_, _), (_, _))), (((_, _), (_, _)), ((_, _), (_, _)))), \ ((((_, _), (_, _)), ((_, _), (_, _))), (((_, _), (_, _)), ((_, _), (_, _))))), \ (((((_, _), (_, _)), ((_, _), (_, _))), (((_, _), (_, _)), ((_, _), (_, _)))), \ ((((_, _), (_, _)), ((_, _), (_, _))), (((_, _), (_, _)), ((_, _), (_, _))))))" ); } #[test] fn demangle_thinlto() { t_nohash!("_RC3foo.llvm.9D1C9369", "foo"); t_nohash!("_RC3foo.llvm.9D1C9369@@16", "foo"); t_nohash!("_RNvC9backtrace3foo.llvm.A5310EB9", "backtrace::foo"); } #[test] fn demangle_extra_suffix() { // From alexcrichton/rustc-demangle#27: t_nohash!( "_RNvNtNtNtNtCs92dm3009vxr_4rand4rngs7adapter9reseeding4fork23FORK_HANDLER_REGISTERED.0.0", "rand::rngs::adapter::reseeding::fork::FORK_HANDLER_REGISTERED.0.0" ); } // FIXME(eddyb) get this working with the `rust-demangle.c` test harness. #[cfg(unsupported_tests)] #[test] fn demangling_limits() { // Stress tests found via fuzzing. for sym in include_str!("v0-large-test-symbols/early-recursion-limit") .lines() .filter(|line| !line.is_empty() && !line.starts_with('#')) { assert_eq!( super::demangle(sym).map(|_| ()), Err(super::ParseError::RecursedTooDeep) ); } assert_contains!( ::demangle( "RIC20tRYIMYNRYFG05_EB5_B_B6_RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRR\ RRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRB_E", ) .to_string(), "{recursion limit reached}" ); } // FIXME(eddyb) get this working with the `rust-demangle.c` test harness. #[cfg(unsupported_tests)] #[test] fn recursion_limit_leaks() { // NOTE(eddyb) this test checks that both paths and types support the // recursion limit correctly, i.e. matching `push_depth` and `pop_depth`, // and don't leak "recursion levels" and trip the limit. // The test inputs are generated on the fly, using a repeated pattern, // as hardcoding the actual strings would be too verbose. 
// Also, `MAX_DEPTH` can be directly used, instead of assuming its value. for &(sym_leaf, expected_leaf) in &[("p", "_"), ("Rp", "&_"), ("C1x", "x")] { let mut sym = format!("_RIC0p"); let mut expected = format!("::<_"); for _ in 0..(super::MAX_DEPTH * 2) { sym.push_str(sym_leaf); expected.push_str(", "); expected.push_str(expected_leaf); } sym.push('E'); expected.push('>'); t_nohash!(&sym, expected); } } // FIXME(eddyb) port recursion limits to C. #[ignore = "stack overflow"] #[test] fn recursion_limit_backref_free_bypass() { // NOTE(eddyb) this test checks that long symbols cannot bypass the // recursion limit by not using backrefs, and cause a stack overflow. // This value was chosen to be high enough that stack overflows were // observed even with `cargo test --release`. let depth = 100_000; // In order to hide the long mangling from the initial "shallow" parse, // it's nested in an identifier (crate name), preceding its use. let mut sym = format!("_RIC{}", depth); let backref_start = sym.len() - 2; for _ in 0..depth { sym.push('R'); } // Write a backref to just after the length of the identifier. sym.push('B'); sym.push(char::from_digit((backref_start - 1) as u32, 36).unwrap()); sym.push('_'); // Close the `I` at the start. sym.push('E'); assert_contains!( rust_demangle_c_test_harness::demangle(&sym).to_string(), "{recursion limit reached}" ); } ================================================ FILE: third-party/tbb/.bazelrc ================================================ # Copyright (c) 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. # DISCLAIMER: Bazel support is community-based. The maintainers do not # use Bazel internally. The Bazel build can have security risks or # optimization gaps. build --symlink_prefix=/ # Out of source build ================================================ FILE: third-party/tbb/.bazelversion ================================================ 7.2.1 ================================================ FILE: third-party/tbb/.gitattributes ================================================ # Set the default behavior, in case people don't have core.autocrlf set. * text=auto # Explicitly declare text files you want to always be normalized and converted # to native line endings on checkout. *.c text *.h text *.cpp text *.def text *.rc text *.i text *.sh text *.csh text *.mk text *.java text *.csv text *.lst text *.asm text *.cfg text *.css text *.inc text *.js text *.rb text *.strings text *.txt text *export.lst text *.xml text *.py text *.md text *.classpath text *.cproject text *.project text *.properties text *.java text *.gradle text # Declare files that will always have CRLF line endings on checkout. *.sln text eol=crlf *.bat text eol=crlf # Denote all files that are truly binary and should not be modified. *.png binary *.jpg binary *.ico binary *.spir binary ================================================ FILE: third-party/tbb/.github/CODEOWNERS ================================================ # Lines starting with '#' are comments. # Each line is a file pattern followed by one or more owners. 
# More details are here: https://help.github.com/articles/about-codeowners/ src/tbbmalloc @ldorau @lplewa @kfilipek src/tbbmalloc_proxy @ldorau @lplewa @kfilipek ================================================ FILE: third-party/tbb/.github/ISSUE_TEMPLATE/1_question.md ================================================ --- name: Ask a question about: Use this template for any questions title: '' labels: 'question' assignees: '' --- ================================================ FILE: third-party/tbb/.github/ISSUE_TEMPLATE/2_bug_report.md ================================================ --- name: Report a bug or a performance issue about: Use this template to report unexpected behavior title: '' labels: 'bug' assignees: '' --- # Summary Provide a short summary of the issue. See the sections below for factors important for the reproduction of an issue. # Version Report oneTBB version used to reproduce the problem. # Environment Provide any environmental details that you consider significant for reproducing the issue. The following information is important: * Hardware * OS name and version * Compiler version # Observed Behavior Document behavior you observe. # Expected Behavior Document behavior you expect. # Steps To Reproduce Check that the issue is reproducible with the latest revision of the master branch. Include all the steps to reproduce the issue. ================================================ FILE: third-party/tbb/.github/ISSUE_TEMPLATE/3_feature_request.md ================================================ --- name: Request a feature about: Use this template to request new functionality or change the behavior of the library title: '' labels: 'new feature' assignees: '' --- # Summary Include a short summary of the request. See the sections below for factors important for a feature request. # Problem Statement Describe the problem you want to solve with a reasonable level of detail. # Preferred Solution Provide your ideas regarding problem solutions. 
================================================ FILE: third-party/tbb/.github/ISSUE_TEMPLATE/4_documentation.md ================================================ --- name: Request a documentation change about: Use this template to report a documentation issue or request documentation changes title: '' labels: 'documentation' assignees: '' --- # Summary Include a short summary of the issue or request. See the sections below for factors important for a documentation issue. # URLs Include pointers to documents that are impacted. # Additional Details Provide a detailed description of the expected changes in documentation and any suggestions you have. ================================================ FILE: third-party/tbb/.github/dependabot.yml ================================================ version: 2 updates: - package-ecosystem: "github-actions" directory: "/" schedule: interval: "weekly" ================================================ FILE: third-party/tbb/.github/issue_labeler.yml ================================================ # Copyright (c) 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
# PR template regexp's for issue labeler bug fix: - '\[(x|X)\]\sbug\sfix' enhancement: - '\[(x|X)\]\snew\sfeature' tests: - '\[(x|X)\]\stests' infrastructure: - '\[(x|X)\]\sinfrastructure' documentation: - '\[(x|X)\]\sdocumentation' allocator: - '\[(x|X)\]\sallocator' ================================================ FILE: third-party/tbb/.github/labeler.yml ================================================ # Copyright (c) 2023-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. allocator: - changed-files: - any-glob-to-any-file: ['src/tbbmalloc/**/*', 'src/tbbmalloc_proxy/**/*', 'test/tbbmalloc/**/*'] ================================================ FILE: third-party/tbb/.github/pull_request_template.md ================================================ ### Description _Add a comprehensive description of proposed changes_ Fixes # - _issue number(s) if exists_ ### Type of change _Choose one or multiple, leave empty if none of the other choices apply_ _Add a respective label(s) to PR if you have permissions_ - [ ] bug fix - _change that fixes an issue_ - [ ] new feature - _change that adds functionality_ - [ ] tests - _change in tests_ - [ ] infrastructure - _change in infrastructure and CI_ - [ ] documentation - _documentation update_ ### Tests - [ ] added - _required for new features and some bug fixes_ - [ ] not needed ### Documentation - [ ] updated in # - _add PR number_ - [ ] needs to be updated - [ ] not needed ### Breaks backward compatibility - [ ] Yes - [ ] 
No - [ ] Unknown ### Notify the following users _List users with `@` to send notifications_ ### Other information ================================================ FILE: third-party/tbb/.github/scripts/codespell.sh ================================================ #!/bin/bash # # Copyright (c) 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. SCAN_TARGET=$1 SKIP_PATTERN='*/.github/*' # Ignored cases IGNORE_COMMAND="sed -e /.*\\sOd\\s*=.*/d \ -e /.*\\sOt\\s*=.*/d \ -e /.*\\siff\\s*=.*/d \ -e /.*\\sith\\s*=.*/d \ -e /.*\\scas\\s*=.*/d \ -e /.*\\sCAS\\s*=.*/d \ -e /.*\\ssom\\s*=.*/d \ -e /.*\\sSOM\\s*=.*/d \ -e /.*\\suint\\s*=.*/d \ -e /.*\\sUINT\\s*=.*/d \ -e /.*\\scopyable\\s*=.*/d \ -e /.*\\sCopyable\\s*=.*/d \ -e /.*\\sFo\\s*=.*/d \ -e /.*pipeline_filters.h.*nd\\s*=.*/d \ -e /.*ittnotify.h.*unx\\s*=.*/d \ -e /.*bzlib.cpp.*MSDOS\\s*=.*/d \ -e /.*test_task.cpp.*tE\\s*=.*/d \ -e /.*backend.cpp.*resSize\\s*=.*/d \ -e /.*test_join_node.h.*Ned\\s*=.*/d \ -e /.*test_indexer_node.cpp.*OT\\s*=.*/d \ -e /.*allocator_stl_test_common.h.*Aci*\\s*=.*/d \ -e /.*seismic_video.cpp.*DialogBox\\s*=.*/d \ -e /.*test_composite_node.cpp.*que\\s*=.*/d \ -e /.*blocksort.cpp.*hiSt\\s*=.*/d \ -e /.*compress.cpp.*fave\\s*=.*/d \ -e /.*count_strings.cpp.*ue\\s*=.*/d \ -e /.*count_strings.cpp.*nd\\s*=.*/d \ -e /.*count_strings.cpp.*ths\\s*=.*/d \ -e /.*polygon_overlay\/README.md.*ist\\s*=.*/d \ -e /.*_pipeline_filters.h.*nd\\s*=.*/d \ -e /.*sub_string_finder\/README.md.*ba\\s*=.*/d" 
SCAN_RESULT=`codespell --quiet-level=2 --skip "${SKIP_PATTERN}" ${SCAN_TARGET}` SCAN_RESULT=`echo -e "${SCAN_RESULT}" | ${IGNORE_COMMAND}` echo "${SCAN_RESULT}" if [[ ! -z ${SCAN_RESULT} ]]; then exit 1 fi ================================================ FILE: third-party/tbb/.github/workflows/ci.yml ================================================ # Copyright (c) 2021-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. name: oneTBB CI on: push: branches: [master] pull_request: branches: [master] types: - opened - synchronize - reopened permissions: read-all env: BUILD_CONCURRENCY: 2 MACOS_BUILD_CONCURRENCY: 3 TEST_TIMEOUT: 180 jobs: codespell: runs-on: [ubuntu-20.04] timeout-minutes: 10 steps: - uses: actions/checkout@v4 - name: Run scan run: | sudo apt update && sudo apt install -y codespell ${GITHUB_WORKSPACE}/.github/scripts/codespell.sh `pwd` examples_clang-format: runs-on: [ubuntu-20.04] timeout-minutes: 10 steps: - uses: actions/checkout@v4 - name: Run scan run: | command -v clang-format-10 cp -r examples examples_formatted find examples_formatted -regex '.*\.\(cpp\|hpp\)' -exec clang-format-10 -style=file -i {} \; diff -r examples examples_formatted documentation: needs: [codespell] env: BUILD_TYPE: oss runs-on: [ubuntu-22.04] timeout-minutes: 10 steps: - uses: actions/checkout@v4 - name: Install prerequisites run: | pip3 install -U Jinja2 pip3 install git+https://github.com/executablebooks/sphinx-book-theme.git pip3 install sphinx-tabs echo 
GITHUB_SHA_SHORT=${GITHUB_SHA::8} >> $GITHUB_ENV mkdir html - name: Build documentation run: | export BUILD_TYPE=${BUILD_TYPE} && sphinx-build doc html tar -czvf html.tar.gz html/ - name: Save docs uses: actions/upload-artifact@v4 with: name: oneTBB-html-docs-${{ env.GITHUB_SHA_SHORT }} path: html.tar.gz pages: if: ${{ github.ref == 'refs/heads/master' }} permissions: contents: write pages: write id-token: write runs-on: ubuntu-latest needs: [documentation] steps: - name: Checkout gh-pages uses: actions/checkout@v4 with: ref: gh-pages path: gh-pages - name: Set env run: echo GITHUB_SHA_SHORT=${GITHUB_SHA::8} >> $GITHUB_ENV - name: Download documentation uses: actions/download-artifact@v4 with: name: oneTBB-html-docs-${{ env.GITHUB_SHA_SHORT }} - name: Publish to github pages run: | tar -xvf html.tar.gz cd gh-pages rm -rf * touch .nojekyll # https://github.blog/2009-12-29-bypassing-jekyll-on-github-pages/ cp -r ../html/* . git config user.name github-actions git config user.email github-actions@github.com git add . git commit --reset-author --amend -m "Update from GitHub Actions" git push --force origin gh-pages copyright_check: if: ${{ github.ref != 'refs/heads/master' }} runs-on: [ubuntu-20.04] steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Run check run: | sed -i \ -e "/Copyright (c) .* Intel Corporation/s/ \([0-9]\+\)[-0-9]*/ \\1-$(date +%Y)/" \ -e "/Copyright (c) .* Intel Corporation/s/$(date +%Y)-$(date +%Y)/$(date +%Y)/" \ $(git diff --diff-filter=d --name-only ${{ github.event.pull_request.base.sha }}) git checkout -- third-party-programs.txt git diff > years.diff if [[ -s years.diff ]]; then echo "Wrong copyright years" cat years.diff exit 1 fi python_module_test_ubuntu_latest: runs-on: [ubuntu-latest] timeout-minutes: 15 steps: - uses: actions/checkout@v4 - name: Run testing run: | mkdir build && cd build cmake -DTBB4PY_BUILD=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc ..
make VERBOSE=1 -j${BUILD_CONCURRENCY} python_build ctest -R python_test --output-on-failure --timeout ${TEST_TIMEOUT} linux-testing: name: ${{ matrix.os }}_${{ matrix.cxx_compiler }}_cxx${{ matrix.std }}_${{ matrix.build_type }}_preview=${{ matrix.preview }}${{ matrix.cmake_static }} runs-on: ['${{ matrix.os }}'] timeout-minutes: 45 strategy: fail-fast: false matrix: include: - os: ubuntu-latest c_compiler: gcc cxx_compiler: g++ std: 14 build_type: relwithdebinfo preview: 'OFF' - os: ubuntu-20.04 c_compiler: gcc cxx_compiler: g++ std: 17 build_type: release preview: 'ON' - os: ubuntu-20.04 c_compiler: gcc-10 cxx_compiler: g++-10 std: 20 build_type: debug preview: 'ON' - os: ubuntu-22.04 c_compiler: gcc-11 cxx_compiler: g++-11 std: 20 build_type: release preview: 'ON' cmake_static: -DBUILD_SHARED_LIBS=OFF steps: - uses: actions/checkout@v4 - name: Run testing shell: bash run: | set -x mkdir build && cd build cmake -DCMAKE_CXX_STANDARD=${{ matrix.std }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ${{ matrix.cmake_static }} \ -DCMAKE_CXX_COMPILER=${{ matrix.cxx_compiler }} -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} -DTBB_CPF=${{ matrix.preview }} .. 
make VERBOSE=1 -j${BUILD_CONCURRENCY} ctest --timeout ${TEST_TIMEOUT} --output-on-failure macos-testing: name: ${{ matrix.os }}_${{ matrix.cxx_compiler }}_cxx${{ matrix.std }}_${{ matrix.build_type }}_preview=${{ matrix.preview }}${{ matrix.cmake_static }} runs-on: ['${{ matrix.os }}'] timeout-minutes: 45 strategy: fail-fast: false matrix: include: - os: macos-14 c_compiler: clang cxx_compiler: clang++ std: 14 build_type: relwithdebinfo preview: 'ON' - os: macos-13 c_compiler: clang cxx_compiler: clang++ std: 20 build_type: release preview: 'ON' cmake_static: -DBUILD_SHARED_LIBS=OFF steps: - uses: actions/checkout@v4 - name: Run testing shell: bash run: | set -x mkdir build && cd build cmake -DCMAKE_CXX_STANDARD=${{ matrix.std }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ${{ matrix.cmake_static }} \ -DCMAKE_CXX_COMPILER=${{ matrix.cxx_compiler }} -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} -DTBB_CPF=${{ matrix.preview }} .. make VERBOSE=1 -j${MACOS_BUILD_CONCURRENCY} ctest --timeout ${TEST_TIMEOUT} --output-on-failure windows-testing: name: ${{ matrix.job_name }} runs-on: ['${{ matrix.os }}'] timeout-minutes: 45 strategy: fail-fast: false matrix: include: - os: windows-2019 generator: Visual Studio 16 2019 c_compiler: cl cxx_compiler: cl std: 14 build_type: relwithdebinfo preview: 'ON' job_name: windows_cl2019_cxx14_relwithdebinfo_preview=ON - os: windows-2019 generator: Visual Studio 16 2019 c_compiler: cl cxx_compiler: cl std: 20 build_type: release preview: 'ON' job_name: windows_cl2019_cxx20_release_preview=ON-DBUILD_SHARED_LIBS=OFF cmake_static: -DBUILD_SHARED_LIBS=OFF - os: windows-2022 generator: Visual Studio 17 2022 c_compiler: cl cxx_compiler: cl std: 17 build_type: relwithdebinfo preview: 'OFF' job_name: windows_cl2022_cxx17_relwithdebinfo_preview=OFF steps: - uses: actions/checkout@v4 - name: Run testing run: | mkdir build cd build cmake -G "${{ matrix.generator }}" -A x64 -DCMAKE_CXX_STANDARD=${{ matrix.std }} ${{ matrix.cmake_static }} ` 
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DCMAKE_CXX_COMPILER=${{ matrix.cxx_compiler }} ` -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} -DTBB_CPF=${{ matrix.preview }} .. cmake --build . --config ${{ matrix.build_type }} -j -v ctest -C ${{ matrix.build_type }} --timeout ${env:TEST_TIMEOUT} --output-on-failure linux-examples-testing: name: examples_${{ matrix.os }}_${{ matrix.cxx_compiler }}_cxx${{ matrix.std }}_${{ matrix.build_type }}_preview=${{ matrix.preview }} runs-on: ['${{ matrix.os }}'] timeout-minutes: 20 strategy: fail-fast: false matrix: include: - os: ubuntu-latest c_compiler: gcc cxx_compiler: g++ std: 14 build_type: relwithdebinfo preview: 'OFF' - os: ubuntu-20.04 c_compiler: gcc cxx_compiler: g++ std: 17 build_type: release preview: 'ON' - os: ubuntu-20.04 c_compiler: gcc-10 cxx_compiler: g++-10 std: 20 build_type: debug preview: 'ON' steps: - uses: actions/checkout@v4 - name: Run testing shell: bash run: | set -x mkdir build && cd build cmake -DCMAKE_CXX_STANDARD=${{ matrix.std }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DCMAKE_CXX_COMPILER=${{ matrix.cxx_compiler }} -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} \ -DTBB_CPF=${{ matrix.preview }} -DTBB_TEST=OFF -DTBB_EXAMPLES=ON .. cmake --build . -v --target light_test_examples macos-examples-testing: name: examples_${{ matrix.os }}_${{ matrix.cxx_compiler }}_cxx${{ matrix.std }}_${{ matrix.build_type }}_preview=${{ matrix.preview }} runs-on: ['${{ matrix.os }}'] timeout-minutes: 20 strategy: fail-fast: false matrix: include: - os: macos-15 c_compiler: clang cxx_compiler: clang++ std: 14 build_type: relwithdebinfo preview: 'ON' steps: - uses: actions/checkout@v4 - name: Run testing shell: bash run: | set -x mkdir build && cd build cmake -DCMAKE_CXX_STANDARD=${{ matrix.std }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DCMAKE_CXX_COMPILER=${{ matrix.cxx_compiler }} -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} \ -DTBB_CPF=${{ matrix.preview }} -DTBB_TEST=OFF -DTBB_EXAMPLES=ON .. 
cmake --build . -v --target light_test_examples windows-examples-testing: name: ${{ matrix.job_name }} runs-on: ['${{ matrix.os }}'] timeout-minutes: 20 strategy: fail-fast: false matrix: include: - os: windows-2019 generator: Visual Studio 16 2019 c_compiler: cl cxx_compiler: cl std: 14 build_type: relwithdebinfo preview: 'ON' job_name: examples_windows_cl2019_cxx14_relwithdebinfo_preview=ON - os: windows-2022 generator: Visual Studio 17 2022 c_compiler: cl cxx_compiler: cl std: 17 build_type: relwithdebinfo preview: 'OFF' job_name: examples_windows_cl2022_cxx17_relwithdebinfo_preview=OFF steps: - uses: actions/checkout@v4 - name: Run testing run: | mkdir build cd build cmake -G "${{ matrix.generator }}" -A x64 -DCMAKE_CXX_STANDARD=${{ matrix.std }} ` -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DCMAKE_CXX_COMPILER=${{ matrix.cxx_compiler }} ` -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} -DTBB_CPF=${{ matrix.preview }} -DTBB_TEST=OFF -DTBB_EXAMPLES=ON .. cmake --build . -v --target light_test_examples ================================================ FILE: third-party/tbb/.github/workflows/codeql.yml ================================================ # Copyright (c) 2024-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
name: "CodeQL" on: push: branches: [ "master" ] pull_request: branches: [ "master" ] schedule: - cron: '0 0 * * 1' permissions: contents: read jobs: analyze: name: Analyze (${{ matrix.language }}) runs-on: ubuntu-latest # timeout-minutes: permissions: # required for all workflows security-events: write # required to fetch internal or private CodeQL packs packages: read # only required for workflows in private repositories actions: read contents: read strategy: fail-fast: false matrix: language: ["cpp", "python"] steps: - name: Harden Runner uses: step-security/harden-runner@v2.10.4 with: egress-policy: audit - name: Checkout repository uses: actions/checkout@v4 # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL uses: github/codeql-action/init@v3 with: languages: ${{ matrix.language }} # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild uses: github/codeql-action/autobuild@v3.24.10 # If the analyze step fails for one of the languages you are analyzing with # "We were unable to automatically build your code", modify the matrix above # to set the build mode to "manual" for that language. Then modify this step # to build your code. # Command-line programs to run using the OS shell. 
# See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun #- if: matrix.build-mode == 'manual' # shell: bash # run: | # echo 'If you are using a "manual" build mode for one or more of the' \ # 'languages you are analyzing, replace this with the commands to build' \ # 'your code, for example:' # echo ' make bootstrap' # echo ' make release' # exit 1 - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v3 with: category: "/language:${{matrix.language}}" ================================================ FILE: third-party/tbb/.github/workflows/coverity.yml ================================================ # Copyright (c) 2024-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
name: Coverity Scan on: # Only run on push to master branch push: branches: [master] permissions: read-all env: BUILD_CONCURRENCY: 4 COVERITY_PROJECT: oneapi-src%2FoneTBB jobs: coverity_linux: name: Coverity Linux if: github.repository == 'uxlfoundation/oneTBB' runs-on: [ubuntu-latest] steps: - uses: actions/checkout@v4 - name: Download Linux 64 Coverity Tool run: | curl https://scan.coverity.com/download/cxx/linux64 --output ${GITHUB_WORKSPACE}/cov-linux64-tool.tar.gz \ --data "token=${{secrets.COVERITY_TOKEN}}&project=${{env.COVERITY_PROJECT}}" mkdir cov-linux64-tool tar -xzf cov-linux64-tool.tar.gz --strip 1 -C cov-linux64-tool - name: Build with cov-build run: | export PATH="${PWD}/cov-linux64-tool/bin:${PATH}" mkdir build && cd build cmake -DCMAKE_CXX_STANDARD=20 -DCMAKE_BUILD_TYPE=relwithdebinfo \ -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DTBB_CPF=ON .. cov-build --dir cov-int make VERBOSE=1 -j${{env.BUILD_CONCURRENCY}} - name: Archive Coverity build results run: | cd build tar -czvf cov-int.tgz cov-int - name: Submit Coverity results for analysis run: | cd build curl \ --form token="${{ secrets.COVERITY_TOKEN }}" \ --form email="${{ secrets.COVERITY_EMAIL }}" \ --form file=@cov-int.tgz \ --form version="${GITHUB_SHA}" \ --form description="" \ "https://scan.coverity.com/builds?project=${{env.COVERITY_PROJECT}}" ================================================ FILE: third-party/tbb/.github/workflows/issue_labeler.yml ================================================ # Copyright (c) 2023-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. name: "Issue Labeler" on: issues: types: [opened, edited] pull_request: types: [opened, edited] permissions: read-all jobs: triage: runs-on: ubuntu-latest permissions: pull-requests: write issues: write contents: read steps: - uses: github/issue-labeler@v3.4 #May not be the latest version with: repo-token: "${{ secrets.GITHUB_TOKEN }}" configuration-path: .github/issue_labeler.yml enable-versioned-regex: 0 ================================================ FILE: third-party/tbb/.github/workflows/labeler.yml ================================================ # Copyright (c) 2023-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. name: "Pull Request Labeler" on: - pull_request_target permissions: read-all jobs: triage: permissions: contents: read pull-requests: write runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: actions/labeler@v5 with: configuration-path: .github/labeler.yml ================================================ FILE: third-party/tbb/.github/workflows/ossf-scorecard.yml ================================================ # Copyright (c) 2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. name: OSSF Scorecard on: # For Branch-Protection check. Only the default branch is supported. See # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection branch_protection_rule: # To guarantee Maintained check is occasionally updated. See # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained schedule: - cron: '00 02 * * *' push: branches: [ "master" ] # Declare default permissions as read only. permissions: read-all jobs: analysis: name: Scorecard analysis runs-on: ubuntu-latest permissions: # Needed to upload the results to code-scanning dashboard. security-events: write # Needed to publish results and get a badge (see publish_results below). id-token: write # Uncomment the permissions below if installing in a private repository. # contents: read # actions: read steps: - name: "Checkout code" uses: actions/checkout@v4.1.1 with: persist-credentials: false - name: "Run analysis" uses: ossf/scorecard-action@v2.4.0 with: results_file: results.sarif results_format: sarif # (Optional) "write" PAT token. Uncomment the `repo_token` line below if: # - you want to enable the Branch-Protection check on a *public* repository, or # - you are installing Scorecard on a *private* repository # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action?tab=readme-ov-file#authentication-with-fine-grained-pat-optional. # repo_token: ${{ secrets.SCORECARD_TOKEN }} # Public repositories: # - Publish results to OpenSSF REST API for easy access by consumers # - Allows the repository to include the Scorecard badge. 
# - See https://github.com/ossf/scorecard-action#publishing-results. # For private repositories: # - `publish_results` will always be set to `false`, regardless # of the value entered here. publish_results: true # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. #- name: "Upload artifact" # uses: actions/upload-artifact@97a0fba1372883ab732affbe8f94b823f91727db # v3.pre.node20 # with: # name: SARIF file # path: results.sarif # retention-days: 5 # Upload the results to GitHub's code scanning dashboard (optional). # Commenting out will disable upload of results to your repo's Code Scanning dashboard #- name: "Upload to code-scanning" # uses: github/codeql-action/upload-sarif@1b1aada464948af03b950897e5eb522f92603cc2 # v3.24.9 # with: # sarif_file: results.sarif ================================================ FILE: third-party/tbb/.gitignore ================================================ # -------- C++ -------- # Prerequisites *.d # Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.so.* *.dylib *.dll # Fortran module files *.mod *.smod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app # -------- CMake -------- CMakeCache.txt CMakeFiles CMakeScripts Testing Makefile cmake_install.cmake install_manifest.txt compile_commands.json CTestTestfile.cmake build/* # -------- Python -------- __pycache__/ *.py[cod] *$py.class # -------- IDE -------- .vscode/* .vs/* out/* CMakeSettings.json # -------- CTags -------- .tags .ctags ================================================ FILE: third-party/tbb/BUILD.bazel ================================================ # Copyright (c) 2021-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # DISCLAIMER: Bazel support is community-based. The maintainers do not # use Bazel internally. The Bazel build can have security risks or # optimization gaps. package( default_visibility = ["//visibility:public"], ) cc_library( name = "tbb", srcs = glob([ "src/tbb/*.cpp", "src/tbb/*.h", ]) + select({ "@platforms//cpu:x86_64": glob(["src/tbb/tools_api/**/*.h"]), "//conditions:default": [], }), hdrs = glob([ "include/tbb/*.h", "include/oneapi/*.h", "include/oneapi/tbb/*.h", "include/oneapi/tbb/detail/*.h", ]), copts = ["-w"] + select({ "@platforms//os:windows": [""], "@platforms//cpu:arm64": [""], "//conditions:default": ["-mwaitpkg"], }), defines = select({ "@platforms//cpu:x86_64": ["__TBB_NO_IMPLICIT_LINKAGE"], "//conditions:default": [ "USE_PTHREAD", ], }) + select({ "@platforms//os:osx": ["_XOPEN_SOURCE"], "//conditions:default": [], }), includes = [ "include", ], linkopts = select({ "@platforms//os:windows": [], "@platforms//os:linux": [ "-ldl", "-pthread", "-lrt", ], "//conditions:default": ["-pthread"], }), local_defines = select({ "@platforms//cpu:x86_64": [ "__TBB_USE_ITT_NOTIFY", ], "//conditions:default": [], }) + [ "__TBB_BUILD", ], textual_hdrs = select({ "@platforms//cpu:x86_64": [ "src/tbb/tools_api/ittnotify_static.c", ], "//conditions:default": [], }), ) cc_library( name = "tbbmalloc", srcs = glob([ "src/tbbmalloc/*.h", "src/tbb/*.h", "src/tbbmalloc_proxy/*.h", ]) + [ "src/tbbmalloc/backend.cpp", "src/tbbmalloc/backref.cpp", "src/tbbmalloc/frontend.cpp", "src/tbbmalloc/large_objects.cpp", "src/tbbmalloc/tbbmalloc.cpp", ], hdrs = glob([ 
"include/tbb/*.h", "include/oneapi/tbb/detail/*.h", "include/oneapi/tbb/*.h", ]), includes = [ "include", ], local_defines = [ "__TBBMALLOC_BUILD", ], ) cc_library( name = "tbbmalloc_proxy", srcs = [ "src/tbbmalloc_proxy/function_replacement.cpp", "src/tbbmalloc_proxy/proxy.cpp", ], deps = [ ":tbbmalloc", ], ) cc_test( name = "test_mutex", srcs = [ "test/tbb/test_mutex.cpp", "test/tbb/test_mutex.h" ] + glob([ "test/common/*.h", ]), includes = ["test"], deps = [ ":tbb", ], ) cc_test( name = "test_parallel_for", srcs = [ "test/tbb/test_parallel_for.cpp", "test/tbb/test_partitioner.h" ] + glob([ "test/common/*.h", ]), includes = ["test"], deps = [ ":tbb", ], ) cc_test( name = "test_parallel_reduce", srcs = [ "test/tbb/test_parallel_reduce.cpp", ] + glob([ "test/common/*.h", ]), includes = ["test"], deps = [ ":tbb", ], ) cc_test( name = "test_task", srcs = [ "test/tbb/test_task.cpp", ] + glob([ "test/common/*.h", ]), includes = ["test"], deps = [ ":tbb", ], ) ================================================ FILE: third-party/tbb/Bazel.md ================================================ # Bazel* build support The main build system of oneTBB is CMake*. [Bazel*](https://bazel.build/) support is community-based. The Bazel configuration may not include recommended compiler and/or linker flags used in the official CMake configuration. --- **NOTE** Bazel is not recommended for use by oneTBB maintainers. Thus, it is not used internally. --- The Bazel oneTBB build is currently only intended for a subset of oneTBB that suffices restricted use cases. Pull requests to improve the Bazel build experience are welcome. The standard Bazel approach to handling third-party libraries is static linking. It is the best practice within the Bazel ecosystem. ## Using oneTBB as a dependency ### Traditional WORKSPACE approach This example demonstrates how to use oneTBB as a dependency within a Bazel project. 
The following file structure is assumed: ``` example ├── .bazelrc ├── BUILD.bazel ├── main.cpp └── WORKSPACE.bazel ``` _WORKSPACE.bazel_: ```python load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository") git_repository( name = "oneTBB", branch = "master", remote = "https://github.com/uxlfoundation/oneTBB/", ) ``` In the *WORKSPACE* file, the oneTBB GitHub* repository is fetched. _BUILD.bazel_: ```python cc_binary( name = "Demo", srcs = ["main.cpp"], deps = ["@oneTBB//:tbb"], ) ``` The *BUILD* file defines a binary named `Demo` that has a dependency on oneTBB. _main.cpp_: ```c++ #include "oneapi/tbb/version.h" #include <iostream> int main() { std::cout << "Hello from oneTBB " << TBB_VERSION_MAJOR << "." << TBB_VERSION_MINOR << "." << TBB_VERSION_PATCH << "!" << std::endl; return 0; } ``` The expected output of this program is the current version of oneTBB. Switch to the folder with the files created earlier and run the binary with `bazel run //:Demo`. ### Bzlmod If you use Bzlmod, you can fetch oneTBB with the [Bazel Central Registry](https://registry.bazel.build/). Add the following line to your `MODULE.bazel` file: ```bazel bazel_dep(name = "onetbb", version = "2021.13.0") ``` ## Build oneTBB using Bazel Run ```bazel build //...``` in the oneTBB root directory. ## Compiler support The Bazel build uses the compiler flag `-mwaitpkg` in non-Windows* builds. This flag is supported by the GNU* Compiler Collection (GCC) version 9.3, Clang* 12, and newer versions of those tools. --- **NOTE** To use the Bazel build with earlier versions of GCC, remove the `-mwaitpkg` flag as it leads to errors during compilation. --- ================================================ FILE: third-party/tbb/CMakeLists.txt ================================================ # Copyright (c) 2020-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # If the running version of CMake is older than 3.12, the extra ... dots will be seen as version component separators, # resulting in the ...<max> part being ignored and preserving the pre-3.12 behavior of basing policies on <min>. cmake_minimum_required(VERSION 3.5.0...3.31.3) # Enable CMake policies if (POLICY CMP0063) # The NEW behavior for this policy is to honor the visibility properties for all target types. cmake_policy(SET CMP0063 NEW) set(CMAKE_POLICY_DEFAULT_CMP0063 NEW) endif() if (POLICY CMP0068) # RPATH settings do not affect install_name on macOS since CMake 3.9 cmake_policy(SET CMP0068 NEW) endif() if (POLICY CMP0091) # The NEW behavior for this policy is to not place MSVC runtime library flags in the default # CMAKE_<LANG>_FLAGS_<CONFIG> cache entries and use CMAKE_MSVC_RUNTIME_LIBRARY abstraction instead. cmake_policy(SET CMP0091 NEW) elseif (DEFINED CMAKE_MSVC_RUNTIME_LIBRARY) message(FATAL_ERROR "CMAKE_MSVC_RUNTIME_LIBRARY was defined while policy CMP0091 is not available. 
Use CMake 3.15 or newer.") endif() if (POLICY CMP0148) # CMake 3.27: The FindPythonInterp and FindPythonLibs modules are removed cmake_policy(SET CMP0148 OLD) endif() if (TBB_WINDOWS_DRIVER AND (NOT ("${CMAKE_MSVC_RUNTIME_LIBRARY}" STREQUAL MultiThreaded OR "${CMAKE_MSVC_RUNTIME_LIBRARY}" STREQUAL MultiThreadedDebug))) message(FATAL_ERROR "Enabled TBB_WINDOWS_DRIVER requires CMAKE_MSVC_RUNTIME_LIBRARY to be set to MultiThreaded or MultiThreadedDebug.") endif() # Enable support of minimum supported macOS version flag if (APPLE) if (NOT CMAKE_CXX_OSX_DEPLOYMENT_TARGET_FLAG) set(CMAKE_CXX_OSX_DEPLOYMENT_TARGET_FLAG "-mmacosx-version-min=" CACHE STRING "Minimum macOS version flag") endif() if (NOT CMAKE_C_OSX_DEPLOYMENT_TARGET_FLAG) set(CMAKE_C_OSX_DEPLOYMENT_TARGET_FLAG "-mmacosx-version-min=" CACHE STRING "Minimum macOS version flag") endif() endif() file(READ include/oneapi/tbb/version.h _tbb_version_info) string(REGEX REPLACE ".*#define TBB_VERSION_MAJOR ([0-9]+).*" "\\1" _tbb_ver_major "${_tbb_version_info}") string(REGEX REPLACE ".*#define TBB_VERSION_MINOR ([0-9]+).*" "\\1" _tbb_ver_minor "${_tbb_version_info}") string(REGEX REPLACE ".*#define TBB_VERSION_PATCH ([0-9]+).*" "\\1" _tbb_ver_patch "${_tbb_version_info}") string(REGEX REPLACE ".*#define TBB_INTERFACE_VERSION ([0-9]+).*" "\\1" TBB_INTERFACE_VERSION "${_tbb_version_info}") string(REGEX REPLACE ".*#define __TBB_BINARY_VERSION ([0-9]+).*" "\\1" TBB_BINARY_VERSION "${_tbb_version_info}") string(REGEX REPLACE "..(..)." "\\1" TBB_BINARY_MINOR_VERSION "${TBB_INTERFACE_VERSION}") set(TBBMALLOC_BINARY_VERSION 2) set(TBBBIND_BINARY_VERSION 3) project(TBB VERSION ${_tbb_ver_major}.${_tbb_ver_minor}.${_tbb_ver_patch} LANGUAGES CXX) unset(_tbb_ver_major) unset(_tbb_ver_minor) include(CheckCXXCompilerFlag) include(GNUInstallDirs) include(CMakeDependentOption) # --------------------------------------------------------------------------------------------------------- # Handle C++ standard version. 
if (NOT MSVC) # no need to cover MSVC as it uses C++14 by default. if (NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 11) endif() if (CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION) # if standard option was detected by CMake set(CMAKE_CXX_STANDARD_REQUIRED ON) else() # if standard option wasn't detected by CMake (e.g. for Intel Compiler with CMake 3.1) # TBB_CXX_STD_FLAG should be added to targets via target_compile_options set(TBB_CXX_STD_FLAG -std=c++${CMAKE_CXX_STANDARD}) check_cxx_compiler_flag(${TBB_CXX_STD_FLAG} c++${CMAKE_CXX_STANDARD}) if (NOT c++${CMAKE_CXX_STANDARD}) message(FATAL_ERROR "C++${CMAKE_CXX_STANDARD} (${TBB_CXX_STD_FLAG}) support is required") endif() unset(c++${CMAKE_CXX_STANDARD}) endif() endif() set(CMAKE_CXX_EXTENSIONS OFF) # use -std=c++... instead of -std=gnu++... # --------------------------------------------------------------------------------------------------------- # Setup symbol visibility properties. set(CMAKE_VISIBILITY_INLINES_HIDDEN TRUE) set(CMAKE_CXX_VISIBILITY_PRESET "hidden") # --------------------------------------------------------------------------------------------------------- # Detect architecture (bitness). 
if (CMAKE_SIZEOF_VOID_P EQUAL 4) set(TBB_ARCH 32) else() set(TBB_ARCH 64) endif() option(TBB_TEST "Enable testing" ON) option(TBB_EXAMPLES "Enable examples" OFF) option(TBB_STRICT "Treat compiler warnings as errors" ON) option(TBB_WINDOWS_DRIVER "Build as Universal Windows Driver (UWD)" OFF) option(TBB_NO_APPCONTAINER "Apply /APPCONTAINER:NO (for testing binaries for Windows Store)" OFF) option(TBB4PY_BUILD "Enable tbb4py build" OFF) option(TBB_BUILD "Enable tbb build" ON) option(TBBMALLOC_BUILD "Enable tbbmalloc build" ON) cmake_dependent_option(TBBMALLOC_PROXY_BUILD "Enable tbbmalloc_proxy build" ON "TBBMALLOC_BUILD" OFF) option(TBB_CPF "Enable preview features of the library" OFF) option(TBB_FIND_PACKAGE "Enable search for external oneTBB using find_package instead of build from sources" OFF) option(TBB_DISABLE_HWLOC_AUTOMATIC_SEARCH "Disable HWLOC automatic search by pkg-config tool" ${CMAKE_CROSSCOMPILING}) option(TBB_ENABLE_IPO "Enable Interprocedural Optimization (IPO) during the compilation" ON) option(TBB_CONTROL_FLOW_GUARD "Enable Control Flow Guard (CFG) during the compilation" OFF) option(TBB_FUZZ_TESTING "Enable fuzz testing" OFF) option(TBB_INSTALL "Enable installation" ON) option(TBB_FILE_TRIM "Enable __FILE__ trim" ON) if(LINUX) option(TBB_LINUX_SEPARATE_DBG "Enable separation of the debug symbols during the build" OFF) endif() if(APPLE) option(TBB_BUILD_APPLE_FRAMEWORKS "Build as Apple Frameworks" OFF) endif() if (NOT DEFINED BUILD_SHARED_LIBS) set(BUILD_SHARED_LIBS ON) endif() if (NOT BUILD_SHARED_LIBS) if(NOT DEFINED CMAKE_POSITION_INDEPENDENT_CODE) set(CMAKE_POSITION_INDEPENDENT_CODE ON) endif() message(WARNING "You are building oneTBB as a static library. This is highly discouraged and such configuration is not supported. 
Consider building a dynamic library to avoid unforeseen issues.") endif() if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) set(CMAKE_BUILD_TYPE RelWithDebInfo CACHE STRING "Build type" FORCE) message(STATUS "CMAKE_BUILD_TYPE is not specified. Using default: ${CMAKE_BUILD_TYPE}") # Possible values of build type for cmake-gui set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") endif() if (CMAKE_BUILD_TYPE) string(TOLOWER ${CMAKE_BUILD_TYPE} _tbb_build_type) if (_tbb_build_type STREQUAL "debug") set(TBB_ENABLE_IPO OFF) endif() unset(_tbb_build_type) endif() # ------------------------------------------------------------------- # Files and folders naming set(CMAKE_DEBUG_POSTFIX _debug) if (NOT DEFINED TBB_OUTPUT_DIR_BASE) if (MSVC) if (NOT DEFINED CMAKE_MSVC_RUNTIME_LIBRARY OR CMAKE_MSVC_RUNTIME_LIBRARY MATCHES DLL) set(_tbb_msvc_runtime _md) else() set(_tbb_msvc_runtime _mt) endif() if (WINDOWS_STORE) if (TBB_NO_APPCONTAINER) set(_tbb_win_store _wsnoappcont) else() set(_tbb_win_store _ws) endif() elseif(TBB_WINDOWS_DRIVER) set(_tbb_win_store _wd) endif() endif() # Keep only the major.minor (or major) part of the compiler version; the dot is written as \\. so the regex matches it literally. string(REGEX MATCH "^([0-9]+\\.[0-9]+|[0-9]+)" _tbb_compiler_version_short ${CMAKE_CXX_COMPILER_VERSION}) string(TOLOWER ${CMAKE_CXX_COMPILER_ID}_${_tbb_compiler_version_short}_cxx${CMAKE_CXX_STANDARD}_${TBB_ARCH}${_tbb_msvc_runtime}${_tbb_win_store} TBB_OUTPUT_DIR_BASE) unset(_tbb_msvc_runtime) unset(_tbb_win_store) unset(_tbb_compiler_version_short) endif() foreach(output_type LIBRARY ARCHIVE PDB RUNTIME) if (CMAKE_BUILD_TYPE) string(TOLOWER ${CMAKE_BUILD_TYPE} _tbb_build_type_lower) set(CMAKE_${output_type}_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/${TBB_OUTPUT_DIR_BASE}_${_tbb_build_type_lower}) unset(_tbb_build_type_lower) endif() if (CMAKE_CONFIGURATION_TYPES) foreach(suffix ${CMAKE_CONFIGURATION_TYPES}) string(TOUPPER ${suffix} _tbb_suffix_upper) string(TOLOWER ${suffix} _tbb_suffix_lower) 
set(CMAKE_${output_type}_OUTPUT_DIRECTORY_${_tbb_suffix_upper} ${CMAKE_BINARY_DIR}/${TBB_OUTPUT_DIR_BASE}_${_tbb_suffix_lower}) endforeach() unset(_tbb_suffix_lower) unset(_tbb_suffix_upper) endif() endforeach() if (CMAKE_CONFIGURATION_TYPES) # We can't use generator expressions in a cmake variable name. Build the per-configuration path directly, using the lower-cased configuration name to match the output directories set above. set(TBB_TEST_WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/${TBB_OUTPUT_DIR_BASE}_$<LOWER_CASE:$<CONFIG>>) else() set(TBB_TEST_WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) endif() # ------------------------------------------------------------------- # ------------------------------------------------------------------- # Common dependencies #force -pthread during compilation for Emscripten if (EMSCRIPTEN AND NOT EMSCRIPTEN_WITHOUT_PTHREAD) set(THREADS_HAVE_PTHREAD_ARG TRUE) endif() set(THREADS_PREFER_PTHREAD_FLAG TRUE) find_package(Threads REQUIRED) # ------------------------------------------------------------------- file(GLOB FILES_WITH_EXTRA_TARGETS ${CMAKE_CURRENT_SOURCE_DIR}/cmake/*.cmake) foreach(FILE_WITH_EXTRA_TARGETS ${FILES_WITH_EXTRA_TARGETS}) include(${FILE_WITH_EXTRA_TARGETS}) endforeach() # - Enabling LTO on Android causes the NDK bug. # NDK throws the warning: "argument unused during compilation: '-Wa,--noexecstack'" # - For some reason GCC does not instrument code with Thread Sanitizer when lto is enabled and C linker is used. 
if (TBB_ENABLE_IPO AND BUILD_SHARED_LIBS AND NOT ANDROID_PLATFORM AND NOT TBB_SANITIZE MATCHES "thread") if (NOT CMAKE_VERSION VERSION_LESS 3.9) cmake_policy(SET CMP0069 NEW) include(CheckIPOSupported) check_ipo_supported(RESULT TBB_IPO_PROPERTY) else() set(TBB_IPO_FLAGS TRUE) endif() if (TBB_IPO_PROPERTY OR TBB_IPO_FLAGS) message(STATUS "IPO enabled") endif() endif() if (TBB_FILE_TRIM) file(RELATIVE_PATH TBB_RELATIVE_BIN_PATH ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_SOURCE_DIR}) file(TO_NATIVE_PATH ${CMAKE_SOURCE_DIR} NATIVE_TBB_PROJECT_ROOT_DIR) file(TO_NATIVE_PATH ${TBB_RELATIVE_BIN_PATH} NATIVE_TBB_RELATIVE_BIN_PATH) endif () if (TBB_CONTROL_FLOW_GUARD) message(STATUS "Control Flow Guard (CFG) enabled") endif() set(TBB_COMPILER_SETTINGS_FILE ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compilers/${CMAKE_CXX_COMPILER_ID}.cmake) if (EXISTS ${TBB_COMPILER_SETTINGS_FILE}) include(${TBB_COMPILER_SETTINGS_FILE}) else() message(WARNING "TBB compiler settings not found ${TBB_COMPILER_SETTINGS_FILE}") endif() if (TBB_FIND_PACKAGE AND TBB_DIR) # Allow specifying external TBB to test with. # Do not add main targets and installation instructions in that case. 
message(STATUS "Using external TBB for testing") find_package(TBB REQUIRED) else() if (TBB_BUILD) add_subdirectory(src/tbb) endif() if (TBBMALLOC_BUILD) add_subdirectory(src/tbbmalloc) if(TBBMALLOC_PROXY_BUILD AND NOT "${MSVC_CXX_ARCHITECTURE_ID}" MATCHES "ARM64") add_subdirectory(src/tbbmalloc_proxy) endif() endif() if (NOT BUILD_SHARED_LIBS) message(STATUS "TBBBind build targets are disabled due to unsupported environment") else() add_subdirectory(src/tbbbind) endif() if (TBB_INSTALL) # ------------------------------------------------------------------- # Installation instructions include(CMakePackageConfigHelpers) install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} COMPONENT devel) install(EXPORT ${PROJECT_NAME}Targets NAMESPACE TBB:: DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME} COMPONENT devel) file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake "include(\${CMAKE_CURRENT_LIST_DIR}/${PROJECT_NAME}Targets.cmake)\n") if (NOT BUILD_SHARED_LIBS) file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake "include(CMakeFindDependencyMacro)\nfind_dependency(Threads)\n") endif() write_basic_package_version_file("${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" COMPATIBILITY AnyNewerVersion) install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME} COMPONENT devel) install(FILES "README.md" DESTINATION ${CMAKE_INSTALL_DOCDIR} COMPONENT devel) # ------------------------------------------------------------------- endif() endif() if (TBB_TEST) enable_testing() add_subdirectory(test) endif() if (TBB_EXAMPLES) add_subdirectory(examples) endif() if (TBB_BENCH) if (NOT EXISTS ${CMAKE_CURRENT_LIST_DIR}/benchmark) message(FATAL_ERROR "Benchmarks are not supported yet") endif() enable_testing() add_subdirectory(benchmark) endif() if (ANDROID_PLATFORM) if ("${ANDROID_STL}" STREQUAL 
"c++_shared") if (${ANDROID_NDK_MAJOR} GREATER_EQUAL "25") if(ANDROID_ABI STREQUAL "arm64-v8a") set(ANDROID_TOOLCHAIN_NAME "aarch64-linux-android") elseif(ANDROID_ABI STREQUAL "x86_64") set(ANDROID_TOOLCHAIN_NAME "x86_64-linux-android") elseif(ANDROID_ABI STREQUAL "armeabi-v7a") set(ANDROID_TOOLCHAIN_NAME "arm-linux-androideabi") elseif(ANDROID_ABI STREQUAL "x86") set(ANDROID_TOOLCHAIN_NAME "i686-linux-android") endif() configure_file( "${ANDROID_TOOLCHAIN_ROOT}/sysroot/usr/lib/${ANDROID_TOOLCHAIN_NAME}/libc++_shared.so" "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libc++_shared.so" COPYONLY) else() configure_file( "${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ABI}/libc++_shared.so" "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libc++_shared.so" COPYONLY) endif() endif() # This custom target may be implemented without separate CMake script, but it requires # ADB(Android Debug Bridge) executable file availability, so to incapsulate this requirement # only for corresponding custom target, it was implemented by this way. add_custom_target(device_environment_cleanup COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/android/device_environment_cleanup.cmake) endif() if (TBB4PY_BUILD) add_subdirectory(python) endif() # Keep it the last instruction. add_subdirectory(cmake/post_install) ================================================ FILE: third-party/tbb/CODEOWNERS ================================================ # Where component owners are known, add them here. 
/oneTBB/src/tbb/ @pavelkumbrasev /oneTBB/src/tbb/ @dnmokhov /oneTBB/src/tbb/ @sarathnandu /oneTBB/include/oneapi/tbb/parallel_* @pavelkumbrasev /oneTBB/include/oneapi/tbb/concurrent_* @kboyarinov /oneTBB/include/oneapi/tbb/flow_graph* @kboyarinov /oneTBB/include/oneapi/tbb/flow_graph* @aleksei-fedotov /oneTBB/include/oneapi/tbb/detail/_flow_graph* @kboyarinov /oneTBB/include/oneapi/tbb/detail/_flow_graph* @aleksei-fedotov /oneTBB/include/oneapi/tbb/detail/_concurrent* @kboyarinov /oneTBB/src/doc @aepanchi /oneTBB/src/tbbbind/ @isaevil /oneTBB/src/tbbmalloc/ @lplewa /oneTBB/src/tbbmalloc_proxy/ @lplewa /oneTBB/cmake/ @isaevil /oneTBB/*CMakeLists.txt @isaevil /oneTBB/python/ @sarathnandu /oneTBB/python/ @isaevil # Bazel build related files. /oneTBB/.bazelversion @Vertexwahn /oneTBB/Bazel.md @Vertexwahn /oneTBB/BUILD.bazel @Vertexwahn /oneTBB/MODULE.bazel @Vertexwahn ================================================ FILE: third-party/tbb/CODE_OF_CONDUCT.md ================================================ # Contributor Covenant Code of Conduct ## Our Pledge We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, caste, color, religion, or sexual identity and orientation. We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. 
## Our Standards Examples of behavior that contributes to a positive environment for our community include: * Demonstrating empathy and kindness toward other people * Being respectful of differing opinions, viewpoints, and experiences * Giving and gracefully accepting constructive feedback * Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience * Focusing on what is best not just for us as individuals, but for the overall community Examples of unacceptable behavior include: * The use of sexualized language or imagery, and sexual attention or advances of any kind * Trolling, insulting or derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or email address, without their explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Enforcement Responsibilities Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. ## Scope This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official email address, posting via an official social media account, or acting as an appointed representative at an online or offline event. 
## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at oneTBBCodeOfConduct At intel DOT com. All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the reporter of any incident. ## Enforcement Guidelines Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: ### 1. Correction **Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. **Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested. ### 2. Warning **Community Impact**: A violation through a single incident or series of actions. **Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban. ### 3. Temporary Ban **Community Impact**: A serious violation of community standards, including sustained inappropriate behavior. **Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. ### 4. 
Permanent Ban **Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. **Consequence**: A permanent ban from any sort of public interaction within the community. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.1, available at [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. For answers to common questions about this code of conduct, see the FAQ at [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at [https://www.contributor-covenant.org/translations][translations]. [homepage]: https://www.contributor-covenant.org [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html [Mozilla CoC]: https://github.com/mozilla/diversity [FAQ]: https://www.contributor-covenant.org/faq [translations]: https://www.contributor-covenant.org/translations ================================================ FILE: third-party/tbb/CONTRIBUTING.md ================================================ # How to Contribute As an open source project, we welcome community contributions to oneAPI Threading Building Blocks (oneTBB). This document explains how to participate in project conversations, log bugs and enhancement requests, and submit code patches to the project. ## Licensing Licensing is very important to open source projects. It helps ensure the software continues to be available under the terms that the author desired. The oneTBB project uses the [Apache 2.0 License](https://github.com/uxlfoundation/oneTBB/blob/master/LICENSE.txt), a permissive open source license that allows you to freely use, modify, and distribute your own products that include Apache 2.0 licensed software. 
By contributing to the oneTBB project, you agree to the license and copyright terms therein and release your own contributions under these terms. Some imported or reused components within oneTBB use other licenses, as described in [third-party-programs.txt](https://github.com/uxlfoundation/oneTBB/blob/master/third-party-programs.txt). By carefully reviewing potential contributions, we can ensure that the community can develop products with oneTBB without concerns over patent or copyright issues. ## Prerequisites As a contributor, you'll want to be familiar with the oneTBB project and the repository layout. You should also know how to use it as explained in the [oneTBB documentation](https://uxlfoundation.github.io/oneTBB/) and how to set up your build development environment to configure, build, and test oneTBB as explained in the [oneTBB Build System Description](cmake/README.md). ## Pull Requests You can find all [open oneTBB pull requests](https://github.com/uxlfoundation/oneTBB/pulls) on GitHub. ### Before contributing changes directly to the oneTBB repository * Make sure you can build the product and run all the tests with your patch. * For a larger feature, provide a relevant test. * Document your code. The oneTBB project uses reStructuredText for documentation. * Update the copyright year in the first line of the changing file(s). For example, if you commit your changes in 2022: * the copyright year should be `2005-2022` for existing files * the copyright year should be `2022` for new files * Submit a pull request into the master branch. You can submit changes with a pull request (preferred) or by sending an email patch. Continuous Integration (CI) testing is enabled for the repository. Your pull request must pass all checks before it can be merged. We will review your contribution and may provide feedback to guide you if any additional fixes or modifications are necessary. 
When reviewed and accepted, your pull request will be merged into our GitHub repository. ================================================ FILE: third-party/tbb/INSTALL.md ================================================ # Installation from Sources ## Prerequisites - Make sure you have installed CMake version 3.1 (or newer) on your system. oneTBB uses CMake build configuration. - Configure and build oneTBB. To work with build configurations, see [Build System Description](cmake/README.md). ## Configure oneTBB At the command prompt, type: ``` cmake <options> <repo_root> ``` You may want to use some additional options for configuration: | Option | Purpose | Description | | ------ |------ | ------ | | `-G <generator>` | Specify project generator | For more information, run `cmake --help`. | |`-DCMAKE_BUILD_TYPE=Debug` | Specify a Debug build | Not applicable for multi-configuration generators such as Visual Studio generator. | ## Build oneTBB To build the system, run: ``` cmake --build . ``` Some useful build options: - `--target <target>` - specific target, "all" is default. - `--config <config>` - build configuration, applicable only for multi-config generators such as Visual Studio generator. ## Install and Pack oneTBB --- **NOTE** Be careful about the installation prefix. It defaults to `/usr/local` on UNIX* and `c:/Program Files/${PROJECT_NAME}` on Windows* OS. You can define a custom `CMAKE_INSTALL_PREFIX` during configuration: ``` cmake -DCMAKE_INSTALL_PREFIX=/my/install/prefix .. ``` --- Installation can also be done using: ``` cmake --install <build_dir> ``` Alternatively, the special ``install`` target can be used for installation, e.g. ``make install``. You can use the ``install`` components for partial installation. The following install components are supported: - `runtime` - oneTBB runtime package (core shared libraries and `.dll` files on Windows* OS). - `devel` - oneTBB development package (header files, CMake integration files, library symbolic links, and `.lib` files on Windows* OS).
- `tbb4py` - [oneTBB Module for Python](https://github.com/uxlfoundation/oneTBB/blob/master/python/README.md). If you want to install specific components after configuration and build, run: ```bash cmake -DCOMPONENT= [-DBUILD_TYPE=] -P cmake_install.cmake ``` Simple packaging using CPack is supported. The following commands allow you to create a simple portable package that includes header files, libraries, and integration files for CMake: ```bash cmake .. cpack ``` ## Installation from vcpkg You can download and install oneTBB using the [vcpkg](https://github.com/Microsoft/vcpkg) dependency manager: ```sh git clone https://github.com/Microsoft/vcpkg.git cd vcpkg ./bootstrap-vcpkg.sh #.\bootstrap-vcpkg.bat(for Windows) ./vcpkg integrate install ./vcpkg install tbb ``` The oneTBB port in vcpkg is kept up to date by Microsoft* team members and community contributors. If the version is out of date, create an issue or pull request on the [vcpkg repository](https://github.com/Microsoft/vcpkg). ## Example of Installation ### Single-configuration generators The following example demonstrates how to install oneTBB for single-configuration generators (e.g. GNU Make, Ninja, etc.). ```bash # Do our experiments in /tmp cd /tmp # Clone oneTBB repository git clone https://github.com/uxlfoundation/oneTBB.git cd oneTBB # Create binary directory for out-of-source build mkdir build && cd build # Configure: customize CMAKE_INSTALL_PREFIX and disable TBB_TEST to avoid tests build cmake -DCMAKE_INSTALL_PREFIX=/tmp/my_installed_onetbb -DTBB_TEST=OFF .. # Build cmake --build . # Install cmake --install . # Well done! Your installed oneTBB is in /tmp/my_installed_onetbb ``` ### Multi-configuration generators The following example demonstrates how to install oneTBB for multi-configuration generators such as Visual Studio*. 
Choose the configuration during the build and install steps: ```batch REM Do our experiments in %TMP% cd %TMP% REM Clone oneTBB repository git clone https://github.com/uxlfoundation/oneTBB.git cd oneTBB REM Create binary directory for out-of-source build mkdir build && cd build REM Configure: customize CMAKE_INSTALL_PREFIX and disable TBB_TEST to avoid tests build cmake -DCMAKE_INSTALL_PREFIX=%TMP%\my_installed_onetbb -DTBB_TEST=OFF .. REM Build "release with debug information" configuration cmake --build . --config relwithdebinfo REM Install "release with debug information" configuration cmake --install . --config relwithdebinfo REM Well done! Your installed oneTBB is in %TMP%\my_installed_onetbb ``` ================================================ FILE: third-party/tbb/LICENSE.txt ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. 
Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: third-party/tbb/MAINTAINERS.md ================================================ # Introduction This document defines roles in the oneTBB project. # Roles and Responsibilities oneTBB project defines three main roles: * [Contributor](#contributor) * [Code Owner](#code-Owner) * [Maintainer](#maintainer) [permissions]: https://docs.github.com/en/organizations/managing-user-access-to-your-organizations-repositories/managing-repository-roles/repository-roles-for-an-organization#permissions-for-each-role | | Contributor | Code Owner | Maintainer | | :------------------------------------------------------------------------------------------------------------------------------------------ | :---------------------: | :---------------------: | :---------------------: | | _Responsibilities_ | | | | | Follow the [Code of Conduct](./CODE_OF_CONDUCT.md) | ✓ | ✓ | ✓ | | Follow [Contribution Guidelines](./CONTRIBUTING.md) | ✓ | ✓ | ✓ | | Ensure [Contribution Guidelines](./CONTRIBUTING.md) are followed | ✗ | ✓ | ✓ | | Co-own component or aspect of the library,
including contributing: bug fixes, implementing features,
and performance optimizations | ✗ | ✓ | ✓ | | Co-own the technical direction of a component or
aspect of the library including work on RFCs | ✗ | ✓ | ✓ | | Co-own the project as a whole,
including determining strategy and policy for the project | ✗ | ✗ | ✓ | | _Privileges_ | | | | | Permission granted | [Read][permissions] | [Write][permissions] | [Maintain][permissions] | | Eligible to become | Code Owner | Maintainer | ✗ | | Can recommend Contributors
to become Code Owner | ✗ | ✓ | ✓ | | Can participate in promotions of
Code Owners and Maintainers | ✗ | ✗ | ✓ | | Can suggest Milestones during planning | ✓ | ✓ | ✓ | | Can choose Milestones for specific component | ✗ | ✓ | ✓ | | Make a decision on project's Milestones during planning | ✗ | ✗ | ✓ | | Can propose new RFC or
participate in review of existing RFC | ✓ | ✓ | ✓ | | Can request rework of RFCs
in represented area of responsibility | ✗ | ✓ | ✓ | | Can request rework of RFCs
in any part of the project | ✗ | ✗ | ✓ | | Can manage the release process of the project | ✗ | ✗ | ✓ | | Can represent the project in public as a Maintainer | ✗ | ✗ | ✓ | These roles are merit-based. Refer to the corresponding section for specific requirements and the nomination process. ## Contributor A Contributor invests time and resources to improve the oneTBB project. Anyone can become a Contributor by bringing value in any of the following ways: * Answer questions from community members. * Propose changes to the design. * Provide feedback on design proposals. * Review and/or test pull requests. * Test releases and report bugs. * Contribute code, including bug fixes, feature implementations, and performance optimizations. ## Code Owner A Code Owner has responsibility for a specific project component or a functional area. Code Owners are collectively responsible for developing and maintaining their component or functional areas, including reviewing all changes to corresponding areas of responsibility and indicating whether those changes are ready to be merged. Code Owners have a track record of contribution and review in the project. **Requirements:** * Track record of accepted code contributions to a specific project component. * Track record of contributions to the code review process. * Demonstrate in-depth knowledge of the architecture of a specific project component. * Commit to being responsible for that specific area. How to become a Code Owner? 1. A Contributor is nominated by opening a PR modifying the MAINTAINERS.md file including name, GitHub username, and affiliation. 2. At least two specific component Maintainers approve the PR. 3. The [CODEOWNERS](./CODEOWNERS) file is updated to reflect the corresponding areas of responsibility. ## Maintainer Maintainers are the most established contributors responsible for the project's technical direction. They participate in making decisions about the strategy and priorities of the project.
**Requirements:** * Have experience as a Code Owner. * Track record of major project contributions to a specific project component. * Demonstrate deep knowledge of a specific project component. * Demonstrate broad knowledge of the project across multiple areas. * Commit to using privileges responsibly for the good of the project. * Be able to exercise judgment for the good of the project, independent of their employer, friends, or team. Process of becoming a maintainer: 1. A Maintainer may nominate a current code owner to become a new Maintainer by opening a PR against MAINTAINERS.md file. 2. A majority of the current Maintainers must then approve the PR. # Code Owners and Maintainers List ## oneTBB core (API, Architecture, Tests) | Name | Github ID | Affiliation | Role | | --------------------- | --------------------- | ----------------- | ---------- | | Ilya Isaev | @isaevil | Intel Corporation | Code Owner | | Sarath Nandu R | @sarathnandu | Intel Corporation | Code Owner | | Dmitri Mokhov | @dnmokhov | Intel Corporation | Code Owner | | Alexey Kukanov | @akukanov | Intel Corporation | Code Owner | | Konstantin Boyarinov | @kboyarinov | Intel Corporation | Maintainer | | Aleksei Fedotov | @aleksei-fedotov | Intel Corporation | Maintainer | | Michael Voss | @vossmjp | Intel Corporation | Maintainer | | Pavel Kumbrasev | @pavelkumbrasev | Intel Corporation | Maintainer | ## oneTBB TBBMALLOC (API, Architecture, Tests) | Name | Github ID | Affiliation | Role | | --------------------- | --------------------- | ----------------- | ---------- | | Łukasz Plewa | @lplewa | Intel Corporation | Maintainer | ## oneTBB Documentation | Name | Github ID | Affiliation | Role | | ---------------------- | --------------------- | ----------------- | ---------- | | Alexandra Epanchinzeva | @aepanchi | Intel Corporation | Code Owner | ## oneTBB Release Management | Name | Github ID | Affiliation | Role | | ------------------ | --------------------- | ----------------- | ---------- | 
| Olga Malysheva | @omalyshe | Intel Corporation | Maintainer | ================================================ FILE: third-party/tbb/MODULE.bazel ================================================ # Copyright (c) 2021-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # DISCLAIMER: Bazel support is community-based. The maintainers do not # use Bazel internally. The Bazel build can have security risks or # optimization gaps. module( name = "onetbb", compatibility_level = 1, ) bazel_dep(name = "platforms", version = "0.0.10") ================================================ FILE: third-party/tbb/README.md ================================================ # oneAPI Threading Building Blocks (oneTBB) [![Apache License Version 2.0](https://img.shields.io/badge/license-Apache_2.0-green.svg)](LICENSE.txt) [![oneTBB CI](https://github.com/uxlfoundation/oneTBB/actions/workflows/ci.yml/badge.svg)](https://github.com/uxlfoundation/oneTBB/actions/workflows/ci.yml?query=branch%3Amaster) [![Join the community on GitHub Discussions](https://badgen.net/badge/join%20the%20discussion/on%20github/blue?icon=github)](https://github.com/uxlfoundation/oneTBB/discussions) [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/9125/badge)](https://www.bestpractices.dev/projects/9125) [![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/uxlfoundation/oneTBB/badge)](https://securityscorecards.dev/viewer/?uri=github.com/uxlfoundation/oneTBB) 
[![Gurubase](https://img.shields.io/badge/Gurubase-Ask%20oneTBB%20Guru-006BFF)](https://gurubase.io/g/onetbb) [![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/oneapi-src/oneTBB/badge)](https://securityscorecards.dev/viewer/?uri=github.com/oneapi-src/oneTBB) [![Coverity Scan Build Status](https://img.shields.io/coverity/scan/30373.svg)](https://scan.coverity.com/projects/oneapi-src-onetbb) oneTBB is a flexible C++ library that simplifies the work of adding parallelism to complex applications, even if you are not a threading expert. The library lets you easily write parallel programs that take full advantage of the multi-core performance. Such programs are portable, composable and have a future-proof scalability. oneTBB provides you with functions, interfaces, and classes to parallelize and scale the code. All you have to do is to use the templates. The library differs from typical threading packages in the following ways: * oneTBB enables you to specify logical parallelism instead of threads. * oneTBB targets threading for performance. * oneTBB is compatible with other threading packages. * oneTBB emphasizes scalable, data parallel programming. * oneTBB relies on generic programming. Refer to oneTBB [examples](examples) and [samples](https://github.com/oneapi-src/oneAPI-samples/tree/master/Libraries/oneTBB) to see how you can use the library. oneTBB is a part of the [UXL Foundation](http://www.uxlfoundation.org) and is an implementation of [oneAPI specification](https://oneapi.io). > **_NOTE:_** Threading Building Blocks (TBB) is now called oneAPI Threading Building Blocks (oneTBB) to highlight that the tool is a part of the oneAPI ecosystem. ## Release Information See [Release Notes](RELEASE_NOTES.md) and [System Requirements](SYSTEM_REQUIREMENTS.md). 
## Documentation * [oneTBB Specification](https://spec.oneapi.com/versions/latest/elements/oneTBB/source/nested-index.html) * [oneTBB Developer Guide and Reference](https://uxlfoundation.github.io/oneTBB) * [Migrating from TBB to oneTBB](https://uxlfoundation.github.io/oneTBB/main/tbb_userguide/Migration_Guide.html) * [README for the CMake build system](cmake/README.md) * [oneTBB Testing Approach](https://uxlfoundation.github.io/oneTBB/main/intro/testing_approach.html) * [Basic support for the Bazel build system](Bazel.md) * [oneTBB Discussions](https://github.com/uxlfoundation/oneTBB/discussions) * [WASM Support](WASM_Support.md) ## Installation See [Installation from Sources](INSTALL.md) to learn how to install oneTBB. ## Governance The oneTBB project is governed by the UXL Foundation. You can get involved in this project in the following ways: * Join the [Open Source and Specification Working Group](https://github.com/uxlfoundation/foundation/tree/main?tab=readme-ov-file#working-groups) meetings. * Join the mailing lists for the [UXL Foundation](https://lists.uxlfoundation.org/g/main/subgroups) to receive meeting schedules and the latest updates. * Contribute to the oneTBB project or the oneTBB specification. Read [CONTRIBUTING](./CONTRIBUTING.md) for more information. ## Support See our [documentation](./SUPPORT.md) to learn how to request help. ## How to Contribute We welcome community contributions, so check our [Contributing Guidelines](CONTRIBUTING.md) to learn more. Use GitHub Issues for feature requests, bug reports, and minor inquiries. For broader questions and development-related discussions, use GitHub Discussions. ## License oneAPI Threading Building Blocks is licensed under the [Apache License, Version 2.0](LICENSE.txt). By its terms, contributions submitted to the project are also made under that license. ------------------------------------------------------------------------ \* All names and brands may be claimed as the property of others.
================================================ FILE: third-party/tbb/RELEASE_NOTES.md ================================================ # Release Notes This document describes the changes in oneTBB since the previous release. ## Table of Contents - [Preview Features](#preview-features) - [Known Limitations](#known-limitations) - [Issues Fixed](#issues-fixed) - [Open-Source Contributions Integrated](#open-source-contributions-integrated) ## :tada: Preview Features - Extended the Flow Graph receiving nodes with a new ``try_put_and_wait`` API that submits a message to the graph and waits for its completion. ## :rotating_light: Known Limitations - The ``oneapi::tbb::info`` namespace interfaces might unexpectedly change the process affinity mask on Windows* OS systems (see https://github.com/open-mpi/hwloc/issues/366 for details) when using a hwloc version lower than 2.5. - Using a hwloc version other than 1.11, 2.0, or 2.5 may cause undefined behavior on Windows OS. See https://github.com/open-mpi/hwloc/issues/477 for details. - The NUMA topology may be detected incorrectly on Windows* OS machines where the number of NUMA node threads exceeds the size of one processor group. - On Windows OS on ARM64*, when compiling an application using oneTBB with the Microsoft* Compiler, the compiler issues a warning C4324 that a structure was padded due to the alignment specifier. Consider suppressing the warning by adding /wd4324 to the compiler command line. - The C++ exception handling mechanism on Windows* OS on ARM64* might corrupt memory if an exception is thrown from any oneTBB parallel algorithm (see Windows* OS on ARM64* compiler issue: https://developercommunity.visualstudio.com/t/ARM64-incorrect-stack-unwinding-for-alig/1544293). - When CPU resource coordination is enabled, tasks from a lower-priority ``task_arena`` might be executed before tasks from a higher-priority ``task_arena``. - Using oneTBB on WASM* may cause applications to run in a single thread.
See [Limitations of WASM Support](https://github.com/uxlfoundation/oneTBB/blob/master/WASM_Support.md#limitations). > **_NOTE:_** To see known limitations that impact all versions of oneTBB, refer to [oneTBB Documentation](https://uxlfoundation.github.io/oneTBB/main/intro/limitations.html). ## :hammer: Issues Fixed - Fixed the missed signal for thread request for enqueue operation. - Significantly improved scalability of ``task_group``, ``flow_graph``, and ``parallel_for_each``. - Removed usage of ``std::aligned_storage`` deprecated in C++23 (Inspired by Valery Matskevich https://github.com/uxlfoundation/oneTBB/pull/1394). - Fixed the issue where ``oneapi::tbb::info`` interfaces might interfere with the process affinity mask on the Windows* OS systems with multiple processor groups. ## :octocat: Open-Source Contributions Integrated - Detect the GNU Binutils version to determine WAITPKG support better. Contributed by Martijn Courteaux (https://github.com/uxlfoundation/oneTBB/pull/1347). - Fixed the build on non-English locales. Contributed by Vladislav Shchapov (https://github.com/uxlfoundation/oneTBB/pull/1450). - Improved Bazel support. Contributed by Julian Amann (https://github.com/uxlfoundation/oneTBB/pull/1434). ================================================ FILE: third-party/tbb/SECURITY.md ================================================ # Security Policy As an open-source project, we understand the importance of and responsibility for security. This Security Policy outlines our guidelines and procedures to ensure the highest level of security and trust for oneTBB users. ## Supported Versions Security vulnerabilities are fixed in the [latest version][1] and delivered as a patch release. We don't guarantee security fixes to be back-ported to older oneTBB versions. ## Report a Vulnerability We are very grateful to the security researchers and users that report back security vulnerabilities. We investigate every report thoroughly. 
We strongly encourage you to report security vulnerabilities to us privately, before disclosing them on public forums or opening a public GitHub* issue. Report a vulnerability to us in one of two ways: * Open a draft **[GitHub* Security Advisory][2]** * Send an e-mail to: **security@uxlfoundation.org**. Along with the report, provide the following info: * A descriptive title. * Your name and affiliation (if any). * A description of the technical details of the vulnerabilities. * A minimal example of the vulnerability so we can reproduce your findings. * An explanation of who can exploit this vulnerability, and what they gain doing so. * Whether this vulnerability is public or known to third parties. If it is, provide details. ### When Should I Report a Vulnerability? * You think you discovered a potential security vulnerability in oneTBB. * You are unsure how the potential vulnerability affects oneTBB. * You think you discovered a vulnerability in another project or 3rd party component on which oneTBB depends. If the issue is not fixed in the 3rd party component, try to report directly there first. ### When Should I NOT Report a Vulnerability? * You got an automated scan hit and are unable to provide details. * You need help using oneTBB for security. * You need help applying security-related updates. * Your issue is not security-related. ## Security Reports Review Process We aim to respond quickly to your inquiry and coordinate a fix and disclosure with you. All confirmed security vulnerabilities will be addressed according to severity level and impact on oneTBB. Normally, security issues are fixed in the next planned release. ## Disclosure Policy We will publish security advisories using the [**GitHub Security Advisories feature**][3] to keep our community well-informed, and will credit you for your findings unless you prefer to stay anonymous. We request that you refrain from exploiting the vulnerability or making it public before the official disclosure. 
We will disclose the vulnerabilities and bugs as soon as possible once mitigation is implemented and available. ## Feedback on This Policy If you have any suggestions on how this Policy could be improved, submit an issue or a pull request to this repository. **Do not** report potential vulnerabilities or security flaws via a pull request. [1]: https://github.com/uxlfoundation/oneTBB/releases/latest [2]: https://github.com/uxlfoundation/oneTBB/security/advisories/new [3]: https://github.com/uxlfoundation/oneTBB/security/advisories ================================================ FILE: third-party/tbb/SUPPORT.md ================================================ # oneTBB Support We are committed to providing support and assistance to help you make the most out of oneTBB. Use the following methods if you face any challenges. ## Issues If you have a problem, check out the [GitHub Issues](https://github.com/uxlfoundation/oneTBB/issues) to see if the issue you want to address is already reported. You may find users that have encountered the same bug or have similar ideas for changes or updates. You can use issues to report a problem, make a feature request, or add comments on an existing issue. ## Discussions Visit the [GitHub Discussions](https://github.com/uxlfoundation/oneTBB/discussions) to engage with the community, ask questions, or help others. ## Email Reach out to us privately via [email](mailto:inteltbbdevelopers@intel.com). ================================================ FILE: third-party/tbb/SYSTEM_REQUIREMENTS.md ================================================ # System Requirements This document provides details about hardware, operating system, and software prerequisites for the oneAPI Threading Building Blocks (oneTBB). 
## Table of Contents - [Supported Hardware](#supported-hardware) - [Software](#software) - [Supported Operating Systems](#supported-operating-systems) - [Community-Supported Platforms](#community-supported-platforms) - [Supported Compilers](#supported-compilers) - [Limitations](#limitations) ## Supported Hardware - Intel(R) Celeron(R) processor family - Intel(R) Core* processor family - Intel(R) Xeon(R) processor family - Intel(R) Atom* processor family - Non-Intel(R) processors compatible with the processors listed above ## Software ### Supported Operating Systems - Systems with Microsoft* Windows* operating systems: - Microsoft* Windows* 10 - Microsoft* Windows* 11 - Microsoft* Windows* Server 2019 - Microsoft* Windows* Server 2022 - Systems with Linux* operating systems: - Oracle Linux* 8 - Amazon* Linux 2, 2022 - Debian* 9, 10, 11 - Fedora* 36, 37, 38 - Rocky* Linux* 8, 9 - Red Hat* Enterprise Linux* 8, 9 - SuSE* Linux* Enterprise Server 15 - Ubuntu* 20.04, 22.04 - Systems with macOS* operating systems: - macOS* 12.x, 13.x - Systems with Android* operating systems: - Android* 9 ### Community-Supported Platforms - MinGW* - FreeBSD* - Microsoft* Windows* on ARM*/ARM64* - macOS* on ARM64* ### Supported Compilers - Intel* oneAPI DPC++/C++ Compiler - Intel® C++ Compiler Classic 2021.1 - 2021.9 - Microsoft* Visual C++ 14.2 (Microsoft* Visual Studio* 2019, Windows* OS only) - Microsoft* Visual C++ 14.3 (Microsoft* Visual Studio* 2022, Windows* OS only) - For each supported Linux* operating system, the standard gcc version provided with that operating system is supported: - GNU Compilers (gcc) 8.x – 12.x - GNU C Library (glibc) version 2.28 – 2.36 - Clang* 6.0.0 - 13.0.0 ## Limitations There are some cases where we cannot provide support for your platforms. It includes: 1. The platform is out of official support (met end of life). When you use an unsupported platform, you can face a security risk that can be difficult to resolve. 2. 
We do not have the infrastructure to test a platform. Therefore, we cannot guarantee that oneTBB works correctly on that platform. 3. Changes affect more code than just platform-specific macros. 4. The platform is incompatible with oneTBB. Some platforms may have limitations that prevent oneTBB from working correctly. We cannot provide support in these cases as the issue is beyond our control. 5. The platform is modified or customized. If you made significant updates to your platform, it might be hard for us to find the root cause of the issue. Therefore, we may not be able to provide support as the modification could affect the oneTBB functionality. We understand that these limitations can be frustrating. Thus, we suggest creating a branch specifically for the unsupported platform, allowing other users to contribute to or use your implementation. ================================================ FILE: third-party/tbb/WASM_Support.md ================================================ # WASM Support oneTBB extends its capabilities by offering robust support for ``WASM`` (see the ``Limitations`` section). ``WASM`` stands for WebAssembly, a low-level binary format for executing code in web browsers. It is designed to be a portable target for compilers and efficient to parse and execute. Using oneTBB with WASM, you can take full advantage of parallelism and concurrency while working on web-based applications, interactive websites, and a variety of other WASM-compatible platforms. oneTBB offers WASM support through the integration with [Emscripten*](https://emscripten.org/docs/introducing_emscripten/index.html), a powerful toolchain for compiling C and C++ code into WASM-compatible runtimes. ## Build **Prerequisites:** Download and install Emscripten*. See the [instructions](https://emscripten.org/docs/getting_started/downloads.html). To build the system, run: ``` mkdir build && cd build emcmake cmake ..
-DCMAKE_CXX_COMPILER=em++ -DCMAKE_C_COMPILER=emcc -DTBB_STRICT=OFF -DCMAKE_CXX_FLAGS=-Wno-unused-command-line-argument -DTBB_DISABLE_HWLOC_AUTOMATIC_SEARCH=ON -DBUILD_SHARED_LIBS=OFF -DTBB_EXAMPLES=ON -DTBB_TEST=ON ``` To compile oneTBB without ``pthreads``, set the flag ``-DEMSCRIPTEN_WITHOUT_PTHREAD=true`` in the command above. By default, oneTBB uses ``pthreads``. ``` cmake --build . cmake --install . ``` Where: * ``emcmake`` - a tool that sets up the environment for Emscripten*. * ``-DCMAKE_CXX_COMPILER=em++`` - specifies the C++ compiler as the Emscripten* C++ compiler. * ``-DCMAKE_C_COMPILER=emcc`` - specifies the C compiler as the Emscripten* C compiler. > **_NOTE:_** See [CMake documentation](https://github.com/uxlfoundation/oneTBB/blob/master/cmake/README.md) to learn about other options. ## Run Test To run tests, use: ``` ctest ``` # Limitations You can successfully build your application with oneTBB using WASM, but you may not achieve optimal performance immediately. This is due to the limitation with nested Web Workers: a Web Worker cannot schedule another worker without help from a browser thread. This can lead to unexpected performance outcomes, such as the application running in serial. Find more information in the [issue](https://github.com/emscripten-core/emscripten/discussions/21963) in the Emscripten repository. To work around this issue, try one of the following ways: 1. **Recommended Solution: Use the ``-sPROXY_TO_PTHREAD`` Flag**. This flag splits the initial thread into a browser thread and a main thread (proxied by a Web Worker), effectively resolving the issue as the browser thread is always present in the event loop and can participate in Web Workers scheduling. Refer to the [Emscripten documentation](https://emscripten.org/docs/porting/pthreads.html) for more details about ``-sPROXY_TO_PTHREAD`` since using this flag may require refactoring the code. 2.
**Alternative Solution: Warm Up the oneTBB Thread Pool** Initialize the oneTBB thread pool before making the first call to oneTBB. This approach forces the browser thread to participate in Web Workers scheduling. ```cpp int num_threads = tbb::this_task_arena::max_concurrency(); std::atomic<int> barrier{num_threads}; tbb::parallel_for(0, num_threads, [&barrier] (int) { barrier--; while (barrier > 0) { // Send browser thread to event loop std::this_thread::yield(); } }, tbb::static_partitioner{}); ``` > **_NOTE:_** Be aware that it might cause delays on the browser side. ================================================ FILE: third-party/tbb/WORKSPACE.bazel ================================================ # Copyright (c) 2021-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # DISCLAIMER: Bazel support is community-based. The maintainers do not # use Bazel internally. The Bazel build can have security risks or # optimization gaps. # WORKSPACE marker file needed by Bazel ================================================ FILE: third-party/tbb/cmake/README.md ================================================ # Build System Description The project uses CMake* build configuration.
The following controls are available during the configure stage: ``` TBB_TEST:BOOL - Enable testing (ON by default) TBB_STRICT:BOOL - Treat compiler warnings as errors (ON by default) TBB_SANITIZE:STRING - Sanitizer parameter, passed to compiler/linker TBB_SIGNTOOL:FILEPATH - Tool for digital signing, used in post-install step for libraries if provided. TBB_SIGNTOOL_ARGS:STRING - Additional arguments for TBB_SIGNTOOL, used if TBB_SIGNTOOL is set. TBB_BUILD:BOOL - Enable Intel(R) oneAPI Threading Building Blocks (oneTBB) build (ON by default) TBB_FIND_PACKAGE - Enable search for external oneTBB using find_package instead of build from sources (OFF by default) TBBMALLOC_BUILD:BOOL - Enable Intel(R) oneAPI Threading Building Blocks (oneTBB) memory allocator build (ON by default) TBBMALLOC_PROXY_BUILD:BOOL - Enable Intel(R) oneAPI Threading Building Blocks (oneTBB) memory allocator proxy build (requires TBBMALLOC_BUILD. ON by default) TBB4PY_BUILD:BOOL - Enable Intel(R) oneAPI Threading Building Blocks (oneTBB) Python module build (OFF by default) TBB_CPF:BOOL - Enable preview features of the library (OFF by default) TBB_INSTALL:BOOL - Enable installation (ON by default) TBB_INSTALL_VARS:BOOL - Enable auto-generated vars installation(packages generated by `cpack` and `make install` will also include the vars script)(OFF by default) TBB_VALGRIND_MEMCHECK:BOOL - Enable scan for memory leaks using Valgrind (OFF by default) TBB_DISABLE_HWLOC_AUTOMATIC_SEARCH - Disable HWLOC automatic search by pkg-config tool (OFF by default) TBB_ENABLE_IPO - Enable Interprocedural Optimization (IPO) during the compilation (ON by default) TBB_CONTROL_FLOW_GUARD:BOOL - Enable Control Flow Guard (CFG) during the compilation (OFF by default) TBB_BUILD_APPLE_FRAMEWORKS - Enable the Apple* frameworks instead of dylibs, only available on the Apple platform. 
(OFF by default) TBB_FILE_TRIM - Enable __FILE__ trim, replace a build-time full path with a relative path in the debug info and macro __FILE__; use it to make reproducible location-independent builds (ON by default) ``` ## Configure, Build, and Test ### Preparation To perform an out-of-source build, create a build directory and go there: ```bash mkdir /tmp/my-build cd /tmp/my-build ``` ### Configure ```bash cmake <options> <repo_root> ``` Some useful options: - `-G <generator>` - specify a particular project generator. See `cmake --help` for details. - `-DCMAKE_BUILD_TYPE=Debug` - specify for Debug build. It is not applicable for multi-config generators, e.g., Microsoft* Visual Studio* generator. #### TBBBind Library Configuration > **_TIP:_** It is recommended to install the HWLOC* library. See [oneTBB documentation](https://uxlfoundation.github.io/oneTBB/GSG/next_steps.html#hybrid-cpu-and-numa-support) for details. The TBBBind library has three versions: `tbbbind`, `tbbbind_2_0`, and `tbbbind_2_5`. Each of these versions is linked with the corresponding HWLOC* library version: - `tbbbind` links with `HWLOC 1.11.x` - `tbbbind_2_0` links with `HWLOC 2.1–2.4` - `tbbbind_2_5` links with `HWLOC 2.5` and later The search for a suitable version of the HWLOC library is enabled by default. If you want to use a specific version of the library, you can specify the path to it manually using the following CMake variables: - `CMAKE_HWLOC_<HWLOC_VER>_LIBRARY_PATH` - path to the corresponding HWLOC version shared library on Linux* OS or path to `.lib` file on Windows* OS - `CMAKE_HWLOC_<HWLOC_VER>_INCLUDE_PATH` - path to the corresponding HWLOC version include directory --- **NOTE:** Automatic HWLOC searching requires CMake version 3.6 or higher. --- Windows* OS requires an additional variable for correct TBBBind library building: - `CMAKE_HWLOC_<HWLOC_VER>_DLL_PATH` - path to the corresponding HWLOC version `.dll` file.
The `HWLOC_VER` substring used earlier can be replaced with one of the three values: - `1_11` for the `tbbbind` library configuration - `2` for the `tbbbind_2_0` library configuration - `2_5` for the `tbbbind_2_5` library configuration If you specify variables for several TBBBind versions, the building process for all of these versions is performed during a single build session. --- **TIP** Specify the `TBB_DISABLE_HWLOC_AUTOMATIC_SEARCH` to turn off the HWLOC library's automatic search. --- ### Build ```bash cmake --build . ``` Some useful options: - `--target <target>` - specific target, "all" is the default. - `--config <Release|Debug>` - build configuration, applicable only for multi-config generators, e.g., Visual Studio* generator. The binaries are placed in `./<compiler-id>_<compiler-ver>_cxx<stdver>_<build-type>`. For example, `./gnu_4.8_cxx11_release`. #### Build For 32-bit * **Intel(R) Compiler**. Source Intel(R) C++ Compiler with `ia32` and build as usual. * **MSVC**. Use switch for [generator](https://cmake.org/cmake/help/latest/manual/cmake-generators.7.html) (e.g., `-A Win32` for [VS2019](https://cmake.org/cmake/help/latest/generator/Visual%20Studio%2016%202019.html)) during the configuration stage and then build as usual. * **GCC/Clang**. Specify `-m32` during the configuration. It can be `CXXFLAGS=-m32 cmake ..` or `cmake -DCMAKE_CXX_FLAGS=-m32 ..` * For any other compiler, which builds for 64-bit by default, specify a 32-bit compiler key during the configuration as above. #### Windows* OS-Specific Builds --- **NOTE** The following builds require CMake version 3.15 or higher. --- * **Dynamic linkage with C Runtime Library (CRT)**. The default behavior can be explicitly specified by setting `CMAKE_MSVC_RUNTIME_LIBRARY` to `MultiThreadedDLL` or `MultiThreadedDebugDLL`. ```bash cmake .. # dynamic linkage is used by default ``` ```bash cmake -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreadedDLL .. ``` ```bash cmake -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreadedDebugDLL -DCMAKE_BUILD_TYPE=Debug .. ``` * **Static linkage with CRT**.
Set `CMAKE_MSVC_RUNTIME_LIBRARY` to `MultiThreaded` or `MultiThreadedDebug`. ```bash cmake -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded .. ``` ```bash cmake -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreadedDebug -DCMAKE_BUILD_TYPE=Debug .. ``` * **Windows OS 10 Universal Windows application build**. Set `CMAKE_SYSTEM_NAME` to `WindowsStore` and `CMAKE_SYSTEM_VERSION` to `10.0`. --- **NOTE** Set `TBB_NO_APPCONTAINER` to `ON` to apply the `/APPCONTAINER:NO` option during the compilation (used for testing). --- ```bash cmake -DCMAKE_SYSTEM_NAME:STRING=WindowsStore -DCMAKE_SYSTEM_VERSION:STRING=10.0 .. ``` * **Universal Windows OS Driver build**. Set `TBB_WINDOWS_DRIVER` to `ON` and use static linkage with CRT. ```bash cmake -DTBB_WINDOWS_DRIVER=ON -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded .. ``` #### Example ```bash cmake -DCMAKE_CXX_COMPILER=icpc -DCMAKE_C_COMPILER=icc -DTBB_TEST=off -DCMAKE_HWLOC_1_11_LIBRARY_PATH=<path>/libhwloc.so.15 -DCMAKE_HWLOC_1_11_INCLUDE_PATH=<path_to_hwloc_headers> -DCMAKE_INSTALL_PREFIX=<path>/oneTBB_install .. make -j8 && make install ``` --- **NOTE** The library path points to a file, while the include path points to a directory and not to ``hwloc.h``. --- ### Test #### Build test To build a test, use the default target ``all``: ``` cmake --build . ``` Or use a specific test target: ``` cmake --build . --target <test_name> # e.g. test_version ``` #### Run Test You can run a test by using CTest: ```bash ctest ``` Or by using the ``test`` target: ```bash cmake --build . --target test # currently does not work on Windows* OS ``` ## Installation See [Installation from Sources](../INSTALL.md) to learn how to install oneTBB. To install oneTBB from the release packages, use the following commands: ```bash tar -xvf oneapi-tbb-xxx.xx.x-*.tgz source env/vars.sh ``` ## Sanitizers - Configure, Build, and Run ```bash mkdir build cd build cmake -DTBB_SANITIZE=thread ..
# or -DTBB_SANITIZE=memory or any other sanitizer make -j ctest -V ``` ## Valgrind Memcheck - Configure, Build, and Run ### Prerequisites * Valgrind tool executable ### Example ```bash mkdir build cd build cmake -DTBB_VALGRIND_MEMCHECK=ON .. make -j memcheck-<test_name> # or memcheck-all to scan all tests ``` ## Test Specification Use Doxygen* to generate oneTBB test specification: ```bash mkdir build cd build cmake -DTBB_TEST_SPEC=ON .. make test_spec ``` ## TBBConfig - Integration of Binary Packages TBBConfig is a configuration module that is used for the integration of prebuilt oneTBB. It consists of two files (``TBBConfig.cmake`` and ``TBBConfigVersion.cmake``) and can be used via the [find_package](https://cmake.org/cmake/help/latest/command/find_package.html) function. To use this module in your CMake project: 1. Let CMake know where to search for TBBConfig, e.g. specify the location of ``TBBConfig.cmake`` in `TBB_DIR` (for more details about search paths, see [find_package](https://cmake.org/cmake/help/latest/command/find_package.html)). 2. Use [find_package](https://cmake.org/cmake/help/latest/command/find_package.html) to find oneTBB. 3. Use provided variables and/or imported targets (described below) to work with the found oneTBB. Example: ```cmake add_executable(foo foo.cpp) find_package(TBB) target_link_libraries(foo TBB::tbb) ``` oneTBB components can be passed to [find_package](https://cmake.org/cmake/help/latest/command/find_package.html) after keyword ``COMPONENTS`` or ``REQUIRED``. Use basic names of components (`tbb`, `tbbmalloc`, etc.). If components are not specified, then the default set is used: `tbb`, `tbbmalloc`, and ``tbbmalloc_proxy``. If `tbbmalloc_proxy` is requested, the `tbbmalloc` component is also added and set as a dependency for `tbbmalloc_proxy`. TBBConfig creates [imported targets](https://cmake.org/cmake/help/latest/manual/cmake-buildsystem.7.html#imported-targets) as shared libraries using the following format: `TBB::<component>`.
For example, `TBB::tbb` or `TBB::tbbmalloc`. To search only for release oneTBB version, set `TBB_FIND_RELEASE_ONLY` to `TRUE` before calling `find_package`. This variable helps to avoid simultaneous linkage of release and debug oneTBB versions when CMake configuration is `Debug`, but a third-party component depends on the release oneTBB version. Variables set during TBB configuration: Variable | Description --- | --- `TBB_FOUND` | oneTBB is found `TBB_<component>_FOUND` | Specific oneTBB component is found `TBB_VERSION` | oneTBB version (format: `<major>.<minor>.<patch>.<tweak>`) `TBB_IMPORTED_TARGETS` | All created oneTBB imported targets (not supported for builds from source code) Starting from [oneTBB 2021.1](https://github.com/uxlfoundation/oneTBB/releases/tag/v2021.1), GitHub* release TBBConfig files in the binary packages are located under `<tbb-root>/lib/cmake/TBB`. For example, `TBB_DIR` should be set to `<tbb-root>/lib/cmake/TBB`. TBBConfig files are automatically created during the build from source code and can be installed together with the library. Also, oneTBB provides a helper function that creates TBBConfig files from predefined templates. See `tbb_generate_config` in `cmake/config_generation.cmake`. ## oneTBB Python Module Support The `TBB4PY_BUILD` CMake option provides the ability to build a Python module for oneTBB. ### Targets: - `irml` - build IPC RML server - `python_build` - build oneTBB module for Python `python_build` target requirements: - Python version 3.5 or newer - SWIG version 3.0.6 or newer ## CMake Files ### Compile and Link Options Compile and link options may be specific for certain compilers. This part is handled in `cmake/compilers/*` files. Options in TBB CMake are handled via variables in two ways for convenience: * by options group * by the specific option #### Options Group Naming convention is the following: `TBB_<SCOPE>_<STAGE>_<CATEGORY>`, where: * `<SCOPE>` can be: * `LIB` - options applied during libraries build. * `TEST` - options applied during test build.
* `BENCH` - options applied during benchmarks build. * `COMMON` - options applied during all (libraries, test, benchmarks) builds. * `<STAGE>` can be: * `COMPILE` - options applied during the compilation. * `LINK` - options applied during the linkage. * `<CATEGORY>` can be: * `FLAGS` - list of flags * `LIBS` - list of libraries *Examples* Variable | Description --- | --- `TBB_COMMON_COMPILE_FLAGS` | Applied to libraries, tests, and benchmarks as compile options `TBB_LIB_LINK_FLAGS` | Applied to libraries as link options `TBB_LIB_LINK_LIBS` | Applied to libraries as link libraries `TBB_TEST_COMPILE_FLAGS` | Applied to tests as compile options Specify the `LINK` options prefixed with a dash (-) for the MSVC (Visual Studio) compiler with CMake < 3.13 to avoid issues caused by `target_link_libraries` CMake command usage. #### Specific Options If the option is used only in part of the places (library, tests, benchmarks) and adding this option to the group of other options is not possible, then the option must be named using common sense. Warning suppressions should be added to the `TBB_WARNING_SUPPRESS` variable, which is applied during the compilation of libraries, tests, and benchmarks. Additional warnings should be added to the `TBB_TEST_WARNING_FLAGS` variable, which is applied during the compilation of tests. ================================================ FILE: third-party/tbb/cmake/android/device_environment_cleanup.cmake ================================================ # Copyright (c) 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Cleanup script: load the adb helpers, then delete the testing directory
# from the connected device.
include(${CMAKE_CURRENT_LIST_DIR}/environment.cmake)

execute_on_device("rm -rf ${ANDROID_DEVICE_TESTING_DIRECTORY}")

================================================
FILE: third-party/tbb/cmake/android/environment.cmake
================================================
# Copyright (c) 2020-2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Shared helpers for running oneTBB tests on an Android device through adb.

# Directory on the device that transfer_data() pushes files into.
set(ANDROID_DEVICE_TESTING_DIRECTORY "/data/local/tmp/tbb_testing")

# Both macros below shell out to adb, so abort right away when it is missing.
find_program(adb_executable adb)
if (NOT adb_executable)
    message(FATAL_ERROR "Could not find adb")
endif()

# execute_on_device(cmd)
#   Runs `cmd` on the device via `adb shell`.
#   Stops with FATAL_ERROR when execute_process() reports a non-zero result
#   (or an error string) in CMD_RESULT.
#   This is a macro, so CMD_RESULT is written into the caller's variable scope.
macro(execute_on_device cmd)
    execute_process(COMMAND ${adb_executable} shell ${cmd} RESULT_VARIABLE CMD_RESULT)
    if (CMD_RESULT)
        message(FATAL_ERROR "Error while on device execution: ${cmd} error_code: ${CMD_RESULT}")
    endif()
endmacro()

# transfer_data(data_path)
#   Copies `data_path` to ANDROID_DEVICE_TESTING_DIRECTORY with `adb push --sync`.
#   adb's standard output is discarded (OUTPUT_QUIET); failures are reported the
#   same way as in execute_on_device(), again through the caller-visible CMD_RESULT.
macro(transfer_data data_path)
    execute_process(COMMAND ${adb_executable} push --sync ${data_path} ${ANDROID_DEVICE_TESTING_DIRECTORY}
                    RESULT_VARIABLE CMD_RESULT OUTPUT_QUIET)
    if (CMD_RESULT)
        message(FATAL_ERROR "Error while data transferring: ${data_path} error_code: ${CMD_RESULT}")
    endif()
endmacro()

================================================
FILE: third-party/tbb/cmake/android/test_launcher.cmake
================================================
# Copyright (c) 2020-2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the
License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Pushes one test and the shared libraries next to it to the device, then runs it.
# BINARIES_PATH and TEST_NAME are read but never set in this script;
# presumably the caller passes them in (e.g. with -D) -- TODO confirm.
include(${CMAKE_CURRENT_LIST_DIR}/environment.cmake)

# transfer data to device
execute_on_device("mkdir -m 755 -p ${ANDROID_DEVICE_TESTING_DIRECTORY}")
# Collect every file matching *.so* under BINARIES_PATH plus the test binary itself.
file (GLOB_RECURSE BINARIES_LIST "${BINARIES_PATH}/*.so*" "${BINARIES_PATH}/${TEST_NAME}")
foreach(BINARY_FILE ${BINARIES_LIST})
    transfer_data(${BINARY_FILE})
endforeach()

# execute binary
# Make the pushed files executable, then start the test with LD_LIBRARY_PATH
# pointing at the testing directory. execute_on_device() raises FATAL_ERROR
# if the command reports a failure.
execute_on_device("chmod -R 755 ${ANDROID_DEVICE_TESTING_DIRECTORY}")
execute_on_device("LD_LIBRARY_PATH=${ANDROID_DEVICE_TESTING_DIRECTORY} ${ANDROID_DEVICE_TESTING_DIRECTORY}/${TEST_NAME}")

================================================
FILE: third-party/tbb/cmake/compilers/AppleClang.cmake
================================================
# Copyright (c) 2020-2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Compiler settings for AppleClang: symbol-export flags, warning sets, and
# common compile flags. Every variable set here is consumed outside this file.

set(TBB_LINK_DEF_FILE_FLAG -Wl,-exported_symbols_list,)
set(TBB_DEF_FILE_PREFIX mac${TBB_ARCH})
# NOTE(review): `$<$:-Werror>` has no condition after `$<$`, and the same holds
# for `$<$>:-mwaitpkg>` further down. Both generator expressions look truncated
# (a nested `<...>` part is missing) -- confirm against the upstream oneTBB file.
set(TBB_WARNING_LEVEL -Wall -Wextra $<$:-Werror>)
set(TBB_TEST_WARNING_FLAGS -Wshadow -Wcast-qual -Woverloaded-virtual -Wnon-virtual-dtor)
set(TBB_WARNING_SUPPRESS -Wno-parentheses -Wno-non-virtual-dtor -Wno-dangling-else)

# For correct ucontext.h structures layout
# (This assignment replaces any earlier value of TBB_COMMON_COMPILE_FLAGS
# instead of appending to it.)
set(TBB_COMMON_COMPILE_FLAGS -D_XOPEN_SOURCE)

# Depfile options (e.g. -MD) are inserted automatically in some cases.
# Don't add -MMD to avoid conflicts in such cases.
if (NOT CMAKE_GENERATOR MATCHES "Ninja" AND NOT CMAKE_CXX_DEPENDS_USE_COMPILER)
    set(TBB_MMD_FLAG -MMD)
endif()

# Ignore -Werror set through add_compile_options() or added to CMAKE_CXX_FLAGS if TBB_STRICT is disabled.
# The `COMMAND` test skips the call when tbb_remove_compile_flag is not defined.
if (NOT TBB_STRICT AND COMMAND tbb_remove_compile_flag)
    tbb_remove_compile_flag(-Werror)
endif()

# Enable Intel(R) Transactional Synchronization Extensions (-mrtm) and WAITPKG instructions support (-mwaitpkg) on relevant processors
# Prefer the explicit CMAKE_OSX_ARCHITECTURES value and fall back to
# CMAKE_SYSTEM_PROCESSOR when it is empty.
if (CMAKE_OSX_ARCHITECTURES)
    set(_tbb_target_architectures "${CMAKE_OSX_ARCHITECTURES}")
else()
    set(_tbb_target_architectures "${CMAKE_SYSTEM_PROCESSOR}")
endif()
if ("${_tbb_target_architectures}" MATCHES "(x86_64|amd64|AMD64)") # OSX systems are 64-bit only
    set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -mrtm $<$>:-mwaitpkg>)
endif()
# Temporary helper variable; do not leave it behind for the including scope.
unset(_tbb_target_architectures)

# With TBB_FILE_TRIM on and compiler version >= 10, map the two path prefixes
# below to an empty string via -ffile-prefix-map.
if (TBB_FILE_TRIM AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 10)
    set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -ffile-prefix-map=${NATIVE_TBB_PROJECT_ROOT_DIR}/= -ffile-prefix-map=${NATIVE_TBB_RELATIVE_BIN_PATH}/=)
endif ()

# TBB malloc settings
set(TBBMALLOC_LIB_COMPILE_FLAGS -fno-rtti -fno-exceptions)

================================================
FILE: third-party/tbb/cmake/compilers/Clang.cmake
================================================
# Copyright (c) 2020-2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with
the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Compiler settings for Clang: Emscripten specifics, per-platform symbol-export
# flags, warning sets, hardening flags, and IPO flags.

# Emscripten (WebAssembly) build settings.
if (EMSCRIPTEN)
    set(TBB_EMSCRIPTEN 1)
    set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -fexceptions)
    # The test link flags are seeded from TBB_COMMON_LINK_FLAGS (not from an
    # earlier TBB_TEST_LINK_FLAGS value).
    set(TBB_TEST_LINK_FLAGS ${TBB_COMMON_LINK_FLAGS} -fexceptions -sINITIAL_MEMORY=65536000 -sALLOW_MEMORY_GROWTH=1 -sMALLOC=mimalloc -sEXIT_RUNTIME=1)
    if (NOT EMSCRIPTEN_WITHOUT_PTHREAD)
        set_property(TARGET Threads::Threads PROPERTY INTERFACE_LINK_LIBRARIES "-pthread")
    endif()
    # The same stack size is handed to the library as a preprocessor define
    # and to the test link step as -sTOTAL_STACK.
    set(TBB_EMSCRIPTEN_STACK_SIZE 65536)
    set(TBB_LIB_COMPILE_FLAGS -D__TBB_EMSCRIPTEN_STACK_SIZE=${TBB_EMSCRIPTEN_STACK_SIZE})
    set(TBB_TEST_LINK_FLAGS ${TBB_TEST_LINK_FLAGS} -sTOTAL_STACK=${TBB_EMSCRIPTEN_STACK_SIZE})
    unset(TBB_EMSCRIPTEN_STACK_SIZE)
endif()

# Symbol-export flag and definition-file prefix per platform.
if (MINGW)
    set(TBB_LINK_DEF_FILE_FLAG "")
    set(TBB_DEF_FILE_PREFIX "")
elseif (APPLE)
    set(TBB_LINK_DEF_FILE_FLAG -Wl,-exported_symbols_list,)
    set(TBB_DEF_FILE_PREFIX mac${TBB_ARCH})

    # For correct ucontext.h structures layout
    set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -D_XOPEN_SOURCE)
elseif (MSVC)
    # Use the MSVC settings instead; return() stops processing this file here,
    # so nothing below applies to this configuration.
    include(${CMAKE_CURRENT_LIST_DIR}/MSVC.cmake)
    return()
else()
    set(TBB_LINK_DEF_FILE_FLAG -Wl,--version-script=)
    set(TBB_DEF_FILE_PREFIX lin${TBB_ARCH})
    # NOTE(review): `$<$>:-ffp-model=precise>` has no condition after `$<$`.
    # This and the other `$<$...>` generator expressions in this file look
    # truncated (a nested `<...>` part is missing) -- confirm against the
    # upstream oneTBB file.
    set(TBB_TEST_COMPILE_FLAGS ${TBB_TEST_COMPILE_FLAGS} $<$>:-ffp-model=precise>)
endif()

# Depfile options (e.g. -MD) are inserted automatically in some cases.
# Don't add -MMD to avoid conflicts in such cases.
if (NOT CMAKE_GENERATOR MATCHES "Ninja" AND NOT CMAKE_CXX_DEPENDS_USE_COMPILER)
    set(TBB_MMD_FLAG -MMD)
endif()

set(TBB_WARNING_LEVEL -Wall -Wextra $<$:-Werror>)
set(TBB_TEST_WARNING_FLAGS -Wshadow -Wcast-qual -Woverloaded-virtual -Wnon-virtual-dtor)

# Ignore -Werror set through add_compile_options() or added to CMAKE_CXX_FLAGS if TBB_STRICT is disabled.
# The `COMMAND` test skips the call when tbb_remove_compile_flag is not defined.
if (NOT TBB_STRICT AND COMMAND tbb_remove_compile_flag)
    tbb_remove_compile_flag(-Werror)
endif()

# Enable Intel(R) Transactional Synchronization Extensions (-mrtm) and WAITPKG instructions support (-mwaitpkg) on relevant processors
if (CMAKE_SYSTEM_PROCESSOR MATCHES "(AMD64|amd64|i.86|x86)" AND NOT EMSCRIPTEN)
    set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -mrtm $<$>:-mwaitpkg>)
endif()

# Clang flags to prevent compiler from optimizing out security checks
set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -Wformat -Wformat-security -Werror=format-security -fPIC $<$>:-fstack-protector-strong>)

# -z switch is not supported on MacOS
if (NOT APPLE)
    set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} -Wl,-z,relro,-z,now)
endif()

set(TBB_COMMON_LINK_LIBS ${CMAKE_DL_LIBS})

# Add _FORTIFY_SOURCE only when CMAKE_CXX_FLAGS does not already mention it.
if (NOT CMAKE_CXX_FLAGS MATCHES "_FORTIFY_SOURCE")
  set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$>:-D_FORTIFY_SOURCE=2>)
endif ()

if (MINGW)
    list(APPEND TBB_COMMON_COMPILE_FLAGS -U__STRICT_ANSI__)
endif()

# With TBB_FILE_TRIM on and compiler version >= 10, map the two path prefixes
# below to an empty string via -ffile-prefix-map.
if (TBB_FILE_TRIM AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 10)
    set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -ffile-prefix-map=${NATIVE_TBB_PROJECT_ROOT_DIR}/= -ffile-prefix-map=${NATIVE_TBB_RELATIVE_BIN_PATH}/=)
endif ()

# Link-time optimization flags for the compile and link steps.
set(TBB_IPO_COMPILE_FLAGS $<$>:-flto>)
set(TBB_IPO_LINK_FLAGS $<$>:-flto>)

# TBB malloc settings
set(TBBMALLOC_LIB_COMPILE_FLAGS -fno-rtti -fno-exceptions)

================================================
FILE: third-party/tbb/cmake/compilers/GNU.cmake
================================================
# Copyright (c) 2020-2024 Intel Corporation
#
# Licensed under the Apache License, Version
2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. if (MINGW) set(TBB_LINK_DEF_FILE_FLAG "") set(TBB_DEF_FILE_PREFIX "") elseif (APPLE) set(TBB_LINK_DEF_FILE_FLAG -Wl,-exported_symbols_list,) set(TBB_DEF_FILE_PREFIX mac${TBB_ARCH}) # For correct ucontext.h structures layout set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -D_XOPEN_SOURCE) else() set(TBB_LINK_DEF_FILE_FLAG -Wl,--version-script=) set(TBB_DEF_FILE_PREFIX lin${TBB_ARCH}) endif() set(TBB_WARNING_LEVEL -Wall -Wextra $<$:-Werror> -Wfatal-errors) set(TBB_TEST_WARNING_FLAGS -Wshadow -Wcast-qual -Woverloaded-virtual -Wnon-virtual-dtor) # Depfile options (e.g. -MD) are inserted automatically in some cases. # Don't add -MMD to avoid conflicts in such cases. if (NOT CMAKE_GENERATOR MATCHES "Ninja" AND NOT CMAKE_CXX_DEPENDS_USE_COMPILER) set(TBB_MMD_FLAG -MMD) endif() # Binutils < 2.31.1 do not support the tpause instruction. When compiling with # a modern version of GCC (supporting it) but relying on an outdated assembler, # will result in an error reporting "no such instruction: tpause". # The following code invokes the GNU assembler to extract the version number # and convert it to an integer that can be used in the C++ code to compare # against, and conditionally disable the __TBB_WAITPKG_INTRINSICS_PRESENT # macro if the version is incompatible. Binutils only report the version in the # MAJOR.MINOR format, therefore the version checked is >=2.32 (instead of # >=2.31.1). Capturing the output in CMake can be done like below. 
The version # information is written to either stdout or stderr. To not make any # assumptions, both are captured. execute_process( COMMAND ${CMAKE_COMMAND} -E env "LANG=C" ${CMAKE_CXX_COMPILER} -xc -c /dev/null -Wa,-v -o/dev/null OUTPUT_VARIABLE ASSEMBLER_VERSION_LINE_OUT ERROR_VARIABLE ASSEMBLER_VERSION_LINE_ERR OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_STRIP_TRAILING_WHITESPACE ) set(ASSEMBLER_VERSION_LINE ${ASSEMBLER_VERSION_LINE_OUT}${ASSEMBLER_VERSION_LINE_ERR}) if ("${ASSEMBLER_VERSION_LINE}" MATCHES "GNU assembler version [0-9]+\\.[0-9]+") string(REGEX REPLACE ".*GNU assembler version ([0-9]+)\\.([0-9]+).*" "\\1" _tbb_gnu_asm_major_version "${ASSEMBLER_VERSION_LINE}") string(REGEX REPLACE ".*GNU assembler version ([0-9]+)\\.([0-9]+).*" "\\2" _tbb_gnu_asm_minor_version "${ASSEMBLER_VERSION_LINE}") else() # The assembler did not report a "GNU assembler version X.Y" banner. In that case # string(REGEX REPLACE) would return the whole unparsed output and math(EXPR) below would fail, # so fall back to version 0.0, which never satisfies the >=2.32 check described above. set(_tbb_gnu_asm_major_version 0) set(_tbb_gnu_asm_minor_version 0) endif() unset(ASSEMBLER_VERSION_LINE_OUT) unset(ASSEMBLER_VERSION_LINE_ERR) unset(ASSEMBLER_VERSION_LINE) message(TRACE "Extracted GNU assembler version: major=${_tbb_gnu_asm_major_version} minor=${_tbb_gnu_asm_minor_version}") math(EXPR _tbb_gnu_asm_version_number "${_tbb_gnu_asm_major_version} * 1000 + ${_tbb_gnu_asm_minor_version}") set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} "-D__TBB_GNU_ASM_VERSION=${_tbb_gnu_asm_version_number}") message(STATUS "GNU Assembler version: ${_tbb_gnu_asm_major_version}.${_tbb_gnu_asm_minor_version} (${_tbb_gnu_asm_version_number})") # Enable Intel(R) Transactional Synchronization Extensions (-mrtm) and WAITPKG instructions support (-mwaitpkg) on relevant processors if (CMAKE_SYSTEM_PROCESSOR MATCHES "(AMD64|amd64|i.86|x86)" AND NOT EMSCRIPTEN) set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -mrtm $<$>,$>>:-mwaitpkg>) endif() set(TBB_COMMON_LINK_LIBS ${CMAKE_DL_LIBS}) # Ignore -Werror set through add_compile_options() or added to CMAKE_CXX_FLAGS if TBB_STRICT is disabled. 
if (NOT TBB_STRICT AND COMMAND tbb_remove_compile_flag) tbb_remove_compile_flag(-Werror) endif() if (NOT ${CMAKE_CXX_COMPILER_ID} STREQUAL Intel) # gcc 6.0 and later have -flifetime-dse option that controls elimination of stores done outside the object lifetime set(TBB_DSE_FLAG $<$>:-flifetime-dse=1>) set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$>:-fstack-clash-protection>) # Suppress GCC 12.x-14.x warning here that to_wait_node(n)->my_is_in_list might have size 0 set(TBB_COMMON_LINK_FLAGS ${TBB_COMMON_LINK_FLAGS} $<$>,$>:-Wno-stringop-overflow>) endif() # Workaround for heavy tests and too many symbols in debug (relocation truncated to fit: R_MIPS_CALL16) if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "mips") set(TBB_TEST_COMPILE_FLAGS ${TBB_TEST_COMPILE_FLAGS} -DTBB_TEST_LOW_WORKLOAD $<$:-fPIE -mxgot>) set(TBB_TEST_LINK_FLAGS ${TBB_TEST_LINK_FLAGS} $<$:-pie>) endif() set(TBB_IPO_COMPILE_FLAGS $<$>:-flto>) set(TBB_IPO_LINK_FLAGS $<$>:-flto>) if (MINGW AND CMAKE_SYSTEM_PROCESSOR MATCHES "i.86") list (APPEND TBB_COMMON_COMPILE_FLAGS -msse2) endif () # Gnu flags to prevent compiler from optimizing out security checks set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -fno-strict-overflow -fno-delete-null-pointer-checks -fwrapv) set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -Wformat -Wformat-security -Werror=format-security -fstack-protector-strong ) # -z switch is not supported on MacOS and MinGW if (NOT APPLE AND NOT MINGW) set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} -Wl,-z,relro,-z,now,-z,noexecstack) endif() if (NOT CMAKE_CXX_FLAGS MATCHES "_FORTIFY_SOURCE") set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$>:-D_FORTIFY_SOURCE=2> ) endif () if (TBB_FILE_TRIM AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8) set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -ffile-prefix-map=${NATIVE_TBB_PROJECT_ROOT_DIR}/= -ffile-prefix-map=${NATIVE_TBB_RELATIVE_BIN_PATH}/=) endif () # TBB malloc settings 
set(TBBMALLOC_LIB_COMPILE_FLAGS -fno-rtti -fno-exceptions) set(TBB_OPENMP_FLAG -fopenmp) ================================================ FILE: third-party/tbb/cmake/compilers/Intel.cmake ================================================ # Copyright (c) 2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. if (MSVC) include(${CMAKE_CURRENT_LIST_DIR}/MSVC.cmake) set(TBB_WARNING_LEVEL ${TBB_WARNING_LEVEL} /W3) set(TBB_OPENMP_FLAG /Qopenmp) set(TBB_IPO_COMPILE_FLAGS $<$>:/Qipo>) set(TBB_IPO_LINK_FLAGS $<$>:/INCREMENTAL:NO>) elseif (APPLE) include(${CMAKE_CURRENT_LIST_DIR}/AppleClang.cmake) set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -fstack-protector -Wformat -Wformat-security $<$>:-fno-omit-frame-pointer -qno-opt-report-embed>) if (NOT CMAKE_CXX_FLAGS MATCHES "_FORTIFY_SOURCE") set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$>:-D_FORTIFY_SOURCE=2>) endif () set(TBB_OPENMP_FLAG -qopenmp) set(TBB_IPO_COMPILE_FLAGS $<$>:-ipo>) else() include(${CMAKE_CURRENT_LIST_DIR}/GNU.cmake) set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} $<$:-falign-stack=maintain-16-byte>) set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} -static-intel) set(TBB_OPENMP_FLAG -qopenmp) set(TBB_IPO_COMPILE_FLAGS $<$>:-ipo>) endif() set(TBB_IPO_LINK_FLAGS ${TBB_IPO_LINK_FLAGS} ${TBB_IPO_COMPILE_FLAGS}) ================================================ FILE: third-party/tbb/cmake/compilers/IntelLLVM.cmake ================================================ # Copyright (c) 
2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. if (WIN32) include(${CMAKE_CURRENT_LIST_DIR}/MSVC.cmake) set(TBB_OPENMP_FLAG /Qopenmp) set(TBB_IPO_COMPILE_FLAGS $<$>:/Qipo>) set(TBB_IPO_LINK_FLAGS $<$>:/INCREMENTAL:NO>) else() include(${CMAKE_CURRENT_LIST_DIR}/Clang.cmake) set(TBB_IPO_COMPILE_FLAGS $<$>:-ipo>) # "--exclude-libs,ALL" is used to avoid accidental exporting of symbols # from statically linked libraries set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} -static-intel -Wl,--exclude-libs,ALL) set(TBB_OPENMP_FLAG -qopenmp) endif() set(TBB_IPO_LINK_FLAGS ${TBB_IPO_LINK_FLAGS} ${TBB_IPO_COMPILE_FLAGS}) ================================================ FILE: third-party/tbb/cmake/compilers/MSVC.cmake ================================================ # Copyright (c) 2020-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
set(TBB_LINK_DEF_FILE_FLAG ${CMAKE_LINK_DEF_FILE_FLAG}) set(TBB_DEF_FILE_PREFIX win${TBB_ARCH}) # Workaround for CMake issue https://gitlab.kitware.com/cmake/cmake/issues/18317. # TODO: consider use of CMP0092 CMake policy. string(REGEX REPLACE "/W[0-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") set(TBB_WARNING_LEVEL $<$>:/W4> $<$:/WX>) # Warning suppression C4324: structure was padded due to alignment specifier set(TBB_WARNING_SUPPRESS /wd4324) set(TBB_TEST_COMPILE_FLAGS ${TBB_TEST_COMPILE_FLAGS} /bigobj) if (MSVC_VERSION LESS_EQUAL 1900) # Warning suppression C4503 for VS2015 and earlier: # decorated name length exceeded, name was truncated. # More info can be found at # https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-1-c4503 set(TBB_TEST_COMPILE_FLAGS ${TBB_TEST_COMPILE_FLAGS} /wd4503) endif() set(TBB_LIB_COMPILE_FLAGS -D_CRT_SECURE_NO_WARNINGS /GS) set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} /volatile:iso /FS /EHsc) set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} /DEPENDENTLOADFLAG:0x2000 /DYNAMICBASE /NXCOMPAT) if (TBB_ARCH EQUAL 32) set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} /SAFESEH ) endif() # Ignore /WX set through add_compile_options() or added to CMAKE_CXX_FLAGS if TBB_STRICT is disabled. 
if (NOT TBB_STRICT AND COMMAND tbb_remove_compile_flag) tbb_remove_compile_flag(/WX) endif() if (WINDOWS_STORE OR TBB_WINDOWS_DRIVER) set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} /D_WIN32_WINNT=0x0A00) set(TBB_COMMON_LINK_FLAGS -NODEFAULTLIB:kernel32.lib -INCREMENTAL:NO) set(TBB_COMMON_LINK_LIBS OneCore.lib) endif() if (WINDOWS_STORE) if (NOT CMAKE_SYSTEM_VERSION EQUAL 10.0) message(FATAL_ERROR "CMAKE_SYSTEM_VERSION must be equal to 10.0") endif() set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} /ZW /ZW:nostdlib) # CMake defines this extra lib; remove it for this build type string(REGEX REPLACE "WindowsApp.lib" "" CMAKE_CXX_STANDARD_LIBRARIES "${CMAKE_CXX_STANDARD_LIBRARIES}") if (TBB_NO_APPCONTAINER) set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} -APPCONTAINER:NO) endif() endif() if (TBB_WINDOWS_DRIVER) # Since this is a universal driver, disable this variable set(CMAKE_SYSTEM_PROCESSOR "") # CMake defines a list of additional libs; remove it for this build type set(CMAKE_CXX_STANDARD_LIBRARIES "") set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} /D _UNICODE /DUNICODE /DWINAPI_FAMILY=WINAPI_FAMILY_APP /D__WRL_NO_DEFAULT_LIB__) endif() if (TBB_FILE_TRIM AND NOT CMAKE_CXX_COMPILER_ID MATCHES "(Intel|IntelLLVM|Clang)") add_compile_options( "$<$:/d1trimfile:${NATIVE_TBB_PROJECT_ROOT_DIR}\\>" "$<$:/d1trimfile:${CMAKE_SOURCE_DIR}/>") endif() if (TBB_CONTROL_FLOW_GUARD) set(TBB_LIB_COMPILE_FLAGS ${TBB_LIB_COMPILE_FLAGS} /guard:cf) set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} /guard:cf) endif() if (CMAKE_CXX_COMPILER_ID MATCHES "(Clang|IntelLLVM)") if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86|AMD64|i.86)") set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -mrtm -mwaitpkg) endif() set(TBB_IPO_COMPILE_FLAGS $<$>:-flto>) set(TBB_IPO_LINK_FLAGS $<$>:-flto>) else() set(TBB_IPO_COMPILE_FLAGS $<$>:/GL>) set(TBB_IPO_LINK_FLAGS $<$>:-LTCG> $<$>:-INCREMENTAL:NO>) endif() set(TBB_OPENMP_FLAG /openmp) set(TBB_OPENMP_NO_LINK_FLAG TRUE) # TBB_OPENMP_FLAG will be used 
only on compilation but not on linkage ================================================ FILE: third-party/tbb/cmake/compilers/QCC.cmake ================================================ # Copyright (c) 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # QCC reuses the GNU settings; resolve GNU.cmake relative to this file, as Intel.cmake and IntelLLVM.cmake do include(${CMAKE_CURRENT_LIST_DIR}/GNU.cmake) # Remove dl library not present in QNX systems unset(TBB_COMMON_LINK_LIBS) ================================================ FILE: third-party/tbb/cmake/config_generation.cmake ================================================ # Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# Save current location, # see for details: https://cmake.org/cmake/help/latest/variable/CMAKE_CURRENT_LIST_DIR.html set(_tbb_gen_cfg_path ${CMAKE_CURRENT_LIST_DIR}) include(CMakeParseArguments) # Generates TBBConfig.cmake and TBBConfigVersion.cmake (plus tbb.pc/tbb32.pc where configured below) into INSTALL_DIR # for the OS named by SYSTEM_NAME (Linux, Darwin or Windows); any other SYSTEM_NAME is a fatal error. function(tbb_generate_config) set(options HANDLE_SUBDIRS) set(oneValueArgs INSTALL_DIR SYSTEM_NAME LIB_REL_PATH INC_REL_PATH VERSION TBB_BINARY_VERSION TBBMALLOC_BINARY_VERSION TBBMALLOC_PROXY_BINARY_VERSION TBBBIND_BINARY_VERSION) cmake_parse_arguments(tbb_gen_cfg "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) get_filename_component(config_install_dir ${tbb_gen_cfg_INSTALL_DIR} ABSOLUTE) file(MAKE_DIRECTORY ${config_install_dir}) file(TO_CMAKE_PATH "${tbb_gen_cfg_LIB_REL_PATH}" TBB_LIB_REL_PATH) file(TO_CMAKE_PATH "${tbb_gen_cfg_INC_REL_PATH}" TBB_INC_REL_PATH) set(TBB_VERSION ${tbb_gen_cfg_VERSION}) set(_tbb_pc_lib_name tbb) set(_prefix_for_pc_file "\${pcfiledir}/../../") set(_includedir_for_pc_file "\${prefix}/include") set(TBB_COMPONENTS_BIN_VERSION " set(_tbb_bin_version ${tbb_gen_cfg_TBB_BINARY_VERSION}) set(_tbbmalloc_bin_version ${tbb_gen_cfg_TBBMALLOC_BINARY_VERSION}) set(_tbbmalloc_proxy_bin_version ${tbb_gen_cfg_TBBMALLOC_PROXY_BINARY_VERSION}) set(_tbbbind_bin_version ${tbb_gen_cfg_TBBBIND_BINARY_VERSION}) ") if (tbb_gen_cfg_SYSTEM_NAME STREQUAL "Linux") set(TBB_LIB_PREFIX "lib") set(TBB_LIB_EXT "so.\${_\${_tbb_component}_bin_version}") set (TBB_HANDLE_IMPLIB " set (_tbb_release_dll \${_tbb_release_lib}) set (_tbb_debug_dll \${_tbb_debug_lib}) ") if (tbb_gen_cfg_HANDLE_SUBDIRS) set(TBB_HANDLE_SUBDIRS "set(_tbb_subdir gcc4.8)") set(_libdir_for_pc_file "\${prefix}/lib/intel64/gcc4.8") set(_tbb_pc_extra_libdir "-L\${prefix}/lib") configure_file(${_tbb_gen_cfg_path}/../integration/pkg-config/tbb.pc.in ${config_install_dir}/tbb.pc @ONLY) set(_libdir_for_pc_file "\${prefix}/lib/ia32/gcc4.8") set(_tbb_pc_extra_libdir "-L\${prefix}/lib32") configure_file(${_tbb_gen_cfg_path}/../integration/pkg-config/tbb.pc.in ${config_install_dir}/tbb32.pc @ONLY) endif() 
elseif (tbb_gen_cfg_SYSTEM_NAME STREQUAL "Darwin") set(TBB_LIB_PREFIX "lib") set(TBB_LIB_EXT "\${_\${_tbb_component}_bin_version}.dylib") set (TBB_HANDLE_IMPLIB " set (_tbb_release_dll \${_tbb_release_lib}) set (_tbb_debug_dll \${_tbb_debug_lib}) ") set(_libdir_for_pc_file "\${prefix}/lib") configure_file(${_tbb_gen_cfg_path}/../integration/pkg-config/tbb.pc.in ${config_install_dir}/tbb.pc @ONLY) elseif (tbb_gen_cfg_SYSTEM_NAME STREQUAL "Windows") set(TBB_LIB_PREFIX "") set(TBB_LIB_EXT "lib") set(TBB_COMPILE_DEFINITIONS " INTERFACE_COMPILE_DEFINITIONS \"__TBB_NO_IMPLICIT_LINKAGE=1\"") # .lib - installed to TBB_LIB_REL_PATH (e.g. /lib) and are passed as IMPORTED_IMPLIB_ property to target # .dll - installed to /bin or /redist and are passed as IMPORTED_LOCATION_ property to target set (TBB_HANDLE_IMPLIB " find_file(_tbb_release_dll NAMES \${_tbb_component}\${_bin_version}.dll PATHS \${_tbb_root} PATH_SUFFIXES \"redist/\${_tbb_intel_arch}/\${_tbb_subdir}\" \"bin\${_tbb_arch_suffix}/\${_tbb_subdir}\" \"bin\${_tbb_arch_suffix}/\" \"bin\" NO_DEFAULT_PATH ) if (EXISTS \"\${_tbb_debug_lib}\") find_file(_tbb_debug_dll NAMES \${_tbb_component}\${_bin_version}_debug.dll PATHS \${_tbb_root} PATH_SUFFIXES \"redist/\${_tbb_intel_arch}/\${_tbb_subdir}\" \"bin\${_tbb_arch_suffix}/\${_tbb_subdir}\" \"bin\${_tbb_arch_suffix}/\" \"bin\" NO_DEFAULT_PATH ) endif() ") set(TBB_IMPLIB_RELEASE " IMPORTED_IMPLIB_RELEASE \"\${_tbb_release_lib}\"") set(TBB_IMPLIB_DEBUG " IMPORTED_IMPLIB_DEBUG \"\${_tbb_debug_lib}\"") if (tbb_gen_cfg_HANDLE_SUBDIRS) set(TBB_HANDLE_SUBDIRS " set(_tbb_subdir vc14) if (WINDOWS_STORE) set(_tbb_subdir \${_tbb_subdir}_uwp) endif() ") set(_tbb_pc_lib_name ${_tbb_pc_lib_name}${TBB_BINARY_VERSION}) set(_libdir_for_pc_file "\${prefix}/lib/intel64/vc14") set(_tbb_pc_extra_libdir "-L\${prefix}/lib") configure_file(${_tbb_gen_cfg_path}/../integration/pkg-config/tbb.pc.in ${config_install_dir}/tbb.pc @ONLY) set(_libdir_for_pc_file "\${prefix}/lib/ia32/vc14") 
set(_tbb_pc_extra_libdir "-L\${prefix}/lib32") configure_file(${_tbb_gen_cfg_path}/../integration/pkg-config/tbb.pc.in ${config_install_dir}/tbb32.pc @ONLY) endif() set(TBB_HANDLE_BIN_VERSION " unset(_bin_version) if (_tbb_component STREQUAL tbb) set(_bin_version \${_tbb_bin_version}) endif() ") else() # Report the SYSTEM_NAME argument that was actually parsed for this call message(FATAL_ERROR "Unsupported OS name: ${tbb_gen_cfg_SYSTEM_NAME}") endif() configure_file(${_tbb_gen_cfg_path}/templates/TBBConfig.cmake.in ${config_install_dir}/TBBConfig.cmake @ONLY) configure_file(${_tbb_gen_cfg_path}/templates/TBBConfigVersion.cmake.in ${config_install_dir}/TBBConfigVersion.cmake @ONLY) endfunction() ================================================ FILE: third-party/tbb/cmake/hwloc_detection.cmake ================================================ # Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
list(APPEND HWLOC_REQUIRED_VERSIONS 1_11 2 2_5) foreach(hwloc_version ${HWLOC_REQUIRED_VERSIONS}) if (NOT WIN32) set(CMAKE_HWLOC_${hwloc_version}_DLL_PATH STUB) endif() set(HWLOC_TARGET_NAME HWLOC::hwloc_${hwloc_version}) if (NOT TARGET ${HWLOC_TARGET_NAME} AND CMAKE_HWLOC_${hwloc_version}_LIBRARY_PATH AND CMAKE_HWLOC_${hwloc_version}_DLL_PATH AND CMAKE_HWLOC_${hwloc_version}_INCLUDE_PATH ) add_library(${HWLOC_TARGET_NAME} SHARED IMPORTED) set_target_properties(${HWLOC_TARGET_NAME} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_HWLOC_${hwloc_version}_INCLUDE_PATH}") if (WIN32) set_target_properties(${HWLOC_TARGET_NAME} PROPERTIES IMPORTED_LOCATION "${CMAKE_HWLOC_${hwloc_version}_DLL_PATH}" IMPORTED_IMPLIB "${CMAKE_HWLOC_${hwloc_version}_LIBRARY_PATH}") else() set_target_properties(${HWLOC_TARGET_NAME} PROPERTIES IMPORTED_LOCATION "${CMAKE_HWLOC_${hwloc_version}_LIBRARY_PATH}") endif() endif() if (TARGET ${HWLOC_TARGET_NAME}) set(HWLOC_TARGET_EXPLICITLY_DEFINED TRUE) endif() endforeach() unset(HWLOC_TARGET_NAME) if (NOT HWLOC_TARGET_EXPLICITLY_DEFINED AND NOT TBB_DISABLE_HWLOC_AUTOMATIC_SEARCH ) find_package(PkgConfig QUIET) if (PKG_CONFIG_FOUND AND NOT CMAKE_VERSION VERSION_LESS 3.6) pkg_search_module(HWLOC hwloc IMPORTED_TARGET) if (TARGET PkgConfig::HWLOC) if (HWLOC_VERSION VERSION_LESS 2) set(TBBBIND_LIBRARY_NAME tbbbind) elseif(HWLOC_VERSION VERSION_LESS 2.5) set(TBBBIND_LIBRARY_NAME tbbbind_2_0) else() set(TBBBIND_LIBRARY_NAME tbbbind_2_5) endif() endif() endif() endif() ================================================ FILE: third-party/tbb/cmake/memcheck.cmake ================================================ # Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. option(TBB_VALGRIND_MEMCHECK "Enable scan for memory leaks using Valgrind" OFF) if (NOT TBB_VALGRIND_MEMCHECK) return() endif() add_custom_target(memcheck-all COMMENT "Run memcheck on all tests") find_program(VALGRIND_EXE valgrind) if (NOT VALGRIND_EXE) message(FATAL_ERROR "Valgrind executable is not found, add tool to PATH or turn off TBB_VALGRIND_MEMCHECK") else() message(STATUS "Found Valgrind to run memory leak scan") endif() file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/memcheck) # Adds a memcheck-<test_target> target that runs the test executable under Valgrind, # writes the log to ${CMAKE_BINARY_DIR}/memcheck/ and attaches the target to memcheck-all. function(_tbb_run_memcheck test_target subdir) set(target_name memcheck-${test_target}) # Do not pick up an "option" variable inherited from the caller's scope. unset(option) if("${subdir}" STREQUAL "tbbmalloc") # Valgrind intercepts all allocation symbols with its own by default, # so it disables using tbbmalloc. In case of tbbmalloc tests # intercept allocation symbols only in the default system libraries, # but not in any other shared library or the executable # defining public malloc or operator new related functions. 
set(option "--soname-synonyms=somalloc=nouserintercepts") endif() add_custom_target(${target_name} COMMAND ${VALGRIND_EXE} ${option} --leak-check=full --show-leak-kinds=all --log-file=${CMAKE_BINARY_DIR}/memcheck/${target_name}.log -v $<TARGET_FILE:${test_target}>) add_dependencies(memcheck-all ${target_name}) endfunction() add_custom_target(memcheck-short COMMENT "Run memcheck scan on specified list") # List of reasonable and quick enough tests to use in automated memcheck add_dependencies(memcheck-short memcheck-test_allocators memcheck-test_arena_constraints memcheck-test_dynamic_link memcheck-test_concurrent_lru_cache memcheck-conformance_concurrent_unordered_map memcheck-conformance_concurrent_unordered_set memcheck-conformance_concurrent_map memcheck-conformance_concurrent_set memcheck-conformance_concurrent_priority_queue memcheck-conformance_concurrent_vector memcheck-conformance_concurrent_queue memcheck-conformance_concurrent_hash_map memcheck-test_parallel_for memcheck-test_parallel_for_each memcheck-test_parallel_reduce memcheck-test_parallel_sort memcheck-test_parallel_invoke memcheck-test_parallel_scan memcheck-test_parallel_pipeline memcheck-test_eh_algorithms memcheck-test_task_group memcheck-test_task_arena memcheck-test_enumerable_thread_specific memcheck-test_resumable_tasks memcheck-conformance_mutex memcheck-test_function_node memcheck-test_multifunction_node memcheck-test_broadcast_node memcheck-test_buffer_node memcheck-test_composite_node memcheck-test_continue_node memcheck-test_eh_flow_graph memcheck-test_flow_graph memcheck-test_flow_graph_priorities memcheck-test_flow_graph_whitebox memcheck-test_indexer_node memcheck-test_join_node memcheck-test_join_node_key_matching memcheck-test_join_node_msg_key_matching memcheck-test_priority_queue_node memcheck-test_sequencer_node memcheck-test_split_node memcheck-test_tagged_msg memcheck-test_overwrite_node memcheck-test_write_once_node memcheck-test_async_node memcheck-test_input_node memcheck-test_profiling 
memcheck-test_concurrent_queue_whitebox memcheck-test_intrusive_list memcheck-test_semaphore memcheck-test_environment_whitebox memcheck-test_handle_perror memcheck-test_hw_concurrency memcheck-test_eh_thread memcheck-test_global_control memcheck-test_task memcheck-test_concurrent_monitor ) ================================================ FILE: third-party/tbb/cmake/packaging.cmake ================================================ # Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Note: current implementation uses CMAKE_BUILD_TYPE, # this parameter is not defined for multi-config generators. set(CPACK_PACKAGE_NAME "${PROJECT_NAME}") set(CPACK_PACKAGE_VERSION "${TBB_VERSION}") string(TOLOWER ${CPACK_PACKAGE_NAME}-${PROJECT_VERSION}-${CMAKE_SYSTEM_NAME}_${TBB_OUTPUT_DIR_BASE}_${CMAKE_BUILD_TYPE} CPACK_PACKAGE_FILE_NAME) set(CPACK_GENERATOR ZIP) # Note: this is an internal non-documented variable set by CPack if (NOT CPack_CMake_INCLUDED) include(CPack) endif() ================================================ FILE: third-party/tbb/cmake/post_install/CMakeLists.txt ================================================ # Copyright (c) 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Add code signing as post-install step. if (DEFINED TBB_SIGNTOOL) file(TO_CMAKE_PATH "${TBB_SIGNTOOL}" TBB_SIGNTOOL) install(CODE " file(GLOB_RECURSE FILES_TO_SIGN \${CMAKE_INSTALL_PREFIX}/*${CMAKE_SHARED_LIBRARY_SUFFIX}) execute_process(COMMAND ${TBB_SIGNTOOL} \${FILES_TO_SIGN} ${TBB_SIGNTOOL_ARGS}) ") endif() ================================================ FILE: third-party/tbb/cmake/python/test_launcher.cmake ================================================ # Copyright (c) 2020-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
if (POLICY CMP0148) # CMake 3.27: The FindPythonInterp and FindPythonLibs modules are removed cmake_policy(SET CMP0148 OLD) endif() find_package(PythonInterp 3.5 REQUIRED) file(GLOB_RECURSE MODULES_LIST "${PYTHON_MODULE_BUILD_PATH}/*TBB.py*" ) list(LENGTH MODULES_LIST MODULES_COUNT) if (MODULES_COUNT EQUAL 0) message(FATAL_ERROR "Cannot find oneTBB Python module") elseif (MODULES_COUNT GREATER 1) message(WARNING "Found more than oneTBB Python modules, the only first found module will be tested") endif() list(GET MODULES_LIST 0 PYTHON_MODULE) get_filename_component(PYTHON_MODULE_PATH ${PYTHON_MODULE} DIRECTORY) execute_process( COMMAND ${CMAKE_COMMAND} -E env LD_LIBRARY_PATH=${TBB_BINARIES_PATH} ${PYTHON_EXECUTABLE} -m tbb test WORKING_DIRECTORY ${PYTHON_MODULE_PATH} RESULT_VARIABLE CMD_RESULT ) if (CMD_RESULT) # Name the command that was actually run above (no "cmd" variable is set in this script) message(FATAL_ERROR "Error while test execution: ${PYTHON_EXECUTABLE} -m tbb test error_code: ${CMD_RESULT}") endif() ================================================ FILE: third-party/tbb/cmake/resumable_tasks.cmake ================================================ # Copyright (c) 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
include(CheckSymbolExists) if (UNIX) set(CMAKE_REQUIRED_FLAGS -Wno-deprecated-declarations) if (APPLE) set(CMAKE_REQUIRED_DEFINITIONS -D_XOPEN_SOURCE) endif() check_symbol_exists("getcontext" "ucontext.h" _tbb_have_ucontext) if (NOT _tbb_have_ucontext) set(TBB_RESUMABLE_TASKS_USE_THREADS "__TBB_RESUMABLE_TASKS_USE_THREADS=1") endif() unset(_tbb_have_ucontext) unset(CMAKE_REQUIRED_DEFINITIONS) unset(CMAKE_REQUIRED_FLAGS) endif() ================================================ FILE: third-party/tbb/cmake/sanitize.cmake ================================================ # Copyright (c) 2020-2022 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. set(TBB_SANITIZE ${TBB_SANITIZE} CACHE STRING "Sanitizer parameter passed to compiler/linker" FORCE) # Possible values of sanitizer parameter for cmake-gui for convenience, user still can use any other value. set_property(CACHE TBB_SANITIZE PROPERTY STRINGS "thread" "memory" "leak" "address -fno-omit-frame-pointer") if (NOT TBB_SANITIZE) return() endif() set(TBB_SANITIZE_OPTION -fsanitize=${TBB_SANITIZE}) # The sanitizer option must be added to CMAKE_REQUIRED_LIBRARIES for check_cxx_compiler_flag to work properly: # the option has to be passed during the linking phase as well as during compilation. 
set(CMAKE_REQUIRED_LIBRARIES "${TBB_SANITIZE_OPTION} ${CMAKE_REQUIRED_LIBRARIES}") string(MAKE_C_IDENTIFIER ${TBB_SANITIZE_OPTION} FLAG_DISPLAY_NAME) check_cxx_compiler_flag(${TBB_SANITIZE_OPTION} ${FLAG_DISPLAY_NAME}) if (NOT ${FLAG_DISPLAY_NAME}) message(FATAL_ERROR "${TBB_SANITIZE_OPTION} is not supported by compiler ${CMAKE_CXX_COMPILER_ID}:${CMAKE_CXX_COMPILER_VERSION}, " "please try another compiler or omit TBB_SANITIZE variable") endif() set(TBB_TESTS_ENVIRONMENT ${TBB_TESTS_ENVIRONMENT} "TSAN_OPTIONS=suppressions=${CMAKE_CURRENT_SOURCE_DIR}/cmake/suppressions/tsan.suppressions" "LSAN_OPTIONS=suppressions=${CMAKE_CURRENT_SOURCE_DIR}/cmake/suppressions/lsan.suppressions") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TBB_SANITIZE_OPTION}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${TBB_SANITIZE_OPTION}") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${TBB_SANITIZE_OPTION}") ================================================ FILE: third-party/tbb/cmake/scripts/cmake_gen_github_configs.cmake ================================================ # Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
include(${CMAKE_CURRENT_LIST_DIR}/../config_generation.cmake) # TBBConfig in TBB provided packages are expected to be placed into: /lib/cmake/tbb* set(TBB_ROOT_REL_PATH "../../..") # Paths relative to TBB root directory set(INC_REL_PATH "include") set(LIB_REL_PATH "lib") # Parse version info file(READ ${CMAKE_CURRENT_LIST_DIR}/../../include/oneapi/tbb/version.h _tbb_version_info) string(REGEX REPLACE ".*#define TBB_VERSION_MAJOR ([0-9]+).*" "\\1" _tbb_ver_major "${_tbb_version_info}") string(REGEX REPLACE ".*#define TBB_VERSION_MINOR ([0-9]+).*" "\\1" _tbb_ver_minor "${_tbb_version_info}") string(REGEX REPLACE ".*#define TBB_VERSION_PATCH ([0-9]+).*" "\\1" _tbb_ver_patch "${_tbb_version_info}") string(REGEX REPLACE ".*#define __TBB_BINARY_VERSION ([0-9]+).*" "\\1" TBB_BINARY_VERSION "${_tbb_version_info}") file(READ ${CMAKE_CURRENT_LIST_DIR}/../../CMakeLists.txt _tbb_cmakelist) string(REGEX REPLACE ".*TBBMALLOC_BINARY_VERSION ([0-9]+).*" "\\1" TBBMALLOC_BINARY_VERSION "${_tbb_cmakelist}") set(TBBMALLOC_PROXY_BINARY_VERSION ${TBBMALLOC_BINARY_VERSION}) string(REGEX REPLACE ".*TBBBIND_BINARY_VERSION ([0-9]+).*" "\\1" TBBBIND_BINARY_VERSION "${_tbb_cmakelist}") set(COMMON_ARGS TBB_ROOT_REL_PATH ${TBB_ROOT_REL_PATH} INC_REL_PATH ${INC_REL_PATH} LIB_REL_PATH ${LIB_REL_PATH} VERSION ${_tbb_ver_major}.${_tbb_ver_minor}.${_tbb_ver_patch} TBB_BINARY_VERSION ${TBB_BINARY_VERSION} TBBMALLOC_BINARY_VERSION ${TBBMALLOC_BINARY_VERSION} TBBMALLOC_PROXY_BINARY_VERSION ${TBBMALLOC_PROXY_BINARY_VERSION} TBBBIND_BINARY_VERSION ${TBBBIND_BINARY_VERSION} ) tbb_generate_config(INSTALL_DIR ${INSTALL_DIR}/linux SYSTEM_NAME Linux HANDLE_SUBDIRS ${COMMON_ARGS}) tbb_generate_config(INSTALL_DIR ${INSTALL_DIR}/windows SYSTEM_NAME Windows HANDLE_SUBDIRS ${COMMON_ARGS}) tbb_generate_config(INSTALL_DIR ${INSTALL_DIR}/darwin SYSTEM_NAME Darwin ${COMMON_ARGS}) message(STATUS "TBBConfig files were created in ${INSTALL_DIR}") ================================================ FILE: 
third-party/tbb/cmake/suppressions/lsan.suppressions ================================================ # LSAN suppression for ltdl library known issue. leak:libltdl.so ================================================ FILE: third-party/tbb/cmake/suppressions/tsan.suppressions ================================================ # TSAN suppression for known issues. # Possible data race during ittnotify initialization. Low impact. race:__itt_nullify_all_pointers race:__itt_init_ittlib ================================================ FILE: third-party/tbb/cmake/templates/TBBConfig.cmake.in ================================================ # Copyright (c) 2017-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# It defines the following variables: # TBB_<component>_FOUND # TBB_IMPORTED_TARGETS # # TBBConfigVersion.cmake defines TBB_VERSION # # Initialize to default values if (NOT TBB_IMPORTED_TARGETS) set(TBB_IMPORTED_TARGETS "") endif() if (NOT TBB_FIND_COMPONENTS) set(TBB_FIND_COMPONENTS "tbb;tbbmalloc;tbbmalloc_proxy") foreach (_tbb_component ${TBB_FIND_COMPONENTS}) set(TBB_FIND_REQUIRED_${_tbb_component} 1) endforeach() endif() get_filename_component(_tbb_root "${CMAKE_CURRENT_LIST_DIR}" REALPATH) get_filename_component(_tbb_root "${_tbb_root}/@TBB_ROOT_REL_PATH@" ABSOLUTE) set(TBB_INTERFACE_VERSION @TBB_INTERFACE_VERSION@) @TBB_COMPONENTS_BIN_VERSION@ # Add components with internal dependencies: tbbmalloc_proxy -> tbbmalloc list(FIND TBB_FIND_COMPONENTS tbbmalloc_proxy _tbbmalloc_proxy_ix) if (NOT _tbbmalloc_proxy_ix EQUAL -1) list(APPEND TBB_FIND_COMPONENTS tbbmalloc) list(REMOVE_DUPLICATES TBB_FIND_COMPONENTS) set(TBB_FIND_REQUIRED_tbbmalloc ${TBB_FIND_REQUIRED_tbbmalloc_proxy}) endif() unset(_tbbmalloc_proxy_ix) if (CMAKE_SIZEOF_VOID_P STREQUAL "8") set(_tbb_intel_arch intel64) else () set(_tbb_intel_arch ia32) set(_tbb_arch_suffix 32) endif() @TBB_HANDLE_SUBDIRS@ foreach (_tbb_component ${TBB_FIND_COMPONENTS}) unset(_tbb_release_dll CACHE) unset(_tbb_debug_dll CACHE) unset(_tbb_release_lib CACHE) unset(_tbb_debug_lib CACHE) set(TBB_${_tbb_component}_FOUND 0) @TBB_HANDLE_BIN_VERSION@ find_library(_tbb_release_lib NAMES @TBB_LIB_PREFIX@${_tbb_component}${_bin_version}.@TBB_LIB_EXT@ PATHS ${_tbb_root} PATH_SUFFIXES "@TBB_LIB_REL_PATH@/${_tbb_intel_arch}/${_tbb_subdir}" "@TBB_LIB_REL_PATH@${_tbb_arch_suffix}/${_tbb_subdir}" "@TBB_LIB_REL_PATH@${_tbb_arch_suffix}" "@TBB_LIB_REL_PATH@" NO_DEFAULT_PATH ) if (NOT TBB_FIND_RELEASE_ONLY) find_library(_tbb_debug_lib NAMES @TBB_LIB_PREFIX@${_tbb_component}${_bin_version}_debug.@TBB_LIB_EXT@ PATHS ${_tbb_root} PATH_SUFFIXES "@TBB_LIB_REL_PATH@/${_tbb_intel_arch}/${_tbb_subdir}" "@TBB_LIB_REL_PATH@${_tbb_arch_suffix}/${_tbb_subdir}"
"@TBB_LIB_REL_PATH@${_tbb_arch_suffix}" "@TBB_LIB_REL_PATH@" NO_DEFAULT_PATH ) endif() if (EXISTS "${_tbb_release_lib}" OR EXISTS "${_tbb_debug_lib}") if (NOT TARGET TBB::${_tbb_component}) add_library(TBB::${_tbb_component} SHARED IMPORTED) get_filename_component(_tbb_include_dir "${_tbb_root}/@TBB_INC_REL_PATH@" ABSOLUTE) set_target_properties(TBB::${_tbb_component} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${_tbb_include_dir}"@TBB_COMPILE_DEFINITIONS@) unset(_tbb_current_realpath) unset(_tbb_include_dir) @TBB_HANDLE_IMPLIB@ if (EXISTS "${_tbb_release_dll}") set_target_properties(TBB::${_tbb_component} PROPERTIES IMPORTED_LOCATION_RELEASE "${_tbb_release_dll}"@TBB_IMPLIB_RELEASE@) set_property(TARGET TBB::${_tbb_component} APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE) endif() if (EXISTS "${_tbb_debug_dll}") set_target_properties(TBB::${_tbb_component} PROPERTIES IMPORTED_LOCATION_DEBUG "${_tbb_debug_dll}"@TBB_IMPLIB_DEBUG@) set_property(TARGET TBB::${_tbb_component} APPEND PROPERTY IMPORTED_CONFIGURATIONS DEBUG) endif() # Add internal dependencies for imported targets: TBB::tbbmalloc_proxy -> TBB::tbbmalloc if (_tbb_component STREQUAL tbbmalloc_proxy) set_target_properties(TBB::tbbmalloc_proxy PROPERTIES INTERFACE_LINK_LIBRARIES TBB::tbbmalloc) endif() endif() list(APPEND TBB_IMPORTED_TARGETS TBB::${_tbb_component}) set(TBB_${_tbb_component}_FOUND 1) elseif (TBB_FIND_REQUIRED AND TBB_FIND_REQUIRED_${_tbb_component}) message(STATUS "Missed required oneTBB component: ${_tbb_component}") if (TBB_FIND_RELEASE_ONLY) message(STATUS " ${_tbb_release_lib} must exist.") else() message(STATUS " one or both of:\n ${_tbb_release_lib}\n ${_tbb_debug_lib}\n files must exist.") endif() set(TBB_FOUND FALSE) endif() endforeach() list(REMOVE_DUPLICATES TBB_IMPORTED_TARGETS) unset(_tbb_release_dll) unset(_tbb_debug_dll) unset(_tbb_release_lib) unset(_tbb_debug_lib) unset(_tbb_root) unset(_tbb_intel_arch) unset(_tbb_arch_suffix) ================================================ 
FILE: third-party/tbb/cmake/templates/TBBConfigVersion.cmake.in ================================================ # Copyright (c) 2017-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. set(PACKAGE_VERSION @TBB_VERSION@) if ("${PACKAGE_VERSION}" VERSION_LESS "${PACKAGE_FIND_VERSION}") set(PACKAGE_VERSION_COMPATIBLE FALSE) else() set(PACKAGE_VERSION_COMPATIBLE TRUE) if ("${PACKAGE_VERSION}" VERSION_EQUAL "${PACKAGE_FIND_VERSION}") set(PACKAGE_VERSION_EXACT TRUE) endif() endif() ================================================ FILE: third-party/tbb/cmake/test_spec.cmake ================================================ # Copyright (c) 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
option(TBB_TEST_SPEC "Generate test specification (Doxygen)" OFF) if (TBB_TEST_SPEC) find_package(Doxygen REQUIRED) set(DOXYGEN_PREDEFINED_MACROS "TBB_USE_EXCEPTIONS \ __TBB_RESUMABLE_TASKS \ __TBB_HWLOC_PRESENT \ __TBB_CPP17_DEDUCTION_GUIDES_PRESENT \ __TBB_CPP17_MEMORY_RESOURCE_PRESENT \ __TBB_CPP14_GENERIC_LAMBDAS_PRESENT" ) add_custom_target( test_spec ALL COMMAND ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile COMMENT "Generating test specification with Doxygen" VERBATIM) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/doc/Doxyfile.in ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile @ONLY) endif() ================================================ FILE: third-party/tbb/cmake/toolchains/mips.cmake ================================================ # Copyright (c) 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Prevent double invocation. if (MIPS_TOOLCHAIN_INCLUDED) return() endif() set(MIPS_TOOLCHAIN_INCLUDED TRUE) set(CMAKE_SYSTEM_NAME Linux) set(CMAKE_SYSTEM_VERSION 1) set(CMAKE_SYSTEM_PROCESSOR mips) set(CMAKE_C_COMPILER ${CMAKE_FIND_ROOT_PATH}/bin/mips-img-linux-gnu-gcc) set(CMAKE_CXX_COMPILER ${CMAKE_FIND_ROOT_PATH}/bin/mips-img-linux-gnu-g++) set(CMAKE_LINKER ${CMAKE_FIND_ROOT_PATH}/bin/mips-img-linux-gnu-ld) # Define result for try_run used in find_package(Threads). # In old CMake versions (checked on 3.5) there is invocation of try_run command in FindThreads.cmake module. 
# It can't be executed on host system in case of cross-compilation for MIPS architecture. # Define return code for this try_run as 0 since threads are expected to be available on target machine. set(THREADS_PTHREAD_ARG "0" CACHE STRING "Result from TRY_RUN" FORCE) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -EL -mabi=64 -march=mips64r6 -mcrc -mfp64 -mmt -mtune=mips64r6 -ggdb -ffp-contract=off -mhard-float" CACHE INTERNAL "") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -mvirt -mxpa" CACHE INTERNAL "") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -mvirt -mxpa" CACHE INTERNAL "") # for tests ================================================ FILE: third-party/tbb/cmake/toolchains/riscv64.cmake ================================================ # Copyright (c) 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Prevent double invocation. 
if (RISCV_TOOLCHAIN_INCLUDED) return() endif() set(RISCV_TOOLCHAIN_INCLUDED TRUE) set(CMAKE_SYSTEM_NAME Linux) set(CMAKE_SYSTEM_VERSION 1) set(CMAKE_SYSTEM_PROCESSOR riscv) # User can use -DCMAKE_FIND_ROOT_PATH to specify the toolchain path set(CMAKE_C_COMPILER ${CMAKE_FIND_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-clang) set(CMAKE_CXX_COMPILER ${CMAKE_FIND_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-clang++) set(CMAKE_LINKER ${CMAKE_FIND_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-ld) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) # Most Linux systems on riscv64 support the rv64imafd_zba_zbb extensions set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64imafd_zba_zbb -mabi=lp64d " CACHE INTERNAL "") ================================================ FILE: third-party/tbb/cmake/utils.cmake ================================================ # Copyright (c) 2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.

# Remove a compile flag from the current source directory's COMPILE_OPTIONS
# property and from CMAKE_CXX_FLAGS.
#   flag - the option to remove. It is interpolated verbatim into the regular
#          expression below, so regex metacharacters in it are not escaped.
macro(tbb_remove_compile_flag flag)
    get_property(_tbb_compile_options DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY COMPILE_OPTIONS)
    list(REMOVE_ITEM _tbb_compile_options ${flag})
    set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY COMPILE_OPTIONS ${_tbb_compile_options})
    unset(_tbb_compile_options)
    if (CMAKE_CXX_FLAGS)
        # Match the flag only as a whole whitespace-delimited word and replace it with a single space.
        string(REGEX REPLACE "(^|[ \t\r\n]+)${flag}($|[ \t\r\n]+)" " " CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
    endif()
endmacro()

# Add install rules for a library target; does nothing unless TBB_INSTALL is set.
#   target - name of the target to install and to add to the TBBTargets export set.
macro(tbb_install_target target)
    if (TBB_INSTALL)
        # NAMELINK_SKIP keeps the namelink out of the "runtime" component;
        # it is installed with the "devel" component by the rule below.
        install(TARGETS ${target}
            EXPORT TBBTargets
            LIBRARY
                DESTINATION ${CMAKE_INSTALL_LIBDIR}
                NAMELINK_SKIP
                COMPONENT runtime
            RUNTIME
                DESTINATION ${CMAKE_INSTALL_BINDIR}
                COMPONENT runtime
            ARCHIVE
                DESTINATION ${CMAKE_INSTALL_LIBDIR}
                COMPONENT devel
            FRAMEWORK
                DESTINATION ${CMAKE_INSTALL_LIBDIR}
                COMPONENT runtime
                OPTIONAL)

        if (BUILD_SHARED_LIBS)
            install(TARGETS ${target}
                LIBRARY
                    DESTINATION ${CMAKE_INSTALL_LIBDIR}
                    NAMELINK_ONLY
                    COMPONENT devel)
        endif()
        if (MSVC AND BUILD_SHARED_LIBS)
            # Install the linker-generated PDB next to the DLL. The generator expression
            # resolves to the PDB path of ${target}; OPTIONAL tolerates its absence.
            install(FILES $<TARGET_PDB_FILE:${target}>
                DESTINATION ${CMAKE_INSTALL_BINDIR}
                COMPONENT devel
                OPTIONAL)
        endif()
    endif()
endmacro()

# Enable interprocedural optimization for a target.
#   target - name of the target to configure.
# When TBB_IPO_PROPERTY is set, the INTERPROCEDURAL_OPTIMIZATION target property is
# used (and explicitly turned off for the Debug configuration). Otherwise, when
# TBB_IPO_FLAGS is set, TBB_IPO_COMPILE_FLAGS / TBB_IPO_LINK_FLAGS are passed explicitly.
macro(tbb_handle_ipo target)
    if (TBB_IPO_PROPERTY)
        set_target_properties(${target} PROPERTIES
            INTERPROCEDURAL_OPTIMIZATION TRUE
            INTERPROCEDURAL_OPTIMIZATION_DEBUG FALSE
        )
    elseif (TBB_IPO_FLAGS)
        target_compile_options(${target} PRIVATE ${TBB_IPO_COMPILE_FLAGS})
        # Fall back to target_link_libraries when target_link_options is not available.
        if (COMMAND target_link_options)
            target_link_options(${target} PRIVATE ${TBB_IPO_LINK_FLAGS})
        else()
            target_link_libraries(${target} PRIVATE ${TBB_IPO_LINK_FLAGS})
        endif()
    endif()
endmacro()


================================================
FILE: third-party/tbb/cmake/vars_utils.cmake
================================================
# Copyright (c) 2020-2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

option(TBB_INSTALL_VARS "Enable auto-generated vars installation" OFF)

# Pick the environment-script template for the host platform.
if (WIN32)
    set(TBB_VARS_TEMPLATE "windows/env/vars.bat.in")
elseif (APPLE)
    set(TBB_VARS_TEMPLATE "mac/env/vars.sh.in")
else()
    set(TBB_VARS_TEMPLATE "linux/env/vars.sh.in")
endif()

# TBB_VARS_NAME is the template's file name with ".in" removed.
get_filename_component(TBB_VARS_TEMPLATE_NAME ${PROJECT_SOURCE_DIR}/integration/${TBB_VARS_TEMPLATE} NAME)
string(REPLACE ".in" "" TBB_VARS_NAME ${TBB_VARS_TEMPLATE_NAME})

# Attach a POST_BUILD step to a target that runs integration/cmake/generate_vars.cmake
# in script mode, passing it the build/source directories, the binary location and the
# vars template settings.
#   target - name of the target the step is attached to; its output location becomes BIN_PATH.
# The step is only added when this project is the top-level project.
macro(tbb_gen_vars target)
    if (NOT TBB_BUILD_APPLE_FRAMEWORKS)
        set(BIN_PATH $<TARGET_FILE_DIR:${target}>)
    else()
        # For Apple* frameworks, the binaries are placed in a framework bundle.
        # When using an Apple* framework, you refer to the bundle, not the binary inside, so we take the bundle's path and go up one level.
        # This path will then be used to generate the vars file, and the contents of the vars file will use the bundle's parent directory.
        set(BIN_PATH $<TARGET_BUNDLE_DIR:${target}>/..)
    endif()
    if (${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME})
        add_custom_command(TARGET ${target} POST_BUILD COMMAND
            ${CMAKE_COMMAND}
            -DBINARY_DIR=${CMAKE_BINARY_DIR}
            -DSOURCE_DIR=${PROJECT_SOURCE_DIR}
            -DBIN_PATH=${BIN_PATH}
            -DVARS_TEMPLATE=${TBB_VARS_TEMPLATE}
            -DVARS_NAME=${TBB_VARS_NAME}
            -DTBB_INSTALL_VARS=${TBB_INSTALL_VARS}
            -DTBB_CMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR}
            -P ${PROJECT_SOURCE_DIR}/integration/cmake/generate_vars.cmake
        )
    endif()
endmacro(tbb_gen_vars)

if (TBB_INSTALL_VARS)
    install(PROGRAMS "${CMAKE_BINARY_DIR}/internal_install_vars"
            DESTINATION env
            RENAME ${TBB_VARS_NAME})
endif()


================================================
FILE: third-party/tbb/doc/Doxyfile.in
================================================
# Doxyfile 1.8.13

# This file describes the settings to be used by the documentation system
# doxygen (www.doxygen.org) for a project.
#
# All text after a double hash (##) is considered a comment and is placed in
# front of the TAG it is preceding.
#
# All text after a single hash (#) is considered a comment and will be ignored.
# The format is:
# TAG = value [value, ...]
# For lists, items can also be appended using:
# TAG += value [value, ...]
# Values that contain spaces should be placed between quotes (\" \").

#---------------------------------------------------------------------------
# Project related configuration options
#---------------------------------------------------------------------------

# This tag specifies the encoding used for all characters in the config file
# that follow. The default is UTF-8 which is also the encoding used for all text
# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
# for the list of possible encodings.
# The default value is: UTF-8.
DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or a sequence of words surrounded by # double-quotes, unless you are using Doxywizard) that should identify the # project for which the documentation is generated. This name is used in the # title of most generated pages and in a few other places. # The default value is: My Project. PROJECT_NAME = "oneTBB Test Specification" # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version # control system is used. PROJECT_NUMBER = # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a # quick idea about the purpose of the project. Keep the description short. PROJECT_BRIEF = # With the PROJECT_LOGO tag one can specify a logo or an icon that is included # in the documentation. The maximum height of the logo should not exceed 55 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy # the logo to the output directory. PROJECT_LOGO = # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path # into which the generated documentation will be written. If a relative path is # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. OUTPUT_DIRECTORY = # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and # will distribute the generated files over these directories. Enabling this # option can be useful when feeding doxygen a huge amount of source files, where # putting all generated files in the same directory would otherwise causes # performance problems for the file system. # The default value is: NO. 
CREATE_SUBDIRS = NO # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII # characters to appear in the names of generated files. If set to NO, non-ASCII # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode # U+3044. # The default value is: NO. ALLOW_UNICODE_NAMES = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. # Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, # Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), # Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, # Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), # Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, # Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, # Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, # Ukrainian and Vietnamese. # The default value is: English. OUTPUT_LANGUAGE = English # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. # The default value is: YES. BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief # description of a member or function before the detailed description # # Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. # The default value is: YES. REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator that is # used to form the text in various listings. 
Each string in this list, if found # as the leading text of the brief description, will be stripped from the text # and the result, after processing the whole list, is used as the annotated # text. Otherwise, the brief description is used as-is. If left blank, the # following values are used ($name is automatically replaced with the name of # the entity):The $name class, The $name widget, The $name file, is, provides, # specifies, contains, represents, a, an and the. ABBREVIATE_BRIEF = "The $name class" \ "The $name widget" \ "The $name file" \ is \ provides \ specifies \ contains \ represents \ a \ an \ the # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # doxygen will generate a detailed section even if there is only a brief # description. # The default value is: NO. ALWAYS_DETAILED_SEC = NO # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. # The default value is: NO. INLINE_INHERITED_MEMB = NO # If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path # before files name in the file list and in the header files. If set to NO the # shortest path that makes the file name unique will be used # The default value is: YES. FULL_PATH_NAMES = YES # The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. # Stripping is only done if one of the specified strings matches the left-hand # part of the path. The tag can be used to show relative paths in the file list. # If left blank the directory from which doxygen is run is used as the path to # strip. # # Note that you can specify absolute paths here, but also relative paths, which # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. 
STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which # header file to include in order to use a class. If left blank only the name of # the header file containing the class definition is used. Otherwise one should # specify the list of include paths that are normally passed to the compiler # using the -I flag. STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful if your file system doesn't # support long names like on DOS, Mac, or CD-ROM. # The default value is: NO. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the # first line (until the first dot) of a Javadoc-style comment as the brief # description. If set to NO, the Javadoc-style will behave just like regular Qt- # style comments (thus requiring an explicit @brief command for a brief # description.) # The default value is: NO. JAVADOC_AUTOBRIEF = NO # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. If # set to NO, the Qt-style will behave just like regular Qt-style comments (thus # requiring an explicit \brief command for a brief description.) # The default value is: NO. QT_AUTOBRIEF = NO # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a # multi-line C++ special comment block (i.e. a block of //! or /// comments) as # a brief description. This used to be the default behavior. The new default is # to treat a multi-line C++ comment block as a detailed description. Set this # tag to YES if you prefer the old behavior instead. # # Note that setting this tag to YES also means that rational rose comments are # not recognized any more. # The default value is: NO.
MULTILINE_CPP_IS_BRIEF = NO # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the # documentation from any documented member that it re-implements. # The default value is: YES. INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new # page for each member. If set to NO, the documentation of a member will be part # of the file/class/namespace that contains it. # The default value is: NO. SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen # uses this value to replace tabs by spaces in code fragments. # Minimum value: 1, maximum value: 16, default value: 4. TAB_SIZE = 4 # This tag can be used to specify a number of aliases that act as commands in # the documentation. An alias has the form: # name=value # For example adding # "sideeffect=@par Side Effects:\n" # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines. ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For # instance, some of the names that are used will be different. The list of all # members will be omitted, etc. # The default value is: NO. OPTIMIZE_OUTPUT_FOR_C = NO # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or # Python sources only. Doxygen will then generate output that is more tailored # for that language. For instance, namespaces will be presented as packages, # qualified scopes will look different, etc. 
# The default value is: NO. OPTIMIZE_OUTPUT_JAVA = NO # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran # sources. Doxygen will then generate output that is tailored for Fortran. # The default value is: NO. OPTIMIZE_FOR_FORTRAN = NO # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for VHDL. # The default value is: NO. OPTIMIZE_OUTPUT_VHDL = NO # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and # language is one of the parsers supported by doxygen: IDL, Java, Javascript, # C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: # FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: # Fortran. In the latter case the parser tries to guess whether the code is fixed # or free formatted code, this is the default for Fortran type files), VHDL. For # instance to make doxygen treat .inc files as Fortran files (default is PHP), # and .f files as C (default is Fortran), use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable # documentation. See http://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in # case of backward compatibilities issues. # The default value is: YES.
MARKDOWN_SUPPORT = YES # When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up # to that level are automatically included in the table of contents, even if # they do not have an id attribute. # Note: This feature currently applies only to Markdown headings. # Minimum value: 0, maximum value: 99, default value: 0. # This tag requires that the tag MARKDOWN_SUPPORT is set to YES. TOC_INCLUDE_HEADINGS = 0 # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or # globally by setting AUTOLINK_SUPPORT to NO. # The default value is: YES. AUTOLINK_SUPPORT = YES # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should set this # tag to YES in order to let doxygen match functions declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); # versus func(std::string) {}). This also makes the inheritance and collaboration # diagrams that involve STL classes more complete and accurate. # The default value is: NO. BUILTIN_STL_SUPPORT = NO # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. # The default value is: NO. CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: # http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen # will parse them like normal C++ but will assume all classes use public instead # of private inheritance when no explicit protection keyword is present. # The default value is: NO. SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate # getter and setter methods for a property.
Setting this option to YES will make # doxygen replace the get and set methods by a property in the documentation. # This will only work if the methods are indeed getting or setting a simple # type. If this is not the case, or you want to show the methods anyway, you # should set this option to NO. # The default value is: YES. IDL_PROPERTY_SUPPORT = YES # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. # The default value is: NO. DISTRIBUTE_GROUP_DOC = NO # If one adds a struct or class to a group and this option is enabled, then also # any nested class or struct is added to the same group. By default this option # is disabled and one has to add nested compounds explicitly via \ingroup. # The default value is: NO. GROUP_NESTED_COMPOUNDS = NO # Set the SUBGROUPING tag to YES to allow class member groups of the same type # (for instance a group of public functions) to be put as a subgroup of that # type (e.g. under the Public Functions section). Set it to NO to prevent # subgrouping. Alternatively, this can be done per class using the # \nosubgrouping command. # The default value is: YES. SUBGROUPING = YES # When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions # are shown inside the group in which they are included (e.g. using \ingroup) # instead of on a separate page (for HTML and Man pages) or section (for LaTeX # and RTF). # # Note that this feature does not work in combination with # SEPARATE_MEMBER_PAGES. # The default value is: NO. INLINE_GROUPED_CLASSES = NO # When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions # with only public data fields or simple typedef fields will be shown inline in # the documentation of the scope in which they are defined (i.e.
file, # namespace, or group documentation), provided this scope is documented. If set # to NO, structs, classes, and unions are shown on a separate page (for HTML and # Man pages) or section (for LaTeX and RTF). # The default value is: NO. INLINE_SIMPLE_STRUCTS = NO # When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or # enum is documented as struct, union, or enum with the name of the typedef. So # typedef struct TypeS {} TypeT, will appear in the documentation as a struct # with name TypeT. When disabled the typedef will appear as a member of a file, # namespace, or class. And the struct will be named TypeS. This can typically be # useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. # The default value is: NO. TYPEDEF_HIDES_STRUCT = NO # The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This # cache is used to resolve symbols given their name and scope. Since this can be # an expensive process and often the same symbol appears multiple times in the # code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small # doxygen will become slower. If the cache is too large, memory is wasted. The # cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range # is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 # symbols. At the end of a run doxygen will report the cache usage and suggest # the optimal cache size from a speed point of view. # Minimum value: 0, maximum value: 9, default value: 0. LOOKUP_CACHE_SIZE = 0 #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- # If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in # documentation are documented, even if no documentation was available. 
Private # class members and static file members will be hidden unless the # EXTRACT_PRIVATE and EXTRACT_STATIC tags, respectively, are set to YES. # Note: This will also disable the warnings about undocumented members that are # normally produced when WARNINGS is set to YES. # The default value is: NO. EXTRACT_ALL = NO # If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will # be included in the documentation. # The default value is: NO. EXTRACT_PRIVATE = NO # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO. EXTRACT_PACKAGE = NO # If the EXTRACT_STATIC tag is set to YES, all static members of a file will be # included in the documentation. # The default value is: NO. EXTRACT_STATIC = NO # If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined # locally in source files will be included in the documentation. If set to NO, # only classes defined in header files are included. Does not have any effect # for Java sources. # The default value is: YES. EXTRACT_LOCAL_CLASSES = YES # This flag is only useful for Objective-C code. If set to YES, local methods, # which are defined in the implementation section but not in the interface are # included in the documentation. If set to NO, only methods in the interface are # included. # The default value is: NO. EXTRACT_LOCAL_METHODS = NO # If this flag is set to YES, the members of anonymous namespaces will be # extracted and appear in the documentation as a namespace called # 'anonymous_namespace{file}', where file will be replaced with the base name of # the file that contains the anonymous namespace. By default anonymous namespaces # are hidden. # The default value is: NO. EXTRACT_ANON_NSPACES = NO # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files.
If set to NO these # members will be included in the various overviews, but no documentation # section is generated. This option has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. If set # to NO, these classes will be included in the various overviews. This option # has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend # (class|struct|union) declarations. If set to NO, these declarations will be # included in the documentation. # The default value is: NO. HIDE_FRIEND_COMPOUNDS = NO # If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any # documentation blocks found inside the body of a function. If set to NO, these # blocks will be appended to the function's detailed documentation block. # The default value is: NO. HIDE_IN_BODY_DOCS = NO # The INTERNAL_DOCS tag determines if documentation that is typed after a # \internal command is included. If the tag is set to NO then the documentation # will be excluded. Set it to YES to include the internal documentation. # The default value is: NO. INTERNAL_DOCS = NO # If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file # names in lower-case letters. If set to YES, upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows # and Mac users are advised to set this option to NO. # The default value is: system dependent. CASE_SENSE_NAMES = YES # If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with # their full class and namespace scopes in the documentation. If set to YES, the # scope will be hidden. # The default value is: NO. 
HIDE_SCOPE_NAMES = NO # If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will # append additional text to a page's title, such as Class Reference. If set to # YES the compound reference will be hidden. # The default value is: NO. HIDE_COMPOUND_REFERENCE= NO # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of # the files that are included by a file in the documentation of that file. # The default value is: YES. SHOW_INCLUDE_FILES = YES # If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each # grouped member an include statement to the documentation, telling the reader # which file to include in order to use the member. # The default value is: NO. SHOW_GROUPED_MEMB_INC = NO # If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include # files with double quotes in the documentation rather than with sharp brackets. # The default value is: NO. FORCE_LOCAL_INCLUDES = NO # If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the # documentation for inline members. # The default value is: YES. INLINE_INFO = YES # If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the # (detailed) documentation of file and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. # The default value is: YES. SORT_MEMBER_DOCS = YES # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief # descriptions of file, namespace and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. Note that # this will also influence the order of the classes in the class list. # The default value is: NO. SORT_BRIEF_DOCS = NO # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the # (brief and detailed) documentation of class members so that constructors and # destructors are listed first. 
If set to NO the constructors will appear in the # respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. # Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief # member documentation. # Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting # detailed member documentation. # The default value is: NO. SORT_MEMBERS_CTORS_1ST = NO # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy # of group names into alphabetical order. If set to NO the group names will # appear in their defined order. # The default value is: NO. SORT_GROUP_NAMES = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by # fully-qualified names, including namespaces. If set to NO, the class list will # be sorted only by class name, not including the namespace part. # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. # Note: This option applies only to the class list, not to the alphabetical # list. # The default value is: NO. SORT_BY_SCOPE_NAME = NO # If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper # type resolution of all parameters of a function it will reject a match between # the prototype and the implementation of a member function even if there is # only one candidate or it is obvious which candidate to choose by doing a # simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still # accept a match between prototype and implementation in such cases. # The default value is: NO. STRICT_PROTO_MATCHING = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo # list. This list is created by putting \todo commands in the documentation. # The default value is: YES. GENERATE_TODOLIST = YES # The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test # list. This list is created by putting \test commands in the documentation. # The default value is: YES. 
GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug # list. This list is created by putting \bug commands in the documentation. # The default value is: YES. GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) # the deprecated list. This list is created by putting \deprecated commands in # the documentation. # The default value is: YES. GENERATE_DEPRECATEDLIST= YES # The ENABLED_SECTIONS tag can be used to enable conditional documentation # sections, marked by \if ... \endif and \cond # ... \endcond blocks. ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the # documentation. If the initializer consists of more lines than specified here # it will be hidden. Use a value of 0 to hide initializers completely. The # appearance of the value of individual variables and macros / defines can be # controlled using \showinitializer or \hideinitializer command in the # documentation regardless of this setting. # Minimum value: 0, maximum value: 10000, default value: 30. MAX_INITIALIZER_LINES = 30 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated at # the bottom of the documentation of classes and structs. If set to YES, the # list will mention the files that were used to generate the documentation. # The default value is: YES. SHOW_USED_FILES = YES # Set the SHOW_FILES tag to NO to disable the generation of the Files page. This # will remove the Files entry from the Quick Index and from the Folder Tree View # (if specified). # The default value is: YES. SHOW_FILES = YES # Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces # page. This will remove the Namespaces entry from the Quick Index and from the # Folder Tree View (if specified). # The default value is: YES. 
SHOW_NAMESPACES = NO # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from # the version control system). Doxygen will invoke the program by executing (via # popen()) the command command input-file, where command is the value of the # FILE_VERSION_FILTER tag, and input-file is the name of an input file provided # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. You can # optionally specify a file name after the option, if omitted DoxygenLayout.xml # will be used as the name of the layout file. # # Note that if you run doxygen from a directory containing a file called # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. LAYOUT_FILE = @CMAKE_CURRENT_SOURCE_DIR@/doc/DoxygenLayout.xml # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib # extension is automatically appended if omitted. This requires the bibtex tool # to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. 
CITE_BIB_FILES = #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated to # standard output by doxygen. If QUIET is set to YES this implies that the # messages are off. # The default value is: NO. QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES # this implies that the warnings are on. # # Tip: Turn warnings on while writing the documentation. # The default value is: YES. WARNINGS = YES # If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate # warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag # will automatically be disabled. # The default value is: YES. WARN_IF_UNDOCUMENTED = YES # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some parameters # in a documented function, or documenting parameters that don't exist or using # markup commands wrongly. # The default value is: YES. WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return # value. If set to NO, doxygen will only warn about wrong or incomplete # parameter documentation, but not about the absence of documentation. # The default value is: NO. WARN_NO_PARAMDOC = NO # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when # a warning is encountered. # The default value is: NO. WARN_AS_ERROR = NO # The WARN_FORMAT tag determines the format of the warning messages that doxygen # can produce. 
The string should contain the $file, $line, and $text tags, which # will be replaced by the file and line number from which the warning originated # and the warning text. Optionally the format may contain $version, which will # be replaced by the version of the file (if it could be obtained via # FILE_VERSION_FILTER) # The default value is: $file:$line: $text. WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning and error # messages should be written. If left blank the output is written to standard # error (stderr). WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag is used to specify the files and/or directories that contain # documented source files. You may enter file names like myfile.cpp or # directories like /usr/src/myproject. Separate the files or directories with # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. INPUT = @CMAKE_CURRENT_SOURCE_DIR@/test \ @CMAKE_CURRENT_SOURCE_DIR@/doc # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv # documentation (see: http://www.gnu.org/software/libiconv) for the list of # possible encodings. # The default value is: UTF-8. INPUT_ENCODING = UTF-8 # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen. 
# # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, # *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, # *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf. FILE_PATTERNS = *.c \ *.cc \ *.cxx \ *.cpp \ *.c++ \ *.java \ *.ii \ *.ixx \ *.ipp \ *.i++ \ *.inl \ *.idl \ *.ddl \ *.odl \ *.h \ *.hh \ *.hxx \ *.hpp \ *.h++ \ *.cs \ *.d \ *.php \ *.php4 \ *.php5 \ *.phtml \ *.inc \ *.m \ *.markdown \ *.md \ *.mm \ *.dox \ *.py \ *.pyw \ *.f90 \ *.f95 \ *.f03 \ *.f08 \ *.f \ *.for \ *.tcl \ *.vhd \ *.vhdl \ *.ucf \ *.qsf # The RECURSIVE tag can be used to specify whether or not subdirectories should # be searched for input files as well. # The default value is: NO. RECURSIVE = YES # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. # # Note that relative paths are relative to the directory from which doxygen is # run. EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded # from the input. # The default value is: NO. EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories. # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* EXCLUDE_PATTERNS = */common/* # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the # output. 
The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, # AClass::ANamespace, ANamespace::*Test # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and # *.h) to filter out the source-files in the directories. If left blank all # files are included. EXAMPLE_PATTERNS = * # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude commands # irrespective of the value of the RECURSIVE tag. # The default value is: NO. EXAMPLE_RECURSIVE = NO # The IMAGE_PATH tag can be used to specify one or more files or directories # that contain images that are to be included in the documentation (see the # \image command). IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program # by executing (via popen()) the command: # # <filter> <input-file> # # where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the # name of an input file. Doxygen will then use the output that the filter # program writes to standard output. If FILTER_PATTERNS is specified, this tag # will be ignored. # # Note that the filter must not add or remove lines; it is applied before the # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly.
# # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the # filter if there is a match. The filters are a list of the form: pattern=filter # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen. FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for # producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). # The default value is: NO. FILTER_SOURCE_FILES = NO # The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file # pattern. A pattern will override the setting for FILTER_PATTERNS (if any) and # it is also possible to disable source filtering for a specific pattern using # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output.
USE_MDFILE_AS_MAINPAGE = #--------------------------------------------------------------------------- # Configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will be # generated. Documented entities will be cross-referenced with these sources. # # Note: To get rid of all source code in the generated output, make sure that # also VERBATIM_HEADERS is set to NO. # The default value is: NO. SOURCE_BROWSER = YES # Setting the INLINE_SOURCES tag to YES will include the body of functions, # classes and enums directly into the documentation. # The default value is: NO. INLINE_SOURCES = NO # Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any # special comment blocks from generated source code fragments. Normal C, C++ and # Fortran comments will always remain visible. # The default value is: YES. STRIP_CODE_COMMENTS = NO # If the REFERENCED_BY_RELATION tag is set to YES then for each documented # function all documented functions referencing it will be listed. # The default value is: NO. REFERENCED_BY_RELATION = NO # If the REFERENCES_RELATION tag is set to YES then for each documented function # all documented entities called/used by that function will be listed. # The default value is: NO. REFERENCES_RELATION = NO # If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set # to YES then the hyperlinks from functions in REFERENCES_RELATION and # REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will # link to the documentation. # The default value is: YES. REFERENCES_LINK_SOURCE = YES # If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the # source code will show a tooltip with additional information such as prototype, # brief description and links to the definition and documentation. 
Since this # will make the HTML file larger and loading of large files a bit slower, you # can opt to disable this feature. # The default value is: YES. # This tag requires that the tag SOURCE_BROWSER is set to YES. SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen's built-in # source browser. The htags tool is part of GNU's global source tagging system # (see http://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. # # To use it do the following: # - Install the latest version of global # - Enable SOURCE_BROWSER and USE_HTAGS in the config file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # # Doxygen will invoke htags (and that will in turn invoke gtags), so these # tools must be available from the command line (i.e. in the search path). # # The result: instead of the source browser generated by doxygen, the links to # source code will now point to the output of htags. # The default value is: NO. # This tag requires that the tag SOURCE_BROWSER is set to YES. USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a # verbatim copy of the header file for each class for which an include is # specified. Set to NO to disable this. # See also: Section \class. # The default value is: YES. VERBATIM_HEADERS = YES # If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the # clang parser (see: http://clang.llvm.org/) for more accurate parsing at the # cost of reduced performance. This can be particularly helpful with template # rich C++ code for which doxygen's built-in parser lacks the necessary type # information. # Note: The availability of this option depends on whether or not doxygen was # generated with the -Duse-libclang=ON option for CMake. # The default value is: NO.
CLANG_ASSISTED_PARSING = NO # If clang assisted parsing is enabled you can provide the compiler with command # line options that you would normally use when invoking the compiler. Note that # the include paths will already be set by doxygen for the files and directories # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all # compounds will be generated. Enable this if the project contains a lot of # classes, structs, unions or interfaces. # The default value is: YES. ALPHABETICAL_INDEX = YES # The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in # which the alphabetical index list will be split. # Minimum value: 1, maximum value: 20, default value: 5. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. COLS_IN_ALPHA_INDEX = 5 # In case all classes in a project start with a common prefix, all classes will # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag # can be used to specify a prefix (or a list of prefixes) that should be ignored # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output # The default value is: YES. GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of # it. 
# The default directory is: html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_OUTPUT = test_spec # The HTML_FILE_EXTENSION tag can be used to specify the file extension for each # generated HTML page (for example: .htm, .php, .asp). # The default value is: .html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a user-defined HTML header file for # each generated HTML page. If the tag is left blank doxygen will generate a # standard header. # # To get valid HTML, the header file must include any scripts and style sheets # that doxygen needs, which depend on the configuration options used (e.g. # the setting GENERATE_TREEVIEW). It is highly recommended to start with a # default header using # doxygen -w html new_header.html new_footer.html new_stylesheet.css # YourConfigFile # and then modify the file new_header.html. See also section "Doxygen usage" # for information on how to generate the default header that doxygen normally # uses. # Note: The header is subject to change so you typically have to regenerate the # default header when upgrading to a newer version of doxygen. For a description # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard # footer. See HTML_HEADER for more information on how to generate a default # footer and what special commands can be used inside the footer. See also # section "Doxygen usage" for information on how to generate the default footer # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page.
It can be used to fine-tune the look of # the HTML output. If left blank doxygen will generate a default style sheet. # See also section "Doxygen usage" for information on how to generate the style # sheet that doxygen normally uses. # Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as # it is more robust and this tag (HTML_STYLESHEET) will in the future become # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets # created by doxygen. Using this option one can overrule certain style aspects. # This is preferred over using HTML_STYLESHEET since it does not replace the # standard style sheet and is therefore more robust against future updates. # Doxygen will copy the style sheet files to the output directory. # Note: The order of the extra style sheet files is of importance (e.g. the last # style sheet in the list overrules the setting of the previous ones in the # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note # that these files will be copied to the base HTML output directory. Use the # $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these # files. In the HTML_STYLESHEET file, use the file name only. Also note that the # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to # this color. 
Hue is specified as an angle on a colorwheel, see # http://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. # Minimum value: 0, maximum value: 359, default value: 220. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_HUE = 220 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors # in the HTML output. For a value of 0 the output will use grayscales only. A # value of 255 will produce the most vivid colors. # Minimum value: 0, maximum value: 255, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_SAT = 100 # The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the # luminance component of the colors in the HTML output. Values below 100 # gradually make the output lighter, whereas values above 100 make the output # darker. The value divided by 100 is the actual gamma applied, so 80 represents # a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not # change the gamma. # Minimum value: 40, maximum value: 240, default value: 80. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_GAMMA = 80 # If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML # page will contain the date and time when the page was generated. Setting this # to YES can help to show when doxygen was last run and thus if the # documentation is up to date. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_TIMESTAMP = NO # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. 
HTML_DYNAMIC_SECTIONS = NO # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries # shown in the various tree structured indices initially; the user can expand # and collapse entries dynamically later on. Doxygen will expand the tree to # such a level that at most the specified number of entries are visible (unless # a fully collapsed tree already exceeds this amount). So setting the number of # entries 1 will produce a full collapsed tree by default. 0 is a special value # representing an infinite number of entries and will result in a full expanded # tree by default. # Minimum value: 0, maximum value: 9999, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development # environment (see: http://developer.apple.com/tools/xcode/), introduced with # OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a # Makefile in the HTML output directory. Running make will produce the docset in # that directory and running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at # startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html # for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_DOCSET = NO # This tag determines the name of the docset feed. A documentation feed provides # an umbrella under which multiple documentation sets from a single provider # (such as a company or product suite) can be grouped. # The default value is: Doxygen generated docs. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_FEEDNAME = "Doxygen generated docs" # This tag specifies a string that should uniquely identify the documentation # set bundle. 
This should be a reverse domain-name style string, e.g. # com.mycompany.MyDocSet. Doxygen will append .docset to the name. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_BUNDLE_ID = org.doxygen.Project # The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify # the documentation publisher. This should be a reverse domain-name style # string, e.g. com.mycompany.MyDocSet.documentation. # The default value is: org.doxygen.Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_ID = org.doxygen.Publisher # The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. # The default value is: Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop # (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on # Windows. # # The HTML Help Workshop contains a compiler that can convert all HTML output # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML # files are now used as the Windows 98 help format, and will replace the old # Windows help format (.hlp) on all Windows platforms in the future. Compressed # HTML files also contain an index, a table of contents, and you can search for # words in the documentation. The HTML workshop also contains a viewer for # compressed HTML files. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_HTMLHELP = NO # The CHM_FILE tag can be used to specify the file name of the resulting .chm # file. You can add a path in front of the file if the result should not be # written to the html output directory. 
# This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, # doxygen will try to run the HTML help compiler on the generated index.hhp. # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. GENERATE_CHI = NO # The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it # enables the Previous and Next buttons. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. BINARY_TOC = NO # The TOC_EXPAND flag can be set to YES to add extra items for group members to # the table of contents of the HTML help documentation and to the tree view. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. TOC_EXPAND = NO # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that # can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help # (.qch) of the generated HTML documentation. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_QHP = NO # If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify # the file name of the resulting .qch file. 
The path specified is relative to # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace # (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. For more information please see Qt Help Project / Virtual # Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- # folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom # Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom # Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. 
# This tag requires that the tag GENERATE_QHP is set to YES. QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. To # install this plugin and make it available under the help contents menu in # Eclipse, the contents of the directory containing the HTML and XML files needs # to be copied into the plugins directory of eclipse. The name of the directory # within the plugins directory should be the same as the ECLIPSE_DOC_ID value. # After copying Eclipse needs to be restarted before the help appears. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_ECLIPSEHELP = NO # A unique identifier for the Eclipse help plugin. When installing the plugin # the directory name containing the HTML and XML files should also have this # name. Each documentation set should have its own identifier. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. ECLIPSE_DOC_ID = org.doxygen.Project # If you want full control over the layout of the generated HTML pages it might # be necessary to disable the index and replace it with your own. The # DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top # of each HTML page. A value of NO enables the index and the value YES disables # it. Since the tabs in the index contain the same information as the navigation # tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. DISABLE_INDEX = NO # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. If the tag # value is set to YES, a side panel will be generated containing a tree-like # index structure (just like the one that is generated for HTML Help). 
For this # to work a browser that supports JavaScript, DHTML, CSS and frames is required # (i.e. any modern browser). Windows users are probably better off using the # HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can # further fine-tune the look of the index. As an example, the default style # sheet generated by doxygen has an example that shows how to put an image at # the root of the tree instead of the PROJECT_NAME. Since the tree basically has # the same information as the tab index, you could consider setting # DISABLE_INDEX to YES when enabling this option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_TREEVIEW = NO # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. # # Note that a value of 0 will completely suppress the enum values from appearing # in the overview section. # Minimum value: 0, maximum value: 20, default value: 4. # This tag requires that the tag GENERATE_HTML is set to YES. ENUM_VALUES_PER_LINE = 4 # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used # to set the initial width (in pixels) of the frame in which the tree is shown. # Minimum value: 0, maximum value: 1500, default value: 250. # This tag requires that the tag GENERATE_HTML is set to YES. TREEVIEW_WIDTH = 250 # If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to # external symbols imported via tag files in a separate window. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. EXT_LINKS_IN_WINDOW = NO # Use this tag to change the font size of LaTeX formulas included as images in # the HTML documentation. When you change the font size after a successful # doxygen run you need to manually remove any form_*.png images from the HTML # output directory to force them to be regenerated. 
# Minimum value: 8, maximum value: 50, default value: 10. # This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_FONTSIZE = 10 # Use the FORMULA_TRANSPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are not # supported properly for IE 6.0, but are supported on all modern browsers. # # Note that when changing this option you need to delete any form_*.png files in # the HTML output directory before the changes have effect. # The default value is: YES. # This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_TRANSPARENT = YES # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see # http://www.mathjax.org) which uses client side Javascript for the rendering # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX # installed or if you want formulas to look prettier in the HTML output. When # enabled you may also need to install MathJax separately and configure the path # to it using the MATHJAX_RELPATH option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. USE_MATHJAX = NO # When MathJax is enabled you can set the default output format to be used for # the MathJax output. See the MathJax site (see: # http://docs.mathjax.org/en/latest/output.html) for more details. # Possible values are: HTML-CSS (which is slower, but has the best # compatibility), NativeMML (i.e. MathML) and SVG. # The default value is: HTML-CSS. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_FORMAT = HTML-CSS # When MathJax is enabled you need to specify the location relative to the HTML # output directory using the MATHJAX_RELPATH option. The destination directory # should contain the MathJax.js script. For instance, if the mathjax directory # is located at the same level as the HTML output directory, then # MATHJAX_RELPATH should be ../mathjax.
The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of # MathJax from http://www.mathjax.org before deployment. # The default value is: http://cdn.mathjax.org/mathjax/latest. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax # extension names that should be enabled during MathJax rendering. For example # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site # (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and # should work on any modern browser. Note that when using HTML help # (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) # there is already a search function so this one should typically be disabled. # For large projects the javascript based search engine can be slow, then # enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to # search using the keyboard; to jump to the search box use <access key> + S # (what the <access key> is depends on the OS and browser, but it is typically # <CTRL>, <ALT>/<option>, or both).
© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.

oneTBB is licensed under Apache License Version 2.0. Refer to the LICENSE file for the full license text and copyright notice.
" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] if BUILD_TYPE == 'oneapi' or BUILD_TYPE == 'dita': html_context = { 'css_files': [ '_static/theme_overrides.css', # override wide tables in RTD theme ], } else: html_js_files = ['custom.js'] html_theme_options["logo"] = {"text": "oneTBB Documentation"} html_logo = '_static/oneAPI-rgb-rev-100.png' html_favicon = '_static/favicons.png' # Custom sidebar templates, must be a dictionary that maps document names # to template names. # # The default sidebars (for documents that don't match any pattern) are # defined by theme itself. Builtin themes are using these templates by # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', # 'searchbox.html']``. # # html_sidebars = {} # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. htmlhelp_basename = 'sphinx-infodevdoc' # -- Options for LaTeX output ------------------------------------------------ #latex_engine = 'xelatex' #PDF_TITLE = 'Information Development Template' # #with open(PREAMBLE_FILE, 'r', encoding='utf-8') as f: # PREAMBLE = f.read() # #with open(TITLE_PAGE_FILE, 'r', encoding='utf-8') as f: # TITLE_PAGE = f.read().replace('', PDF_TITLE) # # #latex_elements = { # # The paper size ('letterpaper' or 'a4paper'). # # # 'extraclassoptions': 'openany,oneside', # 'babel' : '\\usepackage[english]{babel}', # 'papersize': 'a4paper', # 'releasename':" ", # # Sonny, Lenny, Glenn, Conny, Rejne, Bjarne and Bjornstrup # # 'fncychap': '\\usepackage[Lenny]{fncychap}', # 'fncychap': '', # #'fontpkg': '\\usepackage{amsmath,amsfonts,amssymb,amsthm}', # # 'figure_align':'htbp', # # The font size ('10pt', '11pt' or '12pt'). 
# # # 'pointsize': '12pt', # # # Additional stuff for the LaTeX preamble. # # # 'preamble': PREAMBLE, # # 'maketitle': TITLE_PAGE, # # Latex figure (float) alignment # # # # 'figure_align': 'htbp', # 'sphinxsetup': \ # 'hmargin={0.7in,0.7in}, vmargin={1in,1in}, \ # verbatimwithframe=true, \ # TitleColor={rgb}{0,0.686,0.941}, \ # HeaderFamily=\\rmfamily\\bfseries, \ # InnerLinkColor={rgb}{0,0.686,0.941}, \ # OuterLinkColor={rgb}{0,0.686,0.941}', # # 'tableofcontents':' ' #} # #latex_logo = '_latex/intel_logo.png' ## Grouping the document tree into LaTeX files. List of tuples ## (source start file, target name, title, ## author, documentclass [howto, manual, or own class]). #latex_documents = [ # (master_doc, 'sphinx-infodev.tex', u'sphinx-infodev Documentation', # u'Intel', 'manual'), #] #breathe_projects = { #todd-mod # project: "../doxygen/xml" #} #breathe_default_project = project # Setup the exhale extension #exhale_args = { #todd-mod # # These arguments are required # "containmentFolder": "./api", # "rootFileName": "library_root.rst", # "rootFileTitle": "Library API", # "doxygenStripFromPath": "..", # "fullApiSubSectionTitle": 'Full API' #} # -- Options for manual page output ------------------------------------------ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ (master_doc, 'sphinx-infodev', u'sphinx-infodev Documentation', [author], 1) ] # -- Options for Texinfo output ---------------------------------------------- # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ (master_doc, 'sphinx-infodev', u'sphinx-infodev Documentation', author, 'sphinx-infodev', 'One line description of project.', 'Miscellaneous'), ] # -- Options for Epub output ------------------------------------------------- # Bibliographic Dublin Core info. 
epub_title = project # The unique identifier of the text. This can be a ISBN number # or the project homepage. # # epub_identifier = '' # A unique identification for the text. # # epub_uid = '' # A list of files that should not be packed into the epub file. epub_exclude_files = ['search.html'] # -- Extension configuration ------------------------------------------------- # -- Options for intersphinx extension --------------------------------------- # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = {'python': ('https://docs.python.org/3', None)} # -- Options for todo extension ---------------------------------------------- # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = True ================================================ FILE: third-party/tbb/doc/index/index_intro.rst ================================================ .. _index_intro: This document contains information about |short_name|. It is a flexible performance library that lets you break computation into parallel running tasks. ================================================ FILE: third-party/tbb/doc/index/toctree.rst ================================================ .. _toctree: .. toctree:: :caption: About :hidden: :maxdepth: 1 /main/intro/help_support /main/intro/notation /main/intro/intro_os /main/intro/Benefits /main/intro/testing_approach /main/intro/limitations.rst .. toctree:: :caption: Get Started :hidden: :maxdepth: 1 /GSG/get_started /GSG/system_requirements /GSG/installation /GSG/next_steps /GSG/integrate /GSG/samples .. toctree:: :maxdepth: 3 :hidden: :caption: Developer Guide /main/tbb_userguide/title .. toctree:: :maxdepth: 3 :hidden: :caption: Developer Reference /main/reference/reference ================================================ FILE: third-party/tbb/doc/index/useful_topics.rst ================================================ ..
_Usefull_Topics: The following is an important topic for the ``experienced user``: :ref:`Migration_Guide` describes how to migrate from TBB to oneTBB. ================================================ FILE: third-party/tbb/doc/index.rst ================================================ |full_name| =========== .. include:: index/index_intro.rst The following are some important topics for the ``novice user``: * :ref:`Get_Started_Guide` gives you a brief explanation of what oneTBB is. * :ref:`Benefits` describes how |short_name| differs from typical threading packages. * :ref:`Package_Contents` describes dynamic library files and header files for Windows*, Linux*, and macOS* operating systems used in |short_name|. .. include:: index/useful_topics.rst .. include:: index/toctree.rst ================================================ FILE: third-party/tbb/doc/main/_templates/layout.html ================================================ {% extends "!layout.html" %} {% block extrahead %} {% endblock %} ================================================ FILE: third-party/tbb/doc/main/examples_testing/CMakeLists.txt ================================================ # Copyright (c) 2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
cmake_minimum_required(VERSION 3.11) project(doc_examples_testing LANGUAGES CXX) set(_reference_examples_path "${CMAKE_CURRENT_SOURCE_DIR}/../reference/examples") set(_userguide_examples_path "${CMAKE_CURRENT_SOURCE_DIR}/../tbb_userguide/examples") add_custom_target(build-doc-examples COMMENT "Build oneTBB documentation samples") set(doc_examples_test_label "doc-examples") macro(add_doc_example _doc_example_path) get_filename_component(_doc_example_name "${_doc_example_path}" NAME_WE) add_executable(${_doc_example_name} EXCLUDE_FROM_ALL "${_doc_example_path}") add_dependencies(${_doc_example_name} TBB::tbb TBB::tbbmalloc TBB::tbbmalloc_proxy) target_link_libraries(${_doc_example_name} TBB::tbb TBB::tbbmalloc TBB::tbbmalloc_proxy) add_dependencies(build-doc-examples ${_doc_example_name}) add_test(NAME ${_doc_example_name} COMMAND ${_doc_example_name}) set_tests_properties(${_doc_example_name} PROPERTIES LABELS "${doc_examples_test_label}") endmacro() file(GLOB_RECURSE DOC_EXAMPLES_LIST "${_reference_examples_path}/*.cpp" "${_userguide_examples_path}/*.cpp") foreach(_doc_example_path IN LISTS DOC_EXAMPLES_LIST) add_doc_example(${_doc_example_path}) endforeach() ================================================ FILE: third-party/tbb/doc/main/intro/Benefits.rst ================================================ .. _Benefits: |short_name| Benefits ===================== |full_name| is a library that helps you leverage multi-core performance without having to be a threading expert. Typically you can improve performance for multi-core processors by implementing the key points explained in the early sections of the Developer Guide. As your expertise grows, you may want to dive into more complex subjects that are covered in advanced sections. There are a variety of approaches to parallel programming, ranging from using platform-dependent threading primitives to exotic new languages. 
The advantage of oneTBB is that it works at a higher level than raw threads, yet does not require exotic languages or compilers. You can use it with any compiler supporting ISO C++. The library differs from typical threading packages in the following ways: - **oneTBB enables you to specify logical parallelism instead of threads**. Most threading packages require you to specify threads. Programming directly in terms of threads can be tedious and lead to inefficient programs, because threads are low-level, heavy constructs that are close to the hardware. Direct programming with threads forces you to efficiently map logical tasks onto threads. In contrast, the oneTBB run-time library automatically maps logical parallelism onto threads in a way that makes efficient use of processor resources. - **oneTBB targets threading for performance**. Most general-purpose threading packages support many different kinds of threading, such as threading for asynchronous events in graphical user interfaces. As a result, general-purpose packages tend to be low-level tools that provide a foundation, not a solution. Instead, oneTBB focuses on the particular goal of parallelizing computationally intensive work, delivering higher-level, simpler solutions. - **oneTBB is compatible with other threading packages.** Because the library is not designed to address all threading problems, it can coexist seamlessly with other threading packages. - **oneTBB emphasizes scalable, data parallel programming**. Breaking a program up into separate functional blocks, and assigning a separate thread to each block is a solution that typically does not scale well since typically the number of functional blocks is fixed. In contrast, oneTBB emphasizes *data-parallel* programming, enabling multiple threads to work on different parts of a collection. Data-parallel programming scales well to larger numbers of processors by dividing the collection into smaller pieces. 
With data-parallel programming, program performance increases as you add processors. - **oneTBB relies on generic programming**. Traditional libraries specify interfaces in terms of specific types or base classes. Instead, oneAPI Threading Building Blocks uses generic programming. The essence of generic programming is writing the best possible algorithms with the fewest constraints. The C++ Standard Template Library (STL) is a good example of generic programming in which the interfaces are specified by *requirements* on types. For example, C++ STL has a template function ``sort`` that sorts a sequence abstractly defined in terms of iterators on the sequence. The requirements on the iterators are: - Provide random access - The expression ``*i<*j`` is true if the item pointed to by iterator ``i`` should precede the item pointed to by iterator ``j``, and false otherwise. - The expression ``swap(*i,*j)`` swaps two elements. Specification in terms of requirements on types enables the template to sort many different representations of sequences, such as vectors and deques. Similarly, the oneTBB templates specify requirements on types, not particular types, and thus adapt to different data representations. Generic programming enables oneTBB to deliver high performance algorithms with broad applicability. ================================================ FILE: third-party/tbb/doc/main/intro/help_support.rst ================================================ .. _help_support: Getting Help and Support ======================== .. container:: section .. rubric:: Getting Technical Support :class: sectiontitle For general information about oneTBB technical support, product updates, user forums, FAQs, tips and tricks and other support questions, go to `GitHub issues <https://github.com/uxlfoundation/oneTBB/issues>`_. ================================================ FILE: third-party/tbb/doc/main/intro/intro_os.rst ================================================ ..
_intro: Introduction ============ |full_name| is a library that supports scalable parallel programming using standard ISO C++ code. It does not require special languages or compilers. It is designed to promote scalable data parallel programming. Additionally, it fully supports nested parallelism, so you can build larger parallel components from smaller parallel components. To use the library, you specify tasks, not threads, and let the library map tasks onto threads in an efficient manner. Many of the library interfaces employ generic programming, in which interfaces are defined by requirements on types and not specific types. The C++ Standard Template Library (STL) is an example of generic programming. Generic programming enables oneTBB to be flexible yet efficient. The generic interfaces enable you to customize components to your specific needs. .. note:: |full_name| requires C++11 standard compiler support. The net result is that oneTBB enables you to specify parallelism far more conveniently than using raw threads, and at the same time can improve performance. ================================================ FILE: third-party/tbb/doc/main/intro/limitations.rst ================================================ .. _limitations: Known Limitations ***************** This page outlines the known limitations of oneTBB to help you better understand its capabilities. Debug TBB In The SYCL Program ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ **Limitation:** The application may crash when using the Debug version of oneTBB in a SYCL program compiled with Intel(R) oneAPI DPC++/C++ Compiler. This happens because both ``tbb`` (Release version) and ``tbb_debug`` (Debug version) libraries load simultaneously, causing conflicts. **Solution:** Do one of the following: * Link the application with the Release version ``tbb`` instead of ``tbb_debug``. * Use the ``qtbb`` flag provided by the Intel(R) oneAPI DPC++/C++ Compiler. 
Freestanding Compilation Mode ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ **Limitation:** oneTBB does not support the freestanding compilation mode. **Risk:** Compiling an application that utilizes oneTBB headers using the Intel(R) oneAPI DPC++/C++ Compiler may result in failure on Windows* OS if the ``/Qfreestanding`` compiler option is employed. Static Assert ^^^^^^^^^^^^^ **Limitation:** A static assert causes the compilation failures in oneTBB headers if the following conditions are satisfied: * Compilation is done with Clang 12.0.0 or a more recent version. * The LLVM standard library is employed, coupled with the use of the ``-ffreestanding`` flag and C++11/14 compiler options. **Risk:** The compilation failures. Interface Incompatibilities: TBB vs oneTBB ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ **Limitation:** An application using Parallel STL algorithms in the ``libstdc++`` versions 9 and 10 may fail to compile due to incompatible interface changes between earlier versions of Threading Building Blocks (TBB) and oneAPI Threading Building Blocks (oneTBB). **Solution:** Disable support for Parallel STL algorithms by defining ``PSTL_USE_PARALLEL_POLICIES`` (in libstdc++ 9) or ``_GLIBCXX_USE_TBB_PAR_BACKEND`` (in libstdc++ 10) macro to zero before inclusion of the first standard header file in each translation unit. Incorrect Installation Location ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ **Limitation:** On Linux* OS, if oneAPI Threading Building Blocks (oneTBB) or Threading Building Blocks (TBB) are installed in a system folder, such as ``/usr/lib64``, the application may fail to link due to the order in which the linker searches for libraries. **Risk:** The issue does not affect the program execution. **Solution:** Use the ``-L`` linker option to specify the correct location of oneTBB library. ``fork()`` Support ^^^^^^^^^^^^^^^^^^^ **Limitation:** oneTBB does not support ``fork()``. 
**Solution:** To work around the issue, consider using ``task_scheduler_handle`` to join oneTBB worker threads before using ``fork()``.
* - ,... - The ellipsis preceded by a comma indicates that the previous item can be repeated several times, | separated by commas. - \ ``word`` ,... Indicates that one or more words can be specified. If more than one word is specified, the words are comma-separated. .. container:: section Class members are summarized by informal class declarations that describe the class as it seems to clients, not how it is actually implemented. For example, here is an informal declaration of class ``Foo``: :: class Foo { public: int x(); int y; ~Foo(); }; The actual implementation might look like: :: namespace internal { class FooBase { protected: int x(); }; class Foo_v3: protected FooBase { private: int internal_stuff; public: using FooBase::x; int y; }; } typedef internal::Foo_v3 Foo; The example shows two cases where the actual implementation departs from the informal declaration: - ``Foo`` is actually a typedef to ``Foo_v3``. - Method ``x()`` is inherited from a protected base class. - The destructor is an implicit method generated by the compiler. The informal declarations are intended to show you what you need to know to use the class without the distraction of irrelevant clutter particular to the implementation. ================================================ FILE: third-party/tbb/doc/main/intro/testing_approach.rst ================================================ .. _testing_approach: Testing Approach ================ There are four main types of errors/hazards you can encounter in the development of libraries for parallelism: * Interface correspondence to specification * Memory errors * Data race * Race conditions and deadlocks |short_name| testing approach is designed to provide high coverage of these error types. All types of errors are covered with unit testing and review. Code coverage metrics are tracked to ensure high code coverage with tests. Uncovered branches are analyzed manually. 
Memory errors and data races are additionally covered by special tools that include thread and memory sanitizers. Race conditions and deadlocks are the most complicated errors. They are covered by: * **Unit tests** that, however, have limited capability to catch such errors * **Integration tests**. Multiple different functionalities are heavily combined to emulate user use cases that may trigger such errors based on prior knowledge and expertise. * **Stress testing with different possible combinations**. It ensures that even rarely triggered error conditions are caught by testing. .. note:: Every fix is required to be covered by a test to guarantee the detection of such issues in the future. Continuous Integration triggers all the tests on each commit. This ensures that: * Issues are detected, starting from the early development phase and up to the moment of integration of changes into the library. * The highest quality of the library is maintained even in such error-prone domains as parallelism. ================================================ FILE: third-party/tbb/doc/main/reference/blocked_nd_range_ctad.rst ================================================ .. _blocked_nd_range_ctad: Deduction Guides for ``blocked_nd_range`` ========================================= .. note:: To enable this feature, define the ``TBB_PREVIEW_BLOCKED_ND_RANGE_DEDUCTION_GUIDES`` macro to 1. .. contents:: :local: :depth: 1 Description *********** The ``blocked_nd_range`` class represents a recursively divisible N-dimensional half-open interval for the oneTBB parallel algorithms. This feature extends ``blocked_nd_range`` to support Class Template Argument Deduction (starting from C++17). With that, you do not need to specify template arguments explicitly while creating a ``blocked_nd_range`` object if they can be inferred from the constructor arguments: .. 
literalinclude:: ./examples/blocked_nd_range_ctad_example.cpp :language: c++ :start-after: /*begin_blocked_nd_range_ctad_example_1*/ :end-before: /*end_blocked_nd_range_ctad_example_1*/ .. note:: For more detailed description of the implementation of this feature or to leave comments or feedback on the API, please refer to the [corresponding RFC](https://github.com/uxlfoundation/oneTBB/tree/master/rfcs/experimental/blocked_nd_range_ctad). API *** Header ------ .. code:: cpp #include Synopsis -------- .. code:: cpp namespace oneapi { namespace tbb { template class blocked_nd_range { public: // Member types and constructors defined as part of oneTBB specification using value_type = Value; using dim_range_type = blocked_range; using size_type = typename dim_range_type::size_type; blocked_nd_range(const dim_range_type& dim0, /*exactly N arguments of type const dim_range_type&*/); // [1] blocked_nd_range(const value_type (&dim_size)[N], size_type grainsize = 1); // [2] blocked_nd_range(blocked_nd_range& r, split); // [3] blocked_nd_range(blocked_nd_range& r, proportional_split); // [4] }; // class blocked_nd_range // Explicit deduction guides template blocked_nd_range(blocked_range, blocked_range...) -> blocked_nd_range; template blocked_nd_range(const Value (&...)[Ns]) -> blocked_nd_range; template blocked_nd_range(const Value (&)[N], typename blocked_nd_range::size_type = 1) -> blocked_nd_range; template blocked_nd_range(blocked_nd_range, split) -> blocked_nd_range; template blocked_nd_range(blocked_nd_range, proportional_split) -> blocked_nd_range; } // namespace tbb } // namespace oneapi Deduction Guides ---------------- The copy and move constructors of ``blocked_nd_range`` provide implicitly generated deduction guides. In addition, the following explicit deduction guides are provided: .. code:: cpp template blocked_nd_range(blocked_range, blocked_range...) 
-> blocked_nd_range; **Effects**: Enables deduction when a set of ``blocked_range`` objects is passed to the ``blocked_nd_range`` constructor ``[1]``. **Constraints**: Participates in overload resolution only if all of the types in `Values` are same as `Value`. .. code:: cpp template blocked_nd_range(const Value (&...)[Ns]) -> blocked_nd_range; **Effects**: Enables deduction when a set of ``blocked_range`` objects is provided as braced-init-lists to the ``blocked_nd_range`` constructor ``[1]``. **Constraints**: Participates in overload resolution only if ``sizeof...(Ns) >= 2``, and each integer ``Ni`` in ``Ns`` is either ``2`` or ``3``, corresponding to ``blocked_range`` constructors with 2 and 3 arguments, respectively. .. note:: The guide allows a deduction only from braced-init-lists containing objects of the same type. For ranges with non-integral ``value_type``, setting an explicit grainsize argument is not supported by the deduction guides and requires specifying explicit template arguments. .. code:: cpp template blocked_nd_range(const Value (&)[N], typename blocked_nd_range::size_type = 1) -> blocked_nd_range; **Effects**: Allows deduction from a single C array object indicating a set of dimension sizes to constructor ``2`` of ``blocked_nd_range``. .. code:: cpp template blocked_nd_range(blocked_nd_range, split) -> blocked_nd_range; **Effects**: Allows deduction while using the splitting constructor ``3`` of ``blocked_nd_range``. .. code:: cpp template blocked_nd_range(blocked_nd_range, proportional_split) -> blocked_nd_range; **Effects**: Allows deduction while using the proportional splitting constructor ``4`` of ``blocked_nd_range``. Example ------- .. 
literalinclude:: ./examples/blocked_nd_range_ctad_example.cpp :language: c++ :start-after: /*begin_blocked_nd_range_ctad_example_2*/ :end-before: /*end_blocked_nd_range_ctad_example_2*/ ================================================ FILE: third-party/tbb/doc/main/reference/blocking_terminate.rst ================================================ .. _task_scheduler_handle_reference: task_scheduler_handle Class =========================== .. note:: To enable this feature, set the ``TBB_PREVIEW_WAITING_FOR_WORKERS`` macro to 1. .. contents:: :local: :depth: 1 Description *********** The ``oneapi::tbb::task_scheduler_handle`` class and the ``oneapi::tbb::finalize`` function allow to wait for completion of worker threads. When the ``oneapi::tbb::finalize`` function is called with an ``oneapi::tbb::task_scheduler_handle`` instance, it blocks the calling thread until the completion of all worker threads that were implicitly created by the library. API *** Header ------ .. code:: cpp #include Synopsis -------- .. code:: cpp namespace oneapi { namespace tbb { class task_scheduler_handle { public: task_scheduler_handle() = default; ~task_scheduler_handle(); task_scheduler_handle(const task_scheduler_handle& other) = delete; task_scheduler_handle(task_scheduler_handle&& other) noexcept; task_scheduler_handle& operator=(const task_scheduler_handle& other) = delete; task_scheduler_handle& operator=(task_scheduler_handle&& other) noexcept; explicit operator bool() const noexcept; static task_scheduler_handle get(); static void release(task_scheduler_handle& handle); }; void finalize(task_scheduler_handle& handle); bool finalize(task_scheduler_handle& handle, const std::nothrow_t&) noexcept; } // namespace tbb } // namespace oneapi Member Functions ---------------- .. cpp:function:: task_scheduler_handle() **Effects**: Creates an instance of the ``task_scheduler_handle`` class that does not contain any reference to the task scheduler. 
------------------------------------------------------- .. cpp:function:: ~task_scheduler_handle() **Effects**: Destroys an instance of the ``task_scheduler_handle`` class. Releases a reference to the task scheduler and deactivates an instance of the ``task_scheduler_handle`` class. ------------------------------------------------------- .. cpp:function:: task_scheduler_handle(task_scheduler_handle&& other) noexcept **Effects**: Creates an instance of the ``task_scheduler_handle`` class that references the task scheduler referenced by ``other``. In turn, ``other`` releases its reference to the task scheduler. ------------------------------------------------------- .. cpp:function:: task_scheduler_handle& operator=(task_scheduler_handle&& other) noexcept **Effects**: Releases a reference to the task scheduler referenced by ``this``. Adds a reference to the task scheduler referenced by ``other``. In turn, ``other`` releases its reference to the task scheduler. ------------------------------------------------------- .. cpp:function:: explicit operator bool() const noexcept **Returns**: ``true`` if ``this`` references any task scheduler; ``false`` otherwise. ------------------------------------------------------- .. cpp:function:: task_scheduler_handle get() **Returns**: An instance of the ``task_scheduler_handle`` class that holds a reference to the task scheduler preventing its premature destruction. ------------------------------------------------------- .. cpp:function:: void release(task_scheduler_handle& handle) **Effects**: Releases a reference to the task scheduler and deactivates an instance of the ``task_scheduler_handle`` class. Non-blocking method. Non-member Functions -------------------- .. cpp:function:: void finalize(task_scheduler_handle& handle) **Effects**: Blocks the program execution until all worker threads have been completed. Throws the ``oneapi::tbb::unsafe_wait`` exception if it is not safe to wait for the completion of the worker threads. 
The following conditions should be met for finalization to succeed: - No active (not yet terminated) instances of class ``task_arena`` exist in the whole program; - ``task_scheduler_handle::release`` is called for each other active instance of class ``task_scheduler_handle``, possibly by different application threads. Under these conditions, it is guaranteed that at least one ``finalize`` call succeeds, at which point all worker threads have been completed. If calls are performed simultaneously, more than one call might succeed. .. note:: If you know how many active ``task_scheduler_handle`` instances exist in the program, it is necessary to ``release`` all but the last one, then call ``finalize`` for the last instance. .. caution:: The method always fails if called within a task, a parallel algorithm, or a flow graph node. ------------------------------------------------------- .. cpp:function:: bool finalize(task_scheduler_handle& handle, const std::nothrow_t&) noexcept **Effects**: Blocks the program execution until all worker threads have been completed. Same as above, but returns ``true`` if all worker threads have been completed successfully, or ``false`` if it is not safe to wait for the completion of the worker threads. Examples ******** .. code:: cpp #define TBB_PREVIEW_WAITING_FOR_WORKERS 1 #include #include #include int main() { oneapi::tbb::task_scheduler_handle handle = oneapi::tbb::task_scheduler_handle::get(); // Do some parallel work here, e.g. oneapi::tbb::parallel_for(0, 10000, [](int){}); try { oneapi::tbb::finalize(handle); // oneTBB worker threads are terminated at this point. } catch (const oneapi::tbb::unsafe_wait&) { std::cerr << "Failed to terminate the worker threads." << std::endl; } return 0; } ================================================ FILE: third-party/tbb/doc/main/reference/concurrent_lru_cache_cls.rst ================================================ .. _concurrent_lru_cache: concurrent_lru_cache ==================== .. 
note:: To enable this feature, define the ``TBB_PREVIEW_CONCURRENT_LRU_CACHE`` macro to 1. A Class Template for Least Recently Used cache with concurrent operations. .. contents:: :local: :depth: 1 Description *********** A ``concurrent_lru_cache`` container maps keys to values with the ability to limit the number of stored unused values. For each key, there is at most one item stored in the container. The container permits multiple threads to concurrently retrieve items from it. The container tracks which items are in use by returning a proxy ``concurrent_lru_cache::handle`` object that refers to an item instead of its value. Once there are no ``handle`` objects holding reference to an item, it is considered unused. The container stores all the items that are currently in use plus a limited number of unused items. Excessive unused items are erased according to least recently used policy. When no item is found for a given key, the container calls the user-specified ``value_function_type`` object to construct a value for the key, and stores that value. The ``value_function_type`` object must be thread-safe. API *** Header ------ .. code:: cpp #include "oneapi/tbb/concurrent_lru_cache.h" Synopsis -------- .. code:: cpp namespace oneapi { namespace tbb { template class concurrent_lru_cache { public: using key_type = Key; using value_type = Value; using pointer = value_type*; using const_pointer = const value_type*; using reference = value_type&; using const_reference = const value_type&; using value_function_type = ValueFunctionType; class handle { public: handle(); handle( handle&& other ); ~handle(); handle& operator=( handle&& other ); operator bool() const; value_type& value(); }; // class handle concurrent_lru_cache( value_function_type f, std::size_t number_of_lru_history_items ); ~concurrent_lru_cache(); handle operator[]( key_type key ); }; // class concurrent_lru_cache } // namespace tbb } // namespace oneapi Member Functions ---------------- .. 
cpp:function:: concurrent_lru_cache( value_function_type f, std::size_t number_of_lru_history_items ); **Effects**: Constructs an empty cache that can keep up to ``number_of_lru_history_items`` unused values, with a function object ``f`` for constructing new values. ------------------------------------------------------- .. cpp:function:: ~concurrent_lru_cache(); **Effects**: Destroys the ``concurrent_lru_cache``. Calls the destructors of the stored elements and deallocates the used storage. The behavior is undefined in case of concurrent operations with ``*this``. ------------------------------------------------------- .. cpp:function:: handle operator[]( key_type k ); **Effects**: Searches the container for an item that corresponds to the given key. If such an item is not found, the user-specified function object is called to construct a value that is inserted into the container. **Returns**: a ``handle`` object holding reference to the matching value. Member Objects -------------- ``handle`` class ^^^^^^^^^^^^^^^^ **Member Functions** .. cpp:function:: handle(); **Effects**: Constructs a ``handle`` object that does not refer to any value. -------------------------------------------------- .. cpp:function:: handle( handle&& other ); **Effects**: Transfers the reference to the value stored in ``concurrent_lru_cache`` from ``other`` to the newly constructed object. Upon completion, ``other`` no longer refers to any value. --------------------------------------------------- .. cpp:function:: ~handle(); **Effects**: Releases the reference (if it exists) to a value stored in ``concurrent_lru_cache``. The behavior is undefined for concurrent operations with ``*this``. --------------------------------------------------- .. cpp:function:: handle& operator=( handle&& other ); **Effects**: Transfers the reference to a value stored in ``concurrent_lru_cache`` from ``other`` to ``*this``. If existed, the previous reference held by ``*this`` is released. 
Upon completion ``other`` no longer refers to any value. **Returns**: a reference to ``*this``. --------------------------------------------------- .. cpp:function:: operator bool() const; **Returns**: ``true`` if ``*this`` holds reference to a value, ``false`` otherwise. --------------------------------------------------- .. cpp:function:: value_type& value(); **Returns**: a reference to a ``value_type`` object stored in ``concurrent_lru_cache``. The behavior is undefined if ``*this`` does not refer to any value. ================================================ FILE: third-party/tbb/doc/main/reference/constraints_extensions.rst ================================================ .. _constraints_extensions: task_arena::constraints extensions ====================================== .. note:: To enable this feature, set the ``TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION`` macro to 1. .. contents:: :local: :depth: 1 Description *********** These extensions allow to customize ``tbb::task_arena::constraints`` with the following properties: * On machines with Intel® Hybrid Technology set the preferred core type for threads working within the task arena. * Limit the maximum number of threads that can be scheduled to one core simultaneously. API *** Header ------ .. code:: cpp #include Synopsis -------- .. code:: cpp namespace oneapi { namespace tbb { class task_arena { public: struct constraints { constraints& set_numa_id(numa_node_id id); constraints& set_max_concurrency(int maximal_concurrency); constraints& set_core_type(core_type_id id); constraints& set_max_threads_per_core(int threads_number); numa_node_id numa_id = task_arena::automatic; int max_concurrency = task_arena::automatic; core_type_id core_type = task_arena::automatic; int max_threads_per_core = task_arena::automatic; }; // struct constraints }; // class task_arena } // namespace tbb } // namespace oneapi Member Functions ---------------- .. 
cpp:function:: constraints& set_numa_id(numa_node_id id) Sets the ``numa_id`` field to the ``id``. **Returns:** Reference to ``*this``. .. cpp:function:: constraints& set_max_concurrency(int maximal_concurrency) Sets the ``max_concurrency`` field to the ``maximal_concurrency``. **Returns:** Reference to ``*this``. .. cpp:function:: constraints& set_core_type(core_type_id id) Sets the ``core_type`` field to the ``id``. **Returns:** Reference to ``*this``. .. cpp:function:: constraints& set_max_threads_per_core(int threads_number) Sets the ``max_threads_per_core`` field to the ``threads_number``. **Returns:** Reference to ``*this``. Member Objects -------------- .. cpp:member:: numa_node_id numa_id An integral logical index uniquely identifying a NUMA node. All threads joining the ``task_arena`` are bound to this NUMA node. .. note:: To obtain a valid NUMA node ID, call ``oneapi::tbb::info::numa_nodes()``. .. cpp:member:: int max_concurrency The maximum number of threads that can participate in work processing within the ``task_arena`` at the same time. .. cpp:member:: core_type_id core_type An integral logical index uniquely identifying a core type. All threads joining the ``task_arena`` are bound to this core type. .. note:: To obtain a valid core type node ID, call ``oneapi::tbb::info::core_types()``. .. cpp:member:: int max_threads_per_core The maximum number of threads that can be scheduled to one core simultaneously. See also: * :doc:`oneapi::tbb::info namespace preview extensions ` * `oneapi::tbb::task_arena specification `_ ================================================ FILE: third-party/tbb/doc/main/reference/constructors_for_nodes.rst ================================================ .. _constructors_for_fg_nodes: Constructors for Flow Graph nodes ================================= .. note:: To enable this feature, define the ``TBB_PREVIEW_FLOW_GRAPH_FEATURES`` macro to 1. .. 
contents:: :local: :depth: 1 Description *********** The "Helper Functions for Expressing Graphs" feature adds a set of new constructors that can be used to construct a node that ``follows`` or ``precedes`` a set of nodes. Where possible, the constructors support Class Template Argument Deduction (since C++17). API *** Header ------ .. code:: cpp #include Syntax ------ .. code:: cpp // continue_node continue_node(follows(...), Body body, Policy = Policy()); continue_node(precedes(...), Body body, Policy = Policy()); continue_node(follows(...), int number_of_predecessors, Body body, Policy = Policy()); continue_node(precedes(...), int number_of_predecessors, Body body, Policy = Policy()); // function_node function_node(follows(...), std::size_t concurrency, Policy = Policy()); function_node(precedes(...), std::size_t concurrency, Policy = Policy()); // input_node input_node(precedes(...), body); // multifunction_node multifunction_node(follows(...), std::size_t concurrency, Body body); multifunction_node(precedes(...), std::size_t concurrency, Body body); // async_node async_node(follows(...), std::size_t concurrency, Body body); async_node(precedes(...), std::size_t concurrency, Body body); // overwrite_node explicit overwrite_node(follows(...)); explicit overwrite_node(precedes(...)); // write_once_node explicit write_once_node(follows(...)); explicit write_once_node(precedes(...)); // buffer_node explicit buffer_node(follows(...)); explicit buffer_node(precedes(...)); // queue_node explicit queue_node(follows(...)); explicit queue_node(precedes(...)); // priority_queue_node explicit priority_queue_node(follows(...), const Compare& comp = Compare()); explicit priority_queue_node(precedes(...), const Compare& compare = Compare()); // sequencer_node sequencer_node(follows(...), const Sequencer& s); sequencer_node(precedes(...), const Sequencer& s); // limiter_node limiter_node(follows(...), std::size_t threshold); limiter_node(precedes(...), std::size_t threshold); 
// broadcast_node explicit broadcast_node(follows(...)); explicit broadcast_node(precedes(...)); // join_node explicit join_node(follows(...), Policy = Policy()); explicit join_node(precedes(...), Policy = Policy()); // split_node explicit split_node(follows(...)); explicit split_node(precedes(...)); // indexer_node indexer_node(follows(...)); indexer_node(precedes(...)); See Also ******** :ref:`follows_precedes` ================================================ FILE: third-party/tbb/doc/main/reference/custom_mutex_chmap.rst ================================================ .. _custom_mutex_chmap: The customizing mutex type for ``concurrent_hash_map`` ====================================================== .. note:: To enable this feature, define the ``TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS`` macro to 1. .. contents:: :local: :depth: 1 Description *********** oneTBB ``concurrnent_hash_map`` class uses reader-writer mutex to provide thread safety and avoid data races for insert, lookup, and erasure operations. This feature adds an extra template parameter for ``concurrent_hash_map`` that allows to customize the type of the reader-writer mutex. API *** Header ------ .. code:: cpp #include Synopsis -------- .. code:: cpp namespace oneapi { namespace tbb { template , typename Allocator = tbb_allocator>, typename Mutex = spin_rw_mutex> class concurrent_hash_map { using mutex_type = Mutex; }; } // namespace tbb } // namespace oneapi Type requirements ----------------- The type of the mutex passed as a template argument for ``concurrent_hash_map`` should meet the requirements of `ReaderWriterMutex `_. It should also provide the following API: .. cpp:function:: bool ReaderWriterMutex::scoped_lock::is_writer() const; **Returns**: ``true`` if the ``scoped_lock`` object acquires the mutex as a writer, ``false`` otherwise. The behavior is undefined if the ``scoped_lock`` object does not acquire the mutex. 
``oneapi::tbb::spin_rw_mutex``, ``oneapi::tbb::speculative_spin_rw_mutex``, ``oneapi::tbb::queuing_rw_mutex``, ``oneapi::tbb::null_rw_mutex``, and ``oneapi::tbb::rw_mutex`` meet the requirements above. .. rubric:: Example The example below demonstrates how to wrap ``std::shared_mutex`` (C++17) to meet the requirements of `ReaderWriterMutex` and how to customize ``concurrent_hash_map`` to use this mutex. .. literalinclude:: ./examples/custom_mutex_chmap_example.cpp :language: c++ :start-after: /*begin_custom_mutex_chmap_example*/ :end-before: /*end_custom_mutex_chmap_example*/ ================================================ FILE: third-party/tbb/doc/main/reference/examples/blocked_nd_range_ctad_example.cpp ================================================ /* Copyright (c) 2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #if __cplusplus >= 201703L #define TBB_PREVIEW_BLOCKED_ND_RANGE_DEDUCTION_GUIDES 1 #include int main() { { /*begin_blocked_nd_range_ctad_example_1*/ oneapi::tbb::blocked_range range1(0, 100); oneapi::tbb::blocked_range range2(0, 200); oneapi::tbb::blocked_range range3(0, 300); // Since 3 unidimensional ranges of type int are provided, the type of nd_range // can be deduced as oneapi::tbb::blocked_nd_range oneapi::tbb::blocked_nd_range nd_range(range1, range2, range3); /*end_blocked_nd_range_ctad_example_1*/ } /*begin_blocked_nd_range_ctad_example_2*/ { oneapi::tbb::blocked_range range1(0, 100); oneapi::tbb::blocked_range range2(0, 200); // Deduced as blocked_nd_range oneapi::tbb::blocked_nd_range nd_range(range1, range2); } { // Deduced as blocked_nd_range oneapi::tbb::blocked_nd_range nd_range({0, 100}, {0, 200, 5}); } { int endings[3] = {100, 200, 300}; // Deduced as blocked_nd_range oneapi::tbb::blocked_nd_range nd_range1(endings); // Deduced as blocked_nd_range oneapi::tbb::blocked_nd_range nd_range2({100, 200, 300}, /*grainsize = */10); } /*end_blocked_nd_range_ctad_example_2*/ } #else // Skip int main() {} #endif ================================================ FILE: third-party/tbb/doc/main/reference/examples/custom_mutex_chmap_example.cpp ================================================ /* Copyright (c) 2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #if __cplusplus >= 201703L /*begin_custom_mutex_chmap_example*/ #define TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS 1 #include "oneapi/tbb/concurrent_hash_map.h" #include class SharedMutexWrapper { public: // ReaderWriterMutex requirements static constexpr bool is_rw_mutex = true; static constexpr bool is_recursive_mutex = false; static constexpr bool is_fair_mutex = false; class scoped_lock { public: scoped_lock() : my_mutex_ptr(nullptr), my_writer_flag(false) {} scoped_lock(SharedMutexWrapper& mutex, bool write = true) : my_mutex_ptr(&mutex), my_writer_flag(write) { if (my_writer_flag) { my_mutex_ptr->my_mutex.lock(); } else { my_mutex_ptr->my_mutex.lock_shared(); } } ~scoped_lock() { if (my_mutex_ptr) release(); } void acquire(SharedMutexWrapper& mutex, bool write = true) { if (my_mutex_ptr) release(); my_mutex_ptr = &mutex; my_writer_flag = write; if (my_writer_flag) { my_mutex_ptr->my_mutex.lock(); } else { my_mutex_ptr->my_mutex.lock_shared(); } } bool try_acquire(SharedMutexWrapper& mutex, bool write = true) { if (my_mutex_ptr) release(); my_mutex_ptr = &mutex; bool result = false; if (my_writer_flag) { result = my_mutex_ptr->my_mutex.try_lock(); } else { result = my_mutex_ptr->my_mutex.try_lock_shared(); } if (result) my_writer_flag = write; return result; } void release() { if (my_writer_flag) { my_mutex_ptr->my_mutex.unlock(); } else { my_mutex_ptr->my_mutex.unlock_shared(); } } bool upgrade_to_writer() { // std::shared_mutex does not have the upgrade/downgrade semantics if (my_writer_flag) return true; // Already a writer my_mutex_ptr->my_mutex.unlock_shared(); my_mutex_ptr->my_mutex.lock(); return false; // The lock was reacquired } bool downgrade_to_reader() { if (!my_writer_flag) return true; // Already a reader my_mutex_ptr->my_mutex.unlock(); my_mutex_ptr->my_mutex.lock_shared(); return false; } bool is_writer() const { return my_writer_flag; } private: SharedMutexWrapper* my_mutex_ptr; bool my_writer_flag; }; private: std::shared_mutex my_mutex; 
}; // struct SharedMutexWrapper int main() { using map_type = oneapi::tbb::concurrent_hash_map, oneapi::tbb::tbb_allocator>, SharedMutexWrapper>; map_type map; // This object will use SharedMutexWrapper for thread safety of insert/find/erase operations } /*end_custom_mutex_chmap_example*/ #else // C++17 // Skip int main() {} #endif ================================================ FILE: third-party/tbb/doc/main/reference/examples/fixed_pool_example.cpp ================================================ /* Copyright (c) 2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /*begin_fixed_pool_example*/ #define TBB_PREVIEW_MEMORY_POOL 1 #include "oneapi/tbb/memory_pool.h" int main() { char buf[1024]; oneapi::tbb::fixed_pool my_pool(buf, 1024); void* my_ptr = my_pool.malloc(10); my_pool.free(my_ptr); } /*end_fixed_pool_example*/ ================================================ FILE: third-party/tbb/doc/main/reference/examples/helpers_for_expressing_graphs_preview_api_example.cpp ================================================ /* Copyright (c) 2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ #if __cplusplus >= 201703L /*begin_helpers_for_expressing_graphs_preview_api_example*/ #define TBB_PREVIEW_FLOW_GRAPH_FEATURES 1 #include int main() { using namespace oneapi::tbb::flow; graph g; function_node doubler(g, unlimited, [](const int& v) { return 2 * v; }); function_node squarer(g, unlimited, [](const int& v) { return v * v; }); function_node cuber(g, unlimited, [](const int& v) { return v * v * v; }); auto handlers = make_node_set(doubler, squarer, cuber); broadcast_node input(precedes(handlers)); join_node join(follows(handlers)); int sum = 0; function_node summer(follows(join), serial, [&](const std::tuple& v) { int sub_sum = std::get<0>(v) + std::get<1>(v) + std::get<2>(v); sum += sub_sum; return sub_sum; }); for (int i = 1; i <= 10; ++i) { input.try_put(i); } g.wait_for_all(); } /*end_helpers_for_expressing_graphs_preview_api_example*/ #else // Skip int main() {} #endif ================================================ FILE: third-party/tbb/doc/main/reference/examples/helpers_for_expressing_graphs_regular_api_example.cpp ================================================ /* Copyright (c) 2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #if __cplusplus >= 201703L /*begin_helpers_for_expressing_graphs_regular_api_example*/ #include int main() { using namespace oneapi::tbb::flow; graph g; broadcast_node input(g); function_node doubler(g, unlimited, [](const int& v) { return 2 * v; }); function_node squarer(g, unlimited, [](const int& v) { return v * v; }); function_node cuber(g, unlimited, [](const int& v) { return v * v * v; }); join_node> join(g); int sum = 0; function_node summer(g, serial, [&](const std::tuple& v) { int sub_sum = std::get<0>(v) + std::get<1>(v) + std::get<2>(v); sum += sub_sum; return sub_sum; }); make_edge(input, doubler); make_edge(input, squarer); make_edge(input, cuber); make_edge(doubler, std::get<0>(join.input_ports())); make_edge(squarer, std::get<1>(join.input_ports())); make_edge(cuber, std::get<2>(join.input_ports())); make_edge(join, summer); for (int i = 1; i <= 10; ++i) { input.try_put(i); } g.wait_for_all(); } /*end_helpers_for_expressing_graphs_regular_api_example*/ #else // Skip int main() {} #endif ================================================ FILE: third-party/tbb/doc/main/reference/examples/make_edges_function_example.cpp ================================================ /* Copyright (c) 2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #if __cplusplus >= 201703L /*begin_make_edges_function_example*/ #define TBB_PREVIEW_FLOW_GRAPH_FEATURES 1 #include int main() { using namespace oneapi::tbb::flow; graph g; broadcast_node input(g); function_node doubler(g, unlimited, [](const int& i) { return 2 * i; }); function_node squarer(g, unlimited, [](const int& i) { return i * i; }); function_node cuber(g, unlimited, [](const int& i) { return i * i * i; }); buffer_node buffer(g); auto handlers = make_node_set(doubler, squarer, cuber); make_edges(input, handlers); make_edges(handlers, buffer); for (int i = 1; i <= 10; ++i) { input.try_put(i); } g.wait_for_all(); } /*end_make_edges_function_example*/ #else // Skip int main() {} #endif ================================================ FILE: third-party/tbb/doc/main/reference/examples/malloc_replacement_log_example.cpp ================================================ /* Copyright (c) 2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #if _WIN32 /*begin_malloc_replacement_log_example*/ #include "oneapi/tbb/tbbmalloc_proxy.h" #include int main(){ char **func_replacement_log; int func_replacement_status = TBB_malloc_replacement_log(&func_replacement_log); if (func_replacement_status != 0) { printf("tbbmalloc_proxy cannot replace memory allocation routines\n"); for (char** log_string = func_replacement_log; *log_string != 0; log_string++) { printf("%s\n",*log_string); } } return 0; } /*end_malloc_replacement_log_example*/ #else // Skip int main() {} #endif ================================================ FILE: third-party/tbb/doc/main/reference/examples/memory_pool_allocator_example.cpp ================================================ /* Copyright (c) 2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /*begin_memory_pool_allocator_example*/ #define TBB_PREVIEW_MEMORY_POOL 1 #include "oneapi/tbb/memory_pool.h" #include int main() { oneapi::tbb::memory_pool> my_pool; typedef oneapi::tbb::memory_pool_allocator pool_allocator_t; std::list my_list(pool_allocator_t{my_pool}); my_list.emplace_back(1); } /*end_memory_pool_allocator_example*/ ================================================ FILE: third-party/tbb/doc/main/reference/examples/memory_pool_example.cpp ================================================ /* Copyright (c) 2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /*begin_memory_pool_example*/ #define TBB_PREVIEW_MEMORY_POOL 1 #include "oneapi/tbb/memory_pool.h" int main() { oneapi::tbb::memory_pool> my_pool; void* my_ptr = my_pool.malloc(10); my_pool.free(my_ptr); } /*end_memory_pool_example*/ ================================================ FILE: third-party/tbb/doc/main/reference/examples/parallel_phase_example.cpp ================================================ /* Copyright (c) 2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ /*begin_parallel_phase_example*/ #define TBB_PREVIEW_PARALLEL_PHASE 1 #include "oneapi/tbb/task_arena.h" #include "oneapi/tbb/parallel_for.h" #include "oneapi/tbb/parallel_sort.h" #include int main() { oneapi::tbb::task_arena ta { tbb::task_arena::automatic, /*reserved_for_masters=*/1, tbb::task_arena::priority::normal, tbb::task_arena::leave_policy::fast }; std::vector data(1000); { oneapi::tbb::task_arena::scoped_parallel_phase phase{ta}; ta.execute([&data]() { oneapi::tbb::parallel_for(std::size_t(0), data.size(), [&data](std::size_t i) { data[i] = static_cast(i*i); }); }); for (std::size_t i = 1; i < data.size(); ++i) { data[i] += data[i-1]; } ta.execute([&data]() { oneapi::tbb::parallel_sort(data.begin(), data.end()); }); } } /*end_parallel_phase_example*/ ================================================ FILE: third-party/tbb/doc/main/reference/examples/parallel_sort_ranges_extension_example.cpp ================================================ /* Copyright (c) 2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #if __cplusplus >= 202002L /*begin_parallel_sort_ranges_extension_example*/ #include #include // requires C++20 #include std::span get_span() { static std::array arr = {3, 2, 1}; return std::span(arr); } int main() { tbb::parallel_sort(get_span()); } /*end_parallel_sort_ranges_extension_example*/ #else // Skip int main() {} #endif ================================================ FILE: third-party/tbb/doc/main/reference/examples/rvalue_reduce.cpp ================================================ /* Copyright (c) 2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #if __cplusplus >= 201703L /*begin_rvalue_reduce_example*/ // C++17 #include #include #include #include int main() { std::vector> sets; oneapi::tbb::parallel_reduce(oneapi::tbb::blocked_range(0, sets.size()), std::set{}, // identity element - empty set [&](const oneapi::tbb::blocked_range& range, std::set&& value) { for (size_t i = range.begin(); i < range.end(); ++i) { // Having value as a non-const rvalue reference allows to efficiently // transfer nodes from sets[i] without copying/moving the data value.merge(std::move(sets[i])); } return value; }, [&](std::set&& x, std::set&& y) { x.merge(std::move(y)); return x; } ); } /*end_rvalue_reduce_example*/ #else // Skip int main() {} #endif ================================================ FILE: third-party/tbb/doc/main/reference/examples/try_put_and_wait_example.cpp ================================================ /* Copyright (c) 2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include // dummy bodies struct f1_body { int operator()(int input) { return input; }; }; struct f2_body : f1_body {}; struct f3_body : f1_body {}; struct f4_body { int operator()(const std::tuple& input) { return 0; } }; /*begin_try_put_and_wait_example*/ #define TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT 1 #include #include #include struct f1_body; struct f2_body; struct f3_body; struct f4_body; int main() { using namespace oneapi::tbb; flow::graph g; flow::broadcast_node start_node(g); flow::function_node f1(g, flow::unlimited, f1_body{}); flow::function_node f2(g, flow::unlimited, f2_body{}); flow::function_node f3(g, flow::unlimited, f3_body{}); flow::join_node> join(g); flow::function_node, int> f4(g, flow::serial, f4_body{}); flow::make_edge(start_node, f1); flow::make_edge(f1, f2); flow::make_edge(start_node, f3); flow::make_edge(f2, flow::input_port<0>(join)); flow::make_edge(f3, flow::input_port<1>(join)); flow::make_edge(join, f4); // Submit work into the graph parallel_for(0, 100, [&](int input) { start_node.try_put_and_wait(input); // Post processing the result of input }); } /*end_try_put_and_wait_example*/ ================================================ FILE: third-party/tbb/doc/main/reference/follows_and_precedes_functions.rst ================================================ .. _follows_precedes: ``follows`` and ``precedes`` function templates =============================================== .. note:: To enable this feature, define the ``TBB_PREVIEW_FLOW_GRAPH_FEATURES`` macro to 1. The ``follows`` and ``precedes`` helper functions aid in expressing dependencies between nodes when building oneTBB flow graphs. These helper functions can only be used while constructing the node. .. contents:: :local: :depth: 1 Description *********** The ``follows`` helper function specifies that the node being constructed is the successor of the set of nodes passed as an argument. 
The ``precedes`` helper function specifies that the node being constructed is the predecessor of the set of nodes passed as an argument. Functions ``follows`` and ``precedes`` are meant to replace the graph argument, which is passed as the first argument to the constructor of the node. The graph argument for the node being constructed is obtained either from the specified node set or the sequence of nodes passed to ``follows`` or ``precedes``. If the nodes passed to ``follows`` or ``precedes`` belong to different graphs, the behavior is undefined. API *** Header ------ .. code:: cpp #include Syntax ------ .. code:: cpp // node_set is an exposition-only name for the type returned from make_node_set function template /*unspecified*/ follows( node_set& set ); template /*unspecified*/ follows( NodeType& node, NodeTypes&... nodes ); template /*unspecified*/ precedes( node_set& set ); template /*unspecified*/ precedes( NodeType& node, NodeTypes&... nodes ); Input Parameters ---------------- Either a set or a sequence of nodes can be used as arguments for ``follows`` and ``precedes``. The following expressions are equivalent: .. code-block:: cpp :caption: A set of nodes as an input auto handlers = make_node_set(n1, n2, n3); broadcast_node input(precedes(handlers)); .. code-block:: cpp :caption: A sequence of nodes as an input broadcast_node input(precedes(n1, n2, n3)); ================================================ FILE: third-party/tbb/doc/main/reference/helpers_for_expressing_graphs.rst ================================================ .. _helpers_for_expressing_graphs: Helper Functions for Expressing Graphs ====================================== .. note:: To enable this feature, define the ``TBB_PREVIEW_FLOW_GRAPH_FEATURES`` macro to 1. Helper functions are intended to make creation of the flow graphs less verbose. .. 
contents:: :local: :depth: 1 Description *********** This feature adds ``make_edges``, ``make_node_set``, ``follows`` and ``precedes`` functions to ``oneapi::tbb::flow`` namespace. These functions simplify the process of building flow graphs by allowing to gather nodes into sets and connect them to other nodes in the graph. API *** .. toctree:: :titlesonly: constructors_for_nodes follows_and_precedes_functions make_node_set_function make_edges_function Example ******* Consider the graph depicted below. .. figure:: ./Resources/fg_api_graph_structure.png :align: center In the examples below, C++17 Class Template Argument Deduction is used to avoid template parameter specification where possible. **Regular API** .. literalinclude:: ./examples/helpers_for_expressing_graphs_regular_api_example.cpp :language: c++ :start-after: /*begin_helpers_for_expressing_graphs_regular_api_example*/ :end-before: /*end_helpers_for_expressing_graphs_regular_api_example*/ **Preview API** .. literalinclude:: ./examples/helpers_for_expressing_graphs_preview_api_example.cpp :language: c++ :start-after: /*begin_helpers_for_expressing_graphs_preview_api_example*/ :end-before: /*end_helpers_for_expressing_graphs_preview_api_example*/ ================================================ FILE: third-party/tbb/doc/main/reference/heterogeneous_extensions_chmap.rst ================================================ .. _heterogeneous_extensions_chmap: Heterogeneous overloads for ``concurrent_hash_map`` member functions ==================================================================== .. note:: To enable this feature, define the ``TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS`` macro to 1. A set of overloads for ``concurrent_hash_map`` member functions that allow to search, erase, and insert elements into the container without creating a temporary ``key_type`` object. .. 
contents:: :local: :depth: 1 Description *********** Heterogeneous overloads allow you to perform insert, lookup, and erasure operations on ``concurrent_hash_map`` object using an object of the type that is different from ``key_type`` but comparable with it. All member functions described below only participate in overload resolution if ``HashCompareType::is_transparent`` is valid and denotes a type.``HashCompareType`` is a type of the ``HashCompare`` passed as a template argument for ``concurrent_hash_map``. It means that the ``HashCompare`` object calculates a hash and compares keys for equality without creating a temporary ``key_type`` object. API *** Header ------ .. code:: cpp #include "oneapi/tbb/concurrent_hash_map.h" Synopsis -------- .. code:: cpp namespace oneapi { namespace tbb { template , typename Allocator = tbb_allocator>> class concurrent_hash_map { public: // Insertion template bool insert( accessor& result, const K& k ); template bool insert( const_accessor& result, const K& k ); // Lookup template bool find( accessor& result, const K& k ); template bool find( const_accessor& result, const K& k ) const; template size_type count( const K& k ) const; template std::pair equal_range( const K& k ); template std::pair equal_range( const K& k ) const; // Erasure template bool erase( const K& k ); }; } // namespace tbb } // namespace oneapi Member functions ---------------- Insertion ^^^^^^^^^ .. code:: cpp template bool insert( accessor& result, const K& k ); template bool insert( const_accessor& result, const K& k ); If the accessor ``result`` is not empty, releases the ``result`` and tries to insert the value constructed from ``{k, mapped_type()}`` into the container. Sets the ``result`` to provide access to the inserted element or to the element with the key that compares `equivalent` to the value ``k``. This overload only participates in overload resolution if ``std::is_constructible`` is ``true``. 
**Returns**: ``true`` if the insertion was applied, ``false`` otherwise. Lookup ^^^^^^ .. code:: cpp template bool find( accessor& result, const K& k ); template bool find( const_accessor& result, const K& k ) const; If the accessor ``result`` is not empty, releases the ``result``. If an element with the key that compares `equivalent` to the value ``k`` exists, sets the ``result`` to provide access to this element. **Returns**: ``true`` if an element with the key that compares `equivalent` to the value ``k`` is found, ``false`` otherwise. ------------------------------------------------ .. code:: cpp template size_type count( const K& k ) const; **Returns**: ``1`` if an element with the key that compares `equivalent` to the value ``k`` exists, ``0`` otherwise. ------------------------------------------------ .. code:: cpp template std::pair equal_range( const K& k ); template std::pair equal_range( const K& k ) const; **Returns**: - A pair of iterators ``{f, l}`` if an element with the key that compares `equivalent` to the value ``k`` exists in the container. Here ``f`` is an iterator to this element, ``l`` is ``std::next(f)``. - ``{end(), end()}`` otherwise. .. rubric:: Example The example below demonstrates how to use heterogeneous lookup feature to find an object with the key of type ``std::string`` using an object of type ``const char*`` without conversions. .. 
code:: cpp #define TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS 1 #include "oneapi/tbb/concurrent_hash_map.h" #include #include // HashCompare an object that can calculate the hash code for // std::string only and compare strings for equality class RegularHashCompare { private: std::hash my_hasher; public: std::size_t hash( const std::string& key ) const { return my_hasher(key); } bool equal( const std::string& key1, const std::string& key2 ) const { return key1 == key2; } }; // HashCompare an object that can calculate the hash code for // std::string and const char*, and compare them for equality class TransparentHashCompare { private: std::hash my_hasher; // Calculates a hash for the array of chars std::size_t calculate_hash( const char* ptr ) const { std::size_t h = 0; for (auto c = ptr; *c; ++c) { h = h ^ my_hasher(*c); } return h; } public: using is_transparent = void; std::size_t hash( const char* key ) const { return calculate_hash(key); } std::size_t hash( const std::string& key ) const { return calculate_hash(key.c_str()); } bool equal( const char* key1, const char* key2 ) const { return std::strcmp(key1, key2) == 0; } bool equal( const char* key1, const std::string& key2 ) const { return std::strcmp(key1, key2.c_str()) == 0; } bool equal( const std::string& key1, const char* key2 ) const { return std::strcmp(key1.c_str(), key2) == 0; } bool equal( const std::string& key1, const std::string& key2 ) const { return std::strcmp(key1.c_str(), key2.c_str()) == 0; } }; int main() { using regular_hash_map = oneapi::tbb::concurrent_hash_map; using transparent_hash_map = oneapi::tbb::concurrent_hash_map; using regular_accessor = typename regular_hash_map::accessor; using transparent_accessor = typename transparent_hash_map::accessor; // Accessors regular_accessor reg_accessor; transparent_accessor tran_accessor; // Maps regular_hash_map regular_map; transparent_hash_map tran_map; // Heterogeneous overloads do not participate in overload resolution // Such a call 
matches on the find overload, which accepts key_type (std::string) // Creates a temporary key_type (std::string) object because of implicit conversion bool result = regular_map.find(reg_accessor, "abc"); // Heterogeneous overloads participate in overload resolution // No implicit conversion from const char* to std::string takes place result = tran_map.find(tran_accessor, "abc"); } ================================================ FILE: third-party/tbb/doc/main/reference/info_namespace.rst ================================================ .. _info_namespace: oneapi::tbb::info namespace =========================== The ``oneapi::tbb::info`` namespace satisfies `the corresponding oneTBB specification section `_. The |full_name| implementation requires `the hwloc library `_ to query NUMA(version >= 1.11) and Hybrid CPUs(version >= 2.4) topology information. See also: * `info namespace specification `_ * `oneapi::tbb::task_arena specification `_ * :doc:`oneapi::tbb::info namespace preview extensions ` * :doc:`task_arena::constraints class preview extensions ` ================================================ FILE: third-party/tbb/doc/main/reference/info_namespace_extensions.rst ================================================ .. _info_namespace_extensions: oneapi::tbb::info namespace extensions ====================================== .. note:: To enable this feature, set the ``TBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION`` macro to 1. .. contents:: :local: :depth: 1 Description *********** These extensions allow to query information about execution environment. .. contents:: :local: :depth: 1 API *** Header ------ .. code:: cpp #include Syntax ------ .. code:: cpp namespace oneapi { namespace tbb { using core_type_id = /*implementation-defined*/; namespace info { std::vector core_types(); int default_concurrency(task_arena::constraints c); } } } Types ----- ``core_type_id`` - Represents core type identifier. Functions --------- .. 
cpp:function:: std::vector core_types() Returns the vector of integral indexes that indicate available core types. The indexes are sorted from the least performant to the most performant core type. .. note:: If an error occurs during system topology parsing, returns a vector containing a single element equal to ``task_arena::automatic``. .. cpp:function:: int default_concurrency(task_arena::constraints c) Returns concurrency level for the given constraints. See also: * :doc:`task_arena::constraints class preview extensions ` * `info namespace specification `_ ================================================ FILE: third-party/tbb/doc/main/reference/make_edges_function.rst ================================================ .. _make_edges: ``make_edges`` function template ================================ .. note:: To enable this feature, define the ``TBB_PREVIEW_FLOW_GRAPH_FEATURES`` macro to 1. .. contents:: :local: :depth: 1 Description *********** The ``make_edges`` function template creates edges between a single node and each node in a set of nodes. There are two ways to connect nodes in a set and a single node using ``make_edges``: .. figure:: ./Resources/make_edges_usage.png :align: center API *** Header ------ .. code:: cpp #include Syntax ------ .. code:: cpp // node_set is an exposition-only name for the type returned from make_node_set function template void make_edges(node_set& set, NodeType& node); template void make_edges(NodeType& node, node_set& set); Example ------- The example implements the graph structure in the picture below. .. figure:: ./Resources/make_edges_example.png :align: center .. literalinclude:: ./examples/make_edges_function_example.cpp :language: c++ :start-after: /*begin_make_edges_function_example*/ :end-before: /*end_make_edges_function_example*/ ================================================ FILE: third-party/tbb/doc/main/reference/make_node_set_function.rst ================================================ ..
_make_node_set: ``make_node_set`` function template =================================== .. note:: To enable this feature, define the ``TBB_PREVIEW_FLOW_GRAPH_FEATURES`` macro to 1. .. contents:: :local: :depth: 1 Description *********** The ``make_node_set`` function template creates a set of nodes that can be passed as arguments to ``make_edges``, ``follows`` and ``precedes`` functions. API *** Header ------ .. code:: cpp #include Syntax ------ .. code:: cpp template /*unspecified*/ make_node_set( Node& node, Nodes&... nodes ); See Also ******** :ref:`make_edges` :ref:`follows_precedes` ================================================ FILE: third-party/tbb/doc/main/reference/mutex_cls.rst ================================================ .. _mutex: mutex ========== .. note:: To enable this feature, define the ``TBB_PREVIEW_MUTEXES`` macro to 1. Description *********** A ``mutex`` is a class that models the `Mutex requirement `_, using adaptive approach: the combination of spinlock and waiting on system primitives. The ``mutex`` class satisfies all of the mutex requirements described in the [thread.mutex.requirements] section of the ISO C++ standard. The ``mutex`` class is not fair or recursive. API *** Header ------ .. code:: cpp #include Synopsis -------- .. code:: cpp namespace oneapi { namespace tbb { class mutex { public: mutex() noexcept; ~mutex(); mutex(const mutex&) = delete; mutex& operator=(const mutex&) = delete; class scoped_lock; void lock(); bool try_lock(); void unlock(); static constexpr bool is_rw_mutex = false; static constexpr bool is_recursive_mutex = false; static constexpr bool is_fair_mutex = false; }; } } Member classes -------------- .. namespace:: tbb::mutex .. cpp:class:: scoped_lock The corresponding ``scoped_lock`` class. See the `Mutex requirement `_. Member functions ---------------- .. cpp:function:: mutex() Constructs ``mutex`` with unlocked state. -------------------------------------------------- .. 
cpp:function:: ~mutex() Destroys an unlocked ``mutex``. -------------------------------------------------- .. cpp:function:: void lock() Acquires a lock. It uses adaptive logic for waiting: it blocks after particular time period of busy wait. -------------------------------------------------- .. cpp:function:: bool try_lock() Tries to acquire a lock (non-blocking). Returns **true** if succeeded; **false** otherwise. -------------------------------------------------- .. cpp:function:: void unlock() Releases the lock held by a current thread. ================================================ FILE: third-party/tbb/doc/main/reference/parallel_for_each_semantics.rst ================================================ .. _parallel_for_each_semantics: parallel_for_each Body semantics and requirements ================================================= .. contents:: :local: :depth: 1 Description *********** This page clarifies `ParallelForEachBody `_ named requirements for ``tbb::parallel_for_each`` algorithm specification. .. 
code:: cpp namespace oneapi { namespace tbb { template void parallel_for_each( InputIterator first, InputIterator last, Body body ); // overload (1) template void parallel_for_each( InputIterator first, InputIterator last, Body body, task_group_context& group ); // overload (2) template void parallel_for_each( Container& c, Body body ); // overload (3) template void parallel_for_each( Container& c, Body body, task_group_context& group ); // overload (4) template void parallel_for_each( const Container& c, Body body ); // overload (5) template void parallel_for_each( const Container& c, Body body, task_group_context& group ); // overload (6) } // namespace tbb } // namespace oneapi Terms ----- * ``iterator`` determines the type of the iterator passed into ``parallel_for_each`` algorithm (which is ``InputIterator`` for overloads `(1)` and `(2)` and ``decltype(std::begin(c))`` for overloads `(3) - (6)`) * ``value_type`` - the type ``typename std::iterator_traits::value_type`` * ``reference`` - the type ``typename std::iterator_traits::reference``. Requirements for different iterator types ----------------------------------------- If the ``iterator`` satisfies `Input iterator` named requirements from [input.iterators] ISO C++ Standard section and does not satisfy `Forward iterator` named requirements from [forward.iterators] ISO C++ Standard section, ``tbb::parallel_for_each`` requires the execution of the ``body`` with an object of type ``const value_type&`` or ``value_type&&`` to be well-formed. If both forms are well-formed, an overload with rvalue reference will be preferred. .. caution:: If the ``Body`` only takes non-const lvalue reference to ``value_type``, named requirements above are violated and the program can be ill-formed.
If the ``iterator`` satisfies `Forward iterator` named requirements from [forward.iterators] ISO C++ Standard section, ``tbb::parallel_for_each`` requires the execution of the ``body`` with an object of type ``reference`` to be well-formed. Requirements for ``Body`` with ``feeder`` argument -------------------------------------------------- Additional elements submitted into ``tbb::parallel_for_each`` through the ``feeder::add`` are passed to the ``Body`` as rvalues and therefore the corresponding execution of the ``Body`` is required to be well-formed. ================================================ FILE: third-party/tbb/doc/main/reference/parallel_phase_for_task_arena.rst ================================================ .. _parallel_phase_for_task_arena: ``parallel_phase`` Interface for Task Arena ==================================================================== .. note:: To enable this feature, set ``TBB_PREVIEW_PARALLEL_PHASE`` macro to 1. .. contents:: :local: :depth: 1 Description *********** This feature extends the `tbb::task_arena specification `_ with the following API: * Adds the ``leave_policy`` enumeration class to ``task_arena``. * Adds ``leave_policy`` as the last parameter in ``task_arena`` constructors and ``task_arena::initialize`` methods. This allows you to inform the scheduler about the preferred policy for worker threads when they are about to leave `task_arena` due to a lack of available work. * Adds new ``start_parallel_phase`` and ``end_parallel_phase`` interfaces to the ``task_arena`` class and the ``this_task_arena`` namespace. These interfaces work as hints to the scheduler to mark the start and end of parallel work submission into the arena, enabling different worker thread retention policies. * Adds the Resource Acquisition is Initialization (RAII) class ``scoped_parallel_phase`` to ``task_arena``.
More details about motivation, semantics and conditions for becoming fully supported functionality can be found in the corresponding `Request For Comments document for parallel_phase `_. API *** Header ------ .. code:: cpp #define TBB_PREVIEW_PARALLEL_PHASE 1 #include Synopsis -------- .. code:: cpp namespace oneapi { namespace tbb { class task_arena { public: enum class leave_policy : /* unspecified type */ { automatic = /* unspecifed */, fast = /* unspecifed */, }; task_arena(int max_concurrency = automatic, unsigned reserved_for_masters = 1, priority a_priority = priority::normal, leave_policy a_leave_policy = leave_policy::automatic); task_arena(const constraints& constraints_, unsigned reserved_for_masters = 1, priority a_priority = priority::normal, leave_policy a_leave_policy = leave_policy::automatic); void initialize(int max_concurrency, unsigned reserved_for_masters = 1, priority a_priority = priority::normal, leave_policy a_leave_policy = leave_policy::automatic); void initialize(constraints a_constraints, unsigned reserved_for_masters = 1, priority a_priority = priority::normal, leave_policy a_leave_policy = leave_policy::automatic); void start_parallel_phase(); void end_parallel_phase(bool with_fast_leave = false); class scoped_parallel_phase { public: scoped_parallel_phase(task_arena& ta, bool with_fast_leave = false); }; }; // class task_arena namespace this_task_arena { void start_parallel_phase(); void end_parallel_phase(bool with_fast_leave = false); } // namespace this_task_arena } // namespace tbb } // namespace oneapi Member Types ---------------- .. cpp:enum:: leave_policy::automatic When passed to a constructor or the ``initialize`` method, the initialized ``task_arena`` has the default (possibly system specific) policy for how quickly worker threads leave the arena when there is no more work available in the arena and when the arena is not in a parallel phase. .. 
note:: Worker threads in ``task_arena`` might be retained based on internal heuristics. .. cpp:enum:: leave_policy::fast When passed to a constructor or the ``initialize`` method, the initialized ``task_arena`` has policy that allows worker threads to more quickly leave the arena when there is no more work available in the arena and when the arena is not in a parallel phase. .. cpp:class:: scoped_parallel_phase The RAII class to map a parallel phase to a code scope. .. cpp:function:: scoped_parallel_phase::scoped_parallel_phase(task_arena& ta, bool with_fast_leave = false) Constructs a ``scoped_parallel_phase`` object that starts a parallel phase in the specified ``task_arena``. If ``with_fast_leave`` is ``true``, the worker threads leave policy is temporarily set to ``fast``. .. note:: For ``task_arena`` initialized with ``leave_policy::fast``, ``with_fast_leave`` setting has no effect. .. note:: When worker threads enter the arena with no active parallel phases, the leave policy is reset to the value set during the initialization of the arena. Member Functions ---------------- .. cpp:function:: task_arena(const task_arena&) Copies settings from ``task_arena`` instance including the ``leave_policy``. .. cpp:function:: void start_parallel_phase() Indicates a point from where the scheduler can use a hint to keep threads in the arena for longer. .. note:: This function can also be a warm-up hint for the scheduler. It allows the scheduler to wake up worker threads in advance. .. cpp:function:: void end_parallel_phase(bool with_fast_leave = false) Indicates the point when the scheduler may drop a hint and no longer retain threads in the arena. If ``with_fast_leave`` is ``true``, worker threads leave policy is temporarily set to ``fast``. .. note:: For ``task_arena`` initialized with ``leave_policy::fast``, ``with_fast_leave`` setting has no effect. .. 
note:: When worker threads enter the arena with no active parallel phases, the leave policy is reset to the value set during the initialization of the arena. Functions --------- .. cpp:function:: void this_task_arena::start_parallel_phase() Indicates the start of the parallel phase in the current ``task_arena``. .. cpp:function:: void this_task_arena::end_parallel_phase(bool with_fast_leave = false) Indicates the end of the parallel phase in the current ``task_arena``. If ``with_fast_leave`` is ``true``, worker threads leave policy is temporarily set to ``fast``. Example ******* .. literalinclude:: ./examples/parallel_phase_example.cpp :language: c++ :start-after: /*begin_parallel_phase_example*/ :end-before: /*end_parallel_phase_example*/ In this example, ``task_arena`` is created with ``leave_policy::fast``. It means that worker threads are not expected to remain in ``task_arena`` once parallel work is completed. However, the workflow includes a sequence of parallel work (initializing and sorting data) interspersed with serial work (prefix sum). To hint the start and end of parallel work, ``scoped_parallel_phase`` is used. This provides a hint to the scheduler that worker threads might need to remain in ``task_arena`` since there is more parallel work to come. ================================================ FILE: third-party/tbb/doc/main/reference/parallel_sort_ranges_extension.rst ================================================ .. _parallel_sort_ranges_extension: parallel_sort ranges interface extension ======================================== .. contents:: :local: :depth: 1 Description *********** |full_name| implementation extends the `oneapi::tbb::parallel_sort specification `_ with overloads that take the container by forwarding reference. API *** Header ------ .. code:: cpp #include Syntax ------ ..
code:: cpp namespace oneapi { namespace tbb { template void parallel_sort( Container&& c ); template void parallel_sort( Container&& c, const Compare& comp ); } // namespace tbb } // namespace oneapi Functions --------- .. cpp:function:: template void parallel_sort( Container&& c ); Equivalent to ``parallel_sort( std::begin(c), std::end(c), comp )``, where `comp` uses `operator<` to determine relative orderings. .. cpp:function:: template void parallel_sort( Container&& c, const Compare& comp ); Equivalent to ``parallel_sort( std::begin(c), std::end(c), comp )``. Example ------- This interface may be used for sorting rvalue or constant views: .. literalinclude:: ./examples/parallel_sort_ranges_extension_example.cpp :language: c++ :start-after: /*begin_parallel_sort_ranges_extension_example*/ :end-before: /*end_parallel_sort_ranges_extension_example*/ ================================================ FILE: third-party/tbb/doc/main/reference/reference.rst ================================================ .. _reference: |short_name| API Reference ========================== For oneTBB API Reference, refer to `oneAPI Specification `_. The current supported version of oneAPI Specification is 1.0. Specification extensions ************************ |full_name| implements the `oneTBB specification `_. This document provides additional details or restrictions where necessary. It also describes features that are not included in the oneTBB specification. .. toctree:: :titlesonly: parallel_for_each_semantics parallel_sort_ranges_extension scalable_memory_pools/malloc_replacement_log rvalue_reduce Preview features **************** A preview feature is a component of oneTBB introduced to receive early feedback from users. The key properties of a preview feature are: - It is off by default and must be explicitly enabled. - It is intended to have a high quality implementation. - There is no guarantee of future existence or compatibility. 
- It may have limited or no support in tools such as correctness analyzers, profilers and debuggers. .. caution:: A preview feature is subject to change in future. It might be removed or significantly altered in future releases. Changes to a preview feature do NOT require usual deprecation and removal process. Therefore, using preview features in production code is strongly discouraged. .. toctree:: :titlesonly: type_specified_message_keys scalable_memory_pools helpers_for_expressing_graphs concurrent_lru_cache_cls task_group_extensions custom_mutex_chmap try_put_and_wait parallel_phase_for_task_arena blocked_nd_range_ctad ================================================ FILE: third-party/tbb/doc/main/reference/rvalue_reduce.rst ================================================ .. _rvalue_reduce: Parallel Reduction for rvalues ============================== .. contents:: :local: :depth: 1 Description *********** |full_name| implementation extends the `ParallelReduceFunc `_ and `ParallelReduceReduction `_ to optimize operating with ``rvalues`` using functional form of ``tbb::parallel_reduce`` and ``tbb::parallel_deterministic_reduce`` algorithms. API *** Header ------ .. code:: cpp #include ParallelReduceFunc Requirements: Pseudo-Signature, Semantics ------------------------------------------------------------ .. cpp:function:: Value Func::operator()(const Range& range, Value&& x) const or .. cpp:function:: Value Func::operator()(const Range& range, const Value& x) const Accumulates the result for a subrange, starting with initial value ``x``. The ``Range`` type must meet the `Range requirements `_. The ``Value`` type must be the same as a corresponding template parameter for the `parallel_reduce algorithm `_. If both ``rvalue`` and ``lvalue`` forms are provided, the ``rvalue`` is preferred. ParallelReduceReduction Requirements: Pseudo-Signature, Semantics ----------------------------------------------------------------- .. 
cpp:function:: Value Reduction::operator()(Value&& x, Value&& y) const or .. cpp:function:: Value Reduction::operator()(const Value& x, const Value& y) const Combines the ``x`` and ``y`` results. The ``Value`` type must be the same as a corresponding template parameter for the `parallel_reduce algorithm `_. If both ``rvalue`` and ``lvalue`` forms are provided, the ``rvalue`` is preferred. Example ******* .. literalinclude:: ./examples/rvalue_reduce.cpp :language: c++ :start-after: /*begin_rvalue_reduce_example*/ :end-before: /*end_rvalue_reduce_example*/ .. rubric:: See also * `oneapi::tbb::parallel_reduce specification `_ * `oneapi::tbb::parallel_deterministic_reduce specification `_ * `ParallelReduceFunc specification `_ * `ParallelReduceReduction specification `_ ================================================ FILE: third-party/tbb/doc/main/reference/rw_mutex_cls.rst ================================================ .. _rw_mutex: rw_mutex ============= .. note:: To enable this feature, define the ``TBB_PREVIEW_MUTEXES`` macro to 1. Description *********** A ``rw_mutex`` is a class that models the `ReaderWriterMutex requirement `_, using adaptive approach: the combination of spinlock and waiting on system primitives. The ``rw_mutex`` class satisfies all of the shared mutex requirements described in the [thread.sharedmutex.requirements] section of the ISO C++ standard. The ``rw_mutex`` class is unfair reader-writer lock with writer-preference. API *** Header ------ .. code:: cpp #include Synopsis -------- .. 
code:: cpp namespace oneapi { namespace tbb { class rw_mutex { public: rw_mutex() noexcept; ~rw_mutex(); rw_mutex(const rw_mutex&) = delete; rw_mutex& operator=(const rw_mutex&) = delete; class scoped_lock; // exclusive ownership void lock(); bool try_lock(); void unlock(); // shared ownership void lock_shared(); bool try_lock_shared(); void unlock_shared(); static constexpr bool is_rw_mutex = true; static constexpr bool is_recursive_mutex = false; static constexpr bool is_fair_mutex = false; }; } } Member classes -------------- .. namespace:: tbb::rw_mutex .. cpp:class:: scoped_lock The corresponding scoped-lock class. See the `ReaderWriterMutex requirement `_. Member functions ---------------- .. cpp:function:: rw_mutex() Constructs unlocked ``rw_mutex``. -------------------------------------------------- .. cpp:function:: ~rw_mutex() Destroys unlocked ``rw_mutex``. -------------------------------------------------- .. cpp:function:: void lock() Acquires a lock. It uses adaptive logic for waiting: it blocks after particular time period of busy wait. -------------------------------------------------- .. cpp:function:: bool try_lock() Tries to acquire a lock (non-blocking) on write. Returns **true** if succeeded; **false** otherwise. -------------------------------------------------- .. cpp:function:: void unlock() Releases the write lock held by the current thread. -------------------------------------------------- .. cpp:function:: void lock_shared() Acquires a lock on read. It uses adaptive logic for waiting: it blocks after particular time period of busy wait. -------------------------------------------------- .. cpp:function:: bool try_lock_shared() Tries to acquire the lock (non-blocking) on read. Returns **true** if succeeded; **false** otherwise. -------------------------------------------------- .. cpp:function:: void unlock_shared() Releases the read lock held by the current thread. 
================================================ FILE: third-party/tbb/doc/main/reference/scalable_memory_pools/fixed_pool_cls.rst ================================================ .. _fixed_pool_cls: fixed_pool ========== .. note:: To enable this feature, set the ``TBB_PREVIEW_MEMORY_POOL`` macro to 1. A class for scalable memory allocation from a buffer of fixed size. .. contents:: :local: :depth: 1 Description *********** ``fixed_pool`` allocates and frees memory in a way that scales with the number of processors. All the memory available for the allocation is initially passed through arguments of the constructor. ``fixed_pool`` meets the :doc:`Memory Pool named requirement<../scalable_memory_pools>`. API *** Header ------ .. code:: cpp #include "oneapi/tbb/memory_pool.h" Synopsis -------- .. code:: cpp namespace oneapi { namespace tbb { class fixed_pool { public: fixed_pool(void *buffer, size_t size); fixed_pool(const fixed_pool& other) = delete; fixed_pool& operator=(const fixed_pool& other) = delete; ~fixed_pool(); void recycle(); void* malloc(size_t size); void free(void* ptr); void* realloc(void* ptr, size_t size); }; } // namespace tbb } // namespace oneapi Member Functions ---------------- .. cpp:function:: fixed_pool(void *buffer, size_t size) **Effects**: Constructs a memory pool to manage the memory of size ``size`` pointed to by ``buffer``. Throws the ``bad_alloc`` exception if the library fails to construct an instance of the class. Examples ******** The code below provides a simple example of allocation from a fixed pool. .. literalinclude:: ../examples/fixed_pool_example.cpp :language: c++ :start-after: /*begin_fixed_pool_example*/ :end-before: /*end_fixed_pool_example*/ ================================================ FILE: third-party/tbb/doc/main/reference/scalable_memory_pools/malloc_replacement_log.rst ================================================ ..
_malloc_replacement_log: TBB_malloc_replacement_log Function =================================== .. note:: This function is for Windows* OS only. Summary ******* Provides information about the status of dynamic memory allocation replacement. Syntax ******* :: extern "C" int TBB_malloc_replacement_log(char *** log_ptr); Header ****** :: #include "oneapi/tbb/tbbmalloc_proxy.h" Description *********** Dynamic replacement of memory allocation functions on Windows* OS uses in-memory binary instrumentation techniques. To make sure that such instrumentation is safe, oneTBB first searches for a subset of replaced functions in the Visual C++* runtime DLLs and checks if each one has a known bytecode pattern. If any required function is not found or its bytecode pattern is unknown, the replacement is skipped, and the program continues to use the standard memory allocation functions. The ``TBB_malloc_replacement_log`` function allows the program to check if the dynamic memory replacement happens and to get a log of the performed checks. **Returns:** * 0, if all necessary functions are successfully found and the replacement takes place. * 1, otherwise. The ``log_ptr`` parameter must be an address of a char** variable or be ``NULL``. If it is not ``NULL``, the function writes there the address of an array of NULL-terminated strings containing detailed information about the searched functions in the following format: :: search_status: function_name (dll_name), byte pattern: For more information about the replacement of dynamic memory allocation functions, see :ref:`Windows_C_Dynamic_Memory_Interface_Replacement`. Example ******* .. 
literalinclude:: ../examples/malloc_replacement_log_example.cpp :language: c++ :start-after: /*begin_malloc_replacement_log_example*/ :end-before: /*end_malloc_replacement_log_example*/ Example output: :: tbbmalloc_proxy cannot replace memory allocation routines Success: free (ucrtbase.dll), byte pattern: Fail: _msize (ucrtbase.dll), byte pattern: ================================================ FILE: third-party/tbb/doc/main/reference/scalable_memory_pools/memory_pool_allocator_cls.rst ================================================ .. _memory_pool_allocator_cls: memory_pool_allocator ===================== .. note:: To enable this feature, set the ``TBB_PREVIEW_MEMORY_POOL`` macro to 1. A class template that provides a memory pool with a C++ allocator interface. .. contents:: :local: :depth: 1 Description *********** ``memory_pool_allocator`` meets the allocator requirements from the [allocator.requirements] ISO C++ Standard section. It also provides a constructor to allocate and deallocate memory. This constructor is linked with an instance of either the ``memory_pool`` or the ``fixed_pool`` class. The class is mainly intended for enabling memory pools within STL containers. API *** Header ------ .. code:: cpp #include "oneapi/tbb/memory_pool.h" Synopsis -------- ..
code:: cpp namespace oneapi { namespace tbb { template class memory_pool_allocator { public: using value_type = T; using pointer = value_type*; using const_pointer = const value_type*; using reference = value_type&; using const_reference = const value_type&; using size_type = size_t; using difference_type = ptrdiff_t; template struct rebind { using other = memory_pool_allocator; }; explicit memory_pool_allocator(memory_pool &pool) throw(); explicit memory_pool_allocator(fixed_pool &pool) throw(); memory_pool_allocator(const memory_pool_allocator& src) throw(); template memory_pool_allocator(const memory_pool_allocator& src) throw(); pointer address(reference x) const; const_pointer address(const_reference x) const; pointer allocate(size_type n, const void* hint=0); void deallocate(pointer p, size_type); size_type max_size() const throw(); void construct(pointer p, const T& value); void destroy(pointer p); }; template<> class memory_pool_allocator { public: using pointer = void*; using const_pointer = const void*; using value_type = void; template struct rebind { using other = memory_pool_allocator; }; memory_pool_allocator(memory_pool &pool) throw(); memory_pool_allocator(fixed_pool &pool) throw(); memory_pool_allocator(const memory_pool_allocator& src) throw(); template memory_pool_allocator(const memory_pool_allocator& src) throw(); }; } // namespace tbb } // namespace oneapi template inline bool operator==( const memory_pool_allocator& a, const memory_pool_allocator& b); template inline bool operator!=( const memory_pool_allocator& a, const memory_pool_allocator& b); Member Functions ---------------- .. cpp:function:: explicit memory_pool_allocator(memory_pool &pool) **Effects**: Constructs a memory pool allocator serviced by ``memory_pool`` instance pool. ------------------------------------------------------- .. 
cpp:function:: explicit memory_pool_allocator(fixed_pool &pool) **Effects**: Constructs a memory pool allocator serviced by ``fixed_pool`` instance pool. Examples ******** The code below provides a simple example of container construction with the use of a memory pool. .. literalinclude:: ../examples/memory_pool_allocator_example.cpp :language: c++ :start-after: /*begin_memory_pool_allocator_example*/ :end-before: /*end_memory_pool_allocator_example*/ ================================================ FILE: third-party/tbb/doc/main/reference/scalable_memory_pools/memory_pool_cls.rst ================================================ .. _memory_pool_cls: memory_pool =========== .. note:: To enable this feature, set the ``TBB_PREVIEW_MEMORY_POOL`` macro to 1. A class template for scalable memory allocation from memory blocks provided by an underlying allocator. .. contents:: :local: :depth: 1 Description *********** A ``memory_pool`` allocates and frees memory in a way that scales with the number of processors. The memory is obtained as big chunks from an underlying allocator specified by the template argument. The latter must satisfy the subset of the allocator requirements from the [allocator.requirements] ISO C++ Standard section. A ``memory_pool`` meets the :doc:`Memory Pool named requirement<../scalable_memory_pools>`. .. caution:: If the underlying allocator refers to another scalable memory pool, the inner pool (or pools) must be destroyed before the outer pool is destroyed or recycled. API *** Header ------ .. code:: cpp #include "oneapi/tbb/memory_pool.h" Synopsis -------- ..
code:: cpp namespace oneapi { namespace tbb { template class memory_pool { public: explicit memory_pool(const Alloc &src = Alloc()); memory_pool(const memory_pool& other) = delete; memory_pool& operator=(const memory_pool& other) = delete; ~memory_pool(); void recycle(); void *malloc(size_t size); void free(void* ptr); void *realloc(void* ptr, size_t size); }; } } Member Functions ---------------- .. cpp:function:: explicit memory_pool(const Alloc &src = Alloc()) **Effects**: Constructs a memory pool with an instance of underlying memory allocator of type ``Alloc`` copied from ``src``. Throws the ``bad_alloc`` exception if runtime fails to construct an instance of the class. Examples ******** The code below provides a simple example of allocation from an extensible memory pool. .. literalinclude:: ../examples/memory_pool_example.cpp :language: c++ :start-after: /*begin_memory_pool_example*/ :end-before: /*end_memory_pool_example*/ ================================================ FILE: third-party/tbb/doc/main/reference/scalable_memory_pools.rst ================================================ .. _scalable_memory_pools_reference: Scalable Memory Pools ===================== .. note:: To enable this feature, set the ``TBB_PREVIEW_MEMORY_POOL`` macro to 1. Memory pools allocate and free memory from a specified region or an underlying allocator using thread-safe, scalable operations. The following table summarizes the Memory Pool named requirement. Here, ``P`` represents an instance of the memory pool class. .. container:: tablenoborder .. list-table:: :header-rows: 1 * - Pseudo-Signature - Semantics * - \ ``~P() throw();`` - Destructor. Frees all the allocated memory. * - \ ``void P::recycle();`` - Frees all the allocated memory. * - \ ``void* P::malloc(size_t n);`` - Returns a pointer to ``n`` bytes allocated from the memory pool. * - \ ``void P::free(void* ptr);`` - Frees the memory object specified via ``ptr`` pointer. 
* - \ ``void* P::realloc(void* ptr, size_t n);`` - Reallocates the memory object pointed by ``ptr`` to ``n`` bytes. .. container:: section .. rubric:: Model Types :class: sectiontitle The ``memory_pool`` template class and the ``fixed_pool`` class meet the Memory Pool named requirement. .. toctree:: :titlesonly: scalable_memory_pools/memory_pool_cls scalable_memory_pools/fixed_pool_cls scalable_memory_pools/memory_pool_allocator_cls ================================================ FILE: third-party/tbb/doc/main/reference/task_group_extensions.rst ================================================ .. _task_group_extensions: task_group extensions ===================== .. note:: To enable these extensions, set the ``TBB_PREVIEW_TASK_GROUP_EXTENSIONS`` macro to 1. .. contents:: :local: :depth: 1 Description *********** |full_name| implementation extends the `tbb::task_group specification `_ with the requirements for a user-provided function object. API *** Header ------ .. code:: cpp #include Synopsis -------- .. code:: cpp namespace oneapi { namespace tbb { class task_group { public: //only the requirements for the return type of function F are changed template task_handle defer(F&& f); //only the requirements for the return type of function F are changed template task_group_status run_and_wait(const F& f); //only the requirements for the return type of function F are changed template void run(F&& f); }; } // namespace tbb } // namespace oneapi Member Functions ---------------- .. cpp:function:: template task_handle defer(F&& f) As an optimization hint, ``F`` might return a ``task_handle``, which task object can be executed next. .. note:: The ``task_handle`` returned by the function must be created using ``*this`` ``task_group``. That is, the one for which the run method is called, otherwise it is undefined behavior. .. 
cpp:function:: template task_group_status run_and_wait(const F& f) As an optimization hint, ``F`` might return a ``task_handle``, which task object can be executed next. .. note:: The ``task_handle`` returned by the function must be created using ``*this`` ``task_group``. That is, the one for which the run method is called, otherwise it is undefined behavior. .. cpp:function:: template void run(F&& f) As an optimization hint, ``F`` might return a ``task_handle``, which task object can be executed next. .. note:: The ``task_handle`` returned by the function must be created using ``*this`` ``task_group``. That is, the one for which the run method is called, otherwise it is undefined behavior. .. rubric:: See also * `oneapi::tbb::task_group specification `_ * `oneapi::tbb::task_group_context specification `_ * `oneapi::tbb::task_group_status specification `_ * `oneapi::tbb::task_handle class `_ ================================================ FILE: third-party/tbb/doc/main/reference/try_put_and_wait.rst ================================================ .. _try_put_and_wait: Waiting for Single Messages in Flow Graph ========================================= .. contents:: :local: :depth: 1 Description *********** This feature adds a new ``try_put_and_wait`` interface to the receiving nodes in the Flow Graph. This function puts a message as an input into a Flow Graph and waits until all work related to that message is complete. ``try_put_and_wait`` may reduce latency compared to calling ``graph::wait_for_all`` since ``graph::wait_for_all`` waits for all work, including work that is unrelated to the input message, to complete. ``node.try_put_and_wait(msg)`` performs ``node.try_put(msg)`` on the node and waits until the work on ``msg`` is completed. Therefore, the following conditions are true: * Any task initiated by any node in the Flow Graph that involves working with ``msg`` or any other intermediate result computed from ``msg`` is completed.
* No intermediate results computed from ``msg`` remain in any buffers in the graph. .. caution:: To prevent ``try_put_and_wait`` calls from infinite waiting, avoid using buffering nodes at the end of the Flow Graph since the final result will not be automatically consumed by the Flow Graph. .. caution:: The ``multifunction_node`` and ``async_node`` classes are not currently supported by this feature. Including one of these nodes in the Flow Graph may cause ``try_put_and_wait`` to exit early, even if the computations on the initial input message are still in progress. API *** Header ------ .. code:: cpp #define TBB_PREVIEW_FLOW_GRAPH_FEATURES // macro option 1 #define TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT // macro option 2 #include Synopsis -------- .. code:: cpp namespace oneapi { namespace tbb { template class continue_node { public: bool try_put_and_wait(const continue_msg& input); }; // class continue_node template class function_node { public: bool try_put_and_wait(const Input& input); }; // class function_node template class overwrite_node { public: bool try_put_and_wait(const T& input); }; // class overwrite_node template class write_once_node { public: bool try_put_and_wait(const T& input); }; // class write_once_node template class buffer_node { public: bool try_put_and_wait(const T& input); }; // class buffer_node template class queue_node { public: bool try_put_and_wait(const T& input); }; // class queue_node template > class priority_queue_node { public: bool try_put_and_wait(const T& input); }; // class priority_queue_node template class sequencer_node { public: bool try_put_and_wait(const T& input); }; // class sequencer_node template class limiter_node { public: bool try_put_and_wait(const T& input); }; // class limiter_node template class broadcast_node { public: bool try_put_and_wait(const T& input); }; // class broadcast_node template class split_node { public: bool try_put_and_wait(const TupleType& input); }; // class split_node } // namespace 
tbb } // namespace oneapi Member Functions ---------------- .. code:: cpp template bool continue_node::try_put_and_wait(const continue_msg& input) **Effects**: Increments the count of input signals received. If the incremented count is equal to the number of known predecessors, performs the ``body`` function object execution. Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and related to ``input`` are executed, and no related objects remain in any buffer within the graph. **Returns**: ``true``. .. code:: cpp template bool function_node::try_put_and_wait(const Input& input) **Effects**: If the concurrency limit allows, executes the user-provided body on the incoming message ``input``. Otherwise, depending on the ``Policy`` of the node, either queues the incoming message ``input`` or rejects it. Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and related to ``input`` are executed, and no related objects remain in any buffer within the graph. **Returns**: ``true`` if the input is accepted, ``false`` otherwise. .. code:: cpp template bool overwrite_node::try_put_and_wait(const T& input) **Effects**: Stores ``input`` in the internal single-item buffer and broadcasts it to all successors. Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and related to ``input`` are executed, and no related objects remain in any buffer within the graph. **Returns**: ``true``. .. caution:: Since the input element is not retrieved from ``overwrite_node`` once accepted by the successor, retrieve it by explicitly calling the ``clear()`` method or by overwriting with another element to prevent ``try_put_and_wait`` from indefinite waiting. .. code:: cpp template bool write_once_node::try_put_and_wait(const T& input) **Effects**: Stores ``input`` in the internal single-item buffer if it does not contain a valid value already. 
If a new value is set, the node broadcasts it to all successors. Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and related to ``input`` are executed, and no related objects remain in any buffer within the graph. **Returns**: ``true`` for the first time after construction or a call to ``clear()``. .. caution:: Since the input element is not retrieved from the ``write_once_node`` once accepted by the successor, retrieve it by explicitly calling the ``clear()`` method to prevent ``try_put_and_wait`` from indefinite waiting. .. code:: cpp template bool buffer_node::try_put_and_wait(const T& input) **Effects**: Adds ``input`` to the set of items managed by the node and tries forwarding it to a successor. Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and related to ``input`` are executed, and no related objects remain in any buffer within the graph. **Returns**: ``true``. .. code:: cpp template bool queue_node::try_put_and_wait(const T& input) **Effects**: Adds ``input`` to the set of items managed by the node and tries forwarding the least recently added item to a successor. Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and related to ``input`` are executed, and no related objects remain in any buffer within the graph. **Returns**: ``true``. .. code:: cpp template bool priority_queue_node::try_put_and_wait(const T& input) **Effects**: Adds ``input`` to the ``priority_queue_node`` and attempts to forward the item with the highest priority among all items added to the node but not yet forwarded to the successors. Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and related to ``input`` are executed, and no related objects remain in any buffer within the graph. **Returns**: ``true``. .. 
code:: cpp template bool sequencer_node::try_put_and_wait(const T& input) **Effects**: Adds ``input`` to the ``sequencer_node`` and tries forwarding the next item in sequence to a successor. Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and related to ``input`` are executed, and no related objects remain in any buffer within the graph. **Returns**: ``true``. .. code:: cpp template bool limiter_node::try_put_and_wait(const T& input) **Effects**: If the broadcast count is below the threshold, broadcasts ``input`` to all successors. Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and related to ``input`` are executed, and no related objects remain in any buffer within the graph. **Returns**: ``true`` if ``input`` is broadcasted; ``false`` otherwise. .. code:: cpp template bool broadcast_node::try_put_and_wait(const T& input) **Effects**: Broadcasts ``input`` to all successors. Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and related to ``input`` are executed, and no related objects remain in any buffer within the graph. **Returns**: ``true`` even if the node cannot successfully forward the message to any of its successors. .. code:: cpp template bool split_node::try_put_and_wait(const TupleType& input); **Effects**: Broadcasts each element in the incoming tuple to the nodes connected to the ``split_node`` output ports. The element at index ``i`` of ``input`` is broadcasted through the output port number ``i``. Waits for the completion of the ``input`` in the Flow Graph, meaning all tasks created by each node and related to ``input`` are executed, and no related objects remain in any buffer within the graph. **Returns**: ``true``. Example ******* .. 
literalinclude:: ./examples/try_put_and_wait_example.cpp :language: c++ :start-after: /*begin_try_put_and_wait_example*/ :end-before: /*end_try_put_and_wait_example*/ Each iteration of ``parallel_for`` submits an input into the Flow Graph. After returning from ``try_put_and_wait(input)``, it is guaranteed that all of the work related to the completion of ``input`` is done by all of the nodes in the graph. Tasks related to inputs submitted by other calls are not guaranteed to be completed. ================================================ FILE: third-party/tbb/doc/main/reference/type_specified_message_keys.rst ================================================ .. _class_join_node_extension: Type-specified message keys for join_node ========================================= .. note:: To enable this feature, define the ``TBB_PREVIEW_FLOW_GRAPH_FEATURES`` macro to 1. .. contents:: :local: :depth: 1 Description *********** The extension allows a key matching ``join_node`` to obtain keys via functions associated with its input types. The extension simplifies the existing approach by removing the need to provide a function object for each input port of ``join_node``. API *** Header ------ .. code:: cpp #include "oneapi/tbb/flow_graph.h" Syntax ------ The extension adds a special constructor to the ``join_node`` interface when the ``key_matching`` policy is used. The constructor has the following signature: .. code:: cpp join_node( graph &g ) When constructed this way, a ``join_node`` calls the ``key_from_message`` function for each incoming message to obtain the key associated with it. The default implementation of ``key_from_message`` is the following .. code:: cpp namespace oneapi { namespace tbb { namespace flow { template K key_from_message( const T &t ) { return t.key(); } } } } ``T`` is one of the user-provided types in ``OutputTuple`` and is used to construct the ``join_node``, and ``K`` is the key type of the node. 
By default, the ``key()`` method defined in the message class will be called. Alternatively, the user can define its own ``key_from_message`` function in the same namespace with the message type. This function will be found via C++ argument-dependent lookup and used in place of the default implementation. See Also ******** `join_node Specification `_ ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Advanced_Example.rst ================================================ .. _Advanced_Example: Advanced Example ================ An example of a more advanced associative operation is to find the index where ``Foo(i)`` is minimized. A serial version might look like this: :: long SerialMinIndexFoo( const float a[], size_t n ) { float value_of_min = FLT_MAX; // FLT_MAX from long index_of_min = -1; for( size_t i=0; i& r ) { const float *a = my_a; for( size_t i=r.begin(); i!=r.end(); ++i ) { float value = Foo(a[i]); if( value index_of_min(-1) {}   void join( const SumFoo& y ) { if( y.value_of_min index_of_min(-1), {} }; Now ``SerialMinIndex`` can be rewritten using ``parallel_reduce`` as shown below: :: long ParallelMinIndexFoo( float a[], size_t n ) { MinIndexFoo mif(a); parallel_reduce(blocked_range(0,n), mif ); return mif.index_of_min; } ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Advanced_Topic_Other_Kinds_of_Iteration_Spaces.rst ================================================ .. _Advanced_Topic_Other_Kinds_of_Iteration_Spaces: Advanced Topic: Other Kinds of Iteration Spaces =============================================== The examples so far have used the class ``blocked_range`` to specify ranges. This class is useful in many situations, but it does not fit every situation. You can use |full_name| to define your own iteration space objects. 
The object must specify how it can be split into subspaces by providing a basic splitting constructor, an optional proportional splitting constructor, and two predicate methods. If your class is called ``R``, the methods and constructors should be as follows: :: class R { // True if range is empty bool empty() const; // True if range can be split into non-empty subranges bool is_divisible() const; // Splits r into subranges r and *this R( R& r, split ); // (optional) Splits r into subranges r and *this in proportion p R( R& r, proportional_split p ); ... }; The method ``empty`` should return true if the range is empty. The method ``is_divisible`` should return true if the range can be split into two non-empty subspaces, and such a split is worth the overhead. The basic splitting constructor should take two arguments: - The first of type ``R`` - The second of type oneapi::tbb::split The second argument is not used; it serves only to distinguish the constructor from an ordinary copy constructor. The basic splitting constructor should attempt to split ``r`` roughly into two halves, and update ``r`` to be the first half, and set the constructed object as the second half. Unlike the basic splitting constructor, the proportional splitting constructor is optional and takes the second argument of type ``oneapi::tbb::proportional_split``. The type has methods ``left`` and ``right`` that return the values of the proportion. These values should be used to split ``r`` accordingly, so that the updated ``r`` corresponds to the left part of the proportion, and the constructed object corresponds to the right part. Both splitting constructors should guarantee that the updated ``r`` part and the constructed object are not empty. The parallel algorithm templates call the splitting constructors on ``r`` only if ``r.is_divisible`` is true. The iteration space does not have to be linear. Look at ``oneapi/tbb/blocked_range2d.h`` for an example of a range that is two-dimensional. 
Its splitting constructor attempts to split the range along its longest axis. When used with ``parallel_for``, it causes the loop to be "recursively blocked" in a way that improves cache usage. This nice cache behavior means that using ``parallel_for`` over a ``blocked_range2d`` can make a loop run faster than the sequential equivalent, even on a single processor. The ``blocked_range2d`` allows you to use different value types for its first dimension, *rows*, and the second one, *columns*. That means you can combine indexes, pointers, and iterators into a joint iteration space. Use the methods ``rows()`` and ``cols()`` to obtain ``blocked_range`` objects that represent the respective dimensions. The ``blocked_range3d`` class template extends this approach to 3D by adding ``pages()`` as the first dimension, followed by ``rows()`` and ``cols()``. The ``blocked_nd_range`` class template represents a blocked iteration space of any dimensionality. Unlike the previously described 2D and 3D ranges, ``blocked_nd_range`` uses the same value type for all its axes, and its constructor requires you to pass N instances of ``blocked_range`` instead of individual boundary values. The change in the naming pattern reflects these differences. Example of a Multidimensional Iteration Space ------------------------------------------------ The example demonstrates calculation of a 3-dimensional filter over the pack of feature maps. The ``convolution3d`` function iterates over the output cells, assigning to each cell the result of the ``kernel3d`` function that combines the values from a range in the feature maps. To run the computation in parallel, ``tbb::parallel_for`` is called with ``tbb::blocked_nd_range`` as an argument. The body function processes the received 3D subrange in nested loops, using the method ``dim`` to get the loop boundaries for each dimension. .. 
literalinclude:: ./examples/blocked_nd_range_example.cpp :language: c++ :start-after: /*begin_blocked_nd_range_example*/ :end-before: /*end_blocked_nd_range_example*/ ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Allocator_Configuration.rst ================================================ .. _Allocator_Configuration: Configuring the Memory Allocator ================================ The oneTBB memory allocator provides the following API functions and environment variables to configure its behavior: - the ``scalable_allocation_command`` function instructs the allocator to perform a certain action, such as cleaning up its internal memory buffers. - the ``scalable_allocation_mode`` function allows an application to set certain parameters for the memory allocator, such as an option to map memory in huge pages or define a recommended heap size. These settings take effect until modified by another call to ``scalable_allocation_mode``. Some of the memory allocator parameters can also be set via system environment variables. It can be useful to adjust the behavior without modifying application source code, to ensure that a setting takes effect as early as possible, or to avoid explicit dependency on the oneTBB allocator binaries. The following environment variables are recognized: - ``TBB_MALLOC_USE_HUGE_PAGES`` controls usage of huge pages for memory mapping. - ``TBB_MALLOC_SET_HUGE_OBJECT_THRESHOLD`` defines the lower bound for the size (bytes), that is interpreted as huge and not released during regular cleanup operations. These variables only take effect at the time the memory manager is initialized; later environment changes are ignored. A call to ``scalable_allocation_mode`` overrides the effect of the corresponding environment variable. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Automatic_Chunking.rst ================================================ .. 
_Automatic_Chunking: Automatic Chunking ================== A parallel loop construct incurs overhead cost for every chunk of work that it schedules. |full_name| chooses chunk sizes automatically, depending upon load balancing needs. The heuristic attempts to limit overheads while still providing ample opportunities for load balancing. .. CAUTION:: Typically a loop needs to take at least a million clock cycles to make it worth using ``parallel_for``. For example, a loop that takes at least 500 microseconds on a 2 GHz processor might benefit from ``parallel_for``. The default automatic chunking is recommended for most uses. As with most heuristics, however, there are situations where controlling the chunk size more precisely might yield better performance. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Automically_Replacing_malloc.rst ================================================ .. _Automically_Replacing_malloc: Automatically Replacing ``malloc`` and Other C/C++ Functions for Dynamic Memory Allocation ========================================================================================== On Windows\* and Linux\* operating systems, it is possible to automatically replace all calls to standard functions for dynamic memory allocation (such as ``malloc``) with the |full_name| scalable equivalents. Doing so can sometimes improve application performance. Replacements are provided by the proxy library (the library names can be found in platform-specific sections below). A proxy library and a scalable memory allocator library should be taken from the same release of oneTBB, otherwise the libraries may be mutually incompatible. ..
toctree:: :maxdepth: 4 ../tbb_userguide/Windows_C_Dynamic_Memory_Interface_Replacement ../tbb_userguide/Linux_C_Dynamic_Memory_Interface_Replacement ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Bandwidth_and_Cache_Affinity_os.rst ================================================ .. _Bandwidth_and_Cache_Affinity: Bandwidth and Cache Affinity ============================ For a sufficiently simple function ``Foo``, the examples might not show good speedup when written as parallel loops. The cause could be insufficient system bandwidth between the processors and memory. In that case, you may have to rethink your algorithm to take better advantage of cache. Restructuring to better utilize the cache usually benefits the parallel program as well as the serial program. An alternative to restructuring that works in some cases is ``affinity_partitioner.`` It not only automatically chooses the grainsize, but also optimizes for cache affinity and tries to distribute the data uniformly among threads. Using ``affinity_partitioner`` can significantly improve performance when: - The computation does a few operations per data access. - The data acted upon by the loop fits in cache. - The loop, or a similar loop, is re-executed over the same data. - There are more than two hardware threads available (and especially if the number of threads is not a power of two). If only two threads are available, the default scheduling in |full_name| usually provides sufficient cache affinity. The following code shows how to use ``affinity_partitioner``. :: #include "oneapi/tbb.h"   void ParallelApplyFoo( float a[], size_t n ) { static affinity_partitioner ap; parallel_for(blocked_range(0,n), ApplyFoo(a), ap); }   void TimeStepFoo( float a[], size_t n, int steps ) { for( int t=0; tcancel_group_execution()``. The part ``current_context()`` references the ``task_group_context*`` of the currently executing task if any on the current thread. 
Calling ``cancel_group_execution()`` cancels all tasks in its ``task_group_context``, which is explained in more detail in :ref:`Cancellation_and_Nested_Parallelism`. The method returns ``true`` if it actually causes cancellation, ``false`` if the ``task_group_context`` was already cancelled. The example below shows how to use ``current_context()->cancel_group_execution()``. :: #include "oneapi/tbb.h" #include #include   using namespace oneapi::tbb; using namespace std;   vector Data;   struct Update { void operator()( const blocked_range& r ) const { for( int i=r.begin(); i!=r.end(); ++i ) if( icancel_group_execution() ) cout << "Index " << i << " caused cancellation\n"; return; } } };   int main() { Data.resize(1000); parallel_for( blocked_range(0, 2000), Update()); return 0; } ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Cancellation_and_Nested_Parallelism.rst ================================================ .. _Cancellation_and_Nested_Parallelism: Cancellation and Nested Parallelism =================================== The discussion so far was simplified by assuming non-nested parallelism and skipping details of ``task_group_context``. This topic explains both. An |full_name| algorithm executes by creating ``task`` objects that execute the snippets of code that you supply to the algorithm template. By default, these ``task`` objects are associated with a ``task_group_context`` created by the algorithm. Nested oneTBB algorithms create a tree of these ``task_group_context`` objects. Cancelling a ``task_group_context`` cancels all of its child ``task_group_context`` objects, and transitively all its descendants. Hence an algorithm and all algorithms it called can be cancelled with a single request. Exceptions propagate upwards. Cancellation propagates downwards. The opposition interplays to cleanly stop a nested computation when an exception occurs. For example, consider the tree in the following figure. 
Imagine that each node represents an algorithm and its ``task_group_context``. .. container:: fignone :name: fig6 Tree of task_group_context |image0| Suppose that the algorithm in C throws an exception and no node catches the exception. oneTBB propagates the exception upwards, cancelling related subtrees downwards, as follows: #. Handle exception in C: a. Capture exception in C. b. Cancel tasks in C. c. Throw exception from C to B. #. Handle exception in B: a. Capture exception in B. b. Cancel tasks in B and, by downwards propagation, in D. c. Throw an exception out of B to A. #. Handle exception in A: a. Capture exception in A. b. Cancel tasks in A and, by downwards propagation, in E, F, and G. c. Throw an exception upwards out of A. If your code catches the exception at any level, then oneTBB does not propagate it any further. For example, an exception that does not escape outside the body of a ``parallel_for`` does not cause cancellation of other iterations. To prevent downwards propagation of cancellation into an algorithm, construct an 'isolated' ``task_group_context`` on the stack and pass it to the algorithm explicitly. The example uses C++11 lambda expressions for brevity. :: #include "oneapi/tbb.h"   bool Data[1000][1000];   int main() { try { parallel_for( 0, 1000, 1, []( int i ) { task_group_context root(task_group_context::isolated); parallel_for( 0, 1000, 1, []( int ) { Data[i][j] = true; }, root); throw "oops"; }); } catch(...) { } return 0; } The example performs two parallel loops: an outer loop over ``i`` and inner loop over ``j``. The creation of the isolated ``task_group_context`` ``root`` protects the inner loop from downwards propagation of cancellation from the ``i`` loop. When the exception propagates to the outer loop, any pending ``outer`` iterations are cancelled, but not inner iterations for an outer iteration that started. 
Hence when the program completes, each row of ``Data`` may be different, depending upon whether its iteration ``i`` ran at all, but within a row, the elements will be homogeneously ``false`` or ``true``, not a mixture. Removing the isolated ``task_group_context`` ``root`` (its declaration and its use as the last argument of the inner ``parallel_for``) would permit cancellation to propagate down into the inner loop. In that case, a row of ``Data`` might end up with both ``true`` and ``false`` values. .. |image0| image:: Images/image013.jpg :width: 261px :height: 131px ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Concurrent_Queue_Classes.rst ================================================ .. _Concurrent_Queue_Classes: Concurrent Queue Classes ======================== Template class ``concurrent_queue`` implements a concurrent queue with values of type ``T``. Multiple threads may simultaneously push and pop elements from the queue. The queue is unbounded and has no blocking operations. The fundamental operations on it are ``push`` and ``try_pop``. The ``push`` operation works just like ``push`` for a std::queue. The operation ``try_pop`` pops an item if it is available. The check and popping have to be done in a single operation for the sake of thread safety. For example, consider the following serial code: :: extern std::queue<T> MySerialQueue; T item; if( !MySerialQueue.empty() ) { item = MySerialQueue.front(); MySerialQueue.pop(); ... process item... } Even if each std::queue method were implemented in a thread-safe manner, the composition of those methods as shown in the example would not be thread safe if there were other threads also popping from the same queue. For example, ``MySerialQueue.empty()`` might return false just before another thread snatches the last item from ``MySerialQueue``. The equivalent thread-safe |full_name| code is: :: extern concurrent_queue<T> MyQueue; T item; if( MyQueue.try_pop(item) ) { ...process item... } In a single-threaded program, a queue is a first-in first-out structure.
But if multiple threads are pushing and popping concurrently, the definition of "first" is uncertain. Use of ``concurrent_queue`` guarantees that if a thread pushes two values, and another thread pops those two values, they will be popped in the same order that they were pushed. Template class ``concurrent_queue`` is unbounded and has no methods that wait. It is up to the user to provide synchronization to avoid overflow, or to wait for the queue to become non-empty. Typically this is appropriate when the synchronization has to be done at a higher level. Template class ``concurrent_bounded_queue`` is a variant that adds blocking operations and the ability to specify a capacity. The methods of particular interest on it are: - ``pop(item)`` waits until it can succeed. - ``push(item)`` waits until it can succeed without exceeding the queue's capacity. - ``try_push(item)`` pushes ``item`` only if it would not exceed the queue's capacity. - ``size()`` returns a *signed* integer. The value of ``concurrent_bounded_queue::size()`` is defined as the number of push operations started minus the number of pop operations started. If pops outnumber pushes, ``size()`` becomes negative. For example, if a ``concurrent_bounded_queue`` is empty, and there are ``n`` pending pop operations, ``size()`` returns -\ ``n``. This provides an easy way for producers to know how many consumers are waiting on the queue. Method ``empty()`` is defined to be true if and only if ``size()`` is not positive. By default, a ``concurrent_bounded_queue`` is unbounded. It may hold any number of values, until memory runs out. It can be bounded by setting the queue capacity with method ``set_capacity``. Setting the capacity causes ``push`` to block until there is room in the queue. Bounded queues are slower than unbounded queues, so if there is a constraint elsewhere in your program that prevents the queue from becoming too large, it is better not to set the capacity.
If you do not need the bounds or the blocking pop, consider using ``concurrent_queue`` instead. .. toctree:: :maxdepth: 4 ../tbb_userguide/Iterating_Over_a_Concurrent_Queue_for_Debugging ../tbb_userguide/When_Not_to_Use_Queues ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Constraints.rst ================================================ .. _Constraints: Constrained APIs ================ Starting from C++20, most |full_name| APIs are constrained to enforce `named requirements `_ on template argument types. Violations of these requirements are detected at compile time during template instantiation. .. rubric:: Example .. code:: cpp // Call for body(oneapi::tbb::blocked_range) is ill-formed // oneapi::tbb::parallel_for call results in constraint failure auto body = [](const int& r) { /*...*/ }; oneapi::tbb::parallel_for(oneapi::tbb::blocked_range{1, 10}, body); // Error example: // error: no matching function to call to oneapi::tbb::parallel_for // note: constraints not satisfied // note: the required expression 'body(range)' is invalid body(range); .. caution:: Code that violates named requirements but compiles successfully before C++20 may not compile in C++20 mode due to early and strict constraint diagnostics. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Containers.rst ================================================ .. _Containers: Containers ========== |full_name| provides highly concurrent container classes. These containers can be used with raw Windows\* OS or Linux\* OS threads, or in conjunction with task-based programming. A concurrent container allows multiple threads to concurrently access and update items in the container. Typical C++ STL containers do not permit concurrent update; attempts to modify them concurrently often result in corrupting the container.
STL containers can be wrapped in a mutex to make them safe for concurrent access, by letting only one thread operate on the container at a time, but that approach eliminates concurrency, thus restricting parallel speedup. Containers provided by oneTBB offer a much higher level of concurrency, via one or both of the following methods: - **Fine-grained locking:** Multiple threads operate on the container by locking only those portions they really need to lock. As long as different threads access different portions, they can proceed concurrently. - **Lock-free techniques:** Different threads account and correct for the effects of other interfering threads. Notice that highly-concurrent containers come at a cost. They typically have higher overheads than regular STL containers. Operations on highly-concurrent containers may take longer than for STL containers. Therefore, use highly-concurrent containers when the speedup from the additional concurrency that they enable outweighs their slower sequential performance. .. CAUTION:: As with most objects in C++, the constructor or destructor of a container object must not be invoked concurrently with another operation on the same object. Otherwise the resulting race may cause the operation to be executed on an undefined object. .. toctree:: :maxdepth: 4 ../tbb_userguide/concurrent_hash_map ../tbb_userguide/concurrent_vector_ug ../tbb_userguide/Concurrent_Queue_Classes ../tbb_userguide/Summary_of_Containers ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Controlling_Chunking_os.rst ================================================ .. _Controlling_Chunking: Controlling Chunking ==================== Chunking is controlled by a *partitioner* and a *grainsize.*\ To gain the most control over chunking, you specify both. - Specify ``simple_partitioner()`` as the third argument to ``parallel_for``. Doing so turns off automatic chunking. - Specify the grainsize when constructing the range. 
The three-argument form of the constructor is ``blocked_range<T>(begin,end,grainsize)``. The default value of ``grainsize`` is 1. It is in units of loop iterations per chunk. If the chunks are too small, the overhead may exceed the performance advantage. The following code is the last example from parallel_for, modified to use an explicit grainsize ``G``. :: #include "oneapi/tbb.h"   void ParallelApplyFoo( float a[], size_t n ) { parallel_for(blocked_range<size_t>(0,n,G), ApplyFoo(a), simple_partitioner()); } The grainsize sets a minimum threshold for parallelization. The ``parallel_for`` in the example invokes ``ApplyFoo::operator()`` on chunks, possibly of different sizes. Let *chunksize* be the number of iterations in a chunk. Using ``simple_partitioner`` guarantees that [G/2] <= *chunksize* <= G. There is also an intermediate level of control where you specify the grainsize for the range, but use an ``auto_partitioner`` or ``affinity_partitioner``. An ``auto_partitioner`` is the default partitioner. Both partitioners implement the automatic grainsize heuristic described in :ref:`Automatic_Chunking`. An ``affinity_partitioner`` implies an additional hint, as explained later in Section :ref:`Bandwidth_and_Cache_Affinity`. Though these partitioners may cause chunks to have more than G iterations, they never generate chunks with fewer than [G/2] iterations. Specifying a range with an explicit grainsize may occasionally be useful to prevent these partitioners from generating wastefully small chunks if their heuristics fail. Because of the impact of grainsize on parallel loops, it is worth reading the following material even if you rely on ``auto_partitioner`` and ``affinity_partitioner`` to choose the grainsize automatically. .. container:: tablenoborder .. list-table:: :header-rows: 1 * - |image0| - |image1| * - Case A - Case B The above figure illustrates the impact of grainsize by showing the useful work as the gray area inside a brown border that represents overhead.
Both Case A and Case B have the same total gray area. Case A shows how too small a grainsize leads to a relatively high proportion of overhead. Case B shows how a large grainsize reduces this proportion, at the cost of reducing potential parallelism. The overhead as a fraction of useful work depends upon the grainsize, not on the number of grains. Consider this relationship and not the total number of iterations or number of processors when setting a grainsize. A rule of thumb is that ``grainsize`` iterations of ``operator()`` should take at least 100,000 clock cycles to execute. For example, if a single iteration takes 100 clocks, then the ``grainsize`` needs to be at least 1000 iterations. When in doubt, do the following experiment: #. Set the ``grainsize`` parameter higher than necessary. The grainsize is specified in units of loop iterations. If you have no idea of how many clock cycles an iteration might take, start with ``grainsize``\ =100,000. The rationale is that each iteration normally requires at least one clock. In most cases, step 3 will guide you to a much smaller value. #. Run your algorithm. #. Iteratively halve the ``grainsize`` parameter and see how much the algorithm slows down or speeds up as the value decreases. A drawback of setting a grainsize too high is that it can reduce parallelism. For example, if the grainsize is 1000 and the loop has 2000 iterations, the ``parallel_for`` distributes the loop across only two processors, even if more are available. However, if you are unsure, err on the side of being a little too high instead of a little too low, because too low a value hurts serial performance, which in turn hurts parallel performance if there is other parallelism available higher up in the call tree. .. tip:: You do not have to set the grainsize too precisely.
The next figure shows the typical "bathtub curve" for execution time versus grainsize, based on the floating point ``a[i]=b[i]*c`` computation over a million indices. There is little work per iteration. The times were collected on a four-socket machine with eight hardware threads. .. container:: fignone :name: fig2 Wall Clock Time Versus Grainsize |image2| The scale is logarithmic. The downward slope on the left side indicates that with a grainsize of one, most of the overhead is parallel scheduling overhead, not useful work. An increase in grainsize brings a proportional decrease in parallel overhead. Then the curve flattens out because the parallel overhead becomes insignificant for a sufficiently large grainsize. At the end on the right, the curve turns up because the chunks are so large that there are fewer chunks than available hardware threads. Notice that a grainsize over the wide range 100-100,000 works quite well. .. tip:: A general rule of thumb for parallelizing loop nests is to parallelize the outermost one possible. The reason is that each iteration of an outer loop is likely to provide a bigger grain of work than an iteration of an inner loop. .. |image0| image:: Images/image002.jpg :width: 161px :height: 163px .. |image1| image:: Images/image004.jpg :width: 157px :height: 144px .. |image2| image:: Images/image006.jpg :width: 462px :height: 193px ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Cook_Until_Done_parallel_do.rst ================================================ .. _Cook_Until_Done_parallel_do: Cook Until Done: parallel_for_each ================================== For some loops, the end of the iteration space is not known in advance, or the loop body may add more iterations to do before the loop exits. You can deal with both situations using the template class ``oneapi::tbb::parallel_for_each``. A linked list is an example of an iteration space that is not known in advance. 
In parallel programming, it is usually better to use dynamic arrays instead of linked lists, because accessing items in a linked list is inherently serial. But if you are limited to linked lists, the items can be safely processed in parallel, and processing each item takes at least a few thousand instructions, you can use ``parallel_for_each`` to gain some parallelism. For example, consider the following serial code: :: void SerialApplyFooToList( const std::list<Item>& list ) { for( std::list<Item>::const_iterator i=list.begin(); i!=list.end(); ++i ) Foo(*i); } If ``Foo`` takes at least a few thousand instructions to run, you can get parallel speedup by converting the loop to use ``parallel_for_each``. To do so, define an object with a ``const`` qualified ``operator()``. This is similar to a C++ function object from the C++ standard header ``<functional>``, except that ``operator()`` must be ``const``. :: class ApplyFoo { public: void operator()( Item& item ) const { Foo(item); } }; The parallel form of ``SerialApplyFooToList`` is as follows: :: void ParallelApplyFooToList( const std::list<Item>& list ) { parallel_for_each( list.begin(), list.end(), ApplyFoo() ); } An invocation of ``parallel_for_each`` never causes two threads to act on an input iterator concurrently. Thus typical definitions of input iterators for sequential programs work correctly. This convenience makes ``parallel_for_each`` unscalable, because the fetching of work is serial. But in many situations, you still get useful speedup over doing things sequentially. There are two ways that ``parallel_for_each`` can acquire work scalably. - The iterators can be random-access iterators. - The body argument to ``parallel_for_each``, if it takes a second argument *feeder* of type ``feeder<Item>&``, can add more work by calling ``feeder.add(item)``. For example, suppose processing a node in a tree is a prerequisite to processing its descendants.
With ``parallel_for_each``, after processing a node, you could use ``feeder.add`` to add the descendant nodes. The instance of ``parallel_for_each`` does not terminate until all items have been processed. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Data_Flow_Graph.rst ================================================ .. _Data_Flow_Graph: Data Flow Graph =============== In a data flow graph, nodes are computations that send and receive data messages. Some nodes may only send messages, others may only receive messages, and others may send messages in response to messages that they receive. In the following data flow graph, the left-most node generates the integer values from 1 to 10 and passes them to two successor nodes. One of the successors squares each value it receives and passes the result downstream. The second successor cubes each value it receives and passes the result downstream. The right-most node receives values from both of the middle nodes. As it receives each value, it adds it to a running sum of values. When the application is run to completion, the value of sum will be equal to the sum of the sequence of squares and cubes from 1 to 10. .. container:: fignone :name: simple_data_flow_title Simple Data Flow Graph .. 
container:: imagecenter |image0| The following code snippet shows an implementation of the **Simple Data Flow Graph** shown above: :: int sum = 0; graph g; function_node< int, int > squarer( g, unlimited, [](const int &v) { return v*v; } ); function_node< int, int > cuber( g, unlimited, [](const int &v) { return v*v*v; } ); function_node< int, int > summer( g, 1, [&](const int &v ) -> int { return sum += v; } ); make_edge( squarer, summer ); make_edge( cuber, summer ); for ( int i = 1; i <= 10; ++i ) { squarer.try_put(i); cuber.try_put(i); } g.wait_for_all(); cout << "Sum is " << sum << "\n"; In the implementation above, the following function_nodes are created: - one to square values - one to cube values - one to add values to the global sum Since the squarer and cuber nodes are side-effect free, they are created with an unlimited concurrency. The summer node updates the sum through a reference to a global variable and therefore is not safe to execute in parallel. It is therefore created with a concurrency limit of 1. The node F from **Simple Data Flow Graph** above is implemented as a loop that puts messages to both the squarer and cuber node. A slight improvement over the first implementation is to introduce an additional node type, a ``broadcast_node``. A ``broadcast_node`` broadcasts any message it receives to all of its successors. This enables replacing the two ``try_put``'s in the loop with a single ``try_put``: :: broadcast_node b(g); make_edge( b, squarer ); make_edge( b, cuber ); for ( int i = 1; i <= 10; ++i ) { b.try_put(i); } g.wait_for_all(); An even better option, which will make the implementation even more like the **Simple Data Flow Graph** above, is to introduce an ``input_node``. An ``input_node``, as the name implies only sends messages and does not receive messages. 
Its constructor takes two arguments: :: template< typename Body > input_node( graph &g, Body body) The body is a function object, or lambda expression, that contains a function operator: :: Output Body::operator()( oneapi::tbb::flow_control &fc ); You can replace the loop in the example with an ``input_node`` :: input_node< int > src( g, src_body(10) ); make_edge( src, squarer ); make_edge( src, cuber ); src.activate(); g.wait_for_all(); The runtime library will repeatedly invoke the function operator in ``src_body`` until ``fc.stop()`` is invoked inside the body. You therefore need to create a body that will act like the body of the loop in the **Simple Data Flow Graph** above. The final implementation after all of these changes is shown below: :: class src_body { const int my_limit; int my_next_value; public: src_body(int l) : my_limit(l), my_next_value(1) {} int operator()( oneapi::tbb::flow_control& fc ) { if ( my_next_value <= my_limit ) { return my_next_value++; } else { fc.stop(); return int(); } } }; int main() { int sum = 0; graph g; function_node< int, int > squarer( g, unlimited, [](const int &v) { return v*v; } ); function_node< int, int > cuber( g, unlimited, [](const int &v) { return v*v*v; } ); function_node< int, int > summer( g, 1, [&](const int &v ) -> int { return sum += v; } ); make_edge( squarer, summer ); make_edge( cuber, summer ); input_node< int > src( g, src_body(10) ); make_edge( src, squarer ); make_edge( src, cuber ); src.activate(); g.wait_for_all(); cout << "Sum is " << sum << "\n"; } This final implementation has all of the nodes and edges from the **Simple Data Flow Graph** above. In this simple example, there is not much advantage in using an ``input_node`` over an explicit loop. But, because an ``input_node`` is able to react to the behavior of downstream nodes, it can limit memory use in more complex graphs. For more information, see :ref:`create_token_based_system`. ..
|image0| image:: Images/flow_graph.jpg ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Debug_Versus_Release_Libraries.rst ================================================ .. _Debug_Versus_Release_Libraries: Debug Versus Release Libraries ============================== The following table details the |full_name| dynamic shared libraries that come in debug and release versions. .. container:: tablenoborder .. list-table:: :header-rows: 1 * - Library - Description - When to Use * - | ``tbb_debug`` | ``tbbmalloc_debug`` | ``tbbmalloc_proxy_debug`` | ``tbbbind_debug`` - These versions have extensive internal checking for correct use of the library. - Use with code that is compiled with the macro ``TBB_USE_DEBUG`` set to 1. * - | ``tbb`` | ``tbbmalloc`` | ``tbbmalloc_proxy`` | ``tbbbind`` - These versions deliver top performance. They eliminate most checking for correct use of the library. - Use with code compiled with ``TBB_USE_DEBUG`` undefined or set to zero. .. tip:: Test your programs with the debug versions of the libraries first, to assure that you are using the library correctly.  With the release versions, incorrect usage may result in unpredictable program behavior. oneTBB supports Intel® Inspector, Intel® VTune™ Profiler and Intel® Advisor. Full support of these tools requires compiling with macro ``TBB_USE_PROFILING_TOOLS=1``. That symbol defaults to 1 in the following conditions: - When ``TBB_USE_DEBUG=1``. - On the Microsoft Windows\* operating system, when ``_DEBUG=1``. The :ref:`reference` section explains the default values in more detail. .. CAUTION:: The instrumentation support for Intel® Inspector becomes live after the first initialization of the task library. If the library components are used before this initialization occurs, Intel® Inspector may falsely report race conditions that are not really races. 
================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Dependence_Graph.rst ================================================ .. _Dependence_Graph: Dependence Graph ================ In a dependence graph, the nodes invoke body objects to perform computations and the edges create a partial ordering of these computations. At runtime, the library spawns and schedules tasks to execute the body objects when it is legal to do so according to the specified partial ordering. The following figure shows an example of an application that could be expressed using a dependence graph. .. container:: fignone :name: dependence_graph_make_sandwitch Dependence Graph for Making a Sandwich .. container:: imagecenter |image0| Dependence graphs are a special case of data flow graphs, where the data passed between nodes are of type ``oneapi::tbb::flow::continue_msg``. Unlike a general data flow graph, nodes in a dependence graph do not spawn a task for each message they receive. Instead, they are aware of the number of predecessors they have, count the messages they receive and only spawn a task to execute their body when this count is equal to the total number of their predecessors. The following figure shows another example of a dependence graph. It has the same topology as the figure above, but with simple functions replacing the sandwich making steps. In this partial ordering, function A must complete executing before any other computation starts executing. Function B must complete before C and D start executing; and E must complete before D and F start executing. This is a partial ordering because, for example, there is no explicit ordering requirement between B and E or C and F. .. container:: fignone :name: simple_dependence_graph Simple Dependence Graph .. container:: imagecenter |image1| To implement this as a flow graph, continue_node objects are used for the nodes and ``continue_msg`` objects as the messages. 
A continue_node constructor takes two arguments: :: template< typename Body > continue_node( graph &g, Body body) The first argument is the graph it belongs to and the second is a function object or lambda expression. Unlike a ``function_node``, a ``continue_node`` is always assumed to have unlimited concurrency and will immediately spawn a task whenever its dependencies are met. The following code snippet is an implementation of the example in this figure. :: typedef continue_node< continue_msg > node_t; typedef const continue_msg & msg_t; int main() { oneapi::tbb::flow::graph g; node_t A(g, [](msg_t){ a(); } ); node_t B(g, [](msg_t){ b(); } ); node_t C(g, [](msg_t){ c(); } ); node_t D(g, [](msg_t){ d(); } ); node_t E(g, [](msg_t){ e(); } ); node_t F(g, [](msg_t){ f(); } ); make_edge(A, B); make_edge(B, C); make_edge(B, D); make_edge(A, E); make_edge(E, D); make_edge(E, F); A.try_put( continue_msg() ); g.wait_for_all(); return 0; } One possible execution of this graph is shown below. The execution of D does not start until both B and E are finished. While a task is waiting in the ``wait_for_all``, its thread can participate in executing other tasks from the oneTBB work pool. .. container:: fignone Execution Timeline for a Dependence Graph .. container:: imagecenter |image2| Again, it is important to note that all execution in the flow graph happens asynchronously. The call to ``A.try_put`` returns control to the calling thread quickly, after incrementing the counter and spawning a task to execute the body of A. Likewise, the body tasks execute the lambda expressions and then put a continue_msg to all successor nodes, if any. Only the call to ``wait_for_all`` blocks, as it should, and even in this case the calling thread may be used to execute tasks from the oneTBB work pool while it is waiting. The above timeline shows the sequence when there are enough threads to execute all of the tasks that can be executed concurrently in parallel. 
If there are fewer threads, then some tasks that are spawned will need to wait until a thread is available to execute them. .. |image0| image:: Images/flow_graph_complex.jpg :width: 440px :height: 337px .. |image1| image:: Images/dependence_graph.jpg .. |image2| image:: Images/execution_timeline_dependence.jpg ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Edges.rst ================================================ .. _Edges: Flow Graph Basics: Edges ======================== Most applications contain multiple nodes with edges connecting them to each other. In the flow graph interface, edges are directed channels over which messages are passed. They are created by calling the function ``make_edge( p, s )`` with two arguments: ``p``, the predecessor, and ``s``, the successor. You can modify the example used in the :ref:`Nodes` topic to include a second node that squares the value it receives before printing it and then connect that to the first node with an edge. :: graph g; function_node< int, int > n( g, unlimited, []( int v ) -> int { cout << v; spin_for( v ); cout << v; return v; } ); function_node< int, int > m( g, 1, []( int v ) -> int { v *= v; cout << v; spin_for( v ); cout << v; return v; } ); make_edge( n, m ); n.try_put( 1 ); n.try_put( 2 ); n.try_put( 3 ); g.wait_for_all(); Now there are two ``function_node``\ s, ``n`` and ``m``. The call to ``make_edge`` creates an edge from ``n`` to ``m``. The node ``n`` is created with unlimited concurrency, while ``m`` has a concurrency limit of 1. The invocations of ``n`` can all proceed in parallel, while the invocations of ``m`` will be serialized. Because there is an edge from ``n`` to ``m``, each value ``v``, returned by ``n``, will be automatically passed to node ``m`` by the runtime library.
================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Exceptions_and_Cancellation.rst ================================================ .. _Exceptions_and_Cancellation: Exceptions and Cancellation =========================== |full_name| supports exceptions and cancellation. When code inside a oneTBB algorithm throws an exception, the following steps generally occur: #. The exception is captured. Any further exceptions inside the algorithm are ignored. #. The algorithm is cancelled. Pending iterations are not executed. If there is oneTBB parallelism nested inside, the nested parallelism may also be cancelled as explained in :ref:`Cancellation_and_Nested_Parallelism`. #. Once all parts of the algorithm stop, an exception is thrown on the thread that invoked the algorithm. As compilers evolve to support this functionality, future versions of oneTBB might throw the original exception. So be sure your code can catch either type of exception. The following example demonstrates exception handling: :: #include "oneapi/tbb.h" #include <vector> #include <iostream>   using namespace oneapi::tbb; using namespace std;   vector<int> Data;   struct Update { void operator()( const blocked_range<int>& r ) const { for( int i=r.begin(); i!=r.end(); ++i ) Data.at(i) += 1; } };   int main() { Data.resize(1000); try { parallel_for( blocked_range<int>(0, 2000), Update()); } catch( out_of_range& ex ) { cout << "out_of_range: " << ex.what() << endl; } return 0; } The ``parallel_for`` attempts to iterate over 2000 elements of a vector with only 1000 elements. Hence the expression ``Data.at(i)`` sometimes throws an exception ``std::out_of_range`` during execution of the algorithm. When the exception happens, the algorithm is cancelled and an exception is thrown at the call site to ``parallel_for``. ..
toctree:: :maxdepth: 4 ../tbb_userguide/Cancellation_Without_An_Exception ../tbb_userguide/Cancellation_and_Nested_Parallelism ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Floating_Point_Settings.rst ================================================ .. _Floating_Point_Settings: Floating-point Settings ======================= To propagate CPU-specific settings for floating-point computations to tasks executed by the task scheduler, you can use one of the following two methods: * When a ``task_arena`` or a task scheduler for a given application thread is initialized, they capture the current floating-point settings of the thread. * The ``task_group_context`` class has a method to capture the current floating-point settings. By default, worker threads use floating-point settings obtained during the initialization of a ``task_arena`` or the implicit arena of the application thread. The settings are applied to all computations within that ``task_arena`` or started by that application thread. For better control over floating point behavior, a thread may capture the current settings in a task group context. Do it at context creation with a special flag passed to the constructor: :: task_group_context ctx( task_group_context::isolated, task_group_context::default_traits | task_group_context::fp_settings ); Or call the ``capture_fp_settings`` method: :: task_group_context ctx; ctx.capture_fp_settings(); You can then pass the task group context to most parallel algorithms, including ``flow::graph``, to ensure that all tasks related to this algorithm use the specified floating-point settings. It is possible to execute the parallel algorithms with different floating-point settings captured to separate contexts, even at the same time. Floating-point settings captured to a task group context prevail over the settings captured during task scheduler initialization. 
It means, if a context is passed to a parallel algorithm, the floating-point settings captured to the context are used. Otherwise, if floating-point settings are not captured to the context, or a context is not explicitly specified, the settings captured during the task arena initialization are used. In a nested call to a parallel algorithm that does not use the context of a task group with explicitly captured floating-point settings, the outer-level settings are used. If none of the outer-level contexts capture floating-point settings, the settings captured during task arena initialization are used. It guarantees that: * Floating-point settings are applied to all tasks executed within a task arena, if they are captured: * To a task group context. * During the arena initialization. * A call to a oneTBB parallel algorithm does not change the floating-point settings of the calling thread, even if the algorithm uses different settings. .. note:: The guarantees above apply only to the following conditions: * A user code inside a task should: * Not change the floating-point settings. * Revert any modifications. * Restore previous settings before the end of the task. * oneTBB task scheduler observers are not used to set or modify floating point settings. Otherwise, the stated guarantees are not valid and the behavior related to floating-point settings is undefined. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Flow-Graph-exception-tips.rst ================================================ .. _Flow_Graph_exception_tips: Flow Graph Tips for Exception Handling and Cancellation ======================================================= The execution of a flow graph can be canceled directly or as a result of an exception that propagates beyond a node's body. You can then optionally reset the graph so that it can be re-executed. .. 
toctree:: :maxdepth: 4 ../tbb_userguide/catching_exceptions ../tbb_userguide/cancel_a_graph ../tbb_userguide/use_graph_reset ../tbb_userguide/cancelling_nested_parallelism ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Flow-Graph-waiting-tips.rst ================================================ .. _Flow_Graph_waiting_tips: Flow Graph Tips for Waiting for and Destroying a Flow Graph =========================================================== .. toctree:: :maxdepth: 4 ../tbb_userguide/always_use_wait_for_all ../tbb_userguide/avoid_dynamic_node_removal ../tbb_userguide/destroy_graphs_outside_main_thread ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Flow_Graph.rst ================================================ .. _Flow_Graph: Parallelizing Data Flow and Dependence Graphs ============================================= .. toctree:: :maxdepth: 4 ../tbb_userguide/Parallelizing_Flow_Graph ../tbb_userguide/Basic_Flow_Graph_concepts ../tbb_userguide/Graph_Main_Categories ../tbb_userguide/Predefined_Node_Types ../tbb_userguide/Flow_Graph_Tips ../tbb_userguide/estimate_flow_graph_performance ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Flow_Graph_Buffering_in_Nodes.rst ================================================ .. _Flow_Graph_Buffering_in_Nodes: Flow Graph Basics: Buffering and Forwarding =========================================== |full_name| flow graph nodes use messages to communicate data and to enforce dependencies. If a node passes a message successfully to any successor, no further action is taken with the message by that node. As noted in the section on Single-push vs. Broadcast-push, a message may be passed to one or to multiple successors, depending on the type of the node, how many successors are connected to the node, and whether the message is pushed or pulled. 
There are times when a node cannot successfully push a message to any successor. In this case what happens to the message depends on the type of the node. The two possibilities are: - The node stores the message to be forwarded later. - The node discards the message. If a node discards messages that are not forwarded, and this behavior is not desired, the node should be connected to a buffering node that does store messages that cannot be pushed. If a message has been stored by a node, there are two ways it can be passed to another node: - A successor to the node can pull the message using ``try_get()`` or ``try_reserve()``. - A successor can be connected using ``make_edge()``. If a ``try_get()`` successfully forwards a message, it is removed from the node that stored it. If a node is connected using ``make_edge``, the node will attempt to push a stored message to the new successor. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Flow_Graph_Message_Passing_Protocol.rst ================================================ .. _Flow_Graph_Message_Passing_Protocol: Flow Graph Basics: Message Passing Protocol =========================================== |full_name| flow graph operates by passing messages between nodes. A node may not be able to receive and process a message from its predecessor. For the graph to operate efficiently when this occurs, the edge between the nodes can switch from push mode to pull mode, so that when the successor is able to handle a message, it can query its predecessor to see whether a message is available. If the edge did not reverse from push to pull, the predecessor node would have to repeatedly attempt to forward its message until the successor accepts it. This would consume resources needlessly. Once the edge is in pull mode, when the successor is not busy, it will try to pull a message from a predecessor. #.
If a predecessor has a message, the successor will process it and the edge will remain in pull mode. #. If the predecessor has no message, the edge between the nodes will switch from pull to push mode. The state diagram of this Push-Pull protocol is: .. container:: fignone :name: basic_push_pull **The dynamic push / pull protocol.** .. container:: imagecenter |image0| .. |image0| image:: Images/flow_graph_message_passing_protocol.jpg :width: 442px :height: 196px ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Flow_Graph_Reservation.rst ================================================ .. _Flow_Graph_Reservation: Flow Graph Basics: Reservation ============================== |full_name| flow graph ``join_node`` has four possible policies: ``queueing``, ``reserving``, ``key_matching`` and ``tag_matching``. ``join_nodes`` need messages at every input before they can create an output message. The reserving ``join_node`` does not have internal buffering, and it does not pull messages from its inputs until it has a message at each input. To create an output message it temporarily reserves a message at each input port, and only if all input ports succeed reserving messages will an output message be created. If any input port fails to reserve a message, no message will be pulled by the ``join_node``. To support the reserving ``join_node`` some nodes support **reservation** of their outputs. The way reservation works is: - When a node connected to a reserving ``join_node`` in push state tries to push a message, the ``join_node`` always rejects the push and the edge connecting the nodes is switched to pull mode. - The reserving input port calls ``try_reserve`` on each edge in pull state. This may fail; if so, the reserving input port switches that edge to push state, and tries to reserve the next node connected by an edge in pull state. 
While the input port's predecessor is in reserved state, no other node can retrieve the reserved value. - If each input port successfully reserves an edge in pull state, the reserving ``join_node`` will create a message using the reserved messages and try to push the resulting message to any nodes connected to it. - If the message is successfully pushed to a successor, the predecessors that were reserved are signaled that the messages were used (by calling ``try_consume()``.) Those messages will be discarded by the predecessor nodes, because they have been successfully pushed. - If the message was not successfully pushed to any successor, the predecessors that were reserved are signaled that the messages were not used (by calling ``try_release()``.) At this point, the messages may be pushed to or pulled by other nodes. Because the reserving ``join_node`` will only attempt to push when each input port has at least one edge in a pull state, and will only attempt to create and push a message if all input ports succeed reserving messages, at least one of the predecessors to each of the reserving ``join_node`` input ports must be reservable. The following example demonstrates a reserving ``join_node``'s behavior. ``buffer_nodes`` buffer their output, so they accept a switch of their output edge from push to pull mode. ``broadcast_nodes`` do not buffer messages and do not support ``try_get()`` or ``try_reserve()``. 
:: void run_example2() { // example for Flow_Graph_Reservation.xml graph g; broadcast_node bn(g); buffer_node buf1(g); buffer_node buf2(g); typedef join_node, reserving> join_type; join_type jn(g); buffer_node buf_out(g); join_type::output_type tuple_out; int icnt; // join_node predecessors are both reservable buffer_nodes make_edge(buf1,input_port<0>(jn)); make_edge(bn,input_port<0>(jn)); // attach a broadcast_node make_edge(buf2,input_port<1>(jn)); make_edge(jn, buf_out); bn.try_put(2); buf1.try_put(3); buf2.try_put(4); buf2.try_put(7); g.wait_for_all(); while (buf_out.try_get(tuple_out)) { printf("join_node output == (%d,%d)\n",get<0>(tuple_out), get<1>(tuple_out) ); } if(buf1.try_get(icnt)) printf("buf1 had %d\n", icnt); else printf("buf1 was empty\n"); if(buf2.try_get(icnt)) printf("buf2 had %d\n", icnt); else printf("buf2 was empty\n"); } In the example above, port 0 of the reserving ``join_node`` ``jn`` has two predecessors: a ``buffer_node`` ``buf1`` and a ``broadcast_node`` ``bn``. Port 1 of the ``join_node`` has one predecessor, ``buffer_node`` ``buf2``. .. container:: fignone :name: reserve_step1 .. container:: imagecenter |image0| We will discuss one possible execution sequence (the scheduling of tasks may differ slightly, but the end result will be the same.) :: bn.try_put(2); ``bn`` attempts to forward 2 to ``jn``. ``jn`` does not accept the value and the arc from ``bn`` to ``jn`` reverses. Because neither bn nor jn buffer messages, the message is dropped. Because not all the inputs to ``jn`` have available predecessors, ``jn`` does nothing further. .. CAUTION:: Any node which does not support reservation will not work correctly when attached to a reserving ``join_node``. This program demonstrates why this occurs; connecting non-reserving nodes to nodes requiring support for reservation is **not** recommended practice. .. container:: fignone :name: reserve_step2 .. 
container:: imagecenter |image1| :: buf1.try_put(3); ``buf1`` attempts to forward 3 to ``jn``. ``jn`` does not accept the value and the arc from ``buf1`` to ``jn`` reverses. Because not all the inputs to ``jn`` have available predecessors, ``jn`` does nothing further. .. container:: fignone :name: reserve_step3 .. container:: imagecenter |image2| :: buf2.try_put(4); ``buf2`` attempts to forward 4 to ``jn``. ``jn`` does not accept the value and the arc from ``buf2`` to ``jn`` reverses. Now both inputs of ``jn`` have predecessors, a task to build and forward a message from ``jn`` will be spawned. We assume that task is not yet executing. .. container:: fignone :name: reserve_step4 .. container:: imagecenter |image3| :: buf2.try_put(7); ``buf2`` has no successor (because the arc to ``jn`` is reversed,) so it stores the value 7. .. container:: fignone :name: reserve_step5 .. container:: imagecenter |image4| Now the task spawned to run ``jn`` runs. - ``jn`` tries to reserve ``bn``, which fails. The arc to ``bn`` switches back to the forward direction. - ``jn`` tries to reserve ``buf1``, which succeeds (reserved nodes are colored grey.) ``jn`` receives the value 3 from ``buf1``, but it remains in ``buf1`` (in case the attempt to forward a message from ``jn`` fails.) - ``jn`` tries to reserve ``buf2``, which succeeds. ``jn`` receives the value 4 from ``buf2``, but it remains in ``buf2``. - ``jn`` constructs the output message ``tuple<3,4>``. .. container:: fignone :name: reserve_step6 .. container:: imagecenter |image5| Now ``jn`` pushes its message to ``buf_out``, which accepts it. Because the push succeeded, ``jn`` signals ``buf1`` and ``buf2`` that the reserved values were used, and the buffers discard those values. Now ``jn`` attempts to reserve again. - No attempt to pull from ``bn`` is made, because the edge from ``bn`` to ``jn`` is in push state. - ``jn`` tries to reserve ``buf1``, which fails. The arc to ``buf1`` switches back to the forward direction. 
- ``jn`` does not try any further actions. .. container:: fignone :name: reserve_step7 .. container:: imagecenter |image6| No further activity occurs in the graph, and the ``wait_for_all()`` will complete. The output of this code is :: join_node output == (3,4) buf1 was empty buf2 had 7 .. |image0| image:: Images/flow_graph_reserve_buffers_1.png :width: 400px :height: 222px .. |image1| image:: Images/flow_graph_reserve_buffers_2.png :width: 400px :height: 222px .. |image2| image:: Images/flow_graph_reserve_buffers_3.png :width: 400px :height: 222px .. |image3| image:: Images/flow_graph_reserve_buffers_4.png :width: 400px :height: 222px .. |image4| image:: Images/flow_graph_reserve_buffers_5.png :width: 400px :height: 222px .. |image5| image:: Images/flow_graph_reserve_buffers_6.png :width: 400px :height: 222px .. |image6| image:: Images/flow_graph_reserve_buffers_7.png :width: 400px :height: 222px ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Flow_Graph_Single_Vs_Broadcast.rst ================================================ .. _Flow_Graph_Single_Vs_Broadcast: Flow Graph Basics: Single-push vs. Broadcast-push ================================================= Nodes in the |full_name| flow graph communicate by pushing and pulling messages. Two policies for pushing messages are used, depending on the type of the node: - **single-push**: No matter how many successors to the node exist and are able to accept a message, each message will be only sent to one successor. - **broadcast-push**: A message will be pushed to every successor which is connected to the node by an edge in push mode, and which accepts the message. 
The following code demonstrates this difference: :: using namespace oneapi::tbb::flow; std::atomic g_cnt; struct fn_body1 { std::atomic &body_cnt; fn_body1(std::atomic &b_cnt) : body_cnt(b_cnt) {} continue_msg operator()( continue_msg /*dont_care*/) { ++g_cnt; ++body_cnt; return continue_msg(); } }; void run_example1() { // example for Flow_Graph_Single_Vs_Broadcast.xml graph g; std::atomic b1; // local counts std::atomic b2; // for each function _node body std::atomic b3; // function_node f1(g,serial,fn_body1(b1)); function_node f2(g,serial,fn_body1(b2)); function_node f3(g,serial,fn_body1(b3)); buffer_node buf1(g); // // single-push policy // g_cnt = b1 = b2 = b3 = 0; make_edge(buf1,f1); make_edge(buf1,f2); make_edge(buf1,f3); buf1.try_put(continue_msg()); buf1.try_put(continue_msg()); buf1.try_put(continue_msg()); g.wait_for_all(); printf( "after single-push test, g_cnt == %d, b1==%d, b2==%d, b3==%d\n", (int)g_cnt, (int)b1, (int)b2, (int)b3); remove_edge(buf1,f1); remove_edge(buf1,f2); remove_edge(buf1,f3); // // broadcast-push policy // broadcast_node bn(g); g_cnt = b1 = b2 = b3 = 0; make_edge(bn,f1); make_edge(bn,f2); make_edge(bn,f3); bn.try_put(continue_msg()); bn.try_put(continue_msg()); bn.try_put(continue_msg()); g.wait_for_all(); printf( "after broadcast-push test, g_cnt == %d, b1==%d, b2==%d, b3==%d\n", (int)g_cnt, (int)b1, (int)b2, (int)b3); } The output of this code is :: after single-push test, g_cnt == 3, b1==3, b2==0, b3==0 after broadcast-push test, g_cnt == 9, b1==3, b2==3, b3==3 The single-push test uses a ``buffer_node``, which has a "single-push" policy for forwarding messages. Putting three messages to the ``buffer_node`` results in three messages being pushed. Notice also only the first ``function_node`` is sent to; in general there is no policy for which node is pushed to if more than one successor can accept. The broadcast-push test uses a ``broadcast_node``, which will push any message it receives to all accepting successors. 
Putting three messages to the ``broadcast_node`` results in a total of nine messages pushed to the ``function_nodes``. Only nodes designed to buffer (hold and forward received messages) have a "single-push" policy; all other nodes have a "broadcast-push" policy. Please see the :ref:`broadcast_or_send` section of :ref:`Flow_Graph_Tips`, and :ref:`Flow_Graph_Buffering_in_Nodes` for more information. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Flow_Graph_Tips.rst ================================================ .. _Flow_Graph_Tips: Flow Graph Tips and Tricks ========================== .. toctree:: :maxdepth: 4 ../tbb_userguide/Flow-Graph-waiting-tips ../tbb_userguide/Flow_Graph_making_edges_tips ../tbb_userguide/Flow_Graph_nested_parallelism_tips ../tbb_userguide/Flow_Graph_resource_tips ../tbb_userguide/Flow-Graph-exception-tips ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Flow_Graph_exception_tips.rst ================================================ .. _Flow_Graph_exception_tips: Flow Graph Tips for Exception Handling and Cancellation ======================================================= The execution of a flow graph can be canceled directly or as a result of an exception that propagates beyond a node's body. You can then optionally reset the graph so that it can be re-executed. .. toctree:: :maxdepth: 4 ../tbb_userguide/catching_exceptions ../tbb_userguide/cancel_a_graph ../tbb_userguide/use_graph_reset ../tbb_userguide/cancelling_nested_parallelism ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Flow_Graph_making_edges_tips.rst ================================================ .. _Flow_Graph_making_edges_tips: Flow Graph Tips on Making Edges =============================== .. 
toctree:: :maxdepth: 4 ../tbb_userguide/use_make_edge ../tbb_userguide/broadcast_or_send ../tbb_userguide/communicate_with_nodes ../tbb_userguide/use_input_node ../tbb_userguide/avoiding_data_races ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Flow_Graph_nested_parallelism_tips.rst ================================================ .. _Flow_Graph_nested_parallelism_tips: Flow Graph Tips on Nested Parallelism ===================================== .. toctree:: :maxdepth: 4 ../tbb_userguide/use_nested_algorithms ../tbb_userguide/use_nested_flow_graphs ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Flow_Graph_resource_tips.rst ================================================ .. _Flow_Graph_resource_tips: Flow Graph Tips for Limiting Resource Consumption ================================================= You may want to control the number of messages allowed to enter parts of your graph, or control the maximum number of tasks in the work pool. There are several mechanisms available for limiting resource consumption in a flow graph. .. toctree:: :maxdepth: 4 ../tbb_userguide/use_limiter_node ../tbb_userguide/use_concurrency_limits ../tbb_userguide/create_token_based_system ../tbb_userguide/attach_flow_graph_to_arena ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Flow_Graph_waiting_tips.rst ================================================ .. _Flow_Graph_waiting_tips: Flow Graph Tips for Waiting for and Destroying a Flow Graph =========================================================== .. toctree:: :maxdepth: 4 ../tbb_userguide/always_use_wait_for_all ../tbb_userguide/avoid_dynamic_node_removal ../tbb_userguide/destroy_graphs_outside_main_thread ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Graph_Main_Categories.rst ================================================ .. 
_Graph_Main_Categories: Graph Application Categories ============================ Most flow graphs fall into one of two categories: - **Data flow graphs.** In this type of graph, data is passed along the graph's edges. The nodes receive, transform and then pass along the data messages. - **Dependence graphs.** In this type of graph, the data operated on by the nodes is obtained through shared memory directly and is not passed along the edges. .. toctree:: :maxdepth: 4 ../tbb_userguide/Data_Flow_Graph ../tbb_userguide/Dependence_Graph ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Graph_Object.rst ================================================ .. _Graph_Object: Flow Graph Basics: Graph Object =============================== Conceptually a flow graph is a collection of nodes and edges. Each node belongs to exactly one graph and edges are made only between nodes in the same graph. In the flow graph interface, a graph object represents this collection of nodes and edges, and is used for invoking whole graph operations such as waiting for all tasks related to the graph to complete, resetting the state of all nodes in the graph, and canceling the execution of all nodes in the graph. The code below creates a graph object and then waits for all tasks spawned by the graph to complete. The call to ``wait_for_all`` in this example returns immediately since this is a trivial graph with no nodes or edges, and therefore no tasks are spawned. :: graph g; g.wait_for_all(); The graph object does not own the nodes associated with it. You need to make sure that the graph object's lifetime is longer than the lifetimes of all nodes added to the graph and any activity associated with the graph. .. tip:: Call ``wait_for_all`` on a graph object before destroying it to make sure all activities are complete. 
Even when using smart pointers, be aware of the order of destruction for nodes and the graph to make sure that nodes are not deleted before the graph. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Guiding_Task_Scheduler_Execution.rst ================================================ .. _guiding_task_scheduler_execution: Guiding Task Scheduler Execution ================================ By default, the task scheduler tries to use all available computing resources. In some cases, you may want to configure the task scheduler to use only some of them. .. caution:: Guiding the execution of the task scheduler may cause composability issues. |full_name| provides the ``task_arena`` interface to guide tasks execution within the arena by: - setting the preferred computation units; - restricting part of computation units. Such customizations are encapsulated within the ``task_arena::constraints`` structure. To set the limitation, you have to customize the ``task_arena::constraints`` and then pass it to the ``task_arena`` instance during the construction or initialization. The structure ``task_arena::constraints`` allows to specify the following restrictions: - Preferred NUMA node - Preferred core type - The maximum number of logical threads scheduled per single core simultaneously - The level of ``task_arena`` concurrency You may use the interfaces from ``tbb::info`` namespace to construct the ``tbb::task_arena::constraints`` instance. Interfaces from ``tbb::info`` namespace respect the process affinity mask. For instance, if the process affinity mask excludes execution on some of the NUMA nodes, then these NUMA nodes are not returned by ``tbb::info::numa_nodes()`` interface. 
The following examples show how to use these interfaces: Set NUMA Node ************* The execution on systems with non-uniform memory access (`NUMA <https://en.wikipedia.org/wiki/Non-uniform_memory_access>`_ systems) may cause a performance penalty if threads from one NUMA node access the memory allocated on a different NUMA node. To reduce this overhead, the work may be divided among several ``task_arena`` instances, whose execution preference is set to different NUMA nodes. To set execution preference, assign a NUMA node identifier to the ``task_arena::constraints::numa_id`` field. :: std::vector<tbb::numa_node_id> numa_indexes = tbb::info::numa_nodes(); std::vector<tbb::task_arena> arenas(numa_indexes.size()); std::vector<tbb::task_group> task_groups(numa_indexes.size()); for(unsigned j = 0; j < numa_indexes.size(); j++) { arenas[j].initialize(tbb::task_arena::constraints(numa_indexes[j])); arenas[j].execute([&task_groups, &j](){ task_groups[j].run([](){/*some parallel stuff*/}); }); } for(unsigned j = 0; j < numa_indexes.size(); j++) { arenas[j].execute([&task_groups, &j](){ task_groups[j].wait(); }); } Set Core Type ************* The processors with Intel® Hybrid Technology contain several core types, each suited for a different purpose. In most cases, systems with hybrid CPU architecture show reasonable performance without involving additional API calls. However, in some exceptional scenarios, performance may be tuned by setting the preferred core type. To set the preferred core type for the execution, assign a specific core type identifier to the ``task_arena::constraints::core_type`` field.
The example shows how to set the most performant core type as preferable for work execution: :: std::vector<tbb::core_type_id> core_types = tbb::info::core_types(); tbb::task_arena arena( tbb::task_arena::constraints{}.set_core_type(core_types.back()) ); arena.execute( [] { /*the most performant core type is defined as preferred.*/ }); Limit The Maximum Number of Threads Simultaneously Scheduled To One Core **************************************************************************** The processors with Intel® Hyper-Threading Technology allow more than one thread to run on each core simultaneously. However, there might be situations when there is a need to lower the number of simultaneously running threads per core. In such cases, assign the desired value to the ``task_arena::constraints::max_threads_per_core`` field. The example shows how to allow only one thread to run on each core at a time: :: tbb::task_arena no_ht_arena( tbb::task_arena::constraints{}.set_max_threads_per_core(1) ); no_ht_arena.execute( [] { /*parallel work*/ }); A more composable way to limit the number of threads executing on cores is by setting the maximal concurrency of the ``tbb::task_arena``: :: int no_ht_concurrency = tbb::info::default_concurrency( tbb::task_arena::constraints{}.set_max_threads_per_core(1) ); tbb::task_arena arena( no_ht_concurrency ); arena.execute( [] { /*parallel work*/ }); Similarly to the previous example, the number of threads inside the arena is equal to the number of available cores. However, this one results in less overhead and better composability by imposing a less constrained execution. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/How_Task_Scheduler_Works.rst ================================================ ..
_How_Task_Scheduler_Works.rst: How Task Scheduler Works ======================== While the task scheduler is not bound to any particular type of parallelism, it was designed to work efficiently for fork-join parallelism with lots of forks. This type of parallelism is typical for parallel algorithms such as ``oneapi::tbb::parallel_for``. Let's consider the mapping of fork-join parallelism on the task scheduler in more detail. The scheduler runs tasks in a way that tries to achieve several targets simultaneously: - Enable as many threads as possible, by creating enough jobs, to achieve actual parallelism - Preserve data locality to make a single thread execution more efficient - Minimize both memory demands and cross-thread communication to reduce overhead To achieve this, a balance between depth-first and breadth-first execution strategies must be reached. Assuming that the task graph is finite, depth-first is better for a sequential execution because: - **Strike when the cache is hot**. The deepest tasks are the most recently created tasks and therefore are the hottest in the cache. Also, if they can be completed, tasks that depend on them can continue executing, and though not the hottest in a cache, they are still warmer than the older tasks deeper in the deque. - **Minimize space**. Execution of the shallowest task leads to the breadth-first unfolding of a graph. It creates an exponential number of nodes that co-exist simultaneously. In contrast, depth-first execution creates the same number of nodes, but only a linear number can exist at the same time, since it creates a stack of other ready tasks. Each thread has its deque of tasks that are ready to run. When a thread spawns a task, it pushes it onto the bottom of its deque. When a thread participates in the evaluation of tasks, it constantly executes a task obtained by the first rule that applies from the roughly equivalent ruleset: - Get the task returned by the previous one, if any.
- Take a task from the bottom of its deque, if any. - Steal a task from the top of another randomly chosen deque. If the selected deque is empty, the thread tries again to execute this rule until it succeeds. Rule 1 is described in :doc:`Task Scheduler Bypass <Task_Scheduler_Bypass>`. The overall effect of rule 2 is to execute the *youngest* task spawned by the thread, which causes the depth-first execution until the thread runs out of work. Then rule 3 applies. It steals the *oldest* task spawned by another thread, which causes temporary breadth-first execution that converts potential parallelism into actual parallelism. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Initializing_and_Terminating_the_Library.rst ================================================ .. _Initializing_and_Terminating_the_Library: Initializing and Terminating the Library ======================================== |full_name| automatically initializes the task scheduler. The initialization process is invoked when a thread uses task scheduling services the first time, for example any parallel algorithm, flow graph or task group. The termination happens when the last such thread exits. Explicit Library Finalization ***************************** oneTBB supports an explicit library termination as a preview feature. The ``oneapi::tbb::finalize`` function called with an instance of class ``oneapi::tbb::task_scheduler_handle`` blocks the calling thread until all worker threads implicitly created by the library have completed. If waiting for thread completion is not safe, for example, if it may result in a deadlock or if it is called inside a task, a parallel algorithm, or a flow graph node, the method fails. If you know how many active ``oneapi::tbb::task_scheduler_handle`` instances exist in the program, it is recommended to call ``oneapi::tbb::release`` function on all but the last one, then call ``oneapi::tbb::finalize`` for the last instance.
================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Iterating_Over_a_Concurrent_Queue_for_Debugging.rst ================================================ .. _Iterating_Over_a_Concurrent_Queue_for_Debugging: Iterating Over a Concurrent Queue for Debugging =============================================== The template classes ``concurrent_queue`` and ``concurrent_bounded_queue`` support STL-style iteration. This support is intended only for debugging, when you need to dump a queue. The iterators go forwards only, and are too slow to be very useful in production code. If a queue is modified, all iterators pointing to it become invalid and unsafe to use. The following snippet dumps a queue. The ``operator<<`` is defined for a ``Foo``. :: concurrent_queue<Foo> q; ... typedef concurrent_queue<Foo>::const_iterator iter; for(iter i(q.unsafe_begin()); i!=q.unsafe_end(); ++i ) { cout << *i; } The prefix ``unsafe_`` on the methods is a reminder that they are not concurrency safe. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Lambda_Expressions.rst ================================================ .. _Lambda_Expressions: Lambda Expressions ================== C++11 lambda expressions make the |full_name| ``parallel_for`` much easier to use. A lambda expression lets the compiler do the tedious work of creating a function object. Below is the example from the previous section, rewritten with a lambda expression. The lambda expression replaces both the declaration and construction of function object ``ApplyFoo`` in the example of the previous section. .. literalinclude:: ./examples/parallel_for_lambda_example_1.cpp :language: c++ :start-after: /*begin_parallel_for_lambda_1*/ :end-before: /*end_parallel_for_lambda_1*/ The [=] introduces the lambda expression. The expression creates a function object very similar to ``ApplyFoo``.
When local variables like ``a`` and ``n`` are declared outside the lambda expression, but used inside it, they are "captured" as fields inside the function object. The [=] specifies that capture is by value. Writing [&] instead would capture the values by reference. After the [=] is the parameter list and definition for the ``operator()`` of the generated function object. The compiler documentation says more about lambda expressions and other implemented C++11 features. It is worth reading more complete descriptions of lambda expressions than can fit here, because lambda expressions are a powerful feature for using template libraries in general. C++11 support is off by default in the compiler. The following table shows the option for turning it on. .. container:: tablenoborder .. list-table:: :header-rows: 1 * - Environment - Intel® C++ Compiler Classic - Intel® oneAPI DPC++/C++ Compiler * - Windows\* OS systems - \ ``icl /Qstd=c++11 foo.cpp`` - \ ``icx /Qstd=c++11 foo.cpp`` * - Linux\* OS systems - \ ``icc -std=c++11 foo.cpp`` - \ ``icx -std=c++11 foo.cpp`` For further compactness, oneTBB has a form of ``parallel_for`` expressly for parallel looping over a consecutive range of integers. The expression ``parallel_for(first,last,step,f)`` is like writing ``for(auto i=first; i* to indicate the top-level installation directory. The following table describes the subdirectory structure for Linux\*, relative to ** .. container:: tablenoborder .. list-table:: :header-rows: 1 * - Item - Location - Environment Variable * - Header files - | ``include/oneapi/tbb.h`` | ``include/oneapi/tbb/*.h`` - ``CPATH`` * - Shared libraries - ``lib//.so.`` - | ``LIBRARY_PATH`` | ``LD_LIBRARY_PATH`` Where: * ```` - ``ia32`` or ``intel64`` .. note:: Starting with oneTBB 2022.0, 32-bit binaries are supported only by the open-source version of the library. 
* ``<lib>`` - ``libtbb``, ``libtbbmalloc``, ``libtbbmalloc_proxy`` or ``libtbbbind`` * ``<variant>`` - ``_debug`` or empty * ``<version>`` - binary version in a form of ``<major>.<minor>`` ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Lock_Pathologies.rst ================================================ .. _Lock_Pathologies: Lock Pathologies ================ Locks can introduce performance and correctness problems. If you are new to locking, here are some of the problems to avoid: .. container:: section .. rubric:: Deadlock :class: sectiontitle Deadlock happens when threads are trying to acquire more than one lock, and each holds some of the locks the other threads need to proceed. More precisely, deadlock happens when: - There is a cycle of threads - Each thread holds at least one lock on a mutex, and is waiting on a mutex for which the *next* thread in the cycle already has a lock. - No thread is willing to give up its lock. Think of classic gridlock at an intersection – each car has "acquired" part of the road, but needs to "acquire" the road under another car to get through. Three common ways to avoid deadlock are: - Avoid needing to hold two locks at the same time. Break your program into small actions in which each can be accomplished while holding a single lock. - Always acquire locks in the same order. For example, if you have "outer container" and "inner container" mutexes, and need to acquire a lock on one of each, you could always acquire the "outer container" one first. Another example is "acquire locks in alphabetical order" in a situation where the locks have names. Or if the locks are unnamed, acquire locks in order of the mutex’s numerical addresses. - Use atomic operations instead of locks. .. container:: section .. rubric:: Convoying :class: sectiontitle Another common problem with locks is *convoying*. Convoying occurs when the operating system interrupts a thread that is holding a lock.
All other threads must wait until the interrupted thread resumes and releases the lock. Fair mutexes can make the situation even worse, because if a waiting thread is interrupted, all the threads behind it must wait for it to resume. To minimize convoying, try to hold the lock as briefly as possible. Precompute whatever you can before acquiring the lock. To avoid convoying, use atomic operations instead of locks where possible. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Mac_OS.rst ================================================ .. _Mac_OS: macOS\* ======= This section uses *<install_dir>* to indicate the top-level installation directory. The following table describes the subdirectory structure for macOS\*, relative to *<install_dir>*. .. container:: tablenoborder .. list-table:: :header-rows: 1 * - Item - Location - Environment Variable * - Header files - | ``include/oneapi/tbb.h`` | ``include/oneapi/tbb/*.h`` - ``CPATH`` * - Shared libraries - ``lib/<lib><variant>.<version>.dylib`` - | ``LIBRARY_PATH`` | ``DYLD_LIBRARY_PATH`` where * ``<lib>`` - ``libtbb``, ``libtbbmalloc`` or ``libtbbmalloc_proxy`` * ``<variant>`` - ``_debug`` or empty * ``<version>`` - binary version in a form of ``<major>.<minor>`` ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Mapping_Nodes2Tasks.rst ================================================ .. _Mapping_Nodes2Tasks: Flow Graph Basics: Mapping Nodes to Tasks ========================================= The following figure shows the timeline for one possible execution of the two node graph example in the previous section. The bodies of n and m will be referred to as λ\ :sub:`n` and λ\ :sub:`m`, respectively. The three calls to try_put spawn three tasks; each one applies the lambda expression, λ\ :sub:`n`, on one of the three input messages. Because n has unlimited concurrency, these tasks can execute concurrently if there are enough threads available.
The call to ``g.wait_for_all()`` blocks until there are no tasks executing in the graph. As with other ``wait_for_all`` functions in oneTBB, the thread that calls ``wait_for_all`` is not spinning idly during this time, but instead can join in executing other tasks from the work pool. .. container:: fignone **Execution Timeline of a Two Node Graph** .. container:: imagecenter |image0| As each task from n finishes, it puts its output to m, since m is a successor of n. Unlike node n, m has been constructed with a concurrency limit of 1 and therefore does not spawn all tasks immediately. Instead, it sequentially spawns tasks to execute its body, λ\ :sub:`m`, on the messages in the order that they arrive. When all tasks are complete, the call to ``wait_for_all`` returns. .. note:: All execution in the flow graph happens asynchronously. The calls to try_put return control to the calling thread quickly, after either immediately spawning a task or buffering the message being passed. Likewise, the body tasks execute the lambda expressions and then put the result to any successor nodes. Only the call to ``wait_for_all`` blocks, as it should, and even in this case the calling thread may be used to execute tasks from the oneTBB work pool while it is waiting. The above timeline shows the sequence when there are enough threads to execute all of the tasks that can be executed in parallel. If there are fewer threads, some spawned tasks will need to wait until a thread is available to execute them. .. |image0| image:: Images/execution_timeline2node.jpg ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Memory_Allocation.rst ================================================ .. _Memory_Allocation: Memory Allocation ================= |full_name| provides several memory allocator templates that are similar to the STL template class std::allocator. 
Two templates, ``scalable_allocator`` and ``cache_aligned_allocator``, address critical issues in parallel programming as follows: - **Scalability**. Problems of scalability arise when using memory allocators originally designed for serial programs, on threads that might have to compete for a single shared pool in a way that allows only one thread to allocate at a time. Use the ``scalable_allocator`` template to avoid scalability bottlenecks. This template can improve the performance of programs that rapidly allocate and free memory. - **False sharing**. Problems of sharing arise when two threads access different words that share the same cache line. The problem is that a cache line is the unit of information interchange between processor caches. If one processor modifies a cache line and another processor reads the same cache line, the line must be moved from one processor to the other, even if the two processors are dealing with different words within the line. False sharing can hurt performance because cache lines can take hundreds of clocks to move. Use the ``cache_aligned_allocator`` template to always allocate on a separate cache line. Two objects allocated by ``cache_aligned_allocator`` are guaranteed to not have false sharing. However, if an object is allocated by ``cache_aligned_allocator`` and another object is allocated some other way, there is no guarantee. You can use these allocator templates as the *allocator* argument to STL template classes. The following code shows how to declare an STL vector that uses ``cache_aligned_allocator`` for allocation: :: std::vector<int,cache_aligned_allocator<int> >; .. tip:: The functionality of ``cache_aligned_allocator`` comes at some cost in space, because it must allocate at least one cache line’s worth of memory, even for a small object. So use ``cache_aligned_allocator`` only if false sharing is likely to be a real problem.
The scalable memory allocator also provides a set of functions equivalent to the C standard library memory management routines but with the ``scalable_`` prefix in their names, as well as a way to easily redirect the standard routines to these functions. .. toctree:: :maxdepth: 4 ../tbb_userguide/Which_Dynamic_Libraries_to_Use ../tbb_userguide/Allocator_Configuration ../tbb_userguide/automatically-replacing-malloc ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Migration_Guide/Mixing_Two_Runtimes.rst ================================================ .. _Mixing_Two_Runtimes: Mixing two runtimes ======================================= Threading Building Blocks (TBB) and oneAPI Threading Building Blocks (oneTBB) can be safely used in the same application. TBB and oneTBB runtimes are named differently and can be loaded safely within the same process. In addition, the ABI versioning is completely different, which prevents symbol conflicts. However, if both runtimes are loaded into the same process it can lead to oversubscription because each runtime will use its own pool of threads. It might lead to a performance penalty due to an increased number of context switches. To check if both TBB and oneTBB are loaded into the application, export ``TBB_VERSION=1`` before running the application. If both runtimes are loaded there will be two blocks of output, for example: oneTBB possible output: .. code:: text oneTBB: SPECIFICATION VERSION 1.0 oneTBB: VERSION 2021.2 oneTBB: INTERFACE VERSION 12020 oneTBB: TBB_USE_DEBUG 1 oneTBB: TBB_USE_ASSERT 1 oneTBB: TOOLS SUPPORT disabled TBB possible output: ..
code:: text TBB: VERSION 2018.0 TBB: INTERFACE VERSION 10006 TBB: BUILD_DATE Mon 01 Mar 2021 01:28:40 PM UTC TBB: BUILD_HOST localhost (x86_64) TBB: BUILD_OS Fedora release 32 (Thirty Two) TBB: BUILD_KERNEL Linux 5.8.9-200.fc32.x86_64 #1 SMP Mon Sep 14 18:28:45 UTC 2020 TBB: BUILD_GCC g++ (GCC) 10.2.1 20201125 (Red Hat 10.2.1-9) TBB: BUILD_LIBC 2.31 TBB: BUILD_LD GNU ld version 2.34-6.fc32 TBB: BUILD_TARGET intel64 on cc10_libc2.31_kernel5.8.9 TBB: BUILD_COMMAND g++ -DDO_ITT_NOTIFY -g -O2 -DUSE_PTHREAD -m64 -fPIC -D__TBB_BUILD=1 -Wall -Wno-parentheses -Wno-non-virtual-dtor -I../../src -I../../src/rml/include -I../../include -I. TBB: TBB_USE_DEBUG 0 TBB: TBB_USE_ASSERT 0 TBB: DO_ITT_NOTIFY 1 TBB: RML private TBB: Tools support disabled .. note:: The ``tbbmalloc`` library in oneTBB is fully binary compatible with TBB. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Migration_Guide/Task_API.rst ================================================ .. _Task_API: Migrating from low-level task API ================================= The low-level task API of Intel(R) Threading Building Blocks (TBB) was considered complex and hence error-prone, which was the primary reason it had been removed from oneAPI Threading Building Blocks (oneTBB). This guide helps with the migration from TBB to oneTBB for the use cases where low-level task API is used. Spawning of individual tasks ---------------------------- For most use cases, the spawning of individual tasks can be replaced with the use of either ``oneapi::tbb::task_group`` or ``oneapi::tbb::parallel_invoke``. For example, ``RootTask``, ``ChildTask1``, and ``ChildTask2`` are the user-side functors that inherit ``tbb::task`` and implement its interface. Then spawning of ``ChildTask1`` and ``ChildTask2`` tasks that can execute in parallel with each other and waiting on the ``RootTask`` is implemented as: .. 
code:: cpp #include int main() { // Assuming RootTask, ChildTask1, and ChildTask2 are defined. RootTask& root = *new(tbb::task::allocate_root()) RootTask{}; ChildTask1& child1 = *new(root.allocate_child()) ChildTask1{/*params*/}; ChildTask2& child2 = *new(root.allocate_child()) ChildTask2{/*params*/}; root.set_ref_count(3); tbb::task::spawn(child1); tbb::task::spawn(child2); root.wait_for_all(); } Using ``oneapi::tbb::task_group`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The code above can be rewritten using ``oneapi::tbb::task_group``: .. code:: cpp #include int main() { // Assuming ChildTask1, and ChildTask2 are defined. oneapi::tbb::task_group tg; tg.run(ChildTask1{/*params*/}); tg.run(ChildTask2{/*params*/}); tg.wait(); } The code looks more concise now. It also enables lambda functions and does not require you to implement ``tbb::task`` interface that overrides the ``tbb::task* tbb::task::execute()`` virtual method. With this new approach, you work with functors in a C++-standard way by implementing ``void operator() const``: .. code:: cpp struct Functor { // Member to be called when object of this type are passed into // oneapi::tbb::task_group::run() method void operator()() const {} }; Using ``oneapi::tbb::parallel_invoke`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ It is also possible to use ``oneapi::tbb::parallel_invoke`` to rewrite the original code and make it even more concise: .. code:: cpp #include int main() { // Assuming ChildTask1, and ChildTask2 are defined. oneapi::tbb::parallel_invoke( ChildTask1{/*params*/}, ChildTask2{/*params*/} ); } Adding more work during task execution -------------------------------------- ``oneapi::tbb::parallel_invoke`` follows a blocking style of programming, which means that it completes only when all functors passed to the parallel pattern complete their execution. 
In TBB, cases when the amount of work is not known in advance and the work needs to be added during the execution of a parallel algorithm were mostly covered by ``tbb::parallel_do`` high-level parallel pattern. The ``tbb::parallel_do`` algorithm logic may be implemented using the task API as: .. code:: cpp #include #include #include // Assuming RootTask and OtherWork are defined and implement tbb::task interface. struct Task : public tbb::task { Task(tbb::task& root, int i) : m_root(root), m_i(i) {} tbb::task* execute() override { // ... do some work for item m_i ... if (add_more_parallel_work) { tbb::task& child = *new(m_root.allocate_child()) OtherWork; tbb::task::spawn(child); } return nullptr; } tbb::task& m_root; int m_i; }; int main() { std::vector items = { 0, 1, 2, 3, 4, 5, 6, 7 }; RootTask& root = *new(tbb::task::allocate_root()) RootTask{/*params*/}; root.set_ref_count(items.size() + 1); for (std::size_t i = 0; i < items.size(); ++i) { Task& task = *new(root.allocate_child()) Task(root, items[i]); tbb::task::spawn(task); } root.wait_for_all(); return 0; } In oneTBB ``tbb::parallel_do`` interface was removed. Instead, the functionality of adding new work was included into the ``oneapi::tbb::parallel_for_each`` interface. The previous use case can be rewritten in oneTBB as follows: .. code:: cpp #include #include int main() { std::vector items = { 0, 1, 2, 3, 4, 5, 6, 7 }; oneapi::tbb::parallel_for_each( items.begin(), items.end(), [](int& i, tbb::feeder& feeder) { // ... do some work for item i ... if (add_more_parallel_work) feeder.add(i); } ); } Since both TBB and oneTBB support nested expressions, you can run additional functors from within an already running functor. The previous use case can be rewritten using ``oneapi::tbb::task_group`` as: .. 
code:: cpp #include #include #include int main() { std::vector items = { 0, 1, 2, 3, 4, 5, 6, 7 }; oneapi::tbb::task_group tg; for (std::size_t i = 0; i < items.size(); ++i) { tg.run([&i = items[i], &tg] { // ... do some work for item i ... if (add_more_parallel_work) // Assuming OtherWork is defined. tg.run(OtherWork{}); }); } tg.wait(); } Task recycling -------------- You can re-run the functor by passing ``*this`` to the ``oneapi::tbb::task_group::run()`` method. The functor will be copied in this case. However, its state can be shared among instances: .. code:: cpp #include #include struct SharedStateFunctor { std::shared_ptr m_shared_data; oneapi::tbb::task_group& m_task_group; void operator()() const { // do some work processing m_shared_data if (has_more_work) m_task_group.run(*this); // Note that this might be concurrently accessing m_shared_data already } }; int main() { // Assuming Data is defined. std::shared_ptr data = std::make_shared(/*params*/); oneapi::tbb::task_group tg; tg.run(SharedStateFunctor{data, tg}); tg.wait(); } Such patterns are particularly useful when the work within a functor is not completed but there is a need for the task scheduler to react to outer circumstances, such as cancellation of group execution. To avoid issues with concurrent access, it is recommended to submit it for re-execution as the last step: .. code:: cpp #include #include struct SharedStateFunctor { std::shared_ptr m_shared_data; oneapi::tbb::task_group& m_task_group; void operator()() const { // do some work processing m_shared_data if (need_to_yield) { m_task_group.run(*this); return; } } }; int main() { // Assuming Data is defined. std::shared_ptr data = std::make_shared(/*params*/); oneapi::tbb::task_group tg; tg.run(SharedStateFunctor{data, tg}); tg.wait(); } Recycling as child or continuation ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In oneTBB this kind of recycling is done manually. You have to track when it is time to run the task: .. 
code:: cpp #include #include #include #include #include struct ContinuationTask { ContinuationTask(std::vector& data, int& result) : m_data(data), m_result(result) {} void operator()() const { for (const auto& item : m_data) m_result += item; } std::vector& m_data; int& m_result; }; struct ChildTask { ChildTask(std::vector& data, int& result, std::atomic& tasks_left, std::atomic& tasks_done, oneapi::tbb::task_group& tg) : m_data(data), m_result(result), m_tasks_left(tasks_left), m_tasks_done(tasks_done), m_tg(tg) {} void operator()() const { std::size_t index = --m_tasks_left; m_data[index] = produce_item_for(index); std::size_t done_num = ++m_tasks_done; if (index % 2 != 0) { // Recycling as child m_tg.run(*this); return; } else if (done_num == m_data.size()) { assert(m_tasks_left == 0); // Spawning a continuation that does reduction m_tg.run(ContinuationTask(m_data, m_result)); } } std::vector& m_data; int& m_result; std::atomic& m_tasks_left; std::atomic& m_tasks_done; oneapi::tbb::task_group& m_tg; }; int main() { int result = 0; std::vector items(10, 0); std::atomic tasks_left{items.size()}; std::atomic tasks_done{0}; oneapi::tbb::task_group tg; for (std::size_t i = 0; i < items.size(); i+=2) { tg.run(ChildTask(items, result, tasks_left, tasks_done, tg)); } tg.wait(); } Scheduler Bypass ---------------- TBB ``task::execute()`` method can return a pointer to a task that can be executed next by the current thread. This might reduce scheduling overheads compared to direct ``spawn``. Similar to ``spawn``, the returned task is not guaranteed to be executed next by the current thread. .. code:: cpp #include // Assuming OtherTask is defined. struct Task : tbb::task { task* execute(){ // some work to do ... auto* other_p = new(this->parent().allocate_child()) OtherTask{}; this->parent().add_ref_count(); return other_p; } }; int main(){ // Assuming RootTask is defined. 
RootTask& root = *new(tbb::task::allocate_root()) RootTask{}; Task& child = *new(root.allocate_child()) Task{/*params*/}; root.add_ref_count(); tbb::task::spawn(child); root.wait_for_all(); } In oneTBB, this can be done using ``oneapi::tbb::task_group``. .. code:: cpp #include <oneapi/tbb/task_group.h> // Assuming OtherTask is defined. int main(){ oneapi::tbb::task_group tg; tg.run([&tg](){ //some work to do ... return tg.defer(OtherTask{}); }); tg.wait(); } Here ``oneapi::tbb::task_group::defer`` adds a new task into the ``tg``. However, the task is not put into a queue of tasks ready for execution via ``oneapi::tbb::task_group::run``, but bypassed to the executing thread directly via function return value. Deferred task creation ---------------------- The TBB low-level task API separates the task creation from the actual spawning. This separation allows postponing the task spawning, while the parent task and final result production are blocked from leaving prematurely. For example, ``RootTask``, ``ChildTask``, and ``CallBackTask`` are the user-side functors that inherit ``tbb::task`` and implement its interface. Then, blocking the ``RootTask`` from leaving prematurely and waiting on it is implemented as follows: .. code:: cpp #include <tbb/task.h> int main() { // Assuming RootTask, ChildTask, and CallBackTask are defined. RootTask& root = *new(tbb::task::allocate_root()) RootTask{}; ChildTask& child = *new(root.allocate_child()) ChildTask{/*params*/}; CallBackTask& cb_task = *new(root.allocate_child()) CallBackTask{/*params*/}; root.set_ref_count(3); tbb::task::spawn(child); register_callback([&cb_task](){ tbb::task::enqueue(cb_task); }); root.wait_for_all(); // Control flow will reach here only after both ChildTask and CallBackTask are executed, // i.e. after the callback is called } In oneTBB, this can be done using ``oneapi::tbb::task_group``. .. code:: cpp #include <oneapi/tbb.h> int main(){ oneapi::tbb::task_group tg; oneapi::tbb::task_arena arena; // Assuming ChildTask and CallBackTask are defined.
auto cb = tg.defer(CallBackTask{/*params*/}); register_callback([&tg, c = std::move(cb), &arena]{ arena.enqueue(c); }); tg.run(ChildTask{/*params*/}); tg.wait(); // Control flow gets here once both ChildTask and CallBackTask are executed // i.e. after the callback is called } Here ``oneapi::tbb::task_group::defer`` adds a new task into the ``tg``. However, the task is not spawned until ``oneapi::tbb::task_arena::enqueue`` is called. .. note:: The call to ``oneapi::tbb::task_group::wait`` will not return control until both ``ChildTask`` and ``CallBackTask`` are executed. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Migration_Guide/Task_Scheduler_Init.rst ================================================ .. _Task_Scheduler_Init: Migrating from tbb::task_scheduler_init ======================================= ``tbb::task_scheduler_init`` was a multipurpose functionality in the previous versions of Threading Building Blocks (TBB). This section considers different use cases and how they can be covered with oneTBB. Managing the number of threads --------------------------------------- Querying the default number of threads ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ * `oneapi::tbb::info::default_concurrency() `_ returns the maximum concurrency that will be created by *default* in implicit or explicit ``task_arena``. 
* `oneapi::tbb::this_task_arena::max_concurrency() `_ returns the maximum number of threads available for the parallel algorithms within the current context (or *default* if an implicit arena is not initialized) * `oneapi::tbb::global_control::active_value(tbb::global_control::max_allowed_parallelism) `_ returns the current limit of the thread pool (or *default* if oneTBB scheduler is not initialized) Setting the maximum concurrency ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ * `task_arena(/* max_concurrency */) `_ limits the maximum concurrency of the parallel algorithm running inside ``task_arena`` * `tbb::global_control(tbb::global_control::max_allowed_parallelism, /* max_concurrency */) `_ limits the total number of oneTBB worker threads Examples ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The default parallelism: .. code:: cpp #include #include #include #include int main() { // Get the default number of threads int num_threads = oneapi::tbb::info::default_concurrency(); // Run the default parallelism oneapi::tbb::parallel_for( /* ... */ [] { // Assert the maximum number of threads assert(num_threads == oneapi::tbb::this_task_arena::max_concurrency()); }); // Create the default task_arena oneapi::tbb::task_arena arena; arena.execute([]{ oneapi::tbb::parallel_for( /* ... */ [] { // Assert the maximum number of threads assert(num_threads == oneapi::tbb::this_task_arena::max_concurrency()); }); }); return 0; } The limited parallelism: .. code:: cpp #include #include #include #include #include int main() { // Create the custom task_arena with four threads oneapi::tbb::task_arena arena(4); arena.execute([]{ oneapi::tbb::parallel_for( /* ... 
*/ [] { // This arena is limited to four threads assert(oneapi::tbb::this_task_arena::max_concurrency() == 4); }); }); // Limit the number of threads to two for all oneTBB parallel interfaces oneapi::tbb::global_control global_limit(oneapi::tbb::global_control::max_allowed_parallelism, 2); // the default parallelism oneapi::tbb::parallel_for( /* ... */ [] { // No more than two threads are expected; however, tbb::this_task_arena::max_concurrency() can return a bigger value int thread_limit = oneapi::tbb::global_control::active_value(oneapi::tbb::global_control::max_allowed_parallelism); assert(thread_limit == 2); }); arena.execute([]{ oneapi::tbb::parallel_for( /* ... */ [] { // No more than two threads are expected; however, tbb::this_task_arena::max_concurrency() is four int thread_limit = oneapi::tbb::global_control::active_value(oneapi::tbb::global_control::max_allowed_parallelism); assert(thread_limit == 2); assert(tbb::this_task_arena::max_concurrency() == 4); }); }); return 0; } Setting thread stack size --------------------------------------- Use `oneapi::tbb::global_control(oneapi::tbb::global_control::thread_stack_size, /* stack_size */) `_ to set the stack size for oneTBB worker threads: .. code:: cpp #include <oneapi/tbb/parallel_for.h> #include <oneapi/tbb/global_control.h> int main() { // Set 16 MB of the stack size for oneTBB worker threads. // Note that the stack size of the main thread should be configured in accordance with the // system documentation, e.g. at application startup moment oneapi::tbb::global_control global_limit(tbb::global_control::thread_stack_size, 16 * 1024 * 1024); oneapi::tbb::parallel_for( /* ... */ [] { // Create a big array in the stack char big_array[10*1024*1024]; }); return 0; } Terminating oneTBB scheduler --------------------------------------- `task_scheduler_handle `_ allows waiting for oneTBB worker threads completion: ..
code:: cpp #include #include int main() { oneapi::tbb::task_scheduler_handle handle{tbb::attach{}}; // Do some parallel work here oneapi::tbb::parallel_for(/* ... */); oneapi::tbb::finalize(handle); return 0; } ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Migration_Guide.rst ================================================ .. _Migration_Guide: Migrating from Threading Building Blocks (TBB) ============================================== While oneTBB is mostly source compatible with TBB, some interfaces were deprecated in TBB and redesigned or removed in oneTBB. This section considers the most difficult use cases for migrating TBB to oneTBB. .. toctree:: :maxdepth: 4 ../tbb_userguide/Migration_Guide/Task_Scheduler_Init ../tbb_userguide/Migration_Guide/Task_API ../tbb_userguide/Migration_Guide/Mixing_Two_Runtimes ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/More_on_HashCompare.rst ================================================ .. _More_on_HashCompare: More on HashCompare =================== There are several ways to make the ``HashCompare`` argument for ``concurrent_hash_map`` work for your own types. - Specify the ``HashCompare`` argument explicitly - Let the ``HashCompare`` default to ``tbb_hash_compare`` and do one of the following: - Define a specialization of template ``tbb_hash_compare``. For example, if you have keys of type ``Foo``, and ``operator==`` is defined for ``Foo``, you just have to provide a definition of ``tbb_hasher`` as shown below: :: size_t tbb_hasher(const Foo& f) { size_t h = ...compute hash code for f... 
return h; }; In general, the definition of ``tbb_hash_compare`` or ``HashCompare`` must provide two signatures: - A method ``hash`` that maps a ``Key`` to a ``size_t`` - A method ``equal`` that determines if two keys are equal The signatures go together in a single class because *if two keys are equal, then they must hash to the same value*, otherwise the hash table might not work. You could trivially meet this requirement by always hashing to ``0``, but that would cause tremendous inefficiency. Ideally, each key should hash to a different value, or at least the probability of two distinct keys hashing to the same value should be kept low. The methods of ``HashCompare`` should be ``static`` unless you need to have them behave differently for different instances. If so, then you should construct the ``concurrent_hash_map`` using the constructor that takes a ``HashCompare`` as a parameter. The following example is a variation on an earlier example with instance-dependent methods. The instance performs both case-sensitive or case-insensitive hashing, and comparison, depending upon an internal flag ``ignore_case``. :: // Structure that defines hashing and comparison operations class VariantHashCompare { // If true, then case of letters is ignored. 
bool ignore_case; public: size_t hash(const string& x) const { size_t h = 0; for(const char* s = x.c_str(); *s; s++) h = (h*16777179)^(ignore_case?tolower(*s):*s); return h; } // True if strings are equal bool equal(const string& x, const string& y) const { if( ignore_case ) return strcasecmp(x.c_str(), y.c_str())==0; else return x==y; } VariantHashCompare(bool ignore_case_) : ignore_case(ignore_case_) {} };   typedef concurrent_hash_map<string,int,VariantHashCompare> VariantStringTable;   VariantStringTable CaseSensitiveTable(VariantHashCompare(false)); VariantStringTable CaseInsensitiveTable(VariantHashCompare(true)); ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Mutex_Flavors.rst ================================================ .. _Mutex_Flavors: Mutex Flavors ============= Connoisseurs of mutexes distinguish various attributes of mutexes. It helps to know some of these, because they involve tradeoffs of generality and efficiency. Picking the right one often helps performance. Mutexes can be described by the following qualities, also summarized in the table below. - **Scalable**. Some mutexes are called *scalable*. In a strict sense, this is not an accurate name, because a mutex limits execution to one thread at a time. A *scalable mutex* is one that does not do *worse* than this. A mutex can do worse than serialize execution if the waiting threads consume excessive processor cycles and memory bandwidth, reducing the speed of threads trying to do real work. Scalable mutexes are often slower than non-scalable mutexes under light contention, so a non-scalable mutex may be better. When in doubt, use a scalable mutex. - **Fair**. Mutexes can be *fair* or *unfair*. A fair mutex lets threads through in the order they arrived. Fair mutexes avoid starving threads. Each thread gets its turn.
However, unfair mutexes can be faster, because they let threads that are running go through first, instead of the thread that is next in line which may be sleeping on account of an interrupt. - **Yield or Block**. This is an implementation detail that impacts performance. On long waits, an |full_name| mutex either *yields* or *blocks*. Here *yields* means to repeatedly poll whether progress can be made, and if not, temporarily yield [#]_ the processor. To *block* means to yield the processor until the mutex permits progress. Use the yielding mutexes if waits are typically short and blocking mutexes if waits are typically long. The following is a summary of mutex behaviors: - ``spin_mutex`` is non-scalable, unfair, non-recursive, and spins in user space. It would seem to be the worst of all possible worlds, except that it is *very fast* in *lightly contended* situations. If you can design your program so that contention is somehow spread out among many ``spin_mutex`` objects, you can improve performance over using other kinds of mutexes. If a mutex is heavily contended, your algorithm will not scale anyway. Consider redesigning the algorithm instead of looking for a more efficient lock. - ``mutex`` has behavior similar to the ``spin_mutex``. However, the ``mutex`` *blocks* on long waits, which makes it resistant to high contention. - ``queuing_mutex`` is scalable, fair, non-recursive, and spins in user space. Use it when scalability and fairness are important. - ``spin_rw_mutex`` and ``queuing_rw_mutex`` are similar to ``spin_mutex`` and ``queuing_mutex``, but additionally support *reader* locks. - ``rw_mutex`` is similar to ``mutex``, but additionally supports *reader* locks. - ``speculative_spin_mutex`` and ``speculative_spin_rw_mutex`` are similar to ``spin_mutex`` and ``spin_rw_mutex``, but additionally provide *speculative locking* on processors that support hardware transactional memory.
Speculative locking allows multiple threads to acquire the same lock, as long as there are no "conflicts" that may generate different results than non-speculative locking. These mutexes are *scalable* when working with a low conflict rate, i.e. mostly in speculative locking mode. - ``null_mutex`` and ``null_rw_mutex`` do nothing. They can be useful as template arguments. For example, suppose you are defining a container template and know that some instantiations will be shared by multiple threads and need internal locking, but others will be private to a thread and not need locking. You can define the template to take a Mutex type parameter. The parameter can be one of the real mutex types when locking is necessary, and ``null_mutex`` when locking is unnecessary. .. container:: tablenoborder .. list-table:: :header-rows: 1 * - Mutex - Scalable - Fair - Recursive - Long Wait - Size * - \ ``spin_mutex`` - no - no - no - yields - 1 byte * - \ ``mutex`` - ✓ - no - no - blocks - 1 byte * - \ ``speculative_spin_mutex`` - HW dependent - no - no - yields - 2 cache lines * - \ ``queuing_mutex`` - ✓ - ✓ - no - yields - 1 word * - \ ``spin_rw_mutex`` - no - no - no - yields - 1 word * - \ ``rw_mutex`` - ✓ - no - no - blocks - 1 word * - \ ``speculative_spin_rw_mutex`` - HW dependent - no - no - yields - 3 cache lines * - \ ``queuing_rw_mutex`` - ✓ - ✓ - no - yields - 1 word * - \ ``null_mutex`` [#]_ - moot - ✓ - ✓ - never - empty * - \ ``null_rw_mutex`` - moot - ✓ - ✓ - never - empty .. [#] The yielding is implemented via ``SwitchToThread()`` on Microsoft Windows\* operating systems and by ``sched_yield()`` on other systems. .. [#] Null mutexes are considered fair by oneTBB because they cannot cause starvation. They lack any non-static data members. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Mutual_Exclusion.rst ================================================ ..
_Mutual_Exclusion: Mutual Exclusion ================ Mutual exclusion controls how many threads can simultaneously run a region of code. In |full_name|, mutual exclusion is implemented by *mutexes* and *locks.* A mutex is an object on which a thread can acquire a lock. Only one thread at a time can have a lock on a mutex; other threads have to wait their turn. The simplest mutex is ``spin_mutex``. A thread trying to acquire a lock on a ``spin_mutex`` busy waits until it can acquire the lock. A ``spin_mutex`` is appropriate when the lock is held for only a few instructions. For example, the following code uses a mutex ``FreeListMutex`` to protect a shared variable ``FreeList``. It checks that only a single thread has access to ``FreeList`` at a time. :: Node* FreeList; typedef spin_mutex FreeListMutexType; FreeListMutexType FreeListMutex;   Node* AllocateNode() { Node* n; { FreeListMutexType::scoped_lock lock(FreeListMutex); n = FreeList; if( n ) FreeList = n->next; } if( !n ) n = new Node(); return n; }   void FreeNode( Node* n ) { FreeListMutexType::scoped_lock lock(FreeListMutex); n->next = FreeList; FreeList = n; } The constructor for ``scoped_lock`` waits until there are no other locks on ``FreeListMutex``. The destructor releases the lock. The braces inside routine ``AllocateNode`` may look unusual. Their role is to keep the lifetime of the lock as short as possible, so that other waiting threads can get their chance as soon as possible. .. CAUTION:: Be sure to name the lock object, otherwise it will be destroyed too soon. For example, if the creation of the ``scoped_lock`` object in the example is changed to :: FreeListMutexType::scoped_lock (FreeListMutex); then the ``scoped_lock`` is destroyed when execution reaches the semicolon, which releases the lock *before* ``FreeList`` is accessed. 
The following shows an alternative way to write ``AllocateNode``: :: Node* AllocateNode() { Node* n; FreeListMutexType::scoped_lock lock; lock.acquire(FreeListMutex); n = FreeList; if( n ) FreeList = n->next; lock.release(); if( !n ) n = new Node(); return n; } Method ``acquire`` waits until it can acquire a lock on the mutex; method ``release`` releases the lock. It is recommended that you add extra braces where possible, to clarify to maintainers which code is protected by the lock. If you are familiar with C interfaces for locks, you may be wondering why there are not simply acquire and release methods on the mutex object itself. The reason is that the C interface would not be exception safe, because if the protected region threw an exception, control would skip over the release. With the object-oriented interface, destruction of the ``scoped_lock`` object causes the lock to be released, no matter whether the protected region was exited by normal control flow or an exception. This is true even for our version of ``AllocateNode`` that used methods ``acquire`` and ``release`` – the explicit release causes the lock to be released earlier, and the destructor then sees that the lock was released and does nothing. All mutexes in oneTBB have a similar interface, which not only makes them easier to learn, but enables generic programming. For example, all of the mutexes have a nested ``scoped_lock`` type, so given a mutex of type ``M``, the corresponding lock type is ``M::scoped_lock``. .. tip:: It is recommended that you always use a ``typedef`` for the mutex type, as shown in the previous examples. That way, you can change the type of the lock later without having to edit the rest of the code. In the examples, you could replace the ``typedef`` with ``typedef queuing_mutex FreeListMutexType``, and the code would still be correct. ..
toctree:: :maxdepth: 4 ../tbb_userguide/Mutex_Flavors ../tbb_userguide/Reader_Writer_Mutexes ../tbb_userguide/UpgradeDowngrade ../tbb_userguide/Lock_Pathologies ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Nodes.rst ================================================ .. _Nodes: Flow Graph Basics: Nodes ======================== A node is a class that inherits from ``oneapi::tbb::flow::graph_node`` and also typically inherits from ``oneapi::tbb::flow::sender``, ``oneapi::tbb::flow::receiver``, or both. A node performs some operation, usually on an incoming message and may generate zero or more output messages. Some nodes require more than one input message or generate more than one output message. While it is possible to define your own node types by inheriting from graph_node, sender and receiver, it is more typical that predefined node types are used to construct a graph. A ``function_node`` is a predefined type available in ``flow_graph.h`` and represents a simple function with one input and one output. The constructor for a ``function_node`` takes three arguments: :: template< typename Body> function_node(graph &g, size_t concurrency, Body body) .. container:: tablenoborder .. list-table:: :header-rows: 1 * - Parameter - Description * - Body - Type of the body object. * - g - The graph the node belongs to. * - concurrency - The concurrency limit for the node. You can use the concurrency limit to control how many invocations of the node are allowed to proceed concurrently, from 1 (serial) to an unlimited number. * - body - User defined function object, or lambda expression, that is applied to the incoming message to generate the outgoing message. Below is code for creating a simple graph that contains a single function_node. In this example, a node n is constructed that belongs to graph g, and has a second argument of 1, which allows at most 1 invocation of the node to occur concurrently. 
The body is a lambda expression that prints each value v that it receives, spins for v seconds, prints the value again, and then returns v unmodified. The code for the function spin_for is not provided. :: graph g; function_node< int, int > n( g, 1, []( int v ) -> int { cout << v; spin_for( v ); cout << v; return v; } ); After the node is constructed in the example above, you can pass messages to it, either by connecting it to other nodes using edges or by invoking its function try_put. Using edges is described in the next section. :: n.try_put( 1 ); n.try_put( 2 ); n.try_put( 3 ); You can then wait for the messages to be processed by calling ``wait_for_all`` on the graph object: :: g.wait_for_all(); In the above example code, the function_node n was created with a concurrency limit of 1. When it receives the message sequence 1, 2 and 3, the node n will spawn a task to apply the body to the first input, 1. When that task is complete, it will then spawn another task to apply the body to 2. And likewise, the node will wait for that task to complete before spawning a third task to apply the body to 3. The calls to try_put do not block until a task is spawned; if a node cannot immediately spawn a task to process the message, the message will be buffered in the node. When it is legal, based on concurrency limits, a task will be spawned to process the next buffered message. In the above graph, each message is processed sequentially. If however, you construct the node with a different concurrency limit, parallelism can be achieved: :: function_node< int, int > n( g, oneapi::tbb::flow::unlimited, []( int v ) -> int { cout << v; spin_for( v ); cout << v; return v; } ); You can use unlimited as the concurrency limit to instruct the library to spawn a task as soon as a message arrives, regardless of how many other tasks have been spawned. You can also use any specific value, such as 4 or 8, to limit concurrency to at most 4 or 8, respectively. 
It is important to remember that spawning a task does not mean creating a thread. So while a graph may spawn many tasks, only the number of threads available in the library's thread pool will be used to execute these tasks. Suppose you use unlimited in the function_node constructor instead and call try_put on the node: :: n.try_put( 1 ); n.try_put( 2 ); n.try_put( 3 ); g.wait_for_all(); The library spawns three tasks, each one applying n's lambda expression to one of the messages. If you have a sufficient number of threads available on your system, then all three invocations of the body will occur in parallel. If however, you have only one thread in the system, they execute sequentially. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Non-Linear_Pipelines.rst ================================================ .. _Non-Linear_Pipelines: Non-Linear Pipelines ==================== Template function ``parallel_pipeline`` supports only linear pipelines. It does not directly handle more baroque plumbing, such as in the diagram below. .. container:: fignone :name: image011 |image0| However, you can still use a pipeline for this. Just topologically sort the filters into a linear order, like this: The light gray arrows are the original arrows that are now implied by transitive closure of the other arrows. It might seem that a lot of parallelism is lost by forcing a linear order on the filters, but in fact the only loss is in the *latency* of the pipeline, not the throughput. The latency is the time it takes a token to flow from the beginning to the end of the pipeline. Given a sufficient number of processors, the latency of the original non-linear pipeline is three filters. This is because filters A and B could process the token concurrently, and likewise filters D and E could process the token concurrently. .. container:: fignone :name: image012 |image1| In the linear pipeline, the latency is five filters.
The behavior of filters A, B, D and E above may need to be modified in order to properly handle objects that don’t need to be acted upon by the filter other than to be passed along to the next filter in the pipeline. The throughput remains the same, because regardless of the topology, the throughput is still limited by the throughput of the slowest serial filter. If ``parallel_pipeline`` supported non-linear pipelines, it would add a lot of programming complexity, and not improve throughput. The linear limitation of ``parallel_pipeline`` is a good tradeoff of gain versus pain. .. |image0| image:: Images/image011.jpg :width: 281px :height: 107px .. |image1| image:: Images/image012.jpg :width: 281px :height: 107px ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Package_Contents_os.rst ================================================ .. _Package_Contents: Package Contents ================ |full_name| includes dynamic library files and header files for Windows\*, Linux\* and macOS\* operating systems as described in this section. .. toctree:: :maxdepth: 4 ../tbb_userguide/Debug_Versus_Release_Libraries ../tbb_userguide/Scalable_Memory_Allocator ../tbb_userguide/Windows_OS_ug ../tbb_userguide/Linux_OS ../tbb_userguide/Mac_OS ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Parallelizing_Complex_Loops.rst ================================================ .. _Parallelizing_Complex_Loops: Parallelizing Complex Loops =========================== You can successfully parallelize many applications using only the constructs in the :ref:`Parallelizing_Simple_Loops` section. However, some situations call for other parallel patterns. This section describes the support for some of these alternate patterns. .. 
toctree:: :maxdepth: 4 ../tbb_userguide/Cook_Until_Done_parallel_do ../tbb_userguide/Working_on_the_Assembly_Line_pipeline ../tbb_userguide/Summary_of_Loops_and_Pipelines ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Parallelizing_Flow_Graph.rst ================================================ .. _Parallelizing_Flow_Graph: Parallelizing Data Flow and Dependency Graphs ============================================= In addition to loop parallelism, the |full_name| library also supports graph parallelism. It's possible to create graphs that are highly scalable, but it is also possible to create graphs that are completely sequential. Using graph parallelism, computations are represented by nodes and the communication channels between these computations are represented by edges. When a node in the graph receives a message, a task is spawned to execute its body object on the incoming message. Messages flow through the graph across the edges that connect the nodes. The following sections present two examples of applications that can be expressed as graphs. The following figure shows a *streaming* or *data flow* application where a sequence of values is processed as each value passes through the nodes in the graph. In this example, the sequence is created by a function F. For each value in the sequence, G squares the value and H cubes the value. J then takes each of the squared and cubed values and adds them to a global sum. After all values in the sequence are completely processed, sum is equal to the sum of the sequence of squares and cubes from 1 to 10. In a streaming or data flow graph, the values actually flow across the edges; the output of one node becomes the input of its successor(s). .. container:: fignone :name: simple_data_flow_title **Simple Data Flow Graph** .. container:: imagecenter |image0| The following graphic shows a different form of graph application. 
In this example, a dependence graph is used to establish a partial ordering among the steps for making a peanut butter and jelly sandwich. In this partial ordering, you must first get the bread before spreading the peanut butter or jelly on the bread. You must spread on the peanut butter before you put away the peanut butter jar, and likewise spread on the jelly before you put away the jelly jar. And, you need to spread on both the peanut butter and jelly before putting the two slices of bread together. This is a partial ordering because, for example, it doesn't matter if you spread on the peanut butter first or the jelly first. It also doesn't matter if you finish making the sandwich before putting away the jars. .. container:: fignone :name: dependence_graph_make_sandwitch **Dependence Graph for Making a Sandwich** .. container:: imagecenter |image1| While it can be inferred that resources, such as the bread, or the jelly jar, are shared between ordered steps, it is not explicit in the graph. Instead, only the required ordering of steps is explicit in a dependence graph. For example, you must "Put jelly on 1 slice" **before** you "Put away jelly jar". The flow graph interface in the oneTBB library allows you to express data flow and dependence graphs such as these, as well as more complicated graphs that include cycles, conditionals, buffering and more. If you express your application using the flow graph interface, the runtime library spawns tasks to exploit the parallelism that is present in the graph. For example, in the first example above, perhaps two different values might be squared in parallel, or the same value might be squared and cubed in parallel. Likewise in the second example, the peanut butter might be spread on one slice of bread in parallel with the jelly being spread on the other slice. The interface expresses what is legal to execute in parallel, but allows the runtime library to choose at runtime what will be executed in parallel. 
The support for graph parallelism is contained within the namespace ``oneapi::tbb::flow`` and is defined in the ``flow_graph.h`` header file. .. |image0| image:: Images/flow_graph.jpg .. |image1| image:: Images/flow_graph_complex.jpg :width: 440px :height: 337px ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Parallelizing_Simple_Loops_os.rst ================================================ .. _Parallelizing_Simple_Loops: Parallelizing Simple Loops ========================== The simplest form of scalable parallelism is a loop of iterations that can each run simultaneously without interfering with each other. The following sections demonstrate how to parallelize simple loops. .. note:: |full_name| components are defined in namespace ``tbb``. For brevity’s sake, the namespace is explicit in the first mention of a component, but implicit afterwards. When compiling oneTBB programs, be sure to link in the oneTBB shared library, otherwise undefined references will occur. The following table shows compilation commands that use the debug version of the library. Remove the ``_debug`` portion to link against the production version of the library. .. container:: tablenoborder .. list-table:: :header-rows: 1 * - Operating System - Command line * - Windows\* OS - ``icl /MD example.cpp tbb_debug.lib`` * - Linux\* OS - ``icc example.cpp -ltbb_debug`` .. include:: Parallelizing_Simple_Loops_toctree.rst ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Parallelizing_Simple_Loops_toctree.rst ================================================ .. _Parallelizing_Simple_Loops_toctree: .. 
toctree:: :maxdepth: 4 ../tbb_userguide/Initializing_and_Terminating_the_Library ../tbb_userguide/parallel_for_os ../tbb_userguide/parallel_reduce ../tbb_userguide/Advanced_Example ../tbb_userguide/Advanced_Topic_Other_Kinds_of_Iteration_Spaces ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Partitioner_Summary.rst ================================================ .. _Partitioner_Summary: Partitioner Summary =================== The parallel loop templates ``parallel_for`` and ``parallel_reduce`` take an optional *partitioner* argument, which specifies a strategy for executing the loop. The following table summarizes partitioners and their effect when used in conjunction with ``blocked_range``. .. container:: tablenoborder .. list-table:: :header-rows: 1 * - Partitioner - Description - When Used with ``blocked_range(i,j,g)`` * - ``simple_partitioner`` - Chunksize bounded by grain size. - ``g/2 ≤ chunksize ≤ g`` * - ``auto_partitioner`` (default) - Automatic chunk size. - ``g/2 ≤ chunksize`` * - ``affinity_partitioner`` - Automatic chunk size, cache affinity and uniform distribution of iterations. - ``g/2 ≤ chunksize`` * - ``static_partitioner`` - Deterministic chunk size, cache affinity and uniform distribution of iterations without load balancing. - ``max(g/3, problem_size/num_of_resources) ≤ chunksize`` An ``auto_partitioner`` is used when no partitioner is specified. In general, the ``auto_partitioner`` or ``affinity_partitioner`` should be used, because these tailor the number of chunks based on available execution resources. ``affinity_partitioner`` and ``static_partitioner`` may take advantage of the ability of a ``Range`` to split in a given ratio (see "Advanced Topic: Other Kinds of Iteration Spaces") for distributing iterations in nearly equal chunks between computing resources. ``simple_partitioner`` can be useful in the following situations: - The subrange size for ``operator()`` must not exceed a limit.
That might be advantageous, for example, if your ``operator()`` needs a temporary array proportional to the size of the range. With a limited subrange size, you can use an automatic variable for the array instead of having to use dynamic memory allocation. - A large subrange might use cache inefficiently. For example, suppose the processing of a subrange involves repeated sweeps over the same memory locations. Keeping the subrange below a limit might enable the repeatedly referenced memory locations to fit in cache. - You want to tune to a specific machine. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Predefined_Node_Types.rst ================================================ .. _Predefined_Node_Types: Predefined Node Types ===================== You can define your own node types by inheriting from class graph_node, class sender and class receiver but it is likely that you can create your graph with the predefined node types already available in flow_graph.h. Below is a table that lists all of the predefined types with a basic description. See the Developer Reference for a more detailed description of each node. .. container:: tablenoborder .. list-table:: :header-rows: 1 :widths: 25 25 * - Predefined Node Type - Description * - ``input_node`` - A single-output node, with a generic output type. When activated, it executes a user body to generate its output. Its body is invoked if downstream nodes have accepted the previous generated output. Otherwise, the previous output is temporarily buffered until it is accepted downstream and then the body is again invoked. * - ``function_node`` - A single-input single-output node that broadcasts its output to all successors. Has generic input and output types. Executes a user body and has controllable concurrency level and buffering policy. For each input exactly one output is returned. 
* - ``continue_node`` - A single-input, single-output node that broadcasts its output to all successors. It has a single input that requires 1 or more inputs of type ``continue_msg`` and has a generic output type. It executes a user body when it receives N ``continue_msg`` objects at its input. N is equal to the number of predecessors plus any additional offset assigned at construction time. * - ``multifunction_node`` - A single-input multi-output node. It has a generic input type and several generic output types. It executes a user body, and has controllable concurrency level and buffering policy. The body can output zero or more messages on each output port. * - ``broadcast_node`` - A single-input, single-output node that broadcasts each message received to all successors. Its input and output are of the same generic type. It does not buffer messages. * - ``buffer_node``, ``queue_node``, ``priority_queue_node``, and ``sequencer_node``. - Single-input, single-output nodes that buffer messages and send their output to one successor. The order in which the messages are sent is node specific (see the Developer Reference). These nodes are unique in that they send to only a single successor and not all successors. * - ``join_node`` - A multi-input, single-output node. There are several generic input types and the output type is a tuple of these generic types. The node combines one message from each input port to create a tuple that is broadcast to all successors. The policy used to combine messages is selectable as queueing, reserving or tag-matching. * - ``split_node`` - A single-input, multi-output node. The input type is a tuple of generic types and there is one output port for each of the types in the tuple. The node receives a tuple of values and outputs each element of the tuple on a corresponding output port.
* - ``write_once_node``, ``overwrite_node`` - Single-input, single-output nodes that buffer a single message and broadcast their outputs to all successors. After broadcast, the nodes retain the last message received, so it is available to any future successor. A ``write_once_node`` will only accept the first message it receives, while the ``overwrite_node`` will accept all messages, broadcasting them to all successors, and replacing the old value with the new. * - ``limiter_node`` - A multi-input, single output node that broadcasts its output to all successors. The main input type and output type are of the same generic type. The node increments an internal counter when it broadcasts a message. If the increment causes it to reach its user-assigned threshold, it will broadcast no more messages. A special input port can be used to adjust the internal count, allowing further messages to be broadcast. The node does not buffer messages. * - ``indexer_node`` - A multi-input, single-output node that broadcasts its output message to all of its successors. The input type is a list of generic types and the output type is a ``tagged_msg``. The message is of one of the types listed in the input and the tag identifies the port on which the message was received. Messages are broadcast individually as they arrive at the input ports. * - ``composite_node`` - A node that might have 0, 1 or multiple ports for both input and output. The ``composite_node`` packages a group of other nodes together and maintains a tuple of references to ports that border it. This allows for the corresponding ports of the ``composite_node`` to be used to make edges which hitherto would have been made from the actual nodes in the ``composite_node``. * - async_node (preview feature) - A node that allows a flow graph to communicate with an external activity managed by the user or another runtime. 
This node receives messages of generic type, invokes the user-provided body to submit a message to an external activity. The external activity can use a special interface to return a generic type and put it to all successors of ``async_node``. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Reader_Writer_Mutexes.rst ================================================ .. _Reader_Writer_Mutexes: Reader Writer Mutexes ===================== Mutual exclusion is necessary when at least one thread *writes* to a shared variable. But it does no harm to permit multiple readers into a protected region. The reader-writer variants of the mutexes, denoted by ``_rw_`` in the class names, enable multiple readers by distinguishing *reader locks* from *writer locks.* There can be more than one reader lock on a given mutex. Requests for a reader lock are distinguished from requests for a writer lock via an extra boolean parameter in the constructor for ``scoped_lock``. The parameter is false to request a reader lock and true to request a writer lock. It defaults to ``true`` so that when omitted, a ``spin_rw_mutex`` or ``queuing_rw_mutex`` behaves like its non-``_rw_`` counterpart. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/References.rst ================================================ .. _References: References ========== **[1]**   "Memory Consistency & .NET", Arch D. Robison, Dr. Dobb’s Journal, April 2003. **[2]**   A Formal Specification of Intel® Itanium® Processor Family Memory Ordering, Intel Corporation, October 2002. **[3]**   "Cilk: An Efficient Multithreaded Runtime System", Robert Blumofe, Christopher Joerg, Bradley Kuszmaul, C. Leiserson, and Keith Randall, Proceedings of the fifth ACM SIGPLAN symposium on Principles and practice of parallel programming, 1995. 
================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Scalable_Memory_Allocator.rst ================================================ .. _Scalable_Memory_Allocator: Scalable Memory Allocator ========================= Both the debug and release versions of |full_name| consist of two dynamic shared libraries, one with general support and the other with a scalable memory allocator. The latter is distinguished by ``malloc`` in its name. For example, the release versions for Windows\* OS are ``tbb.dll`` and ``tbbmalloc.dll`` respectively. Applications may choose to use only the general library, or only the scalable memory allocator, or both. See the links below for more information on memory allocation. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Summary_of_Containers.rst ================================================ .. _Summary_of_Containers: Summary of Containers ===================== The high-level containers in |full_name| enable common idioms for concurrent access. They are suitable for scenarios where the alternative would be a serial container with a lock around it. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Summary_of_Loops_and_Pipelines.rst ================================================ .. _Summary_of_Loops_and_Pipelines: Summary of Loops and Pipelines ============================== The high-level loop and pipeline templates in |full_name| give you efficient scalable ways to exploit the power of multi-core chips without having to start from scratch. They let you design your software at a high task-pattern level and not worry about low-level manipulation of threads. Because they are generic, you can customize them to your specific needs. Have fun using these templates to unlock the power of multi-core.
================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Task-Based_Programming.rst ================================================ .. _Task-Based_Programming: Task-Based Programming ====================== When striving for performance, programming in terms of threads can be a poor way to do multithreaded programming. It is much better to formulate your program in terms of *logical tasks*, not threads, for several reasons. - Matching parallelism to available resources - Faster task startup and shutdown - More efficient evaluation order - Improved load balancing - Higher-level thinking The following paragraphs explain these points in detail. The threads you create with a threading package are *logical* threads, which map onto the *physical threads* of the hardware. For computations that do not wait on external devices, highest efficiency usually occurs when there is exactly one running logical thread per physical thread. Otherwise, there can be inefficiencies from the mismatch. *Undersubscription* occurs when there are not enough running logical threads to keep the physical threads working. *Oversubscription* occurs when there are more running logical threads than physical threads. Oversubscription usually leads to *time sliced* execution of logical threads, which incurs overheads as discussed in Appendix A, *Costs of Time Slicing*. The scheduler tries to avoid oversubscription, by having one logical thread per physical thread, and mapping tasks to logical threads, in a way that tolerates interference by other threads from the same or other processes. The key advantage of tasks versus logical threads is that tasks are much *lighter weight* than logical threads. On Linux systems, starting and terminating a task is about 18 times faster than starting and terminating a thread. On Windows systems, the ratio is more than 100. This is because a thread has its own copy of a lot of resources, such as register state and a stack.
On Linux, a thread even has its own process id. A task in |full_name|, in contrast, is typically a small routine, and also, cannot be preempted at the task level (though its logical thread can be preempted). Tasks in oneTBB are efficient too because *the scheduler is unfair*. Thread schedulers typically distribute time slices in a round-robin fashion. This distribution is called "fair", because each logical thread gets its fair share of time. Thread schedulers are typically fair because it is the safest strategy to undertake without understanding the higher-level organization of a program. In task-based programming, the task scheduler does have some higher-level information, and so can sacrifice fairness for efficiency. Indeed, it often delays starting a task until it can make useful progress. The scheduler does *load balancing*. In addition to using the right number of threads, it is important to distribute work evenly across those threads. As long as you break your program into enough small tasks, the scheduler usually does a good job of assigning tasks to threads to balance load. With thread-based programming, you are often stuck dealing with load-balancing yourself, which can be tricky to get right. .. tip:: Design your programs to try to create many more tasks than there are threads, and let the task scheduler choose the mapping from tasks to threads. Finally, the main advantage of using tasks instead of threads is that they let you think at a higher, task-based, level. With thread-based programming, you are forced to think at the low level of physical threads to get good efficiency, because you have one logical thread per physical thread to avoid undersubscription or oversubscription. You also have to deal with the relatively coarse grain of threads. With tasks, you can concentrate on the logical dependences between tasks, and leave the efficient scheduling to the scheduler. 
================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Task_Scheduler_Bypass.rst ================================================ .. _Task_Scheduler_Bypass: Task Scheduler Bypass ===================== Scheduler bypass is an optimization where you directly specify the next task to run. According to the rules of execution described in :doc:`How Task Scheduler Works <How_Task_Scheduler_Works>`, the spawning of the new task to be executed by the current thread involves the following steps: - Push a new task onto the thread's deque. - Continue to execute the current task until it is completed. - Take a task from the thread's deque, unless it is stolen by another thread. The first and third steps introduce unnecessary deque operations or, even worse, allow stealing that can hurt locality without adding significant parallelism. These problems can be avoided by using the "Task Scheduler Bypass" technique to directly specify the preferred task to be executed next instead of spawning it. As described in :doc:`How Task Scheduler Works <How_Task_Scheduler_Works>`, the returned task becomes the first candidate for the next task to be executed by the thread. Furthermore, this approach almost guarantees that the task is executed by the current thread and not by any other thread. Please note that at the moment the only way to use this optimization is to use the preview feature of ``oneapi::tbb::task_group``. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/The_Task_Scheduler.rst ================================================ .. _The_Task_Scheduler: The Task Scheduler ================== This section introduces the |full_name| *task scheduler*. The task scheduler is the engine that powers the loop templates. When practical, use the loop templates instead of the task scheduler, because the templates hide the complexity of the scheduler. However, if you have an algorithm that does not naturally map onto one of the high-level templates, use the task scheduler. ..
toctree:: :maxdepth: 4 ../tbb_userguide/Task-Based_Programming ../tbb_userguide/When_Task-Based_Programming_Is_Inappropriate ../tbb_userguide/How_Task_Scheduler_Works ../tbb_userguide/Task_Scheduler_Bypass ../tbb_userguide/Guiding_Task_Scheduler_Execution ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Throughput_of_pipeline.rst ================================================ .. _Throughput_of_pipeline: Throughput of pipeline ====================== The throughput of a pipeline is the rate at which tokens flow through it, and is limited by two constraints. First, if a pipeline is run with ``N`` tokens, then obviously there cannot be more than ``N`` operations running in parallel. Selecting the right value of ``N`` may involve some experimentation. Too low a value limits parallelism; too high a value may demand too many resources (for example, more buffers). Second, the throughput of a pipeline is limited by the throughput of the slowest sequential filter. This is true even for a pipeline with no parallel filters. No matter how fast the other filters are, the slowest sequential filter is the bottleneck. So in general you should try to keep the sequential filters fast, and when possible, shift work to the parallel filters. The text processing example has relatively poor speedup, because the serial filters are limited by the I/O speed of the system. Indeed, even with files that are on a local disk, you are unlikely to see a speedup much more than 2. To really benefit from a pipeline, the parallel filters need to be doing some heavy lifting compared to the serial filters. The window size, or sub-problem size for each token, can also limit throughput. Making windows too small may cause overheads to dominate the useful work. Making windows too large may cause them to spill out of cache. A good guideline is to try for a large window size that still fits in cache. You may have to experiment a bit to find a good window size. 
================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Timing.rst ================================================ .. _Timing: Timing ====== When measuring the performance of parallel programs, it is usually *wall clock* time, not CPU time, that matters. The reason is that better parallelization typically increases aggregate CPU time by employing more CPUs. The goal of parallelizing a program is usually to make it run *faster* in real time. The class ``tick_count`` in |full_name| provides a simple interface for measuring wall clock time. A ``tick_count`` value obtained from the static method tick_count::now() represents the current absolute time. Subtracting two ``tick_count`` values yields a relative time in ``tick_count::interval_t``, which you can convert to seconds, as in the following example: :: tick_count t0 = tick_count::now(); ... do some work ... tick_count t1 = tick_count::now(); printf("work took %g seconds\n",(t1-t0).seconds()); Unlike some timing interfaces, ``tick_count`` is guaranteed to be safe to use across threads. It is valid to subtract ``tick_count`` values that were created by different threads. A ``tick_count`` difference can be converted to seconds. The resolution of ``tick_count`` corresponds to the highest resolution timing service on the platform that is valid across threads in the same process. Since the CPU timer registers are *not* valid across threads on some platforms, this means that the resolution of tick_count can not be guaranteed to be consistent across platforms. .. note:: On Linux\* OS, you may need to add -lrt to the linker command when you use oneapi::tbb::tick_count class. For more information, see `http://fedoraproject.org/wiki/Features/ChangeInImplicitDSOLinking `_. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/UpgradeDowngrade.rst ================================================ .. 
_UpgradeDowngrade: Upgrade/Downgrade ================= It is possible to upgrade a reader lock to a writer lock, by using the method ``upgrade_to_writer``. Here is an example. :: std::vector MyVector; typedef spin_rw_mutex MyVectorMutexType; MyVectorMutexType MyVectorMutex;   void AddKeyIfMissing( const string& key ) { // Obtain a reader lock on MyVectorMutex MyVectorMutexType::scoped_lock lock(MyVectorMutex,/*is_writer=*/false); size_t n = MyVector.size(); for( size_t i=0; i`` requires the |full_name| scalable memory allocator library as described in **Scalable Memory Allocator**. It does not require the oneTBB general library, and can be used independently of the rest of oneTBB. The templates ``tbb_allocator`` and ``cache_aligned_allocator`` use the scalable allocator library if it is present otherwise it reverts to using ``malloc`` and ``free``. Thus, you can use these templates even in applications that choose to omit the scalable memory allocator library. The rest of |full_name| can be used with or without the oneTBB scalable memory allocator library. .. container:: tablenoborder .. list-table:: :header-rows: 1 * - Template - Requirements - Notes * - \ ``scalable_allocator`` - |full_name| scalable memory allocator library. See **Scalable Memory Allocator**. -   * - \ ``tbb_allocator`` \ ``cache_aligned_allocator`` -   - Uses the scalable allocator library if it is present, otherwise it reverts to using ``malloc`` and ``free``. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Windows_C_Dynamic_Memory_Interface_Replacement.rst ================================================ .. _Windows_C_Dynamic_Memory_Interface_Replacement: Windows\* OS C/C++ Dynamic Memory Interface Replacement ======================================================= Release version of the proxy library is ``tbbmalloc_proxy.dll``, debug version is ``tbbmalloc_proxy_debug.dll``. 
The following dynamic memory functions are replaced: - Standard C library functions: ``malloc``, ``calloc``, ``realloc``, ``free`` - Replaceable global C++ operators ``new`` and ``delete`` - Microsoft\* C run-time library functions: ``_msize``, ``_aligned_malloc``, ``_aligned_realloc``, ``_aligned_free``, ``_aligned_msize`` .. note:: Replacement of memory allocation functions is not supported for Universal Windows Platform applications. To do the replacement use one of the following methods: - Add the following header to a source code of any binary which is loaded during application startup. :: #include "oneapi/tbb/tbbmalloc_proxy.h" - Alternatively, add the following parameters to the linker options for the .exe or .dll file that is loaded during application startup. For 32-bit code (note the triple underscore): :: tbbmalloc_proxy.lib /INCLUDE:"___TBB_malloc_proxy" For 64-bit code (note the double underscore): :: tbbmalloc_proxy.lib /INCLUDE:"__TBB_malloc_proxy" The OS program loader must be able to find the proxy library and the scalable memory allocator library at program load time. For that you may include the directory containing the libraries in the ``PATH`` environment variable. The replacement uses in-memory binary instrumentation of Visual C++\* runtime libraries. To ensure correctness, it must first recognize a subset of dynamic memory functions in these libraries. If a problem occurs, the replacement is skipped, and the program continues to use the standard memory allocation functions. You can use the ``TBB_malloc_replacement_log`` function to check if the replacement has succeeded and to get additional information. Set the ``TBB_MALLOC_DISABLE_REPLACEMENT`` environment variable to 1 to disable replacement for a specific program invocation. In this case, the program will use standard dynamic memory allocation functions. Note that the oneTBB memory allocation libraries are still required for the program to start even if their usage is disabled. 
================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Windows_OS_ug.rst ================================================ .. _Windows_OS_ug: Windows\* ========= This section uses <*tbb_install_dir*> to indicate the top-level installation directory. The following table describes the subdirectory structure for Windows\*, relative to <*tbb_install_dir*>. .. container:: tablenoborder .. list-table:: :header-rows: 1 * - Item - Location - Environment Variable * - Header files - | ``include\oneapi\tbb.h`` | ``include\oneapi\tbb\*.h`` - ``INCLUDE`` * - .lib files - ``lib\<arch>\vc<vcversion>\<lib><variant><version>.lib``\ - ``LIB`` * - .dll files - ``redist\<arch>\vc<vcversion>\<lib><variant><version>.dll`` - ``PATH`` * - .pdb files - Same as corresponding ``.dll`` file. - \ Where * ``<arch>`` - ``ia32`` or ``intel64`` .. note:: Starting with oneTBB 2022.0, 32-bit binaries are supported only by the open-source version of the library. * ``<lib>`` - ``tbb``, ``tbbmalloc``, ``tbbmalloc_proxy`` or ``tbbbind`` * ``<vcversion>`` - ``14`` - use for dynamic linkage with the CRT - ``14_uwp`` - use for Windows 10 Universal Windows applications - ``14_uwd`` - use for Universal Windows Drivers - ``_mt`` - use for static linkage with the CRT * ``<variant>`` - ``_debug`` or empty * ``<version>`` - binary version The last column shows which environment variables are used by the Microsoft\* Visual C++\* or Intel® C++ Compiler Classic or Intel® oneAPI DPC++/C++ Compiler to find these subdirectories. .. CAUTION:: Ensure that the relevant product directories are mentioned by the environment variables; otherwise the compiler might not find the required files. .. note:: Microsoft\* C/C++ run-time libraries come in static and dynamic forms. Either can be used with oneTBB. Linking to the oneTBB library is always dynamic. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/Working_on_the_Assembly_Line_pipeline.rst ================================================ .. 
_Working_on_the_Assembly_Line_pipeline: Working on the Assembly Line: parallel_pipeline =============================================== *Pipelining* is a common parallel pattern that mimics a traditional manufacturing assembly line. Data flows through a series of pipeline filters and each filter processes the data in some way. Given an incoming stream of data, some of these filters can operate in parallel, and others cannot. For example, in video processing, some operations on frames do not depend on other frames, and so can be done on multiple frames at the same time. On the other hand, some operations on frames require processing prior frames first. The |full_name| classes ``parallel_pipeline`` and filter implement the pipeline pattern. A simple text processing example will be used to demonstrate the usage of ``parallel_pipeline`` and filter to perform parallel formatting. The example reads a text file, squares each decimal numeral in the text, and writes the modified text to a new file. Below is a picture of the pipeline. .. CAUTION:: Since the body object provided to the filters of the ``parallel_pipeline`` might be copied, its ``operator()`` should not modify the body. Otherwise the modification might or might not become visible to the thread that invoked ``parallel_pipeline``, depending upon whether ``operator()`` is acting on the original or a copy. As a reminder of this nuance, ``parallel_pipeline`` requires that the body object's ``operator()`` be declared ``const``. .. container:: tablenoborder .. list-table:: :header-rows: 0 * - Read chunk from input file - |image0| - Square numerals in chunk - |image1| - Write chunk to output file Assume that the raw file I/O is sequential. The squaring filter can be done in parallel. That is, if you can serially read ``n`` chunks very quickly, you can transform each of the ``n`` chunks in parallel, as long as they are written in the proper order to the output file. 
Though the raw I/O is sequential, the formatting of input and output can be moved to the middle filter, and thus be parallel. To amortize parallel scheduling overheads, the filters operate on chunks of text. Each input chunk is approximately 4000 characters. Each chunk is represented by an instance of class ``TextSlice``: :: // Holds a slice of text. /** Instances *must* be allocated/freed using methods herein, because the C++ declaration represents only the header of a much larger object in memory. */ class TextSlice { // Pointer to one past last character in sequence char* logical_end; // Pointer to one past last available byte in sequence. char* physical_end; public: // Allocate a TextSlice object that can hold up to max_size characters. static TextSlice* allocate( size_t max_size ) { // +1 leaves room for a terminating null character. TextSlice* t = (TextSlice*)oneapi::tbb::tbb_allocator<char>().allocate( sizeof(TextSlice)+max_size+1 ); t->logical_end = t->begin(); t->physical_end = t->begin()+max_size; return t; } // Free this TextSlice object void free() { oneapi::tbb::tbb_allocator<char>().deallocate((char*)this, sizeof(TextSlice)+(physical_end-begin())+1); } // Pointer to beginning of sequence char* begin() {return (char*)(this+1);} // Pointer to one past last character in sequence char* end() {return logical_end;} // Length of sequence size_t size() const {return logical_end-(char*)(this+1);} // Maximum number of characters that can be appended to sequence size_t avail() const {return physical_end-logical_end;} // Append sequence [first,last) to this sequence. void append( char* first, char* last ) { memcpy( logical_end, first, last-first ); logical_end += last-first; } // Set end() to given value. void set_end( char* p ) {logical_end=p;} }; Below is the top-level code for building and running the pipeline. ``TextSlice`` objects are passed between filters using pointers to avoid the overhead of copying a ``TextSlice``. 
:: void RunPipeline( int ntoken, FILE* input_file, FILE* output_file ) { oneapi::tbb::parallel_pipeline( ntoken, oneapi::tbb::make_filter<void,TextSlice*>( oneapi::tbb::filter_mode::serial_in_order, MyInputFunc(input_file) ) & oneapi::tbb::make_filter<TextSlice*,TextSlice*>( oneapi::tbb::filter_mode::parallel, MyTransformFunc() ) & oneapi::tbb::make_filter<TextSlice*,void>( oneapi::tbb::filter_mode::serial_in_order, MyOutputFunc(output_file) ) ); } The parameter ``ntoken`` to method ``parallel_pipeline`` controls the level of parallelism. Conceptually, tokens flow through the pipeline. In a serial in-order filter, each token must be processed serially in order. In a parallel filter, multiple tokens can be processed in parallel by the filter. If the number of tokens were unlimited, there might be a problem where the unordered filter in the middle keeps gaining tokens because the output filter cannot keep up. This situation typically leads to undesirable resource consumption by the middle filter. The first parameter to method ``parallel_pipeline`` specifies the maximum number of tokens that can be in flight. Once this limit is reached, the pipeline never creates a new token at the input filter until another token is destroyed at the output filter. The second parameter specifies the sequence of filters. Each filter is constructed by function ``make_filter<inputType,outputType>(mode,functor)``. - The *inputType* specifies the type of values input by a filter. For the input filter, the type is ``void``. - The *outputType* specifies the type of values output by a filter. For the output filter, the type is ``void``. - The *mode* specifies whether the filter processes items in parallel, serial in-order, or serial out-of-order. - The *functor* specifies how to produce an output value from an input value. The filters are concatenated with ``operator&``. When concatenating two filters, the *outputType* of the first filter must match the *inputType* of the second filter. The filters can be constructed and concatenated ahead of time. 
An equivalent version of the previous example that does this follows: :: void RunPipeline( int ntoken, FILE* input_file, FILE* output_file ) { oneapi::tbb::filter<void,TextSlice*> f1( oneapi::tbb::filter_mode::serial_in_order, MyInputFunc(input_file) ); oneapi::tbb::filter<TextSlice*,TextSlice*> f2(oneapi::tbb::filter_mode::parallel, MyTransformFunc() ); oneapi::tbb::filter<TextSlice*,void> f3(oneapi::tbb::filter_mode::serial_in_order, MyOutputFunc(output_file) ); oneapi::tbb::filter<void,void> f = f1 & f2 & f3; oneapi::tbb::parallel_pipeline(ntoken,f); } The input filter must be ``serial_in_order`` in this example because the filter reads chunks from a sequential file and the output filter must write the chunks in the same order. All ``serial_in_order`` filters process items in the same order. Thus if an item arrives at ``MyOutputFunc`` out of the order established by ``MyInputFunc``, the pipeline automatically delays invoking ``MyOutputFunc::operator()`` on the item until its predecessors are processed. There is another kind of serial filter, ``serial_out_of_order``, that does not preserve order. The middle filter operates on purely local data. Thus any number of invocations of its functor can run concurrently. Hence it is specified as a parallel filter. The functors for each filter are explained in detail now. The output functor is the simplest. All it has to do is write a ``TextSlice`` to a file and free the ``TextSlice``. :: // Functor that writes a TextSlice to a file. class MyOutputFunc { FILE* my_output_file; public: MyOutputFunc( FILE* output_file ); void operator()( TextSlice* item ) const; };   MyOutputFunc::MyOutputFunc( FILE* output_file ) : my_output_file(output_file) { }   void MyOutputFunc::operator()( TextSlice* out ) const { size_t n = fwrite( out->begin(), 1, out->size(), my_output_file ); if( n!=out->size() ) { fprintf(stderr,"Can't write into file '%s'\n", OutputFileName); exit(1); } out->free(); } Method ``operator()`` processes a ``TextSlice``. 
The parameter ``out`` points to the ``TextSlice`` to be processed. Since it is used for the last filter of the pipeline, it returns ``void``. The functor for the middle filter is similar, but a bit more complex. It returns a pointer to the ``TextSlice`` that it produces. :: // Functor that changes each decimal number to its square. class MyTransformFunc { public: TextSlice* operator()( TextSlice* input ) const; }; TextSlice* MyTransformFunc::operator()( TextSlice* input ) const { // Add terminating null so that strtol works right even if number is at end of the input. *input->end() = '\0'; char* p = input->begin(); TextSlice* out = TextSlice::allocate( 2*MAX_CHAR_PER_INPUT_SLICE ); char* q = out->begin(); for(;;) { while( p<input->end() && !isdigit(*p) ) *q++ = *p++; if( p==input->end() ) break; long x = strtol( p, &p, 10 ); // Note: no overflow checking is needed here, as we have twice the // input string length, but the square of a non-negative integer n // cannot have more than twice as many digits as n. long y = x*x; sprintf(q,"%ld",y); q = strchr(q,0); } out->set_end(q); input->free(); return out; } The input functor is the most complicated, because it has to ensure that no numeral crosses a boundary. When it finds what could be a numeral crossing into the next slice, it copies the partial numeral to the next slice. Furthermore, it has to indicate when the end of input is reached. It does this by invoking method ``stop()`` on a special argument of type ``flow_control``. This idiom is required for any functor used for the first filter of a pipeline. 
:: TextSlice* next_slice = NULL; class MyInputFunc { public: MyInputFunc( FILE* input_file_ ); MyInputFunc( const MyInputFunc& f ) : input_file(f.input_file) { } ~MyInputFunc(); TextSlice* operator()( oneapi::tbb::flow_control& fc ) const; private: FILE* input_file; }; MyInputFunc::MyInputFunc( FILE* input_file_ ) : input_file(input_file_) { } MyInputFunc::~MyInputFunc() { } TextSlice* MyInputFunc::operator()( oneapi::tbb::flow_control& fc ) const { // Read characters into space that is available in the next slice. if( !next_slice ) next_slice = TextSlice::allocate( MAX_CHAR_PER_INPUT_SLICE ); size_t m = next_slice->avail(); size_t n = fread( next_slice->end(), 1, m, input_file ); if( !n && next_slice->size()==0 ) { // No more characters to process fc.stop(); return NULL; } else { // Have more characters to process. TextSlice* t = next_slice; next_slice = TextSlice::allocate( MAX_CHAR_PER_INPUT_SLICE ); char* p = t->end()+n; if( n==m ) { // Might have read partial number. // If so, transfer characters of partial number to next slice. while( p>t->begin() && isdigit(p[-1]) ) --p; assert(p>t->begin(),"Number too large to fit in buffer.\n"); next_slice->append( p, t->end()+n ); } t->set_end(p); return t; } } The copy constructor must be defined because the functor is copied when the underlying ``oneapi::tbb::filter_t`` is built from the functor, and again when the pipeline runs. .. |image0| image:: Images/image010.jpg :width: 31px :height: 26px .. |image1| image:: Images/image010.jpg :width: 31px :height: 26px .. toctree:: :maxdepth: 4 ../tbb_userguide/Using_Circular_Buffers ../tbb_userguide/Throughput_of_pipeline ../tbb_userguide/Non-Linear_Pipelines ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/always_use_wait_for_all.rst ================================================ .. 
_always_use_wait_for_all: Always Use wait_for_all() ========================= One of the most common mistakes made in flow graph programming is to forget to call ``wait_for_all``. The function ``graph::wait_for_all`` blocks until all tasks spawned by the graph are complete. This is not only useful when you want to wait until the computation is done, but it is necessary to call ``wait_for_all`` before destroying the graph, or any of its nodes. For example, the following function will lead to a program failure: :: void no_wait_for_all() { graph g; function_node< int, int > f( g, 1, []( int i ) -> int { return spin_for(i); } ); f.try_put(1); // program will fail when f and g are destroyed at the // end of the scope, since the body of f is not complete } In the function above, the graph g and its node f are destroyed at the end of the function's scope. However, the task spawned to execute f's body is still in flight. When the task completes, it will look for any successors connected to its node, but by then both the graph and the node have been deleted out from underneath it. Placing a ``g.wait_for_all()`` at the end of the function prevents the premature destruction of the graph and node. If you use a flow graph and see mysterious behavior, check first to see that you have called ``wait_for_all``. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/appendix_A.rst ================================================ .. _appendix_A: Appendix A Costs of Time Slicing ================================ Time slicing enables there to be more logical threads than physical threads. Each logical thread is serviced for a *time slice* by a physical thread. If a thread runs longer than a time slice, as most do, it relinquishes the physical thread until it gets another turn. This appendix details the costs incurred by time slicing. The most obvious is the time for *context switching* between logical threads. 
Each context switch requires that the processor save all its registers for the previous logical thread that it was executing, and load its registers for the next logical thread that it runs. A more subtle cost is *cache cooling*. Processors keep recently accessed data in cache memory, which is very fast, but also relatively small compared to main memory. When the processor runs out of cache memory, it has to evict items from cache and put them back into main memory. Typically, it chooses the least recently used items in the cache. (The reality of set-associative caches is a bit more complicated, but this is not a cache primer.) When a logical thread gets its time slice, as it references a piece of data for the first time, this data will be pulled into cache, taking hundreds of cycles. If it is referenced frequently enough to not be evicted, each subsequent reference will find it in cache, and only take a few cycles. Such data is called "hot in cache". Time slicing undoes this, because if a thread A finishes its time slice, and subsequently thread B runs on the same physical thread, B will tend to evict data that was hot in cache for A, unless both threads need the data. When thread A gets its next time slice, it will need to reload evicted data, at the cost of hundreds of cycles for each cache miss. Or worse yet, the next time slice for thread A may be on a different physical thread that has a different cache altogether. Another cost is *lock preemption.* This happens if a thread acquires a lock on a resource, and its time slice runs out before it releases the lock. No matter how short a time the thread intended to hold the lock, it is now going to hold it for at least as long as it takes for its next turn at a time slice to come up. Any other threads waiting on the lock either pointlessly busy-wait, or lose the rest of their time slice. 
The effect is called *convoying*, because the threads end up "bumper to bumper" waiting for the preempted thread in front to resume driving. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/appendix_B.rst ================================================ .. _appendix_B: Appendix B Mixing With Other Threading Packages =============================================== |full_name| can be mixed with other threading packages. No special effort is required to use any part of oneTBB with other threading packages. Here is an example that parallelizes an outer loop with OpenMP and an inner loop with oneTBB. :: int M, N;   struct InnerBody { ... };   void TBB_NestedInOpenMP() { #pragma omp parallel { #pragma omp for for( int i=0; i<M; ++i ) { parallel_for( blocked_range<int>(0,N,10), InnerBody(i) ); } } } The details of ``InnerBody`` are omitted for brevity. The ``#pragma omp parallel`` causes OpenMP to create a team of threads, and each thread executes the block statement associated with the pragma. The ``#pragma omp for`` indicates that the compiler should use the previously created thread team to execute the loop in parallel. Here is the same example written using POSIX\* Threads. :: int M, N;   struct InnerBody { ... 
};   void* OuterLoopIteration( void* args ) { int i = (int)args; parallel_for( blocked_range(0,N,10), InnerBody(i) ); }   void TBB_NestedInPThreads() { std::vector id( M ); // Create thread for each outer loop iteration for( int i=0; i src( g, [&]( oneapi::tbb::flow_control& fc ) -> int { if ( src_count <= limit ) { return src_count++; } else { fc.stop(); return int(); } } ); src.activate(); function_node< int, int > f( g, unlimited, [&]( int i ) -> int { global_sum += i; // data race on global_sum return i; } ); make_edge( src, f ); g.wait_for_all(); cout << "global sum = " << global_sum << " and closed form = " << limit*(limit+1)/2 << "\n"; If you run the above example, it will likely calculate a global sum that is a bit smaller than the expected solution due to the data race. The data race could be avoided in this simple example by changing the allowed concurrency in ``f`` from unlimited to 1, forcing each value to be processed sequentially by ``f``. You may also note that the ``input_node`` also updates a global value, ``src_count``. However, since an ``input_node`` always executes serially, there is no race possible. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/broadcast_or_send.rst ================================================ .. _broadcast_or_send: Sending to One or Multiple Successors ===================================== An important characteristic of the predefined nodes is whether they push their output to a single successor or broadcast to all successors. The following predefined nodes push messages to a single successor: - ``buffer_node`` - ``queue_node`` - ``priority_queue_node`` - ``sequencer_node`` Other nodes push messages to all successors that will accept them. The nodes that push to only a single successor are all buffer nodes. Their purpose is to hold messages temporarily, until they are consumed downstream. 
Consider the example below: :: void use_buffer_and_two_nodes() { graph g; function_node< int, int, rejecting > f1( g, 1, []( int i ) -> int { spin_for(0.1); cout << "f1 consuming " << i << "\n"; return i; } ); function_node< int, int, rejecting > f2( g, 1, []( int i ) -> int { spin_for(0.2); cout << "f2 consuming " << i << "\n"; return i; } ); priority_queue_node< int > q(g); make_edge( q, f1 ); make_edge( q, f2 ); for ( int i = 10; i > 0; --i ) { q.try_put( i ); } g.wait_for_all(); } First, function_nodes by default queue up the messages they receive at their input. To make a ``priority_queue_node`` work properly with a ``function_node``, the example above constructs its ``function_nodes`` with its buffer policy set to rejecting. So, ``f1`` and ``f2`` do not internally buffer incoming messages, but instead rely on upstream buffering in the ``priority_queue_node``. In the above example, each message buffered by the ``priority_queue_node`` is sent to either ``f1`` or ``f2``, but not both. Let's consider the alternative behavior; that is; what if the ``priority_queue_node`` broadcasts to all successors. What if some, but not all, nodes accept a message? Should the message be buffered until all nodes accept it, or be only delivered to the accepting subset? If the node continues to buffer the message, should it eventually deliver the messages in the same order to all nodes or in the current priority order at the time the node accepts the next message? For example, assume a ``priority_queue_node`` only contains "9" when a successor node, ``f1``, accepts "9" but another successor node, ``f2``, rejects it. Later a value "100" arrives and ``f2`` is available to accept messages. Should ``f2`` receive "9" next or "100", which has a higher priority? In any case, trying to ensure that all successors receive each message creates a garbage collection problem and complicates reasoning. Therefore, these buffering nodes push each message to only one successor. 
And, you can use this characteristic to create useful graph structures such as the one shown in the graph above, where each message will be processed in priority order, by either ``f1`` or ``f2``. But what if you really do want both ``f1`` and ``f2`` to receive all of the values, and in priority order? You can easily create this behavior by creating one ``priority_queue_node`` for each ``function_node``, and pushing each value to both queues through a broadcast_node, as shown below: :: graph g; function_node< int, int, rejecting > f1( g, 1, []( int i ) -> int { spin_for(0.1); cout << "f1 consuming " << i << "\n"; return i; } ); function_node< int, int, rejecting > f2( g, 1, []( int i ) -> int { spin_for(0.2); cout << "f2 consuming " << i << "\n"; return i; } ); priority_queue_node< int > q1(g); priority_queue_node< int > q2(g); broadcast_node< int > b(g); make_edge( b, q1 ); make_edge( b, q2 ); make_edge( q1, f1 ); make_edge( q2, f2 ); for ( int i = 10; i > 0; --i ) { b.try_put( i ); } g.wait_for_all(); So, when connecting a node in your graph to multiple successors, be sure to understand whether the output will broadcast to all of the successors, or just a single successor. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/cancel_a_graph.rst ================================================ .. _cancel_a_graph: Cancel a Graph Explicitly ========================= To cancel a graph execution without an exception, you can create the graph using an explicit task_group_context, and then call ``cancel_group_execution()`` on that object. 
This is done in the example below: :: task_group_context t; graph g(t); function_node< int, int > f1( g, 1, []( int i ) { return i; } ); function_node< int, int > f2( g, 1, []( const int i ) -> int { cout << "Begin " << i << "\n"; spin_for(0.2); cout << "End " << i << "\n"; return i; } ); function_node< int, int > f3( g, 1, []( int i ) { return i; } ); make_edge( f1, f2 ); make_edge( f2, f3 ); f1.try_put(1); f1.try_put(2); spin_for(0.1); t.cancel_group_execution(); g.wait_for_all(); When a graph execution is canceled, any node that has already started to execute will execute to completion, but any node that has not started to execute will not start. So in the example above, f2 will print both the Begin and End message for input 1, but will not receive the input 2. You can also get the task_group_context that a node belongs to from within the node body and use it to cancel the execution of the graph it belongs to: :: graph g; function_node< int, int > f1( g, 1, []( int i ) { return i; } ); function_node< int, int > f2( g, 1, []( const int i ) -> int { cout << "Begin " << i << "\n"; spin_for(0.2); cout << "End " << i << "\n"; task::self().group()->cancel_group_execution(); return i; } ); function_node< int, int > f3( g, 1, []( int i ) { return i; } ); make_edge( f1, f2 ); make_edge( f2, f3 ); f1.try_put(1); f1.try_put(2); g.wait_for_all(); You can get the ``task_group_context`` from a node's body even if the graph was not explicitly passed one at construction time. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/cancelling_nested_parallelism.rst ================================================ .. _cancelling_nested_parallelism: Canceling Nested Parallelism ============================ Nested parallelism is canceled if the inner context is bound to the outer context; otherwise it is not. 
If the execution of a flow graph is canceled, either explicitly or due to an exception, any tasks started by parallel algorithms or flow graphs nested within the nodes of the canceled flow graph may or may not be canceled. As with all of the library's nested parallelism, you can control cancellation relationships by use of explicit ``task_group_context`` objects. If you do not provide an explicit ``task_group_context`` to a flow graph, it is created with an isolated context by default. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/catching_exceptions.rst ================================================ .. _catching_exceptions: Catching Exceptions Inside the Node that Throws the Exception ============================================================= If you catch an exception within the node's body, execution continues normally, as you might expect. If an exception is thrown but is not caught before it propagates beyond the node's body, the execution of all of the graph's nodes is canceled and the exception is rethrown at the call site of ``graph::wait_for_all()``. Take the graph below as an example: :: graph g; function_node< int, int > f1( g, 1, []( int i ) { return i; } ); function_node< int, int > f2( g, 1, []( const int i ) -> int { throw i; return i; } ); function_node< int, int > f3( g, 1, []( int i ) { return i; } ); make_edge( f1, f2 ); make_edge( f2, f3 ); f1.try_put(1); f1.try_put(2); g.wait_for_all(); In the code above, the second function_node, f2, throws an exception that is not caught within the body. This will cause the execution of the graph to be canceled and the exception to be rethrown at the call to g.wait_for_all(). Since it is not handled there either, the program will terminate.
If desirable, the exception could be caught and handled within the body: :: function_node< int, int > f2( g, 1, []( const int i ) -> int { try { throw i; } catch (int j) { cout << "Caught " << j << "\n"; } return i; } ); If the exception is caught and handled in the body, then there is no effect on the overall execution of the graph. However, you could choose instead to catch the exception at the call to wait_for_all: :: try { g.wait_for_all(); } catch ( int j ) { cout << "Caught " << j << "\n"; } In this case, the execution of the graph is canceled. For our example, this means that the input 1 never reaches f3 and that input 2 never reaches either f2 or f3. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/communicate_with_nodes.rst ================================================ .. _communicate_with_nodes: Communication Between Graphs ============================ All graph nodes require a reference to a graph object as one of the arguments to their constructor. It is only safe to construct edges between nodes that are part of the same graph. An edge expresses the topology of your graph to the runtime library. Connecting two nodes in different graphs can make it difficult to reason about whole graph operations, such as calls to ``graph::wait_for_all`` and exception handling. To optimize performance, the library may make calls to a node's predecessor or successor at times that are unexpected by the user. If two graphs must communicate, do NOT create an edge between them, but instead use explicit calls to try_put. This will prevent the runtime library from making any assumptions about the relationship of the two nodes, and therefore make it easier to reason about events that cross the graph boundaries. However, it may still be difficult to reason about whole graph operations. 
For example, consider the graphs below: :: graph g; function_node< int, int > n1( g, 1, [](int i) -> int { cout << "n1\n"; spin_for(i); return i; } ); function_node< int, int > n2( g, 1, [](int i) -> int { cout << "n2\n"; spin_for(i); return i; } ); make_edge( n1, n2 ); graph g2; function_node< int, int > m1( g2, 1, [](int i) -> int { cout << "m1\n"; spin_for(i); return i; } ); function_node< int, int > m2( g2, 1, [&](int i) -> int { cout << "m2\n"; spin_for(i); n1.try_put(i); return i; } ); make_edge( m1, m2 ); m1.try_put( 1 ); // The following call returns immediately: g.wait_for_all(); // The following call returns after m1 & m2 g2.wait_for_all(); // we reach here before n1 & n2 are finished // even though wait_for_all was called on both graphs In the example above, ``m1.try_put(1)`` sends a message to node m1, which runs its body and then sends a message to node ``m2``. Next, node ``m2`` runs its body and sends a message to ``n1`` using an explicit ``try_put``. In turn, ``n1`` runs its body and sends a message to n2. The runtime library does not consider ``m2`` to be a predecessor of ``n1`` since no edge exists. If you want to wait until all of the tasks spawned by these graphs are done, you need to call the function ``wait_for_all`` on both graphs. However, because there is cross-graph communication, the order of the calls is important. In the (incorrect) code segment above, the first call to ``g.wait_for_all()`` returns immediately because there are no tasks yet active in ``g``; the only tasks that have been spawned by then belong to ``g2``. The call to ``g2.wait_for_all`` returns after both ``m1`` and ``m2`` are done, since they belong to ``g2``; the call does not however wait for ``n1`` and ``n2``, since they belong to ``g``. The end of this code segment is therefore reached before ``n1`` and ``n2`` are done. 
If the calls to ``wait_for_all`` are swapped, the code works as expected: :: g2.wait_for_all(); g.wait_for_all(); // all tasks are done While it is not too difficult to reason about how these two very small graphs interact, the interaction of two larger graphs, perhaps with cycles, will be more difficult to understand. Therefore, communication between nodes in different graphs should be done with caution. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/concurrent_hash_map.rst ================================================ .. _concurrent_hash_map: concurrent_hash_map =================== A ``concurrent_hash_map`` is a hash table that permits concurrent accesses. The table is a map from a key to a type ``T``. The traits type HashCompare defines how to hash a key and how to compare two keys. The following example builds a ``concurrent_hash_map`` where the keys are strings and the corresponding data is the number of times each string occurs in the array ``Data``. :: #include "oneapi/tbb/concurrent_hash_map.h" #include "oneapi/tbb/blocked_range.h" #include "oneapi/tbb/parallel_for.h" #include   using namespace oneapi::tbb; using namespace std;   // Structure that defines hashing and comparison operations for user's type. struct MyHashCompare { size_t hash( const string& x ) const { size_t h = 0; for( const char* s = x.c_str(); *s; ++s ) h = (h*17)^*s; return h; } //! True if strings are equal bool equal( const string& x, const string& y ) const { return x==y; } };   // A concurrent hash table that maps strings to ints. typedef concurrent_hash_map StringTable;   // Function object for counting occurrences of strings. 
struct Tally { StringTable& table; Tally( StringTable& table_ ) : table(table_) {} void operator()( const blocked_range<string*> range ) const { for( string* p=range.begin(); p!=range.end(); ++p ) { StringTable::accessor a; table.insert( a, *p ); a->second += 1; } } };   const size_t N = 1000000;   string Data[N];   void CountOccurrences() { // Construct empty table. StringTable table;   // Put occurrences into the table parallel_for( blocked_range<string*>( Data, Data+N, 1000 ), Tally(table) );   // Display the occurrences for( StringTable::iterator i=table.begin(); i!=table.end(); ++i ) printf("%s %d\n",i->first.c_str(),i->second); } A ``concurrent_hash_map`` acts as a container of elements of type ``std::pair<const Key,T>``. Typically, when accessing a container element, you are interested in either updating it or reading it. The template class ``concurrent_hash_map`` supports these two purposes respectively with the classes ``accessor`` and ``const_accessor`` that act as smart pointers. An *accessor* represents *update* (*write*) access. As long as it points to an element, all other attempts to look up that key in the table block until the ``accessor`` is done. A ``const_accessor`` is similar, except that it represents *read-only* access. Multiple ``const_accessor`` objects can point to the same element at the same time. This feature can greatly improve concurrency in situations where elements are frequently read and infrequently updated. The methods ``find`` and ``insert`` take an ``accessor`` or ``const_accessor`` as an argument. The choice tells ``concurrent_hash_map`` whether you are asking for *update* or *read-only* access. Once the method returns, the access lasts until the ``accessor`` or ``const_accessor`` is destroyed. Because having access to an element can block other threads, try to shorten the lifetime of the ``accessor`` or ``const_accessor``. To do so, declare it in the innermost block possible. To release access even sooner than the end of the block, use method ``release``.
The following example is a rework of the loop body that uses ``release`` instead of depending upon destruction to end the access: :: StringTable::accessor a; for( string* p=range.begin(); p!=range.end(); ++p ) { table.insert( a, *p ); a->second += 1; a.release(); } The method ``erase(key)`` can also operate concurrently. It implicitly requests write access. Therefore before removing the key, it waits on any other extant accesses on ``key``. .. toctree:: :maxdepth: 4 ../tbb_userguide/More_on_HashCompare ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/concurrent_vector_ug.rst ================================================ .. _concurrent_vector_ug: concurrent_vector ================= A ``concurrent_vector<T>`` is a dynamically growable array of ``T``. It is safe to grow a ``concurrent_vector`` while other threads are also operating on elements of it, or even growing it themselves. For safe concurrent growing, ``concurrent_vector`` has three methods that support common uses of dynamic arrays: ``push_back``, ``grow_by``, and ``grow_to_at_least``. Method ``push_back(x)`` safely appends x to the array. Method ``grow_by(n)`` safely appends ``n`` consecutive elements initialized with ``T()``. Both methods return an iterator pointing to the first appended element. Each element is initialized with ``T()``. So for example, the following routine safely appends a C string to a shared vector: :: void Append( concurrent_vector<char>& vector, const char* string ) { size_t n = strlen(string)+1; std::copy( string, string+n, vector.grow_by(n) ); } The related method ``grow_to_at_least(n)`` grows a vector to size ``n`` if it is shorter. Concurrent calls to the growth methods do not necessarily return in the order that elements are appended to the vector.
Method ``size()`` returns the number of elements in the vector, which may include elements that are still undergoing concurrent construction by methods ``push_back``, ``grow_by``, or ``grow_to_at_least``. The example uses ``std::copy`` and iterators, not ``strcpy`` and pointers, because elements in a ``concurrent_vector`` might not be at consecutive addresses. It is safe to use the iterators while the ``concurrent_vector`` is being grown, as long as the iterators never go past the current value of ``end()``. However, the iterator may reference an element undergoing concurrent construction. You must synchronize construction and access. A ``concurrent_vector`` never moves an element until the array is cleared, which can be an advantage over the STL ``std::vector`` even for single-threaded code. However, ``concurrent_vector`` does have more overhead than ``std::vector``. Use ``concurrent_vector`` only if you really need the ability to dynamically resize it while other accesses are (or might be) in flight, or require that an element never move. .. CAUTION:: Operations on ``concurrent_vector`` are concurrency safe with respect to *growing*, not for clearing or destroying a vector. Never invoke method ``clear()`` if there are other operations in flight on the ``concurrent_vector``. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/create_token_based_system.rst ================================================ .. _create_token_based_system: Create a Token-Based System =========================== A more flexible solution to limit the number of messages in a flow graph is to use tokens. In a token-based system, a limited number of tokens are available in the graph and a message will not be allowed to enter the graph until it can be paired with an available token. When a message is retired from the graph, its token is released, and can be paired with a new message that will then be allowed to enter.
The ``oneapi::tbb::parallel_pipeline`` algorithm relies on a token-based system. In the flow graph interface, there is no explicit support for tokens, but ``join_node`` can be used to create an analogous system. A ``join_node`` has two template arguments, the tuple that describes the types of its inputs and a buffer policy: :: template<typename OutputTuple, class JoinPolicy = queueing> class join_node; The buffer policy can be one of the following: - ``queueing``. This type of policy causes inputs to be matched first-in-first-out; that is, the inputs are joined together to form a tuple in the order they are received. - ``tag_matching``. This type of policy joins inputs together that have matching tags. - ``reserving``. This type of policy causes the ``join_node`` to do no internal buffering, but instead to consume inputs only when it can first reserve an input on each port from an upstream source. If it can reserve an input at each port, it gets those inputs and joins those together to form an output tuple. A token-based system can be created by using reserving join_nodes. In the example below, there is an ``input_node`` that generates ``M`` big objects and a ``buffer_node`` that is pre-filled with three tokens. The ``token_t`` can be anything, for example it could be ``typedef int token_t;``. The ``input_node`` and ``buffer_node`` are connected to a reserving ``join_node``. The ``input_node`` will only generate an input when one is pulled from it by the reserving ``join_node``, and the reserving ``join_node`` will only pull the input from the ``input_node`` when it knows there is also an item to pull from the ``buffer_node``.
:: graph g; int src_count = 0; int number_of_objects = 0; int max_objects = 3; input_node< big_object * > s( g, [&]( oneapi::tbb::flow_control& fc ) -> big_object* { if ( src_count < M ) { big_object* v = new big_object(); ++src_count; return v; } else { fc.stop(); return nullptr; } } ); s.activate(); join_node< tuple_t, reserving > j(g); buffer_node< token_t > b(g); function_node< tuple_t, token_t > f( g, unlimited, []( const tuple_t &t ) -> token_t { spin_for(1); cout << get<1>(t) << "\n"; delete get<0>(t); return get<1>(t); } ); make_edge( s, input_port<0>(j) ); make_edge( b, input_port<1>(j) ); make_edge( j, f ); make_edge( f, b ); b.try_put( 1 ); b.try_put( 2 ); b.try_put( 3 ); g.wait_for_all(); In the above code, you can see that the ``function_node`` returns the token back to the ``buffer_node``. This cycle in the flow graph allows the token to be recycled and paired with another input from the ``input_node``. So like in the previous sections, there will be at most four big objects in the graph. There could be three big objects in the ``function_node`` and one buffered in the ``input_node``, awaiting a token to be paired with. Since there is no specific ``token_t`` defined for the flow graph, you can use any type for a token, including objects or pointers to arrays. Therefore, unlike in the example above, the ``token_t`` doesn't need to be a dummy type; it could for example be a buffer or other object that is essential to the computation. We could, for example, modify the example above to use the big objects themselves as the tokens, removing the need to repeatedly allocate and deallocate them, and essentially create a free list of big objects using a cycle back to the ``buffer_node``. Also, in our example above, the ``buffer_node`` was prefilled by a fixed number of explicit calls to ``try_put``, but there are other options. For example, an ``input_node`` could be attached to the input of the ``buffer_node``, and it could generate the tokens. 
In addition, our ``function_node`` could be replaced by a ``multifunction_node`` that can optionally put 0 or more outputs to each of its output ports. Using a ``multifunction_node``, you can choose to recycle or not recycle a token, or even generate more tokens, thereby increasing or decreasing the allowed concurrency in the graph. A token based system is therefore very flexible. You are free to declare the token to be of any type and to inject or remove tokens from the system as it is executing, thereby having dynamic control of the allowed concurrency in the system. Since you can pair the token with an input at the source, this approach enables you to limit resource consumption across the entire graph. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/design_patterns/Agglomeration.rst ================================================ .. _Agglomeration: Agglomeration ============= .. container:: section .. rubric:: Problem :class: sectiontitle Parallelism is so fine grained that overhead of parallel scheduling or communication swamps the useful work. .. container:: section .. rubric:: Context :class: sectiontitle Many algorithms permit parallelism at a very fine grain, on the order of a few instructions per task. But synchronization between threads usually requires orders of magnitude more cycles. For example, elementwise addition of two arrays can be done fully in parallel, but if each scalar addition is scheduled as a separate task, most of the time will be spent doing synchronization instead of useful addition. .. container:: section .. rubric:: Forces :class: sectiontitle - Individual computations can be done in parallel, but are small. For practical use of |full_name|, "small" here means less than 10,000 clock cycles. - The parallelism is for sake of performance and not required for semantic reasons. .. container:: section .. rubric:: Solution :class: sectiontitle Group the computations into blocks. 
Evaluate computations within a block serially. The block size should be chosen to be large enough to amortize parallel overhead. Too large a block size may limit parallelism or load balancing because the number of blocks becomes too small to distribute work evenly across processors. The choice of block topology is typically driven by two concerns: - Minimizing synchronization between blocks. - Minimizing cache traffic between blocks. If the computations are completely independent, then the blocks will be independent too, and then only cache traffic issues must be considered. If the loop is "small", on the order of less than 10,000 clock cycles, then it may be impractical to parallelize at all, because the optimal agglomeration might be a single block. .. container:: section .. rubric:: Examples :class: sectiontitle TBB loop templates such as ``oneapi::tbb::parallel_for`` that take a *range* argument support automatic agglomeration. When agglomerating, think about cache effects. Avoid having cache lines cross between groups if possible. There may be boundary-to-interior ratio effects. For example, if the computations form a 2D grid, and communicate only with nearest neighbors, then the computation per block grows quadratically (with the block's area), but the cross-block communication grows linearly (with the block's perimeter). The following figure shows four different ways to agglomerate an 8×8 grid. If doing such analysis, be careful to consider that information is transferred in cache line units. For a given area, the perimeter may be minimized when the block is square with respect to the underlying grid of cache lines, not square with respect to the logical grid. .. container:: fignone :name: fig1 Four different agglomerations of an 8×8 grid. |image0| Also consider vectorization. Blocks that contain long contiguous subsets of data may better enable vectorization.
For recursive computations, most of the work is towards the leaves, so the solution is to treat subtrees as groups, as shown in the following figure. .. container:: fignone :name: fig2 Agglomeration of a recursive computation |image1| Often such an agglomeration is achieved by recursing serially once some threshold is reached. For example, a recursive sort might solve sub-problems in parallel only if they are above a certain threshold size. .. container:: section .. rubric:: Reference :class: sectiontitle Ian Foster introduced the term "agglomeration" in his book Designing and Building Parallel Programs http://www.mcs.anl.gov/~itf/dbpp. There, agglomeration is part of a four-step **PCAM** design method: #. **P**\ artitioning – break the program into the smallest tasks possible. #. **C**\ ommunication – figure out what communication is required between tasks. When using oneTBB, communication is usually cache line transfers. Though they are automatic, understanding which ones happen between tasks helps guide the agglomeration step. #. **A**\ gglomeration – combine tasks into larger tasks. His book has an extensive list of considerations that is worth reading. #. **M**\ apping – map tasks onto processors. The oneTBB task scheduler does this step for you. .. |image0| image:: Images/image002a.jpg :width: 301px :height: 293px .. |image1| image:: Images/image003a.jpg :width: 291px :height: 150px ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/design_patterns/Design_Patterns.rst ================================================ .. _design_patterns: Design Patterns =============== This section provides some common parallel programming patterns and how to implement them in |full_name|. The description of each pattern has the following format: - **Problem** – describes the problem to be solved. - **Context** – describes contexts in which the problem arises. - **Forces** – considerations that drive use of the pattern.
- **Solution** - describes how to implement the pattern. - **Example** – presents an example implementation. Variations and examples are sometimes discussed. The code examples are intended to emphasize key points and are not full-fledged code. Examples may omit obvious const overloads of non-const methods. Much of the nomenclature and examples are adapted from Web pages created by Eun-Gyu and Marc Snir, and the Berkeley parallel patterns wiki. See links in the **General References** section. For brevity, some of the code examples use C++11 lambda expressions. It is straightforward, albeit sometimes tedious, to translate such lambda expressions into equivalent C++03 code. .. toctree:: :maxdepth: 4 ../../tbb_userguide/design_patterns/Agglomeration ../../tbb_userguide/design_patterns/Elementwise ../../tbb_userguide/design_patterns/Odd-Even_Communication ../../tbb_userguide/design_patterns/Wavefront ../../tbb_userguide/design_patterns/Reduction ../../tbb_userguide/design_patterns/Divide_and_Conquer ../../tbb_userguide/design_patterns/GUI_Thread ../../tbb_userguide/design_patterns/Non-Preemptive_Priorities ../../tbb_userguide/design_patterns/Lazy_Initialization ../../tbb_userguide/design_patterns/Local_Serializer ../../tbb_userguide/design_patterns/Fenced_Data_Transfer ../../tbb_userguide/design_patterns/Reference_Counting ../../tbb_userguide/design_patterns/General_References ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/design_patterns/Divide_and_Conquer.rst ================================================ .. _Divide_and_Conquer: Divide and Conquer ================== .. container:: section .. rubric:: Problem :class: sectiontitle Parallelize a divide and conquer algorithm. .. container:: section .. rubric:: Context :class: sectiontitle Divide and conquer is widely used in serial algorithms. Common examples are quicksort and mergesort. .. container:: section .. 
rubric:: Forces :class: sectiontitle - Problem can be transformed into subproblems that can be solved independently. - Splitting problem or merging solutions is relatively cheap compared to cost of solving the subproblems. .. container:: section .. rubric:: Solution :class: sectiontitle There are several ways to implement divide and conquer in |full_name|. The best choice depends upon circumstances. - If division always yields the same number of subproblems, use recursion and ``oneapi::tbb::parallel_invoke``. - If the number of subproblems varies, use recursion and ``oneapi::tbb::task_group``. .. container:: section .. rubric:: Example :class: sectiontitle Quicksort is a classic divide-and-conquer algorithm. It divides a sorting problem into two subsorts. A simple serial version looks like [1]_. :: void SerialQuicksort( T* begin, T* end ) { if( end-begin>1 ) { using namespace std; T* mid = partition( begin+1, end, bind2nd(less(),*begin) ); swap( *begin, mid[-1] ); SerialQuicksort( begin, mid-1 ); SerialQuicksort( mid, end ); } } The number of subsorts is fixed at two, so ``oneapi::tbb::parallel_invoke`` provides a simple way to parallelize it. The parallel code is shown below: :: void ParallelQuicksort( T* begin, T* end ) { if( end-begin>1 ) { using namespace std; T* mid = partition( begin+1, end, bind2nd(less(),*begin) ); swap( *begin, mid[-1] ); oneapi::tbb::parallel_invoke( [=]{ParallelQuicksort( begin, mid-1 );}, [=]{ParallelQuicksort( mid, end );} ); } } Eventually the subsorts become small enough that serial execution is more efficient. The following variation, does sorts of less than 500 elements using the earlier serial code. 
:: void ParallelQuicksort( T* begin, T* end ) { if( end-begin>=500 ) { using namespace std; T* mid = partition( begin+1, end, bind2nd(less(),*begin) ); swap( *begin, mid[-1] ); oneapi::tbb::parallel_invoke( [=]{ParallelQuicksort( begin, mid-1 );}, [=]{ParallelQuicksort( mid, end );} ); } else { SerialQuicksort( begin, end ); } } The change is an instance of the Agglomeration pattern. The next example considers a problem where there are a variable number of subproblems. The problem involves a tree-like description of a mechanical assembly. There are two kinds of nodes: - Leaf nodes represent individual parts. - Internal nodes represent groups of parts. The problem is to find all nodes that collide with a target node. The following code shows a serial solution that walks the tree. It records in ``Hits`` any nodes that collide with ``Target``. :: std::list Hits; Node* Target;   void SerialFindCollisions( Node& x ) { if( x.is_leaf() ) { if( x.collides_with( *Target ) ) Hits.push_back(&x); } else { for( Node::const_iterator y=x.begin();y!=x.end(); ++y ) SerialFindCollisions(*y); } } A parallel version is shown below. :: typedef oneapi::tbb::enumerable_thread_specific > LocalList; LocalList LocalHits; Node* Target; // Target node   void ParallelWalk( Node& x ) { if( x.is_leaf() ) { if( x.collides_with( *Target ) ) LocalHits.local().push_back(&x); } else { // Recurse on each child y of x in parallel oneapi::tbb::task_group g; for( Node::const_iterator y=x.begin(); y!=x.end(); ++y ) g.run( [=]{ParallelWalk(*y);} ); // Wait for recursive calls to complete g.wait(); } }   void ParallelFindCollisions( Node& x ) { ParallelWalk(x); for(LocalList::iterator i=LocalHits.begin();i!=LocalHits.end(); ++i) Hits.splice( Hits.end(), *i ); } The recursive walk is parallelized using class ``task_group`` to do recursive calls in parallel. There is another significant change because of the parallelism that is introduced. 
Because it would be unsafe to update ``Hits`` concurrently, the parallel walk uses variable ``LocalHits`` to accumulate results. Because it is of type ``enumerable_thread_specific``, each thread accumulates its own private result. The results are spliced together into Hits after the walk completes. The results will *not* be in the same order as the original serial code. If parallel overhead is high, use the agglomeration pattern. For example, use the serial walk for subtrees under a certain threshold. .. [1] Production quality quicksort implementations typically use more sophisticated pivot selection, explicit stacks instead of recursion, and some other sorting algorithm for small subsorts. The simple algorithm is used here to focus on exposition of the parallel pattern. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/design_patterns/Elementwise.rst ================================================ .. _Elementwise: Elementwise =========== .. container:: section .. rubric:: Problem :class: sectiontitle Initiate similar independent computations across items in a data set, and wait until all complete. .. container:: section .. rubric:: Context :class: sectiontitle Many serial algorithms sweep over a set of items and do an independent computation on each item. However, if some kind of summary information is collected, use the Reduction pattern instead. .. container:: section .. rubric:: Forces :class: sectiontitle No information is carried or merged between the computations. .. container:: section .. rubric:: Solution :class: sectiontitle If the number of items is known in advance, use ``oneapi::tbb::parallel_for``. If not, consider using ``oneapi::tbb::parallel_for_each``. Use agglomeration if the individual computations are small relative to scheduler overheads. 
If the pattern is followed by a reduction on the same data, consider doing the element-wise operation as part of the reduction, so that the combination of the two patterns is accomplished in a single sweep instead of two sweeps. Doing so may improve performance by reducing traffic through the memory hierarchy. .. container:: section .. rubric:: Example :class: sectiontitle Convolution is often used in signal processing. The convolution of a filter ``c`` and signal ``x`` is computed as: |image0| Serial code for this computation might look like: :: // Assumes c[0..clen-1] and x[1-clen..xlen-1] are defined for( int i=0; i(0,xlen+clen-1,1000), [=]( oneapi::tbb::blocked_range r ) { int end = r.end(); for( int i=r.begin(); i!=end; ++i ) { float tmp = 0; for( int j=0; j`` for the flag that indicates when the message is ready. Here is the previous example with modifications. :: std::atomic Ready; std::string Message;   void Send( const std::string& src ) {. // Executed by thread 1 Message=src; Ready.store(true, std::memory_order_release); }   bool Receive( std::string& dst ) { // Executed by thread 2 bool result = Ready.load(std::memory_order_acquire); if( result ) dst=Message; return result; // Return true if message was received. } A write to a ``std::atomic`` value has *release* semantics, which means that all of its prior writes will be seen before the releasing write. A read from ``std::atomic`` value has *acquire* semantics, which means that all of its subsequent reads will happen after the acquiring read. The implementation of ``std::atomic`` ensures that both the compiler and the hardware observe these ordering constraints. .. container:: section .. rubric:: Variations :class: sectiontitle Higher level synchronization constructs normally include the necessary *acquire* and *release* fences. For example, mutexes are normally implemented such that acquisition of a lock has *acquire* semantics and release of a lock has *release* semantics. 
Thus a thread that acquires a lock on a mutex always sees any memory writes done by another thread before it released a lock on that mutex. .. container:: section .. rubric:: Non Solutions :class: sectiontitle Mistaken solutions are so often proposed that it is worth understanding why they are wrong. One common mistake is to assume that declaring the flag with the ``volatile`` keyword solves the problem. Though the ``volatile`` keyword forces a write to happen immediately, it generally has no effect on the visible ordering of that write with respect to other memory operations. Another mistake is to assume that conditionally executed code cannot happen before the condition is tested. However, the compiler or hardware may speculatively hoist the conditional code above the condition. Similarly, it is a mistake to assume that a processor cannot read the target of a pointer before reading the pointer. A modern processor does not read individual values from main memory. It reads cache lines. The target of a pointer may be in a cache line that has already been read before the pointer was read, thus giving the appearance that the processor presciently read the pointer target. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/design_patterns/GUI_Thread.rst ================================================ .. _GUI_Thread: GUI Thread ========== .. container:: section .. rubric:: Problem :class: sectiontitle A user interface thread must remain responsive to user requests, and must not get bogged down in long computations. .. container:: section .. rubric:: Context :class: sectiontitle Graphical user interfaces often have a dedicated thread ("GUI thread") for servicing user interactions. The thread must remain responsive to user requests even while the application has long computations running. For example, the user might want to press a "cancel" button to stop the long running computation. 
If the GUI thread takes part in the long running computation, it will not be able to respond to user requests. .. container:: section .. rubric:: Forces :class: sectiontitle - The GUI thread services an event loop. - The GUI thread needs to offload work onto other threads without waiting for the work to complete. - The GUI thread must be responsive to the event loop and not become dedicated to doing the offloaded work. .. container:: section .. rubric:: Related :class: sectiontitle - Non-Preemptive Priorities - Local Serializer .. container:: section .. rubric:: Solution :class: sectiontitle The GUI thread offloads the work by firing off a task to do it using method ``task_arena::enqueue`` of a ``task_arena`` instance. When finished, the task posts an event to the GUI thread to indicate that the work is done. The semantics of ``enqueue`` cause the task to eventually run on a worker thread distinct from the calling thread. The following figure sketches the communication paths. Items in black are executed by the GUI thread; items in blue are executed by another thread. |image0| .. container:: section .. rubric:: Example :class: sectiontitle The example is for the Microsoft Windows\* operating systems, though similar principles apply to any GUI using an event loop idiom. For each event, the GUI thread calls a user-defined function ``WndProc`` to process an event. :: // Event posted from enqueued task when it finishes its work. const UINT WM_POP_FOO = WM_USER+0; // Queue for transmitting results from enqueued task to GUI thread. oneapi::tbb::concurrent_queueResultQueue; // GUI thread's private copy of most recently computed result. Foo CurrentResult;   LRESULT CALLBACK WndProc(HWND hWnd, UINT msg, WPARAM wParam, LPARAM lParam) { switch(msg) { case WM_COMMAND: switch (LOWORD(wParam)) { case IDM_LONGRUNNINGWORK: // User requested a long computation. Delegate it to another thread. 
LaunchLongRunningWork(hWnd); break; case IDM_EXIT: DestroyWindow(hWnd); break; default: return DefWindowProc(hWnd, msg, wParam, lParam); } break; case WM_POP_FOO: // There is another result in ResultQueue for me to grab. ResultQueue.try_pop(CurrentResult); // Update the window with the latest result. RedrawWindow( hWnd, NULL, NULL, RDW_ERASE|RDW_INVALIDATE ); break; case WM_PAINT: Repaint the window using CurrentResult break; case WM_DESTROY: PostQuitMessage(0); break; default: return DefWindowProc( hWnd, msg, wParam, lParam ); } return 0; } The GUI thread processes long computations as follows: #. The GUI thread calls ``LongRunningWork``, which hands off the work to a worker thread and immediately returns. #. The GUI thread continues servicing the event loop. If it has to repaint the window, it uses the value of\ ``CurrentResult``, which is the most recent ``Foo`` that it has seen. When a worker finishes the long computation, it pushes the result into ResultQueue, and sends a message WM_POP_FOO to the GUI thread. #. The GUI thread services a ``WM_POP_FOO`` message by popping an item from ResultQueue into CurrentResult. The ``try_pop`` always succeeds because there is exactly one ``WM_POP_FOO`` message for each item in ``ResultQueue``. Routine ``LaunchLongRunningWork`` creates a function task and launches it using method ``task_arena::enqueue``. :: class LongTask { HWND hWnd; void operator()() { Do long computation Foo x = result of long computation ResultQueue.push( x ); // Notify GUI thread that result is available. PostMessage(hWnd,WM_POP_FOO,0,0); } public: LongTask( HWND hWnd_ ) : hWnd(hWnd_) {} }; void LaunchLongRunningWork( HWND hWnd ) { oneapi::tbb::task_arena a; a.enqueue(LongTask(hWnd)); } It is essential to use method ``task_arena::enqueue`` here. Even though, an explicit ``task_arena`` instance is created, the method ``enqueue`` ensures that the function task eventually executes when resources permit, even if no thread explicitly waits on the task. 
In contrast, ``oneapi::tbb::task_group::run`` may postpone execution of the function task until it is explicitly waited upon with the ``oneapi::tbb::task_group::wait``. The example uses a ``concurrent_queue`` for workers to communicate results back to the GUI thread. Since only the most recent result matters in the example, and alternative would be to use a shared variable protected by a mutex. However, doing so would block the worker while the GUI thread was holding a lock on the mutex, and vice versa. Using ``concurrent_queue`` provides a simple robust solution. If two long computations are in flight, there is a chance that the first computation completes after the second one. If displaying the result of the most recently requested computation is important, then associate a request serial number with the computation. The GUI thread can pop from ``ResultQueue`` into a temporary variable, check the serial number, and update ``CurrentResult`` only if doing so advances the serial number. .. |image0| image:: Images/image007a.jpg :width: 400px :height: 150px ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/design_patterns/General_References.rst ================================================ .. _General_References: General References ================== This section lists general references. References specific to a pattern are listed at the end of the topic for the pattern. - E. Gamma, R. Helm, R. Johnson, J. Vlissides. Design Patterns (1995) - `Berkeley Pattern Language for Parallel Programming `_ - T. Mattson, B. Sanders, B. Massingill. Patterns for Parallel Programming (2005) - `ParaPLoP 2009 `_ - `ParaPLoP 2010 `_ - Eun-Gyu Kim and Marc Snir, `Parallel Programming Patterns `_ ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/design_patterns/Lazy_Initialization.rst ================================================ .. 
_Lazy_Initialization: Lazy Initialization ==================== .. container:: section .. rubric:: Problem :class: sectiontitle Delay the creation of an object, potentially expensive, until it is accessed. In parallel programming, initialization must also be guarded against race conditions. .. container:: section .. rubric:: Context :class: sectiontitle The cost of operations that take place during the initialization of the object may be considerably high. In that case, the object should be initialized only when needed. Lazy initialization is the common tactic that allows implementing such an approach. .. container:: section .. rubric:: Solution :class: sectiontitle Using ``oneapi::tbb::collaborative_call_once`` with ``oneapi::tbb::collaborative_once_flag`` helps to implement thread-safe lazy initialization for a user object. In addition, ``collaborative_call_once`` allows other thread blocked on the same ``collaborative_once_flag`` to join other |short_name| parallel constructions called within the initializing function. .. container:: section .. rubric:: Example :class: sectiontitle This example illustrates the implementation of lazy initialization for the calculation of the Fibonacci numbers. Here is a graphical representation of the Fibonacci recursion tree for N=4. |image0| As seen in the diagram, some elements are recalculated more than once. These operations are redundant, so the "lazy initialized" Fibonacci numbers are relevant here. An implementation without the use of lazy initialization would have *O(2^N)* time complexity due to the full recursion tree traversal and recalculation of values. Since all the nodes are traversed once, the tree becomes a list, making the time complexity *O(N)*. |image1| Here you can see the code for the implementation. Already calculated values are stored in a buffer paired with ``collaborative_once_flag`` and will not be recalculated when ``collaborative_call_once`` is invoked when initialization has already been done. 
:: using FibBuffer = std::vector>; std::uint64_t LazyFibHelper(int n, FibBuffer& buffer) { // Base case if (n <= 1) { return n; } // Calculate nth value only once and store it in the buffer. // Other threads won't be blocked on already taken collaborative_once_flag // but join parallelism inside functor oneapi::tbb::collaborative_call_once(buffer[n].first, [&]() { std::uint64_t a, b; oneapi::tbb::parallel_invoke([&] { a = LazyFibHelper(n - 2, buffer); }, [&] { b = LazyFibHelper(n - 1, buffer); }); buffer[n].second = a + b; }); return buffer[n].second; } std::uint64_t Fib(int n) { FibBuffer buffer(n+1); return LazyFibHelper(n, buffer); } .. |image0| image:: Images/image008a.jpg :width: 744px :height: 367px .. |image1| image:: Images/image009a.jpg :width: 744px :height: 367px ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/design_patterns/Local_Serializer.rst ================================================ .. _Local_Serializer: Local Serializer ================ .. container:: section .. rubric:: Context :class: sectiontitle Consider an interactive program. To maximize concurrency and responsiveness, operations requested by the user can be implemented as tasks. The order of operations can be important. For example, suppose the program presents editable text to the user. There might be operations to select text and delete selected text. Reversing the order of "select" and "delete" operations on the same buffer would be bad. However, commuting operations on different buffers might be okay. Hence the goal is to establish serial ordering of tasks associated with a given object, but not constrain ordering of tasks between different objects. .. container:: section .. rubric:: Forces :class: sectiontitle - Operations associated with a certain object must be performed in serial order. - Serializing with a lock would be wasteful because threads would be waiting at the lock when they could be doing useful work elsewhere. .. 
container:: section .. rubric:: Solution :class: sectiontitle Sequence the work items using a FIFO (first-in first-out structure). Always keep an item in flight if possible. If no item is in flight when a work item appears, put the item in flight. Otherwise, push the item onto the FIFO. When the current item in flight completes, pop another item from the FIFO and put it in flight. The logic can be implemented without mutexes, by using ``concurrent_queue`` for the FIFO and ``atomic`` to count the number of items waiting and in flight. The example explains the accounting in detail. .. container:: section .. rubric:: Example :class: sectiontitle The following example builds on the Non-Preemptive Priorities example to implement local serialization in addition to priorities. It implements three priority levels and local serializers. The user interface for it follows: :: enum Priority { P_High, P_Medium, P_Low };   template void EnqueueWork( Priority p, Func f, Serializer* s=NULL ); Template function ``EnqueueWork`` causes functor ``f`` to run when the three constraints in the following table are met. .. container:: tablenoborder .. list-table:: :header-rows: 1 * - Constraint - Resolved by class... * - Any prior work for the ``Serializer`` has completed. - \ ``Serializer`` * - A thread is available. - \ ``RunWorkItem`` * - No higher priority work is ready to run. - \ ``ReadyPileType`` Constraints on a given functor are resolved from top to bottom in the table. The first constraint does not exist when s is NULL. The implementation of ``EnqueueWork`` packages the functor in a ``SerializedWorkItem`` and routes it to the class that enforces the first relevant constraint between pieces of work. 
:: template void EnqueueWork( Priority p, Func f, Serializer* s=NULL ) { WorkItem* item = new SerializedWorkItem( p, f, s ); if( s ) s->add(item); else ReadyPile.add(item); } A ``SerializedWorkItem`` is derived from a ``WorkItem``, which serves as a way to pass around a prioritized piece of work without knowing further details of the work. :: // Abstract base class for a prioritized piece of work. class WorkItem { public: WorkItem( Priority p ) : priority(p) {} // Derived class defines the actual work. virtual void run() = 0; const Priority priority; };   template class SerializedWorkItem: public WorkItem { Serializer* serializer; Func f; /*override*/ void run() { f(); Serializer* s = serializer; // Destroy f before running Serializer’s next functor. delete this; if( s ) s->noteCompletion(); } public: SerializedWorkItem( Priority p, const Func& f_, Serializer* s ) : WorkItem(p), serializer(s), f(f_) {} }; Base class ``WorkItem`` is the same as class WorkItem in the example for Non-Preemptive Priorities. The notion of serial constraints is completely hidden from the base class, thus permitting the framework to extend other kinds of constraints or lack of constraints. Class ``SerializedWorkItem`` is essentially ``ConcreteWorkItem`` from the example for Non-Preemptive Priorities, extended with a ``Serializer`` aspect. Virtual method ``run()`` is invoked when it becomes time to run the functor. It performs three steps: #. Run the functor. #. Destroy the functor. #. Notify the ``Serializer`` that the functor completed, and thus unconstraining the next waiting functor. Step 3 is the difference from the operation of ConcreteWorkItem::run. Step 2 could be done after step 3 in some contexts to increase concurrency slightly. However, the presented order is recommended because if step 2 takes non-trivial time, it likely has side effects that should complete before the next functor runs. 
Class ``Serializer`` implements the core of the Local Serializer pattern: :: class Serializer { oneapi::tbb::concurrent_queue queue; std::atomic count; // Count of queued items and in-flight item void moveOneItemToReadyPile() { // Transfer item from queue to ReadyPile WorkItem* item; queue.try_pop(item); ReadyPile.add(item); } public: void add( WorkItem* item ) { queue.push(item); if( ++count==1 ) moveOneItemToReadyPile(); } void noteCompletion() { // Called when WorkItem completes. if( --count!=0 ) moveOneItemToReadyPile(); } }; The class maintains two members: - A queue of WorkItem waiting for prior work to complete. - A count of queued or in-flight work. Mutexes are avoided by using ``concurrent_queue`` and ``atomic`` along with careful ordering of operations. The transitions of count are the key understanding how class ``Serializer`` works. - If method ``add`` increments ``count`` from 0 to 1, this indicates that no other work is in flight and thus the work should be moved to the ``ReadyPile``. - If method ``noteCompletion`` decrements count and it is *not* from 1 to 0, then the queue is non-empty and another item in the queue should be moved to ``ReadyPile``. Class ``ReadyPile`` is explained in the example for Non-Preemptive Priorities. If priorities are not necessary, there are two variations on method ``moveOneItemToReadyPile``, with different implications. - Method ``moveOneItemToReadyPile`` could directly invoke\ ``item->run()``. This approach has relatively low overhead and high thread locality for a given ``Serializer``. But it is unfair. If the ``Serializer`` has a continual stream of tasks, the thread operating on it will keep servicing those tasks to the exclusion of others. - Method ``moveOneItemToReadyPile`` could invoke ``task::enqueue`` to enqueue a task that invokes ``item->run()``. Doing so introduces higher overhead and less locality than the first approach, but avoids starvation. 
The conflict between fairness and maximum locality is fundamental. The best resolution depends upon circumstance. The pattern generalizes to constraints on work items more general than those maintained by class Serializer. A generalized ``Serializer::add`` determines if a work item is unconstrained, and if so, runs it immediately. A generalized ``Serializer::noteCompletion`` runs all previously constrained items that have become unconstrained by the completion of the current work item. The term "run" means to run work immediately, or if there are more constraints, forwarding the work to the next constraint resolver. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/design_patterns/Non-Preemptive_Priorities.rst ================================================ .. _Non-Preemptive_Priorities: Non-Preemptive Priorities ========================= .. container:: section .. rubric:: Problem :class: sectiontitle Choose the next work item to do, based on priorities. .. container:: section .. rubric:: Context :class: sectiontitle The scheduler in |full_name| chooses tasks using rules based on scalability concerns. The rules are based on the order in which tasks were spawned or enqueued, and are oblivious to the contents of tasks. However, sometimes it is best to choose work based on some kind of priority relationship. .. container:: section .. rubric:: Forces :class: sectiontitle - Given multiple work items, there is a rule for which item should be done next that is *not* the default oneTBB rule. - Preemptive priorities are not necessary. If a higher priority item appears, it is not necessary to immediately stop lower priority items in flight. If preemptive priorities are necessary, then non-preemptive tasking is inappropriate. Use threads instead. .. container:: section .. rubric:: Solution :class: sectiontitle Put the work in a shared work pile. 
Decouple tasks from specific work, so that task execution chooses the actual piece of work to be selected from the pile. .. container:: section .. rubric:: Example :class: sectiontitle The following example implements three priority levels. The user interface for it and top-level implementation follow: :: enum Priority { P_High, P_Medium, P_Low };   template void EnqueueWork( Priority p, Func f ) { WorkItem* item = new ConcreteWorkItem( p, f ); ReadyPile.add(item); } The caller provides a priority ``p`` and a functor ``f`` to routine ``EnqueueWork``. The functor may be the result of a lambda expression. ``EnqueueWork`` packages ``f`` as a ``WorkItem`` and adds it to global object ``ReadyPile``. Class ``WorkItem`` provides a uniform interface for running functors of unknown type: :: // Abstract base class for a prioritized piece of work. class WorkItem { public: WorkItem( Priority p ) : priority(p) {} // Derived class defines the actual work. virtual void run() = 0; const Priority priority; };   template class ConcreteWorkItem: public WorkItem { Func f; /*override*/ void run() { f(); delete this; } public: ConcreteWorkItem( Priority p, const Func& f_ ) : WorkItem(p), f(f_) {} }; Class ``ReadyPile`` contains the core pattern. It maintains a collection of work and fires off tasks through the ``oneapi::tbb::task_group::run`` interface and then choose a work from the collection: :: class ReadyPileType { // One queue for each priority level oneapi::tbb::concurrent_queue level[P_Low+1]; oneapi::tbb::task_group tg; public: void add( WorkItem* item ) { level[item->priority].push(item); tg.run(RunWorkItem()); } void runNextWorkItem() { // Scan queues in priority order for an item. WorkItem* item=NULL; for( int i=P_High; i<=P_Low; ++i ) if( level[i].try_pop(item) ) break; assert(item); item->run(); } };   ReadyPileType ReadyPile; The task added by ``add(item)`` does *not* necessarily execute that item. 
The task itself executes ``runNextWorkItem()``, which may find a higher priority item. There is one task for each item, but the mapping resolves when the task actually executes, not when it is created. Here are the details of class ``RunWorkItem``: :: class RunWorkItem { void operator()() { ReadyPile.runNextWorkItem(); }; }; ``RunWorkItem`` objects are fungible. They enable the oneTBB scheduler to choose when to do a work item, not which work item to do. Other priority schemes can be implemented by changing the internals for ``ReadyPileType``. A priority queue could be used to implement very fine grained priorities. The scalability of the pattern is limited by the scalability of ``ReadyPileType``. Ideally scalable concurrent containers should be used for it. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/design_patterns/Odd-Even_Communication.rst ================================================ .. _Odd-Even_Communication: Odd-Even Communication ====================== .. container:: section .. rubric:: Problem :class: sectiontitle Operations on data cannot be done entirely independently, but data can be partitioned into two subsets such that all operations on a subset can run in parallel. .. container:: section .. rubric:: Context :class: sectiontitle Solvers for partial differential equations can often be modified to follow this pattern. For example, for a 2D grid with only nearest-neighbor communication, it may be possible to treat the grid as a checkerboard, and alternate between updating red squares and black squares. Another context is staggered grid ("leap frog") Finite Difference Time Domain (FDTD solvers, which naturally fit the pattern. .. container:: section .. rubric:: Forces :class: sectiontitle - Dependencies between items form a bipartite graph. .. container:: section .. rubric:: Solution :class: sectiontitle Alternate between updating one subset and then the other subset. 
Apply the elementwise pattern to each subset. .. container:: section .. rubric:: References :class: sectiontitle Eun-Gyu Kim and Mark Snir, "Odd-Even Communication Group", http://snir.cs.illinois.edu/patterns/oddeven.pdf ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/design_patterns/Reduction.rst ================================================ .. _Reduction: Reduction ========= .. container:: section .. rubric:: Problem :class: sectiontitle Perform an associative reduction operation across a data set. .. container:: section .. rubric:: Context :class: sectiontitle Many serial algorithms sweep over a set of items to collect summary information. .. container:: section .. rubric:: Forces :class: sectiontitle The summary can be expressed as an associative operation over the data set, or at least is close enough to associative that reassociation does not matter. .. container:: section .. rubric:: Solution :class: sectiontitle Two solutions exist in |full_name|. The choice on which to use depends upon several considerations: - Is the operation commutative as well as associative? - Are instances of the reduction type expensive to construct and destroy. For example, a floating point number is inexpensive to construct. A sparse floating-point matrix might be very expensive to construct. Use ``oneapi::tbb::parallel_reduce`` when the objects are inexpensive to construct. It works even if the reduction operation is not commutative. Use ``oneapi::tbb::parallel_for`` and ``oneapi::tbb::combinable`` if the reduction operation is commutative and instances of the type are expensive. If the operation is not precisely associative but a precisely deterministic result is required, use recursive reduction and parallelize it using ``oneapi::tbb::parallel_invoke``. .. container:: section .. rubric:: Examples :class: sectiontitle The examples presented here illustrate the various solutions and some tradeoffs. 
The first example uses ``oneapi::tbb::parallel_reduce`` to do a + reduction over a sequence of values of type ``T``.
Floating-point addition and multiplication are almost associative. Reassociation can cause changes because of rounding effects. The techniques shown so far reassociate terms non-deterministically. Fully deterministic parallel reduction for a not quite associative operation requires using deterministic reassociation. The code below demonstrates this in the form of a template that does a + reduction over a sequence of values of type ``T``. :: template T RepeatableReduce( const T* first, const T* last, T identity ) { if( last-first<=1000 ) { // Use serial reduction return std::accumulate( first, last, identity ); } else { // Do parallel divide-and-conquer reduction const T* mid = first+(last-first)/2; T left, right; oneapi::tbb::parallel_invoke( [&]{left=RepeatableReduce(first,mid,identity);}, [&]{right=RepeatableReduce(mid,last,identity);} ); return left+right; } } The outer if-else is an instance of the agglomeration pattern for recursive computations. The reduction graph, though not a strict binary tree, is fully deterministic. Thus the result will always be the same for a given input sequence, assuming all threads do identical floating-point rounding. ``oneapi::tbb::parallel_deterministic_reduce`` is a simpler and more efficient way to get reproducible non-associative reduction. It is very similar to ``oneapi::tbb::parallel_reduce`` but, unlike the latter, builds a deterministic reduction graph. 
With it, the ``RepeatableReduce`` sample can be almost identical to ``AssociativeReduce``: :: template T RepeatableReduce( const T* first, const T* last, T identity ) { return oneapi::tbb::parallel_deterministic_reduce( // Index range for reduction oneapi::tbb::blocked_range(first,last,1000), // Identity element identity, // Reduce a subrange and partial sum [&]( oneapi::tbb::blocked_range r, T partial_sum )->float { return std::accumulate( r.begin(), r.end(), partial_sum ); }, // Reduce two partial sums std::plus() ); } Besides the function name change, note the grain size of 1000 specified for ``oneapi::tbb::blocked_range``. It defines the desired block size for agglomeration; automatic block size selection is not used due to non-determinism. The final example shows how a problem that typically is not viewed as a reduction can be parallelized by viewing it as a reduction. The problem is retrieving floating-point exception flags for a computation across a data set. The serial code might look something like: :: feclearexcept(FE_ALL_EXCEPT); for( int i=0; i r ) { int end=r.end(); for( int i=r.begin(); i!=end; ++i ) C[i] = A[i]/B[i]; // It is critical to do |= here, not =, because otherwise we // might lose earlier exceptions from the same thread. flags |= fetestexcept(FE_ALL_EXCEPT); } // Called by parallel_reduce when joining results from two subranges. void join( Body& other ) { flags |= other.flags; } }; Then invoke it as follows: :: // Construction of cc implicitly resets FP exception state. ComputeChunk cc; oneapi::tbb::parallel_reduce( oneapi::tbb::blocked_range(0,N), cc ); if (cc.flags & FE_DIVBYZERO) ...; if (cc.flags & FE_OVERFLOW) ...; ... ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/design_patterns/Reference_Counting.rst ================================================ .. _Reference_Counting: Reference Counting ================== .. container:: section .. 
rubric:: Problem :class: sectiontitle Destroy an object when it will no longer be used. .. container:: section .. rubric:: Context :class: sectiontitle Often it is desirable to destroy an object when it is known that it will not be used in the future. Reference counting is a common serial solution that extends to parallel programming if done carefully. .. container:: section .. rubric:: Forces :class: sectiontitle - If there are cycles of references, basic reference counting is insufficient unless the cycle is explicitly broken. - Atomic counting is relatively expensive in hardware. .. container:: section .. rubric:: Solution :class: sectiontitle Thread-safe reference counting is like serial reference counting, except that the increment/decrement is done atomically, and the decrement and test "count is zero?" must act as a single atomic operation. The following example uses ``std::atomic`` to achieve this. :: template class counted { std::atomic my_count; T value; public: // Construct object with a single reference to it. counted() {my_count=1;} // Add reference void add_ref() {++my_count;} // Remove reference. Return true if it was the last reference. bool remove_ref() {return --my_count==0;} // Get reference to underlying object T& get() { assert(my_count>0); return my_value; } }; It is incorrect to use a separate read for testing if the count is zero. The following code would be an incorrect implementation of method ``remove_ref``\ () because two threads might both execute the decrement, and then both read ``my_count`` as zero. Hence two callers would both be told incorrectly that they had removed the last reference. :: --my_count; return my_count==0. // WRONG! The decrement may need to have a *release* fence so that any pending writes complete before the object is deleted. 
There is no simple way to atomically copy a pointer and increment its reference count, because there will be a timing hole between the copying and the increment where the reference count is too low, and thus another thread might decrement the count to zero and delete the object. Two ways to address the problem are "hazard pointers" and "pass the buck". See the references below for details. .. container:: section .. rubric:: Variations :class: sectiontitle Atomic increment/decrement can be more than an order of magnitude more expensive than ordinary increment/decrement. The serial optimization of eliminating redundant increment/decrement operations becomes more important with atomic reference counts. Weighted reference counting can be used to reduce costs if the pointers are unshared but the referent is shared. Associate a *weight* with each pointer. The reference count is the sum of the weights. A pointer ``x`` can be copied as a pointer ``x'`` without updating the reference count by splitting the original weight between ``x`` and ``x'``. If the weight of ``x`` is too low to split, then first add a constant W to the reference count and weight of ``x``. .. container:: section .. rubric:: References :class: sectiontitle D. Bacon and V.T. Rajan, "Concurrent Cycle Collection in Reference Counted Systems" in Proc. European Conf. on Object-Oriented Programming (June 2001). Describes a garbage collector based on reference counting that does collect cycles. M. Michael, "Hazard Pointers: Safe Memory Reclamation for Lock-Free Objects" in IEEE Transactions on Parallel and Distributed Systems (June 2004). Describes the "hazard pointer" technique. M. Herlihy, V. Luchangco, and M. Moir, "The Repeat Offender Problem: A Mechanism for Supporting Dynamic-Sized, Lock-Free Data Structures" in Proceedings of the 16th International Symposium on Distributed Computing (Oct. 2002). Describes the "pass the buck" technique. 
================================================ FILE: third-party/tbb/doc/main/tbb_userguide/design_patterns/Wavefront.rst ================================================ .. _Wavefront: Wavefront ========= .. container:: section .. rubric:: Problem :class: sectiontitle Perform computations on items in a data set, where the computation on an item uses results from computations on predecessor items. .. container:: section .. rubric:: Context :class: sectiontitle The dependences between computations form an acyclic graph. .. container:: section .. rubric:: Forces :class: sectiontitle - Dependence constraints between items form an acyclic graph. - The number of immediate predecessors in the graph is known in advance, or can be determined some time before the last predecessor completes. .. container:: section .. rubric:: Solution :class: sectiontitle The solution is a parallel variant of topological sorting, using ``oneapi::tbb::parallel_for_each`` to process items. Associate an atomic counter with each item. Initialize each counter to the number of predecessors. Invoke ``oneapi::tbb::parallel_for_each`` to process the items that have no predecessors (have counts of zero). After an item is processed, decrement the counters of its successors. If a successor's counter reaches zero, add that successor to the ``oneapi::tbb::parallel_for_each`` via a "feeder". If the number of predecessors for an item cannot be determined in advance, treat the information "known number of predecessors" as an additional predecessor. When the number of predecessors becomes known, treat this conceptual predecessor as completed. If the overhead of counting individual items is excessive, aggregate items into blocks, and do the wavefront over the blocks. .. container:: section .. rubric:: Example :class: sectiontitle Below is a serial kernel for the longest common subsequence algorithm. The parameters are strings ``x`` and ``y`` with respective lengths ``xlen`` and ``ylen``.
:: int F[MAX_LEN+1][MAX_LEN+1]; void SerialLCS( const char* x, size_t xlen, const char* y, size_t ylen ) { for( size_t i=1; i<=xlen; ++i ) for( size_t j=1; j<=ylen; ++j ) F[i][j] = x[i-1]==y[j-1] ? F[i-1][j-1]+1: max(F[i][j-1],F[i-1][j]); } The kernel sets ``F[i][j]`` to the length of the longest common subsequence shared by ``x[0..i-1]`` and ``y[0..j-1]``. It assumes that ``F[0][0..ylen]`` and ``F[0..xlen][0]`` have already been initialized to zero. The following figure shows the data dependences for calculating ``F[i][j]``. .. container:: fignone :name: fig3 Data dependences for longest common subsequence calculation. |image0| The following figure shows that the gray diagonal dependence is the transitive closure of other dependencies. Thus for parallelization purposes it is a redundant dependence that can be ignored. .. container:: fignone :name: fig4 Diagonal dependence is redundant. |image1| It is generally good to remove redundant dependences from consideration, because the atomic counting incurs a cost for each dependence considered. Another consideration is grain size. Scheduling each ``F[i][j]`` element calculation separately is prohibitively expensive. A good solution is to aggregate the elements into contiguous blocks, and process the contents of a block serially. The blocks have the same dependence pattern, but at a block scale. Hence scheduling overheads can be amortized over blocks. The parallel code follows. Each block consists of ``N×N`` elements. Each block has an associated atomic counter. Array ``Count`` organizes these counters for easy lookup. The code initializes the counters and then rolls a wavefront using ``parallel_for_each``, starting with the block at the origin since it has no predecessors. :: const int N = 64; std::atomic<char> Count[MAX_LEN/N+1][MAX_LEN/N+1]; void ParallelLCS( const char* x, size_t xlen, const char* y, size_t ylen ) { // Initialize predecessor counts for blocks.
size_t m = (xlen+N-1)/N; size_t n = (ylen+N-1)/N; for( int i=0; i<m; ++i ) for( int j=0; j<n; ++j ) Count[i][j] = (i>0)+(j>0); // Roll the wavefront from the origin. typedef pair<size_t,size_t> block; block origin(0,0); oneapi::tbb::parallel_for_each( &origin, &origin+1, [=]( const block& b, oneapi::tbb::feeder<block>&feeder ) { // Extract bounds on block size_t bi = b.first; size_t bj = b.second; size_t xl = N*bi+1; size_t xu = min(xl+N,xlen+1); size_t yl = N*bj+1; size_t yu = min(yl+N,ylen+1); // Process the block for( size_t i=xl; i f( g, 1, []( int i ) -> int { return spin_for(i); } ); f.try_put(1); g.wait_for_all(); } }; void no_wait_for_all_enqueue() { task_arena a; a.enqueue(background_task()); // do other things without waiting… } In the code snippet above, the enqueued task executes at some point, but it's not clear when. If you need to use the results of the enqueued task, or even ensure that it completes before the program ends, you will need to use some mechanism to signal from the enqueued task that the graph is complete. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/estimate_flow_graph_performance.rst ================================================ .. _estimate_flow_graph_performance: Estimating Flow Graph Performance ================================= The performance or scalability of a flow graph is not easy to predict. However there are a few key points that can guide you in estimating the limits on performance and speedup of some graphs. .. container:: section .. rubric:: The Critical Path Limits the Scalability in a Dependence Graph :class: sectiontitle A critical path is the most time consuming path from a node with no predecessors to a node with no successors. In a dependence graph, the execution of the nodes along a path cannot be overlapped since they have a strict ordering. Therefore, for a dependence graph, the critical path limits scalability. More formally, let T be the total time consumed by all of the nodes in your graph if executed sequentially.
Then let C be the time consumed along the path that takes the most time. The nodes along this path cannot be overlapped even in a parallel execution. Therefore, even if all other paths are executed in parallel with C, the wall clock time for the parallel execution is at least C, and the maximum possible speedup (ignoring microarchitectural and memory effects) is T/C. .. container:: section .. rubric:: There is Overhead in Spawning a Node's Body as a Task :class: sectiontitle The bodies of ``input_nodes``, ``function_nodes``, ``continue_nodes`` and ``multifunction_nodes`` execute within spawned tasks by default. This means that you need to take into account the overhead of task scheduling when estimating the time it takes for a node to execute its body. All of the rules of thumb for determining the appropriate granularity of tasks therefore also apply to node bodies as well. If you have many fine-grained nodes in your flow graph, the impact of these overheads can noticeably impact your performance. However, depending on the graph structure, you can reduce such overheads by using lightweight policy with these nodes. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/examples/blocked_nd_range_example.cpp ================================================ /* Copyright (c) 2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ /*begin_blocked_nd_range_example*/ #include "oneapi/tbb/blocked_nd_range.h" #include "oneapi/tbb/parallel_for.h" template float kernel3d(const Features& feature_maps, int i, int j, int k, int kernel_length, int kernel_width, int kernel_height) { float result = 0.f; for (int feature_i = i; feature_i < i + kernel_length; ++feature_i) for (int feature_j = j; feature_j < j + kernel_width; ++feature_j) for (int feature_k = k; feature_k < k + kernel_width; ++feature_k) result += feature_maps[feature_i][feature_j][feature_k]; return result; } template void convolution3d(const Features& feature_maps, Output& out, int out_length, int out_width, int out_heigth, int kernel_length, int kernel_width, int kernel_height) { using range_t = oneapi::tbb::blocked_nd_range; oneapi::tbb::parallel_for( range_t({0, out_length}, {0, out_width}, {0, out_heigth}), [&](const range_t& out_range) { auto out_x = out_range.dim(0); auto out_y = out_range.dim(1); auto out_z = out_range.dim(2); for (int i = out_x.begin(); i < out_x.end(); ++i) for (int j = out_y.begin(); j < out_y.end(); ++j) for (int k = out_z.begin(); k < out_z.end(); ++k) out[i][j][k] = kernel3d(feature_maps, i, j, k, kernel_length, kernel_width, kernel_height); } ); } /*end_blocked_nd_range_example*/ #include #include int main() { const int kernel_length = 9; const int kernel_width = 5; const int kernel_height = 5; const int feature_maps_length = 128; const int feature_maps_width = 16; const int feature_maps_heigth = 16; const int out_length = feature_maps_length - kernel_length + 1; const int out_width = feature_maps_width - kernel_width + 1; const int out_heigth = feature_maps_heigth - kernel_height + 1; // Initializes feature maps with 1 in each cell and out with zeros. 
std::vector>> feature_maps(feature_maps_length, std::vector>(feature_maps_width, std::vector(feature_maps_heigth, 1.0f))); std::vector>> out(out_length, std::vector>(out_width, std::vector(out_heigth, 0.f))); // 3D convolution calculates the sum of all elements in the kernel convolution3d(feature_maps, out, out_length, out_width, out_heigth, kernel_length, kernel_width, kernel_height); // Checks correctness of convolution by equality to the expected sum of elements float expected = float(kernel_length * kernel_height * kernel_width); for (auto i : out) { for (auto j : i) { for (auto k : j) { assert(k == expected && "convolution failed to calculate correctly"); } } } return 0; } ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/examples/flow_graph_examples.cpp ================================================ /* Copyright (c) 2022-2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* Flow Graph Code Example for the Userguide. */ #include #include using namespace tbb::flow; //! Example shows how to set the most performant core type as the preferred one //! for a graph execution. 
static void flow_graph_attach_to_arena_1() { /*begin_attach_to_arena_1*/ std::vector core_types = tbb::info::core_types(); tbb::task_arena arena( tbb::task_arena::constraints{}.set_core_type(core_types.back()) ); arena.execute( [&]() { graph g; function_node< int > f( g, unlimited, []( int ) { /*the most performant core type is defined as preferred.*/ } ); f.try_put(1); g.wait_for_all(); } ); /*end_attach_to_arena_1*/ } //! Reattach existing graph to an arena with the most performant core type as //! the preferred one for a work execution. static void flow_graph_attach_to_arena_2() { /*begin_attach_to_arena_2*/ graph g; function_node< int > f( g, unlimited, []( int ) { /*the most performant core type is defined as preferred.*/ } ); std::vector core_types = tbb::info::core_types(); tbb::task_arena arena( tbb::task_arena::constraints{}.set_core_type(core_types.back()) ); arena.execute( [&]() { g.reset(); } ); f.try_put(1); g.wait_for_all(); /*end_attach_to_arena_2*/ } int main() { flow_graph_attach_to_arena_1(); flow_graph_attach_to_arena_2(); return 0; } ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/examples/parallel_for_lambda_example_1.cpp ================================================ /* Copyright (c) 2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ void Foo(float) {} /*begin_parallel_for_lambda_1*/ #include "oneapi/tbb.h" using namespace oneapi::tbb; void ParallelApplyFoo( float* a, size_t n ) { parallel_for( blocked_range(0,n), [=](const blocked_range& r) { for(size_t i=r.begin(); i!=r.end(); ++i) Foo(a[i]); } ); } /*end_parallel_for_lambda_1*/ int main() { constexpr std::size_t size = 10; float array[size] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; ParallelApplyFoo(array, size); } ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/examples/parallel_for_lambda_example_2.cpp ================================================ /* Copyright (c) 2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ void Foo(float) {} /*begin_parallel_for_lambda_2*/ #include "oneapi/tbb.h" using namespace oneapi::tbb; #pragma warning(disable: 588) void ParallelApplyFoo(float a[], size_t n) { parallel_for(size_t(0), n, [=](size_t i) {Foo(a[i]);}); } /*end_parallel_for_lambda_2*/ int main() { constexpr std::size_t size = 10; float array[size] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; ParallelApplyFoo(array, size); } ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/examples/parallel_for_os_example.cpp ================================================ /* Copyright (c) 2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ void Foo(float) {} /*begin_parallel_for_os_1*/ #include "oneapi/tbb.h" using namespace oneapi::tbb; class ApplyFoo { float *const my_a; public: void operator()( const blocked_range<size_t>& r ) const { float *a = my_a; for( size_t i=r.begin(); i!=r.end(); ++i ) Foo(a[i]); } ApplyFoo( float a[] ) : my_a(a) {} }; /*end_parallel_for_os_1*/ /*begin_parallel_for_os_2*/ #include "oneapi/tbb.h" void ParallelApplyFoo( float a[], size_t n ) { parallel_for(blocked_range<size_t>(0,n), ApplyFoo(a)); } /*end_parallel_for_os_2*/ int main() { constexpr std::size_t size = 10; float array[size] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; ParallelApplyFoo(array, size); } ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/parallel_for_os.rst ================================================ .. _parallel_for: parallel_for ============ Suppose you want to apply a function ``Foo`` to each element of an array, and it is safe to process each element concurrently. Here is the sequential code to do this: :: void SerialApplyFoo( float a[], size_t n ) { for( size_t i=0; i!=n; ++i ) Foo(a[i]); } The iteration space here is of type ``size_t``, and goes from ``0`` to ``n-1``. The template function ``oneapi::tbb::parallel_for`` breaks this iteration space into chunks, and runs each chunk on a separate thread. The first step in parallelizing this loop is to convert the loop body into a form that operates on a chunk. The form is an STL-style function object, called the *body* object, in which ``operator()`` processes a chunk. The following code declares the body object. ..
literalinclude:: ./examples/parallel_for_os_example.cpp :language: c++ :start-after: /*begin_parallel_for_os_1*/ :end-before: /*end_parallel_for_os_1*/ The ``using`` directive in the example enables you to use the library identifiers without having to write out the namespace prefix ``oneapi::tbb`` before each identifier. The rest of the examples assume that such a ``using`` directive is present. Note the argument to ``operator()``. A ``blocked_range`` is a template class provided by the library. It describes a one-dimensional iteration space over type ``T``. Class ``parallel_for`` works with other kinds of iteration spaces too. The library provides ``blocked_range2d``, ``blocked_range3d``, and ``blocked_nd_range`` for multidimensional spaces. You can define your own spaces as explained in :ref:`Advanced_Topic_Other_Kinds_of_Iteration_Spaces`. An instance of ``ApplyFoo`` needs member fields that remember all the local variables that were defined outside the original loop but used inside it. Usually, the constructor for the body object will initialize these fields, though ``parallel_for`` does not care how the body object is created. Template function ``parallel_for`` requires that the body object have a copy constructor, which is invoked to create a separate copy (or copies) for each worker thread. It also invokes the destructor to destroy these copies. In most cases, the implicitly generated copy constructor and destructor work correctly. If they do not, it is almost always the case (as usual in C++) that you must define *both* to be consistent. Because the body object might be copied, its ``operator()`` should not modify the body. Otherwise the modification might or might not become visible to the thread that invoked ``parallel_for``, depending upon whether ``operator()`` is acting on the original or a copy. As a reminder of this nuance, ``parallel_for`` requires that the body object's ``operator()`` be declared ``const``. 
The example ``operator()`` loads ``my_a`` into a local variable ``a``. Though not necessary, there are two reasons for doing this in the example: - **Style**. It makes the loop body look more like the original. - **Performance**. Sometimes putting frequently accessed values into local variables helps the compiler optimize the loop better, because local variables are often easier for the compiler to track. Once you have the loop body written as a body object, invoke the template function ``parallel_for``, as follows: .. literalinclude:: ./examples/parallel_for_os_example.cpp :language: c++ :start-after: /*begin_parallel_for_os_2*/ :end-before: /*end_parallel_for_os_2*/ The ``blocked_range`` constructed here represents the entire iteration space from 0 to n-1, which ``parallel_for`` divides into subspaces for each processor. The general form of the constructor is ``blocked_range(begin,end,grainsize)``. The ``T`` specifies the value type. The arguments ``begin`` and ``end`` specify the iteration space STL-style as a half-open interval [``begin``,\ ``end``). The argument *grainsize* is explained in the :ref:`Controlling_Chunking` section. The example uses the default grainsize of 1 because by default ``parallel_for`` applies a heuristic that works well with the default grainsize. .. include:: parallel_for_toctree.rst ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/parallel_for_toctree.rst ================================================ .. _parallel_for_toctree: .. toctree:: :maxdepth: 4 ../tbb_userguide/Lambda_Expressions ../tbb_userguide/Automatic_Chunking ../tbb_userguide/Controlling_Chunking_os ../tbb_userguide/Bandwidth_and_Cache_Affinity_os ../tbb_userguide/Partitioner_Summary ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/parallel_reduce.rst ================================================ .. 
_parallel_reduce: parallel_reduce =============== A loop can do a reduction, as in this summation: :: float SerialSumFoo( float a[], size_t n ) { float sum = 0; for( size_t i=0; i!=n; ++i ) sum += Foo(a[i]); return sum; } If the iterations are independent, you can parallelize this loop using the template class ``parallel_reduce`` as follows: :: float ParallelSumFoo( const float a[], size_t n ) { SumFoo sf(a); parallel_reduce( blocked_range<size_t>(0,n), sf ); return sf.my_sum; } The class ``SumFoo`` specifies details of the reduction, such as how to accumulate subsums and combine them. Here is the definition of class ``SumFoo``: :: class SumFoo { const float* my_a; public: float my_sum; void operator()( const blocked_range<size_t>& r ) { const float *a = my_a; float sum = my_sum; size_t end = r.end(); for( size_t i=r.begin(); i!=end; ++i ) sum += Foo(a[i]); my_sum = sum; } SumFoo( SumFoo& x, split ) : my_a(x.my_a), my_sum(0) {} void join( const SumFoo& y ) {my_sum+=y.my_sum;} SumFoo(const float a[] ) : my_a(a), my_sum(0) {} }; Note the differences with class ``ApplyFoo`` from parallel_for. First, ``operator()`` is *not* ``const``. This is because it must update ``SumFoo::my_sum``. Second, ``SumFoo`` has a *splitting constructor* and a method ``join`` that must be present for ``parallel_reduce`` to work. The splitting constructor takes as arguments a reference to the original object, and a dummy argument of type ``split``, which is defined by the library. The dummy argument distinguishes the splitting constructor from a copy constructor. .. tip:: In the example, the definition of ``operator()`` uses local temporary variables (``a``, ``sum``, ``end``) for scalar values accessed inside the loop. This technique can improve performance by making it obvious to the compiler that the values can be held in registers instead of memory. If the values are too large to fit in registers, or have their address taken in a way the compiler cannot track, the technique might not help.
With a typical optimizing compiler, using local temporaries for only written variables (such as ``sum`` in the example) can suffice, because then the compiler can deduce that the loop does not write to any of the other locations, and hoist the other reads to outside the loop. When a worker thread is available, as decided by the task scheduler, ``parallel_reduce`` invokes the splitting constructor to create a subtask for the worker. When the subtask completes, ``parallel_reduce`` uses method ``join`` to accumulate the result of the subtask. The graph at the top of the following figure shows the split-join sequence that happens when a worker is available: .. container:: fignone :name: fig5 Graph of the Split-join Sequence |image0| The arrows in the above figure indicate order in time. The splitting constructor might run concurrently while object ``x`` is being used for the first half of the reduction. Therefore, all actions of the splitting constructor that creates y must be made thread safe with respect to ``x``. So if the splitting constructor needs to increment a reference count shared with other objects, it should use an atomic increment. If a worker is not available, the second half of the iteration is reduced using the same body object that reduced the first half. That is, the reduction of the second half starts where reduction of the first half finished. .. CAUTION:: Since split/join are not used if workers are unavailable, ``parallel_reduce`` does not necessarily do recursive splitting. .. CAUTION:: Since the same body might be used to accumulate multiple subranges, it is critical that ``operator()`` not discard earlier accumulations. The code below shows an incorrect definition of ``SumFoo::operator()``. :: class SumFoo { ... public: float my_sum; void operator()( const blocked_range<size_t>& r ) { ... float sum = 0; // WRONG – should be 'sum = my_sum'. ... for( ... ) sum += Foo(a[i]); my_sum = sum; } ...
}; With the mistake, the body returns a partial sum for the last subrange instead of all subranges to which ``parallel_reduce`` applies it. The rules for partitioners and grain sizes for ``parallel_reduce`` are the same as for ``parallel_for``. ``parallel_reduce`` generalizes to any associative operation. In general, the splitting constructor does two things: - Copy read-only information necessary to run the loop body. - Initialize the reduction variable(s) to the identity element of the operation(s). The join method should do the corresponding merge(s). You can do more than one reduction at the same time: you can gather the min and max with a single ``parallel_reduce``. .. note:: The reduction operation can be non-commutative. The example still works if floating-point addition is replaced by string concatenation. .. |image0| image:: Images/image009.jpg :width: 512px :height: 438px ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/snippets/blocked_nd_range_example.cpp ================================================ #include "blocked_nd_range_example.h" #include #include int main() { const int kernel_length = 9; const int kernel_width = 5; const int kernel_height = 5; const int feature_maps_length = 128; const int feature_maps_width = 16; const int feature_maps_heigth = 16; const int out_length = feature_maps_length - kernel_length + 1; const int out_width = feature_maps_width - kernel_width + 1; const int out_heigth = feature_maps_heigth - kernel_height + 1; // Initializes feature maps with 1 in each cell and out with zeros. 
std::vector>> feature_maps(feature_maps_length, std::vector>(feature_maps_width, std::vector(feature_maps_heigth, 1.0f))); std::vector>> out(out_length, std::vector>(out_width, std::vector(out_heigth, 0.f))); // 3D convolution calculates the sum of all elements in the kernel convolution3d(feature_maps, out, out_length, out_width, out_heigth, kernel_length, kernel_width, kernel_height); // Checks correctness of convolution by equality to the expected sum of elements float expected = float(kernel_length * kernel_height * kernel_width); for (auto i : out) { for (auto j : i) { for (auto k : j) { assert(k == expected && "convolution failed to calculate correctly"); } } } return 0; } ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/snippets/blocked_nd_range_example.h ================================================ #include "oneapi/tbb/blocked_nd_range.h" #include "oneapi/tbb/parallel_for.h" template float kernel3d(const Features& feature_maps, int i, int j, int k, int kernel_length, int kernel_width, int kernel_height) { float result = 0.f; for (int feature_i = i; feature_i < i + kernel_length; ++feature_i) for (int feature_j = j; feature_j < j + kernel_width; ++feature_j) for (int feature_k = k; feature_k < k + kernel_width; ++feature_k) result += feature_maps[feature_i][feature_j][feature_k]; return result; } template void convolution3d(const Features& feature_maps, Output& out, int out_length, int out_width, int out_heigth, int kernel_length, int kernel_width, int kernel_height) { using range_t = oneapi::tbb::blocked_nd_range; oneapi::tbb::parallel_for( range_t({0, out_length}, {0, out_width}, {0, out_heigth}), [&](const range_t& out_range) { auto out_x = out_range.dim(0); auto out_y = out_range.dim(1); auto out_z = out_range.dim(2); for (int i = out_x.begin(); i < out_x.end(); ++i) for (int j = out_y.begin(); j < out_y.end(); ++j) for (int k = out_z.begin(); k < out_z.end(); ++k) out[i][j][k] = kernel3d(feature_maps, i, 
j, k, kernel_length, kernel_width, kernel_height); } ); } ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/snippets/flow_graph_examples.cpp ================================================ /* Copyright (c) 2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* Flow Graph Code Example for the Userguide. */ #include #include using namespace tbb::flow; //! Example shows how to set the most performant core type as the preferred one //! for a graph execution. static void flow_graph_attach_to_arena_1() { /*begin_attach_to_arena_1*/ std::vector core_types = tbb::info::core_types(); tbb::task_arena arena( tbb::task_arena::constraints{}.set_core_type(core_types.back()) ); arena.execute( [&]() { graph g; function_node< int > f( g, unlimited, []( int ) { /*the most performant core type is defined as preferred.*/ } ); f.try_put(1); g.wait_for_all(); } ); /*end_attach_to_arena_1*/ } //! Reattach existing graph to an arena with the most performant core type as //! the preferred one for a work execution. 
static void flow_graph_attach_to_arena_2() { /*begin_attach_to_arena_2*/ graph g; function_node< int > f( g, unlimited, []( int ) { /*the most performant core type is defined as preferred.*/ } ); std::vector core_types = tbb::info::core_types(); tbb::task_arena arena( tbb::task_arena::constraints{}.set_core_type(core_types.back()) ); arena.execute( [&]() { g.reset(); } ); f.try_put(1); g.wait_for_all(); /*end_attach_to_arena_2*/ } int main() { flow_graph_attach_to_arena_1(); flow_graph_attach_to_arena_2(); return 0; } ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/std_invoke.rst ================================================ .. _std_invoke: Invoke a Callable Object ========================== Starting from C++17, the requirements for callable objects passed to algorithms or Flow Graph nodes are relaxed. It allows using additional types of bodies. Previously, the body of the algorithm or Flow Graph node needed to be a Function Object (see `C++ Standard Function Object `_) and provide an ``operator()`` that accepts input parameters. Now the body needs to meet the more relaxed requirements of being Callable (see `C++ Standard Callable `_) that covers three types of objects: * **Function Objects that provide operator(arg1, arg2, ...)**, which accepts the input parameters * **Pointers to member functions** that you can use as the body of the algorithm or the Flow Graph node * **Pointers to member objects** work as the body of the algorithm or parallel construct You can use it not only for a Flow Graph but also for algorithms. See the example below: .. 
code:: // The class models oneTBB Range class StrideRange { public: StrideRange(int* s, std::size_t sz, std::size_t str) : start(s), size(sz), stride(str) {} // A copy constructor StrideRange(const StrideRange&) = default; // A splitting constructor StrideRange(StrideRange& other, oneapi::tbb::split) : start(other.start), size(other.size / 2), stride(other.stride) { other.size -= size; other.start += size; } ~StrideRange() = default; // Indicate if the range is empty bool empty() const { return size == 0; } // Indicate if the range can be divided bool is_divisible() const { return size >= stride; } void iterate() const { for (std::size_t i = 0; i < size; i += stride) { // Perform an action for each element of the range, // implement the code based on your requirements } } private: int* start; std::size_t size; std::size_t stride; }; Where: * The ``StrideRange`` class models a oneTBB range that is iterated with the stride specified at its construction. * The ``stride`` value is stored in a private field within the range. Therefore, the class provides the member function ``iterate() const`` that implements a loop with the specified stride. ``range.iterate()`` ******************* Before C++17, to utilize a range in a parallel algorithm, such as ``parallel_for``, it was required to provide a ``Function Object`` as the algorithm's body. This Function Object defined the operations to be executed on each iteration of the range: .. code:: int main() { std::size_t array_size = 1000; int* array_to_iterate = new int[array_size]; StrideRange range(array_to_iterate, array_size, /* stride = */ 2); // Define a lambda function as the body of the parallel_for loop auto pfor_body = [] (const StrideRange& range) { range.iterate(); }; // Perform parallel iteration oneapi::tbb::parallel_for(range, pfor_body); delete[] array_to_iterate; } An additional lambda function ``pfor_body`` was also required. This lambda function invoked the ``range.iterate()`` function.
Now with C++17, you can directly use a pointer to the ``StrideRange::iterate`` member function as the body of the algorithm: .. code:: int main() { std::size_t array_size = 1000; int* array_to_iterate = new int[array_size]; // Performs the iteration over the array elements with the specified stride StrideRange range(array_to_iterate, array_size, /* stride = */ 2); // Parallelize the iteration over the range object oneapi::tbb::parallel_for(range, &StrideRange::iterate); delete[] array_to_iterate; } ``std::invoke`` **************** ``std::invoke`` is a function template that provides a uniform syntax for invoking different types of callable objects with a set of arguments. The oneTBB implementation uses the C++ standard function ``std::invoke(&StrideRange::iterate, range)`` to execute the body. It is the equivalent of ``range.iterate()``. Therefore, it allows you to invoke a callable object, such as a function object, with the provided arguments. .. tip:: Refer to `C++ Standard `_ to learn more about ``std::invoke``. Example ^^^^^^^^ Consider a specific scenario with ``function_node`` within a Flow Graph. In the example below, a ``function_node`` takes an object as an input, reads a member object of that input, and passes it to the next node in the graph: ..
code:: struct Object { int number; }; int main() { using namespace oneapi::tbb::flow; // Lambda function to read the member object of the input Object auto number_reader = [] (const Object& obj) { return obj.number; }; // Lambda function to process the received integer auto number_processor = [] (int i) { /* processing integer */ }; graph g; // Function node that takes an Object as input and produces an integer function_node func1(g, unlimited, number_reader); // Function node that takes an integer as input and processes it function_node func2(g, unlimited, number_processor); // Connect the function nodes make_edge(func1, func2); // Provide produced input to the graph func1.try_put(Object{1}); // Wait for the graph to complete g.wait_for_all(); } Before C++17, the ``function_node`` in the Flow Graph required the body to be a Function Object. A lambda function was required to extract the number from the Object. With C++17, you can use ``std::invoke`` with a pointer to the member number directly as the body. You can update the previous example as follows: .. 
code:: struct Object { int number; }; int main() { using namespace oneapi::tbb::flow; // The processing logic for the received integer auto number_processor = [] (int i) { /* processing integer */ }; // Create a graph object g to hold the flow graph graph g; // Use a pointer to the number data member of the Object struct as the body function_node func1(g, unlimited, &Object::number); // Use the number_processor lambda function as the body function_node func2(g, unlimited, number_processor); // Connect the function nodes make_edge(func1, func2); // Provide produced input to the graph func1.try_put(Object{1}); // Wait for the graph to complete g.wait_for_all(); } Find More ********* The following APIs support Callable objects as bodies: * `parallel_for `_ * `parallel_reduce `_ * `parallel_deterministic_reduce `_ * `parallel_for_each `_ * `parallel_scan `_ * `parallel_pipeline `_ * `function_node `_ * `multifunction_node `_ * `async_node `_ * `sequencer_node `_ * `join_node with key_matching policy `_ ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/title.rst ================================================ .. _title: |short_name| Developer Guide ============================ |full_name| ..
toctree:: :maxdepth: 4 ../tbb_userguide/Package_Contents_os ../tbb_userguide/Parallelizing_Simple_Loops_os ../tbb_userguide/Parallelizing_Complex_Loops ../tbb_userguide/Flow_Graph ../tbb_userguide/work_isolation ../tbb_userguide/Exceptions_and_Cancellation ../tbb_userguide/Floating_Point_Settings ../tbb_userguide/Containers ../tbb_userguide/Mutual_Exclusion ../tbb_userguide/Timing ../tbb_userguide/Memory_Allocation ../tbb_userguide/The_Task_Scheduler ../tbb_userguide/design_patterns/Design_Patterns ../tbb_userguide/Migration_Guide ../tbb_userguide/Constraints ../tbb_userguide/std_invoke ../tbb_userguide/appendix_A ../tbb_userguide/appendix_B ../tbb_userguide/References ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/use_concurrency_limits.rst ================================================ .. _use_concurrency_limits: Use Concurrency Limits ====================== To control the number of instances of a single node, you can use the concurrency limit on the node. To cause it to reject messages after it reaches its concurrency limit, you construct it as a "rejecting" node. A function node is constructed with one or more template arguments. The third argument controls the buffer policy used by the node, and is by default queueing. With a queueing policy, a ``function_node`` that has reached its concurrency limit still accepts incoming messages, but buffers them internally. If the policy is set to rejecting the node will instead reject the incoming messages. 
:: template < typename Input, typename Output = continue_msg, graph_buffer_policy = queueing > class function_node; For example, you can control the number of big objects in flight in a graph by placing a rejecting ``function_node`` downstream of an ``input_node``, as is done below: :: graph g; int src_count = 0; int number_of_objects = 0; int max_objects = 3; input_node< big_object * > s( g, [&]( oneapi::tbb::flow_control& fc ) -> big_object* { if ( src_count < M ) { big_object* v = new big_object(); ++src_count; return v; } else { fc.stop(); return nullptr; } } ); s.activate(); function_node< big_object *, continue_msg, rejecting > f( g, 3, []( big_object *v ) -> continue_msg { spin_for(1); delete v; return continue_msg(); } ); make_edge( s, f ); g.wait_for_all(); The ``function_node`` will operate on at most three big objects concurrently. The node's concurrency threshold limits the node to three concurrent invocations. When the ``function_node`` is running three instances concurrently, it will start rejecting incoming messages from the ``input_node``, causing the ``input_node`` to buffer its last created object and temporarily stop invoking its body object. Whenever the ``function_node`` drops below its concurrency limit, it will pull new messages from the ``input_node``. At most four big objects will exist simultaneously, three in the ``function_node`` and one buffered in the ``input_node``. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/use_graph_reset.rst ================================================ .. _use_graph_reset: Use ``graph::reset()`` to Reset a Canceled Graph ================================================ When a graph execution is canceled either because of an unhandled exception or because its ``task_group_context`` is canceled explicitly, the graph and its nodes may be left in an indeterminate state.
For example, in the code samples shown in :ref:`cancel_a_graph` the input 2 may be left in a buffer. But even beyond remnants in the buffers, there are other optimizations performed during the execution of a flow graph that can leave its nodes and edges in an indeterminate state. If you want to re-execute or restart a graph, you first need to reset the graph: :: try { g.wait_for_all(); } catch ( int j ) { cout << "Caught " << j << "\n"; // do something to fix the problem g.reset(); f1.try_put(1); f1.try_put(2); g.wait_for_all(); } ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/use_input_node.rst ================================================ .. _use_input_node: Using input_node ================= An ``input_node`` is always constructed in the inactive state: :: template< typename Body > input_node( graph &g, Body body ) To activate an inactive ``input_node``, you call the node's function activate: :: input_node< int > src( g, src_body(10) ); // use it in calls to make_edge… src.activate(); All ``input_node`` objects are constructed in the inactive state and usually activated after the entire flow graph is constructed. For example, you can use the code in :ref:`Data_Flow_Graph`. In that implementation, the ``input_node`` is constructed in the inactive state and activated after all other edges are made: :: make_edge( squarer, summer ); make_edge( cuber, summer ); input_node< int > src( g, src_body(10) ); make_edge( src, squarer ); make_edge( src, cuber ); src.activate(); g.wait_for_all(); In this example, if the ``input_node`` was toggled to the active state at the beginning, it might send a message to squarer immediately after the edge to squarer is connected. Later, when the edge to cuber is connected, cuber will receive all future messages, but may have already missed some.
In general it is safest to create your ``input_node`` objects in the inactive state and then activate them after the whole graph is constructed. However, this approach serializes graph construction and graph execution. Some graphs can be constructed safely with ``input_node`` active, allowing the overlap of construction and execution. If your graph is a directed acyclic graph (DAG), and each ``input_node`` has only one successor, you can activate your ``input_node`` objects just after their construction if you construct the edges in reverse topological order; that is, make the edges at the largest depth in the tree first, and work back to the shallowest edges. For example, if ``src`` is an ``input_node`` and ``func1`` and ``func2`` are both function nodes, the following graph would not drop messages, even though ``src`` is activated just after its construction: :: const int limit = 10; int count = 0; oneapi::tbb::flow::graph g; oneapi::tbb::flow::input_node src( g, [&]( oneapi::tbb::flow_control &fc ) -> int { if ( count < limit ) { return ++count; } fc.stop(); return {}; }); src.activate(); oneapi::tbb::flow::function_node func1( g, 1, []( int i ) -> int { std::cout << i << "\n"; return i; } ); oneapi::tbb::flow::function_node func2( g, 1, []( int i ) -> int { std::cout << i << "\n"; return i; } ); make_edge( func1, func2 ); make_edge( src, func1 ); g.wait_for_all(); The above code is safe because the edge from ``func1`` to ``func2`` is made before the edge from ``src`` to ``func1``. If the edge from ``src`` to ``func1`` were made first, ``func1`` might generate a message before ``func2`` is attached to it; that message would be dropped. Also, ``src`` has only a single successor. If ``src`` had more than one successor, the successor that is attached first might receive messages that do not reach the successors that are attached after it.
================================================ FILE: third-party/tbb/doc/main/tbb_userguide/use_limiter_node.rst ================================================ .. _use_limiter_node: Using limiter_node ================== One way to limit resource consumption is to use a limiter_node to set a limit on the number of messages that can flow through a given point in your graph. The constructor for a limiter node takes two arguments: :: limiter_node( graph &g, size_t threshold ) The first argument is a reference to the graph it belongs to. The second argument sets the maximum number of items that should be allowed to pass through before the node starts rejecting incoming messages. A limiter_node maintains an internal count of the messages that it has allowed to pass. When a message leaves the controlled part of the graph, a message can be sent to the decrement port on the ``limiter_node`` to decrement the count, allowing additional messages to pass through. In the example below, an ``input_node`` will generate ``M`` big objects. But the user wants to allow at most three big objects to reach the ``function_node`` at a time, and to prevent the ``input_node`` from generating all ``M`` big objects at once. :: graph g; int src_count = 0; int number_of_objects = 0; int max_objects = 3; input_node< big_object * > s( g, [&]( oneapi::tbb::flow_control& fc ) -> big_object* { if ( src_count < M ) { big_object* v = new big_object(); ++src_count; return v; } else { fc.stop(); return nullptr; } } ); s.activate(); limiter_node< big_object * > l( g, max_objects ); function_node< big_object *, continue_msg > f( g, unlimited, []( big_object *v ) -> continue_msg { spin_for(1); delete v; return continue_msg(); } ); make_edge( l, f ); make_edge( f, l.decrement ); make_edge( s, l ); g.wait_for_all(); The example above prevents the ``input_node`` from generating all ``M`` big objects at once. 
The ``limiter_node`` has a threshold of 3, and will therefore start rejecting incoming messages after its internal count reaches 3. When the ``input_node`` sees its message rejected, it stops calling its body object and temporarily buffers the last generated value. The ``function_node`` has its output, a ``continue_msg``, sent to the decrement port of the ``limiter_node``. So, after it completes executing, the ``limiter_node`` internal count is decremented. When the internal count drops below the threshold, messages begin flowing from the ``input_node`` again. So in this example, at most four big objects exist at a time, the three that have passed through the ``limiter_node`` and the one that is buffered in the ``input_node``. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/use_make_edge.rst ================================================ .. _use_make_edge: Use make_edge and remove_edge ============================= These are the basic guidelines for creating and removing edges: - Use ``make_edge`` and ``remove_edge`` - Avoid using ``register_successor`` and ``register_predecessor`` - Avoid using ``remove_successor`` and ``remove_predecessor`` As a convention, to communicate the topology, use only functions ``flow::make_edge`` and ``flow::remove_edge``. The runtime library uses node functions, such as ``sender::register_successor``, to create these edges, but those functions should not be called directly. The runtime library calls these node functions directly to implement optimizations on the topology at runtime. ================================================ FILE: third-party/tbb/doc/main/tbb_userguide/use_nested_algorithms.rst ================================================ .. _use_nested_algorithms: Use Nested Algorithms to Increase Scalability ============================================= One powerful way to increase the scalability of a flow graph is to nest other parallel algorithms inside of node bodies. 
Doing so, you can use a flow graph as a coordination language, expressing the most coarse-grained parallelism at the level of the graph, with finer grained parallelism nested within. In the example below, five nodes are created: an ``input_node``, ``matrix_source``, that reads a sequence of matrices from a file, two ``function_nodes``, ``n1`` and ``n2``, that receive these matrices and generate two new matrices by applying a function to each element, and two final ``function_nodes``, ``n1_sink`` and ``n2_sink``, that process these resulting matrices. The ``matrix_source`` is connected to both ``n1`` and ``n2``. The node ``n1`` is connected to ``n1_sink``, and ``n2`` is connected to ``n2_sink``. In the lambda expressions for ``n1`` and ``n2``, a ``parallel_for`` is used to apply the functions to the elements of the matrix in parallel. The functions ``read_next_matrix``, ``f1``, ``f2``, ``consume_f1`` and ``consume_f2`` are not provided below. :: graph g; input_node< double * > matrix_source( g, [&]( oneapi::tbb::flow_control &fc ) -> double* { double *a = read_next_matrix(); if ( a ) { return a; } else { fc.stop(); return nullptr; } } ); function_node< double *, double * > n1( g, unlimited, [&]( double *a ) -> double * { double *b = new double[N]; parallel_for( 0, N, [&](int i) { b[i] = f1(a[i]); } ); return b; } ); function_node< double *, double * > n2( g, unlimited, [&]( double *a ) -> double * { double *b = new double[N]; parallel_for( 0, N, [&](int i) { b[i] = f2(a[i]); } ); return b; } ); function_node< double *, double * > n1_sink( g, unlimited, []( double *b ) -> double * { return consume_f1(b); } ); function_node< double *, double * > n2_sink( g, unlimited, []( double *b ) -> double * { return consume_f2(b); } ); make_edge( matrix_source, n1 ); make_edge( matrix_source, n2 ); make_edge( n1, n1_sink ); make_edge( n2, n2_sink ); matrix_source.activate(); g.wait_for_all(); ================================================ FILE: 
third-party/tbb/doc/main/tbb_userguide/use_nested_flow_graphs.rst ================================================ .. _use_nested_flow_graphs: Use Nested Flow Graphs ====================== In addition to nesting algorithms within a flow graph node, it is also possible to nest flow graphs. For example, below there is a graph ``g`` with two nodes, ``a`` and ``b``. When node ``a`` receives a message, it constructs and executes an inner dependence graph. When node ``b`` receives a message, it constructs and executes an inner data flow graph: :: graph g; function_node< int, int > a( g, unlimited, []( int i ) -> int { graph h; node_t n1( h, [=]( msg_t ) { cout << "n1: " << i << "\n"; } ); node_t n2( h, [=]( msg_t ) { cout << "n2: " << i << "\n"; } ); node_t n3( h, [=]( msg_t ) { cout << "n3: " << i << "\n"; } ); node_t n4( h, [=]( msg_t ) { cout << "n4: " << i << "\n"; } ); make_edge( n1, n2 ); make_edge( n1, n3 ); make_edge( n2, n4 ); make_edge( n3, n4 ); n1.try_put(continue_msg()); h.wait_for_all(); return i; } ); function_node< int, int > b( g, unlimited, []( int i ) -> int { graph h; function_node< int, int > m1( h, unlimited, []( int j ) -> int { cout << "m1: " << j << "\n"; return j; } ); function_node< int, int > m2( h, unlimited, []( int j ) -> int { cout << "m2: " << j << "\n"; return j; } ); function_node< int, int > m3( h, unlimited, []( int j ) -> int { cout << "m3: " << j << "\n"; return j; } ); function_node< int, int > m4( h, unlimited, []( int j ) -> int { cout << "m4: " << j << "\n"; return j; } ); make_edge( m1, m2 ); make_edge( m1, m3 ); make_edge( m2, m4 ); make_edge( m3, m4 ); m1.try_put(i); h.wait_for_all(); return i; } ); make_edge( a, b ); for ( int i = 0; i < 3; ++i ) { a.try_put(i); } g.wait_for_all(); If the nested graph remains unchanged in structure between invocations of the node, it is redundant to construct it each time. Reconstructing the graph only adds overhead to the execution. 
You can modify the example above, for example, to have node ``b`` reuse a graph that is persistent across its invocations: :: graph h; function_node< int, int > m1( h, unlimited, []( int j ) -> int { cout << "m1: " << j << "\n"; return j; } ); function_node< int, int > m2( h, unlimited, []( int j ) -> int { cout << "m2: " << j << "\n"; return j; } ); function_node< int, int > m3( h, unlimited, []( int j ) -> int { cout << "m3: " << j << "\n"; return j; } ); function_node< int, int > m4( h, unlimited, []( int j ) -> int { cout << "m4: " << j << "\n"; return j; } ); make_edge( m1, m2 ); make_edge( m1, m3 ); make_edge( m2, m4 ); make_edge( m3, m4 ); graph g; function_node< int, int > a( g, unlimited, []( int i ) -> int { graph h; node_t n1( h, [=]( msg_t ) { cout << "n1: " << i << "\n"; } ); node_t n2( h, [=]( msg_t ) { cout << "n2: " << i << "\n"; } ); node_t n3( h, [=]( msg_t ) { cout << "n3: " << i << "\n"; } ); node_t n4( h, [=]( msg_t ) { cout << "n4: " << i << "\n"; } ); make_edge( n1, n2 ); make_edge( n1, n3 ); make_edge( n2, n4 ); make_edge( n3, n4 ); n1.try_put(continue_msg()); h.wait_for_all(); return i; } ); function_node< int, int > b( g, unlimited, [&]( int i ) -> int { m1.try_put(i); h.wait_for_all(); // optional since h is not destroyed return i; } ); make_edge( a, b ); for ( int i = 0; i < 3; ++i ) { a.try_put(i); } g.wait_for_all(); It is only necessary to call ``h.wait_for_all()`` at the end of each invocation of ``b``'s body in our modified code, if you wish for this ``b``'s body to block until the inner graph is done. In the first implementation of ``b``, it was necessary to call ``h.wait_for_all`` at the end of each invocation since the graph was destroyed at the end of the scope. So it would be valid in the body of ``b`` above to call ``m1.try_put(i)`` and then return without waiting for ``h`` to become idle. 
================================================ FILE: third-party/tbb/doc/main/tbb_userguide/work_isolation.rst ================================================ .. _work_isolation: Work Isolation ============== .. container:: section In |full_name|, a thread waiting for a group of tasks to complete might execute other available tasks. In particular, when a parallel construct calls another parallel construct, a thread can obtain a task from the outer-level construct while waiting for completion of the inner-level one. In the following example with two ``parallel_for`` calls, the call to the second (nested) parallel loop blocks execution of the first (outer) loop iteration: :: // The first parallel loop. oneapi::tbb::parallel_for( 0, N1, []( int i ) { // The second parallel loop. oneapi::tbb::parallel_for( 0, N2, []( int j ) { /* Some work */ } ); } ); The blocked thread is allowed to take tasks belonging to the first parallel loop. As a result, two or more iterations of the outer loop might be simultaneously assigned to the same thread. In other words, in oneTBB execution of functions constituting a parallel construct is *unsequenced* even within a single thread. In most cases, this behavior is harmless or even beneficial because it does not restrict parallelism available for the thread. However, in some cases such unsequenced execution may result in errors. For example, a thread-local variable might unexpectedly change its value after a nested parallel construct: :: oneapi::tbb::enumerable_thread_specific ets; oneapi::tbb::parallel_for( 0, N1, [&ets]( int i ) { // Set a thread specific value ets.local() = i; oneapi::tbb::parallel_for( 0, N2, []( int j ) { /* Some work */ } ); // While executing the above parallel_for, the thread might have run iterations // of the outer parallel_for, and so might have changed the thread specific value. assert( ets.local()==i ); // The assertion may fail! 
} ); In other scenarios, the described behavior might lead to deadlocks and other issues. In these cases, a stronger guarantee of execution being sequenced within a thread is desired. For that, oneTBB provides ways to *isolate* execution of a parallel construct, so that its tasks do not interfere with other simultaneously running tasks. One of these ways is to execute the inner level loop in a separate ``task_arena``: :: oneapi::tbb::enumerable_thread_specific ets; oneapi::tbb::task_arena nested; oneapi::tbb::parallel_for( 0, N1, [&]( int i ) { // Set a thread specific value ets.local() = i; nested.execute( []{ // Run the inner parallel_for in a separate arena to prevent the thread // from taking tasks of the outer parallel_for. oneapi::tbb::parallel_for( 0, N2, []( int j ) { /* Some work */ } ); } ); assert( ets.local()==i ); // Valid assertion } ); However, using a separate arena for work isolation is not always convenient, and might have noticeable overheads. To address these shortcomings, oneTBB provides the ``this_task_arena::isolate`` function, which runs a user-provided functor in isolation by restricting the calling thread to process only tasks scheduled in the scope of the functor (also called the isolation region). When a thread enters a task-waiting call or a blocking parallel construct inside an isolated region, it can only execute tasks spawned within the region and their child tasks spawned by other threads. The thread is prohibited from executing any outer level tasks or tasks belonging to other isolated regions. The isolation region imposes restrictions only upon the thread that called it. Other threads running in the same task arena have no restrictions on task selection unless isolated by a distinct call to ``this_task_arena::isolate``. The following example demonstrates the use of ``this_task_arena::isolate`` to ensure that a thread-local variable is not changed unexpectedly during the call to a nested parallel construct.
:: #include "oneapi/tbb/task_arena.h" #include "oneapi/tbb/parallel_for.h" #include "oneapi/tbb/enumerable_thread_specific.h" #include int main() { const int N1 = 1000, N2 = 1000; oneapi::tbb::enumerable_thread_specific ets; oneapi::tbb::parallel_for( 0, N1, [&ets]( int i ) { // Set a thread specific value ets.local() = i; // Run the second parallel loop in an isolated region to prevent the current thread // from taking tasks related to the outer parallel loop. oneapi::tbb::this_task_arena::isolate( []{ oneapi::tbb::parallel_for( 0, N2, []( int j ) { /* Some work */ } ); } ); assert( ets.local()==i ); // Valid assertion } ); return 0; } ================================================ FILE: third-party/tbb/doc/make.bat ================================================ @ECHO OFF rem ============================================================================ rem Copyright (C) 2022 Intel Corporation rem rem Licensed under the Apache License, Version 2.0 (the "License"); rem you may not use this file except in compliance with the License. rem You may obtain a copy of the License at rem rem http://www.apache.org/licenses/LICENSE-2.0 rem rem Unless required by applicable law or agreed to in writing, software rem distributed under the License is distributed on an "AS IS" BASIS, rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. rem See the License for the specific language governing permissions and rem limitations under the License. rem ============================================================================ pushd %~dp0 REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set SOURCEDIR=. set BUILDDIR=build if "%1" == "" goto help %SPHINXBUILD% >NUL 2>NUL if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. 
Alternatively you echo.may add the Sphinx directory to PATH. echo. echo.If you don't have Sphinx installed, grab it from echo.http://sphinx-doc.org/ exit /b 1 ) %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% goto end :help %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% :end popd ================================================ FILE: third-party/tbb/doc/test_classification.dox ================================================ /// \page test_classification Test classification /// The list of test classes attributed to test cases. /// /// \section interface \\\#interface /// This class of tests covers interface availability and checks basic /// interface behavior. /// /// \section requirement \\\#requirement /// This class of tests covers one or more statements in the /// specification document. /// /// \section negative \\\#negative /// This class of tests checks that input specified as invalid is /// processed correctly. Additionally, this class of tests might /// check impossibility to call particular interfaces that are not /// described by the specification. /// /// \section resource_usage \\\#resource usage /// This class of tests checks correct resource usage and absence of /// resource leaks. /// /// \section boundary \\\#boundary /// This class of tests checks boundary values, possible overflows, etc. /// /// \section stress \\\#stress /// This class of tests tries to detect synchronization and other issues /// under heavy load and extensive usage of the functionality. /// /// \section error_guessing \\\#error guessing /// This class of tests tries to detect issues that implementation might /// have but these issues cannot be deduced from the specification. /// /// \section regression \\\#regression /// This class of tests covers issues that were detected and fixed after /// functionality release.
================================================ FILE: third-party/tbb/examples/.clang-format ================================================ # Copyright (c) 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. --- Language: Cpp AccessModifierOffset: -4 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false AlignConsecutiveMacros: true AlignEscapedNewlines: Left AlignOperands: true AlignTrailingComments: false AllowAllArgumentsOnNextLine: true AllowAllConstructorInitializersOnNextLine: false AllowAllParametersOfDeclarationOnNextLine: false AllowShortBlocksOnASingleLine: false AllowShortCaseLabelsOnASingleLine: true AllowShortFunctionsOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: Never AllowShortLambdasOnASingleLine: Empty AllowShortLoopsOnASingleLine: false AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: Yes BinPackArguments: false BinPackParameters: false BraceWrapping: AfterCaseLabel: false AfterClass: false AfterControlStatement: false AfterEnum: false AfterFunction: false AfterNamespace: false AfterObjCDeclaration: false AfterStruct: false AfterUnion: false AfterExternBlock: false BeforeCatch: true BeforeElse: true IndentBraces: false SplitEmptyFunction: false SplitEmptyRecord: false SplitEmptyNamespace: false BreakBeforeBinaryOperators: None BreakBeforeBraces: Custom BreakBeforeTernaryOperators: 
true BreakConstructorInitializers: BeforeColon BreakInheritanceList: BeforeColon BreakStringLiterals: false ColumnLimit: 100 CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: true ConstructorInitializerIndentWidth: 8 ContinuationIndentWidth: 4 Cpp11BracedListStyle: false DerivePointerAlignment: true DisableFormat: false FixNamespaceComments: true ForEachMacros: - foreach - Q_FOREACH - BOOST_FOREACH IncludeBlocks: Preserve IncludeCategories: - Regex: '^' Priority: 2 - Regex: '^<.*\.h>' Priority: 1 - Regex: '^<.*' Priority: 2 - Regex: '.*' Priority: 3 IncludeIsMainRegex: '([-_](test|unittest))?$' IndentCaseLabels: true IndentPPDirectives: None IndentWidth: 4 IndentWrappedFunctionNames: false KeepEmptyLinesAtTheStartOfBlocks: false MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None PenaltyBreakAssignment: 2 PenaltyBreakBeforeFirstCallParameter: 1 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyBreakTemplateDeclaration: 10 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 200 PointerAlignment: Left RawStringFormats: - Language: Cpp Delimiters: - cc - CC - cpp - Cpp - CPP - 'c++' - 'C++' CanonicalDelimiter: '' BasedOnStyle: google - Language: TextProto Delimiters: - pb - PB - proto - PROTO EnclosingFunctions: - EqualsProto - EquivToProto - PARSE_PARTIAL_TEXT_PROTO - PARSE_TEST_PROTO - PARSE_TEXT_PROTO - ParseTextOrDie - ParseTextProtoOrDie CanonicalDelimiter: '' BasedOnStyle: google ReflowComments: false SortIncludes: false SortUsingDeclarations: false SpaceAfterCStyleCast: false SpaceAfterLogicalNot: false SpaceAfterTemplateKeyword: true SpaceBeforeAssignmentOperators: true SpaceBeforeCpp11BracedList: false SpaceBeforeCtorInitializerColon: true SpaceBeforeInheritanceColon: true SpaceBeforeParens: ControlStatements SpaceBeforeRangeBasedForLoopColon: true SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 
SpacesInAngles: false SpacesInContainerLiterals: false SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false Standard: Cpp11 StatementMacros: - Q_UNUSED - QT_REQUIRE_VERSION TabWidth: 1 UseTab: Never ... ================================================ FILE: third-party/tbb/examples/CMakeLists.txt ================================================ # Copyright (c) 2020-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. cmake_minimum_required(VERSION 3.5.0...3.31.3) project(tbb_examples CXX) add_custom_target(build_examples) add_custom_target(run_examples) add_dependencies(run_examples build_examples) add_custom_target(light_test_examples) add_dependencies(light_test_examples build_examples) macro(tbb_add_example subdir name) add_subdirectory(${subdir}/${name}) add_dependencies(build_examples ${name}) add_dependencies(run_examples run_${name}) if (TARGET light_test_${name}) add_dependencies(light_test_examples light_test_${name}) else() add_dependencies(light_test_examples run_${name}) endif() endmacro() tbb_add_example(concurrent_hash_map count_strings) tbb_add_example(concurrent_priority_queue shortpath) tbb_add_example(getting_started sub_string_finder) tbb_add_example(graph binpack) # TODO: Consider using FindMKL module find_library(MKL_INTEL_LP64_LIB mkl_intel_lp64 PATHS ENV LIBRARY_PATH) find_library(MKL_SEQUENTIAL_LIB mkl_sequential PATHS ENV LIBRARY_PATH) find_library(MKL_CORE_LIB mkl_core PATHS ENV LIBRARY_PATH) 
if(MKL_INTEL_LP64_LIB AND MKL_SEQUENTIAL_LIB AND MKL_CORE_LIB) tbb_add_example(graph cholesky) else() message(WARNING "Intel(R) Math Kernel Library (Intel(R) MKL) libraries were not found, graph/cholesky example is excluded from the build.") endif() tbb_add_example(graph dining_philosophers) tbb_add_example(graph fgbzip2) tbb_add_example(graph logic_sim) tbb_add_example(graph som) tbb_add_example(parallel_for game_of_life) tbb_add_example(parallel_for polygon_overlay) tbb_add_example(parallel_for seismic) tbb_add_example(parallel_for tachyon) tbb_add_example(parallel_for_each parallel_preorder) tbb_add_example(parallel_pipeline square) tbb_add_example(parallel_reduce convex_hull) tbb_add_example(parallel_reduce pi) tbb_add_example(parallel_reduce primes) tbb_add_example(task_arena fractal) tbb_add_example(task_group sudoku) tbb_add_example(test_all fibonacci) tbb_add_example(migration recursive_fibonacci) ================================================ FILE: third-party/tbb/examples/README.md ================================================ # Code Samples of oneAPI Threading Building Blocks (oneTBB) This directory contains example usages of oneAPI Threading Building Blocks. | Code sample name | Description |:--- |:--- | getting_started/sub_string_finder | Example referenced by the [oneAPI Threading Building Blocks Get Started Guide](https://uxlfoundation.github.io/oneTBB/GSG/get_started.html#get-started-guide). Finds the largest matching substrings. | concurrent_hash_map/count_strings | Concurrently inserts strings into a `concurrent_hash_map` container. | concurrent_priority_queue/shortpath | Solves the single source shortest path problem using a `concurrent_priority_queue` container. | graph/binpack | A solution to the binpacking problem using a `queue_node`, a `buffer_node`, and `function_node`s. | graph/cholesky | Several versions of Cholesky Factorization algorithm implementation. 
| graph/dining_philosophers | An implementation of dining philosophers in a graph using the reserving `join_node`. | graph/fgbzip2 | A parallel implementation of the bzip2 block-sorting file compressor. | graph/logic_sim | An example of a collection of digital logic gates that can be easily composed into larger circuits. | graph/som | An example of a Kohonen Self-Organizing Map using cancellation. | parallel_for/game_of_life | Game of life overlay. | parallel_for/polygon_overlay | Polygon overlay. | parallel_for/seismic | Parallel seismic wave simulation. | parallel_for/tachyon | Parallel 2-D raytracer/renderer. | parallel_for_each/parallel_preorder | Parallel preorder traversal of a graph. | parallel_pipeline/square | Another string transformation example that squares numbers read from a file. | parallel_reduce/convex_hull | Parallel version of the convex hull algorithm (quick hull). | parallel_reduce/pi | Parallel version of calculating π by numerical integration. | parallel_reduce/primes | Parallel version of the Sieve of Eratosthenes. | task_arena/fractal | The example calculates two classical Mandelbrot fractals with different concurrency limits. | task_group/sudoku | Compute all solutions for a Sudoku board. | test_all/fibonacci | Compute Fibonacci numbers in different ways. ## System Requirements Refer to the [System Requirements](https://github.com/uxlfoundation/oneTBB/blob/master/SYSTEM_REQUIREMENTS.md) for the list of supported hardware and software. ### Graphical User Interface (GUI) Some examples (e.g., fractal, seismic, tachyon, polygon_overlay) support different GUI modes, which may be defined via the `EXAMPLES_UI_MODE` CMake variable. Supported values are: - Cross-platform: - `con` - Console mode (Default). - Windows* OS: - `gdi` - `GDI+` based implementation. - `d2d` - `Direct 2D` based implementation. May offer superior performance but can only be used if the Microsoft* DirectX* SDK is installed on your system (`DXSDK_DIR` should be defined).
- Linux* OS: - `x` - `X11` based implementation. Also `libXext` may be required to display the output correctly. - macOS*: - `mac` - `OpenGL` based implementation. Also requires the `Foundation` and `Cocoa` libraries availability. ================================================ FILE: third-party/tbb/examples/common/cmake/common.cmake ================================================ # Copyright (c) 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. macro(set_common_project_settings required_components) # Path to common headers include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../) if (NOT TARGET TBB::tbb) list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../../common/cmake/modules) find_package(TBB REQUIRED COMPONENTS ${required_components}) endif() find_package(Threads REQUIRED) # --------------------------------------------------------------------------------------------------------- # Handle C++ standard version. if (NOT MSVC) # no need to cover MSVC as it uses C++14 by default. if (NOT CMAKE_CXX_STANDARD) set(CMAKE_CXX_STANDARD 11) endif() if (CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION) # if standard option was detected by CMake set(CMAKE_CXX_STANDARD_REQUIRED ON) endif() endif() set(CMAKE_CXX_EXTENSIONS OFF) # use -std=c++... instead of -std=gnu++... 
# --------------------------------------------------------------------------------------------------------- endmacro() macro(add_execution_target TARGET_NAME TARGET_DEPENDENCIES EXECUTABLE ARGS) if (WIN32) add_custom_target(${TARGET_NAME} set "PATH=$\\;$ENV{PATH}" & ${EXECUTABLE} ${ARGS}) else() add_custom_target(${TARGET_NAME} ${EXECUTABLE} ${ARGS}) endif() add_dependencies(${TARGET_NAME} ${TARGET_DEPENDENCIES}) endmacro() ================================================ FILE: third-party/tbb/examples/common/cmake/modules/FindTBB.cmake ================================================ # Copyright (c) 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. include(FindPackageHandleStandardArgs) # Firstly search for TBB in config mode (i.e. search for TBBConfig.cmake). 
find_package(TBB QUIET CONFIG)
if (TBB_FOUND)
    find_package_handle_standard_args(TBB CONFIG_MODE)
    return()
endif()

# Everything below is the fallback used when config mode found nothing.

# No components requested: default to tbb and tbbmalloc and mark both as required.
if (NOT TBB_FIND_COMPONENTS)
    set(TBB_FIND_COMPONENTS tbb tbbmalloc)
    foreach (_tbb_component ${TBB_FIND_COMPONENTS})
        set(TBB_FIND_REQUIRED_${_tbb_component} 1)
    endforeach()
endif()

# Extra search locations taken from the environment.
if (WIN32)
    list(APPEND ADDITIONAL_LIB_DIRS ENV PATH ENV LIB)
    list(APPEND ADDITIONAL_INCLUDE_DIRS ENV INCLUDE ENV CPATH)
else()
    list(APPEND ADDITIONAL_LIB_DIRS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH ENV DYLD_LIBRARY_PATH)
    list(APPEND ADDITIONAL_INCLUDE_DIRS ENV CPATH ENV C_INCLUDE_PATH ENV CPLUS_INCLUDE_PATH ENV INCLUDE_PATH)
endif()

find_path(_tbb_include_dir NAMES tbb/tbb.h PATHS ${ADDITIONAL_INCLUDE_DIRS})

if (_tbb_include_dir)
    # TODO: consider TBB_VERSION handling
    set(_TBB_BUILD_MODES RELEASE DEBUG)
    set(_TBB_DEBUG_SUFFIX _debug)

    # Create one imported target per requested component that does not exist yet.
    foreach (_tbb_component ${TBB_FIND_COMPONENTS})
        if (NOT TARGET TBB::${_tbb_component})
            add_library(TBB::${_tbb_component} SHARED IMPORTED)
            set_property(TARGET TBB::${_tbb_component} APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${_tbb_include_dir})

            foreach(_TBB_BUILD_MODE ${_TBB_BUILD_MODES})
                # Only _TBB_DEBUG_SUFFIX is defined, so the RELEASE name carries no suffix.
                set(_tbb_component_lib_name ${_tbb_component}${_TBB_${_TBB_BUILD_MODE}_SUFFIX})

                if (WIN32)
                    find_library(${_tbb_component_lib_name}_lib ${_tbb_component_lib_name} PATHS ${ADDITIONAL_LIB_DIRS})
                    find_file(${_tbb_component_lib_name}_dll ${_tbb_component_lib_name}.dll PATHS ${ADDITIONAL_LIB_DIRS})

                    set_target_properties(TBB::${_tbb_component} PROPERTIES
                        IMPORTED_LOCATION_${_TBB_BUILD_MODE} "${${_tbb_component_lib_name}_dll}"
                        IMPORTED_IMPLIB_${_TBB_BUILD_MODE} "${${_tbb_component_lib_name}_lib}"
                    )
                else()
                    find_library(${_tbb_component_lib_name}_so ${_tbb_component_lib_name} PATHS ${ADDITIONAL_LIB_DIRS})

                    set_target_properties(TBB::${_tbb_component} PROPERTIES
                        IMPORTED_LOCATION_${_TBB_BUILD_MODE} "${${_tbb_component_lib_name}_so}"
                    )
                endif()

                # The component counts as found once either the import library + DLL pair
                # or the shared object was located for this build mode.
                if (${_tbb_component_lib_name}_lib AND ${_tbb_component_lib_name}_dll OR ${_tbb_component_lib_name}_so)
                    set_property(TARGET TBB::${_tbb_component} APPEND PROPERTY IMPORTED_CONFIGURATIONS ${_TBB_BUILD_MODE})
                    list(APPEND TBB_IMPORTED_TARGETS TBB::${_tbb_component})
                    set(TBB_${_tbb_component}_FOUND 1)
                endif()

                # Drop the per-mode search results from the cache so that a later run searches again.
                unset(${_tbb_component_lib_name}_lib CACHE)
                unset(${_tbb_component_lib_name}_dll CACHE)
                unset(${_tbb_component_lib_name}_so CACHE)
                unset(_tbb_component_lib_name)
            endforeach()
        endif()
    endforeach()

    # Clean up the helper variables set at the top of this branch
    # (the variable name was previously misspelled, so it was never unset).
    unset(_TBB_BUILD_MODES)
    unset(_TBB_DEBUG_SUFFIX)
endif()
unset(_tbb_include_dir CACHE)

# A target is appended once per build mode found, hence the de-duplication.
list(REMOVE_DUPLICATES TBB_IMPORTED_TARGETS)

find_package_handle_standard_args(TBB
                                  REQUIRED_VARS TBB_IMPORTED_TARGETS
                                  HANDLE_COMPONENTS)

================================================
FILE: third-party/tbb/examples/common/gui/CMakeLists.txt
================================================
# Copyright (c) 2020-2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.5.0...3.31.3) set(EXAMPLES_UI_MODE "con" CACHE STRING "EXAMPLES_UI_MODE") if (WIN32) set_property(CACHE EXAMPLES_UI_MODE PROPERTY STRINGS "con" "gdi" "d2d") elseif (APPLE) set_property(CACHE EXAMPLES_UI_MODE PROPERTY STRINGS "con" "mac") else() set_property(CACHE EXAMPLES_UI_MODE PROPERTY STRINGS "con" "x") endif() get_property(_available_ui_modes CACHE EXAMPLES_UI_MODE PROPERTY STRINGS) list(FIND _available_ui_modes ${EXAMPLES_UI_MODE} _find_index) if (NOT _find_index EQUAL -1) add_library(UI_LIB_${PROJECT_NAME} STATIC ${CMAKE_CURRENT_LIST_DIR}/${EXAMPLES_UI_MODE}video.cpp) if (EXAMPLES_UI_MODE STREQUAL "mac") enable_language(C) add_library(UI_LIB_OBJECT_${PROJECT_NAME} OBJECT ${CMAKE_CURRENT_LIST_DIR}/xcode/tbbExample/main.m ${CMAKE_CURRENT_LIST_DIR}/xcode/tbbExample/OpenGLView.m ${CMAKE_CURRENT_LIST_DIR}/xcode/tbbExample/tbbAppDelegate.m ) target_sources(UI_LIB_${PROJECT_NAME} PUBLIC $) endif() else() string(REPLACE ";" ", " _available_ui_modes "${_available_ui_modes}") message(FATAL_ERROR "Selected UI mode (${EXAMPLES_UI_MODE}) is not supported on ${CMAKE_SYSTEM_NAME}. 
Supported UI modes: ${_available_ui_modes}" ) endif() macro(set_gdi_ui_project_settings) find_program(RC rc REQUIRED) add_custom_command(TARGET UI_LIB_${PROJECT_NAME} PRE_BUILD COMMAND ${RC} /r -fo ${PROJECT_BINARY_DIR}/${PROJECT_NAME}.res ${PROJECT_SOURCE_DIR}/gui/${PROJECT_NAME}.rc ) set_target_properties(${PROJECT_NAME} PROPERTIES WIN32_EXECUTABLE TRUE) if (COMMAND target_link_options) target_link_options(UI_LIB_${PROJECT_NAME} PUBLIC ${PROJECT_BINARY_DIR}/${PROJECT_NAME}.res) else() set_target_properties(${PROJECT_NAME} PROPERTIES LINK_FLAGS ${PROJECT_BINARY_DIR}/${PROJECT_NAME}.res) endif() endmacro() macro(set_d2d_ui_project_settings) set_gdi_ui_project_settings() if (IS_DIRECTORY $ENV{DXSDK_DIR}) target_include_directories(UI_LIB_${PROJECT_NAME} PUBLIC $ENV{DXSDK_DIR}\\include) target_link_directories(UI_LIB_${PROJECT_NAME} PUBLIC $ENV{DXSDK_DIR}\\lib\\x64) else() message(FATAL_ERROR "Cannot find the DirectX library (required by the 'd2d' UI mode)") endif() endmacro() macro(set_x_ui_project_settings) find_package(X11 REQUIRED) target_link_libraries(UI_LIB_${PROJECT_NAME} PUBLIC ${X11_LIBRARIES}) target_include_directories(UI_LIB_${PROJECT_NAME} PUBLIC ${X11_INCLUDE_DIR}) find_library(LIB_Xext Xext PATHS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH ENV DYLD_LIBRARY_PATH) if (LIB_Xext) target_link_libraries(UI_LIB_${PROJECT_NAME} PUBLIC ${LIB_Xext}) else() target_compile_definitions(UI_LIB_${PROJECT_NAME} PUBLIC -DX_NOSHMEM) endif() endmacro() macro(set_mac_ui_project_settings) find_package(OpenGL REQUIRED) find_library(FOUNDATION_LIBRARY Foundation PATHS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH ENV DYLD_LIBRARY_PATH) if (NOT FOUNDATION_LIBRARY) message(FATAL_ERROR "Cannot find the Foundation library (required by the 'mac' UI mode)") endif() find_library(COCOA_LIBRARY Cocoa PATHS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH ENV DYLD_LIBRARY_PATH) if (NOT COCOA_LIBRARY) message(FATAL_ERROR "Cannot find the Cocoa library (required by the 'mac' UI mode)") endif() 
target_link_libraries(UI_LIB_${PROJECT_NAME} PUBLIC ${OPENGL_LIBRARIES} ${FOUNDATION_LIBRARY} ${COCOA_LIBRARY}) target_include_directories(UI_LIB_${PROJECT_NAME} PUBLIC ${OPENGL_INCLUDE_DIR}) file(MAKE_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/MacUI/${PROJECT_NAME}.app/Contents/Resources) file(MAKE_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/MacUI/${PROJECT_NAME}.app/Contents/MacOS) file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/../../common/gui/xcode/tbbExample/PkgInfo DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/MacUI/${PROJECT_NAME}.app/Contents) file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/../../common/gui/xcode/tbbExample/en.lproj DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/MacUI/${PROJECT_NAME}.app/Contents/Resources) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../../common/gui/xcode/tbbExample/tbbExample-Info.plist ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/MacUI/${PROJECT_NAME}.app/Contents/Info.plist COPYONLY) set_target_properties(${PROJECT_NAME} PROPERTIES PREFIX "MacUI/${PROJECT_NAME}.app/Contents/MacOS/") endmacro() if (NOT _find_index EQUAL -1) if ("${EXAMPLES_UI_MODE}" STREQUAL "gdi") set_gdi_ui_project_settings() elseif ("${EXAMPLES_UI_MODE}" STREQUAL "d2d") set_d2d_ui_project_settings() elseif ("${EXAMPLES_UI_MODE}" STREQUAL "mac") set_mac_ui_project_settings() elseif ("${EXAMPLES_UI_MODE}" STREQUAL "x") set_x_ui_project_settings() endif() target_compile_options(UI_LIB_${PROJECT_NAME} PRIVATE ${TBB_CXX_STD_FLAG}) endif() unset(_available_ui_modes) unset(_find_index) ================================================ FILE: third-party/tbb/examples/common/gui/convideo.cpp ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include #include "video.hpp" unsigned int *g_pImg = nullptr; int g_sizex, g_sizey; static video *g_video = nullptr; static int g_fps = 0; #if _WIN32 || _WIN64 static DWORD g_msec = 0; #ifdef _WINDOWS HINSTANCE video::win_hInstance = 0; int video::win_iCmdShow = 0; void video::win_set_class(WNDCLASSEX &wcex) {} void video::win_load_accelerators(int idc) {} #endif //_WINDOWS #else #include #include struct timeval g_time; #endif //_WIN32||_WIN64 #define CALC_FPS_ENABLED ((WINAPI_FAMILY != WINAPI_FAMILY_APP) && (!__ANDROID__)) video::video() // OpenGL* RGBA byte order for little-endian CPU : depth(24), red_shift(0), green_shift(8), blue_shift(16), red_mask(0xff), green_mask(0xff00), blue_mask(0xff0000) { assert(g_video == nullptr); g_video = this; title = "Video"; updating = calc_fps = false; } bool video::init_window(int x, int y) { g_sizex = x; g_sizey = y; g_pImg = new unsigned int[x * y]; running = true; return false; } bool video::init_console() { running = true; return true; } void video::terminate() { #if CALC_FPS_ENABLED if (calc_fps) { double fps = g_fps; #if _WIN32 || _WIN64 fps /= (GetTickCount() - g_msec) / 1000.0; #else struct timezone tz; struct timeval end_time; gettimeofday(&end_time, &tz); fps /= (end_time.tv_sec + 1.0 * end_time.tv_usec / 1000000.0) - (g_time.tv_sec + 1.0 * g_time.tv_usec / 1000000.0); #endif printf("%s: %.1f fps\n", title, fps); } #endif g_video = nullptr; running = false; delete[] g_pImg; g_pImg = nullptr; } video::~video() { if (g_video) terminate(); } //! 
Count and display FPS count in titlebar bool video::next_frame() { #if CALC_FPS_ENABLED if (calc_fps) { if (!g_fps) { #if _WIN32 || _WIN64 g_msec = GetTickCount(); #else struct timezone tz; gettimeofday(&g_time, &tz); #endif } g_fps++; } #endif return running; } //! Do standard loop void video::main_loop() { on_process(); } //! Change window title void video::show_title() {} ///////////////////////////////////////////// public methods of video class /////////////////////// drawing_area::drawing_area(int x, int y, int sizex, int sizey) : base_index(y * g_sizex + x), max_index(g_sizex * g_sizey), index_stride(g_sizex), pixel_depth(24), ptr32(g_pImg), start_x(x), start_y(y), size_x(sizex), size_y(sizey) { assert(x < g_sizex); assert(y < g_sizey); assert(x + sizex <= g_sizex); assert(y + sizey <= g_sizey); index = base_index; // current index } void drawing_area::update() {} ================================================ FILE: third-party/tbb/examples/common/gui/d2dvideo.cpp ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ // common Windows parts #include "winvideo.hpp" // and another headers #include #include #include #if _DXSDK_PRODUCT_MAJOR < 9 #error DXSDK Version 9 and above required. 
#endif #include #include #pragma comment(lib, "d2d1.lib") ID2D1Factory *m_pD2DFactory; ID2D1HwndRenderTarget *m_pRenderTarget; ID2D1Bitmap *m_pBitmap; D2D1_SIZE_U bitmapSize; HANDLE g_hVSync; #include #pragma comment(lib, "DxErr.lib") //! Create a dialog box and tell the user what went wrong bool DisplayError(LPSTR lpstrErr, HRESULT hres) { if (hres != S_OK) { static bool InError = false; int retval = 0; if (!InError) { InError = true; const char *message = hres ? DXGetErrorString(hres) : 0; retval = MessageBoxA(g_hAppWnd, lpstrErr, hres ? message : "Error!", MB_OK | MB_ICONERROR); InError = false; } } return false; } void DrawBitmap() { HRESULT hr = S_OK; if (m_pRenderTarget) { m_pRenderTarget->BeginDraw(); if (m_pBitmap) hr = m_pBitmap->CopyFromMemory(nullptr, (BYTE *)g_pImg, 4 * g_sizex); DisplayError("DrawBitmap error", hr); m_pRenderTarget->DrawBitmap(m_pBitmap); m_pRenderTarget->EndDraw(); } return; } inline void mouse(int k, LPARAM lParam) { int x = (int)LOWORD(lParam); int y = (int)HIWORD(lParam); RECT rc; GetClientRect(g_hAppWnd, &rc); g_video->on_mouse(x * g_sizex / (rc.right - rc.left), y * g_sizey / (rc.bottom - rc.top), k); } //! Win event processing function LRESULT CALLBACK InternalWndProc(HWND hwnd, UINT iMsg, WPARAM wParam, LPARAM lParam) { switch (iMsg) { case WM_MOVE: // Check to make sure our window exists before we tell it to repaint. // This will fail the first time (while the window is being created). 
if (hwnd) { InvalidateRect(hwnd, nullptr, FALSE); UpdateWindow(hwnd); } return 0L; case WM_SIZE: case WM_PAINT: if (g_video->running && g_video->updating) { DrawBitmap(); Sleep(0); } break; // Process all mouse and keyboard events case WM_LBUTTONDOWN: mouse(1, lParam); break; case WM_LBUTTONUP: mouse(-1, lParam); break; case WM_RBUTTONDOWN: mouse(2, lParam); break; case WM_RBUTTONUP: mouse(-2, lParam); break; case WM_MBUTTONDOWN: mouse(3, lParam); break; case WM_MBUTTONUP: mouse(-3, lParam); break; case WM_CHAR: g_video->on_key((int)wParam); break; // some useless stuff case WM_ERASEBKGND: return 1; // keeps erase-background events from happening, reduces chop case WM_DISPLAYCHANGE: return 0; // Now, shut down the window... case WM_DESTROY: PostQuitMessage(0); return 0; } // call user defined proc, if exists return g_pUserProc ? g_pUserProc(hwnd, iMsg, wParam, lParam) : DefWindowProc(hwnd, iMsg, wParam, lParam); } bool video::init_window(int sizex, int sizey) { assert(win_hInstance != 0); g_sizex = sizex; g_sizey = sizey; if (!WinInit(win_hInstance, win_iCmdShow, gWndClass, title, false)) { DisplayError("Unable to initialize the program's window."); return false; } ShowWindow(g_hAppWnd, SW_SHOW); g_pImg = new unsigned int[sizex * sizey]; HRESULT hr = S_OK; hr = D2D1CreateFactory(D2D1_FACTORY_TYPE_SINGLE_THREADED, &m_pD2DFactory); // Create a Direct2D render target. 
if (SUCCEEDED(hr) && !m_pRenderTarget) { RECT rc; GetClientRect(g_hAppWnd, &rc); bitmapSize = D2D1::SizeU(rc.right - rc.left, rc.bottom - rc.top); hr = m_pD2DFactory->CreateHwndRenderTarget( D2D1::RenderTargetProperties(), D2D1::HwndRenderTargetProperties(g_hAppWnd, bitmapSize), &m_pRenderTarget); if (SUCCEEDED(hr) && !m_pBitmap) { D2D1_PIXEL_FORMAT pixelFormat = D2D1::PixelFormat(DXGI_FORMAT_B8G8R8A8_UNORM, D2D1_ALPHA_MODE_IGNORE); D2D1_BITMAP_PROPERTIES bitmapProperties; bitmapProperties.pixelFormat = pixelFormat; m_pRenderTarget->GetDpi(&bitmapProperties.dpiX, &bitmapProperties.dpiY); m_pRenderTarget->CreateBitmap(bitmapSize, bitmapProperties, &m_pBitmap); m_pRenderTarget->DrawBitmap(m_pBitmap); } } running = true; return true; } void video::terminate() { if (m_pBitmap) m_pBitmap->Release(); if (m_pRenderTarget) m_pRenderTarget->Release(); if (m_pD2DFactory) m_pD2DFactory->Release(); g_video = nullptr; running = false; delete[] g_pImg; g_pImg = nullptr; } //////////// drawing area constructor & destructor ///////////// drawing_area::drawing_area(int x, int y, int sizex, int sizey) : base_index(y * g_sizex + x), max_index(g_sizex * g_sizey), index_stride(g_sizex), pixel_depth(24), ptr32(g_pImg), start_x(x), start_y(y), size_x(sizex), size_y(sizey) { assert(x < g_sizex); assert(y < g_sizey); assert(x + sizex <= g_sizex); assert(y + sizey <= g_sizey); index = base_index; // current index } void drawing_area::update() { if (g_video->updating) { RECT r; r.left = start_x; r.right = start_x + size_x; r.top = start_y; r.bottom = start_y + size_y; InvalidateRect(g_hAppWnd, &r, false); } } ================================================ FILE: third-party/tbb/examples/common/gui/gdivideo.cpp ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ // common Windows parts #include "winvideo.hpp" // include GDI+ headers #include // and another headers #include // tag linking library #pragma comment(lib, "gdiplus.lib") // global specific variables Gdiplus::Bitmap* g_pBitmap; // main drawing bitmap ULONG_PTR gdiplusToken; Gdiplus::GdiplusStartupInput gdiplusStartupInput; // GDI+ //! display system error bool DisplayError(LPSTR lpstrErr, HRESULT hres) { static bool InError = false; int retval = 0; if (!InError) { InError = true; LPCSTR lpMsgBuf; if (!hres) hres = GetLastError(); FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, nullptr, hres, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPTSTR)&lpMsgBuf, 0, nullptr); retval = MessageBox(g_hAppWnd, lpstrErr, lpMsgBuf, MB_OK | MB_ICONERROR); LocalFree((HLOCAL)lpMsgBuf); InError = false; } return false; } //! Win event processing function LRESULT CALLBACK InternalWndProc(HWND hwnd, UINT iMsg, WPARAM wParam, LPARAM lParam) { switch (iMsg) { case WM_MOVE: // Check to make sure our window exists before we tell it to repaint. // This will fail the first time (while the window is being created). if (hwnd) { InvalidateRect(hwnd, nullptr, FALSE); UpdateWindow(hwnd); } return 0L; case WM_PAINT: { PAINTSTRUCT ps; Gdiplus::Graphics graphics(BeginPaint(hwnd, &ps)); // redraw just requested area. This call is as fast as simple DrawImage() call. 
if (g_video->updating) graphics.DrawImage(g_pBitmap, ps.rcPaint.left, ps.rcPaint.top, ps.rcPaint.left, ps.rcPaint.top, ps.rcPaint.right, ps.rcPaint.bottom, Gdiplus::UnitPixel); EndPaint(hwnd, &ps); } return 0L; // Process all mouse and keyboard events case WM_LBUTTONDOWN: g_video->on_mouse((int)LOWORD(lParam), (int)HIWORD(lParam), 1); break; case WM_LBUTTONUP: g_video->on_mouse((int)LOWORD(lParam), (int)HIWORD(lParam), -1); break; case WM_RBUTTONDOWN: g_video->on_mouse((int)LOWORD(lParam), (int)HIWORD(lParam), 2); break; case WM_RBUTTONUP: g_video->on_mouse((int)LOWORD(lParam), (int)HIWORD(lParam), -2); break; case WM_MBUTTONDOWN: g_video->on_mouse((int)LOWORD(lParam), (int)HIWORD(lParam), 3); break; case WM_MBUTTONUP: g_video->on_mouse((int)LOWORD(lParam), (int)HIWORD(lParam), -3); break; case WM_CHAR: g_video->on_key((int)wParam); break; // some useless stuff case WM_ERASEBKGND: return 1; // keeps erase-background events from happening, reduces chop case WM_DISPLAYCHANGE: return 0; // Now, shut down the window... case WM_DESTROY: PostQuitMessage(0); return 0; } // call user defined proc, if exists return g_pUserProc ? 
g_pUserProc(hwnd, iMsg, wParam, lParam) : DefWindowProc(hwnd, iMsg, wParam, lParam); } ///////////// video functions //////////////// bool video::init_window(int sizex, int sizey) { assert(win_hInstance != 0); g_sizex = sizex; g_sizey = sizey; if (!WinInit(win_hInstance, win_iCmdShow, gWndClass, title, true)) { DisplayError("Unable to initialize the program's window."); return false; } ShowWindow(g_hAppWnd, SW_SHOW); Gdiplus::GdiplusStartup(&gdiplusToken, &gdiplusStartupInput, nullptr); g_pImg = new unsigned int[sizex * sizey]; g_pBitmap = new Gdiplus::Bitmap(g_sizex, g_sizey, 4 * g_sizex, PixelFormat32bppRGB, (BYTE*)g_pImg); running = true; return true; } void video::terminate() { delete g_pBitmap; g_pBitmap = nullptr; Gdiplus::GdiplusShutdown(gdiplusToken); g_video = nullptr; running = false; delete[] g_pImg; g_pImg = nullptr; } //////////// drawing area constructor & destructor ///////////// drawing_area::drawing_area(int x, int y, int sizex, int sizey) : base_index(y * g_sizex + x), max_index(g_sizex * g_sizey), index_stride(g_sizex), pixel_depth(24), ptr32(g_pImg), start_x(x), start_y(y), size_x(sizex), size_y(sizey) { assert(x < g_sizex); assert(y < g_sizey); assert(x + sizex <= g_sizex); assert(y + sizey <= g_sizey); index = base_index; // current index } void drawing_area::update() { if (g_video->updating) { RECT r; r.left = start_x; r.right = start_x + size_x; r.top = start_y; r.bottom = start_y + size_y; InvalidateRect(g_hAppWnd, &r, false); } } ================================================ FILE: third-party/tbb/examples/common/gui/macvideo.cpp ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "video.hpp" #include #include #include #include unsigned int *g_pImg = nullptr; int g_sizex = 0, g_sizey = 0; static video *g_video = nullptr; static int g_fps = 0; char *window_title = nullptr; #define WINDOW_TITLE_SIZE 256 int cocoa_update = 0; #include #include struct timeval g_time; video::video() #if __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ : depth(24), red_shift(0), green_shift(8), blue_shift(16), red_mask(0xff), green_mask(0xff00), blue_mask(0xff0000) #else : depth(24), red_shift(16), green_shift(8), blue_shift(0), red_mask(0xff0000), green_mask(0xff00), blue_mask(0xff) #endif { assert(g_video == nullptr); g_video = this; title = "Video"; cocoa_update = 1; updating = true; calc_fps = false; } bool video::init_window(int x, int y) { g_sizex = x; g_sizey = y; g_pImg = new unsigned int[x * y]; if (window_title == nullptr) window_title = (char *)malloc(WINDOW_TITLE_SIZE); strncpy(window_title, title, WINDOW_TITLE_SIZE - 1); running = true; return true; } bool video::init_console() { running = true; return true; } void video::terminate() { if (calc_fps) { double fps = g_fps; struct timezone tz; struct timeval end_time; gettimeofday(&end_time, &tz); fps /= (end_time.tv_sec + 1.0 * end_time.tv_usec / 1000000.0) - (g_time.tv_sec + 1.0 * g_time.tv_usec / 1000000.0); printf("%s: %.1f fps\n", title, fps); } g_video = nullptr; running = false; delete[] g_pImg; g_pImg = nullptr; } video::~video() { if (g_video) terminate(); } //! 
Count and display FPS count in titlebar bool video::next_frame() { if (calc_fps) { if (!g_fps) { struct timezone tz; gettimeofday(&g_time, &tz); } g_fps++; } struct timezone tz; struct timeval now_time; gettimeofday(&now_time, &tz); double sec = ((now_time.tv_sec + 1.0 * now_time.tv_usec / 1000000.0) - (g_time.tv_sec + 1.0 * g_time.tv_usec / 1000000.0)); if (sec > 1) { if (calc_fps) { memcpy(&g_time, &now_time, sizeof(g_time)); int fps; fps = g_fps / sec; cocoa_update = (int)updating; snprintf(window_title, WINDOW_TITLE_SIZE, "%s%s: %d fps", title, updating ? "" : " (no updating)", int(fps)); g_fps = 0; } } return running; } void *thread_func(void *) { g_video->on_process(); exit(EXIT_SUCCESS); } extern "C" void on_mouse_func(int x, int y, int k) { g_video->on_mouse(x, y, k); return; } extern "C" void on_key_func(int x) { g_video->on_key(x); return; } extern "C" int cocoa_main(int argc, char *argv[]); //! Do standard loop void video::main_loop() { pthread_t handle; pthread_attr_t attr; pthread_attr_init(&attr); pthread_create(&handle, &attr, &thread_func, (void *)nullptr); pthread_detach(handle); cocoa_main(0, nullptr); } //! Change window title void video::show_title() { if (title) strncpy(window_title, title, WINDOW_TITLE_SIZE); return; } ///////////////////////////////////////////// public methods of video class /////////////////////// drawing_area::drawing_area(int x, int y, int sizex, int sizey) : base_index(y * g_sizex + x), max_index(g_sizex * g_sizey), index_stride(g_sizex), pixel_depth(24), ptr32(g_pImg), start_x(x), start_y(y), size_x(sizex), size_y(sizey) { assert(x < g_sizex); assert(y < g_sizey); assert(x + sizex <= g_sizex); assert(y + sizey <= g_sizey); index = base_index; // current index } void drawing_area::update() { //nothing to do, updating via timer in cocoa part. 
} ================================================ FILE: third-party/tbb/examples/common/gui/video.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef TBB_examples_video_H #define TBB_examples_video_H #include #include #if _MSC_VER #include // for uintptr_t #else #include // for uintptr_t #endif #if _WIN32 || _WIN64 #include #else #include #endif typedef unsigned int color_t; typedef unsigned char colorcomp_t; typedef signed char depth_t; //! Class for getting access to drawing memory class drawing_memory { uintptr_t my_address; public: depth_t pixel_depth; int sizex, sizey; //! Get drawing memory inline char *get_address() const { return reinterpret_cast(my_address); } //! Get drawing memory size inline int get_size() const { return ((pixel_depth > 16) ? 4 : 2) * sizex * sizey; } //! Set drawing memory inline void set_address(char *mem) { my_address = reinterpret_cast(mem); } friend class drawing_area; friend class video; }; //! Simple proxy class for managing of different video systems class video { //! colorspace information depth_t depth, red_shift, green_shift, blue_shift; color_t red_mask, green_mask, blue_mask; friend class drawing_area; public: //! Constructor video(); //! Destructor ~video(); //! member to set window name const char *title; //! true is enable to show fps bool calc_fps; //! 
if true: on windows fork processing thread for on_process(), on non-windows note that next_frame() is called concurrently. bool threaded; //! true while running within main_loop() bool running; //! if true, do gui updating bool updating; //! initialize graphical video system bool init_window(int sizex, int sizey); //! initialize console. returns true if console is available bool init_console(); //! terminate video system void terminate(); //! Do standard event & processing loop. Use threaded = true to separate event/updating loop from frame processing void main_loop(); //! Process next frame bool next_frame(); //! Change window title void show_title(); //! translate RGB components into packed type inline color_t get_color(colorcomp_t red, colorcomp_t green, colorcomp_t blue) const; //! Get drawing memory descriptor inline drawing_memory get_drawing_memory() const; //! code of the ESCape key static const int esc_key = 27; //! Mouse events handler. virtual void on_mouse(int x, int y, int key) {} //! Mouse events handler. virtual void on_key(int key) {} //! Main processing loop. Redefine with your own virtual void on_process() { while (next_frame()) ; } #ifdef _WINDOWS //! Windows specific members //! if VIDEO_WINMAIN isn't defined then set this just before init() by arguments of WinMain static HINSTANCE win_hInstance; static int win_iCmdShow; //! optionally call it just before init() to set own. Use ascii strings convention void win_set_class(WNDCLASSEX &); //! load and set accelerator table from resources void win_load_accelerators(int idc); #endif }; //! Drawing class class drawing_area { const std::size_t base_index, max_index, index_stride; const depth_t pixel_depth; unsigned int *const ptr32; std::size_t index; public: const int start_x, start_y, size_x, size_y; //! constructors drawing_area(int x, int y, int sizex, int sizey); inline drawing_area(int x, int y, int sizex, int sizey, const drawing_memory &dmem); //! destructor inline ~drawing_area(); //! 
update the image void update(); //! set current position. local_x could be bigger then size_x inline void set_pos(int local_x, int local_y); //! put pixel in current position with incremental address calculating to next right pixel inline void put_pixel(color_t color); //! draw pixel at position by packed color void set_pixel(int localx, int localy, color_t color) { set_pos(localx, localy); put_pixel(color); } }; extern int g_sizex; extern int g_sizey; extern unsigned int *g_pImg; inline drawing_memory video::get_drawing_memory() const { drawing_memory dmem; dmem.pixel_depth = depth; dmem.my_address = reinterpret_cast(g_pImg); dmem.sizex = g_sizex; dmem.sizey = g_sizey; return dmem; } inline color_t video::get_color(colorcomp_t red, colorcomp_t green, colorcomp_t blue) const { if (red_shift == 16) // only for depth == 24 && red_shift > blue_shift return (red << 16) | (green << 8) | blue; else if (depth >= 24) return #if __ANDROID__ // Setting Alpha to 0xFF 0xFF000000 | #endif (red << red_shift) | (green << green_shift) | (blue << blue_shift); else if (depth > 0) { depth_t bs = blue_shift, rs = red_shift; if (blue_shift < 0) blue >>= -bs, bs = 0; else /*red_shift < 0*/ red >>= -rs, rs = 0; return ((red << rs) & red_mask) | ((green << green_shift) & green_mask) | ((blue << bs) & blue_mask); } else { // UYVY colorspace unsigned y, u, v; y = red * 77 + green * 150 + blue * 29; // sum(77+150+29=256) * max(=255): limit->2^16 u = (2048 + (blue << 3) - (y >> 5)) >> 4; // (limit->2^12)>>4 v = (2048 + (red << 3) - (y >> 5)) >> 4; y = y >> 8; return u | (y << 8) | (v << 16) | (y << 24); } } inline drawing_area::drawing_area(int x, int y, int sizex, int sizey, const drawing_memory &dmem) : base_index(y * dmem.sizex + x), max_index(dmem.sizex * dmem.sizey), index_stride(dmem.sizex), pixel_depth(dmem.pixel_depth), ptr32(reinterpret_cast(dmem.my_address)), start_x(x), start_y(y), size_x(sizex), size_y(sizey) { assert(x < dmem.sizex); assert(y < dmem.sizey); assert(x + sizex <= 
dmem.sizex); assert(y + sizey <= dmem.sizey); index = base_index; // current index } inline void drawing_area::set_pos(int local_x, int local_y) { index = base_index + local_x + local_y * index_stride; } inline void drawing_area::put_pixel(color_t color) { assert(index < max_index); if (pixel_depth > 16) ptr32[index++] = color; else if (pixel_depth > 0) ((unsigned short *)ptr32)[index++] = (unsigned short)color; else { // UYVY colorspace if (index & 1) color >>= 16; ((unsigned short *)ptr32)[index++] = (unsigned short)color; } } inline drawing_area::~drawing_area() {} #if defined(_WINDOWS) && (defined(VIDEO_WINMAIN) || defined(VIDEO_WINMAIN_ARGS)) #include //! define WinMain for subsystem:windows. #ifdef VIDEO_WINMAIN_ARGS int main(int, char *[]); #else int main(); #endif int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE, PSTR szCmdLine, int iCmdShow) { video::win_hInstance = hInstance; video::win_iCmdShow = iCmdShow; #ifdef VIDEO_WINMAIN_ARGS return main(__argc, __argv); #else return main(); #endif } #endif #endif /* TBB_examples_video_H */ ================================================ FILE: third-party/tbb/examples/common/gui/winvideo.hpp ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ /////// Common internal implementation of Windows-specific stuff ////////////// /////// Must be the first included header ////////////// #ifndef __WINVIDEO_H__ #define __WINVIDEO_H__ #ifndef _CRT_SECURE_NO_DEPRECATE #define _CRT_SECURE_NO_DEPRECATE #endif // Check that the target Windows version has all API calls required. #ifndef _WIN32_WINNT #define _WIN32_WINNT 0x0400 #endif #if _WIN32_WINNT < 0x0400 #define YIELD_TO_THREAD() Sleep(0) #else #define YIELD_TO_THREAD() SwitchToThread() #endif #include "video.hpp" #include #include #include #include #pragma comment(lib, "gdi32.lib") #pragma comment(lib, "user32.lib") // maximum number of lines the output console should have static const WORD MAX_CONSOLE_LINES = 500; const COLORREF RGBKEY = RGB(8, 8, 16); // at least 8 for 16-bit palette HWND g_hAppWnd; // The program's window handle HANDLE g_handles[2] = { 0, 0 }; // thread and wake up event unsigned int *g_pImg = 0; // drawing memory int g_sizex, g_sizey; static video *g_video = 0; WNDPROC g_pUserProc = 0; HINSTANCE video::win_hInstance = 0; int video::win_iCmdShow = 0; static WNDCLASSEX *gWndClass = 0; static HACCEL hAccelTable = 0; static DWORD g_msec = 0; static int g_fps = 0, g_updates = 0, g_skips = 0; bool DisplayError(LPSTR lpstrErr, HRESULT hres = 0); // always returns false LRESULT CALLBACK InternalWndProc(HWND hwnd, UINT iMsg, WPARAM wParam, LPARAM lParam); //! 
Create window bool WinInit(HINSTANCE hInstance, int nCmdShow, WNDCLASSEX *uwc, const char *title, bool fixedsize) { WNDCLASSEX wndclass; // Our app's windows class if (uwc) { memcpy(&wndclass, uwc, sizeof(wndclass)); g_pUserProc = uwc->lpfnWndProc; } else { memset(&wndclass, 0, sizeof(wndclass)); wndclass.hCursor = LoadCursor(nullptr, IDC_ARROW); wndclass.lpszClassName = title; } wndclass.cbSize = sizeof(wndclass); wndclass.hInstance = hInstance; wndclass.lpfnWndProc = InternalWndProc; wndclass.style |= CS_HREDRAW | CS_VREDRAW; wndclass.hbrBackground = CreateSolidBrush(RGBKEY); if (!RegisterClassExA(&wndclass)) return false; int xaddend = GetSystemMetrics(fixedsize ? SM_CXFIXEDFRAME : SM_CXFRAME) * 2; int yaddend = GetSystemMetrics(fixedsize ? SM_CYFIXEDFRAME : SM_CYFRAME) * 2 + GetSystemMetrics(SM_CYCAPTION); if (wndclass.lpszMenuName) yaddend += GetSystemMetrics(SM_CYMENU); // Setup the new window's physical parameters - and tell Windows to create it g_hAppWnd = CreateWindowA(wndclass.lpszClassName, // Window class name title, // Window caption !fixedsize ? WS_OVERLAPPEDWINDOW : // Window style WS_OVERLAPPED | WS_CAPTION | WS_SYSMENU | WS_MINIMIZEBOX, CW_USEDEFAULT, // Initial x pos: use default placement 0, // Initial y pos: not used here g_sizex + xaddend, // Initial x size g_sizey + yaddend, // Initial y size nullptr, // parent window handle nullptr, // window menu handle hInstance, // program instance handle nullptr); // Creation parameters return g_hAppWnd != nullptr; } //! 
create console window with redirection static bool RedirectIOToConsole(void) { int hConHandle; size_t lStdHandle; CONSOLE_SCREEN_BUFFER_INFO coninfo; FILE *fp; // allocate a console for this app AllocConsole(); // set the screen buffer to be big enough to let us scroll text GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &coninfo); coninfo.dwSize.Y = MAX_CONSOLE_LINES; SetConsoleScreenBufferSize(GetStdHandle(STD_OUTPUT_HANDLE), coninfo.dwSize); // redirect unbuffered STDOUT to the console lStdHandle = (size_t)GetStdHandle(STD_OUTPUT_HANDLE); hConHandle = _open_osfhandle(lStdHandle, _O_TEXT); if (hConHandle <= 0) return false; fp = _fdopen(hConHandle, "w"); *stdout = *fp; setvbuf(stdout, nullptr, _IONBF, 0); // redirect unbuffered STDERR to the console lStdHandle = (size_t)GetStdHandle(STD_ERROR_HANDLE); hConHandle = _open_osfhandle(lStdHandle, _O_TEXT); if (hConHandle > 0) { fp = _fdopen(hConHandle, "w"); *stderr = *fp; setvbuf(stderr, nullptr, _IONBF, 0); } // redirect unbuffered STDIN to the console lStdHandle = (size_t)GetStdHandle(STD_INPUT_HANDLE); hConHandle = _open_osfhandle(lStdHandle, _O_TEXT); if (hConHandle > 0) { fp = _fdopen(hConHandle, "r"); *stdin = *fp; setvbuf(stdin, nullptr, _IONBF, 0); } // make cout, wcout, cin, wcin, wcerr, cerr, wclog and clog // point to console as well std::ios::sync_with_stdio(); return true; } video::video() : depth(24), red_shift(16), green_shift(8), blue_shift(0), red_mask(0xff0000), green_mask(0xff00), blue_mask(0xff) { assert(g_video == 0); g_video = this; title = "Video"; running = threaded = calc_fps = false; updating = true; } //! 
optionally call it just before init() to set own void video::win_set_class(WNDCLASSEX &wcex) { gWndClass = &wcex; } void video::win_load_accelerators(int idc) { hAccelTable = LoadAccelerators(win_hInstance, MAKEINTRESOURCE(idc)); } bool video::init_console() { if (RedirectIOToConsole()) { if (!g_pImg && g_sizex && g_sizey) g_pImg = new unsigned int[g_sizex * g_sizey]; if (g_pImg) running = true; return true; } return false; } video::~video() { if (g_video) terminate(); } DWORD WINAPI thread_video(LPVOID lpParameter) { video *v = (video *)lpParameter; v->on_process(); return 0; } static bool loop_once(video *v) { // screen update notify if (int updates = g_updates) { g_updates = 0; if (g_video->updating) { g_skips += updates - 1; g_fps++; } else g_skips += updates; UpdateWindow(g_hAppWnd); } // update fps DWORD msec = GetTickCount(); if (v->calc_fps && msec >= g_msec + 1000) { double sec = (msec - g_msec) / 1000.0; char buffer[256], n = _snprintf(buffer, 128, "%s: %d fps", v->title, int(double(g_fps + g_skips) / sec)); if (g_skips) _snprintf(buffer + n, 128, " - %d skipped = %d updates", int(g_skips / sec), int(g_fps / sec)); SetWindowTextA(g_hAppWnd, buffer); g_msec = msec; g_skips = g_fps = 0; } // event processing, including painting MSG msg; if (PeekMessage(&msg, nullptr, 0, 0, PM_REMOVE)) { if (msg.message == WM_QUIT) { v->running = false; return false; } if (!hAccelTable || !TranslateAccelerator(msg.hwnd, hAccelTable, &msg)) { TranslateMessage(&msg); DispatchMessage(&msg); } return true; // try again } return false; } //! 
Do standard event loop void video::main_loop() { // let Windows draw and unroll the window InvalidateRect(g_hAppWnd, 0, false); g_msec = GetTickCount(); // let's stay for 0,5 sec while (g_msec + 500 > GetTickCount()) { loop_once(this); Sleep(1); } g_msec = GetTickCount(); // now, start main process if (threaded) { g_handles[0] = CreateThread(nullptr, // LPSECURITY_ATTRIBUTES security_attrs 0, // SIZE_T stacksize (LPTHREAD_START_ROUTINE)thread_video, this, // argument 0, 0); if (!g_handles[0]) { DisplayError("Can't create thread"); return; } else // harmless race is possible here g_handles[1] = CreateEvent(nullptr, false, false, nullptr); while (running) { while (loop_once(this)) ; YIELD_TO_THREAD(); // give time for processing when running on single CPU DWORD r = MsgWaitForMultipleObjects( 2, g_handles, false, INFINITE, QS_ALLINPUT ^ QS_MOUSEMOVE); if (r == WAIT_OBJECT_0) break; // thread terminated } running = false; if (WaitForSingleObject(g_handles[0], 3000) == WAIT_TIMEOUT) { // there was not enough time for graceful shutdown, killing the example with code 1. exit(1); } if (g_handles[0]) CloseHandle(g_handles[0]); if (g_handles[1]) CloseHandle(g_handles[1]); g_handles[0] = g_handles[1] = 0; } else on_process(); } //! Refresh screen picture bool video::next_frame() { if (!running) return false; g_updates++; // Fast but inaccurate counter. The data race here is benign. if (!threaded) while (loop_once(this)) ; else if (g_handles[1]) { SetEvent(g_handles[1]); YIELD_TO_THREAD(); } return true; } //! Change window title void video::show_title() { if (g_hAppWnd) SetWindowTextA(g_hAppWnd, title); } #endif //__WINVIDEO_H__ ================================================ FILE: third-party/tbb/examples/common/gui/xcode/tbbExample/OpenGLView.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #import #if TARGET_OS_IPHONE #import #import "OpenGLES/ES2/gl.h" @interface OpenGLView : UIView { NSTimer *timer; CGRect imageRect; } @property (nonatomic, retain) NSTimer *timer; @property (nonatomic) CGRect imageRect; - (void) drawRect:(CGRect)rect; - (void) touchesBegan:(NSSet *)touches withEvent:(UIEvent *)event; @end #elif TARGET_OS_MAC #import #import @interface OpenGLView : NSOpenGLView{ NSTimer *timer; } @property (nonatomic,retain) NSTimer *timer; - (void) drawRect:(NSRect)start; - (void) mouseDown:(NSEvent *)theEvent; - (void) keyDown:(NSEvent *)theEvent; - (BOOL) acceptsFirstResponder; - (void) viewDidEndLiveResize; @end #endif ================================================ FILE: third-party/tbb/examples/common/gui/xcode/tbbExample/OpenGLView.m ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #import #import "OpenGLView.h" // defined in macvideo.cpp extern char* window_title; extern int cocoa_update; extern int g_sizex, g_sizey; extern unsigned int *g_pImg; void on_mouse_func(int x, int y, int k); void on_key_func(int x); bool initialized = false; #if TARGET_OS_IPHONE #import "OpenGLES/ES2/gl.h" @implementation OpenGLView @synthesize timer; @synthesize imageRect; - (void)drawRect:(CGRect)start { if (initialized == false) { NSLog(@"INITIALIZE"); timer = [NSTimer scheduledTimerWithTimeInterval:0.03 target:self selector:@selector(update_window) userInfo:nil repeats:YES]; imageRect = [[UIScreen mainScreen] bounds]; CGFloat full_height = imageRect.size.height; const float ratio=(float)g_sizex/g_sizey; imageRect.size.height=imageRect.size.width/ratio; imageRect.origin.y=(full_height-imageRect.size.height)/2; initialized = true; } CGColorSpaceRef colourSpace = CGColorSpaceCreateDeviceRGB(); CGDataProviderRef dataProvider = CGDataProviderCreateWithData(NULL, g_pImg, 4*g_sizex*g_sizey, NULL); CGImageRef inputImage = CGImageCreate(g_sizex, g_sizey, 8, 32, g_sizex * 4, colourSpace,(CGBitmapInfo)kCGImageAlphaNoneSkipLast, dataProvider, NULL, NO, kCGRenderingIntentDefault); UIImage *image = [UIImage imageWithCGImage:inputImage]; CGDataProviderRelease(dataProvider); CGColorSpaceRelease(colourSpace); CGImageRelease(inputImage); [image drawInRect:imageRect]; } - (void) touchesBegan:(NSSet *)touches withEvent:(UIEvent *)event { CGPoint point = [[touches anyObject] locationInView:self]; const int x = point.x; const int y = point.y; if ( (y-imageRect.origin.y) > 0 && y < (imageRect.origin.y + imageRect.size.height )) on_mouse_func( x*g_sizex/(imageRect.size.width), (y-imageRect.origin.y)*g_sizey/imageRect.size.height,1); [self setNeedsDisplay]; } -(void) update_window{ if( cocoa_update ) [self setNeedsDisplay]; } @end #elif TARGET_OS_MAC #import @implementation OpenGLView @synthesize timer; - (void) drawRect:(NSRect)start { if (initialized == false) { 
NSLog(@"INITIALIZE");
        timer = [NSTimer scheduledTimerWithTimeInterval:0.03
                                                 target:self
                                               selector:@selector(update_window)
                                               userInfo:nil
                                                repeats:YES];
        initialized = true;
    }
    // Draw the frame buffer scaled to the visible area; the negative y zoom flips it vertically.
    glWindowPos2i(0, (int)self.visibleRect.size.height);
    glPixelZoom( (float)self.visibleRect.size.width /(float)g_sizex,
                -(float)self.visibleRect.size.height/(float)g_sizey);
    glDrawPixels(g_sizex, g_sizey, GL_BGRA_EXT, GL_UNSIGNED_INT_8_8_8_8_REV, g_pImg);
    glFlush();
}

// Timer callback: requests a redraw when updating is enabled and refreshes the window title.
-(void) update_window{
    if( cocoa_update ) [self setNeedsDisplay:YES];
    if( window_title ) [self.window setTitle:[NSString stringWithFormat:@"%s", window_title]];
}

// Forwards the first character of a key press to the application.
-(void) keyDown:(NSEvent *)theEvent{
    // The characters string can be empty (e.g. dead keys); characterAtIndex:0 would then
    // raise NSRangeException.
    NSString *typed = theEvent.characters;
    if (typed.length > 0)
        on_key_func([typed characterAtIndex:0]);
}

-(void) mouseDown:(NSEvent *)theEvent{
    // mouse event for seismic and fractal
    NSPoint point= theEvent.locationInWindow;
    const int x = (int)point.x;
    const int y = (int)point.y;
    NSRect rect = self.visibleRect;
    const int width  = (int)rect.size.width;
    const int height = (int)rect.size.height;
    // Scale view coordinates to image coordinates; skip a degenerate view to avoid dividing by 0.
    if (width > 0 && height > 0)
        on_mouse_func(x*g_sizex/width,(height-y)*g_sizey/height,1);
    [self setNeedsDisplay:YES];
}

- (BOOL) acceptsFirstResponder
{
    return YES;
}

// Right clicks are deliberately ignored.
- (void) rightMouseDown:(NSEvent *)theEvent
{
    return;
}

// Shows the new view size in the window title after a live resize.
-(void) viewDidEndLiveResize
{
    NSRect rect = self.visibleRect;
    const int x=(int)rect.size.width;
    const int y=(int)rect.size.height;
    [self.window setTitle:[NSString stringWithFormat:@"X=%d Y=%d", x,y]];
}

@end
#endif

================================================
FILE: third-party/tbb/examples/common/gui/xcode/tbbExample/PkgInfo
================================================
APPL????
================================================ FILE: third-party/tbb/examples/common/gui/xcode/tbbExample/en.lproj/InfoPlist.strings ================================================ /* Localized versions of Info.plist keys */ ================================================ FILE: third-party/tbb/examples/common/gui/xcode/tbbExample/en.lproj/MainMenu.xib ================================================ ================================================ FILE: third-party/tbb/examples/common/gui/xcode/tbbExample/iOS.storyboard ================================================ ================================================ FILE: third-party/tbb/examples/common/gui/xcode/tbbExample/main.m ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #import #import #if TARGET_OS_IPHONE #import #import "tbbAppDelegate.h" void get_screen_resolution(int *x, int *y) { // Getting landscape screen resolution in any case CGRect imageRect = [[UIScreen mainScreen] bounds]; *x=imageRect.size.width>imageRect.size.height?imageRect.size.width:imageRect.size.height; *y=imageRect.size.width int cocoa_main(int argc, char *argv[]) { return NSApplicationMain(argc, (const char **)argv); } #endif ================================================ FILE: third-party/tbb/examples/common/gui/xcode/tbbExample/tbbAppDelegate.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ // // Created by Xcode* 4.3.2 // #import #if TARGET_OS_IPHONE #import @interface tbbAppDelegate : UIResponder @property (strong, nonatomic) UIWindow *window; @end #elif TARGET_OS_MAC #import @interface tbbAppDelegate : NSObject { __unsafe_unretained NSWindow *_window; } @property (assign) IBOutlet NSWindow *window; - (BOOL) applicationShouldTerminateAfterLastWindowClosed:(NSApplication *) sender; @end #endif ================================================ FILE: third-party/tbb/examples/common/gui/xcode/tbbExample/tbbAppDelegate.m ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #import "tbbAppDelegate.h" #if TARGET_OS_IPHONE @implementation tbbAppDelegate - (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions { return YES; } - (void)applicationDidEnterBackground:(UIApplication *)application { exit(EXIT_SUCCESS); } @end #elif TARGET_OS_MAC @implementation tbbAppDelegate @synthesize window = _window; //declared in macvideo.cpp file extern int g_sizex, g_sizey; - (void)applicationDidFinishLaunching:(NSNotification *)aNotification { // Insert code here to initialize your application NSRect windowSize; windowSize.size.height = g_sizey; windowSize.size.width = g_sizex; windowSize.origin=_window.frame.origin; [_window setFrame:windowSize display:YES]; } - (BOOL) applicationShouldTerminateAfterLastWindowClosed:(NSApplication *) sender { return YES; } @end #endif ================================================ FILE: third-party/tbb/examples/common/gui/xcode/tbbExample/tbbExample-Info.ios.plist ================================================ CFBundleDevelopmentRegion en CFBundleExecutable $(EXECUTABLE_NAME) CFBundleIdentifier $(PRODUCT_BUNDLE_IDENTIFIER) CFBundleInfoDictionaryVersion 6.0 CFBundleName $(PRODUCT_NAME) CFBundlePackageType APPL CFBundleShortVersionString 1.0 CFBundleSignature ???? 
CFBundleVersion 1 LSRequiresIPhoneOS UILaunchStoryboardName iOS UIMainStoryboardFile iOS UIRequiredDeviceCapabilities armv7 UISupportedInterfaceOrientations UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight UISupportedInterfaceOrientations~ipad UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight ================================================ FILE: third-party/tbb/examples/common/gui/xcode/tbbExample/tbbExample-Info.plist ================================================ CFBundleDevelopmentRegion en CFBundleDisplayName CFBundleExecutable ${EXECUTABLE_NAME} CFBundleIconFile CFBundleIdentifier $(PRODUCT_BUNDLE_IDENTIFIER) CFBundleInfoDictionaryVersion 6.0 CFBundleName ${PRODUCT_NAME} CFBundlePackageType APPL CFBundleShortVersionString 1.0 CFBundleSignature ???? CFBundleVersion 1 LSApplicationCategoryType public.app-category.business LSEnvironment DYLD_LIBRARY_PATH Resources:.:../Resources:/tmp:$DYLD_LIBRARY_PATH LIBRARY_PATH Resources:.:../:/tmp:$DYLD_LIBRARY_PATH LSMinimumSystemVersion ${MACOSX_DEPLOYMENT_TARGET} NSHumanReadableCopyright Copyright 2005-2021 Intel Corporation. All Rights Reserved. NSMainNibFile MainMenu NSPrincipalClass NSApplication ================================================ FILE: third-party/tbb/examples/common/gui/xvideo.cpp ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ // Uncomment next line to disable shared memory features if you do not have libXext // (http://www.xfree86.org/current/mit-shm.html) //#define X_NOSHMEM // Note that it may happen that the build environment supports the shared-memory extension // (so there's no build-time reason to disable the relevant code by defining X_NOSHMEM), // but that using shared memory still fails at run time. // This situation will (ultimately) cause the error handler set by XSetErrorHandler() // to be invoked with XErrorEvent::minor_code==X_ShmAttach. The code below tries to make // such a determination at XShmAttach() time, which seems plausible, but unfortunately // it has also been observed in a specific environment that the error may be reported // at a later time instead, even after video::init_window() has returned. // It is not clear whether this may happen in that way in any environment where it might // depend on the kind of display, e.g., local vs. over "ssh -X", so #define'ing X_NOSHMEM // may not always be the appropriate solution, therefore an environment variable // has been introduced to disable shared memory at run time. // A diagnostic has been added to advise the user about possible workarounds. // X_ShmAttach macro was changed to 1 due to recent changes to X11/extensions/XShm.h header. 
#include "video.hpp"

// NOTE(review): the header names on the #include lines below were missing in
// this copy of the file. The list was rebuilt from the names this translation
// unit actually uses (assert, uint32_t, fprintf/snprintf, strerror/memcpy,
// pthread_*, gettimeofday, Xlib/Xutil, errno, shm*, XShm*) -- confirm against
// the upstream oneTBB sources.
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include <sys/time.h>
#include <X11/Xlib.h>
#include <X11/Xutil.h>

#ifndef X_NOSHMEM
#include <errno.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <X11/extensions/XShm.h>

// State used only by the shared-memory code paths.
static XShmSegmentInfo shmseginfo;
static Pixmap pixmap = 0;
// Set once XShmAttach() has been issued; read by xerr_handler().
static bool already_called_X_ShmAttach = false;
// Makes xerr_handler() print its shared-memory advice at most once.
static bool already_advised_about_NOSHMEM_workarounds = false;
#endif

// File-scope X11 state shared by the functions in this file.
static char *display_name = nullptr;
static Display *dpy = nullptr;
static Screen *scrn;
static Visual *vis;
static Colormap cmap;
static GC gc;
static Window win, rootW;
static int dispdepth = 0;
static XGCValues xgcv;
static XImage *ximage;
static int x_error = 0; // last error code recorded by xerr_handler()
static int vidtype = 3; // 3 is the "fake video" (console) mode used by init_console()
int g_sizex, g_sizey;
static video *g_video = 0;
unsigned int *g_pImg = 0;
static int g_fps = 0;
struct timeval g_time;
static pthread_mutex_t g_mutex = PTHREAD_MUTEX_INITIALIZER;
Atom _XA_WM_DELETE_WINDOW = 0; // like in Xatom.h

///////////////////////////////////////////// public methods of video class ///////////////////////

//! Registers this object as the single global video instance and sets defaults.
video::video() {
    assert(g_video == 0); // only one video object may exist at a time
    g_video = this;
    title = "Video";
    calc_fps = running = false;
    updating = true;
}

//! Stores a channel mask in `save` and derives its shift into `shift`.
/** For a non-zero mask the shift is the index of the lowest set bit plus the
    length of the run of set bits starting there, minus 8. For a zero mask the
    shift falls back to dispdepth / 3. */
inline void mask2bits(unsigned int mask, unsigned int &save, depth_t &shift) {
    save = mask;
    if (!mask) {
        shift = dispdepth / 3;
        return;
    }

    // Count the zero bits below the lowest set bit.
    shift = 0;
    while (!(mask & 1)) {
        ++shift;
        mask >>= 1;
    }

    // Count the length of the contiguous run of set bits.
    int bits = 0;
    while (mask & 1) {
        ++bits;
        mask >>= 1;
    }
    shift += bits - 8;
}

//! X11 error handler installed through XSetErrorHandler().
/** Records the error code in x_error and stops the global video object. When
    the error has minor_code 1 (X_ShmAttach) and XShmAttach() has already been
    issued, it prints a one-time hint about shared-memory workarounds.
    Always returns 0. */
int xerr_handler(Display *dpy_, XErrorEvent *error) {
    x_error = error->error_code;
    if (g_video)
        g_video->running = false;
#ifndef X_NOSHMEM
    if (error->minor_code == 1 /*X_ShmAttach*/ && already_called_X_ShmAttach &&
        !already_advised_about_NOSHMEM_workarounds) {
        char err[256];
        XGetErrorText(dpy_, x_error, err, 255);
        fprintf(stderr, "Warning: Can't attach shared memory to display: %s (%d)\n", err, x_error);
        // The message previously contained a Cyrillic look-alike letter in
        // "check" and no line terminator; both are corrected here.
        fprintf(
            stderr,
            "If you are seeing a black output window, check if you have installed Xext library and rebuild project\n");
        already_advised_about_NOSHMEM_workarounds = true;
    }
#else
    (void)dpy_; // warning prevention
#endif
    return 0;
}

bool
video::init_window(int xsize, int ysize) { { //enclose local variables before fail label g_sizex = xsize; g_sizey = ysize; // Open the display if (!dpy) { dpy = XOpenDisplay(display_name); if (!dpy) { fprintf(stderr, "Can't open X11 display %s\n", XDisplayName(display_name)); goto fail; } } int theScreen = DefaultScreen(dpy); scrn = ScreenOfDisplay(dpy, theScreen); dispdepth = DefaultDepth(dpy, theScreen); XVisualInfo vinfo; if (!((dispdepth >= 15 && dispdepth <= 32 && XMatchVisualInfo(dpy, theScreen, dispdepth, TrueColor, &vinfo)) || XMatchVisualInfo(dpy, theScreen, 24, TrueColor, &vinfo) || XMatchVisualInfo(dpy, theScreen, 32, TrueColor, &vinfo) || XMatchVisualInfo(dpy, theScreen, 16, TrueColor, &vinfo) || XMatchVisualInfo(dpy, theScreen, 15, TrueColor, &vinfo))) { fprintf(stderr, "Display has no appropriate True Color visual\n"); goto fail; } vis = vinfo.visual; depth = dispdepth = vinfo.depth; mask2bits(vinfo.red_mask, red_mask, red_shift); mask2bits(vinfo.green_mask, green_mask, green_shift); mask2bits(vinfo.blue_mask, blue_mask, blue_shift); rootW = RootWindow(dpy, theScreen); cmap = XCreateColormap(dpy, rootW, vis, AllocNone); XSetWindowAttributes attrs; attrs.backing_store = Always; attrs.colormap = cmap; attrs.event_mask = StructureNotifyMask | KeyPressMask | ButtonPressMask | ButtonReleaseMask; attrs.background_pixel = BlackPixelOfScreen(scrn); attrs.border_pixel = WhitePixelOfScreen(scrn); win = XCreateWindow(dpy, rootW, 0, 0, xsize, ysize, 2, dispdepth, InputOutput, vis, CWBackingStore | CWColormap | CWEventMask | CWBackPixel | CWBorderPixel, &attrs); if (!win) { fprintf(stderr, "Can't create the window\n"); goto fail; } XSizeHints sh; sh.flags = PSize | PMinSize | PMaxSize; sh.width = sh.min_width = sh.max_width = xsize; sh.height = sh.min_height = sh.max_height = ysize; XSetStandardProperties(dpy, win, g_video->title, g_video->title, None, nullptr, 0, &sh); _XA_WM_DELETE_WINDOW = XInternAtom(dpy, "WM_DELETE_WINDOW", false); XSetWMProtocols(dpy, win, 
&_XA_WM_DELETE_WINDOW, 1); gc = XCreateGC(dpy, win, 0L, &xgcv); XMapRaised(dpy, win); XFlush(dpy); #ifdef X_FULLSYNC XSynchronize(dpy, true); #endif XSetErrorHandler(xerr_handler); int imgbytes = xsize * ysize * (dispdepth <= 16 ? 2 : 4); const char *vidstr; #ifndef X_NOSHMEM int major, minor, pixmaps; if (XShmQueryExtension(dpy) && XShmQueryVersion(dpy, &major, &minor, &pixmaps)) { // Shared memory shmseginfo.shmid = shmget(IPC_PRIVATE, imgbytes, IPC_CREAT | 0777); if (shmseginfo.shmid < 0) { fprintf(stderr, "Warning: Can't get shared memory: %s\n", strerror(errno)); goto generic; } g_pImg = (unsigned int *)(shmseginfo.shmaddr = (char *)shmat(shmseginfo.shmid, 0, 0)); if (g_pImg == (unsigned int *)-1) { fprintf(stderr, "Warning: Can't attach to shared memory: %s\n", strerror(errno)); shmctl(shmseginfo.shmid, IPC_RMID, nullptr); goto generic; } shmseginfo.readOnly = false; if (!XShmAttach(dpy, &shmseginfo) || x_error) { char err[256]; XGetErrorText(dpy, x_error, err, 255); fprintf(stderr, "Warning: Can't attach shared memory to display: %s (%d)\n", err, x_error); shmdt(shmseginfo.shmaddr); shmctl(shmseginfo.shmid, IPC_RMID, nullptr); goto generic; } already_called_X_ShmAttach = true; #ifndef X_NOSHMPIX if (pixmaps && XShmPixmapFormat(dpy) == ZPixmap) { // Pixmaps vidtype = 2; vidstr = "X11 shared memory pixmap"; pixmap = XShmCreatePixmap( dpy, win, (char *)g_pImg, &shmseginfo, xsize, ysize, dispdepth); XSetWindowBackgroundPixmap(dpy, win, pixmap); } else #endif //!X_NOSHMPIX { // Standard vidtype = 1; vidstr = "X11 shared memory"; ximage = XShmCreateImage(dpy, vis, dispdepth, ZPixmap, 0, &shmseginfo, xsize, ysize); if (!ximage) { fprintf(stderr, "Can't create the shared image\n"); goto fail; } assert(ximage->bytes_per_line == xsize * (dispdepth <= 16 ? 
2 : 4)); ximage->data = shmseginfo.shmaddr; } } else #endif { #ifndef X_NOSHMEM generic: #endif vidtype = 0; vidstr = "generic X11"; g_pImg = new unsigned int[imgbytes / sizeof(int)]; ximage = XCreateImage(dpy, vis, dispdepth, ZPixmap, 0, (char *)g_pImg, xsize, ysize, 32, imgbytes / ysize); if (!ximage) { fprintf(stderr, "Can't create the image\n"); goto fail; } } if (ximage) { // Note: It may be more efficient to adopt the server's byte order // and swap once per get_color() call instead of once per pixel. const uint32_t probe = 0x03020100; const bool big_endian = (((const char *)(&probe))[0] == 0x03); ximage->byte_order = big_endian ? MSBFirst : LSBFirst; } printf("Note: using %s with %s visual for %d-bit color depth\n", vidstr, vis == DefaultVisual(dpy, theScreen) ? "default" : "non-default", dispdepth); running = true; return true; } // end of enclosing local variables fail: terminate(); init_console(); return false; } bool video::init_console() { if (!g_pImg && g_sizex && g_sizey) { dispdepth = 24; red_shift = 16; vidtype = 3; // fake video g_pImg = new unsigned int[g_sizex * g_sizey]; running = true; } return true; } void video::terminate() { running = false; if (dpy) { vidtype = 3; // stop video if (threaded) { pthread_mutex_lock(&g_mutex); pthread_mutex_unlock(&g_mutex); } if (ximage) { XDestroyImage(ximage); ximage = 0; g_pImg = 0; } // it frees g_pImg for vidtype == 0 #ifndef X_NOSHMEM if (pixmap) XFreePixmap(dpy, pixmap); if (shmseginfo.shmaddr) { XShmDetach(dpy, &shmseginfo); shmdt(shmseginfo.shmaddr); g_pImg = 0; } if (shmseginfo.shmid >= 0) shmctl(shmseginfo.shmid, IPC_RMID, nullptr); #endif if (gc) XFreeGC(dpy, gc); if (win) XDestroyWindow(dpy, win); XCloseDisplay(dpy); dpy = 0; } if (g_pImg) { delete[] g_pImg; g_pImg = 0; } // if was allocated for console mode } video::~video() { if (g_video) terminate(); g_video = 0; } //! Do standard event loop void video::main_loop() { struct timezone tz; gettimeofday(&g_time, &tz); on_process(); } //! 
Check for pending events once bool video::next_frame() { if (!running) return false; //! try acquire mutex if threaded code, returns on failure if (vidtype == 3 || threaded && pthread_mutex_trylock(&g_mutex)) return running; //! Refresh screen picture g_fps++; #ifndef X_NOSHMPIX if (vidtype == 2 && updating) XClearWindow(dpy, win); #endif while (XPending(dpy)) { XEvent report; XNextEvent(dpy, &report); switch (report.type) { case ClientMessage: if (report.xclient.format != 32 || report.xclient.data.l[0] != _XA_WM_DELETE_WINDOW) break; case DestroyNotify: running = false; case KeyPress: on_key(XLookupKeysym(&report.xkey, 0)); break; case ButtonPress: on_mouse(report.xbutton.x, report.xbutton.y, report.xbutton.button); break; case ButtonRelease: on_mouse(report.xbutton.x, report.xbutton.y, -report.xbutton.button); break; } } struct timezone tz; struct timeval now_time; gettimeofday(&now_time, &tz); double sec = (now_time.tv_sec + 1.0 * now_time.tv_usec / 1000000.0) - (g_time.tv_sec + 1.0 * g_time.tv_usec / 1000000.0); if (sec > 1) { memcpy(&g_time, &now_time, sizeof(g_time)); if (calc_fps) { double fps = g_fps; g_fps = 0; char buffer[256]; snprintf(buffer, 256, "%s%s: %d fps", title, updating ? "" : " (no updating)", int(fps / sec)); XStoreName(dpy, win, buffer); } #ifndef X_FULLSYNC XSync(dpy, false); // It is often better then using XSynchronize(dpy, true) #endif //X_FULLSYNC } if (threaded) pthread_mutex_unlock(&g_mutex); return true; } //! 
Change window title void video::show_title() { if (vidtype < 3) XStoreName(dpy, win, title); } drawing_area::drawing_area(int x, int y, int sizex, int sizey) : base_index(y * g_sizex + x), max_index(g_sizex * g_sizey), index_stride(g_sizex), pixel_depth(dispdepth), ptr32(g_pImg), start_x(x), start_y(y), size_x(sizex), size_y(sizey) { assert(x < g_sizex); assert(y < g_sizey); assert(x + sizex <= g_sizex); assert(y + sizey <= g_sizey); index = base_index; // current index } void drawing_area::update() { if (!g_video->updating) return; #ifndef X_NOSHMEM switch (vidtype) { case 0: #endif pthread_mutex_lock(&g_mutex); if (vidtype == 0) XPutImage(dpy, win, gc, ximage, start_x, start_y, start_x, start_y, size_x, size_y); pthread_mutex_unlock(&g_mutex); #ifndef X_NOSHMEM break; case 1: pthread_mutex_lock(&g_mutex); if (vidtype == 1) XShmPutImage(dpy, win, gc, ximage, start_x, start_y, start_x, start_y, size_x, size_y, false); pthread_mutex_unlock(&g_mutex); break; /*case 2: make it in next_frame(); break;*/ } #endif } ================================================ FILE: third-party/tbb/examples/common/utility/fast_random.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/

#ifndef TBB_examples_fast_random_H
#define TBB_examples_fast_random_H

#include <cstddef>

namespace utility {

//------------------------------------------------------------------------
// FastRandom
//------------------------------------------------------------------------

namespace internal {
//! Returns the entry of the prime table selected by `seed` modulo the table size.
inline std::size_t GetPrime(std::size_t seed);
} // namespace internal

//! A fast random number generator.
/** Uses linear congruential method. */
class FastRandom {
    std::size_t x, a; // x: current state; a: multiplier picked from the prime table

public:
    //! Get a random number.
    unsigned short get() {
        return get(x);
    }

    //! Get a random number for the given seed; update the seed for next use.
    /** The result is bits 16 and up of the incoming seed, truncated to
        unsigned short; the seed is then advanced to seed * a + 1. */
    unsigned short get(std::size_t& seed) {
        const unsigned short r = static_cast<unsigned short>(seed >> 16);
        seed = seed * a + 1;
        return r;
    }

    //! Construct a random number generator.
    FastRandom(std::size_t seed) {
        x = seed * internal::GetPrime(seed);
        a = internal::GetPrime(x);
    }
};

} // namespace utility

namespace utility {
namespace internal {

//! Table of primes used by fast random-number generator (FastRandom).
static const unsigned Primes[] = {
    0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 0xe1626741, 0x79695e6b,
    0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b,
    0xbe4d6fe9, 0x5f15e201, 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
    0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 0x3d9910ed, 0x2e687b5b,
    0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7,
    0x54581edb, 0xf2480f45, 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
    0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 0xb892d829, 0x3549366b,
    0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f
};

// Declared inline because this definition lives in a header: without it every
// translation unit that includes the header emits its own external definition
// and the program fails to link with a duplicate-symbol error.
inline std::size_t GetPrime(std::size_t seed) {
    return Primes[seed % (sizeof(Primes) / sizeof(Primes[0]))];
}

} // namespace internal
} // namespace utility

#endif /* TBB_examples_fast_random_H */
================================================ FILE: third-party/tbb/examples/common/utility/get_default_num_threads.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef TBB_examples_num_threads_H #define TBB_examples_num_threads_H #include "oneapi/tbb/task_arena.h" namespace utility { inline int get_default_num_threads() { return oneapi::tbb::this_task_arena::max_concurrency(); } } // namespace utility #endif /* TBB_examples_num_threads_H */ ================================================ FILE: third-party/tbb/examples/common/utility/utility.hpp ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef TBB_examples_utility_H #define TBB_examples_utility_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include // TBB headers should not be used, as some examples may need to be built without TBB. namespace utility { namespace internal { //TODO: add tcs template dest_type& string_to(std::string const& s, dest_type& result) { std::stringstream stream(s); stream >> result; if ((!stream) || (stream.fail())) { throw std::invalid_argument("error converting string '" + std::string(s) + "'"); } return result; } template dest_type string_to(std::string const& s) { dest_type result; return string_to(s, result); } template struct is_bool { static bool value() { return false; } }; template <> struct is_bool { static bool value() { return true; } }; class type_base { type_base& operator=(const type_base&); public: const std::string name; const std::string description; type_base(std::string a_name, std::string a_description) : name(a_name), description(a_description) {} virtual void parse_and_store(const std::string& s) = 0; virtual std::string value() const = 0; virtual std::unique_ptr clone() const = 0; virtual ~type_base() {} }; template class type_impl : public type_base { private: type_impl(const type_impl& src) : type_base(src.name, src.description), target(src.target), validating_function(src.validating_function) {} type_impl& operator=(const type_impl&); typedef bool (*validating_function_type)(const type&); type& target; validating_function_type validating_function; public: type_impl(std::string a_name, std::string a_description, type& a_target, validating_function_type a_validating_function = nullptr) : type_base(a_name, a_description), target(a_target), validating_function(a_validating_function){}; void parse_and_store(const std::string& s) /*override*/ { try { const bool is_bool = internal::is_bool::value(); if (is_bool && s.empty()) { //to avoid directly assigning true 
//(as it will impose additional layer of indirection) //so, simply pass it as string internal::string_to("1", target); } else { internal::string_to(s, target); } } catch (std::invalid_argument& e) { std::stringstream str; str << "'" << s << "' is incorrect input for argument '" << name << "'" << " (" << e.what() << ")"; throw std::invalid_argument(str.str()); } if (validating_function) { if (!((validating_function)(target))) { std::stringstream str; str << "'" << target << "' is invalid value for argument '" << name << "'"; throw std::invalid_argument(str.str()); } } } template static bool is_null_c_str(t&) { return false; } static bool is_null_c_str(char* s) { return s == nullptr; } std::string value() const /*override*/ { std::stringstream str; if (!is_null_c_str(target)) str << target; return str.str(); } std::unique_ptr clone() const /*override*/ { return std::unique_ptr(new type_impl(*this)); } }; class argument { private: std::unique_ptr p_type; bool matched_; public: argument(argument const& other) : p_type(other.p_type.get() ? 
(other.p_type->clone()).release() : nullptr), matched_(other.matched_) {} argument& operator=(argument a) { this->swap(a); return *this; } void swap(argument& other) { std::swap(p_type, other.p_type); std::swap(matched_, other.matched_); } template argument(std::string a_name, std::string a_description, type& dest, bool (*a_validating_function)(const type&) = nullptr) : p_type(new type_impl(a_name, a_description, dest, a_validating_function)), matched_(false) {} std::string value() const { return p_type->value(); } std::string name() const { return p_type->name; } std::string description() const { return p_type->description; } void parse_and_store(const std::string& s) { p_type->parse_and_store(s); matched_ = true; } bool is_matched() const { return matched_; } }; } // namespace internal class cli_argument_pack { typedef std::map args_map_type; typedef std::vector args_display_order_type; typedef std::vector positional_arg_names_type; private: args_map_type args_map; args_display_order_type args_display_order; positional_arg_names_type positional_arg_names; std::set bool_args_names; private: void add_arg(internal::argument const& a) { std::pair result = args_map.insert(std::make_pair(a.name(), a)); if (!result.second) { throw std::invalid_argument("argument with name: '" + a.name() + "' already registered"); } args_display_order.push_back(a.name()); } public: template cli_argument_pack& arg(type& dest, std::string const& name, std::string const& description, bool (*validate)(const type&) = nullptr) { internal::argument a(name, description, dest, validate); add_arg(a); if (internal::is_bool::value()) { bool_args_names.insert(name); } return *this; } //Positional means that argument name can be omitted in actual CL //only key to match values for parameters with template cli_argument_pack& positional_arg(type& dest, std::string const& name, std::string const& description, bool (*validate)(const type&) = nullptr) { internal::argument a(name, description, dest, 
validate); add_arg(a); if (internal::is_bool::value()) { bool_args_names.insert(name); } positional_arg_names.push_back(name); return *this; } void parse(std::size_t argc, char const* argv[]) { { std::size_t current_positional_index = 0; for (std::size_t j = 1; j < argc; j++) { internal::argument* pa = nullptr; std::string argument_value; const char* const begin = argv[j]; const char* const end = begin + std::strlen(argv[j]); const char* const assign_sign = std::find(begin, end, '='); struct throw_unknown_parameter { static void _(std::string const& location) { throw std::invalid_argument(std::string("unknown parameter starting at:'") + location + "'"); } }; //first try to interpret it like parameter=value string if (assign_sign != end) { std::string name_found = std::string(begin, assign_sign); args_map_type::iterator it = args_map.find(name_found); if (it != args_map.end()) { pa = &((*it).second); argument_value = std::string(assign_sign + 1, end); } else { throw_unknown_parameter::_(argv[j]); } } //then see is it a named flag else { args_map_type::iterator it = args_map.find(argv[j]); if (it != args_map.end()) { pa = &((*it).second); argument_value = ""; } //then try it as positional argument without name specified else if (current_positional_index < positional_arg_names.size()) { std::stringstream str(argv[j]); args_map_type::iterator found_positional_arg = args_map.find(positional_arg_names.at(current_positional_index)); //TODO: probably use of smarter assert would help here assert( found_positional_arg != args_map .end() /*&&"positional_arg_names and args_map are out of sync"*/); if (found_positional_arg == args_map.end()) { throw std::logic_error( "positional_arg_names and args_map are out of sync"); } pa = &((*found_positional_arg).second); argument_value = argv[j]; current_positional_index++; } else { //TODO: add tc to check throw_unknown_parameter::_(argv[j]); } } assert(pa); if (pa->is_matched()) { throw std::invalid_argument(std::string("several values 
specified for: '") + pa->name() + "' argument"); } pa->parse_and_store(argument_value); } } } std::string usage_string(const std::string& binary_name) const { std::string command_line_params; std::string summary_description; for (args_display_order_type::const_iterator it = args_display_order.begin(); it != args_display_order.end(); ++it) { const bool is_bool = (0 != bool_args_names.count((*it))); args_map_type::const_iterator argument_it = args_map.find(*it); //TODO: probably use of smarter assert would help here assert(argument_it != args_map.end() /*&&"args_display_order and args_map are out of sync"*/); if (argument_it == args_map.end()) { throw std::logic_error("args_display_order and args_map are out of sync"); } const internal::argument& a = (*argument_it).second; command_line_params += " [" + a.name() + (is_bool ? "" : "=value") + "]"; summary_description += " " + a.name() + " - " + a.description() + " (" + a.value() + ")" + "\n"; } std::string positional_arg_cl; for (positional_arg_names_type::const_iterator it = positional_arg_names.begin(); it != positional_arg_names.end(); ++it) { positional_arg_cl += " [" + (*it); } for (std::size_t i = 0; i < positional_arg_names.size(); ++i) { positional_arg_cl += "]"; } command_line_params += positional_arg_cl; std::stringstream str; str << " Program usage is:" << "\n" << " " << binary_name << command_line_params << "\n" << "\n" << " where:" << "\n" << summary_description; return str.str(); } }; // class cli_argument_pack // utility class to aid relative error measurement of samples class measurements { public: measurements() = default; measurements(unsigned iterations) { _time_intervals.reserve(iterations); } inline void start() { _startTime = std::chrono::steady_clock::now(); } inline void stop() { auto _endTime = std::chrono::steady_clock::now(); // store the end time and start time _time_intervals.push_back(std::make_pair(_startTime, _endTime)); } double computeRelError() { // Accumulate the total duration in 
microseconds using std::accumulate with a lambda function assert(0 != _time_intervals.size()); auto total_duration = std::accumulate( _time_intervals.begin(), _time_intervals.end(), 0, // Start with 0 count [](long long total, const std::pair& interval) { // Compute the difference and add it to the total return total + std::chrono::duration_cast( interval.second - interval.first) .count(); }); unsigned long long averageTimePerFrame = total_duration / _time_intervals.size(); unsigned long long sumOfSquareDiff = 0; std::for_each(_time_intervals.begin(), _time_intervals.end(), [&](const std::pair& interval) { unsigned long long duration = std::chrono::duration_cast( interval.second - interval.first) .count(); long long diff = duration - averageTimePerFrame; sumOfSquareDiff += diff * diff; }); double stdDev = std::sqrt(sumOfSquareDiff / _time_intervals.size()); double relError = 100 * (stdDev / averageTimePerFrame); return relError; } private: using time_point = std::chrono::steady_clock::time_point; time_point _startTime; std::vector> _time_intervals; }; namespace internal { template bool is_power_of_2(T val) { std::size_t intval = std::size_t(val); return (intval & (intval - 1)) == std::size_t(0); } int step_function_plus(int previous, double step) { return static_cast(previous + step); } int step_function_multiply(int previous, double multiply) { return static_cast(previous * multiply); } // "Power-of-2 ladder": nsteps is the desired number of steps between any subsequent powers of 2. // The actual step is the quotient of the nearest smaller power of 2 divided by that number (but at least 1). 
// E.g., '1:32:#4' means 1,2,3,4,5,6,7,8,10,12,14,16,20,24,28,32 int step_function_power2_ladder(int previous, double nsteps) { int steps = int(nsteps); assert(is_power_of_2(steps)); // must be a power of 2 // The actual step is 1 until the value is twice as big as nsteps if (previous < 2 * steps) return previous + 1; // calculate the previous power of 2 int prev_power2 = previous / 2; // start with half the given value int rshift = 1; // and with the shift of 1; while (int shifted = prev_power2 >> rshift) { // shift the value right; while the result is non-zero, prev_power2 |= shifted; // add the bits set in 'shifted'; rshift <<= 1; // double the shift, as twice as many top bits are set; } // repeat. ++prev_power2; // all low bits set; now it's just one less than the desired power of 2 assert(is_power_of_2(prev_power2)); assert((prev_power2 <= previous) && (2 * prev_power2 > previous)); // The actual step value is the previous power of 2 divided by steps return previous + (prev_power2 / steps); } typedef int (*step_function_ptr_type)(int, double); struct step_function_descriptor { char mnemonic; step_function_ptr_type function; public: step_function_descriptor(char a_mnemonic, step_function_ptr_type a_function) : mnemonic(a_mnemonic), function(a_function) {} private: void operator=(step_function_descriptor const&); }; step_function_descriptor step_function_descriptors[] = { step_function_descriptor('*', step_function_multiply), step_function_descriptor('+', step_function_plus), step_function_descriptor('#', step_function_power2_ladder) }; template inline std::size_t array_length(const T (&)[N]) { return N; } struct thread_range_step { step_function_ptr_type step_function; double step_function_argument; thread_range_step(step_function_ptr_type step_function_, double step_function_argument_) : step_function(step_function_), step_function_argument(step_function_argument_) { if (!step_function_) throw std::invalid_argument( "step_function for thread range step should 
not be nullptr"); } int operator()(int previous) const { assert(0 <= previous); // test 0<=first and loop discipline const int ret = step_function(previous, step_function_argument); assert(previous < ret); return ret; } friend std::istream& operator>>(std::istream& input_stream, thread_range_step& step) { char function_char; double function_argument; input_stream >> function_char >> function_argument; std::size_t i = 0; while ((i < array_length(step_function_descriptors)) && (step_function_descriptors[i].mnemonic != function_char)) ++i; if (i >= array_length(step_function_descriptors)) { throw std::invalid_argument("unknown step function mnemonic: " + std::string(1, function_char)); } else if ((function_char == '#') && !is_power_of_2(function_argument)) { throw std::invalid_argument("the argument of # should be a power of 2"); } step.step_function = step_function_descriptors[i].function; step.step_function_argument = function_argument; return input_stream; } }; } // namespace internal struct thread_number_range { int (*auto_number_of_threads)(); int first; // 0<=first (0 can be used as a special value) int last; // first<=last ::utility::internal::thread_range_step step; thread_number_range( int (*auto_number_of_threads_)(), int low_ = 1, int high_ = -1, ::utility::internal::thread_range_step step_ = ::utility::internal::thread_range_step(::utility::internal::step_function_power2_ladder, 4)) : auto_number_of_threads(auto_number_of_threads_), first(low_), last((high_ > -1) ? 
high_ : auto_number_of_threads_()), step(step_) { if (first < 0) { throw std::invalid_argument("negative value not allowed"); } if (first > last) { throw std::invalid_argument("decreasing sequence not allowed"); } } friend std::istream& operator>>(std::istream& i, thread_number_range& range) { try { std::string s; i >> s; struct string_to_number_of_threads { int auto_value; string_to_number_of_threads(int auto_value_) : auto_value(auto_value_) {} int operator()(const std::string& value) const { return (value == "auto") ? auto_value : internal::string_to(value); } }; string_to_number_of_threads string_to_number_of_threads(range.auto_number_of_threads()); int low, high; std::size_t colon = s.find(':'); if (colon == std::string::npos) { low = high = string_to_number_of_threads(s); } else { //it is a range std::size_t second_colon = s.find(':', colon + 1); low = string_to_number_of_threads(std::string(s, 0, colon)); //not copying the colon high = string_to_number_of_threads( std::string(s, colon + 1, second_colon - (colon + 1))); //not copying the colons if (second_colon != std::string::npos) { internal::string_to(std::string(s, second_colon + 1), range.step); } } range = thread_number_range(range.auto_number_of_threads, low, high, range.step); } catch (std::invalid_argument&) { i.setstate(std::ios::failbit); throw; } return i; } friend std::ostream& operator<<(std::ostream& o, thread_number_range const& range) { using namespace internal; std::size_t i = 0; for (; i < array_length(step_function_descriptors) && step_function_descriptors[i].function != range.step.step_function; ++i) { } if (i >= array_length(step_function_descriptors)) { throw std::invalid_argument("unknown step function for thread range"); } o << range.first << ":" << range.last << ":" << step_function_descriptors[i].mnemonic << range.step.step_function_argument; return o; } }; // struct thread_number_range //TODO: fix unused warning here static const char* thread_number_range_desc = "number of threads 
to use; a range of the form low[:high[:(+|*|#)step]]," "\n\twhere low and optional high are non-negative integers or 'auto' for the default choice," "\n\tand optional step expression specifies how thread numbers are chosen within the range."; inline void report_elapsed_time(double seconds) { std::cout << "elapsed time : " << seconds << " seconds" << "\n"; } inline void report_skipped() { std::cout << "skip" << "\n"; } inline void report_relative_error(double err) { std::cout << "Relative_Err : " << err << " %" << "\n"; } inline void parse_cli_arguments(int argc, const char* argv[], utility::cli_argument_pack cli_pack) { bool show_help = false; cli_pack.arg(show_help, "-h", "show this message"); bool invalid_input = false; try { cli_pack.parse(argc, argv); } catch (std::exception& e) { std::cerr << "error occurred while parsing command line." << "\n" << "error text: " << e.what() << "\n" << std::flush; invalid_input = true; } if (show_help || invalid_input) { std::cout << cli_pack.usage_string(argv[0]) << std::flush; std::exit(0); } } inline void parse_cli_arguments(int argc, char* argv[], utility::cli_argument_pack cli_pack) { parse_cli_arguments(argc, const_cast(argv), cli_pack); } } // namespace utility #endif /* TBB_examples_utility_H */ ================================================ FILE: third-party/tbb/examples/concurrent_hash_map/README.md ================================================ # Code Samples of oneAPI Threading Building Blocks (oneTBB) This directory contains examples of the `concurrent_hash_map` container. | Code sample name | Description |:--- |:--- | count_strings | Concurrently inserts strings into a `concurrent_hash_map` container. 
================================================ FILE: third-party/tbb/examples/concurrent_hash_map/count_strings/CMakeLists.txt ================================================ # Copyright (c) 2020-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. cmake_minimum_required(VERSION 3.5.0...3.31.3) project(count_strings CXX) include(../../common/cmake/common.cmake) set_common_project_settings(tbb) add_executable(count_strings count_strings.cpp) target_link_libraries(count_strings TBB::tbb Threads::Threads) target_compile_options(count_strings PRIVATE ${TBB_CXX_STD_FLAG}) set(EXECUTABLE "$") set(ARGS "") set(PERF_ARGS auto 10000000 silent) add_execution_target(run_count_strings count_strings ${EXECUTABLE} "${ARGS}") add_execution_target(perf_run_count_strings count_strings ${EXECUTABLE} "${PERF_ARGS}") ================================================ FILE: third-party/tbb/examples/concurrent_hash_map/count_strings/README.md ================================================ # Count_strings sample The example counts the number of unique words in a text. ## Building the example ``` cmake cmake --build . ``` ## Running the sample ### Predefined make targets * `make run_count_strings` - executes the example with predefined parameters. * `make perf_run_count_strings` - executes the example with suggested parameters to measure the oneTBB performance. 
### Application parameters Usage: ``` count_strings [n-of-threads=value] [n-of-strings=value] [verbose] [silent] [count_collisions] [-h] [n-of-threads [n-of-strings]] ``` * `-h` - prints the help for command line options. * `n-of-threads` - number of threads to use; a range of the form low\[:high\], where low and optional high are non-negative integers or `auto` for a platform-specific default number. * `n-of-strings` - number of strings. * `verbose` - prints diagnostic output to screen. * `silent` - no output except elapsed time. * `count_collisions` - prints the count of collisions. ================================================ FILE: third-party/tbb/examples/concurrent_hash_map/count_strings/count_strings.cpp ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include #include #include #include // Apple clang and MSVC defines their own specializations for std::hash> #if !(_LIBCPP_VERSION) && !(_CPPLIB_VER) namespace std { template class hash> { public: std::size_t operator()(const std::basic_string& s) const { std::size_t h = 0; for (const CharT* c = s.c_str(); *c; ++c) { h = h * hash_multiplier ^ char_hash(*c); } return h; } private: static constexpr std::size_t hash_multiplier = (std::size_t)( (sizeof(std::size_t) == sizeof(unsigned)) ?
2654435769U : 11400714819323198485ULL); std::hash char_hash; }; // struct hash } // namespace std #endif // !(_LIBCPP_VERSION || _CPPLIB_VER) #include "oneapi/tbb/concurrent_hash_map.h" #include "oneapi/tbb/blocked_range.h" #include "oneapi/tbb/parallel_for.h" #include "oneapi/tbb/tick_count.h" #include "oneapi/tbb/tbb_allocator.h" #include "oneapi/tbb/global_control.h" #include "common/utility/utility.hpp" #include "common/utility/get_default_num_threads.hpp" #include //! Count collisions std::map hashes; int c = 0; //! String type typedef std::basic_string, oneapi::tbb::tbb_allocator> MyString; //! Set to true to counts. static bool verbose = false; static bool silent = false; static bool count_collisions = false; //! Problem size long N = 1000000; const int size_factor = 2; //! A concurrent hash table that maps strings to ints. typedef oneapi::tbb::concurrent_hash_map StringTable; //! Function object for counting occurrences of strings. struct Tally { StringTable& table; Tally(StringTable& table_) : table(table_) {} void operator()(const oneapi::tbb::blocked_range range) const { for (MyString* p = range.begin(); p != range.end(); ++p) { StringTable::accessor a; table.insert(a, *p); a->second += 1; } } }; static MyString* Data; static void CountOccurrences(int nthreads) { StringTable table; oneapi::tbb::tick_count t0 = oneapi::tbb::tick_count::now(); oneapi::tbb::parallel_for(oneapi::tbb::blocked_range(Data, Data + N, 1000), Tally(table)); oneapi::tbb::tick_count t1 = oneapi::tbb::tick_count::now(); int n = 0; for (StringTable::iterator i = table.begin(); i != table.end(); ++i) { if (verbose && nthreads) printf("%s %d\n", i->first.c_str(), i->second); if (!silent && count_collisions) { // it doesn't count real collisions in hash_map, a mask should be applied on hash value hashes[std::hash()(i->first) & 0xFFFF]++; } n += i->second; } if (!silent && count_collisions) { for (auto i = hashes.begin(); i != hashes.end(); ++i) c += i->second - 1; printf("hashes = %d 
collisions = %d ", static_cast(hashes.size()), c); c = 0; hashes.clear(); } if (!silent) printf( "total = %d unique = %u time = %g\n", n, unsigned(table.size()), (t1 - t0).seconds()); } /// Generator of random words struct Sound { const char* chars; int rates[3]; // beginning, middle, ending }; Sound Vowels[] = { { "e", { 445, 6220, 1762 } }, { "a", { 704, 5262, 514 } }, { "i", { 402, 5224, 162 } }, { "o", { 248, 3726, 191 } }, { "u", { 155, 1669, 23 } }, { "y", { 4, 400, 989 } }, { "io", { 5, 512, 18 } }, { "ia", { 1, 329, 111 } }, { "ea", { 21, 370, 16 } }, { "ou", { 32, 298, 4 } }, { "ie", { 0, 177, 140 } }, { "ee", { 2, 183, 57 } }, { "ai", { 17, 206, 7 } }, { "oo", { 1, 215, 7 } }, { "au", { 40, 111, 2 } }, { "ua", { 0, 102, 4 } }, { "ui", { 0, 104, 1 } }, { "ei", { 6, 94, 3 } }, { "ue", { 0, 67, 28 } }, { "ay", { 1, 42, 52 } }, { "ey", { 1, 14, 80 } }, { "oa", { 5, 84, 3 } }, { "oi", { 2, 81, 1 } }, { "eo", { 1, 71, 5 } }, { "iou", { 0, 61, 0 } }, { "oe", { 2, 46, 9 } }, { "eu", { 12, 43, 0 } }, { "iu", { 0, 45, 0 } }, { "ya", { 12, 19, 5 } }, { "ae", { 7, 18, 10 } }, { "oy", { 0, 10, 13 } }, { "ye", { 8, 7, 7 } }, { "ion", { 0, 0, 20 } }, { "ing", { 0, 0, 20 } }, { "ium", { 0, 0, 10 } }, { "er", { 0, 0, 20 } } }; Sound Consonants[] = { { "r", { 483, 1414, 1110 } }, { "n", { 312, 1548, 1114 } }, { "t", { 363, 1653, 251 } }, { "l", { 424, 1341, 489 } }, { "c", { 734, 735, 260 } }, { "m", { 732, 785, 161 } }, { "d", { 558, 612, 389 } }, { "s", { 574, 570, 405 } }, { "p", { 519, 361, 98 } }, { "b", { 528, 356, 30 } }, { "v", { 197, 598, 16 } }, { "ss", { 3, 191, 567 } }, { "g", { 285, 430, 42 } }, { "st", { 142, 323, 180 } }, { "h", { 470, 89, 30 } }, { "nt", { 0, 350, 231 } }, { "ng", { 0, 117, 442 } }, { "f", { 319, 194, 19 } }, { "ll", { 1, 414, 83 } }, { "w", { 249, 131, 64 } }, { "k", { 154, 179, 47 } }, { "nd", { 0, 279, 92 } }, { "bl", { 62, 235, 0 } }, { "z", { 35, 223, 16 } }, { "sh", { 112, 69, 79 } }, { "ch", { 139, 95, 25 } }, { "th", { 70, 143, 39 } 
}, { "tt", { 0, 219, 19 } }, { "tr", { 131, 104, 0 } }, { "pr", { 186, 41, 0 } }, { "nc", { 0, 223, 2 } }, { "j", { 184, 32, 1 } }, { "nn", { 0, 188, 20 } }, { "rt", { 0, 148, 51 } }, { "ct", { 0, 160, 29 } }, { "rr", { 0, 182, 3 } }, { "gr", { 98, 87, 0 } }, { "ck", { 0, 92, 86 } }, { "rd", { 0, 81, 88 } }, { "x", { 8, 102, 48 } }, { "ph", { 47, 101, 10 } }, { "br", { 115, 43, 0 } }, { "cr", { 92, 60, 0 } }, { "rm", { 0, 131, 18 } }, { "ns", { 0, 124, 18 } }, { "sp", { 81, 55, 4 } }, { "sm", { 25, 29, 85 } }, { "sc", { 53, 83, 1 } }, { "rn", { 0, 100, 30 } }, { "cl", { 78, 42, 0 } }, { "mm", { 0, 116, 0 } }, { "pp", { 0, 114, 2 } }, { "mp", { 0, 99, 14 } }, { "rs", { 0, 96, 16 } }, { "rl", { 0, 97, 7 } }, { "rg", { 0, 81, 15 } }, { "pl", { 56, 39, 0 } }, { "sn", { 32, 62, 1 } }, { "str", { 38, 56, 0 } }, { "dr", { 47, 44, 0 } }, { "fl", { 77, 13, 1 } }, { "fr", { 77, 11, 0 } }, { "ld", { 0, 47, 38 } }, { "ff", { 0, 62, 20 } }, { "lt", { 0, 61, 19 } }, { "rb", { 0, 75, 4 } }, { "mb", { 0, 72, 7 } }, { "rc", { 0, 76, 1 } }, { "gg", { 0, 74, 1 } }, { "pt", { 1, 56, 10 } }, { "bb", { 0, 64, 1 } }, { "sl", { 48, 17, 0 } }, { "dd", { 0, 59, 2 } }, { "gn", { 3, 50, 4 } }, { "rk", { 0, 30, 28 } }, { "nk", { 0, 35, 20 } }, { "gl", { 40, 14, 0 } }, { "wh", { 45, 6, 0 } }, { "ntr", { 0, 50, 0 } }, { "rv", { 0, 47, 1 } }, { "ght", { 0, 19, 29 } }, { "sk", { 23, 17, 5 } }, { "nf", { 0, 46, 0 } }, { "cc", { 0, 45, 0 } }, { "ln", { 0, 41, 0 } }, { "sw", { 36, 4, 0 } }, { "rp", { 0, 36, 4 } }, { "dn", { 0, 38, 0 } }, { "ps", { 14, 19, 5 } }, { "nv", { 0, 38, 0 } }, { "tch", { 0, 21, 16 } }, { "nch", { 0, 26, 11 } }, { "lv", { 0, 35, 0 } }, { "wn", { 0, 14, 21 } }, { "rf", { 0, 32, 3 } }, { "lm", { 0, 30, 5 } }, { "dg", { 0, 34, 0 } }, { "ft", { 0, 18, 15 } }, { "scr", { 23, 10, 0 } }, { "rch", { 0, 24, 6 } }, { "rth", { 0, 23, 7 } }, { "rh", { 13, 15, 0 } }, { "mpl", { 0, 29, 0 } }, { "cs", { 0, 1, 27 } }, { "gh", { 4, 10, 13 } }, { "ls", { 0, 23, 3 } }, { "ndr", { 0, 25, 0 } }, 
{ "tl", { 0, 23, 1 } },   { "ngl", { 0, 25, 0 } }, { "lk", { 0, 15, 9 } },
    { "rw", { 0, 23, 0 } },   { "lb", { 0, 23, 1 } },  { "tw", { 15, 8, 0 } },
    { "chr", { 18, 4, 0 } },  { "dl", { 0, 23, 0 } },  { "ctr", { 0, 22, 0 } },
    { "nst", { 0, 21, 0 } },  { "lc", { 0, 22, 0 } },  { "sch", { 16, 4, 0 } },
    { "ths", { 0, 1, 20 } },  { "nl", { 0, 21, 0 } },  { "lf", { 0, 15, 6 } },
    { "ssn", { 0, 20, 0 } },  { "xt", { 0, 18, 1 } },  { "xp", { 0, 20, 0 } },
    { "rst", { 0, 15, 5 } },  { "nh", { 0, 19, 0 } },  { "wr", { 14, 5, 0 } }
};

const int VowelsNumber = sizeof(Vowels) / sizeof(Sound);
const int ConsonantsNumber = sizeof(Consonants) / sizeof(Sound);
int VowelsRatesSum[3] = { 0, 0, 0 }, ConsonantsRatesSum[3] = { 0, 0, 0 };

// Returns the sum of rates[part] over the first `num` entries of `sounds`.
// `part` selects the word position: 0 = beginning, 1 = middle, 2 = ending.
int CountRateSum(Sound sounds[], const int num, const int part) {
    int sum = 0;
    for (int i = 0; i < num; i++)
        sum += sounds[i].rates[part];
    return sum;
}

// Draws one sound at random, weighted by the rates for word position `part`.
// An odd `type` selects from Vowels, an even one from Consonants. The matching
// *RatesSum[part] total must already have been filled in (CreateData does that),
// because it is used as the modulus for rand().
const char* GetLetters(int type, const int part) {
    Sound* sounds;
    int rate, i = 0;
    if (type & 1)
        sounds = Vowels, rate = rand() % VowelsRatesSum[part];
    else
        sounds = Consonants, rate = rand() % ConsonantsRatesSum[part];
    // Walk the table, consuming each entry's weight until the drawn value is used up.
    do {
        rate -= sounds[i++].rates[part];
    } while (rate > 0);
    return sounds[--i].chars;
}

// Fills Data[0..N) with pseudo-random words and, unless `silent` is set, prints a
// greeting assembled from a few of them.
static void CreateData() {
    for (int i = 0; i < 3; i++) {
        ConsonantsRatesSum[i] = CountRateSum(Consonants, ConsonantsNumber, i);
        VowelsRatesSum[i] = CountRateSum(Vowels, VowelsNumber, i);
    }
    // N is a long, so the index is one too.
    for (long i = 0; i < N; ++i) {
        int type = rand();
        // Alternate vowel/consonant groups: a beginning, optional middles, an ending.
        Data[i] = GetLetters(type++, 0);
        for (int j = 0; j < type % size_factor; ++j)
            Data[i] += GetLetters(type++, 1);
        Data[i] += GetLetters(type, 2);
    }
    if (silent)
        return;

    // The greeting reads Data[0..5] and Data[12]. With fewer than 13 strings those
    // reads would run past the end of Data, so only the progress line is printed.
    const long words_needed_for_greeting = 13;
    if (N < words_needed_for_greeting) {
        printf("Analyzing whole text...\n");
        return;
    }
    MyString planet = Data[12];
    planet[0] = toupper(planet[0]);
    MyString helloworld = Data[0];
    helloworld[0] = toupper(helloworld[0]);
    helloworld += ", " + Data[1] + " " + Data[2] + " " + Data[3] + " " + Data[4] + " " + Data[5];
    printf("Message from planet '%s': %s!\nAnalyzing whole text...\n",
           planet.c_str(),
           helloworld.c_str());
}

int main(int argc, char* argv[]) {
    StringTable table;
    oneapi::tbb::tick_count mainStartTime =
oneapi::tbb::tick_count::now(); srand(2); //! Working threads count // The 1st argument is the function to obtain 'auto' value; the 2nd is the default value // The example interprets 0 threads as "run serially, then fully subscribed" utility::thread_number_range threads(utility::get_default_num_threads, 0); utility::parse_cli_arguments( argc, argv, utility::cli_argument_pack() //"-h" option for displaying help is present implicitly .positional_arg(threads, "n-of-threads", utility::thread_number_range_desc) .positional_arg(N, "n-of-strings", "number of strings") .arg(verbose, "verbose", "verbose mode") .arg(silent, "silent", "no output except elapsed time") .arg(count_collisions, "count_collisions", "print the count of collisions")); if (silent) verbose = false; Data = new MyString[N]; CreateData(); if (threads.first) { for (int p = threads.first; p <= threads.last; p = threads.step(p)) { if (!silent) printf("threads = %d ", p); oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, p); CountOccurrences(p); } } else { // Number of threads wasn't set explicitly. Run serial and parallel version { // serial run if (!silent) printf("serial run "); oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, 1); CountOccurrences(1); } { // parallel run (number of threads is selected automatically) if (!silent) printf("parallel run "); oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, utility::get_default_num_threads()); CountOccurrences(0); } } delete[] Data; utility::report_elapsed_time((oneapi::tbb::tick_count::now() - mainStartTime).seconds()); return 0; } ================================================ FILE: third-party/tbb/examples/concurrent_priority_queue/README.md ================================================ # Code Samples of oneAPI Threading Building Blocks (oneTBB) This directory contains examples of the `concurrent_priority_queue` container. 
| Code sample name | Description |:--- |:--- | shortpath | Solves the single source shortest path problem using a `concurrent_priority_queue` container. ================================================ FILE: third-party/tbb/examples/concurrent_priority_queue/shortpath/CMakeLists.txt ================================================ # Copyright (c) 2020-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. cmake_minimum_required(VERSION 3.5.0...3.31.3) project(shortpath CXX) include(../../common/cmake/common.cmake) set_common_project_settings(tbb) add_executable(shortpath shortpath.cpp) target_link_libraries(shortpath TBB::tbb Threads::Threads) target_compile_options(shortpath PRIVATE ${TBB_CXX_STD_FLAG}) set(EXECUTABLE "$") set(ARGS 4 N=1000 start=0 end=999 verbose) set(PERF_ARGS auto N=1000 start=0 end=99 silent) add_execution_target(run_shortpath shortpath ${EXECUTABLE} "${ARGS}") add_execution_target(perf_run_shortpath shortpath ${EXECUTABLE} "${PERF_ARGS}") ================================================ FILE: third-party/tbb/examples/concurrent_priority_queue/shortpath/README.md ================================================ # Shortpath sample This directory contains an example that solves the single source shortest path problem. It is parameterized by `N`, a number of nodes, and a start and end node in `[0..N)`. A graph is generated with `N` nodes and some random number of connections between those nodes. 
A parallel algorithm based on `A*` is used to find the shortest path. This algorithm differs from serial `A*` in that it needs to add nodes back to the open set when the `g` estimate (shortest path from start to the node) is improved, even if the node has already been "visited". This is because nodes are added to and removed from the open set in parallel, resulting in some less optimal paths being explored. The open set is implemented with the `concurrent_priority_queue`. Note that since we re-visit nodes, the `f` estimate (on which the priority queue is sorted) is not technically needed, so we could use this same parallel algorithm with just a `concurrent_queue`. However, keeping the `f` estimate and using `concurrent_priority_queue` results in much better performance. Silent mode prints run time only, regular mode prints the shortest path length, and verbose mode prints out the shortest path. The generated graph follows a pattern in which the closer two node ids are to each other, the fewer hops there are in a typical path between those nodes. So, for example, the path between 5 and 7 likely has few hops whereas 14 to 78 has more and 0 to 9999 has even more, etc. ## Building the example ``` cmake <path_to_example> cmake --build . ``` ## Running the sample ### Predefined make targets * `make run_shortpath` - executes the example with predefined parameters. * `make perf_run_shortpath` - executes the example with suggested parameters to measure the oneTBB performance. ### Application parameters Usage: ``` shortpath [#threads=value] [verbose] [silent] [N=value] [start=value] [end=value] [-h] [#threads] ``` * `-h` - prints the help for command line options. * `#threads` - number of threads to use; a range of the form low[:high], where low and optional high are non-negative integers or `auto` for a platform-specific default number. * `verbose` - prints diagnostic output to screen. * `silent` - no output except elapsed time. * `N` - number of nodes in graph.
* `start` - node to start path at. * `end` - node to end path at. ================================================ FILE: third-party/tbb/examples/concurrent_priority_queue/shortpath/shortpath.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include #include #include #include "oneapi/tbb/tick_count.h" #include "oneapi/tbb/task_group.h" #include "oneapi/tbb/concurrent_priority_queue.h" #include "oneapi/tbb/spin_mutex.h" #include "oneapi/tbb/parallel_for.h" #include "oneapi/tbb/blocked_range.h" #include "oneapi/tbb/global_control.h" #include "common/utility/utility.hpp" #include "common/utility/fast_random.hpp" #include "common/utility/get_default_num_threads.hpp" #if defined(_MSC_VER) && defined(_Wp64) // Workaround for overzealous compiler warnings in /Wp64 mode #pragma warning(disable : 4267) #endif /* _MSC_VER && _Wp64 */ struct point { double x, y; point() {} point(double _x, double _y) : x(_x), y(_y) {} point(const point& p) : x(p.x), y(p.y) {} }; double get_distance(const point& p1, const point& p2) { double xdiff = p1.x - p2.x, ydiff = p1.y - p2.y; return sqrt(xdiff * xdiff + ydiff * ydiff); } // generates random points on 2D plane within a box of maxsize width & height point generate_random_point(utility::FastRandom& mr) { const std::size_t maxsize = 500; double x = (double)(mr.get() % maxsize); double y = (double)(mr.get() % maxsize); return point(x, y); } // weighted toss makes closer 
nodes (in the point vector) heavily connected bool die_toss(std::size_t a, std::size_t b, utility::FastRandom& mr) { int node_diff = std::abs(int(a - b)); // near nodes if (node_diff < 16) return true; // mid nodes if (node_diff < 64) return ((int)mr.get() % 8 == 0); // far nodes if (node_diff < 512) return ((int)mr.get() % 16 == 0); return false; } typedef std::vector point_set; typedef std::size_t vertex_id; typedef std::pair vertex_rec; typedef std::vector> edge_set; bool verbose = false; // prints bin details and other diagnostics to screen bool silent = false; // suppress all output except for time std::size_t N = 1000; // number of vertices std::size_t src = 0; // start of path std::size_t dst = N - 1; // end of path double INF = 100000.0; // infinity std::size_t grainsize = 16; // number of vertices per task on average std::size_t max_spawn; // max tasks to spawn std::atomic num_spawn; // number of active tasks point_set vertices; // vertices edge_set edges; // edges std::vector predecessor; // for recreating path from src to dst std::vector f_distance; // estimated distances at particular vertex std::vector g_distance; // current shortest distances from src vertex oneapi::tbb::spin_mutex* locks; // a lock for each vertex oneapi::tbb::task_group* sp_group; // task group for tasks executing sub-problems struct compare_f { bool operator()(const vertex_rec& u, const vertex_rec& v) const { return u.second > v.second; } }; oneapi::tbb::concurrent_priority_queue open_set; // tentative vertices void shortpath_helper(); void shortpath() { sp_group = new oneapi::tbb::task_group; g_distance[src] = 0.0; // src's distance from src is zero f_distance[src] = get_distance(vertices[src], vertices[dst]); // estimate distance from src to dst open_set.emplace(src, f_distance[src]); // emplace src into open_set sp_group->run([]() { shortpath_helper(); }); sp_group->wait(); delete sp_group; } void shortpath_helper() { vertex_rec u_rec; while (open_set.try_pop(u_rec)) { vertex_id 
u = u_rec.first;
        // Reaching the destination needs no further expansion.
        if (u == dst)
            continue;
        double f = u_rec.second;
        double old_g_u = 0.0;
        {
            // Read u's distances under u's own lock.
            oneapi::tbb::spin_mutex::scoped_lock l(locks[u]);
            // A record whose f is worse than the stored estimate is stale: a better
            // one for the same vertex has been pushed since.
            if (f > f_distance[u])
                continue; // prune search space
            old_g_u = g_distance[u];
        }
        // Try to improve every neighbour of u through u.
        for (std::size_t i = 0; i < edges[u].size(); ++i) {
            vertex_id v = edges[u][i];
            double new_g_v = old_g_u + get_distance(vertices[u], vertices[v]);
            double new_f_v = 0.0;
            // the push flag lets us move some work out of the critical section below
            bool push = false;
            {
                oneapi::tbb::spin_mutex::scoped_lock l(locks[v]);
                if (new_g_v < g_distance[v]) {
                    predecessor[v] = u;
                    g_distance[v] = new_g_v;
                    new_f_v = f_distance[v] =
                        g_distance[v] + get_distance(vertices[v], vertices[dst]);
                    push = true;
                }
            }
            if (push) {
                open_set.push(std::make_pair(v, new_f_v));
                // Claim a task slot first; give it back if the limit is already reached.
                std::size_t n_spawn = ++num_spawn;
                if (n_spawn < max_spawn) {
                    sp_group->run([] {
                        shortpath_helper();
                    });
                }
                else
                    --num_spawn;
            }
        }
    }
    // This helper is finished: release its task slot.
    --num_spawn;
}

// Appends the vertices of the path from src to dst to `path`, in travel order, by
// following predecessor[] back from dst recursively. A predecessor equal to N marks
// a vertex without a recorded predecessor; in that case only src is appended.
void make_path(vertex_id src, vertex_id dst, std::vector& path) {
    vertex_id at = predecessor[dst];
    if (at == N)
        path.push_back(src);
    else if (at == src) {
        path.push_back(src);
        path.push_back(dst);
    }
    else {
        make_path(src, at, path);
        path.push_back(dst);
    }
}

// Rebuilds the path from src to dst and prints it. In verbose mode the segment
// lengths, the vertex chain and the total distance are printed; otherwise, unless
// silent is set, only the total distance is printed.
void print_path() {
    std::vector path;
    double path_length = 0.0;
    make_path(src, dst, path);
    if (verbose)
        printf("\n ");
    // First pass: accumulate (and in verbose mode print) the length of each segment.
    for (std::size_t i = 0; i < path.size(); ++i) {
        if (path[i] != dst) {
            double seg_length = get_distance(vertices[path[i]], vertices[path[i + 1]]);
            if (verbose)
                printf("%6.1f ", seg_length);
            path_length += seg_length;
        }
        else if (verbose)
            printf("\n");
    }
    // Second pass (verbose only): print the chain of vertex ids.
    if (verbose) {
        for (std::size_t i = 0; i < path.size(); ++i) {
            if (path[i] != dst)
                printf("(%4d)------>", (int)path[i]);
            else
                printf("(%4d)\n", (int)path[i]);
        }
    }
    if (verbose)
        printf("Total distance = %5.1f\n", path_length);
    else if (!silent)
        printf(" %5.1f\n", path_length);
}

void InitializeGraph() {
    oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism,
                                  utility::get_default_num_threads());
    vertices.resize(N);
edges.resize(N); predecessor.resize(N); g_distance.resize(N); f_distance.resize(N); locks = new oneapi::tbb::spin_mutex[N]; if (verbose) printf("Generating vertices...\n"); oneapi::tbb::parallel_for( oneapi::tbb::blocked_range(0, N, 64), [&](oneapi::tbb::blocked_range& r) { utility::FastRandom my_random(r.begin()); for (std::size_t i = r.begin(); i != r.end(); ++i) { vertices[i] = generate_random_point(my_random); } }, oneapi::tbb::simple_partitioner()); if (verbose) printf("Generating edges...\n"); oneapi::tbb::parallel_for( oneapi::tbb::blocked_range(0, N, 64), [&](oneapi::tbb::blocked_range& r) { utility::FastRandom my_random(r.begin()); for (std::size_t i = r.begin(); i != r.end(); ++i) { for (std::size_t j = 0; j < i; ++j) { if (die_toss(i, j, my_random)) edges[i].push_back(j); } } }, oneapi::tbb::simple_partitioner()); for (std::size_t i = 0; i < N; ++i) { for (std::size_t j = 0; j < edges[i].size(); ++j) { vertex_id k = edges[i][j]; edges[k].push_back(i); } } if (verbose) printf("Done.\n"); } void ReleaseGraph() { delete[] locks; } void ResetGraph() { oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, utility::get_default_num_threads()); oneapi::tbb::parallel_for(oneapi::tbb::blocked_range(0, N), [&](oneapi::tbb::blocked_range& r) { for (std::size_t i = r.begin(); i != r.end(); ++i) { f_distance[i] = g_distance[i] = INF; predecessor[i] = N; } }); } int main(int argc, char* argv[]) { utility::thread_number_range threads(utility::get_default_num_threads); utility::parse_cli_arguments( argc, argv, utility::cli_argument_pack() //"-h" option for displaying help is present implicitly .positional_arg(threads, "#threads", utility::thread_number_range_desc) .arg(verbose, "verbose", " print diagnostic output to screen") .arg(silent, "silent", " limits output to timing info; overrides verbose") .arg(N, "N", " number of vertices") .arg(src, "start", " start of path") .arg(dst, "end", " end of path")); if (silent) verbose = false; // make 
silent override verbose else printf("shortpath will run with %d vertices to find shortest path between vertices" " %d and %d using %d:%d threads.\n", (int)N, (int)src, (int)dst, (int)threads.first, (int)threads.last); if (dst >= N) { if (verbose) printf("end value %d is invalid for %d vertices; correcting to %d\n", (int)dst, (int)N, (int)N - 1); dst = N - 1; } num_spawn = 0; max_spawn = N / grainsize; oneapi::tbb::tick_count t0, t1; InitializeGraph(); for (int n_thr = threads.first; n_thr <= threads.last; n_thr = threads.step(n_thr)) { ResetGraph(); oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, n_thr); t0 = oneapi::tbb::tick_count::now(); shortpath(); t1 = oneapi::tbb::tick_count::now(); if (!silent) { if (predecessor[dst] != N) { printf("%d threads: [%6.6f] The shortest path from vertex %d to vertex %d is:", (int)n_thr, (t1 - t0).seconds(), (int)src, (int)dst); print_path(); } else { printf("%d threads: [%6.6f] There is no path from vertex %d to vertex %d\n", (int)n_thr, (t1 - t0).seconds(), (int)src, (int)dst); } } else utility::report_elapsed_time((t1 - t0).seconds()); } ReleaseGraph(); return 0; } ================================================ FILE: third-party/tbb/examples/getting_started/README.md ================================================ # Code Samples of oneAPI Threading Building Blocks (oneTBB) This directory contains the examples referenced by the [oneAPI Threading Building Blocks Get Started Guide](https://www.intel.com/content/www/us/en/docs/onetbb/get-started-guide/current/overview.html) | Code sample name | Description |:--- |:--- | sub_string_finder | Finds largest matching substrings. 
================================================ FILE: third-party/tbb/examples/getting_started/sub_string_finder/CMakeLists.txt ================================================ # Copyright (c) 2020-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. cmake_minimum_required(VERSION 3.5.0...3.31.3) project(sub_string_finder_simple CXX) project(sub_string_finder_extended CXX) project(sub_string_finder_pretty CXX) include(../../common/cmake/common.cmake) set_common_project_settings(tbb) add_executable(sub_string_finder_simple sub_string_finder.cpp) add_executable(sub_string_finder_extended sub_string_finder_extended.cpp) add_executable(sub_string_finder_pretty sub_string_finder_pretty.cpp) target_link_libraries(sub_string_finder_simple TBB::tbb Threads::Threads) target_link_libraries(sub_string_finder_extended TBB::tbb Threads::Threads) target_link_libraries(sub_string_finder_pretty TBB::tbb Threads::Threads) target_compile_options(sub_string_finder_simple PRIVATE ${TBB_CXX_STD_FLAG}) target_compile_options(sub_string_finder_extended PRIVATE ${TBB_CXX_STD_FLAG}) target_compile_options(sub_string_finder_pretty PRIVATE ${TBB_CXX_STD_FLAG}) add_custom_target(sub_string_finder) add_dependencies(sub_string_finder sub_string_finder_simple sub_string_finder_extended sub_string_finder_pretty) set(EXECUTABLE "$") set(LIGHT_EXECUTABLE "$") set(ARGS "") set(LIGHT_ARGS "silent") add_execution_target(run_sub_string_finder sub_string_finder ${EXECUTABLE} "${ARGS}") 
add_execution_target(light_test_sub_string_finder sub_string_finder ${LIGHT_EXECUTABLE} "${LIGHT_ARGS}") ================================================ FILE: third-party/tbb/examples/getting_started/sub_string_finder/README.md ================================================ # Sub_string_finder sample An example that uses the `parallel_for` template in a substring matching program. The [oneAPI Threading Building Blocks Get Started Guide](https://www.intel.com/content/www/us/en/docs/onetbb/get-started-guide/current/overview.html) describes this example. For each position in a string, the program displays the length of the largest matching substring elsewhere in the string. The program also displays the location of a largest match for each position. Consider the string "babba" as an example. Starting at position 0, "ba" is the largest substring with a match elsewhere in the string (position 3). ## Building the example ``` cmake <path_to_example> cmake --build . ``` ### Predefined make targets * `make sub_string_finder_simple` - builds the example as it appears in the Get Started Guide. * `make sub_string_finder_extended` - builds a similar example with more attractive printing of the results. * `make sub_string_finder_pretty` - builds the example extended with a sequential implementation. * `make sub_string_finder` - builds all sample versions. ## Running the sample ### Predefined make targets * `make run_sub_string_finder` - executes the example with predefined parameters. * `make light_test_sub_string_finder` - executes the example with suggested parameters to reduce execution time. ### Application parameters Usage: ``` sub_string_finder_simple sub_string_finder_extended sub_string_finder_pretty ``` The example does not require any application parameters.
================================================ FILE: third-party/tbb/examples/getting_started/sub_string_finder/sub_string_finder.cpp ================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include
#include
#include
#include // std::max

#include "oneapi/tbb/parallel_for.h"
#include "oneapi/tbb/blocked_range.h"

#include "common/utility/utility.hpp"

// When set (via the "silent" command line flag) the results are not printed.
bool silent = false;

// Number of strings built in main(); the last one is the text that gets scanned.
static const std::size_t N = 23;

// Body object for parallel_for: for every position i of the sub-range it looks for the
// longest run of characters starting at i that also occurs starting at another position j.
class SubStringFinder {
    const std::string &str; // text being scanned
    std::vector &max_array; // per position: largest matching offset found (see below)
    std::vector &pos_array; // per position: start of the match that produced it

public:
    // Processes positions [r.begin(), r.end()). Only elements i of that sub-range are
    // written in max_array and pos_array.
    void operator()(const oneapi::tbb::blocked_range &r) const {
        for (std::size_t i = r.begin(); i != r.end(); ++i) {
            std::size_t max_size = 0, max_pos = 0;
            for (std::size_t j = 0; j < str.size(); ++j) {
                if (j != i) {
                    // Neither i + k nor j + k may run past the end of the text.
                    std::size_t limit = str.size() - (std::max)(i, j);
                    for (std::size_t k = 0; k < limit; ++k) {
                        if (str[i + k] != str[j + k])
                            break;
                        // k is the zero-based offset of the last character that matched,
                        // so the recorded value is one less than the match length, and
                        // a one-character match leaves both values at 0.
                        if (k > max_size) {
                            max_size = k;
                            max_pos = j;
                        }
                    }
                }
            }
            max_array[i] = max_size;
            pos_array[i] = max_pos;
        }
    }

    // Binds the text `s` and the two result arrays `m` and `p`; all three are kept by
    // reference.
    SubStringFinder(const std::string &s, std::vector &m, std::vector &p)
            : str(s),
              max_array(m),
              pos_array(p) {}
};

int main(int argc, char *argv[]) {
    // command line parsing
    utility::parse_cli_arguments(argc,
                                 argv,
                                 utility::cli_argument_pack()
                                     //"-h" option for displaying help is present implicitly
                                     .arg(silent, "silent", "no output"));

    // Build the input: each string is the concatenation of the two before it.
    std::string str[N] = { std::string("a"), std::string("b") };
    for (std::size_t i = 2; i < N; ++i)
        str[i] = str[i - 1] + str[i - 2];
std::string &to_scan = str[N - 1]; const std::size_t num_elem = to_scan.size(); std::vector max(num_elem); std::vector pos(num_elem); oneapi::tbb::parallel_for(oneapi::tbb::blocked_range(0, num_elem), SubStringFinder(to_scan, max, pos)); for (std::size_t i = 0; i < num_elem; ++i) if (!silent) std::cout << " " << max[i] << "(" << pos[i] << ")" << "\n"; return 0; } ================================================ FILE: third-party/tbb/examples/getting_started/sub_string_finder/sub_string_finder_extended.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
// Sequential reference implementation used by main() to validate the parallel
// SubStringFinder in this file.
//
// str       - string to scan.
// max_array - receives, for every start position i, the largest offset k such
//             that str[i..i+k] equals str[j..j+k] for some j != i.
// pos_array - receives the first j that achieves that offset.
// Both arrays must hold at least str.size() elements.
//
// NOTE: the stored value is the offset of the last matching character, i.e.
// match length minus one, and a one-character match leaves both entries at 0.
// This deliberately mirrors the parallel functor, because main() compares the
// two result sets element by element.
void SerialSubStringFinder(const std::string &str,
                           std::vector<std::size_t> &max_array,
                           std::vector<std::size_t> &pos_array) {
    const std::size_t len = str.size();
    for (std::size_t i = 0; i < len; ++i) {
        std::size_t max_size = 0, max_pos = 0;
        for (std::size_t j = 0; j < len; ++j) {
            if (j == i)
                continue;
            // Neither i + k nor j + k may run past the end of the string.
            const std::size_t limit = len - (std::max)(i, j);
            for (std::size_t k = 0; k < limit; ++k) {
                if (str[i + k] != str[j + k])
                    break;
                if (k > max_size) {
                    max_size = k;
                    max_pos = j;
                }
            }
        }
        max_array[i] = max_size;
        pos_array[i] = max_pos;
    }
}
<< "\n"; oneapi::tbb::tick_count serial_t0 = oneapi::tbb::tick_count::now(); SerialSubStringFinder(to_scan, max2, pos2); oneapi::tbb::tick_count serial_t1 = oneapi::tbb::tick_count::now(); std::cout << " Done with serial version." << "\n"; oneapi::tbb::tick_count parallel_t0 = oneapi::tbb::tick_count::now(); oneapi::tbb::parallel_for(oneapi::tbb::blocked_range(0, num_elem, 100), SubStringFinder(to_scan.c_str(), num_elem, &max1[0], &pos1[0])); oneapi::tbb::tick_count parallel_t1 = oneapi::tbb::tick_count::now(); std::cout << " Done with parallel version." << "\n"; for (std::size_t i = 0; i < num_elem; ++i) { if (max1[i] != max2[i] || pos1[i] != pos2[i]) { std::cout << "ERROR: Serial and Parallel Results are Different!" << "\n"; break; } } std::cout << " Done validating results." << "\n"; std::cout << "Serial version ran in " << (serial_t1 - serial_t0).seconds() << " seconds" << "\n" << "Parallel version ran in " << (parallel_t1 - parallel_t0).seconds() << " seconds" << "\n" << "Resulting in a speedup of " << (serial_t1 - serial_t0).seconds() / (parallel_t1 - parallel_t0).seconds() << "\n"; return 0; } ================================================ FILE: third-party/tbb/examples/getting_started/sub_string_finder/sub_string_finder_pretty.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include #include #include #include // std::max #include "oneapi/tbb/parallel_for.h" #include "oneapi/tbb/blocked_range.h" static const std::size_t N = 9; class SubStringFinder { const std::string &str; std::vector &max_array; std::vector &pos_array; public: void operator()(const oneapi::tbb::blocked_range &r) const { for (std::size_t i = r.begin(); i != r.end(); ++i) { std::size_t max_size = 0, max_pos = 0; for (std::size_t j = 0; j < str.size(); ++j) { if (j != i) { std::size_t limit = str.size() - (std::max)(i, j); for (std::size_t k = 0; k < limit; ++k) { if (str[i + k] != str[j + k]) break; if (k + 1 > max_size) { max_size = k + 1; max_pos = j; } } } } max_array[i] = max_size; pos_array[i] = max_pos; } } SubStringFinder(const std::string &s, std::vector &m, std::vector &p) : str(s), max_array(m), pos_array(p) {} }; int main() { std::string str[N] = { std::string("a"), std::string("b") }; for (std::size_t i = 2; i < N; ++i) str[i] = str[i - 1] + str[i - 2]; std::string &to_scan = str[N - 1]; const std::size_t num_elem = to_scan.size(); std::cout << "String to scan: " << to_scan << "\n"; std::vector max(num_elem); std::vector pos(num_elem); oneapi::tbb::parallel_for(oneapi::tbb::blocked_range(0, num_elem, 100), SubStringFinder(to_scan, max, pos)); for (std::size_t i = 0; i < num_elem; ++i) { for (std::size_t j = 0; j < num_elem; ++j) { if (j >= i && j < i + max[i]) std::cout << "_"; else std::cout << " "; } std::cout << "\n" << to_scan << "\n"; for (std::size_t j = 0; j < num_elem; ++j) { if (j >= pos[i] && j < pos[i] + max[i]) std::cout << "*"; else std::cout << " "; } std::cout << "\n"; } return 0; } ================================================ FILE: third-party/tbb/examples/graph/README.md ================================================ # Code Samples of oneAPI Threading Building Blocks (oneTBB) Examples using oneTBB Flow Graph feature. 
| Code sample name | Description |:--- |:--- | binpack | A solution to the binpacking problem using a `queue_node`, a `buffer_node` and `function_node`s. | cholesky | Several versions of Cholesky Factorization algorithm implementation. | dining_philosophers | An implementation of dining philosophers in graph using the reserving `join_node`. | fgbzip2 | A parallel implementation of bzip2 block-sorting file compressor. | logic_sim | An example of a collection of digital logic gates that can be easily composed into larger circuits. | som | An example of a Kohonen Self-Organizing Map using cancellation. ================================================ FILE: third-party/tbb/examples/graph/binpack/CMakeLists.txt ================================================ # Copyright (c) 2020-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
cmake_minimum_required(VERSION 3.5.0...3.31.3) project(binpack CXX) include(../../common/cmake/common.cmake) set_common_project_settings(tbb) add_executable(binpack binpack.cpp) target_link_libraries(binpack TBB::tbb Threads::Threads) target_compile_options(binpack PRIVATE ${TBB_CXX_STD_FLAG}) set(EXECUTABLE "$") set(ARGS 4 elements_num=1000) set(PERF_ARGS auto elements_num=1000 silent) add_execution_target(run_binpack binpack ${EXECUTABLE} "${ARGS}") add_execution_target(perf_run_binpack binpack ${EXECUTABLE} "${PERF_ARGS}") ================================================ FILE: third-party/tbb/examples/graph/binpack/README.md ================================================ # Binpack sample This directory contains a `oneapi::tbb::flow` example that performs binpacking of `N` integer values into a near-optimal number of bins of capacity `V`. It features an `input_node` which passes randomly generated integer values of `size <= V` to a `queue_node`. Multiple function nodes set about taking values from this `queue_node` and packing them into bins according to a best-fit policy. Items that cannot be made to fit are rejected and returned to the queue. When a bin is packed as well as it can be, it is passed to a `buffer_node` where it waits to be picked up by another `function_node`. This final function node gathers stats about the bin and optionally prints its contents. When all bins are accounted for, it optionally prints a summary of the quality of the bin-packing. ## Building the example ``` cmake <path_to_example> cmake --build . ``` ## Running the sample ### Predefined make targets * `make run_binpack` - executes the example with predefined parameters. * `make perf_run_binpack` - executes the example with suggested parameters to measure the oneTBB performance.
### Application parameters Usage: ``` binpack [#threads=value] [verbose] [silent] [elements_num=value] [bin_capacity=value] [#packers=value] [optimality=value] [-h] [#threads] ``` * `-h` - prints the help for command line options. * `#threads` - the number of threads to use; a range of the form low\[:high\] where low and optional high are non-negative integers, or `auto` for a platform-specific default number. * `verbose` - prints diagnostic output to screen. * `silent` - limits output to timing info; overrides verbose. * `elements_num` - number of values to pack (`N`). * `bin_capacity` - capacity of each bin (`V`). * `#packers` - number of concurrent bin packers to use (`default=#threads`). * `optimality` - controls optimality of solution; 1 is highest, use larger numbers for less optimal but faster solution. ================================================ FILE: third-party/tbb/examples/graph/binpack/binpack.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* Bin-packing algorithm that attempts to use minimal number of bins B of size desired_bin_capacity to contain elements_num items of varying sizes.
*/ #include #include #include #include #include #include #include #include "oneapi/tbb/tick_count.h" #include "oneapi/tbb/flow_graph.h" #include "oneapi/tbb/global_control.h" #include "common/utility/utility.hpp" #include "common/utility/get_default_num_threads.hpp" typedef std::size_t size_type; // to represent non-zero indices, capacities, etc. typedef std::size_t value_type; // the type of items we are attempting to pack into bins typedef std::vector bin; // we use a simple vector to represent a bin // Our bin packers will be function nodes in the graph that take value_type items and // return a dummy value. They will also implicitly send packed bins to the bin_buffer // node, and unused items back to the value_pool node: typedef oneapi::tbb::flow:: multifunction_node, oneapi::tbb::flow::rejecting> bin_packer; // Items are placed into a pool that all bin packers grab from, represent by a queue_node: typedef oneapi::tbb::flow::queue_node value_pool; // Packed bins are placed in this buffer waiting to be serially printed and/or accounted for: typedef oneapi::tbb::flow::buffer_node bin_buffer; // Packed bins are taken from the_bin_buffer and processed by the_writer: typedef oneapi::tbb::flow:: function_node bin_writer; // Items are injected into the graph when this node sends them to the_value_pool: typedef oneapi::tbb::flow::input_node value_source; // User-specified globals with default values size_type desired_bin_capacity = 42; size_type elements_num = 1000; // number of elements to generate bool verbose = false; // prints bin details and other diagnostics to screen bool silent = false; // suppress all output except for time int num_bin_packers = -1; // number of concurrent bin packers in operation; default is #threads; // larger values can result in more bins at less than full capacity size_type optimality = 1; // 1 (default) is highest the algorithm can obtain; larger numbers run faster // Calculated globals size_type bins_num_min; // lower bound on the 
// This class is the Body type for bin_packer.
//
// Each invocation receives one value from the pool and takes one of four paths:
//   1. the packer is already deactivated (`done`)  -> the value is forwarded
//      unchanged through output port 0;
//   2. the value is larger than desired_bin_capacity -> it is not an item but
//      the "signal" value (desired_bin_capacity + 1) that packers emit, and it
//      drives the shutdown / deactivation logic;
//   3. the value fits the remaining capacity and is at least `looking_for`
//      -> it is packed into my_bin;
//   4. otherwise -> the acceptance threshold is relaxed and the value is sent
//      back through output port 0.
// Output port 0 accepts value_type, output port 1 accepts whole bins (see the
// try_put calls below).
// The body reads and updates the file-level counters packed_sum, packed_items
// and active_bins and reads the global `bins` array.
class bin_filler {
    typedef bin_packer::output_ports_type ports_type;
    bin my_bin; // the current bin that this bin_filler is packing
    size_type
        my_used; // capacity of bin used by current contents (not to be confused with my_bin.size())
    // relaxation counter for determining when to settle for a non-full bin
    // NOTE(review): relax_val is initialised but not used anywhere in this class.
    size_type relax, relax_val;
    bin_packer* my_bin_packer; // ptr to the bin packer that this body object is associated with
    size_type bin_index; // index of the encapsulating bin packer in the global bins array
    value_type looking_for; // the minimum size of item this bin_packer will accept
    value_pool* the_value_pool; // the queue of incoming values
    bool done; // flag to indicate that this binpacker has been deactivated

public:
    // bidx: index of the owning bin_packer in the global `bins` array.
    // _q:   the value pool whose edge to the packer is removed on deactivation.
    // The packer pointer itself is resolved lazily on the first call, because
    // bins[bidx] is only assigned after this body has been constructed.
    bin_filler(std::size_t bidx, value_pool* _q)
            : my_used(0),
              relax(0),
              relax_val(0),
              my_bin_packer(nullptr),
              bin_index(bidx),
              looking_for(desired_bin_capacity),
              the_value_pool(_q),
              done(false) {}

    // item: value taken from the pool (a real item or the signal value).
    // p:    output ports; <0> takes values, <1> takes finished bins.
    void operator()(const value_type& item, ports_type& p) {
        if (!my_bin_packer)
            my_bin_packer = bins[bin_index];
        if (done) // this bin_packer is done packing items; put item back to pool
            std::get<0>(p).try_put(item);
        else if (
            item >
            desired_bin_capacity) { // signal that packed_sum has reached item_sum at some point
            // Post-decrement: `remaining` is the number of active packers
            // including this one, before it tentatively steps down.
            size_type remaining = active_bins--;
            if (remaining == 1 &&
                packed_sum == item_sum) { // this is the last bin and it has seen everything
                // this bin_packer may not have seen everything, so stay active
                if (my_used > 0)
                    std::get<1>(p).try_put(my_bin);
                my_bin.clear();
                my_used = 0;
                looking_for = desired_bin_capacity;
                ++active_bins; // undo the tentative decrement
            }
            else if (remaining == 1) { // this is the last bin, but there are remaining items
                std::get<0>(p).try_put(desired_bin_capacity + 1); // send out signal
                ++active_bins; // undo the tentative decrement
            }
            else if (remaining > 1) { // this is not the last bin; deactivate
                // this bin is ill-utilized; throw back items and deactivate
                if (my_used < desired_bin_capacity / (1 + optimality * .1)) {
                    packed_sum -= my_used;
                    packed_items -= my_bin.size();
                    for (size_type i = 0; i < my_bin.size(); ++i)
                        std::get<0>(p).try_put(my_bin[i]);
                    oneapi::tbb::flow::remove_edge(*the_value_pool, *my_bin_packer); // deactivate
                    done = true;
                    std::get<0>(p).try_put(desired_bin_capacity + 1); // send out signal
                }
                else { // this bin is well-utilized; send out bin and deactivate
                    oneapi::tbb::flow::remove_edge(*the_value_pool,
                                                   *my_bin_packer); // build no more bins
                    done = true;
                    if (my_used > 0)
                        std::get<1>(p).try_put(my_bin);
                    std::get<0>(p).try_put(desired_bin_capacity + 1); // send out signal
                }
            }
        }
        else if (item <= desired_bin_capacity - my_used &&
                 item >= looking_for) { // this item can be packed
            my_bin.push_back(item);
            my_used += item;
            packed_sum += item;
            ++packed_items;
            looking_for = desired_bin_capacity - my_used;
            relax = 0;
            if (packed_sum == item_sum) {
                std::get<0>(p).try_put(desired_bin_capacity + 1); // send out signal
            }
            if (my_used == desired_bin_capacity) {
                // bin is exactly full: ship it and start a fresh one
                std::get<1>(p).try_put(my_bin);
                my_bin.clear();
                my_used = 0;
                looking_for = desired_bin_capacity;
            }
        }
        else { // this item can't be packed; relax constraints
            ++relax;
            // this bin_packer has looked through enough items
            // NOTE(review): divides by the user-supplied `optimality`; a value
            // of 0 would divide by zero here - confirm it is validated upstream.
            if (relax >= (elements_num - packed_items) / optimality) {
                relax = 0;
                // NOTE(review): looking_for is unsigned; if it is already 0
                // when this line runs the decrement wraps around - confirm
                // that state cannot be reached.
                --looking_for; // accept a wider range of items
                if (looking_for == 0 && my_used < desired_bin_capacity / (1 + optimality * .1) &&
                    my_used > 0 && active_bins > 1) {
                    // this bin_packer is ill-utilized and can't find items; deactivate and throw back items
                    size_type remaining = active_bins--;
                    if (remaining > 1) { // not the last bin_packer
                        oneapi::tbb::flow::remove_edge(*the_value_pool,
                                                       *my_bin_packer); // deactivate
                        done = true;
                    }
                    else
                        active_bins++; // can't deactivate last bin_packer
                    packed_sum -= my_used;
                    packed_items -= my_bin.size();
                    for (size_type i = 0; i < my_bin.size(); ++i)
                        std::get<0>(p).try_put(my_bin[i]);
                    my_bin.clear();
                    my_used = 0;
                }
                else if (looking_for == 0 &&
                         (my_used >= desired_bin_capacity / (1 + optimality * .1) ||
                          active_bins == 1)) {
                    // this bin_packer can't find items but is well-utilized, so send it out and reset
                    std::get<1>(p).try_put(my_bin);
                    my_bin.clear();
                    my_used = 0;
                    looking_for = desired_bin_capacity;
                }
            }
            std::get<0>(p).try_put(item); // put unused item back to pool
        }
    }
};
print stats avg = avg / (double)bins_num; if (!silent) std::cout << "SUMMARY: #Bins used: " << bins_num << "; Avg size: " << avg << "; Max size: " << my_max << "; Min size: " << my_min << "\n" << " Lower bound on optimal #bins: " << bins_num_min << "; Start #bins: " << num_bin_packers << "\n"; } return oneapi::tbb::flow::continue_msg(); // need to return something } }; int main(int argc, char* argv[]) { utility::thread_number_range threads(utility::get_default_num_threads); utility::parse_cli_arguments( argc, argv, utility::cli_argument_pack() //"-h" option for displaying help is present implicitly .positional_arg(threads, "#threads", utility::thread_number_range_desc) .arg(verbose, "verbose", " print diagnostic output to screen") .arg(silent, "silent", " limits output to timing info; overrides verbose") .arg(elements_num, "elements_num", " number of values to pack") .arg(desired_bin_capacity, "bin_capacity", " capacity of each bin") .arg(num_bin_packers, "#packers", " number of concurrent bin packers to use " "(default=#threads)") .arg(optimality, "optimality", "controls optimality of solution; 1 is highest, use\n" " larger numbers for less optimal but faster solution")); if (silent) verbose = false; // make silent override verbose // Generate random input data srand(42); input_array.resize(elements_num); item_sum = 0; for (auto& item : input_array) { item = rand() % desired_bin_capacity + 1; // generate items that fit in a bin item_sum += item; } bins_num_min = (item_sum % desired_bin_capacity) ? 
item_sum / desired_bin_capacity + 1 : item_sum / desired_bin_capacity; oneapi::tbb::tick_count start = oneapi::tbb::tick_count::now(); for (int p = threads.first; p <= threads.last; p = threads.step(p)) { oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, p); packed_sum = 0; packed_items = 0; bins_num = 0; if (num_bin_packers == -1) num_bin_packers = p; active_bins = num_bin_packers; if (!silent) std::cout << "binpack running with " << item_sum << " capacity over " << elements_num << " items, optimality=" << optimality << ", " << num_bin_packers << " bins of capacity=" << desired_bin_capacity << " on " << p << " threads." << "\n"; oneapi::tbb::flow::graph g; value_source the_source(g, item_generator()); value_pool the_value_pool(g); oneapi::tbb::flow::make_edge(the_source, the_value_pool); bin_buffer the_bin_buffer(g); bins.resize(num_bin_packers); for (int i = 0; i < num_bin_packers; ++i) { bins[i] = new bin_packer(g, 1, bin_filler(i, &the_value_pool)); oneapi::tbb::flow::make_edge(the_value_pool, *(bins[i])); oneapi::tbb::flow::make_edge(oneapi::tbb::flow::output_port<0>(*(bins[i])), the_value_pool); oneapi::tbb::flow::make_edge(oneapi::tbb::flow::output_port<1>(*(bins[i])), the_bin_buffer); } bin_writer the_writer(g, 1, bin_printer()); make_edge(the_bin_buffer, the_writer); the_source.activate(); g.wait_for_all(); for (int i = 0; i < num_bin_packers; ++i) { delete bins[i]; } } utility::report_elapsed_time((oneapi::tbb::tick_count::now() - start).seconds()); return 0; } ================================================ FILE: third-party/tbb/examples/graph/cholesky/CMakeLists.txt ================================================ # Copyright (c) 2020-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
cmake_minimum_required(VERSION 3.5.0...3.31.3)

project(cholesky CXX)

include(../../common/cmake/common.cmake)

set_common_project_settings(tbb)

# TODO: Consider using FindMKL module
find_library(MKL_INTEL_LP64_LIB mkl_intel_lp64 PATHS ENV LIBRARY_PATH)
find_library(MKL_SEQUENTIAL_LIB mkl_sequential PATHS ENV LIBRARY_PATH)
find_library(MKL_CORE_LIB mkl_core PATHS ENV LIBRARY_PATH)

# All three libraries are linked below, so each of them has to be found.
# (With OR the check only fired when every one of them was missing.)
if(NOT (MKL_INTEL_LP64_LIB AND MKL_SEQUENTIAL_LIB AND MKL_CORE_LIB))
    message(FATAL_ERROR "Cannot find Intel(R) Math Kernel Library (Intel(R) MKL).")
endif()

add_executable(cholesky init.cpp cholesky.cpp)

target_link_libraries(cholesky
    ${MKL_INTEL_LP64_LIB}
    ${MKL_SEQUENTIAL_LIB}
    ${MKL_CORE_LIB}
    TBB::tbb
    Threads::Threads
)
target_compile_options(cholesky PRIVATE ${TBB_CXX_STD_FLAG})

# Full path of the built binary, resolved at generate time.
set(EXECUTABLE "$<TARGET_FILE:cholesky>")
set(ARGS 4 2)

add_execution_target(run_cholesky cholesky ${EXECUTABLE} "${ARGS}")
The same approach is parallelized for the other oneAPI Threading Building Blocks (oneTBB) based approaches below. **depend**: A parallel version of Crout-Cholesky factorization that uses the oneTBB flow graph. This version uses a dependency graph made solely of `continue_node` objects. This is an inspector-executor approach, where a loop nest that is similar to the serial implementation is used to create an unrolled version of the computation. Where the oneMKL calls would have been made in the original serial implementation of Crout-Cholesky, graph nodes are created instead and these nodes are linked by edges to the other nodes they are dependent upon. The resulting graph is relatively large, with a node for each instance of each oneMKL call. For example, there are many nodes that call `dtrsm`; one for each invocation of `dtrsm` in the serial implementation. There is very little overhead in message management for this version and so it is often the highest performing. **join**: A parallel version of Crout-Cholesky factorization that uses the oneTBB flow graph. This version uses a data flow approach. This is a small, compact graph that passes tiles along its edges. There is one node per type of oneMKL call, plus `join_node`s that combine the inputs required for each call. So for example, there is only a single node that applies all calls to `dtrsm`. This node is invoked when the tiles that hold the inputs and outputs for an invocation are matched together in the tag-matching `join_node` that precedes it. The tag represents the iteration values of the `i`, `j`, `k` loops in the serial implementation at that invocation of the call. There is some overhead in message matching and forwarding, so it may not perform as well as the dependency graph implementation. This sample code requires a oneTBB library and also the oneMKL library. ## Building the example ``` cmake <path_to_example> cmake --build . 
``` ## Running the sample ### Predefined make targets * `make run_cholesky` - executes the example with predefined parameters. ### Application parameters Usage: ``` cholesky [size=value] [blocksize=value] [num_trials=value] [output_prefix=value] [algorithm=value] [num_tbb_threads=value] [input_file=value] [-x] [-h] [size [blocksize [num_trials [output_prefix [algorithm [num_tbb_threads]]]]]] ``` * `-h` - prints the help for command line options. * `size` - the row/column size of `NxN` matrix (size <= 46000). * `blocksize` - the block size; size must be a multiple of the blocksize. * `num_trials` - the number of times to run each algorithm. * `output_prefix` - if provided the prefix will be prepended to output files: _posdef.txt and _X.txt; where `X` is the algorithm used. If `output_prefix` is not provided, no output will be written. * `algorithm` - name of the used algorithm - can be dpotrf, crout, depend or join. * `num_tbb_threads` - number of oneTBB threads. * `input_file` - input matrix (optional). If omitted, randomly generated values are used. * `-x` - skips all validation. ================================================ FILE: third-party/tbb/examples/graph/cholesky/cholesky.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
// Creates tiled array.
//
// Splits the row-major n x n matrix A into (n / b) x (n / b) square tiles of
// edge b and returns them as tile[j][i], where j is the tile row and i the
// tile column; every tile is its own row-major b x b heap block.
// If n is not a multiple of b the trailing n % b rows and columns are not
// copied. Returns nullptr when b <= 0 or n < b (nothing to tile).
// The result is released by collapse_tile_array().
// Allocation failures are not checked.
static double ***create_tile_array(double *A, int n, int b) {
    if (b <= 0 || n < b)
        return nullptr;

    // Index arithmetic is done in std::size_t: row * n overflows int once n
    // exceeds about 46340.
    const std::size_t tiles = static_cast<std::size_t>(n / b);
    const std::size_t stride = static_cast<std::size_t>(n);
    const std::size_t edge = static_cast<std::size_t>(b);

    double ***tile = static_cast<double ***>(calloc(tiles, sizeof(double **)));
    for (std::size_t j = 0; j < tiles; ++j) {
        tile[j] = static_cast<double **>(calloc(tiles, sizeof(double *)));
        for (std::size_t i = 0; i < tiles; ++i) {
            double *block = static_cast<double *>(calloc(edge * edge, sizeof(double)));
            // Top-left element of tile (j, i) inside A.
            const double *src = A + (j * edge) * stride + i * edge;
            for (std::size_t row = 0; row < edge; ++row) {
                memcpy(block + row * edge, src + row * stride, edge * sizeof(double));
            }
            tile[j][i] = block;
        }
    }
    return tile;
}

// Inverse of create_tile_array(): copies every tile back to its place in the
// row-major n x n matrix A and frees the tiles, the row tables and the top
// level table. n and b must be the values the array was created with.
// A null `tile` or b <= 0 is a no-op.
static void collapse_tile_array(double ***tile, double *A, int n, int b) {
    if (tile == nullptr || b <= 0)
        return;

    const std::size_t tiles = static_cast<std::size_t>(n / b);
    const std::size_t stride = static_cast<std::size_t>(n);
    const std::size_t edge = static_cast<std::size_t>(b);

    for (std::size_t j = 0; j < tiles; ++j) {
        for (std::size_t i = 0; i < tiles; ++i) {
            double *block = tile[j][i];
            double *dst = A + (j * edge) * stride + i * edge;
            for (std::size_t row = 0; row < edge; ++row) {
                memcpy(dst + row * stride, block + row * edge, edge * sizeof(double));
            }
            free(block);
            tile[j][i] = nullptr;
        }
        free(tile[j]);
    }
    free(tile);
}
else { memcpy(A0, A, sizeof(double) * n * n); t0 = oneapi::tbb::tick_count::now(); func(A0, n, b); t1 = oneapi::tbb::tick_count::now(); } if (i) elapsed_time += (t1 - t0).seconds(); if (!g_benchmark_run && !check_if_valid(A0, C, A, n)) { if (g_output_prefix) { std::string s(g_output_prefix); s += "_" + name + ".txt"; matrix_write(A0, g_n, s.c_str(), true); free(A0); free(C); return 0.; } } } if (g_output_prefix) { std::string s(g_output_prefix); s += "_" + name + ".txt"; matrix_write(A0, g_n, s.c_str(), true); } printf("%s %d %d %d %d %lf %lf\n", name.c_str(), g_num_tbb_threads, trials, n, b, elapsed_time, elapsed_time / trials); free(A0); free(C); return elapsed_time; } protected: // Main algorithm body function must be defined in any direved class virtual void func(void *ptr, int n, int b) = 0; }; /***********************************************************/ static void call_dpotf2(double ***tile, int b, int k) { double *A_block = tile[k][k]; char uplo = 'l'; int info = 0; dpotf2(&uplo, &b, A_block, &b, &info); return; } static void call_dtrsm(double ***tile, int b, int k, int j) { double *A_block = tile[k][j]; double *L_block = tile[k][k]; char uplo = 'l', side = 'r', transa = 't', diag = 'n'; double alpha = 1; dtrsm(&side, &uplo, &transa, &diag, &b, &b, &alpha, L_block, &b, A_block, &b); return; } static void call_dsyr2k(double ***tile, int b, int k, int j, int i) { double *A_block = tile[i][j]; char transa = 'n', transb = 't'; char uplo = 'l'; double alpha = -1; double beta = 1; if (i == j) { // Diagonal block double *L_block = tile[k][i]; dsyrk(&uplo, &transa, &b, &b, &alpha, L_block, &b, &beta, A_block, &b); } else { // Non-diagonal block double *L2_block = tile[k][i]; double *L1_block = tile[k][j]; dgemm(&transa, &transb, &b, &b, &b, &alpha, L1_block, &b, L2_block, &b, &beta, A_block, &b); } return; } class algorithm_crout : public algorithm { public: algorithm_crout() : algorithm("crout_cholesky", true) {} protected: virtual void func(void *ptr, int n, int 
// Tag attached to every tile travelling through the data-join graph.
//
// a[0], a[1] and a[2] carry the tile indices (k, j, i); `tag` views the same
// storage as one integer and is what the join nodes use as their matching key.
//
// The index bytes are deliberately *unsigned*: the command line accepts up to
// 256 tiles per side, i.e. indices 0..255. With plain `char` (signed on many
// platforms) any index >= 128 would read back as a negative int and corrupt
// the loop bounds computed from it in the node bodies.
typedef union {
    unsigned char a[4];
    std::size_t tag;
} tag_t;
A_block = in1.second; tag_t t; t.tag = 0; t.a[0] = k; char uplo = 'l', side = 'r', transa = 't', diag = 'n'; double alpha = 1; dtrsm(&side, &uplo, &transa, &diag, &b, &b, &alpha, L_block, &b, A_block, &b); // Send to rest of my row t.a[1] = j; for (int i = k + 1; i <= j; ++i) { t.a[2] = i; std::get<0>(ports).try_put(std::make_pair(t, A_block)); } // Send to transposed row t.a[2] = j; for (int i = j; i < p; ++i) { t.a[1] = i; std::get<1>(ports).try_put(std::make_pair(t, A_block)); } } }; class dsyr2k_body { int p; int b; public: dsyr2k_body(int p_, int b_) : p(p_), b(b_) {} void operator()(const t3_t &in, dsyr2k_node_t::output_ports_type &ports) { tag_t t; t.tag = 0; char transa = 'n', transb = 't'; char uplo = 'l'; double alpha = -1; double beta = 1; tagged_tile_t in0 = std::get<0>(in); tagged_tile_t in1 = std::get<1>(in); tagged_tile_t in2 = std::get<2>(in); int k = in2.first.a[0]; int j = in2.first.a[1]; int i = in2.first.a[2]; tile_t A_block = in2.second; if (i == j) { // Diagonal block tile_t L_block = in0.second; dsyrk(&uplo, &transa, &b, &b, &alpha, L_block, &b, &beta, A_block, &b); } else { // Non-diagonal block tile_t L1_block = in0.second; tile_t L2_block = in1.second; dgemm(&transa, &transb, &b, &b, &b, &alpha, L1_block, &b, L2_block, &b, &beta, A_block, &b); } // All outputs flow to next step t.a[0] = k + 1; t.a[1] = j; t.a[2] = i; if (k != p - 1 && j == k + 1 && i == k + 1) { std::get<0>(ports).try_put(std::make_pair(t, A_block)); } if (k < p - 2) { if (i == k + 1 && j > i) { t.a[0] = k + 1; t.a[1] = j; std::get<1>(ports).try_put(std::make_pair(t, A_block)); } if (j != k + 1 && i != k + 1) { t.a[0] = k + 1; t.a[1] = j; t.a[2] = i; std::get<2>(ports).try_put(std::make_pair(t, A_block)); } } } }; struct tagged_tile_to_size_t { std::size_t operator()(const tagged_tile_t &t) { return t.first.tag; } }; class algorithm_join : public algorithm { public: algorithm_join() : algorithm("data_join_cholesky", true) {} protected: virtual void func(void *ptr, int n, 
int b) { using oneapi::tbb::flow::unlimited; using oneapi::tbb::flow::output_port; using oneapi::tbb::flow::input_port; double ***tile = (double ***)ptr; const int p = n / b; oneapi::tbb::flow::graph g; dpotf2_node_t dpotf2_node(g, unlimited, dpotf2_body(p, b)); dtrsm_node_t dtrsm_node(g, unlimited, dtrsm_body(p, b)); dsyr2k_node_t dsyr2k_node(g, unlimited, dsyr2k_body(p, b)); dtrsm_join_t dtrsm_join(g, tagged_tile_to_size_t(), tagged_tile_to_size_t()); dsyr2k_join_t dsyr2k_join( g, tagged_tile_to_size_t(), tagged_tile_to_size_t(), tagged_tile_to_size_t()); make_edge(output_port<0>(dsyr2k_node), dpotf2_node); make_edge(output_port<0>(dpotf2_node), input_port<0>(dtrsm_join)); make_edge(output_port<1>(dsyr2k_node), input_port<1>(dtrsm_join)); make_edge(dtrsm_join, dtrsm_node); make_edge(output_port<0>(dtrsm_node), input_port<0>(dsyr2k_join)); make_edge(output_port<1>(dtrsm_node), input_port<1>(dsyr2k_join)); make_edge(output_port<2>(dsyr2k_node), input_port<2>(dsyr2k_join)); make_edge(dsyr2k_join, dsyr2k_node); // Now we need to send out the tiles to their first nodes tag_t t; t.tag = 0; t.a[0] = 0; t.a[1] = 0; t.a[2] = 0; // Send to feedback input of first dpotf2 // k == 0, j == 0, i == 0 dpotf2_node.try_put(std::make_pair(t, tile[0][0])); // Send to feedback input (port 1) of each dtrsm // k == 0, j == 1..p-1 for (int j = 1; j < p; ++j) { t.a[1] = j; input_port<1>(dtrsm_join).try_put(std::make_pair(t, tile[0][j])); } // Send to feedback input (port 2) of each dsyr2k // k == 0 for (int i = 1; i < p; ++i) { t.a[2] = i; for (int j = i; j < p; ++j) { t.a[1] = j; input_port<2>(dsyr2k_join).try_put(std::make_pair(t, tile[i][j])); } } g.wait_for_all(); } }; /************************************************************ End data join graph based version of cholesky ************************************************************/ /************************************************************ Begin dependence graph based version of cholesky 
************************************************************/ typedef oneapi::tbb::flow::continue_node continue_type; typedef continue_type *continue_ptr_type; class algorithm_depend : public algorithm { public: algorithm_depend() : algorithm("depend_cholesky", true) {} protected: virtual void func(void *ptr, int n, int b) { double ***tile = (double ***)ptr; const int p = n / b; continue_ptr_type *c = new continue_ptr_type[p]; continue_ptr_type **t = new continue_ptr_type *[p]; continue_ptr_type ***u = new continue_ptr_type **[p]; oneapi::tbb::flow::graph g; for (int k = p - 1; k >= 0; --k) { c[k] = new continue_type(g, [=](const oneapi::tbb::flow::continue_msg &) { call_dpotf2(tile, b, k); }); t[k] = new continue_ptr_type[p]; u[k] = new continue_ptr_type *[p]; for (int j = k + 1; j < p; ++j) { t[k][j] = new continue_type(g, [=](const oneapi::tbb::flow::continue_msg &) { call_dtrsm(tile, b, k, j); }); make_edge(*c[k], *t[k][j]); u[k][j] = new continue_ptr_type[p]; for (int i = k + 1; i <= j; ++i) { u[k][j][i] = new continue_type(g, [=](const oneapi::tbb::flow::continue_msg &) { call_dsyr2k(tile, b, k, j, i); }); if (k < p - 2 && k + 1 != j && k + 1 != i) { make_edge(*u[k][j][i], *u[k + 1][j][i]); } make_edge(*t[k][j], *u[k][j][i]); if (i != j) { make_edge(*t[k][i], *u[k][j][i]); } if (k < p - 2 && j > i && i == k + 1) { make_edge(*u[k][j][i], *t[i][j]); } } } if (k != p - 1) { make_edge(*u[k][k + 1][k + 1], *c[k + 1]); } } c[0]->try_put(oneapi::tbb::flow::continue_msg()); g.wait_for_all(); for (int k = p - 1; k >= 0; --k) { for (int j = k + 1; j < p; ++j) { for (int i = k + 1; i <= j; ++i) { delete u[k][j][i]; } delete t[k][j]; delete[] u[k][j]; } delete c[k]; delete[] t[k]; delete[] u[k]; } delete[] c; delete[] t; delete[] u; } }; // class algorithm_depend /************************************************************ End dependence graph based version of cholesky ************************************************************/ bool process_args(int argc, char 
*argv[]) { utility::parse_cli_arguments( argc, argv, utility::cli_argument_pack() //"-h" option for displaying help is present implicitly .positional_arg(g_n, "size", "the row/column size of NxN matrix (size <= 46000)") .positional_arg( g_b, "blocksize", "the block size; size must be a multiple of the blocksize") .positional_arg(g_num_trials, "num_trials", "the number of times to run each algorithm") .positional_arg( g_output_prefix, "output_prefix", "if provided the prefix will be preappended to output files:\n" " output_prefix_posdef.txt\n" " output_prefix_X.txt; where X is the algorithm used\n" " if output_prefix is not provided, no output will be written") .positional_arg(g_alg_name, "algorithm", "name of the used algorithm - can be dpotrf, crout, depend or join") .positional_arg(g_num_tbb_threads, "num_tbb_threads", "number of started TBB threads") .arg(g_input_file_name, "input_file", "if provided it will be read to get the input matrix") .arg(g_benchmark_run, "-x", "skips all validation")); if (g_n > 46000) { printf("ERROR: invalid 'size' value (must be less or equal 46000): %d\n", g_n); return false; } if (g_n % g_b != 0) { printf("ERROR: size %d must be a multiple of the blocksize %d\n", g_n, g_b); return false; } if (g_n / g_b > 256) { // Because tile index size is 1 byte only in tag_t type printf("ERROR: size / blocksize must be less or equal 256, but %d / %d = %d\n", g_n, g_b, g_n / g_b); return false; } if (g_b == -1 || (g_n == -1 && g_input_file_name == nullptr)) { return false; } return true; } int main(int argc, char *argv[]) { g_num_tbb_threads = utility::get_default_num_threads(); typedef std::map algmap_t; algmap_t algmap; // Init algorithms algmap.insert(std::pair("dpotrf", new algorithm_dpotrf)); algmap.insert(std::pair("crout", new algorithm_crout)); algmap.insert(std::pair("depend", new algorithm_depend)); algmap.insert(std::pair("join", new algorithm_join)); if (!process_args(argc, argv)) { printf("ERROR: Invalid arguments. 
Run: %s -h\n", argv[0]); return -1; } oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, g_num_tbb_threads); double *A = nullptr; // Read input matrix matrix_init(A, g_n, g_input_file_name); // Write input matrix if output_prefix is set and we didn't read from a file if (!g_input_file_name && g_output_prefix) { std::string s(g_output_prefix); s += "_posdef.txt"; matrix_write(A, g_n, s.c_str()); } if (g_alg_name.empty()) { for (algmap_t::iterator i = algmap.begin(); i != algmap.end(); ++i) { algorithm *const alg = i->second; (*alg)(A, g_n, g_b, g_num_trials); } } else { algmap_t::iterator alg_iter = algmap.find(g_alg_name); if (alg_iter != algmap.end()) { algorithm *const alg = alg_iter->second; (*alg)(A, g_n, g_b, g_num_trials); } else { printf("ERROR: Invalid algorithm name: %s\n", g_alg_name.c_str()); return -1; } } free(A); return 0; } ================================================ FILE: third-party/tbb/examples/graph/cholesky/init.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include #include #include #include static void posdef_gen(double *A, int n) { /* Allocate memory for the matrix and its transpose */ double *L = (double *)calloc(sizeof(double), n * n); assert(L); double *LT = (double *)calloc(sizeof(double), n * n); assert(LT); memset(A, 0, sizeof(double) * n * n); /* Generate a conditioned matrix and fill it with random numbers */ for (int j = 0; j < n; ++j) { for (int k = 0; k < j; ++k) { // The initial value has to be between [0,1]. L[k * n + j] = (((j * k) / ((double)(j + 1)) / ((double)(k + 2)) * 2.0) - 1.0) / ((double)n); } L[j * n + j] = 1; } /* Compute transpose of the matrix */ for (int i = 0; i < n; ++i) { for (int j = 0; j < n; ++j) { LT[j * n + i] = L[i * n + j]; } } cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n, 1, L, n, LT, n, 0, A, n); free(L); free(LT); } // Read the matrix from the input file void matrix_init(double *&A, int &n, const char *fname) { if (fname) { int i; int j; FILE *fp; fp = fopen(fname, "r"); if (fp == nullptr) { fprintf(stderr, "\nFile does not exist\n"); std::exit(0); } if (fscanf(fp, "%d", &n) <= 0) { fprintf(stderr, "\nCouldn't read n from %s\n", fname); std::exit(-1); } A = (double *)calloc(sizeof(double), n * n); for (i = 0; i < n; ++i) { for (j = 0; j <= i; ++j) { if (fscanf(fp, "%lf ", &A[i * n + j]) <= 0) { fprintf(stderr, "\nMatrix size incorrect %i %i\n", i, j); std::exit(-1); } if (i != j) { A[j * n + i] = A[i * n + j]; } } } fclose(fp); } else { A = (double *)calloc(sizeof(double), n * n); posdef_gen(A, n); } } // write matrix to file void matrix_write(double *A, int n, const char *fname, bool is_triangular = false) { if (fname) { int i = 0; int j = 0; FILE *fp = nullptr; fp = fopen(fname, "w"); if (fp == nullptr) { fprintf(stderr, "\nCould not open file %s for writing.\n", fname); std::exit(0); } fprintf(fp, "%d\n", n); for (i = 0; i < n; ++i) { for (j = 0; j <= i; ++j) { fprintf(fp, "%lf ", A[j * n + i]); } if (!is_triangular) { for (; j < n; ++j) { 
fprintf(fp, "%lf ", A[i * n + j]); } } else { for (; j < n; ++j) { fprintf(fp, "%lf ", 0.0); } } fprintf(fp, "\n"); } if (is_triangular) { fprintf(fp, "\n"); for (i = 0; i < n; ++i) { for (j = 0; j < i; ++j) { fprintf(fp, "%lf ", 0.0); } for (; j < n; ++j) { fprintf(fp, "%lf ", A[i * n + j]); } fprintf(fp, "\n"); } } fclose(fp); } } ================================================ FILE: third-party/tbb/examples/graph/dining_philosophers/CMakeLists.txt ================================================ # Copyright (c) 2020-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
cmake_minimum_required(VERSION 3.5.0...3.31.3) project(dining_philosophers CXX) include(../../common/cmake/common.cmake) set_common_project_settings(tbb) add_executable(dining_philosophers dining_philosophers.cpp) target_link_libraries(dining_philosophers TBB::tbb Threads::Threads) target_compile_options(dining_philosophers PRIVATE ${TBB_CXX_STD_FLAG}) set(EXECUTABLE "$") set(ARGS auto 5) set(LIGHT_ARGS auto 3) add_execution_target(run_dining_philosophers dining_philosophers ${EXECUTABLE} "${ARGS}") add_execution_target(light_test_dining_philosophers dining_philosophers ${EXECUTABLE} "${LIGHT_ARGS}") ================================================ FILE: third-party/tbb/examples/graph/dining_philosophers/README.md ================================================ # Dining_philosophers sample The Dining Philosophers problem demonstrates `oneapi::tbb::flow` and the use of the reserving `join_node` to solve the potential deadlock. This program runs some number of philosophers in parallel, each thinking and then waiting for chopsticks to be available before eating. Eating and thinking are implemented with `sleep()`. The chopstick positions are represented by a `queue_node` with one item. ## Building the example ``` cmake cmake --build . ``` ## Running the sample ### Predefined make targets * `make run_dining_philosophers` - executes the example with predefined parameters. * `make light_test_dining_philosophers` - executes the example with suggested parameters to reduce execution time. ### Application parameters Usage: ``` dining_philosophers [n-of_threads=value] [n-of-philosophers=value] [verbose] [-h] [n-of_threads [n-of-philosophers]] ``` * `-h` - prints the help for command line options. * `n-of_threads` - number of threads to use; a range of the form low\[:high\[:(+|*|#)step\]\], where low and optional high are non-negative integers or 'auto' for the default choice, and optional step expression specifies how thread numbers are chosen within the range. 
* `n-of-philosophers` - how many philosophers, from 2-26. * `verbose` - prints diagnostic output to screen. ================================================ FILE: third-party/tbb/examples/graph/dining_philosophers/dining_philosophers.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #if _MSC_VER // Suppress "decorated name length exceeded, name was truncated" warning #pragma warning(disable : 4503) #endif #include #include #include #include #include #include #include "oneapi/tbb/flow_graph.h" #include "oneapi/tbb/tick_count.h" #include "oneapi/tbb/spin_mutex.h" #include "oneapi/tbb/global_control.h" #include "common/utility/utility.hpp" #include "common/utility/get_default_num_threads.hpp" // Each philosopher is an object, and is invoked in the think() function_node, the // eat() function_node and forward() multifunction_node. // // The graph is constructed, and each think() function_node is started with a continue_msg. // // The philosopher will think, then gather two chopsticks, eat, place the chopsticks back, // and if they have not completed the required number of cycles, will start to think() again // by sending a continue_msg to their corresponding think() function_node. // // The reserving join has as its inputs the left and right chopstick queues an a queue // that stores the continue_msg emitted by the function_node after think()ing is done. 
// When all three inputs are available, a tuple of the inputs will be forwarded to the // eat() function_node. The output of the eat() function_node is sent to the forward() // multifunction_node. const std::chrono::seconds think_time(1); const std::chrono::seconds eat_time(1); const int num_times = 10; oneapi::tbb::tick_count t0; bool verbose = false; const char *names[] = { "Archimedes", "Bakunin", "Confucius", "Democritus", "Euclid", "Favorinus", "Geminus", "Heraclitus", "Ichthyas", "Jason of Nysa", "Kant", "Lavrov", "Metrocles", "Nausiphanes", "Onatas", "Phaedrus", "Quillot", "Russell", "Socrates", "Thales", "Udayana", "Vernadsky", "Wittgenstein", "Xenophilus", "Yen Yuan", "Zenodotus" }; const int NumPhilosophers = sizeof(names) / sizeof(char *); struct RunOptions { utility::thread_number_range threads; int number_of_philosophers; bool silent; RunOptions(utility::thread_number_range threads_, int number_of_philosophers_, bool silent_) : threads(threads_), number_of_philosophers(number_of_philosophers_), silent(silent_) {} }; RunOptions ParseCommandLine(int argc, char *argv[]) { int auto_threads = utility::get_default_num_threads(); utility::thread_number_range threads( utility::get_default_num_threads, auto_threads, auto_threads); int nPhilosophers = 5; bool verbose = false; char charbuf[100]; std::sprintf(charbuf, "%d", NumPhilosophers); std::string pCount = "how many philosophers, from 2-"; pCount += charbuf; utility::cli_argument_pack cli_pack; cli_pack.positional_arg(threads, "n-of_threads", utility::thread_number_range_desc) .positional_arg(nPhilosophers, "n-of-philosophers", pCount) .arg(verbose, "verbose", "verbose output"); utility::parse_cli_arguments(argc, argv, cli_pack); if (nPhilosophers < 2 || nPhilosophers > NumPhilosophers) { std::cout << "Number of philosophers (" << nPhilosophers << ") out of range [2:" << NumPhilosophers << "]\n"; std::cout << cli_pack.usage_string(argv[0]) << std::flush; std::exit(-1); } return RunOptions(threads, 
nPhilosophers, !verbose); } oneapi::tbb::spin_mutex my_mutex; class chopstick {}; typedef std::tuple join_output; typedef oneapi::tbb::flow::join_node join_node_type; typedef oneapi::tbb::flow::function_node think_node_type; typedef oneapi::tbb::flow::function_node eat_node_type; typedef oneapi::tbb::flow::multifunction_node forward_node_type; class philosopher { public: philosopher(const char *name) : my_name(name), my_count(num_times) {} ~philosopher() {} void check(); const char *name() const { return my_name; } private: friend std::ostream &operator<<(std::ostream &o, philosopher const &p); const char *my_name; int my_count; friend class think_node_body; friend class eat_node_body; friend class forward_node_body; void think(); void eat(); void forward(const oneapi::tbb::flow::continue_msg &in, forward_node_type::output_ports_type &out_ports); }; std::ostream &operator<<(std::ostream &o, philosopher const &p) { o << "< philosopher[" << reinterpret_cast(const_cast(&p)) << "] " << p.name() << ", my_count=" << p.my_count; return o; } class think_node_body { philosopher &my_philosopher; public: think_node_body(philosopher &p) : my_philosopher(p) {} think_node_body(const think_node_body &other) : my_philosopher(other.my_philosopher) {} oneapi::tbb::flow::continue_msg operator()(oneapi::tbb::flow::continue_msg /*m*/) { my_philosopher.think(); return oneapi::tbb::flow::continue_msg(); } }; class eat_node_body { philosopher &my_philosopher; public: eat_node_body(philosopher &p) : my_philosopher(p) {} eat_node_body(const eat_node_body &other) : my_philosopher(other.my_philosopher) {} oneapi::tbb::flow::continue_msg operator()(const join_output &in) { my_philosopher.eat(); return oneapi::tbb::flow::continue_msg(); } }; class forward_node_body { philosopher &my_philosopher; public: forward_node_body(philosopher &p) : my_philosopher(p) {} forward_node_body(const forward_node_body &other) : my_philosopher(other.my_philosopher) {} void operator()(const 
oneapi::tbb::flow::continue_msg &in, forward_node_type::output_ports_type &out) { my_philosopher.forward(in, out); } }; void philosopher::check() { if (my_count != 0) { std::printf("ERROR: philosopher %s still had to run %d more times\n", name(), my_count); std::exit(-1); } } void philosopher::forward(const oneapi::tbb::flow::continue_msg & /*in*/, forward_node_type::output_ports_type &out_ports) { if (my_count < 0) abort(); --my_count; (void)std::get<1>(out_ports).try_put(chopstick()); (void)std::get<2>(out_ports).try_put(chopstick()); if (my_count > 0) { (void)std::get<0>(out_ports).try_put( oneapi::tbb::flow::continue_msg()); //start thinking again } else { if (verbose) { oneapi::tbb::spin_mutex::scoped_lock lock(my_mutex); std::printf("%s has left the building\n", name()); } } } void philosopher::eat() { if (verbose) { oneapi::tbb::spin_mutex::scoped_lock lock(my_mutex); std::printf("%s eating\n", name()); } std::this_thread::sleep_for(eat_time); if (verbose) { oneapi::tbb::spin_mutex::scoped_lock lock(my_mutex); std::printf("%s done eating\n", name()); } } void philosopher::think() { if (verbose) { oneapi::tbb::spin_mutex::scoped_lock lock(my_mutex); std::printf("%s thinking\n", name()); } std::this_thread::sleep_for(think_time); if (verbose) { oneapi::tbb::spin_mutex::scoped_lock lock(my_mutex); std::printf("%s done thinking\n", name()); } } typedef oneapi::tbb::flow::queue_node thinking_done_type; int main(int argc, char *argv[]) { using oneapi::tbb::flow::make_edge; using oneapi::tbb::flow::input_port; using oneapi::tbb::flow::output_port; oneapi::tbb::tick_count main_time = oneapi::tbb::tick_count::now(); int num_threads; int num_philosophers; RunOptions options = ParseCommandLine(argc, argv); num_philosophers = options.number_of_philosophers; verbose = !options.silent; for (num_threads = options.threads.first; num_threads <= options.threads.last; num_threads = options.threads.step(num_threads)) { oneapi::tbb::global_control 
c(oneapi::tbb::global_control::max_allowed_parallelism, num_threads); oneapi::tbb::flow::graph g; if (verbose) { std::cout << "\n" << num_philosophers << " philosophers with " << num_threads << " threads" << "\n" << "\n"; } t0 = oneapi::tbb::tick_count::now(); std::vector> places( num_philosophers, oneapi::tbb::flow::queue_node(g)); std::vector philosophers; philosophers.reserve(num_philosophers); std::vector think_nodes; think_nodes.reserve(num_philosophers); std::vector done_vector(num_philosophers, thinking_done_type(g)); std::vector join_vector(num_philosophers, join_node_type(g)); std::vector eat_nodes; eat_nodes.reserve(num_philosophers); std::vector forward_nodes; forward_nodes.reserve(num_philosophers); for (int i = 0; i < num_philosophers; ++i) { places[i].try_put(chopstick()); philosophers.push_back( philosopher(names[i])); // allowed because of default generated assignment if (verbose) { oneapi::tbb::spin_mutex::scoped_lock lock(my_mutex); std::cout << "Built philosopher " << philosophers[i] << "\n"; } think_nodes.push_back(new think_node_type( g, oneapi::tbb::flow::unlimited, think_node_body(philosophers[i]))); eat_nodes.push_back( new eat_node_type(g, oneapi::tbb::flow::unlimited, eat_node_body(philosophers[i]))); forward_nodes.push_back(new forward_node_type( g, oneapi::tbb::flow::unlimited, forward_node_body(philosophers[i]))); } // attach chopstick buffers and think function_nodes to joins for (int i = 0; i < num_philosophers; ++i) { make_edge(*think_nodes[i], done_vector[i]); make_edge(done_vector[i], input_port<0>(join_vector[i])); make_edge(places[i], input_port<1>(join_vector[i])); // left chopstick make_edge(places[(i + 1) % num_philosophers], input_port<2>(join_vector[i])); // right chopstick make_edge(join_vector[i], *eat_nodes[i]); make_edge(*eat_nodes[i], *forward_nodes[i]); make_edge(output_port<0>(*forward_nodes[i]), *think_nodes[i]); make_edge(output_port<1>(*forward_nodes[i]), places[i]); make_edge(output_port<2>(*forward_nodes[i]), 
places[(i + 1) % num_philosophers]); } // start all the philosophers thinking for (int i = 0; i < num_philosophers; ++i) think_nodes[i]->try_put(oneapi::tbb::flow::continue_msg()); g.wait_for_all(); oneapi::tbb::tick_count t1 = oneapi::tbb::tick_count::now(); if (verbose) std::cout << "\n" << num_philosophers << " philosophers with " << num_threads << " threads have taken " << (t1 - t0).seconds() << "seconds" << "\n"; for (int i = 0; i < num_philosophers; ++i) philosophers[i].check(); for (int i = 0; i < num_philosophers; ++i) { delete think_nodes[i]; delete eat_nodes[i]; delete forward_nodes[i]; } } utility::report_elapsed_time((oneapi::tbb::tick_count::now() - main_time).seconds()); return 0; } ================================================ FILE: third-party/tbb/examples/graph/fgbzip2/CMakeLists.txt ================================================ # Copyright (c) 2020-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
cmake_minimum_required(VERSION 3.5.0...3.31.3) project(fgbzip2 CXX) include(../../common/cmake/common.cmake) set_common_project_settings(tbb) add_executable(fgbzip2 blocksort.cpp bzlib.cpp compress.cpp crctable.cpp decompress.cpp fgbzip2.cpp huffman.cpp randtable.cpp ) target_link_libraries(fgbzip2 TBB::tbb Threads::Threads) target_compile_options(fgbzip2 PRIVATE ${TBB_CXX_STD_FLAG}) if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL IntelLLVM) target_compile_options(fgbzip2 PRIVATE -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_DEPRECATE) endif() if (MSVC AND (CMAKE_CXX_COMPILER_ID STREQUAL Intel OR CMAKE_CXX_COMPILER_ID STREQUAL IntelLLVM)) if (COMMAND target_link_options) target_link_options(fgbzip2 PRIVATE /FORCE:MULTIPLE /INCREMENTAL:NO) else() set_target_properties(fgbzip2 PROPERTIES LINK_FLAGS /FORCE:MULTIPLE /INCREMENTAL:NO) endif() endif() set(EXECUTABLE "$") set(ARGS -b=9 -async "$") add_execution_target(run_fgbzip2 fgbzip2 ${EXECUTABLE} "${ARGS}") add_execution_target(perf_run_fgbzip2 fgbzip2 ${EXECUTABLE} "${ARGS}") ================================================ FILE: third-party/tbb/examples/graph/fgbzip2/README.md ================================================ # fgbzip2 sample fgbzip2 is a parallel implementation of bzip2 block-sorting file compressor that uses `oneapi::tbb::flow`. The output of this application is fully compatible with bzip2 v1.0.6 or newer. This example includes software developed by Julian R Seward. See here for copyright information. It exemplifies support for asynchronous capabilities in the flow graph API. ## Building the example ``` cmake cmake --build . ``` ## Running the sample ### Predefined make targets * `make run_fgbzip2` - executes the example with predefined parameters. * `make perf_run_fgbzip2` - executes the example with suggested parameters to measure the oneTBB performance. 
### Application parameters Usage: ``` fgbzip2 [-b=value] [-v] [-l=value] [-async] [filename=value] [-h] [filename] ``` * `-h` - prints the help for command line options. * `-b` - block size in 100 KB chunks, [1 .. 9]. * `-v` - prints diagnostic output to screen. * `-l` - use memory limit for compression algorithm with 1 MB (minimum) granularity. * `-async` - use graph `async_node`-based implementation. * `filename` - name of the file to compress. ================================================ FILE: third-party/tbb/examples/graph/fgbzip2/blocksort.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /*-------------------------------------------------------------*/ /*--- Block sorting machinery ---*/ /*--- blocksort.cpp ---*/ /*-------------------------------------------------------------*/ /* ------------------------------------------------------------------ The original source for this example: This file is part of bzip2/libbzip2, a program and library for lossless, block-sorting data compression. bzip2/libbzip2 version 1.0.6 of 6 September 2010 Copyright (C) 1996-2010 Julian Seward This program, "bzip2", the associated library "libbzip2", and all documentation, are copyright (C) 1996-2010 Julian R Seward. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 3. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 4. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
Julian Seward, jseward@bzip.org bzip2/libbzip2 version 1.0.6 of 6 September 2010 ------------------------------------------------------------------ */ #include "bzlib_private.hpp" /*---------------------------------------------*/ /*--- Fallback O(N log(N)^2) sorting ---*/ /*--- algorithm, for repetitive blocks ---*/ /*---------------------------------------------*/ /*---------------------------------------------*/ static __inline__ void fallbackSimpleSort(UInt32* fmap, UInt32* eclass, Int32 lo, Int32 hi) { Int32 i, j, tmp; UInt32 ec_tmp; if (lo == hi) return; if (hi - lo > 3) { for (i = hi - 4; i >= lo; i--) { tmp = fmap[i]; ec_tmp = eclass[tmp]; for (j = i + 4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4) fmap[j - 4] = fmap[j]; fmap[j - 4] = tmp; } } for (i = hi - 1; i >= lo; i--) { tmp = fmap[i]; ec_tmp = eclass[tmp]; for (j = i + 1; j <= hi && ec_tmp > eclass[fmap[j]]; j++) fmap[j - 1] = fmap[j]; fmap[j - 1] = tmp; } } /*---------------------------------------------*/ #define fswap(zz1, zz2) \ { \ Int32 zztmp = zz1; \ zz1 = zz2; \ zz2 = zztmp; \ } #define fvswap(zzp1, zzp2, zzn) \ { \ Int32 yyp1 = (zzp1); \ Int32 yyp2 = (zzp2); \ Int32 yyn = (zzn); \ while (yyn > 0) { \ fswap(fmap[yyp1], fmap[yyp2]); \ yyp1++; \ yyp2++; \ yyn--; \ } \ } #define fmin(a, b) ((a) < (b)) ? 
(a) : (b) #define fpush(lz, hz) \ { \ stackLo[sp] = lz; \ stackHi[sp] = hz; \ sp++; \ } #define fpop(lz, hz) \ { \ sp--; \ lz = stackLo[sp]; \ hz = stackHi[sp]; \ } #define FALLBACK_QSORT_SMALL_THRESH 10 #define FALLBACK_QSORT_STACK_SIZE 100 static void fallbackQSort3(UInt32* fmap, UInt32* eclass, Int32 loSt, Int32 hiSt) { Int32 unLo, unHi, ltLo, gtHi, n, m; Int32 sp, lo, hi; UInt32 med, r, r3; Int32 stackLo[FALLBACK_QSORT_STACK_SIZE]; Int32 stackHi[FALLBACK_QSORT_STACK_SIZE]; r = 0; sp = 0; fpush(loSt, hiSt); while (sp > 0) { AssertH(sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004); fpop(lo, hi); if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) { fallbackSimpleSort(fmap, eclass, lo, hi); continue; } /* Random partitioning. Median of 3 sometimes fails to avoid bad cases. Median of 9 seems to help but looks rather expensive. This too seems to work but is cheaper. Guidance for the magic constants 7621 and 32768 is taken from Sedgewick's algorithms book, chapter 35. */ r = ((r * 7621) + 1) % 32768; r3 = r % 3; if (r3 == 0) med = eclass[fmap[lo]]; else if (r3 == 1) med = eclass[fmap[(lo + hi) >> 1]]; else med = eclass[fmap[hi]]; unLo = ltLo = lo; unHi = gtHi = hi; while (1) { while (1) { if (unLo > unHi) break; n = (Int32)eclass[fmap[unLo]] - (Int32)med; if (n == 0) { fswap(fmap[unLo], fmap[ltLo]); ltLo++; unLo++; continue; }; if (n > 0) break; unLo++; } while (1) { if (unLo > unHi) break; n = (Int32)eclass[fmap[unHi]] - (Int32)med; if (n == 0) { fswap(fmap[unHi], fmap[gtHi]); gtHi--; unHi--; continue; }; if (n < 0) break; unHi--; } if (unLo > unHi) break; fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--; } AssertD(unHi == unLo - 1, "fallbackQSort3(2)"); if (gtHi < ltLo) continue; n = fmin(ltLo - lo, unLo - ltLo); fvswap(lo, unLo - n, n); m = fmin(hi - gtHi, gtHi - unHi); fvswap(unLo, hi - m + 1, m); n = lo + unLo - ltLo - 1; m = hi - (gtHi - unHi) + 1; if (n - lo > hi - m) { fpush(lo, n); fpush(m, hi); } else { fpush(m, hi); fpush(lo, n); } } } #undef fmin #undef fpush #undef fpop 
#undef fswap #undef fvswap #undef FALLBACK_QSORT_SMALL_THRESH #undef FALLBACK_QSORT_STACK_SIZE /*---------------------------------------------*/ /* Pre: nblock > 0 eclass exists for [0 .. nblock-1] ((UChar*)eclass) [0 .. nblock-1] holds block ptr exists for [0 .. nblock-1] Post: ((UChar*)eclass) [0 .. nblock-1] holds block All other areas of eclass destroyed fmap [0 .. nblock-1] holds sorted order bhtab [ 0 .. 2+(nblock/32) ] destroyed */ #define SET_BH(zz) bhtab[(zz) >> 5] |= (1 << ((zz)&31)) #define CLEAR_BH(zz) bhtab[(zz) >> 5] &= ~(1 << ((zz)&31)) #define ISSET_BH(zz) (bhtab[(zz) >> 5] & (1 << ((zz)&31))) #define WORD_BH(zz) bhtab[(zz) >> 5] #define UNALIGNED_BH(zz) ((zz)&0x01f) static void fallbackSort(UInt32* fmap, UInt32* eclass, UInt32* bhtab, Int32 nblock, Int32 verb) { Int32 ftab[257]; Int32 ftabCopy[256]; Int32 H, i, j, k, l, r, cc, cc1; Int32 nNotDone; Int32 nBhtab; UChar* eclass8 = (UChar*)eclass; /*-- Initial 1-char radix sort to generate initial fmap and initial BH bits. --*/ if (verb >= 4) VPrintf0(" bucket sorting ...\n"); for (i = 0; i < 257; i++) ftab[i] = 0; for (i = 0; i < nblock; i++) ftab[eclass8[i]]++; for (i = 0; i < 256; i++) ftabCopy[i] = ftab[i]; for (i = 1; i < 257; i++) ftab[i] += ftab[i - 1]; for (i = 0; i < nblock; i++) { j = eclass8[i]; k = ftab[j] - 1; ftab[j] = k; fmap[k] = i; } nBhtab = 2 + (nblock / 32); for (i = 0; i < nBhtab; i++) bhtab[i] = 0; for (i = 0; i < 256; i++) SET_BH(ftab[i]); /*-- Inductively refine the buckets. Kind-of an "exponential radix sort" (!), inspired by the Manber-Myers suffix array construction algorithm. 
--*/ /*-- set sentinel bits for block-end detection --*/ for (i = 0; i < 32; i++) { SET_BH(nblock + 2 * i); CLEAR_BH(nblock + 2 * i + 1); } /*-- the log(N) loop --*/ H = 1; while (1) { if (verb >= 4) VPrintf1(" depth %6d has ", H); j = 0; for (i = 0; i < nblock; i++) { if (ISSET_BH(i)) j = i; k = fmap[i] - H; if (k < 0) k += nblock; eclass[k] = j; } nNotDone = 0; r = -1; while (1) { /*-- find the next non-singleton bucket --*/ k = r + 1; while (ISSET_BH(k) && UNALIGNED_BH(k)) k++; if (ISSET_BH(k)) { while (WORD_BH(k) == 0xffffffff) k += 32; while (ISSET_BH(k)) k++; } l = k - 1; if (l >= nblock) break; while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++; if (!ISSET_BH(k)) { while (WORD_BH(k) == 0x00000000) k += 32; while (!ISSET_BH(k)) k++; } r = k - 1; if (r >= nblock) break; /*-- now [l, r] bracket current bucket --*/ if (r > l) { nNotDone += (r - l + 1); fallbackQSort3(fmap, eclass, l, r); /*-- scan bucket and generate header bits-- */ cc = -1; for (i = l; i <= r; i++) { cc1 = eclass[fmap[i]]; if (cc != cc1) { SET_BH(i); cc = cc1; }; } } } if (verb >= 4) VPrintf1("%6d unresolved strings\n", nNotDone); H *= 2; if (H > nblock || nNotDone == 0) break; } /*-- Reconstruct the original block in eclass8 [0 .. nblock-1], since the previous phase destroyed it. --*/ if (verb >= 4) VPrintf0(" reconstructing block ...\n"); j = 0; for (i = 0; i < nblock; i++) { while (ftabCopy[j] == 0) j++; ftabCopy[j]--; eclass8[fmap[i]] = (UChar)j; } AssertH(j < 256, 1005); } #undef SET_BH #undef CLEAR_BH #undef ISSET_BH #undef WORD_BH #undef UNALIGNED_BH /*---------------------------------------------*/ /*--- The main, O(N^2 log(N)) sorting ---*/ /*--- algorithm. Faster for "normal" ---*/ /*--- non-repetitive blocks. 
---*/
/*---------------------------------------------*/

/*---------------------------------------------*/
/* Compares the byte sequences of `block` starting at i1 and i2 and returns
   True when the one at i1 is greater.  The first 12 positions are compared
   on bytes only and rely on the caller having copied the start of the block
   past its end (no wrap-around is applied there).  After that, up to
   nblock + 8 further positions are compared in rounds of eight, checking
   the byte and then the cached `quadrant` value at each position; indices
   are wrapped modulo nblock after each round and *budget is decremented
   once per round.  Returns False if no difference is found. */
static __inline__ Bool mainGtU(UInt32 i1, UInt32 i2, UChar* block, UInt16* quadrant, UInt32 nblock, Int32* budget) {
    Int32 k;
    UChar c1, c2;
    UInt16 s1, s2;

    AssertD(i1 != i2, "mainGtU");
    /* 1 */
    c1 = block[i1];
    c2 = block[i2];
    if (c1 != c2)
        return (c1 > c2);
    i1++;
    i2++;
    /* 2 */
    c1 = block[i1];
    c2 = block[i2];
    if (c1 != c2)
        return (c1 > c2);
    i1++;
    i2++;
    /* 3 */
    c1 = block[i1];
    c2 = block[i2];
    if (c1 != c2)
        return (c1 > c2);
    i1++;
    i2++;
    /* 4 */
    c1 = block[i1];
    c2 = block[i2];
    if (c1 != c2)
        return (c1 > c2);
    i1++;
    i2++;
    /* 5 */
    c1 = block[i1];
    c2 = block[i2];
    if (c1 != c2)
        return (c1 > c2);
    i1++;
    i2++;
    /* 6 */
    c1 = block[i1];
    c2 = block[i2];
    if (c1 != c2)
        return (c1 > c2);
    i1++;
    i2++;
    /* 7 */
    c1 = block[i1];
    c2 = block[i2];
    if (c1 != c2)
        return (c1 > c2);
    i1++;
    i2++;
    /* 8 */
    c1 = block[i1];
    c2 = block[i2];
    if (c1 != c2)
        return (c1 > c2);
    i1++;
    i2++;
    /* 9 */
    c1 = block[i1];
    c2 = block[i2];
    if (c1 != c2)
        return (c1 > c2);
    i1++;
    i2++;
    /* 10 */
    c1 = block[i1];
    c2 = block[i2];
    if (c1 != c2)
        return (c1 > c2);
    i1++;
    i2++;
    /* 11 */
    c1 = block[i1];
    c2 = block[i2];
    if (c1 != c2)
        return (c1 > c2);
    i1++;
    i2++;
    /* 12 */
    c1 = block[i1];
    c2 = block[i2];
    if (c1 != c2)
        return (c1 > c2);
    i1++;
    i2++;

    k = nblock + 8;

    do {
        /* 1 */
        c1 = block[i1];
        c2 = block[i2];
        if (c1 != c2)
            return (c1 > c2);
        s1 = quadrant[i1];
        s2 = quadrant[i2];
        if (s1 != s2)
            return (s1 > s2);
        i1++;
        i2++;
        /* 2 */
        c1 = block[i1];
        c2 = block[i2];
        if (c1 != c2)
            return (c1 > c2);
        s1 = quadrant[i1];
        s2 = quadrant[i2];
        if (s1 != s2)
            return (s1 > s2);
        i1++;
        i2++;
        /* 3 */
        c1 = block[i1];
        c2 = block[i2];
        if (c1 != c2)
            return (c1 > c2);
        s1 = quadrant[i1];
        s2 = quadrant[i2];
        if (s1 != s2)
            return (s1 > s2);
        i1++;
        i2++;
        /* 4 */
        c1 = block[i1];
        c2 = block[i2];
        if (c1 != c2)
            return (c1 > c2);
        s1 = quadrant[i1];
        s2 = quadrant[i2];
        if (s1 != s2)
            return (s1 > s2);
        i1++;
        i2++;
        /* 5 */
        c1 = block[i1];
        c2 = block[i2];
        if (c1 != c2)
            return (c1 > c2);
        s1 = quadrant[i1];
        s2 = quadrant[i2];
        if (s1 != s2)
            return (s1 > s2);
        i1++;
        i2++;
        /* 6 */
        c1 = block[i1];
        c2 = block[i2];
        if (c1 != c2)
            return (c1 > c2);
        s1 = quadrant[i1];
        s2 = quadrant[i2];
        if (s1 != s2)
            return (s1 > s2);
        i1++;
        i2++;
        /* 7 */
        c1 = block[i1];
        c2 = block[i2];
        if (c1 != c2)
            return (c1 > c2);
        s1 = quadrant[i1];
        s2 = quadrant[i2];
        if (s1 != s2)
            return (s1 > s2);
        i1++;
        i2++;
        /* 8 */
        c1 = block[i1];
        c2 = block[i2];
        if (c1 != c2)
            return (c1 > c2);
        s1 = quadrant[i1];
        s2 = quadrant[i2];
        if (s1 != s2)
            return (s1 > s2);
        i1++;
        i2++;

        /* wrap both cursors back into [0, nblock) once per round */
        if (i1 >= nblock)
            i1 -= nblock;
        if (i2 >= nblock)
            i2 -= nblock;

        k -= 8;
        (*budget)--;
    } while (k >= 0);

    return False;
}

/*---------------------------------------------*/
/*--
   Knuth's increments seem to work better
   than Incerpi-Sedgewick here.  Possibly
   because the number of elements to sort
   is usually small, typically <= 20.
--*/
static Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280, 9841, 29524, 88573, 265720, 797161, 2391484 };

/* Shell sort of ptr[lo .. hi] using the gaps in incs[], comparing entries
   with mainGtU() at offset d.  The insertion step is written out three
   times per loop iteration; *budget is only tested after the third copy,
   and the function returns early (leaving the range partially sorted)
   once it has gone negative. */
static void mainSimpleSort(UInt32* ptr, UChar* block, UInt16* quadrant, Int32 nblock, Int32 lo, Int32 hi, Int32 d, Int32* budget) {
    Int32 i, j, h, bigN, hp;
    UInt32 v;

    bigN = hi - lo + 1;
    if (bigN < 2)
        return;

    /* pick the largest increment that is smaller than the range length */
    hp = 0;
    while (incs[hp] < bigN)
        hp++;
    hp--;

    for (; hp >= 0; hp--) {
        h = incs[hp];

        i = lo + h;
        while (True) {
            /*-- copy 1 --*/
            if (i > hi)
                break;
            v = ptr[i];
            j = i;
            while (mainGtU(ptr[j - h] + d, v + d, block, quadrant, nblock, budget)) {
                ptr[j] = ptr[j - h];
                j = j - h;
                if (j <= (lo + h - 1))
                    break;
            }
            ptr[j] = v;
            i++;

            /*-- copy 2 --*/
            if (i > hi)
                break;
            v = ptr[i];
            j = i;
            while (mainGtU(ptr[j - h] + d, v + d, block, quadrant, nblock, budget)) {
                ptr[j] = ptr[j - h];
                j = j - h;
                if (j <= (lo + h - 1))
                    break;
            }
            ptr[j] = v;
            i++;

            /*-- copy 3 --*/
            if (i > hi)
                break;
            v = ptr[i];
            j = i;
            while (mainGtU(ptr[j - h] + d, v + d, block, quadrant, nblock, budget)) {
                ptr[j] = ptr[j - h];
                j = j - h;
                if (j <= (lo + h - 1))
                    break;
            }
            ptr[j] = v;
            i++;

            if (*budget < 0)
                return;
        }
    }
}
/*---------------------------------------------*/ /*-- The following is an implementation of an elegant 3-way quicksort for strings, described in a paper "Fast Algorithms for Sorting and Searching Strings", by Robert Sedgewick and Jon L. Bentley. --*/ #define mswap(zz1, zz2) \ { \ Int32 zztmp = zz1; \ zz1 = zz2; \ zz2 = zztmp; \ } #define mvswap(zzp1, zzp2, zzn) \ { \ Int32 yyp1 = (zzp1); \ Int32 yyp2 = (zzp2); \ Int32 yyn = (zzn); \ while (yyn > 0) { \ mswap(ptr[yyp1], ptr[yyp2]); \ yyp1++; \ yyp2++; \ yyn--; \ } \ } static __inline__ UChar mmed3(UChar a, UChar b, UChar c) { UChar t; if (a > b) { t = a; a = b; b = t; }; if (b > c) { b = c; if (a > b) b = a; } return b; } #define mmin(a, b) ((a) < (b)) ? (a) : (b) #define mpush(lz, hz, dz) \ { \ stackLo[sp] = lz; \ stackHi[sp] = hz; \ stackD[sp] = dz; \ sp++; \ } #define mpop(lz, hz, dz) \ { \ sp--; \ lz = stackLo[sp]; \ hz = stackHi[sp]; \ dz = stackD[sp]; \ } #define mnextsize(az) (nextHi[az] - nextLo[az]) #define mnextswap(az, bz) \ { \ Int32 tz; \ tz = nextLo[az]; \ nextLo[az] = nextLo[bz]; \ nextLo[bz] = tz; \ tz = nextHi[az]; \ nextHi[az] = nextHi[bz]; \ nextHi[bz] = tz; \ tz = nextD[az]; \ nextD[az] = nextD[bz]; \ nextD[bz] = tz; \ } #define MAIN_QSORT_SMALL_THRESH 20 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT) #define MAIN_QSORT_STACK_SIZE 100 static void mainQSort3(UInt32* ptr, UChar* block, UInt16* quadrant, Int32 nblock, Int32 loSt, Int32 hiSt, Int32 dSt, Int32* budget) { Int32 unLo, unHi, ltLo, gtHi, n, m, med; Int32 sp, lo, hi, d; Int32 stackLo[MAIN_QSORT_STACK_SIZE]; Int32 stackHi[MAIN_QSORT_STACK_SIZE]; Int32 stackD[MAIN_QSORT_STACK_SIZE]; Int32 nextLo[3]; Int32 nextHi[3]; Int32 nextD[3]; sp = 0; mpush(loSt, hiSt, dSt); while (sp > 0) { AssertH(sp < MAIN_QSORT_STACK_SIZE - 2, 1001); mpop(lo, hi, d); if (hi - lo < MAIN_QSORT_SMALL_THRESH || d > MAIN_QSORT_DEPTH_THRESH) { mainSimpleSort(ptr, block, quadrant, nblock, lo, hi, d, budget); if (*budget < 0) return; continue; } med = 
(Int32)mmed3(block[ptr[lo] + d], block[ptr[hi] + d], block[ptr[(lo + hi) >> 1] + d]); unLo = ltLo = lo; unHi = gtHi = hi; while (True) { while (True) { if (unLo > unHi) break; n = ((Int32)block[ptr[unLo] + d]) - med; if (n == 0) { mswap(ptr[unLo], ptr[ltLo]); ltLo++; unLo++; continue; }; if (n > 0) break; unLo++; } while (True) { if (unLo > unHi) break; n = ((Int32)block[ptr[unHi] + d]) - med; if (n == 0) { mswap(ptr[unHi], ptr[gtHi]); gtHi--; unHi--; continue; }; if (n < 0) break; unHi--; } if (unLo > unHi) break; mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--; } AssertD(unHi == unLo - 1, "mainQSort3(2)"); if (gtHi < ltLo) { mpush(lo, hi, d + 1); continue; } n = mmin(ltLo - lo, unLo - ltLo); mvswap(lo, unLo - n, n); m = mmin(hi - gtHi, gtHi - unHi); mvswap(unLo, hi - m + 1, m); n = lo + unLo - ltLo - 1; m = hi - (gtHi - unHi) + 1; nextLo[0] = lo; nextHi[0] = n; nextD[0] = d; nextLo[1] = m; nextHi[1] = hi; nextD[1] = d; nextLo[2] = n + 1; nextHi[2] = m - 1; nextD[2] = d + 1; if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1); if (mnextsize(1) < mnextsize(2)) mnextswap(1, 2); if (mnextsize(0) < mnextsize(1)) mnextswap(0, 1); AssertD(mnextsize(0) >= mnextsize(1), "mainQSort3(8)"); AssertD(mnextsize(1) >= mnextsize(2), "mainQSort3(9)"); mpush(nextLo[0], nextHi[0], nextD[0]); mpush(nextLo[1], nextHi[1], nextD[1]); mpush(nextLo[2], nextHi[2], nextD[2]); } } #undef mswap #undef mvswap #undef mpush #undef mpop #undef mmin #undef mnextsize #undef mnextswap #undef MAIN_QSORT_SMALL_THRESH #undef MAIN_QSORT_DEPTH_THRESH #undef MAIN_QSORT_STACK_SIZE /*---------------------------------------------*/ /* Pre: nblock > N_OVERSHOOT block32 exists for [0 .. nblock-1 +N_OVERSHOOT] ((UChar*)block32) [0 .. nblock-1] holds block ptr exists for [0 .. nblock-1] Post: ((UChar*)block32) [0 .. nblock-1] holds block All other areas of block32 destroyed ftab [0 .. 65536 ] destroyed ptr [0 .. 
nblock-1] holds sorted order
      if (*budget < 0), sorting was abandoned
*/

/* Size of big bucket b: difference between the ftab entries that start
   bucket b+1 and bucket b. */
#define BIGFREQ(b) (ftab[((b) + 1) << 8] - ftab[(b) << 8])
/* Bit 21 of an ftab entry flags the corresponding small bucket as done;
   CLEARMASK strips that flag to recover the index. */
#define SETMASK    (1 << 21)
#define CLEARMASK  (~(SETMASK))

/* Main block sort.  Builds a two-byte frequency table in ftab, radix-sorts
   all positions into ptr by their leading byte pair, then processes the 256
   big buckets from smallest to largest: unsorted small buckets are sorted
   with mainQSort3(), the remaining small buckets ending in the same byte are
   derived by scanning, and quadrant[] is refreshed for later comparisons.
   Returns early, with ptr incomplete, as soon as *budget drops below zero. */
static void mainSort(UInt32* ptr, UChar* block, UInt16* quadrant, UInt32* ftab, Int32 nblock, Int32 verb, Int32* budget) {
    Int32 i, j, k, ss, sb;
    Int32 runningOrder[256];
    Bool bigDone[256];
    Int32 copyStart[256];
    Int32 copyEnd[256];
    UChar c1;
    Int32 numQSorted;
    UInt16 s;
    if (verb >= 4)
        VPrintf0(" main sort initialise ...\n");

    /*-- set up the 2-byte frequency table --*/
    for (i = 65536; i >= 0; i--)
        ftab[i] = 0;

    j = block[0] << 8;
    i = nblock - 1;
    for (; i >= 3; i -= 4) {
        quadrant[i] = 0;
        j = (j >> 8) | (((UInt16)block[i]) << 8);
        ftab[j]++;
        quadrant[i - 1] = 0;
        j = (j >> 8) | (((UInt16)block[i - 1]) << 8);
        ftab[j]++;
        quadrant[i - 2] = 0;
        j = (j >> 8) | (((UInt16)block[i - 2]) << 8);
        ftab[j]++;
        quadrant[i - 3] = 0;
        j = (j >> 8) | (((UInt16)block[i - 3]) << 8);
        ftab[j]++;
    }
    for (; i >= 0; i--) {
        quadrant[i] = 0;
        j = (j >> 8) | (((UInt16)block[i]) << 8);
        ftab[j]++;
    }

    /*-- (emphasises close relationship of block & quadrant) --*/
    for (i = 0; i < BZ_N_OVERSHOOT; i++) {
        block[nblock + i] = block[i];
        quadrant[nblock + i] = 0;
    }

    if (verb >= 4)
        VPrintf0(" bucket sorting ...\n");

    /*-- Complete the initial radix sort --*/
    for (i = 1; i <= 65536; i++)
        ftab[i] += ftab[i - 1];

    s = block[0] << 8;
    i = nblock - 1;
    for (; i >= 3; i -= 4) {
        s = (s >> 8) | (block[i] << 8);
        j = ftab[s] - 1;
        ftab[s] = j;
        ptr[j] = i;
        s = (s >> 8) | (block[i - 1] << 8);
        j = ftab[s] - 1;
        ftab[s] = j;
        ptr[j] = i - 1;
        s = (s >> 8) | (block[i - 2] << 8);
        j = ftab[s] - 1;
        ftab[s] = j;
        ptr[j] = i - 2;
        s = (s >> 8) | (block[i - 3] << 8);
        j = ftab[s] - 1;
        ftab[s] = j;
        ptr[j] = i - 3;
    }
    for (; i >= 0; i--) {
        s = (s >> 8) | (block[i] << 8);
        j = ftab[s] - 1;
        ftab[s] = j;
        ptr[j] = i;
    }

    /*--
       Now ftab contains the first loc of every small bucket.
       Calculate the running order, from smallest to largest
       big bucket.
    --*/
    for (i = 0; i <= 255; i++) {
        bigDone[i] = False;
        runningOrder[i] = i;
    }

    {
        /* Shell sort of runningOrder[] by big-bucket size. */
        Int32 vv;
        Int32 h = 1;
        do
            h = 3 * h + 1;
        while (h <= 256);
        do {
            h = h / 3;
            for (i = h; i <= 255; i++) {
                vv = runningOrder[i];
                j = i;
                while (BIGFREQ(runningOrder[j - h]) > BIGFREQ(vv)) {
                    runningOrder[j] = runningOrder[j - h];
                    j = j - h;
                    if (j <= (h - 1))
                        goto zero;
                }
            zero:
                runningOrder[j] = vv;
            }
        } while (h != 1);
    }

    /*--
       The main sorting loop.
    --*/

    numQSorted = 0;

    for (i = 0; i <= 255; i++) {
        /*--
           Process big buckets, starting with the least full.
           Basically this is a 3-step process in which we call
           mainQSort3 to sort the small buckets [ss, j], but
           also make a big effort to avoid the calls if we can.
        --*/
        ss = runningOrder[i];

        /*--
           Step 1:
           Complete the big bucket [ss] by quicksorting
           any unsorted small buckets [ss, j], for j != ss.
           Hopefully previous pointer-scanning phases have already
           completed many of the small buckets [ss, j], so
           we don't have to sort them at all.
        --*/
        for (j = 0; j <= 255; j++) {
            if (j != ss) {
                sb = (ss << 8) + j;
                if (!(ftab[sb] & SETMASK)) {
                    Int32 lo = ftab[sb] & CLEARMASK;
                    Int32 hi = (ftab[sb + 1] & CLEARMASK) - 1;
                    if (hi > lo) {
                        if (verb >= 4)
                            VPrintf4(" qsort [0x%x, 0x%x] "
                                     "done %d this %d\n",
                                     ss,
                                     j,
                                     numQSorted,
                                     hi - lo + 1);
                        mainQSort3(ptr, block, quadrant, nblock, lo, hi, BZ_N_RADIX, budget);
                        numQSorted += (hi - lo + 1);
                        if (*budget < 0)
                            return;
                    }
                }
                ftab[sb] |= SETMASK;
            }
        }

        AssertH(!bigDone[ss], 1006);

        /*--
           Step 2:
           Now scan this big bucket [ss] so as to synthesise the
           sorted order for small buckets [t, ss] for all t,
           including, magically, the bucket [ss,ss] too.
           This will avoid doing Real Work in subsequent Step 1's.
        --*/
        {
            for (j = 0; j <= 255; j++) {
                copyStart[j] = ftab[(j << 8) + ss] & CLEARMASK;
                copyEnd[j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
            }
            for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
                k = ptr[j] - 1;
                if (k < 0)
                    k += nblock;
                c1 = block[k];
                if (!bigDone[c1])
                    ptr[copyStart[c1]++] = k;
            }
            for (j = (ftab[(ss + 1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
                k = ptr[j] - 1;
                if (k < 0)
                    k += nblock;
                c1 = block[k];
                if (!bigDone[c1])
                    ptr[copyEnd[c1]--] = k;
            }
        }

        /* Note: no trailing semicolon here in the original; the statement
           relies on how the AssertH macro expands. */
        AssertH((copyStart[ss] - 1 == copyEnd[ss]) ||
                    /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
                       Necessity for this case is demonstrated by compressing
                       a sequence of approximately 48.5 million of character
                       251; 1.0.0/1.0.1 will then die here. */
                    (copyStart[ss] == 0 && copyEnd[ss] == nblock - 1),
                1007)

        for (j = 0; j <= 255; j++)
            ftab[(j << 8) + ss] |= SETMASK;

        /*--
           Step 3:
           The [ss] big bucket is now done.  Record this fact,
           and update the quadrant descriptors.  Remember to
           update quadrants in the overshoot area too, if
           necessary.  The "if (i < 255)" test merely skips
           this updating for the last bucket processed, since
           updating for the last bucket is pointless.

           The quadrant array provides a way to incrementally
           cache sort orderings, as they appear, so as to
           make subsequent comparisons in mainGtU() complete
           faster.  For repetitive blocks this makes a big
           difference (but not big enough to be able to avoid
           the fallback sorting mechanism, exponential radix sort).

           The precise meaning is: at all times:

              for 0 <= i < nblock and 0 <= j <= nblock

              if block[i] != block[j],

                 then the relative values of quadrant[i] and
                      quadrant[j] are meaningless.

                 else {
                    if quadrant[i] < quadrant[j]
                       then the string starting at i lexicographically
                       precedes the string starting at j

                    else if quadrant[i] > quadrant[j]
                       then the string starting at j lexicographically
                       precedes the string starting at i

                    else
                       the relative ordering of the strings starting
                       at i and j has not yet been determined.
                 }
        --*/
        bigDone[ss] = True;

        if (i < 255) {
            Int32 bbStart = ftab[ss << 8] & CLEARMASK;
            Int32 bbSize = (ftab[(ss + 1) << 8] & CLEARMASK) - bbStart;
            Int32 shifts = 0;

            /* scale ranks down until they fit in a UInt16 quadrant entry */
            while ((bbSize >> shifts) > 65534)
                shifts++;

            for (j = bbSize - 1; j >= 0; j--) {
                Int32 a2update = ptr[bbStart + j];
                UInt16 qVal = (UInt16)(j >> shifts);
                quadrant[a2update] = qVal;
                if (a2update < BZ_N_OVERSHOOT)
                    quadrant[a2update + nblock] = qVal;
            }
            AssertH(((bbSize - 1) >> shifts) <= 65535, 1002);
        }
    }

    if (verb >= 4)
        VPrintf3(" %d pointers, %d sorted, %d scanned\n", nblock, numQSorted, nblock - numQSorted);
}

#undef BIGFREQ
#undef SETMASK
#undef CLEARMASK

/*---------------------------------------------*/
/* Pre:
      nblock > 0
      arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
      ((UChar*)arr2)  [0 .. nblock-1] holds block
      arr1 exists for [0 .. nblock-1]

   Post:
      ((UChar*)arr2) [0 .. nblock-1] holds block
      All other areas of block destroyed
      ftab [ 0 .. 65536 ] destroyed
      arr1 [0 .. nblock-1] holds sorted order
*/
/* Entry point of the block sorter.  Blocks shorter than 10000 bytes go
   straight to fallbackSort(); larger ones try mainSort() with a work budget
   of nblock * ((workFactor - 1) / 3) (workFactor clamped to [1, 100]) and
   switch to fallbackSort() if that budget is exhausted.  Afterwards
   s->origPtr is set to the index i at which ptr[i] == 0. */
void BZ2_blockSort(EState* s) {
    UInt32* ptr = s->ptr;
    UChar* block = s->block;
    UInt32* ftab = s->ftab;
    Int32 nblock = s->nblock;
    Int32 verb = s->verbosity;
    Int32 wfact = s->workFactor;
    UInt16* quadrant;
    Int32 budget;
    Int32 budgetInit;
    Int32 i;

    if (nblock < 10000) {
        fallbackSort(s->arr1, s->arr2, ftab, nblock, verb);
    }
    else {
        /* Calculate the location for quadrant, remembering to get
           the alignment right.  Assumes that &(block[0]) is at least
           2-byte aligned -- this should be ok since block is really
           the first section of arr2.
        */
        i = nblock + BZ_N_OVERSHOOT;
        if (i & 1)
            i++;
        quadrant = (UInt16*)(&(block[i]));

        /* (wfact-1) / 3 puts the default-factor-30
           transition point at very roughly the same place as
           with v0.1 and v0.9.0.
           Not that it particularly matters any more, since the
           resulting compressed stream is now the same regardless
           of whether or not we use the main sort or fallback sort.
        */
        if (wfact < 1)
            wfact = 1;
        if (wfact > 100)
            wfact = 100;
        budgetInit = nblock * ((wfact - 1) / 3);
        budget = budgetInit;

        mainSort(ptr, block, quadrant, ftab, nblock, verb, &budget);
        if (verb >= 3)
            VPrintf3(" %d work, %d block, ratio %5.2f\n",
                     budgetInit - budget,
                     nblock,
                     (float)(budgetInit - budget) / (float)(nblock == 0 ? 1 : nblock));
        if (budget < 0) {
            if (verb >= 2)
                VPrintf0(" too repetitive; using fallback"
                         " sorting algorithm\n");
            fallbackSort(s->arr1, s->arr2, ftab, nblock, verb);
        }
    }

    /* locate the sorted position of the rotation that starts at offset 0 */
    s->origPtr = -1;
    for (i = 0; i < s->nblock; i++)
        if (ptr[i] == 0) {
            s->origPtr = i;
            break;
        };

    AssertH(s->origPtr != -1, 1003);
}

/*-------------------------------------------------------------*/
/*--- end                                       blocksort.c ---*/
/*-------------------------------------------------------------*/

================================================
FILE: third-party/tbb/examples/graph/fgbzip2/bzlib.cpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

/*-------------------------------------------------------------*/
/*--- Library top-level functions.                          ---*/
/*---                                             bzlib.cpp ---*/
/*-------------------------------------------------------------*/

/* ------------------------------------------------------------------
   The original source for this example:
   This file is part of bzip2/libbzip2, a program and library for
   lossless, block-sorting data compression.
bzip2/libbzip2 version 1.0.6 of 6 September 2010 Copyright (C) 1996-2010 Julian Seward This program, "bzip2", the associated library "libbzip2", and all documentation, are copyright (C) 1996-2010 Julian R Seward. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 3. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 4. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Julian Seward, jseward@bzip.org bzip2/libbzip2 version 1.0.6 of 6 September 2010 ------------------------------------------------------------------ */ /* CHANGES 0.9.0 -- original version. 0.9.0a/b -- no changes in this file. 
0.9.0c -- made zero-length BZ_FLUSH work correctly in bzCompress(). fixed bzWrite/bzRead to ignore zero-length requests. fixed bzread to correctly handle read requests after EOF. wrong parameter order in call to bzDecompressInit in bzBuffToBuffDecompress. Fixed. */ #include "bzlib_private.hpp" /*---------------------------------------------------*/ /*--- Compression stuff ---*/ /*---------------------------------------------------*/ /*---------------------------------------------------*/ #ifndef BZ_NO_STDIO void BZ2_bz__AssertH__fail(int errcode) { fprintf(stderr, "\n\nbzip2/libbzip2: internal error number %d.\n" "This is a bug in bzip2/libbzip2, %s.\n" "Please report it to me at: jseward@bzip.org. If this happened\n" "when you were using some program which uses libbzip2 as a\n" "component, you should also report this bug to the author(s)\n" "of that program. Please make an effort to report this bug;\n" "timely and accurate bug reports eventually lead to higher\n" "quality software. Thanks. Julian Seward, 10 December 2007.\n\n", errcode, BZ2_bzlibVersion()); if (errcode == 1007) { fprintf(stderr, "\n*** A special note about internal error number 1007 ***\n" "\n" "Experience suggests that a common cause of i.e. 1007\n" "is unreliable memory or other hardware. The 1007 assertion\n" "just happens to cross-check the results of huge numbers of\n" "memory reads/writes, and so acts (unintendedly) as a stress\n" "test of your memory system.\n" "\n" "I suggest the following: try compressing the file again,\n" "possibly monitoring progress in detail with the -vv flag.\n" "\n" "* If the error cannot be reproduced, and/or happens at different\n" " points in compression, you may have a flaky memory system.\n" " Try a memory-test program. I have used Memtest86\n" " (www.memtest86.com). 
At the time of writing it is free (GPLd).\n" " Memtest86 tests memory much more thorougly than your BIOSs\n" " power-on test, and may find failures that the BIOS doesn't.\n" "\n" "* If the error can be repeatably reproduced, this is a bug in\n" " bzip2, and I would very much like to hear about it. Please\n" " let me know, and, ideally, save a copy of the file causing the\n" " problem -- without which I will be unable to investigate it.\n" "\n"); } std::exit(-1); } #endif /*---------------------------------------------------*/ static int bz_config_ok(void) { if (sizeof(int) != 4) return 0; if (sizeof(short) != 2) return 0; if (sizeof(char) != 1) return 0; return 1; } /*---------------------------------------------------*/ static void* default_bzalloc(void* opaque, Int32 items, Int32 size) { void* v = malloc(items * size); return v; } static void default_bzfree(void* opaque, void* addr) { if (addr != nullptr) free(addr); } /*---------------------------------------------------*/ static void prepare_new_block(EState* s) { Int32 i; s->nblock = 0; s->numZ = 0; s->state_out_pos = 0; BZ_INITIALISE_CRC(s->blockCRC); for (i = 0; i < 256; i++) s->inUse[i] = False; s->blockNo++; } /*---------------------------------------------------*/ static void init_RL(EState* s) { s->state_in_ch = 256; s->state_in_len = 0; } static Bool isempty_RL(EState* s) { if (s->state_in_ch < 256 && s->state_in_len > 0) return False; else return True; } /*---------------------------------------------------*/ int BZ_API(BZ2_bzCompressInit)(bz_stream* strm, int blockSize100k, int verbosity, int workFactor) { Int32 n; EState* s; if (!bz_config_ok()) return BZ_CONFIG_ERROR; if (strm == nullptr || blockSize100k < 1 || blockSize100k > 9 || workFactor < 0 || workFactor > 250) return BZ_PARAM_ERROR; if (workFactor == 0) workFactor = 30; if (strm->bzalloc == nullptr) strm->bzalloc = default_bzalloc; if (strm->bzfree == nullptr) strm->bzfree = default_bzfree; s = (EState*)BZALLOC(sizeof(EState)); if (s == 
nullptr) return BZ_MEM_ERROR; s->strm = strm; s->arr1 = nullptr; s->arr2 = nullptr; s->ftab = nullptr; n = 100000 * blockSize100k; s->arr1 = (UInt32*)BZALLOC(n * sizeof(UInt32)); s->arr2 = (UInt32*)BZALLOC((n + BZ_N_OVERSHOOT) * sizeof(UInt32)); s->ftab = (UInt32*)BZALLOC(65537 * sizeof(UInt32)); if (s->arr1 == nullptr || s->arr2 == nullptr || s->ftab == nullptr) { if (s->arr1 != nullptr) BZFREE(s->arr1); if (s->arr2 != nullptr) BZFREE(s->arr2); if (s->ftab != nullptr) BZFREE(s->ftab); if (s != nullptr) BZFREE(s); return BZ_MEM_ERROR; } s->blockNo = 0; s->state = BZ_S_INPUT; s->mode = BZ_M_RUNNING; s->combinedCRC = 0; s->blockSize100k = blockSize100k; s->nblockMAX = 100000 * blockSize100k - 19; s->verbosity = verbosity; s->workFactor = workFactor; s->block = (UChar*)s->arr2; s->mtfv = (UInt16*)s->arr1; s->zbits = nullptr; s->ptr = (UInt32*)s->arr1; strm->state = s; strm->total_in_lo32 = 0; strm->total_in_hi32 = 0; strm->total_out_lo32 = 0; strm->total_out_hi32 = 0; init_RL(s); prepare_new_block(s); return BZ_OK; } /*---------------------------------------------------*/ static void add_pair_to_block(EState* s) { Int32 i; UChar ch = (UChar)(s->state_in_ch); for (i = 0; i < s->state_in_len; i++) { BZ_UPDATE_CRC(s->blockCRC, ch); } s->inUse[s->state_in_ch] = True; switch (s->state_in_len) { case 1: s->block[s->nblock] = (UChar)ch; s->nblock++; break; case 2: s->block[s->nblock] = (UChar)ch; s->nblock++; s->block[s->nblock] = (UChar)ch; s->nblock++; break; case 3: s->block[s->nblock] = (UChar)ch; s->nblock++; s->block[s->nblock] = (UChar)ch; s->nblock++; s->block[s->nblock] = (UChar)ch; s->nblock++; break; default: s->inUse[s->state_in_len - 4] = True; s->block[s->nblock] = (UChar)ch; s->nblock++; s->block[s->nblock] = (UChar)ch; s->nblock++; s->block[s->nblock] = (UChar)ch; s->nblock++; s->block[s->nblock] = (UChar)ch; s->nblock++; s->block[s->nblock] = ((UChar)(s->state_in_len - 4)); s->nblock++; break; } } /*---------------------------------------------------*/ 
static void flush_RL(EState* s) { if (s->state_in_ch < 256) add_pair_to_block(s); init_RL(s); } /*---------------------------------------------------*/ #define ADD_CHAR_TO_BLOCK(zs, zchh0) \ { \ UInt32 zchh = (UInt32)(zchh0); \ /*-- fast track the common case --*/ \ if (zchh != zs->state_in_ch && zs->state_in_len == 1) { \ UChar ch = (UChar)(zs->state_in_ch); \ BZ_UPDATE_CRC(zs->blockCRC, ch); \ zs->inUse[zs->state_in_ch] = True; \ zs->block[zs->nblock] = (UChar)ch; \ zs->nblock++; \ zs->state_in_ch = zchh; \ } \ else /*-- general, uncommon cases --*/ \ if (zchh != zs->state_in_ch || zs->state_in_len == 255) { \ if (zs->state_in_ch < 256) \ add_pair_to_block(zs); \ zs->state_in_ch = zchh; \ zs->state_in_len = 1; \ } \ else { \ zs->state_in_len++; \ } \ } /*---------------------------------------------------*/ static Bool copy_input_until_stop(EState* s) { Bool progress_in = False; if (s->mode == BZ_M_RUNNING) { /*-- fast track the common case --*/ while (True) { /*-- block full? --*/ if (s->nblock >= s->nblockMAX) break; /*-- no input? --*/ if (s->strm->avail_in == 0) break; progress_in = True; ADD_CHAR_TO_BLOCK(s, (UInt32)(*((UChar*)(s->strm->next_in)))); s->strm->next_in++; s->strm->avail_in--; s->strm->total_in_lo32++; if (s->strm->total_in_lo32 == 0) s->strm->total_in_hi32++; } } else { /*-- general, uncommon case --*/ while (True) { /*-- block full? --*/ if (s->nblock >= s->nblockMAX) break; /*-- no input? --*/ if (s->strm->avail_in == 0) break; /*-- flush/finish end? --*/ if (s->avail_in_expect == 0) break; progress_in = True; ADD_CHAR_TO_BLOCK(s, (UInt32)(*((UChar*)(s->strm->next_in)))); s->strm->next_in++; s->strm->avail_in--; s->strm->total_in_lo32++; if (s->strm->total_in_lo32 == 0) s->strm->total_in_hi32++; s->avail_in_expect--; } } return progress_in; } /*---------------------------------------------------*/ static Bool copy_output_until_stop(EState* s) { Bool progress_out = False; while (True) { /*-- no output space? 
/*
   Drive the compressor state machine for one call of BZ2_bzCompress.

   Alternates between two states until no further progress is possible:
     BZ_S_OUTPUT - copy already-compressed bytes (zbits) to the caller's
                   output buffer;
     BZ_S_INPUT  - collect input bytes into the current block and, when
                   the block is complete, compress it.

   Returns True if any input was consumed or any output was produced.
*/
static Bool handle_compress(bz_stream* strm) {
    Bool progress_in = False;
    Bool progress_out = False;
    EState* s = (EState*)strm->state;

    while (True) {
        if (s->state == BZ_S_OUTPUT) {
            progress_out |= copy_output_until_stop(s);
            /* compressed data still pending: the output buffer is full */
            if (s->state_out_pos < s->numZ)
                break;
            /* finishing, all expected input taken, no pending run: done */
            if (s->mode == BZ_M_FINISHING && s->avail_in_expect == 0 && isempty_RL(s))
                break;
            /* block fully emitted; start collecting the next one */
            prepare_new_block(s);
            s->state = BZ_S_INPUT;
            /* flushing and nothing left to take in: the flush is complete */
            if (s->mode == BZ_M_FLUSHING && s->avail_in_expect == 0 && isempty_RL(s))
                break;
        }
        if (s->state == BZ_S_INPUT) {
            progress_in |= copy_input_until_stop(s);
            if (s->mode != BZ_M_RUNNING && s->avail_in_expect == 0) {
                /* flush/finish: emit the pending run and compress what we
                   have; the second argument is True only when finishing
                   (presumably marks the last block -- confirm against
                   BZ2_compressBlock) */
                flush_RL(s);
                BZ2_compressBlock(s, (Bool)(s->mode == BZ_M_FINISHING));
                s->state = BZ_S_OUTPUT;
            }
            else if (s->nblock >= s->nblockMAX) {
                /* block is full: compress it as a non-final block */
                BZ2_compressBlock(s, False);
                s->state = BZ_S_OUTPUT;
            }
            else if (s->strm->avail_in == 0) {
                /* block not full and no more input available right now */
                break;
            }
        }
    }

    return progress_in || progress_out;
}
BZ_RUN_OK : BZ_PARAM_ERROR; } else if (action == BZ_FLUSH) { s->avail_in_expect = strm->avail_in; s->mode = BZ_M_FLUSHING; goto preswitch; } else if (action == BZ_FINISH) { s->avail_in_expect = strm->avail_in; s->mode = BZ_M_FINISHING; goto preswitch; } else return BZ_PARAM_ERROR; case BZ_M_FLUSHING: if (action != BZ_FLUSH) return BZ_SEQUENCE_ERROR; if (s->avail_in_expect != s->strm->avail_in) return BZ_SEQUENCE_ERROR; progress = handle_compress(strm); if (s->avail_in_expect > 0 || !isempty_RL(s) || s->state_out_pos < s->numZ) return BZ_FLUSH_OK; s->mode = BZ_M_RUNNING; return BZ_RUN_OK; case BZ_M_FINISHING: if (action != BZ_FINISH) return BZ_SEQUENCE_ERROR; if (s->avail_in_expect != s->strm->avail_in) return BZ_SEQUENCE_ERROR; progress = handle_compress(strm); if (!progress) return BZ_SEQUENCE_ERROR; if (s->avail_in_expect > 0 || !isempty_RL(s) || s->state_out_pos < s->numZ) return BZ_FINISH_OK; s->mode = BZ_M_IDLE; return BZ_STREAM_END; } return BZ_OK; /*--not reached--*/ } /*---------------------------------------------------*/ int BZ_API(BZ2_bzCompressEnd)(bz_stream* strm) { EState* s; if (strm == nullptr) return BZ_PARAM_ERROR; s = (EState*)strm->state; if (s == nullptr) return BZ_PARAM_ERROR; if (s->strm != strm) return BZ_PARAM_ERROR; if (s->arr1 != nullptr) BZFREE(s->arr1); if (s->arr2 != nullptr) BZFREE(s->arr2); if (s->ftab != nullptr) BZFREE(s->ftab); BZFREE(strm->state); strm->state = nullptr; return BZ_OK; } /*---------------------------------------------------*/ /*--- Decompression stuff ---*/ /*---------------------------------------------------*/ /*---------------------------------------------------*/ int BZ_API(BZ2_bzDecompressInit)(bz_stream* strm, int verbosity, int small) { DState* s; if (!bz_config_ok()) return BZ_CONFIG_ERROR; if (strm == nullptr) return BZ_PARAM_ERROR; if (small != 0 && small != 1) return BZ_PARAM_ERROR; if (verbosity < 0 || verbosity > 4) return BZ_PARAM_ERROR; if (strm->bzalloc == nullptr) strm->bzalloc = default_bzalloc; 
if (strm->bzfree == nullptr) strm->bzfree = default_bzfree; s = (DState*)BZALLOC(sizeof(DState)); if (s == nullptr) return BZ_MEM_ERROR; s->strm = strm; strm->state = s; s->state = BZ_X_MAGIC_1; s->bsLive = 0; s->bsBuff = 0; s->calculatedCombinedCRC = 0; strm->total_in_lo32 = 0; strm->total_in_hi32 = 0; strm->total_out_lo32 = 0; strm->total_out_hi32 = 0; s->smallDecompress = (Bool)small; s->ll4 = nullptr; s->ll16 = nullptr; s->tt = nullptr; s->currBlockNo = 0; s->verbosity = verbosity; return BZ_OK; } /*---------------------------------------------------*/ /* Return True iff data corruption is discovered. Returns False if there is no problem. */ static Bool unRLE_obuf_to_output_FAST(DState* s) { UChar k1; if (s->blockRandomised) { while (True) { /* try to finish existing run */ while (True) { if (s->strm->avail_out == 0) return False; if (s->state_out_len == 0) break; *((UChar*)(s->strm->next_out)) = s->state_out_ch; BZ_UPDATE_CRC(s->calculatedBlockCRC, s->state_out_ch); s->state_out_len--; s->strm->next_out++; s->strm->avail_out--; s->strm->total_out_lo32++; if (s->strm->total_out_lo32 == 0) s->strm->total_out_hi32++; } /* can a new run be started? */ if (s->nblock_used == s->save_nblock + 1) return False; /* Only caused by corrupt data stream? 
*/ if (s->nblock_used > s->save_nblock + 1) return True; s->state_out_len = 1; s->state_out_ch = s->k0; BZ_GET_FAST(k1); BZ_RAND_UPD_MASK; k1 ^= BZ_RAND_MASK; s->nblock_used++; if (s->nblock_used == s->save_nblock + 1) continue; if (k1 != s->k0) { s->k0 = k1; continue; }; s->state_out_len = 2; BZ_GET_FAST(k1); BZ_RAND_UPD_MASK; k1 ^= BZ_RAND_MASK; s->nblock_used++; if (s->nblock_used == s->save_nblock + 1) continue; if (k1 != s->k0) { s->k0 = k1; continue; }; s->state_out_len = 3; BZ_GET_FAST(k1); BZ_RAND_UPD_MASK; k1 ^= BZ_RAND_MASK; s->nblock_used++; if (s->nblock_used == s->save_nblock + 1) continue; if (k1 != s->k0) { s->k0 = k1; continue; }; BZ_GET_FAST(k1); BZ_RAND_UPD_MASK; k1 ^= BZ_RAND_MASK; s->nblock_used++; s->state_out_len = ((Int32)k1) + 4; BZ_GET_FAST(s->k0); BZ_RAND_UPD_MASK; s->k0 ^= BZ_RAND_MASK; s->nblock_used++; } } else { /* restore */ UInt32 c_calculatedBlockCRC = s->calculatedBlockCRC; UChar c_state_out_ch = s->state_out_ch; Int32 c_state_out_len = s->state_out_len; Int32 c_nblock_used = s->nblock_used; Int32 c_k0 = s->k0; UInt32* c_tt = s->tt; UInt32 c_tPos = s->tPos; char* cs_next_out = s->strm->next_out; unsigned int cs_avail_out = s->strm->avail_out; Int32 ro_blockSize100k = s->blockSize100k; /* end restore */ UInt32 avail_out_INIT = cs_avail_out; Int32 s_save_nblockPP = s->save_nblock + 1; unsigned int total_out_lo32_old; while (True) { /* try to finish existing run */ if (c_state_out_len > 0) { while (True) { if (cs_avail_out == 0) goto return_notr; if (c_state_out_len == 1) break; *((UChar*)(cs_next_out)) = c_state_out_ch; BZ_UPDATE_CRC(c_calculatedBlockCRC, c_state_out_ch); c_state_out_len--; cs_next_out++; cs_avail_out--; } s_state_out_len_eq_one : { if (cs_avail_out == 0) { c_state_out_len = 1; goto return_notr; }; *((UChar*)(cs_next_out)) = c_state_out_ch; BZ_UPDATE_CRC(c_calculatedBlockCRC, c_state_out_ch); cs_next_out++; cs_avail_out--; } } /* Only caused by corrupt data stream? 
*/ if (c_nblock_used > s_save_nblockPP) return True; /* can a new run be started? */ if (c_nblock_used == s_save_nblockPP) { c_state_out_len = 0; goto return_notr; }; c_state_out_ch = c_k0; BZ_GET_FAST_C(k1); c_nblock_used++; if (k1 != c_k0) { c_k0 = k1; goto s_state_out_len_eq_one; }; if (c_nblock_used == s_save_nblockPP) goto s_state_out_len_eq_one; c_state_out_len = 2; BZ_GET_FAST_C(k1); c_nblock_used++; if (c_nblock_used == s_save_nblockPP) continue; if (k1 != c_k0) { c_k0 = k1; continue; }; c_state_out_len = 3; BZ_GET_FAST_C(k1); c_nblock_used++; if (c_nblock_used == s_save_nblockPP) continue; if (k1 != c_k0) { c_k0 = k1; continue; }; BZ_GET_FAST_C(k1); c_nblock_used++; c_state_out_len = ((Int32)k1) + 4; BZ_GET_FAST_C(c_k0); c_nblock_used++; } return_notr: total_out_lo32_old = s->strm->total_out_lo32; s->strm->total_out_lo32 += (avail_out_INIT - cs_avail_out); if (s->strm->total_out_lo32 < total_out_lo32_old) s->strm->total_out_hi32++; /* save */ s->calculatedBlockCRC = c_calculatedBlockCRC; s->state_out_ch = c_state_out_ch; s->state_out_len = c_state_out_len; s->nblock_used = c_nblock_used; s->k0 = c_k0; s->tt = c_tt; s->tPos = c_tPos; s->strm->next_out = cs_next_out; s->strm->avail_out = cs_avail_out; /* end save */ } return False; } /*---------------------------------------------------*/ Int32 BZ2_indexIntoF(Int32 indx, Int32* cftab) { Int32 nb, na, mid; nb = 0; na = 256; do { mid = (nb + na) >> 1; if (indx >= cftab[mid]) nb = mid; else na = mid; } while (na - nb != 1); return nb; } /*---------------------------------------------------*/ /* Return True iff data corruption is discovered. Returns False if there is no problem. 
*/ static Bool unRLE_obuf_to_output_SMALL(DState* s) { UChar k1; if (s->blockRandomised) { while (True) { /* try to finish existing run */ while (True) { if (s->strm->avail_out == 0) return False; if (s->state_out_len == 0) break; *((UChar*)(s->strm->next_out)) = s->state_out_ch; BZ_UPDATE_CRC(s->calculatedBlockCRC, s->state_out_ch); s->state_out_len--; s->strm->next_out++; s->strm->avail_out--; s->strm->total_out_lo32++; if (s->strm->total_out_lo32 == 0) s->strm->total_out_hi32++; } /* can a new run be started? */ if (s->nblock_used == s->save_nblock + 1) return False; /* Only caused by corrupt data stream? */ if (s->nblock_used > s->save_nblock + 1) return True; s->state_out_len = 1; s->state_out_ch = s->k0; BZ_GET_SMALL(k1); BZ_RAND_UPD_MASK; k1 ^= BZ_RAND_MASK; s->nblock_used++; if (s->nblock_used == s->save_nblock + 1) continue; if (k1 != s->k0) { s->k0 = k1; continue; }; s->state_out_len = 2; BZ_GET_SMALL(k1); BZ_RAND_UPD_MASK; k1 ^= BZ_RAND_MASK; s->nblock_used++; if (s->nblock_used == s->save_nblock + 1) continue; if (k1 != s->k0) { s->k0 = k1; continue; }; s->state_out_len = 3; BZ_GET_SMALL(k1); BZ_RAND_UPD_MASK; k1 ^= BZ_RAND_MASK; s->nblock_used++; if (s->nblock_used == s->save_nblock + 1) continue; if (k1 != s->k0) { s->k0 = k1; continue; }; BZ_GET_SMALL(k1); BZ_RAND_UPD_MASK; k1 ^= BZ_RAND_MASK; s->nblock_used++; s->state_out_len = ((Int32)k1) + 4; BZ_GET_SMALL(s->k0); BZ_RAND_UPD_MASK; s->k0 ^= BZ_RAND_MASK; s->nblock_used++; } } else { while (True) { /* try to finish existing run */ while (True) { if (s->strm->avail_out == 0) return False; if (s->state_out_len == 0) break; *((UChar*)(s->strm->next_out)) = s->state_out_ch; BZ_UPDATE_CRC(s->calculatedBlockCRC, s->state_out_ch); s->state_out_len--; s->strm->next_out++; s->strm->avail_out--; s->strm->total_out_lo32++; if (s->strm->total_out_lo32 == 0) s->strm->total_out_hi32++; } /* can a new run be started? */ if (s->nblock_used == s->save_nblock + 1) return False; /* Only caused by corrupt data stream? 
*/ if (s->nblock_used > s->save_nblock + 1) return True; s->state_out_len = 1; s->state_out_ch = s->k0; BZ_GET_SMALL(k1); s->nblock_used++; if (s->nblock_used == s->save_nblock + 1) continue; if (k1 != s->k0) { s->k0 = k1; continue; }; s->state_out_len = 2; BZ_GET_SMALL(k1); s->nblock_used++; if (s->nblock_used == s->save_nblock + 1) continue; if (k1 != s->k0) { s->k0 = k1; continue; }; s->state_out_len = 3; BZ_GET_SMALL(k1); s->nblock_used++; if (s->nblock_used == s->save_nblock + 1) continue; if (k1 != s->k0) { s->k0 = k1; continue; }; BZ_GET_SMALL(k1); s->nblock_used++; s->state_out_len = ((Int32)k1) + 4; BZ_GET_SMALL(s->k0); s->nblock_used++; } } } /*---------------------------------------------------*/ int BZ_API(BZ2_bzDecompress)(bz_stream* strm) { Bool corrupt; DState* s; if (strm == nullptr) return BZ_PARAM_ERROR; s = (DState*)strm->state; if (s == nullptr) return BZ_PARAM_ERROR; if (s->strm != strm) return BZ_PARAM_ERROR; while (True) { if (s->state == BZ_X_IDLE) return BZ_SEQUENCE_ERROR; if (s->state == BZ_X_OUTPUT) { if (s->smallDecompress) corrupt = unRLE_obuf_to_output_SMALL(s); else corrupt = unRLE_obuf_to_output_FAST(s); if (corrupt) return BZ_DATA_ERROR; if (s->nblock_used == s->save_nblock + 1 && s->state_out_len == 0) { BZ_FINALISE_CRC(s->calculatedBlockCRC); if (s->verbosity >= 3) VPrintf2(" {0x%08x, 0x%08x}", s->storedBlockCRC, s->calculatedBlockCRC); if (s->verbosity >= 2) VPrintf0("]"); if (s->calculatedBlockCRC != s->storedBlockCRC) return BZ_DATA_ERROR; s->calculatedCombinedCRC = (s->calculatedCombinedCRC << 1) | (s->calculatedCombinedCRC >> 31); s->calculatedCombinedCRC ^= s->calculatedBlockCRC; s->state = BZ_X_BLKHDR_1; } else { return BZ_OK; } } if (s->state >= BZ_X_MAGIC_1) { Int32 r = BZ2_decompress(s); if (r == BZ_STREAM_END) { if (s->verbosity >= 3) VPrintf2("\n combined CRCs: stored = 0x%08x, computed = 0x%08x", s->storedCombinedCRC, s->calculatedCombinedCRC); if (s->calculatedCombinedCRC != s->storedCombinedCRC) return 
BZ_DATA_ERROR; return r; } if (s->state != BZ_X_OUTPUT) return r; } } AssertH(0, 6001); return 0; /*NOTREACHED*/ } /*---------------------------------------------------*/ int BZ_API(BZ2_bzDecompressEnd)(bz_stream* strm) { DState* s; if (strm == nullptr) return BZ_PARAM_ERROR; s = (DState*)strm->state; if (s == nullptr) return BZ_PARAM_ERROR; if (s->strm != strm) return BZ_PARAM_ERROR; if (s->tt != nullptr) BZFREE(s->tt); if (s->ll16 != nullptr) BZFREE(s->ll16); if (s->ll4 != nullptr) BZFREE(s->ll4); BZFREE(strm->state); strm->state = nullptr; return BZ_OK; } #ifndef BZ_NO_STDIO /*---------------------------------------------------*/ /*--- File I/O stuff ---*/ /*---------------------------------------------------*/ #define BZ_SETERR(eee) \ { \ if (bzerror != nullptr) \ *bzerror = eee; \ if (bzf != nullptr) \ bzf->lastErr = eee; \ } typedef struct { FILE* handle; Char buf[BZ_MAX_UNUSED]; Int32 bufN; Bool writing; bz_stream strm; Int32 lastErr; Bool initialisedOk; } bzFile; /*---------------------------------------------*/ static Bool myfeof(FILE* f) { Int32 c = fgetc(f); if (c == EOF) return True; ungetc(c, f); return False; } /*---------------------------------------------------*/ BZFILE* BZ_API( BZ2_bzWriteOpen)(int* bzerror, FILE* f, int blockSize100k, int verbosity, int workFactor) { Int32 ret; bzFile* bzf = nullptr; BZ_SETERR(BZ_OK); if (f == nullptr || (blockSize100k < 1 || blockSize100k > 9) || (workFactor < 0 || workFactor > 250) || (verbosity < 0 || verbosity > 4)) { BZ_SETERR(BZ_PARAM_ERROR); return nullptr; }; if (ferror(f)) { BZ_SETERR(BZ_IO_ERROR); return nullptr; }; bzf = (bzFile*)malloc(sizeof(bzFile)); if (bzf == nullptr) { BZ_SETERR(BZ_MEM_ERROR); return nullptr; }; BZ_SETERR(BZ_OK); bzf->initialisedOk = False; bzf->bufN = 0; bzf->handle = f; bzf->writing = True; bzf->strm.bzalloc = nullptr; bzf->strm.bzfree = nullptr; bzf->strm.opaque = nullptr; if (workFactor == 0) workFactor = 30; ret = BZ2_bzCompressInit(&(bzf->strm), blockSize100k, verbosity, 
workFactor); if (ret != BZ_OK) { BZ_SETERR(ret); free(bzf); return nullptr; }; bzf->strm.avail_in = 0; bzf->initialisedOk = True; return bzf; } /*---------------------------------------------------*/ void BZ_API(BZ2_bzWrite)(int* bzerror, BZFILE* b, void* buf, int len) { Int32 n, n2, ret; bzFile* bzf = (bzFile*)b; BZ_SETERR(BZ_OK); if (bzf == nullptr || buf == nullptr || len < 0) { BZ_SETERR(BZ_PARAM_ERROR); return; }; if (!(bzf->writing)) { BZ_SETERR(BZ_SEQUENCE_ERROR); return; }; if (ferror(bzf->handle)) { BZ_SETERR(BZ_IO_ERROR); return; }; if (len == 0) { BZ_SETERR(BZ_OK); return; }; bzf->strm.avail_in = len; bzf->strm.next_in = (char*)buf; while (True) { bzf->strm.avail_out = BZ_MAX_UNUSED; bzf->strm.next_out = bzf->buf; ret = BZ2_bzCompress(&(bzf->strm), BZ_RUN); if (ret != BZ_RUN_OK) { BZ_SETERR(ret); return; }; if (bzf->strm.avail_out < BZ_MAX_UNUSED) { n = BZ_MAX_UNUSED - bzf->strm.avail_out; n2 = fwrite((void*)(bzf->buf), sizeof(UChar), n, bzf->handle); if (n != n2 || ferror(bzf->handle)) { BZ_SETERR(BZ_IO_ERROR); return; }; } if (bzf->strm.avail_in == 0) { BZ_SETERR(BZ_OK); return; }; } } /*---------------------------------------------------*/ void BZ_API(BZ2_bzWriteClose)(int* bzerror, BZFILE* b, int abandon, unsigned int* nbytes_in, unsigned int* nbytes_out) { BZ2_bzWriteClose64(bzerror, b, abandon, nbytes_in, nullptr, nbytes_out, nullptr); } void BZ_API(BZ2_bzWriteClose64)(int* bzerror, BZFILE* b, int abandon, unsigned int* nbytes_in_lo32, unsigned int* nbytes_in_hi32, unsigned int* nbytes_out_lo32, unsigned int* nbytes_out_hi32) { Int32 n, n2, ret; bzFile* bzf = (bzFile*)b; if (bzf == nullptr) { BZ_SETERR(BZ_OK); return; }; if (!(bzf->writing)) { BZ_SETERR(BZ_SEQUENCE_ERROR); return; }; if (ferror(bzf->handle)) { BZ_SETERR(BZ_IO_ERROR); return; }; if (nbytes_in_lo32 != nullptr) *nbytes_in_lo32 = 0; if (nbytes_in_hi32 != nullptr) *nbytes_in_hi32 = 0; if (nbytes_out_lo32 != nullptr) *nbytes_out_lo32 = 0; if (nbytes_out_hi32 != nullptr) *nbytes_out_hi32 
= 0; if ((!abandon) && bzf->lastErr == BZ_OK) { while (True) { bzf->strm.avail_out = BZ_MAX_UNUSED; bzf->strm.next_out = bzf->buf; ret = BZ2_bzCompress(&(bzf->strm), BZ_FINISH); if (ret != BZ_FINISH_OK && ret != BZ_STREAM_END) { BZ_SETERR(ret); return; }; if (bzf->strm.avail_out < BZ_MAX_UNUSED) { n = BZ_MAX_UNUSED - bzf->strm.avail_out; n2 = fwrite((void*)(bzf->buf), sizeof(UChar), n, bzf->handle); if (n != n2 || ferror(bzf->handle)) { BZ_SETERR(BZ_IO_ERROR); return; }; } if (ret == BZ_STREAM_END) break; } } if (!abandon && !ferror(bzf->handle)) { fflush(bzf->handle); if (ferror(bzf->handle)) { BZ_SETERR(BZ_IO_ERROR); return; }; } if (nbytes_in_lo32 != nullptr) *nbytes_in_lo32 = bzf->strm.total_in_lo32; if (nbytes_in_hi32 != nullptr) *nbytes_in_hi32 = bzf->strm.total_in_hi32; if (nbytes_out_lo32 != nullptr) *nbytes_out_lo32 = bzf->strm.total_out_lo32; if (nbytes_out_hi32 != nullptr) *nbytes_out_hi32 = bzf->strm.total_out_hi32; BZ_SETERR(BZ_OK); BZ2_bzCompressEnd(&(bzf->strm)); free(bzf); } /*---------------------------------------------------*/ BZFILE* BZ_API( BZ2_bzReadOpen)(int* bzerror, FILE* f, int verbosity, int small, void* unused, int nUnused) { bzFile* bzf = nullptr; int ret; BZ_SETERR(BZ_OK); if (f == nullptr || (small != 0 && small != 1) || (verbosity < 0 || verbosity > 4) || (unused == nullptr && nUnused != 0) || (unused != nullptr && (nUnused < 0 || nUnused > BZ_MAX_UNUSED))) { BZ_SETERR(BZ_PARAM_ERROR); return nullptr; }; if (ferror(f)) { BZ_SETERR(BZ_IO_ERROR); return nullptr; }; bzf = (bzFile*)malloc(sizeof(bzFile)); if (bzf == nullptr) { BZ_SETERR(BZ_MEM_ERROR); return nullptr; }; BZ_SETERR(BZ_OK); bzf->initialisedOk = False; bzf->handle = f; bzf->bufN = 0; bzf->writing = False; bzf->strm.bzalloc = nullptr; bzf->strm.bzfree = nullptr; bzf->strm.opaque = nullptr; while (nUnused > 0) { bzf->buf[bzf->bufN] = *((UChar*)(unused)); bzf->bufN++; unused = ((void*)(1 + ((UChar*)(unused)))); nUnused--; } ret = BZ2_bzDecompressInit(&(bzf->strm), verbosity, 
small); if (ret != BZ_OK) { BZ_SETERR(ret); free(bzf); return nullptr; }; bzf->strm.avail_in = bzf->bufN; bzf->strm.next_in = bzf->buf; bzf->initialisedOk = True; return bzf; } /*---------------------------------------------------*/ void BZ_API(BZ2_bzReadClose)(int* bzerror, BZFILE* b) { bzFile* bzf = (bzFile*)b; BZ_SETERR(BZ_OK); if (bzf == nullptr) { BZ_SETERR(BZ_OK); return; }; if (bzf->writing) { BZ_SETERR(BZ_SEQUENCE_ERROR); return; }; if (bzf->initialisedOk) (void)BZ2_bzDecompressEnd(&(bzf->strm)); free(bzf); } /*---------------------------------------------------*/ int BZ_API(BZ2_bzRead)(int* bzerror, BZFILE* b, void* buf, int len) { Int32 n, ret; bzFile* bzf = (bzFile*)b; BZ_SETERR(BZ_OK); if (bzf == nullptr || buf == nullptr || len < 0) { BZ_SETERR(BZ_PARAM_ERROR); return 0; }; if (bzf->writing) { BZ_SETERR(BZ_SEQUENCE_ERROR); return 0; }; if (len == 0) { BZ_SETERR(BZ_OK); return 0; }; bzf->strm.avail_out = len; bzf->strm.next_out = (char*)buf; while (True) { if (ferror(bzf->handle)) { BZ_SETERR(BZ_IO_ERROR); return 0; }; if (bzf->strm.avail_in == 0 && !myfeof(bzf->handle)) { n = fread(bzf->buf, sizeof(UChar), BZ_MAX_UNUSED, bzf->handle); if (ferror(bzf->handle)) { BZ_SETERR(BZ_IO_ERROR); return 0; }; bzf->bufN = n; bzf->strm.avail_in = bzf->bufN; bzf->strm.next_in = bzf->buf; } ret = BZ2_bzDecompress(&(bzf->strm)); if (ret != BZ_OK && ret != BZ_STREAM_END) { BZ_SETERR(ret); return 0; }; if (ret == BZ_OK && myfeof(bzf->handle) && bzf->strm.avail_in == 0 && bzf->strm.avail_out > 0) { BZ_SETERR(BZ_UNEXPECTED_EOF); return 0; }; if (ret == BZ_STREAM_END) { BZ_SETERR(BZ_STREAM_END); return len - bzf->strm.avail_out; }; if (bzf->strm.avail_out == 0) { BZ_SETERR(BZ_OK); return len; }; } return 0; /*not reached*/ } /*---------------------------------------------------*/ void BZ_API(BZ2_bzReadGetUnused)(int* bzerror, BZFILE* b, void** unused, int* nUnused) { bzFile* bzf = (bzFile*)b; if (bzf == nullptr) { BZ_SETERR(BZ_PARAM_ERROR); return; }; if (bzf->lastErr != 
BZ_STREAM_END) { BZ_SETERR(BZ_SEQUENCE_ERROR); return; }; if (unused == nullptr || nUnused == nullptr) { BZ_SETERR(BZ_PARAM_ERROR); return; }; BZ_SETERR(BZ_OK); *nUnused = bzf->strm.avail_in; *unused = bzf->strm.next_in; } #endif /*---------------------------------------------------*/ /*--- Misc convenience stuff ---*/ /*---------------------------------------------------*/ /*---------------------------------------------------*/ int BZ_API(BZ2_bzBuffToBuffCompress)(char* dest, unsigned int* destLen, char* source, unsigned int sourceLen, int blockSize100k, int verbosity, int workFactor) { bz_stream strm; int ret; if (dest == nullptr || destLen == nullptr || source == nullptr || blockSize100k < 1 || blockSize100k > 9 || verbosity < 0 || verbosity > 4 || workFactor < 0 || workFactor > 250) return BZ_PARAM_ERROR; if (workFactor == 0) workFactor = 30; strm.bzalloc = nullptr; strm.bzfree = nullptr; strm.opaque = nullptr; ret = BZ2_bzCompressInit(&strm, blockSize100k, verbosity, workFactor); if (ret != BZ_OK) return ret; strm.next_in = source; strm.next_out = dest; strm.avail_in = sourceLen; strm.avail_out = *destLen; ret = BZ2_bzCompress(&strm, BZ_FINISH); if (ret == BZ_FINISH_OK) goto output_overflow; if (ret != BZ_STREAM_END) goto errhandler; /* normal termination */ *destLen -= strm.avail_out; BZ2_bzCompressEnd(&strm); return BZ_OK; output_overflow: BZ2_bzCompressEnd(&strm); return BZ_OUTBUFF_FULL; errhandler: BZ2_bzCompressEnd(&strm); return ret; } /*---------------------------------------------------*/ int BZ_API(BZ2_bzBuffToBuffDecompress)(char* dest, unsigned int* destLen, char* source, unsigned int sourceLen, int small, int verbosity) { bz_stream strm; int ret; if (dest == nullptr || destLen == nullptr || source == nullptr || (small != 0 && small != 1) || verbosity < 0 || verbosity > 4) return BZ_PARAM_ERROR; strm.bzalloc = nullptr; strm.bzfree = nullptr; strm.opaque = nullptr; ret = BZ2_bzDecompressInit(&strm, verbosity, small); if (ret != BZ_OK) return ret; 
strm.next_in = source; strm.next_out = dest; strm.avail_in = sourceLen; strm.avail_out = *destLen; ret = BZ2_bzDecompress(&strm); if (ret == BZ_OK) goto output_overflow_or_eof; if (ret != BZ_STREAM_END) goto errhandler; /* normal termination */ *destLen -= strm.avail_out; BZ2_bzDecompressEnd(&strm); return BZ_OK; output_overflow_or_eof: if (strm.avail_out > 0) { BZ2_bzDecompressEnd(&strm); return BZ_UNEXPECTED_EOF; } else { BZ2_bzDecompressEnd(&strm); return BZ_OUTBUFF_FULL; }; errhandler: BZ2_bzDecompressEnd(&strm); return ret; } /*---------------------------------------------------*/ /*-- Code contributed by Yoshioka Tsuneo (tsuneo@rr.iij4u.or.jp) to support better zlib compatibility. This code is not _officially_ part of libbzip2 (yet); I haven't tested it, documented it, or considered the threading-safeness of it. If this code breaks, please contact both Yoshioka and me. --*/ /*---------------------------------------------------*/ /*---------------------------------------------------*/ /*-- return version like "0.9.5d, 4-Sept-1999". 
--*/ const char* BZ_API(BZ2_bzlibVersion)(void) { return BZ_VERSION; } #ifndef BZ_NO_STDIO /*---------------------------------------------------*/ #if defined(_WIN32) || defined(OS2) || defined(MSDOS) #include #include #define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY) #else #define SET_BINARY_MODE(file) #endif static BZFILE* bzopen_or_bzdopen(const char* path, /* no use when bzdopen */ int fd, /* no use when bzdopen */ const char* mode, int open_mode) /* bzopen: 0, bzdopen:1 */ { int bzerr; char unused[BZ_MAX_UNUSED]; int blockSize100k = 9; int writing = 0; char mode2[10] = ""; FILE* fp = nullptr; BZFILE* bzfp = nullptr; int verbosity = 0; int workFactor = 30; int smallMode = 0; int nUnused = 0; if (mode == nullptr) return nullptr; while (*mode) { switch (*mode) { case 'r': writing = 0; break; case 'w': writing = 1; break; case 's': smallMode = 1; break; default: if (isdigit((int)(*mode))) { blockSize100k = *mode - BZ_HDR_0; } } mode++; } strcat(mode2, writing ? "w" : "r"); strcat(mode2, "b"); /* binary mode */ if (open_mode == 0) { if (path == nullptr || strcmp(path, "") == 0) { fp = (writing ? stdout : stdin); SET_BINARY_MODE(fp); } else { fp = fopen(path, mode2); } } else { #ifdef BZ_STRICT_ANSI fp = nullptr; #else fp = fdopen(fd, mode2); #endif } if (fp == nullptr) return nullptr; if (writing) { /* Guard against total chaos and anarchy -- JRS */ if (blockSize100k < 1) blockSize100k = 1; if (blockSize100k > 9) blockSize100k = 9; bzfp = BZ2_bzWriteOpen(&bzerr, fp, blockSize100k, verbosity, workFactor); } else { bzfp = BZ2_bzReadOpen(&bzerr, fp, verbosity, smallMode, unused, nUnused); } if (bzfp == nullptr) { if (fp != stdin && fp != stdout) fclose(fp); return nullptr; } return bzfp; } /*---------------------------------------------------*/ /*-- open file for read or write. ex) bzopen("file","w9") case path="" or nullptr => use stdin or stdout. 
/*
   zlib-style read wrapper around BZ2_bzRead().

   Returns the number of bytes stored in `buf`, 0 once the handle has
   already reported BZ_STREAM_END, or -1 on error.  A null handle is
   treated as an error (-1): BZ2_bzRead() would report BZ_PARAM_ERROR
   for it, but the end-of-stream check below has to look at the handle
   first, so the null case is rejected up front.
*/
int BZ_API(BZ2_bzread)(BZFILE* b, void* buf, int len) {
    int bzerr, nread;

    if (b == nullptr)
        return -1;

    /* nothing more to deliver after the end of the stream */
    if (((bzFile*)b)->lastErr == BZ_STREAM_END)
        return 0;

    nread = BZ2_bzRead(&bzerr, b, buf, len);
    if (bzerr == BZ_OK || bzerr == BZ_STREAM_END) {
        return nread;
    }
    else {
        return -1;
    }
}
/*
   Report the last error recorded on handle `b`.

   Stores the error code in *errnum and returns the matching entry of
   bzerrorstrings.  Positive lastErr values are reported as 0 ("OK").
*/
const char* BZ_API(BZ2_bzerror)(BZFILE* b, int* errnum) {
    const int last = ((bzFile*)b)->lastErr;
    const int code = (last > 0) ? 0 : last;

    *errnum = code;

    /* error codes are <= 0; the table is indexed by their magnitude */
    return bzerrorstrings[-code];
}
The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 3. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 4. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
Julian Seward, jseward@bzip.org
bzip2/libbzip2 version 1.0.6 of 6 September 2010
------------------------------------------------------------------ */

#ifndef _BZLIB_H
#define _BZLIB_H

#ifdef __cplusplus
extern "C" {
#endif

/* Action selectors.
   NOTE(review): presumably the values accepted as the `action` argument of
   BZ2_bzCompress() -- confirm against bzlib.cpp. */
#define BZ_RUN 0
#define BZ_FLUSH 1
#define BZ_FINISH 2

/* Status codes. Non-negative values are non-error results; negative values
   are errors. BZ2_bzerror() relies on this split: it clamps positive codes to
   0 and uses the negated code as an index into its message table. */
#define BZ_OK 0
#define BZ_RUN_OK 1
#define BZ_FLUSH_OK 2
#define BZ_FINISH_OK 3
#define BZ_STREAM_END 4
#define BZ_SEQUENCE_ERROR (-1)
#define BZ_PARAM_ERROR (-2)
#define BZ_MEM_ERROR (-3)
#define BZ_DATA_ERROR (-4)
#define BZ_DATA_ERROR_MAGIC (-5)
#define BZ_IO_ERROR (-6)
#define BZ_UNEXPECTED_EOF (-7)
#define BZ_OUTBUFF_FULL (-8)
#define BZ_CONFIG_ERROR (-9)

/* Stream descriptor passed to every low-level entry point declared below.
   NOTE(review): the next_ / avail_ / total_ fields look like the usual
   zlib-style input and output cursors with 64-bit byte totals split into two
   32-bit halves -- confirm against bzlib.cpp. */
typedef struct {
    char* next_in;
    unsigned int avail_in;
    unsigned int total_in_lo32;
    unsigned int total_in_hi32;

    char* next_out;
    unsigned int avail_out;
    unsigned int total_out_lo32;
    unsigned int total_out_hi32;

    /* library-private state; presumably an EState or DState -- TODO confirm */
    void* state;

    /* Allocator hooks. The library's BZALLOC / BZFREE macros call these with
       `opaque` as the first argument. */
    void* (*bzalloc)(void*, int, int);
    void (*bzfree)(void*, void*);
    void* opaque;
} bz_stream;

/* Unless the includer asks for import mode, build in export mode. */
#ifndef BZ_IMPORT
#define BZ_EXPORT
#endif

#ifndef BZ_NO_STDIO
/* Need a definition for FILE */
/* NOTE(review): the target of this #include is missing in this copy of the
   file (it looks like the angle-bracket header name was lost); presumably the
   C stdio header -- restore from upstream. */
#include
#endif

#ifdef _WIN32
/* NOTE(review): include target missing here as well; presumably windows.h,
   given the comment on `small` just below -- restore from upstream. */
#include
#ifdef small
/* windows.h define small to char */
#undef small
#endif
#ifdef BZ_EXPORT
/* Exporting: ordinary WINAPI functions. */
#define BZ_API(func) WINAPI func
#define BZ_EXTERN extern
#else
/* import windows dll dynamically */
/* Each API name becomes a WINAPI function *pointer* definition. */
#define BZ_API(func) (WINAPI * func)
#define BZ_EXTERN
#endif
#else
/* Everywhere else BZ_API is transparent. */
#define BZ_API(func) func
#define BZ_EXTERN extern
#endif

/*-- Core (low-level) library functions --*/

BZ_EXTERN int BZ_API(BZ2_bzCompressInit)(bz_stream* strm, int blockSize100k, int verbosity, int workFactor);

BZ_EXTERN int BZ_API(BZ2_bzCompress)(bz_stream* strm, int action);

BZ_EXTERN int BZ_API(BZ2_bzCompressEnd)(bz_stream* strm);

BZ_EXTERN int BZ_API(BZ2_bzDecompressInit)(bz_stream* strm, int verbosity, int small);

BZ_EXTERN int BZ_API(BZ2_bzDecompress)(bz_stream* strm);

BZ_EXTERN int BZ_API(BZ2_bzDecompressEnd)(bz_stream* strm);

/*-- High(er) level library functions --*/

#ifndef BZ_NO_STDIO
#define BZ_MAX_UNUSED 5000

/* Opaque handle used by the FILE-based API. */
typedef void BZFILE;

BZ_EXTERN BZFILE* BZ_API( BZ2_bzReadOpen)(int* bzerror, FILE* f, int verbosity, int small, void* unused, int nUnused);

BZ_EXTERN void BZ_API(BZ2_bzReadClose)(int* bzerror, BZFILE* b);

BZ_EXTERN void BZ_API(BZ2_bzReadGetUnused)(int* bzerror, BZFILE* b, void** unused, int* nUnused);

BZ_EXTERN int BZ_API(BZ2_bzRead)(int* bzerror, BZFILE* b, void* buf, int len);

BZ_EXTERN BZFILE* BZ_API( BZ2_bzWriteOpen)(int* bzerror, FILE* f, int blockSize100k, int verbosity, int workFactor);

BZ_EXTERN void BZ_API(BZ2_bzWrite)(int* bzerror, BZFILE* b, void* buf, int len);

BZ_EXTERN void BZ_API(BZ2_bzWriteClose)(int* bzerror, BZFILE* b, int abandon, unsigned int* nbytes_in, unsigned int* nbytes_out);

BZ_EXTERN void BZ_API(BZ2_bzWriteClose64)(int* bzerror, BZFILE* b, int abandon, unsigned int* nbytes_in_lo32, unsigned int* nbytes_in_hi32, unsigned int* nbytes_out_lo32, unsigned int* nbytes_out_hi32);
#endif

/*-- Utility functions --*/

BZ_EXTERN int BZ_API(BZ2_bzBuffToBuffCompress)(char* dest, unsigned int* destLen, char* source, unsigned int sourceLen, int blockSize100k, int verbosity, int workFactor);

BZ_EXTERN int BZ_API(BZ2_bzBuffToBuffDecompress)(char* dest, unsigned int* destLen, char* source, unsigned int sourceLen, int small, int verbosity);

/*-- Code contributed by Yoshioka Tsuneo (tsuneo@rr.iij4u.or.jp) to support better zlib compatibility. This code is not _officially_ part of libbzip2 (yet); I haven't tested it, documented it, or considered the threading-safeness of it. If this code breaks, please contact both Yoshioka and me.
--*/

BZ_EXTERN const char* BZ_API(BZ2_bzlibVersion)(void);

#ifndef BZ_NO_STDIO
BZ_EXTERN BZFILE* BZ_API(BZ2_bzopen)(const char* path, const char* mode);

BZ_EXTERN BZFILE* BZ_API(BZ2_bzdopen)(int fd, const char* mode);

BZ_EXTERN int BZ_API(BZ2_bzread)(BZFILE* b, void* buf, int len);

BZ_EXTERN int BZ_API(BZ2_bzwrite)(BZFILE* b, void* buf, int len);

BZ_EXTERN int BZ_API(BZ2_bzflush)(BZFILE* b);

BZ_EXTERN void BZ_API(BZ2_bzclose)(BZFILE* b);

/* Writes the handle's last error code (clamped to <= 0) to *errnum and
   returns the matching message string. */
BZ_EXTERN const char* BZ_API(BZ2_bzerror)(BZFILE* b, int* errnum);
#endif

#ifdef __cplusplus
}
#endif

#endif

/*-------------------------------------------------------------*/
/*--- end bzlib.h ---*/
/*-------------------------------------------------------------*/

================================================
FILE: third-party/tbb/examples/graph/fgbzip2/bzlib_private.hpp
================================================
/* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */

/*-------------------------------------------------------------*/
/*--- Private header file for the library. ---*/
/*--- bzlib_private.h ---*/
/*-------------------------------------------------------------*/

/* ------------------------------------------------------------------ The original source for this example: This file is part of bzip2/libbzip2, a program and library for lossless, block-sorting data compression.
bzip2/libbzip2 version 1.0.6 of 6 September 2010 Copyright (C) 1996-2010 Julian Seward This program, "bzip2", the associated library "libbzip2", and all documentation, are copyright (C) 1996-2010 Julian R Seward. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 3. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 4. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
Julian Seward, jseward@bzip.org bzip2/libbzip2 version 1.0.6 of 6 September 2010 ------------------------------------------------------------------ */ #ifndef _BZLIB_PRIVATE_H #define _BZLIB_PRIVATE_H #include #ifndef BZ_NO_STDIO #include #include #include #endif #include "bzlib.hpp" /*-- General stuff. --*/ #define BZ_VERSION "1.0.6, 6-Sept-2010" typedef char Char; typedef unsigned char Bool; typedef unsigned char UChar; typedef int Int32; typedef unsigned int UInt32; typedef short Int16; typedef unsigned short UInt16; #define True ((Bool)1) #define False ((Bool)0) #ifndef __GNUC__ #define __inline__ /* */ #endif #ifndef BZ_NO_STDIO extern void BZ2_bz__AssertH__fail(int errcode); #define AssertH(cond, errcode) \ { \ if (!(cond)) \ BZ2_bz__AssertH__fail(errcode); \ } #if BZ_DEBUG #define AssertD(cond, msg) \ { \ if (!(cond)) { \ fprintf(stderr, "\n\nlibbzip2(debug build): internal error\n\t%s\n", msg); \ std::exit(-1); \ } \ } #else #define AssertD(cond, msg) /* */ #endif #define VPrintf0(zf) fprintf(stderr, zf) #define VPrintf1(zf, za1) fprintf(stderr, zf, za1) #define VPrintf2(zf, za1, za2) fprintf(stderr, zf, za1, za2) #define VPrintf3(zf, za1, za2, za3) fprintf(stderr, zf, za1, za2, za3) #define VPrintf4(zf, za1, za2, za3, za4) fprintf(stderr, zf, za1, za2, za3, za4) #define VPrintf5(zf, za1, za2, za3, za4, za5) fprintf(stderr, zf, za1, za2, za3, za4, za5) #else extern void bz_internal_error(int errcode); #define AssertH(cond, errcode) \ { \ if (!(cond)) \ bz_internal_error(errcode); \ } #define AssertD(cond, msg) \ do { \ } while (0) #define VPrintf0(zf) \ do { \ } while (0) #define VPrintf1(zf, za1) \ do { \ } while (0) #define VPrintf2(zf, za1, za2) \ do { \ } while (0) #define VPrintf3(zf, za1, za2, za3) \ do { \ } while (0) #define VPrintf4(zf, za1, za2, za3, za4) \ do { \ } while (0) #define VPrintf5(zf, za1, za2, za3, za4, za5) \ do { \ } while (0) #endif #define BZALLOC(nnn) (strm->bzalloc)(strm->opaque, (nnn), 1) #define BZFREE(ppp) 
(strm->bzfree)(strm->opaque, (ppp)) /*-- Header bytes. --*/ #define BZ_HDR_B 0x42 /* 'B' */ #define BZ_HDR_Z 0x5a /* 'Z' */ #define BZ_HDR_h 0x68 /* 'h' */ #define BZ_HDR_0 0x30 /* '0' */ /*-- Constants for the back end. --*/ #define BZ_MAX_ALPHA_SIZE 258 #define BZ_MAX_CODE_LEN 23 #define BZ_RUNA 0 #define BZ_RUNB 1 #define BZ_N_GROUPS 6 #define BZ_G_SIZE 50 #define BZ_N_ITERS 4 #define BZ_MAX_SELECTORS (2 + (900000 / BZ_G_SIZE)) /*-- Stuff for randomising repetitive blocks. --*/ extern Int32 BZ2_rNums[512]; #define BZ_RAND_DECLS \ Int32 rNToGo; \ Int32 rTPos #define BZ_RAND_INIT_MASK \ s->rNToGo = 0; \ s->rTPos = 0 #define BZ_RAND_MASK ((s->rNToGo == 1) ? 1 : 0) #define BZ_RAND_UPD_MASK \ if (s->rNToGo == 0) { \ s->rNToGo = BZ2_rNums[s->rTPos]; \ s->rTPos++; \ if (s->rTPos == 512) \ s->rTPos = 0; \ } \ s->rNToGo--; /*-- Stuff for doing CRCs. --*/ extern UInt32 BZ2_crc32Table[256]; #define BZ_INITIALISE_CRC(crcVar) \ { crcVar = 0xffffffffL; } #define BZ_FINALISE_CRC(crcVar) \ { crcVar = ~(crcVar); } #define BZ_UPDATE_CRC(crcVar, cha) \ { crcVar = (crcVar << 8) ^ BZ2_crc32Table[(crcVar >> 24) ^ ((UChar)cha)]; } /*-- States and modes for compression. --*/ #define BZ_M_IDLE 1 #define BZ_M_RUNNING 2 #define BZ_M_FLUSHING 3 #define BZ_M_FINISHING 4 #define BZ_S_OUTPUT 1 #define BZ_S_INPUT 2 #define BZ_N_RADIX 2 #define BZ_N_QSORT 12 #define BZ_N_SHELL 18 #define BZ_N_OVERSHOOT (BZ_N_RADIX + BZ_N_QSORT + BZ_N_SHELL + 2) /*-- Structure holding all the compression-side stuff. 
--*/

/* Per-stream compressor state. Handed to every compression-side routine
   declared below. */
typedef struct {
    /* pointer back to the struct bz_stream */
    bz_stream* strm;

    /* mode this stream is in, and whether inputting */
    /* or outputting data */
    Int32 mode;
    Int32 state;

    /* remembers avail_in when flush/finish requested */
    UInt32 avail_in_expect;

    /* for doing the block sorting */
    UInt32* arr1;
    UInt32* arr2;
    UInt32* ftab;
    Int32 origPtr;

    /* aliases for arr1 and arr2 */
    UInt32* ptr;
    UChar* block;
    UInt16* mtfv;
    /* output byte area; BZ2_compressBlock() points it just past the block
       data, at ((UChar*)arr2)[nblock] */
    UChar* zbits;

    /* for deciding when to use the fallback sorting algorithm */
    Int32 workFactor;

    /* run-length-encoding of the input */
    UInt32 state_in_ch;
    Int32 state_in_len;
    BZ_RAND_DECLS;

    /* input and output limits and current posns */
    Int32 nblock;
    Int32 nblockMAX;
    /* index of the next byte to be written into zbits */
    Int32 numZ;
    Int32 state_out_pos;

    /* map of bytes used in block */
    /* nInUse counts the set entries of inUse[]; unseqToSeq[] maps each used
       byte value to its dense index (filled in by makeMaps_e) */
    Int32 nInUse;
    Bool inUse[256];
    UChar unseqToSeq[256];

    /* the buffer for bit stream creation */
    /* pending bits sit at the top of bsBuff; bsLive is how many are pending */
    UInt32 bsBuff;
    Int32 bsLive;

    /* block and combined CRCs */
    UInt32 blockCRC;
    UInt32 combinedCRC;

    /* misc administratium */
    Int32 verbosity;
    Int32 blockNo;
    Int32 blockSize100k;

    /* stuff for coding the MTF values */
    /* nMTF is the number of entries generateMTFValues() wrote to mtfv[],
       including the end-of-block symbol */
    Int32 nMTF;
    Int32 mtfFreq[BZ_MAX_ALPHA_SIZE];
    UChar selector[BZ_MAX_SELECTORS];
    UChar selectorMtf[BZ_MAX_SELECTORS];

    UChar len[BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
    Int32 code[BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
    Int32 rfreq[BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE];
    /* second dimension: only 3 needed; 4 makes index calculations faster */
    UInt32 len_pack[BZ_MAX_ALPHA_SIZE][4];
} EState;

/*-- externs for compression. --*/

extern void BZ2_blockSort(EState*);
extern void BZ2_compressBlock(EState*, Bool);
extern void BZ2_bsInitWrite(EState*);
extern void BZ2_hbAssignCodes(Int32*, UChar*, Int32, Int32, Int32);
extern void BZ2_hbMakeCodeLengths(UChar*, Int32*, Int32, Int32);

/*-- states for decompression.
--*/

/* Values for DState::state.
   NOTE(review): the names suggest one value per resumable read point of the
   decoder (magic, block header, CRC, mapping, selectors, coding tables, MTF
   data, end header, combined CRC); the consumer is not in this header --
   confirm against decompress.cpp. */
#define BZ_X_IDLE 1
#define BZ_X_OUTPUT 2

#define BZ_X_MAGIC_1 10
#define BZ_X_MAGIC_2 11
#define BZ_X_MAGIC_3 12
#define BZ_X_MAGIC_4 13
#define BZ_X_BLKHDR_1 14
#define BZ_X_BLKHDR_2 15
#define BZ_X_BLKHDR_3 16
#define BZ_X_BLKHDR_4 17
#define BZ_X_BLKHDR_5 18
#define BZ_X_BLKHDR_6 19
#define BZ_X_BCRC_1 20
#define BZ_X_BCRC_2 21
#define BZ_X_BCRC_3 22
#define BZ_X_BCRC_4 23
#define BZ_X_RANDBIT 24
#define BZ_X_ORIGPTR_1 25
#define BZ_X_ORIGPTR_2 26
#define BZ_X_ORIGPTR_3 27
#define BZ_X_MAPPING_1 28
#define BZ_X_MAPPING_2 29
#define BZ_X_SELECTOR_1 30
#define BZ_X_SELECTOR_2 31
#define BZ_X_SELECTOR_3 32
#define BZ_X_CODING_1 33
#define BZ_X_CODING_2 34
#define BZ_X_CODING_3 35
#define BZ_X_MTF_1 36
#define BZ_X_MTF_2 37
#define BZ_X_MTF_3 38
#define BZ_X_MTF_4 39
#define BZ_X_MTF_5 40
#define BZ_X_MTF_6 41
#define BZ_X_ENDHDR_2 42
#define BZ_X_ENDHDR_3 43
#define BZ_X_ENDHDR_4 44
#define BZ_X_ENDHDR_5 45
#define BZ_X_ENDHDR_6 46
#define BZ_X_CCRC_1 47
#define BZ_X_CCRC_2 48
#define BZ_X_CCRC_3 49
#define BZ_X_CCRC_4 50

/*-- Constants for the fast MTF decoder. --*/

/* MTFA_SIZE sizes DState::mtfa; MTFL_SIZE divides 256 to size
   DState::mtfbase. */
#define MTFA_SIZE 4096
#define MTFL_SIZE 16

/*-- Structure holding all the decompression-side stuff.
--*/ typedef struct { /* pointer back to the struct bz_stream */ bz_stream* strm; /* state indicator for this stream */ Int32 state; /* for doing the final run-length decoding */ UChar state_out_ch; Int32 state_out_len; Bool blockRandomised; BZ_RAND_DECLS; /* the buffer for bit stream reading */ UInt32 bsBuff; Int32 bsLive; /* misc administratium */ Int32 blockSize100k; Bool smallDecompress; Int32 currBlockNo; Int32 verbosity; /* for undoing the Burrows-Wheeler transform */ Int32 origPtr; UInt32 tPos; Int32 k0; Int32 unzftab[256]; Int32 nblock_used; Int32 cftab[257]; Int32 cftabCopy[257]; /* for undoing the Burrows-Wheeler transform (FAST) */ UInt32* tt; /* for undoing the Burrows-Wheeler transform (SMALL) */ UInt16* ll16; UChar* ll4; /* stored and calculated CRCs */ UInt32 storedBlockCRC; UInt32 storedCombinedCRC; UInt32 calculatedBlockCRC; UInt32 calculatedCombinedCRC; /* map of bytes used in block */ Int32 nInUse; Bool inUse[256]; Bool inUse16[16]; UChar seqToUnseq[256]; /* for decoding the MTF values */ UChar mtfa[MTFA_SIZE]; Int32 mtfbase[256 / MTFL_SIZE]; UChar selector[BZ_MAX_SELECTORS]; UChar selectorMtf[BZ_MAX_SELECTORS]; UChar len[BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE]; Int32 limit[BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE]; Int32 base[BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE]; Int32 perm[BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE]; Int32 minLens[BZ_N_GROUPS]; /* save area for scalars in the main decompress code */ Int32 save_i; Int32 save_j; Int32 save_t; Int32 save_alphaSize; Int32 save_nGroups; Int32 save_nSelectors; Int32 save_EOB; Int32 save_groupNo; Int32 save_groupPos; Int32 save_nextSym; Int32 save_nblockMAX; Int32 save_nblock; Int32 save_es; Int32 save_N; Int32 save_curr; Int32 save_zt; Int32 save_zn; Int32 save_zvec; Int32 save_zj; Int32 save_gSel; Int32 save_gMinlen; Int32* save_gLimit; Int32* save_gBase; Int32* save_gPerm; } DState; /*-- Macros for decompression. --*/ #define BZ_GET_FAST(cccc) \ /* c_tPos is unsigned, hence test < 0 is pointless. 
*/ \ if (s->tPos >= (UInt32)100000 * (UInt32)s->blockSize100k) \ return True; \ s->tPos = s->tt[s->tPos]; \ cccc = (UChar)(s->tPos & 0xff); \ s->tPos >>= 8; #define BZ_GET_FAST_C(cccc) \ /* c_tPos is unsigned, hence test < 0 is pointless. */ \ if (c_tPos >= (UInt32)100000 * (UInt32)ro_blockSize100k) \ return True; \ c_tPos = c_tt[c_tPos]; \ cccc = (UChar)(c_tPos & 0xff); \ c_tPos >>= 8; #define SET_LL4(i, n) \ { \ if (((i)&0x1) == 0) \ s->ll4[(i) >> 1] = (s->ll4[(i) >> 1] & 0xf0) | (n); \ else \ s->ll4[(i) >> 1] = (s->ll4[(i) >> 1] & 0x0f) | ((n) << 4); \ } #define GET_LL4(i) ((((UInt32)(s->ll4[(i) >> 1])) >> (((i) << 2) & 0x4)) & 0xF) #define SET_LL(i, n) \ { \ s->ll16[i] = (UInt16)(n & 0x0000ffff); \ SET_LL4(i, n >> 16); \ } #define GET_LL(i) (((UInt32)s->ll16[i]) | (GET_LL4(i) << 16)) #define BZ_GET_SMALL(cccc) \ /* c_tPos is unsigned, hence test < 0 is pointless. */ \ if (s->tPos >= (UInt32)100000 * (UInt32)s->blockSize100k) \ return True; \ cccc = BZ2_indexIntoF(s->tPos, s->cftab); \ s->tPos = GET_LL(s->tPos); /*-- externs for decompression. --*/ extern Int32 BZ2_indexIntoF(Int32, Int32*); extern Int32 BZ2_decompress(DState*); extern void BZ2_hbCreateDecodeTables(Int32*, Int32*, Int32*, UChar*, Int32, Int32, Int32); #endif /*-- BZ_NO_STDIO seems to make nullptr disappear on some platforms. --*/ #ifdef BZ_NO_STDIO #ifndef nullptr #define nullptr 0 #endif #endif /*-------------------------------------------------------------*/ /*--- end bzlib_private.h ---*/ /*-------------------------------------------------------------*/ ================================================ FILE: third-party/tbb/examples/graph/fgbzip2/compress.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /*-------------------------------------------------------------*/ /*--- Compression machinery (not incl block sorting) ---*/ /*--- compress.cpp ---*/ /*-------------------------------------------------------------*/ /* ------------------------------------------------------------------ The original source for this example: This file is part of bzip2/libbzip2, a program and library for lossless, block-sorting data compression. bzip2/libbzip2 version 1.0.6 of 6 September 2010 Copyright (C) 1996-2010 Julian Seward This program, "bzip2", the associated library "libbzip2", and all documentation, are copyright (C) 1996-2010 Julian R Seward. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 3. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 4. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Julian Seward, jseward@bzip.org bzip2/libbzip2 version 1.0.6 of 6 September 2010 ------------------------------------------------------------------ */ /* CHANGES 0.9.0 -- original version. 0.9.0a/b -- no changes in this file. 0.9.0c -- changed setting of nGroups in sendMTFValues() so as to do a bit better on small files */ #include "bzlib_private.hpp" /*---------------------------------------------------*/ /*--- Bit stream I/O ---*/ /*---------------------------------------------------*/ /*---------------------------------------------------*/ void BZ2_bsInitWrite(EState* s) { s->bsLive = 0; s->bsBuff = 0; } /*---------------------------------------------------*/ static void bsFinishWrite(EState* s) { while (s->bsLive > 0) { s->zbits[s->numZ] = (UChar)(s->bsBuff >> 24); s->numZ++; s->bsBuff <<= 8; s->bsLive -= 8; } } /*---------------------------------------------------*/ #define bsNEEDW(nz) \ { \ while (s->bsLive >= 8) { \ s->zbits[s->numZ] = (UChar)(s->bsBuff >> 24); \ s->numZ++; \ s->bsBuff <<= 8; \ s->bsLive -= 8; \ } \ } /*---------------------------------------------------*/ static __inline__ void bsW(EState* s, Int32 n, UInt32 v) { bsNEEDW(n); s->bsBuff |= (v << (32 - s->bsLive - n)); s->bsLive += n; } /*---------------------------------------------------*/ static void 
bsPutUInt32(EState* s, UInt32 u) { bsW(s, 8, (u >> 24) & 0xffL); bsW(s, 8, (u >> 16) & 0xffL); bsW(s, 8, (u >> 8) & 0xffL); bsW(s, 8, u & 0xffL); } /*---------------------------------------------------*/ static void bsPutUChar(EState* s, UChar c) { bsW(s, 8, (UInt32)c); } /*---------------------------------------------------*/ /*--- The back end proper ---*/ /*---------------------------------------------------*/ /*---------------------------------------------------*/ static void makeMaps_e(EState* s) { Int32 i; s->nInUse = 0; for (i = 0; i < 256; i++) if (s->inUse[i]) { s->unseqToSeq[i] = s->nInUse; s->nInUse++; } } /*---------------------------------------------------*/ static void generateMTFValues(EState* s) { UChar yy[256]; Int32 i, j; Int32 zPend; Int32 wr; Int32 EOB; /* After sorting (eg, here), s->arr1 [ 0 .. s->nblock-1 ] holds sorted order, and ((UChar*)s->arr2) [ 0 .. s->nblock-1 ] holds the original block data. The first thing to do is generate the MTF values, and put them in ((UInt16*)s->arr1) [ 0 .. s->nblock-1 ]. Because there are strictly fewer or equal MTF values than block values, ptr values in this area are overwritten with MTF values only when they are no longer needed. The final compressed bitstream is generated into the area starting at (UChar*) (&((UChar*)s->arr2)[s->nblock]) These storage aliases are set up in bzCompressInit(), except for the last one, which is arranged in compressBlock(). 
*/ UInt32* ptr = s->ptr; UChar* block = s->block; UInt16* mtfv = s->mtfv; makeMaps_e(s); EOB = s->nInUse + 1; for (i = 0; i <= EOB; i++) s->mtfFreq[i] = 0; wr = 0; zPend = 0; for (i = 0; i < s->nInUse; i++) yy[i] = (UChar)i; for (i = 0; i < s->nblock; i++) { UChar ll_i; AssertD(wr <= i, "generateMTFValues(1)"); j = ptr[i] - 1; if (j < 0) j += s->nblock; ll_i = s->unseqToSeq[block[j]]; AssertD(ll_i < s->nInUse, "generateMTFValues(2a)"); if (yy[0] == ll_i) { zPend++; } else { if (zPend > 0) { zPend--; while (True) { if (zPend & 1) { mtfv[wr] = BZ_RUNB; wr++; s->mtfFreq[BZ_RUNB]++; } else { mtfv[wr] = BZ_RUNA; wr++; s->mtfFreq[BZ_RUNA]++; } if (zPend < 2) break; zPend = (zPend - 2) / 2; }; zPend = 0; } { UChar rtmp; UChar* ryy_j; UChar rll_i; rtmp = yy[1]; yy[1] = yy[0]; ryy_j = &(yy[1]); rll_i = ll_i; while (rll_i != rtmp) { UChar rtmp2; ryy_j++; rtmp2 = rtmp; rtmp = *ryy_j; *ryy_j = rtmp2; }; yy[0] = rtmp; j = ryy_j - &(yy[0]); mtfv[wr] = j + 1; wr++; s->mtfFreq[j + 1]++; } } } if (zPend > 0) { zPend--; while (True) { if (zPend & 1) { mtfv[wr] = BZ_RUNB; wr++; s->mtfFreq[BZ_RUNB]++; } else { mtfv[wr] = BZ_RUNA; wr++; s->mtfFreq[BZ_RUNA]++; } if (zPend < 2) break; zPend = (zPend - 2) / 2; }; zPend = 0; } mtfv[wr] = EOB; wr++; s->mtfFreq[EOB]++; s->nMTF = wr; } /*---------------------------------------------------*/ #define BZ_LESSER_ICOST 0 #define BZ_GREATER_ICOST 15 static void sendMTFValues(EState* s) { Int32 t, i, j, k, gs, ge, totc, bt, bc; Int32 nSelectors, alphaSize, minLen, maxLen, selCtr; Int32 nGroups, nBytes; /*-- UChar len [BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE]; is a global since the decoder also needs it. Int32 code[BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE]; Int32 rfreq[BZ_N_GROUPS][BZ_MAX_ALPHA_SIZE]; are also globals only used in this proc. Made global to keep stack frame size small. 
--*/ UInt16 cost[BZ_N_GROUPS]; Int32 fave[BZ_N_GROUPS]; UInt16* mtfv = s->mtfv; if (s->verbosity >= 3) VPrintf3(" %d in block, %d after MTF & 1-2 coding, " "%d+2 syms in use\n", s->nblock, s->nMTF, s->nInUse); alphaSize = s->nInUse + 2; for (i = 0; i < BZ_N_GROUPS; i++) for (j = 0; j < alphaSize; j++) s->len[i][j] = BZ_GREATER_ICOST; /*--- Decide how many coding tables to use ---*/ AssertH(s->nMTF > 0, 3001); if (s->nMTF < 200) nGroups = 2; else if (s->nMTF < 600) nGroups = 3; else if (s->nMTF < 1200) nGroups = 4; else if (s->nMTF < 2400) nGroups = 5; else nGroups = 6; /*--- Generate an initial set of coding tables ---*/ { Int32 nPart, remF, tFreq, aFreq; nPart = nGroups; remF = s->nMTF; gs = 0; while (nPart > 0) { tFreq = remF / nPart; ge = gs - 1; aFreq = 0; while (aFreq < tFreq && ge < alphaSize - 1) { ge++; aFreq += s->mtfFreq[ge]; } if (ge > gs && nPart != nGroups && nPart != 1 && ((nGroups - nPart) % 2 == 1)) { aFreq -= s->mtfFreq[ge]; ge--; } if (s->verbosity >= 3) VPrintf5(" initial group %d, [%d .. %d], " "has %d syms (%4.1f%%)\n", nPart, gs, ge, aFreq, (100.0 * (float)aFreq) / (float)(s->nMTF)); for (i = 0; i < alphaSize; i++) if (i >= gs && i <= ge) s->len[nPart - 1][i] = BZ_LESSER_ICOST; else s->len[nPart - 1][i] = BZ_GREATER_ICOST; nPart--; gs = ge + 1; remF -= aFreq; } } /*--- Iterate up to BZ_N_ITERS times to improve the tables. ---*/ for (k = 0; k < BZ_N_ITERS; k++) { for (i = 0; i < nGroups; i++) fave[i] = 0; for (i = 0; i < nGroups; i++) for (j = 0; j < alphaSize; j++) s->rfreq[i][j] = 0; /*--- Set up an auxiliary length table which is used to fast-track the common case (nGroups == 6). ---*/ if (nGroups == 6) { for (i = 0; i < alphaSize; i++) { s->len_pack[i][0] = (s->len[1][i] << 16) | s->len[0][i]; s->len_pack[i][1] = (s->len[3][i] << 16) | s->len[2][i]; s->len_pack[i][2] = (s->len[5][i] << 16) | s->len[4][i]; } } nSelectors = 0; totc = 0; gs = 0; while (True) { /*--- Set group start & end marks. 
--*/ if (gs >= s->nMTF) break; ge = gs + BZ_G_SIZE - 1; if (ge >= s->nMTF) ge = s->nMTF - 1; /*-- Calculate the cost of this group as coded by each of the coding tables. --*/ for (i = 0; i < nGroups; i++) cost[i] = 0; if (nGroups == 6 && 50 == ge - gs + 1) { /*--- fast track the common case ---*/ UInt32 cost01, cost23, cost45; UInt16 icv; cost01 = cost23 = cost45 = 0; #define BZ_ITER(nn) \ icv = mtfv[gs + (nn)]; \ cost01 += s->len_pack[icv][0]; \ cost23 += s->len_pack[icv][1]; \ cost45 += s->len_pack[icv][2]; BZ_ITER(0); BZ_ITER(1); BZ_ITER(2); BZ_ITER(3); BZ_ITER(4); BZ_ITER(5); BZ_ITER(6); BZ_ITER(7); BZ_ITER(8); BZ_ITER(9); BZ_ITER(10); BZ_ITER(11); BZ_ITER(12); BZ_ITER(13); BZ_ITER(14); BZ_ITER(15); BZ_ITER(16); BZ_ITER(17); BZ_ITER(18); BZ_ITER(19); BZ_ITER(20); BZ_ITER(21); BZ_ITER(22); BZ_ITER(23); BZ_ITER(24); BZ_ITER(25); BZ_ITER(26); BZ_ITER(27); BZ_ITER(28); BZ_ITER(29); BZ_ITER(30); BZ_ITER(31); BZ_ITER(32); BZ_ITER(33); BZ_ITER(34); BZ_ITER(35); BZ_ITER(36); BZ_ITER(37); BZ_ITER(38); BZ_ITER(39); BZ_ITER(40); BZ_ITER(41); BZ_ITER(42); BZ_ITER(43); BZ_ITER(44); BZ_ITER(45); BZ_ITER(46); BZ_ITER(47); BZ_ITER(48); BZ_ITER(49); #undef BZ_ITER cost[0] = cost01 & 0xffff; cost[1] = cost01 >> 16; cost[2] = cost23 & 0xffff; cost[3] = cost23 >> 16; cost[4] = cost45 & 0xffff; cost[5] = cost45 >> 16; } else { /*--- slow version which correctly handles all situations ---*/ for (i = gs; i <= ge; i++) { UInt16 icv = mtfv[i]; for (j = 0; j < nGroups; j++) cost[j] += s->len[j][icv]; } } /*-- Find the coding table which is best for this group, and record its identity in the selector table. --*/ bc = 999999999; bt = -1; for (i = 0; i < nGroups; i++) if (cost[i] < bc) { bc = cost[i]; bt = i; }; totc += bc; fave[bt]++; s->selector[nSelectors] = bt; nSelectors++; /*-- Increment the symbol frequencies for the selected table. 
--*/ if (nGroups == 6 && 50 == ge - gs + 1) { /*--- fast track the common case ---*/ #define BZ_ITUR(nn) s->rfreq[bt][mtfv[gs + (nn)]]++ BZ_ITUR(0); BZ_ITUR(1); BZ_ITUR(2); BZ_ITUR(3); BZ_ITUR(4); BZ_ITUR(5); BZ_ITUR(6); BZ_ITUR(7); BZ_ITUR(8); BZ_ITUR(9); BZ_ITUR(10); BZ_ITUR(11); BZ_ITUR(12); BZ_ITUR(13); BZ_ITUR(14); BZ_ITUR(15); BZ_ITUR(16); BZ_ITUR(17); BZ_ITUR(18); BZ_ITUR(19); BZ_ITUR(20); BZ_ITUR(21); BZ_ITUR(22); BZ_ITUR(23); BZ_ITUR(24); BZ_ITUR(25); BZ_ITUR(26); BZ_ITUR(27); BZ_ITUR(28); BZ_ITUR(29); BZ_ITUR(30); BZ_ITUR(31); BZ_ITUR(32); BZ_ITUR(33); BZ_ITUR(34); BZ_ITUR(35); BZ_ITUR(36); BZ_ITUR(37); BZ_ITUR(38); BZ_ITUR(39); BZ_ITUR(40); BZ_ITUR(41); BZ_ITUR(42); BZ_ITUR(43); BZ_ITUR(44); BZ_ITUR(45); BZ_ITUR(46); BZ_ITUR(47); BZ_ITUR(48); BZ_ITUR(49); #undef BZ_ITUR } else { /*--- slow version which correctly handles all situations ---*/ for (i = gs; i <= ge; i++) s->rfreq[bt][mtfv[i]]++; } gs = ge + 1; } if (s->verbosity >= 3) { VPrintf2(" pass %d: size is %d, grp uses are ", k + 1, totc / 8); for (i = 0; i < nGroups; i++) VPrintf1("%d ", fave[i]); VPrintf0("\n"); } /*-- Recompute the tables based on the accumulated frequencies. --*/ /* maxLen was changed from 20 to 17 in bzip2-1.0.3. See comment in huffman.c for details. */ for (i = 0; i < nGroups; i++) BZ2_hbMakeCodeLengths(&(s->len[i][0]), &(s->rfreq[i][0]), alphaSize, 17 /*20*/); } AssertH(nGroups < 8, 3002); AssertH(nSelectors < 32768 && nSelectors <= (2 + (900000 / BZ_G_SIZE)), 3003); /*--- Compute MTF values for the selectors. ---*/ { UChar pos[BZ_N_GROUPS], ll_i, tmp2, tmp; for (i = 0; i < nGroups; i++) pos[i] = i; for (i = 0; i < nSelectors; i++) { ll_i = s->selector[i]; j = 0; tmp = pos[j]; while (ll_i != tmp) { j++; tmp2 = tmp; tmp = pos[j]; pos[j] = tmp2; }; pos[0] = tmp; s->selectorMtf[i] = j; } }; /*--- Assign actual codes for the tables. 
--*/ for (j = 0; j < nGroups; j++) { minLen = 32; maxLen = 0; for (i = 0; i < alphaSize; i++) { if (s->len[j][i] > maxLen) maxLen = s->len[j][i]; if (s->len[j][i] < minLen) minLen = s->len[j][i]; } AssertH(!(maxLen > 17 /*20*/), 3004); AssertH(!(minLen < 1), 3005); BZ2_hbAssignCodes(&(s->code[j][0]), &(s->len[j][0]), minLen, maxLen, alphaSize); } /*--- Transmit the mapping table. ---*/ { Bool inUse16[16]; for (i = 0; i < 16; i++) { inUse16[i] = False; for (j = 0; j < 16; j++) if (s->inUse[i * 16 + j]) inUse16[i] = True; } nBytes = s->numZ; for (i = 0; i < 16; i++) if (inUse16[i]) bsW(s, 1, 1); else bsW(s, 1, 0); for (i = 0; i < 16; i++) if (inUse16[i]) for (j = 0; j < 16; j++) { if (s->inUse[i * 16 + j]) bsW(s, 1, 1); else bsW(s, 1, 0); } if (s->verbosity >= 3) VPrintf1(" bytes: mapping %d, ", s->numZ - nBytes); } /*--- Now the selectors. ---*/ nBytes = s->numZ; bsW(s, 3, nGroups); bsW(s, 15, nSelectors); for (i = 0; i < nSelectors; i++) { for (j = 0; j < s->selectorMtf[i]; j++) bsW(s, 1, 1); bsW(s, 1, 0); } if (s->verbosity >= 3) VPrintf1("selectors %d, ", s->numZ - nBytes); /*--- Now the coding tables. 
---*/
    /* Remember the output byte count so the verbose trace below can report
       how many bytes the code-length section occupied. */
    nBytes = s->numZ;

    /* Each coding table is sent delta-coded: a 5-bit starting length, then
       for every symbol the bit pairs "10" (increment) / "11" (decrement)
       until the running length matches, followed by a single "0" bit. */
    for (t = 0; t < nGroups; t++) {
        Int32 curr = s->len[t][0];
        bsW(s, 5, curr);
        for (i = 0; i < alphaSize; i++) {
            while (curr < s->len[t][i]) {
                bsW(s, 2, 2);
                curr++; /* 10 */
            };
            while (curr > s->len[t][i]) {
                bsW(s, 2, 3);
                curr--; /* 11 */
            };
            bsW(s, 1, 0);
        }
    }

    if (s->verbosity >= 3)
        VPrintf1("code lengths %d, ", s->numZ - nBytes);

    /*--- And finally, the block data proper ---*/
    nBytes = s->numZ;
    selCtr = 0;
    gs = 0;
    /* Walk the MTF values in groups [gs, ge] of at most BZ_G_SIZE symbols;
       group number selCtr is coded with table s->selector[selCtr]. */
    while (True) {
        if (gs >= s->nMTF)
            break;
        ge = gs + BZ_G_SIZE - 1;
        if (ge >= s->nMTF)
            ge = s->nMTF - 1;
        AssertH(s->selector[selCtr] < nGroups, 3006);

        if (nGroups == 6 && 50 == ge - gs + 1) {
            /*--- fast track the common case ---*/
            /* Full 50-symbol group with six tables: hoist the table lookups
               out of the loop and unroll all 50 writes. */
            UInt16 mtfv_i;
            UChar* s_len_sel_selCtr = &(s->len[s->selector[selCtr]][0]);
            Int32* s_code_sel_selCtr = &(s->code[s->selector[selCtr]][0]);

#define BZ_ITAH(nn) \
    mtfv_i = mtfv[gs + (nn)]; \
    bsW(s, s_len_sel_selCtr[mtfv_i], s_code_sel_selCtr[mtfv_i])

            BZ_ITAH(0); BZ_ITAH(1); BZ_ITAH(2); BZ_ITAH(3); BZ_ITAH(4);
            BZ_ITAH(5); BZ_ITAH(6); BZ_ITAH(7); BZ_ITAH(8); BZ_ITAH(9);
            BZ_ITAH(10); BZ_ITAH(11); BZ_ITAH(12); BZ_ITAH(13); BZ_ITAH(14);
            BZ_ITAH(15); BZ_ITAH(16); BZ_ITAH(17); BZ_ITAH(18); BZ_ITAH(19);
            BZ_ITAH(20); BZ_ITAH(21); BZ_ITAH(22); BZ_ITAH(23); BZ_ITAH(24);
            BZ_ITAH(25); BZ_ITAH(26); BZ_ITAH(27); BZ_ITAH(28); BZ_ITAH(29);
            BZ_ITAH(30); BZ_ITAH(31); BZ_ITAH(32); BZ_ITAH(33); BZ_ITAH(34);
            BZ_ITAH(35); BZ_ITAH(36); BZ_ITAH(37); BZ_ITAH(38); BZ_ITAH(39);
            BZ_ITAH(40); BZ_ITAH(41); BZ_ITAH(42); BZ_ITAH(43); BZ_ITAH(44);
            BZ_ITAH(45); BZ_ITAH(46); BZ_ITAH(47); BZ_ITAH(48); BZ_ITAH(49);

#undef BZ_ITAH
        }
        else {
            /*--- slow version which correctly handles all situations ---*/
            for (i = gs; i <= ge; i++) {
                bsW(s,
                    s->len[s->selector[selCtr]][mtfv[i]],
                    s->code[s->selector[selCtr]][mtfv[i]]);
            }
        }

        gs = ge + 1;
        selCtr++;
    }
    /* Every selector must have been consumed by exactly one group. */
    AssertH(selCtr == nSelectors, 3007);

    if (s->verbosity >= 3)
        VPrintf1("codes %d\n", s->numZ - nBytes);
}

/*---------------------------------------------------*/
/*
   Emit the current block of `s` into the compressed bit stream.

   In order, the function:
     - for a non-empty block: finalises the block CRC, folds it into the
       combined CRC (rotate left by one bit, then XOR) and sorts the block;
     - on the first block (blockNo == 1): initialises the bit writer and
       writes the four-byte stream header, whose last byte encodes
       blockSize100k;
     - for a non-empty block: writes the six-byte block magic
       31 41 59 26 53 59, the block CRC, a zero "randomised" bit, the
       24-bit origPtr and finally the MTF/Huffman coded data;
     - when is_last_block is set: writes the six-byte trailer magic
       17 72 45 38 50 90, the combined CRC, and flushes the bit writer.

   s              compressor state; read and updated in place.
   is_last_block  non-zero when no further blocks follow.
*/
void BZ2_compressBlock(EState* s, Bool is_last_block)
{
    if (s->nblock > 0) {
        BZ_FINALISE_CRC(s->blockCRC);
        s->combinedCRC = (s->combinedCRC << 1) | (s->combinedCRC >> 31);
        s->combinedCRC ^= s->blockCRC;
        /* Restart the output byte counter for every block after the first.
           NOTE(review): for block 1 the counter is presumably reset by
           BZ2_bsInitWrite below -- confirm. */
        if (s->blockNo > 1)
            s->numZ = 0;

        if (s->verbosity >= 2)
            VPrintf4(" block %d: crc = 0x%08x, "
                     "combined CRC = 0x%08x, size = %d\n",
                     s->blockNo,
                     s->blockCRC,
                     s->combinedCRC,
                     s->nblock);

        BZ2_blockSort(s);
    }

    /* zbits points nblock bytes into arr2.
       NOTE(review): presumably this is where the bit writer stores its
       output -- confirm against bsW. */
    s->zbits = (UChar*)(&((UChar*)s->arr2)[s->nblock]);

    /*-- If this is the first block, create the stream header. --*/
    if (s->blockNo == 1) {
        BZ2_bsInitWrite(s);
        bsPutUChar(s, BZ_HDR_B);
        bsPutUChar(s, BZ_HDR_Z);
        bsPutUChar(s, BZ_HDR_h);
        bsPutUChar(s, (UChar)(BZ_HDR_0 + s->blockSize100k));
    }

    if (s->nblock > 0) {
        /* Block header magic. */
        bsPutUChar(s, 0x31);
        bsPutUChar(s, 0x41);
        bsPutUChar(s, 0x59);
        bsPutUChar(s, 0x26);
        bsPutUChar(s, 0x53);
        bsPutUChar(s, 0x59);

        /*-- Now the block's CRC, so it is in a known place. --*/
        bsPutUInt32(s, s->blockCRC);

        /*--
           Now a single bit indicating (non-)randomisation.
           As of version 0.9.5, we use a better sorting algorithm
           which makes randomisation unnecessary. So always set
           the randomised bit to 'no'. Of course, the decoder
           still needs to be able to handle randomised blocks
           so as to maintain backwards compatibility with
           older versions of bzip2.
        --*/
        bsW(s, 1, 0);

        bsW(s, 24, s->origPtr);
        generateMTFValues(s);
        sendMTFValues(s);
    }

    /*-- If this is the last block, add the stream trailer.
--*/
    if (is_last_block) {
        /* Stream trailer magic, followed by the CRC over all blocks. */
        bsPutUChar(s, 0x17);
        bsPutUChar(s, 0x72);
        bsPutUChar(s, 0x45);
        bsPutUChar(s, 0x38);
        bsPutUChar(s, 0x50);
        bsPutUChar(s, 0x90);
        bsPutUInt32(s, s->combinedCRC);
        if (s->verbosity >= 2)
            VPrintf1(" final combined CRC = 0x%08x\n ", s->combinedCRC);
        bsFinishWrite(s);
    }
}

/*-------------------------------------------------------------*/
/*--- end                                        compress.c ---*/
/*-------------------------------------------------------------*/

================================================
FILE: third-party/tbb/examples/graph/fgbzip2/crctable.cpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

/*-------------------------------------------------------------*/
/*--- Table for doing CRCs                                  ---*/
/*---                                          crctable.cpp ---*/
/*-------------------------------------------------------------*/

/* ------------------------------------------------------------------
   The original source for this example:
   This file is part of bzip2/libbzip2, a program and library for
   lossless, block-sorting data compression.

   bzip2/libbzip2 version 1.0.6 of 6 September 2010
   Copyright (C) 1996-2010 Julian Seward

   This program, "bzip2", the associated library "libbzip2", and all
   documentation, are copyright (C) 1996-2010 Julian R Seward. All
   rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   1.
Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

   2. The origin of this software must not be misrepresented; you must
      not claim that you wrote the original software. If you use this
      software in a product, an acknowledgment in the product
      documentation would be appreciated but is not required.

   3. Altered source versions must be plainly marked as such, and must
      not be misrepresented as being the original software.

   4. The name of the author may not be used to endorse or promote
      products derived from this software without specific prior written
      permission.

   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
   OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   Julian Seward, jseward@bzip.org
   bzip2/libbzip2 version 1.0.6 of 6 September 2010
   ------------------------------------------------------------------ */

#include "bzlib_private.hpp"

/*--
  I think this is an implementation of the AUTODIN-II,
  Ethernet & FDDI 32-bit CRC standard. Vaguely derived
  from code by Rob Warnock, in Section 51 of the
  comp.compression FAQ.

  The table holds 256 precomputed 32-bit values, one per possible byte.
  NOTE(review): entry [1] is 0x04c11db7, which looks like the CRC-32
  generator polynomial in MSB-first form; the macros that index this table
  are not visible here -- confirm against bzlib_private.hpp.
  The values must not be edited by hand: any change silently alters every
  checksum computed from the table.
--*/

UInt32 BZ2_crc32Table[256] = {

    /*-- Ugly, innit?
--*/

    0x00000000L, 0x04c11db7L, 0x09823b6eL, 0x0d4326d9L,
    0x130476dcL, 0x17c56b6bL, 0x1a864db2L, 0x1e475005L,
    0x2608edb8L, 0x22c9f00fL, 0x2f8ad6d6L, 0x2b4bcb61L,
    0x350c9b64L, 0x31cd86d3L, 0x3c8ea00aL, 0x384fbdbdL,
    0x4c11db70L, 0x48d0c6c7L, 0x4593e01eL, 0x4152fda9L,
    0x5f15adacL, 0x5bd4b01bL, 0x569796c2L, 0x52568b75L,
    0x6a1936c8L, 0x6ed82b7fL, 0x639b0da6L, 0x675a1011L,
    0x791d4014L, 0x7ddc5da3L, 0x709f7b7aL, 0x745e66cdL,
    0x9823b6e0L, 0x9ce2ab57L, 0x91a18d8eL, 0x95609039L,
    0x8b27c03cL, 0x8fe6dd8bL, 0x82a5fb52L, 0x8664e6e5L,
    0xbe2b5b58L, 0xbaea46efL, 0xb7a96036L, 0xb3687d81L,
    0xad2f2d84L, 0xa9ee3033L, 0xa4ad16eaL, 0xa06c0b5dL,
    0xd4326d90L, 0xd0f37027L, 0xddb056feL, 0xd9714b49L,
    0xc7361b4cL, 0xc3f706fbL, 0xceb42022L, 0xca753d95L,
    0xf23a8028L, 0xf6fb9d9fL, 0xfbb8bb46L, 0xff79a6f1L,
    0xe13ef6f4L, 0xe5ffeb43L, 0xe8bccd9aL, 0xec7dd02dL,
    0x34867077L, 0x30476dc0L, 0x3d044b19L, 0x39c556aeL,
    0x278206abL, 0x23431b1cL, 0x2e003dc5L, 0x2ac12072L,
    0x128e9dcfL, 0x164f8078L, 0x1b0ca6a1L, 0x1fcdbb16L,
    0x018aeb13L, 0x054bf6a4L, 0x0808d07dL, 0x0cc9cdcaL,
    0x7897ab07L, 0x7c56b6b0L, 0x71159069L, 0x75d48ddeL,
    0x6b93dddbL, 0x6f52c06cL, 0x6211e6b5L, 0x66d0fb02L,
    0x5e9f46bfL, 0x5a5e5b08L, 0x571d7dd1L, 0x53dc6066L,
    0x4d9b3063L, 0x495a2dd4L, 0x44190b0dL, 0x40d816baL,
    0xaca5c697L, 0xa864db20L, 0xa527fdf9L, 0xa1e6e04eL,
    0xbfa1b04bL, 0xbb60adfcL, 0xb6238b25L, 0xb2e29692L,
    0x8aad2b2fL, 0x8e6c3698L, 0x832f1041L, 0x87ee0df6L,
    0x99a95df3L, 0x9d684044L, 0x902b669dL, 0x94ea7b2aL,
    0xe0b41de7L, 0xe4750050L, 0xe9362689L, 0xedf73b3eL,
    0xf3b06b3bL, 0xf771768cL, 0xfa325055L, 0xfef34de2L,
    0xc6bcf05fL, 0xc27dede8L, 0xcf3ecb31L, 0xcbffd686L,
    0xd5b88683L, 0xd1799b34L, 0xdc3abdedL, 0xd8fba05aL,
    0x690ce0eeL, 0x6dcdfd59L, 0x608edb80L, 0x644fc637L,
    0x7a089632L, 0x7ec98b85L, 0x738aad5cL, 0x774bb0ebL,
    0x4f040d56L, 0x4bc510e1L, 0x46863638L, 0x42472b8fL,
    0x5c007b8aL, 0x58c1663dL, 0x558240e4L, 0x51435d53L,
    0x251d3b9eL, 0x21dc2629L, 0x2c9f00f0L, 0x285e1d47L,
    0x36194d42L, 0x32d850f5L, 0x3f9b762cL, 0x3b5a6b9bL,
    0x0315d626L, 0x07d4cb91L, 0x0a97ed48L, 0x0e56f0ffL,
    0x1011a0faL, 0x14d0bd4dL, 0x19939b94L, 0x1d528623L,
    0xf12f560eL, 0xf5ee4bb9L, 0xf8ad6d60L, 0xfc6c70d7L,
    0xe22b20d2L, 0xe6ea3d65L, 0xeba91bbcL, 0xef68060bL,
    0xd727bbb6L, 0xd3e6a601L, 0xdea580d8L, 0xda649d6fL,
    0xc423cd6aL, 0xc0e2d0ddL, 0xcda1f604L, 0xc960ebb3L,
    0xbd3e8d7eL, 0xb9ff90c9L, 0xb4bcb610L, 0xb07daba7L,
    0xae3afba2L, 0xaafbe615L, 0xa7b8c0ccL, 0xa379dd7bL,
    0x9b3660c6L, 0x9ff77d71L, 0x92b45ba8L, 0x9675461fL,
    0x8832161aL, 0x8cf30badL, 0x81b02d74L, 0x857130c3L,
    0x5d8a9099L, 0x594b8d2eL, 0x5408abf7L, 0x50c9b640L,
    0x4e8ee645L, 0x4a4ffbf2L, 0x470cdd2bL, 0x43cdc09cL,
    0x7b827d21L, 0x7f436096L, 0x7200464fL, 0x76c15bf8L,
    0x68860bfdL, 0x6c47164aL, 0x61043093L, 0x65c52d24L,
    0x119b4be9L, 0x155a565eL, 0x18197087L, 0x1cd86d30L,
    0x029f3d35L, 0x065e2082L, 0x0b1d065bL, 0x0fdc1becL,
    0x3793a651L, 0x3352bbe6L, 0x3e119d3fL, 0x3ad08088L,
    0x2497d08dL, 0x2056cd3aL, 0x2d15ebe3L, 0x29d4f654L,
    0xc5a92679L, 0xc1683bceL, 0xcc2b1d17L, 0xc8ea00a0L,
    0xd6ad50a5L, 0xd26c4d12L, 0xdf2f6bcbL, 0xdbee767cL,
    0xe3a1cbc1L, 0xe760d676L, 0xea23f0afL, 0xeee2ed18L,
    0xf0a5bd1dL, 0xf464a0aaL, 0xf9278673L, 0xfde69bc4L,
    0x89b8fd09L, 0x8d79e0beL, 0x803ac667L, 0x84fbdbd0L,
    0x9abc8bd5L, 0x9e7d9662L, 0x933eb0bbL, 0x97ffad0cL,
    0xafb010b1L, 0xab710d06L, 0xa6322bdfL, 0xa2f33668L,
    0xbcb4666dL, 0xb8757bdaL, 0xb5365d03L, 0xb1f740b4L
};

/*-------------------------------------------------------------*/
/*--- end                                        crctable.c ---*/
/*-------------------------------------------------------------*/

================================================
FILE: third-party/tbb/examples/graph/fgbzip2/decompress.cpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /*-------------------------------------------------------------*/ /*--- Decompression machinery ---*/ /*--- decompress.cpp ---*/ /*-------------------------------------------------------------*/ /* ------------------------------------------------------------------ The original source for this example: This file is part of bzip2/libbzip2, a program and library for lossless, block-sorting data compression. bzip2/libbzip2 version 1.0.6 of 6 September 2010 Copyright (C) 1996-2010 Julian Seward This program, "bzip2", the associated library "libbzip2", and all documentation, are copyright (C) 1996-2010 Julian R Seward. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 3. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 4. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Julian Seward, jseward@bzip.org bzip2/libbzip2 version 1.0.6 of 6 September 2010 ------------------------------------------------------------------ */ #include "bzlib_private.hpp" /*---------------------------------------------------*/ static void makeMaps_d(DState* s) { Int32 i; s->nInUse = 0; for (i = 0; i < 256; i++) if (s->inUse[i]) { s->seqToUnseq[s->nInUse] = i; s->nInUse++; } } /*---------------------------------------------------*/ #define RETURN(rrr) \ { \ retVal = rrr; \ goto save_state_and_return; \ }; #define GET_BITS(lll, vvv, nnn) \ case lll: \ s->state = lll; \ while (True) { \ if (s->bsLive >= nnn) { \ UInt32 v; \ v = (s->bsBuff >> (s->bsLive - nnn)) & ((1 << nnn) - 1); \ s->bsLive -= nnn; \ vvv = v; \ break; \ } \ if (s->strm->avail_in == 0) \ RETURN(BZ_OK); \ s->bsBuff = (s->bsBuff << 8) | ((UInt32)(*((UChar*)(s->strm->next_in)))); \ s->bsLive += 8; \ s->strm->next_in++; \ s->strm->avail_in--; \ s->strm->total_in_lo32++; \ if (s->strm->total_in_lo32 == 0) \ s->strm->total_in_hi32++; \ } #define GET_UCHAR(lll, uuu) GET_BITS(lll, uuu, 8) #define GET_BIT(lll, uuu) GET_BITS(lll, uuu, 1) /*---------------------------------------------------*/ #define GET_MTF_VAL(label1, label2, lval) \ { \ if (groupPos == 0) { \ groupNo++; \ if (groupNo >= nSelectors) \ 
RETURN(BZ_DATA_ERROR); \ groupPos = BZ_G_SIZE; \ gSel = s->selector[groupNo]; \ gMinlen = s->minLens[gSel]; \ gLimit = &(s->limit[gSel][0]); \ gPerm = &(s->perm[gSel][0]); \ gBase = &(s->base[gSel][0]); \ } \ groupPos--; \ zn = gMinlen; \ GET_BITS(label1, zvec, zn); \ while (1) { \ if (zn > 20 /* the longest code */) \ RETURN(BZ_DATA_ERROR); \ if (zvec <= gLimit[zn]) \ break; \ zn++; \ GET_BIT(label2, zj); \ zvec = (zvec << 1) | zj; \ }; \ if (zvec - gBase[zn] < 0 || zvec - gBase[zn] >= BZ_MAX_ALPHA_SIZE) \ RETURN(BZ_DATA_ERROR); \ lval = gPerm[zvec - gBase[zn]]; \ } /*---------------------------------------------------*/ Int32 BZ2_decompress(DState* s) { UChar uc; Int32 retVal; Int32 minLen, maxLen; bz_stream* strm = s->strm; /* stuff that needs to be saved/restored */ Int32 i; Int32 j; Int32 t; Int32 alphaSize; Int32 nGroups; Int32 nSelectors; Int32 EOB; Int32 groupNo; Int32 groupPos; Int32 nextSym; Int32 nblockMAX; Int32 nblock; Int32 es; Int32 N; Int32 curr; Int32 zt; Int32 zn; Int32 zvec; Int32 zj; Int32 gSel; Int32 gMinlen; Int32* gLimit; Int32* gBase; Int32* gPerm; if (s->state == BZ_X_MAGIC_1) { /*initialise the save area*/ s->save_i = 0; s->save_j = 0; s->save_t = 0; s->save_alphaSize = 0; s->save_nGroups = 0; s->save_nSelectors = 0; s->save_EOB = 0; s->save_groupNo = 0; s->save_groupPos = 0; s->save_nextSym = 0; s->save_nblockMAX = 0; s->save_nblock = 0; s->save_es = 0; s->save_N = 0; s->save_curr = 0; s->save_zt = 0; s->save_zn = 0; s->save_zvec = 0; s->save_zj = 0; s->save_gSel = 0; s->save_gMinlen = 0; s->save_gLimit = nullptr; s->save_gBase = nullptr; s->save_gPerm = nullptr; } /*restore from the save area*/ i = s->save_i; j = s->save_j; t = s->save_t; alphaSize = s->save_alphaSize; nGroups = s->save_nGroups; nSelectors = s->save_nSelectors; EOB = s->save_EOB; groupNo = s->save_groupNo; groupPos = s->save_groupPos; nextSym = s->save_nextSym; nblockMAX = s->save_nblockMAX; nblock = s->save_nblock; es = s->save_es; N = s->save_N; curr = s->save_curr; zt 
= s->save_zt; zn = s->save_zn; zvec = s->save_zvec; zj = s->save_zj; gSel = s->save_gSel; gMinlen = s->save_gMinlen; gLimit = s->save_gLimit; gBase = s->save_gBase; gPerm = s->save_gPerm; retVal = BZ_OK; switch (s->state) { GET_UCHAR(BZ_X_MAGIC_1, uc); if (uc != BZ_HDR_B) RETURN(BZ_DATA_ERROR_MAGIC); GET_UCHAR(BZ_X_MAGIC_2, uc); if (uc != BZ_HDR_Z) RETURN(BZ_DATA_ERROR_MAGIC); GET_UCHAR(BZ_X_MAGIC_3, uc) if (uc != BZ_HDR_h) RETURN(BZ_DATA_ERROR_MAGIC); GET_BITS(BZ_X_MAGIC_4, s->blockSize100k, 8) if (s->blockSize100k < (BZ_HDR_0 + 1) || s->blockSize100k > (BZ_HDR_0 + 9)) RETURN(BZ_DATA_ERROR_MAGIC); s->blockSize100k -= BZ_HDR_0; if (s->smallDecompress) { s->ll16 = (UInt16*)BZALLOC(s->blockSize100k * 100000 * sizeof(UInt16)); s->ll4 = (UChar*)BZALLOC(((1 + s->blockSize100k * 100000) >> 1) * sizeof(UChar)); if (s->ll16 == nullptr || s->ll4 == nullptr) RETURN(BZ_MEM_ERROR); } else { s->tt = (UInt32*)BZALLOC(s->blockSize100k * 100000 * sizeof(Int32)); if (s->tt == nullptr) RETURN(BZ_MEM_ERROR); } GET_UCHAR(BZ_X_BLKHDR_1, uc); if (uc == 0x17) goto endhdr_2; if (uc != 0x31) RETURN(BZ_DATA_ERROR); GET_UCHAR(BZ_X_BLKHDR_2, uc); if (uc != 0x41) RETURN(BZ_DATA_ERROR); GET_UCHAR(BZ_X_BLKHDR_3, uc); if (uc != 0x59) RETURN(BZ_DATA_ERROR); GET_UCHAR(BZ_X_BLKHDR_4, uc); if (uc != 0x26) RETURN(BZ_DATA_ERROR); GET_UCHAR(BZ_X_BLKHDR_5, uc); if (uc != 0x53) RETURN(BZ_DATA_ERROR); GET_UCHAR(BZ_X_BLKHDR_6, uc); if (uc != 0x59) RETURN(BZ_DATA_ERROR); s->currBlockNo++; if (s->verbosity >= 2) VPrintf1("\n [%d: huff+mtf ", s->currBlockNo); s->storedBlockCRC = 0; GET_UCHAR(BZ_X_BCRC_1, uc); s->storedBlockCRC = (s->storedBlockCRC << 8) | ((UInt32)uc); GET_UCHAR(BZ_X_BCRC_2, uc); s->storedBlockCRC = (s->storedBlockCRC << 8) | ((UInt32)uc); GET_UCHAR(BZ_X_BCRC_3, uc); s->storedBlockCRC = (s->storedBlockCRC << 8) | ((UInt32)uc); GET_UCHAR(BZ_X_BCRC_4, uc); s->storedBlockCRC = (s->storedBlockCRC << 8) | ((UInt32)uc); GET_BITS(BZ_X_RANDBIT, s->blockRandomised, 1); s->origPtr = 0; 
GET_UCHAR(BZ_X_ORIGPTR_1, uc); s->origPtr = (s->origPtr << 8) | ((Int32)uc); GET_UCHAR(BZ_X_ORIGPTR_2, uc); s->origPtr = (s->origPtr << 8) | ((Int32)uc); GET_UCHAR(BZ_X_ORIGPTR_3, uc); s->origPtr = (s->origPtr << 8) | ((Int32)uc); if (s->origPtr < 0) RETURN(BZ_DATA_ERROR); if (s->origPtr > 10 + 100000 * s->blockSize100k) RETURN(BZ_DATA_ERROR); /*--- Receive the mapping table ---*/ for (i = 0; i < 16; i++) { GET_BIT(BZ_X_MAPPING_1, uc); if (uc == 1) s->inUse16[i] = True; else s->inUse16[i] = False; } for (i = 0; i < 256; i++) s->inUse[i] = False; for (i = 0; i < 16; i++) if (s->inUse16[i]) for (j = 0; j < 16; j++) { GET_BIT(BZ_X_MAPPING_2, uc); if (uc == 1) s->inUse[i * 16 + j] = True; } makeMaps_d(s); if (s->nInUse == 0) RETURN(BZ_DATA_ERROR); alphaSize = s->nInUse + 2; /*--- Now the selectors ---*/ GET_BITS(BZ_X_SELECTOR_1, nGroups, 3); if (nGroups < 2 || nGroups > 6) RETURN(BZ_DATA_ERROR); GET_BITS(BZ_X_SELECTOR_2, nSelectors, 15); if (nSelectors < 1) RETURN(BZ_DATA_ERROR); for (i = 0; i < nSelectors; i++) { j = 0; while (True) { GET_BIT(BZ_X_SELECTOR_3, uc); if (uc == 0) break; j++; if (j >= nGroups) RETURN(BZ_DATA_ERROR); } s->selectorMtf[i] = j; } /*--- Undo the MTF values for the selectors. 
---*/ { UChar pos[BZ_N_GROUPS], tmp, v; for (i = 0; i < nGroups; i++) pos[i] = i; for (i = 0; i < nSelectors; i++) { v = s->selectorMtf[i]; tmp = pos[v]; while (v > 0) { pos[v] = pos[v - 1]; v--; } pos[0] = tmp; s->selector[i] = tmp; } } /*--- Now the coding tables ---*/ for (j = 0; j < nGroups; j++) { GET_BITS(BZ_X_CODING_1, curr, 5); for (i = 0; i < alphaSize; i++) { while (True) { if (curr < 1 || curr > 20) RETURN(BZ_DATA_ERROR); GET_BIT(BZ_X_CODING_2, uc); if (uc == 0) break; GET_BIT(BZ_X_CODING_3, uc); if (uc == 0) curr++; else curr--; } s->len[j][i] = curr; } } /*--- Create the Huffman decoding tables ---*/ for (j = 0; j < nGroups; j++) { minLen = 32; maxLen = 0; for (i = 0; i < alphaSize; i++) { if (s->len[j][i] > maxLen) maxLen = s->len[j][i]; if (s->len[j][i] < minLen) minLen = s->len[j][i]; } BZ2_hbCreateDecodeTables(&(s->limit[j][0]), &(s->base[j][0]), &(s->perm[j][0]), &(s->len[j][0]), minLen, maxLen, alphaSize); s->minLens[j] = minLen; } /*--- Now the MTF values ---*/ EOB = s->nInUse + 1; nblockMAX = 100000 * s->blockSize100k; groupNo = -1; groupPos = 0; for (i = 0; i <= 255; i++) s->unzftab[i] = 0; /*-- MTF init --*/ { Int32 l, j, k; k = MTFA_SIZE - 1; for (l = 256 / MTFL_SIZE - 1; l >= 0; l--) { for (j = MTFL_SIZE - 1; j >= 0; j--) { s->mtfa[k] = (UChar)(l * MTFL_SIZE + j); k--; } s->mtfbase[l] = k + 1; } } /*-- end MTF init --*/ nblock = 0; GET_MTF_VAL(BZ_X_MTF_1, BZ_X_MTF_2, nextSym); while (True) { if (nextSym == EOB) break; if (nextSym == BZ_RUNA || nextSym == BZ_RUNB) { es = -1; N = 1; do { /* Check that N doesn't get too big, so that es doesn't go negative. The maximum value that can be RUNA/RUNB encoded is equal to the block size (post the initial RLE), viz, 900k, so bounding N at 2 million should guard against overflow without rejecting any legitimate inputs. 
*/ if (N >= 2 * 1024 * 1024) RETURN(BZ_DATA_ERROR); if (nextSym == BZ_RUNA) es = es + (0 + 1) * N; else if (nextSym == BZ_RUNB) es = es + (1 + 1) * N; N = N * 2; GET_MTF_VAL(BZ_X_MTF_3, BZ_X_MTF_4, nextSym); } while (nextSym == BZ_RUNA || nextSym == BZ_RUNB); es++; uc = s->seqToUnseq[s->mtfa[s->mtfbase[0]]]; s->unzftab[uc] += es; if (s->smallDecompress) while (es > 0) { if (nblock >= nblockMAX) RETURN(BZ_DATA_ERROR); s->ll16[nblock] = (UInt16)uc; nblock++; es--; } else while (es > 0) { if (nblock >= nblockMAX) RETURN(BZ_DATA_ERROR); s->tt[nblock] = (UInt32)uc; nblock++; es--; }; continue; } else { if (nblock >= nblockMAX) RETURN(BZ_DATA_ERROR); /*-- uc = MTF ( nextSym-1 ) --*/ { Int32 i, j, k, l, lno, off; UInt32 nn; nn = (UInt32)(nextSym - 1); if (nn < MTFL_SIZE) { /* avoid general-case expense */ l = s->mtfbase[0]; uc = s->mtfa[l + nn]; while (nn > 3) { Int32 z = l + nn; s->mtfa[(z)] = s->mtfa[(z)-1]; s->mtfa[(z)-1] = s->mtfa[(z)-2]; s->mtfa[(z)-2] = s->mtfa[(z)-3]; s->mtfa[(z)-3] = s->mtfa[(z)-4]; nn -= 4; } while (nn > 0) { s->mtfa[(l + nn)] = s->mtfa[(l + nn) - 1]; nn--; }; s->mtfa[l] = uc; } else { /* general case */ lno = nn / MTFL_SIZE; off = nn % MTFL_SIZE; l = s->mtfbase[lno] + off; uc = s->mtfa[l]; while (l > s->mtfbase[lno]) { s->mtfa[l] = s->mtfa[l - 1]; l--; }; s->mtfbase[lno]++; while (lno > 0) { s->mtfbase[lno]--; s->mtfa[s->mtfbase[lno]] = s->mtfa[s->mtfbase[lno - 1] + MTFL_SIZE - 1]; lno--; } s->mtfbase[0]--; s->mtfa[s->mtfbase[0]] = uc; if (s->mtfbase[0] == 0) { k = MTFA_SIZE - 1; for (i = 256 / MTFL_SIZE - 1; i >= 0; i--) { for (j = MTFL_SIZE - 1; j >= 0; j--) { s->mtfa[k] = s->mtfa[s->mtfbase[i] + j]; k--; } s->mtfbase[i] = k + 1; } } } } /*-- end uc = MTF ( nextSym-1 ) --*/ s->unzftab[s->seqToUnseq[uc]]++; if (s->smallDecompress) s->ll16[nblock] = (UInt16)(s->seqToUnseq[uc]); else s->tt[nblock] = (UInt32)(s->seqToUnseq[uc]); nblock++; GET_MTF_VAL(BZ_X_MTF_5, BZ_X_MTF_6, nextSym); continue; } } /* Now we know what nblock is, we can do a better 
sanity check on s->origPtr. */ if (s->origPtr < 0 || s->origPtr >= nblock) RETURN(BZ_DATA_ERROR); /*-- Set up cftab to facilitate generation of T^(-1) --*/ /* Check: unzftab entries in range. */ for (i = 0; i <= 255; i++) { if (s->unzftab[i] < 0 || s->unzftab[i] > nblock) RETURN(BZ_DATA_ERROR); } /* Actually generate cftab. */ s->cftab[0] = 0; for (i = 1; i <= 256; i++) s->cftab[i] = s->unzftab[i - 1]; for (i = 1; i <= 256; i++) s->cftab[i] += s->cftab[i - 1]; /* Check: cftab entries in range. */ for (i = 0; i <= 256; i++) { if (s->cftab[i] < 0 || s->cftab[i] > nblock) { /* s->cftab[i] can legitimately be == nblock */ RETURN(BZ_DATA_ERROR); } } /* Check: cftab entries non-descending. */ for (i = 1; i <= 256; i++) { if (s->cftab[i - 1] > s->cftab[i]) { RETURN(BZ_DATA_ERROR); } } s->state_out_len = 0; s->state_out_ch = 0; BZ_INITIALISE_CRC(s->calculatedBlockCRC); s->state = BZ_X_OUTPUT; if (s->verbosity >= 2) VPrintf0("rt+rld"); if (s->smallDecompress) { /*-- Make a copy of cftab, used in generation of T --*/ for (i = 0; i <= 256; i++) s->cftabCopy[i] = s->cftab[i]; /*-- compute the T vector --*/ for (i = 0; i < nblock; i++) { uc = (UChar)(s->ll16[i]); SET_LL(i, s->cftabCopy[uc]); s->cftabCopy[uc]++; } /*-- Compute T^(-1) by pointer reversal on T --*/ i = s->origPtr; j = GET_LL(i); do { Int32 tmp = GET_LL(j); SET_LL(j, i); i = j; j = tmp; } while (i != s->origPtr); s->tPos = s->origPtr; s->nblock_used = 0; if (s->blockRandomised) { BZ_RAND_INIT_MASK; BZ_GET_SMALL(s->k0); s->nblock_used++; BZ_RAND_UPD_MASK; s->k0 ^= BZ_RAND_MASK; } else { BZ_GET_SMALL(s->k0); s->nblock_used++; } } else { /*-- compute the T^(-1) vector --*/ for (i = 0; i < nblock; i++) { uc = (UChar)(s->tt[i] & 0xff); s->tt[s->cftab[uc]] |= (i << 8); s->cftab[uc]++; } s->tPos = s->tt[s->origPtr] >> 8; s->nblock_used = 0; if (s->blockRandomised) { BZ_RAND_INIT_MASK; BZ_GET_FAST(s->k0); s->nblock_used++; BZ_RAND_UPD_MASK; s->k0 ^= BZ_RAND_MASK; } else { BZ_GET_FAST(s->k0); s->nblock_used++; } } 
RETURN(BZ_OK); endhdr_2: GET_UCHAR(BZ_X_ENDHDR_2, uc); if (uc != 0x72) RETURN(BZ_DATA_ERROR); GET_UCHAR(BZ_X_ENDHDR_3, uc); if (uc != 0x45) RETURN(BZ_DATA_ERROR); GET_UCHAR(BZ_X_ENDHDR_4, uc); if (uc != 0x38) RETURN(BZ_DATA_ERROR); GET_UCHAR(BZ_X_ENDHDR_5, uc); if (uc != 0x50) RETURN(BZ_DATA_ERROR); GET_UCHAR(BZ_X_ENDHDR_6, uc); if (uc != 0x90) RETURN(BZ_DATA_ERROR); s->storedCombinedCRC = 0; GET_UCHAR(BZ_X_CCRC_1, uc); s->storedCombinedCRC = (s->storedCombinedCRC << 8) | ((UInt32)uc); GET_UCHAR(BZ_X_CCRC_2, uc); s->storedCombinedCRC = (s->storedCombinedCRC << 8) | ((UInt32)uc); GET_UCHAR(BZ_X_CCRC_3, uc); s->storedCombinedCRC = (s->storedCombinedCRC << 8) | ((UInt32)uc); GET_UCHAR(BZ_X_CCRC_4, uc); s->storedCombinedCRC = (s->storedCombinedCRC << 8) | ((UInt32)uc); s->state = BZ_X_IDLE; RETURN(BZ_STREAM_END); default: AssertH(False, 4001); } AssertH(False, 4002); save_state_and_return: s->save_i = i; s->save_j = j; s->save_t = t; s->save_alphaSize = alphaSize; s->save_nGroups = nGroups; s->save_nSelectors = nSelectors; s->save_EOB = EOB; s->save_groupNo = groupNo; s->save_groupPos = groupPos; s->save_nextSym = nextSym; s->save_nblockMAX = nblockMAX; s->save_nblock = nblock; s->save_es = es; s->save_N = N; s->save_curr = curr; s->save_zt = zt; s->save_zn = zn; s->save_zvec = zvec; s->save_zj = zj; s->save_gSel = gSel; s->save_gMinlen = gMinlen; s->save_gLimit = gLimit; s->save_gBase = gBase; s->save_gPerm = gPerm; return retVal; } /*-------------------------------------------------------------*/ /*--- end decompress.c ---*/ /*-------------------------------------------------------------*/ ================================================ FILE: third-party/tbb/examples/graph/fgbzip2/fgbzip2.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include #include #include #include #include #include "bzlib.hpp" #include "common/utility/utility.hpp" #include "oneapi/tbb/flow_graph.h" #include "oneapi/tbb/tick_count.h" #include "oneapi/tbb/concurrent_queue.h" // TODO: change memory allocation/deallocation to be managed in constructor/destructor struct Buffer { std::size_t len; char* b; }; struct BufferMsg { BufferMsg() {} BufferMsg(Buffer& inputBuffer, Buffer& outputBuffer, std::size_t seqId, bool isLast = false) : inputBuffer(inputBuffer), outputBuffer(outputBuffer), seqId(seqId), isLast(isLast) {} static BufferMsg createBufferMsg(std::size_t seqId, std::size_t chunkSize) { Buffer inputBuffer; inputBuffer.b = new char[chunkSize]; inputBuffer.len = chunkSize; Buffer outputBuffer; std::size_t compressedChunkSize = chunkSize * 1.01 + 600; // compression overhead outputBuffer.b = new char[compressedChunkSize]; outputBuffer.len = compressedChunkSize; return BufferMsg(inputBuffer, outputBuffer, seqId); } static void destroyBufferMsg(const BufferMsg& destroyMsg) { delete[] destroyMsg.inputBuffer.b; delete[] destroyMsg.outputBuffer.b; } void markLast(std::size_t lastId) { isLast = true; seqId = lastId; } std::size_t seqId; Buffer inputBuffer; Buffer outputBuffer; bool isLast; }; class BufferCompressor { public: BufferCompressor(int blockSizeIn100KB) : m_blockSize(blockSizeIn100KB) {} BufferMsg operator()(BufferMsg buffer) const { if (!buffer.isLast) { unsigned int outSize = buffer.outputBuffer.len; BZ2_bzBuffToBuffCompress(buffer.outputBuffer.b, &outSize, buffer.inputBuffer.b, buffer.inputBuffer.len, m_blockSize, 0, 30); 
buffer.outputBuffer.len = outSize; } return buffer; } private: int m_blockSize; }; class IOOperations { public: IOOperations(std::ifstream& inputStream, std::ofstream& outputStream, std::size_t chunkSize) : m_inputStream(inputStream), m_outputStream(outputStream), m_chunkSize(chunkSize), m_chunksRead(0) {} void readChunk(Buffer& buffer) { m_inputStream.read(buffer.b, m_chunkSize); buffer.len = static_cast(m_inputStream.gcount()); m_chunksRead++; } void writeChunk(const Buffer& buffer) { m_outputStream.write(buffer.b, buffer.len); } std::size_t chunksRead() const { return m_chunksRead; } std::size_t chunkSize() const { return m_chunkSize; } bool hasDataToRead() const { return m_inputStream.is_open() && !m_inputStream.eof(); } private: std::ifstream& m_inputStream; std::ofstream& m_outputStream; std::size_t m_chunkSize; std::size_t m_chunksRead; }; //----------------------------------------------------------------------------------------------------------------------- //---------------------------------------Compression example based on async_node----------------------------------------- //----------------------------------------------------------------------------------------------------------------------- typedef oneapi::tbb::flow::async_node async_file_reader_node; typedef oneapi::tbb::flow::async_node async_file_writer_node; class AsyncNodeActivity { public: AsyncNodeActivity(IOOperations& io) : m_io(io), m_fileWriterThread(&AsyncNodeActivity::writingLoop, this) {} ~AsyncNodeActivity() { m_fileReaderThread.join(); m_fileWriterThread.join(); } void submitRead(async_file_reader_node::gateway_type& gateway) { gateway.reserve_wait(); std::thread(&AsyncNodeActivity::readingLoop, this, std::ref(gateway)) .swap(m_fileReaderThread); } void submitWrite(const BufferMsg& bufferMsg) { m_writeQueue.push(bufferMsg); } private: void readingLoop(async_file_reader_node::gateway_type& gateway) { while (m_io.hasDataToRead()) { BufferMsg bufferMsg = 
BufferMsg::createBufferMsg(m_io.chunksRead(), m_io.chunkSize()); m_io.readChunk(bufferMsg.inputBuffer); gateway.try_put(bufferMsg); } sendLastMessage(gateway); gateway.release_wait(); } void writingLoop() { BufferMsg buffer; m_writeQueue.pop(buffer); while (!buffer.isLast) { m_io.writeChunk(buffer.outputBuffer); m_writeQueue.pop(buffer); } } void sendLastMessage(async_file_reader_node::gateway_type& gateway) { BufferMsg lastMsg; lastMsg.markLast(m_io.chunksRead()); gateway.try_put(lastMsg); } IOOperations& m_io; oneapi::tbb::concurrent_bounded_queue m_writeQueue; std::thread m_fileReaderThread; std::thread m_fileWriterThread; }; void fgCompressionAsyncNode(IOOperations& io, int blockSizeIn100KB) { oneapi::tbb::flow::graph g; AsyncNodeActivity asyncNodeActivity(io); async_file_reader_node file_reader( g, oneapi::tbb::flow::unlimited, [&asyncNodeActivity](const oneapi::tbb::flow::continue_msg& msg, async_file_reader_node::gateway_type& gateway) { asyncNodeActivity.submitRead(gateway); }); oneapi::tbb::flow::function_node compressor( g, oneapi::tbb::flow::unlimited, BufferCompressor(blockSizeIn100KB)); oneapi::tbb::flow::sequencer_node ordering(g, [](const BufferMsg& bufferMsg) -> size_t { return bufferMsg.seqId; }); // The node is serial to preserve the right order of buffers set by the preceding sequencer_node async_file_writer_node output_writer( g, oneapi::tbb::flow::serial, [&asyncNodeActivity](const BufferMsg& bufferMsg, async_file_writer_node::gateway_type& gateway) { asyncNodeActivity.submitWrite(bufferMsg); }); make_edge(file_reader, compressor); make_edge(compressor, ordering); make_edge(ordering, output_writer); file_reader.try_put(oneapi::tbb::flow::continue_msg()); g.wait_for_all(); } //----------------------------------------------------------------------------------------------------------------------- //---------------------------------------------Simple compression example------------------------------------------------ 
//----------------------------------------------------------------------------------------------------------------------- void fgCompression(IOOperations& io, int blockSizeIn100KB) { oneapi::tbb::flow::graph g; oneapi::tbb::flow::input_node file_reader( g, [&io](oneapi::tbb::flow_control& fc) -> BufferMsg { if (io.hasDataToRead()) { BufferMsg bufferMsg = BufferMsg::createBufferMsg(io.chunksRead(), io.chunkSize()); io.readChunk(bufferMsg.inputBuffer); return bufferMsg; } fc.stop(); return BufferMsg{}; }); file_reader.activate(); oneapi::tbb::flow::function_node compressor( g, oneapi::tbb::flow::unlimited, BufferCompressor(blockSizeIn100KB)); oneapi::tbb::flow::sequencer_node ordering(g, [](const BufferMsg& buffer) -> size_t { return buffer.seqId; }); oneapi::tbb::flow::function_node output_writer( g, oneapi::tbb::flow::serial, [&io](const BufferMsg& bufferMsg) { io.writeChunk(bufferMsg.outputBuffer); BufferMsg::destroyBufferMsg(bufferMsg); }); make_edge(file_reader, compressor); make_edge(compressor, ordering); make_edge(ordering, output_writer); g.wait_for_all(); } //----------------------------------------------------------------------------------------------------------------------- bool endsWith(const std::string& str, const std::string& suffix) { return str.find(suffix, str.length() - suffix.length()) != std::string::npos; } //----------------------------------------------------------------------------------------------------------------------- int main(int argc, char* argv[]) { oneapi::tbb::tick_count mainStartTime = oneapi::tbb::tick_count::now(); const std::string archiveExtension = ".bz2"; bool verbose = false; bool asyncType; std::string inputFileName; int blockSizeIn100KB = 1; // block size in 100KB chunks std::size_t memoryLimitIn1MB = 1; // memory limit for compression in megabytes granularity utility::parse_cli_arguments( argc, argv, utility::cli_argument_pack() //"-h" option for displaying help is present implicitly .arg(blockSizeIn100KB, "-b", "\t 
block size in 100KB chunks, [1 .. 9]") .arg(verbose, "-v", "verbose mode") .arg(memoryLimitIn1MB, "-l", "used memory limit for compression algorithm in 1MB (minimum) granularity") .arg(asyncType, "-async", "use graph async_node-based implementation") .positional_arg(inputFileName, "filename", "input file name")); if (inputFileName.empty()) { throw std::invalid_argument( "Input file name is not specified. Try 'fgbzip2 -h' for more information."); } if (blockSizeIn100KB < 1 || blockSizeIn100KB > 9) { throw std::invalid_argument("Incorrect block size. Try 'fgbzip2 -h' for more information."); } if (memoryLimitIn1MB < 1) { throw std::invalid_argument( "Incorrect memory limit size. Try 'fgbzip2 -h' for more information."); } if (verbose) std::cout << "Input file name: " << inputFileName << "\n"; if (endsWith(inputFileName, archiveExtension)) { throw std::invalid_argument("Input file already have " + archiveExtension + " extension."); } std::ifstream inputStream(inputFileName.c_str(), std::ios::in | std::ios::binary); if (!inputStream.is_open()) { throw std::invalid_argument("Cannot open " + inputFileName + " file."); } std::string outputFileName(inputFileName + archiveExtension); std::ofstream outputStream(outputFileName.c_str(), std::ios::out | std::ios::binary | std::ios::trunc); if (!outputStream.is_open()) { throw std::invalid_argument("Cannot open " + outputFileName + " file."); } // General interface to work with I/O buffers operations std::size_t chunkSize = blockSizeIn100KB * 100 * 1024; IOOperations io(inputStream, outputStream, chunkSize); if (asyncType) { if (verbose) std::cout << "Running flow graph based compression algorithm with async_node based asynchronous IO operations." << "\n"; fgCompressionAsyncNode(io, blockSizeIn100KB); } else { if (verbose) std::cout << "Running flow graph based compression algorithm." 
<< "\n"; fgCompression(io, blockSizeIn100KB); } inputStream.close(); outputStream.close(); utility::report_elapsed_time((oneapi::tbb::tick_count::now() - mainStartTime).seconds()); return 0; } ================================================ FILE: third-party/tbb/examples/graph/fgbzip2/huffman.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /*-------------------------------------------------------------*/ /*--- Huffman coding low-level stuff ---*/ /*--- huffman.cpp ---*/ /*-------------------------------------------------------------*/ /* ------------------------------------------------------------------ The original source for this example: This file is part of bzip2/libbzip2, a program and library for lossless, block-sorting data compression. bzip2/libbzip2 version 1.0.6 of 6 September 2010 Copyright (C) 1996-2010 Julian Seward This program, "bzip2", the associated library "libbzip2", and all documentation, are copyright (C) 1996-2010 Julian R Seward. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. 
If you use this software in a product, an acknowledgment
in the product documentation would be appreciated but is
not required.

3. Altered source versions must be plainly marked as such, and must
not be misrepresented as being the original software.

4. The name of the author may not be used to endorse or promote
products derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Julian Seward, jseward@bzip.org
bzip2/libbzip2 version 1.0.6 of 6 September 2010
------------------------------------------------------------------ */

#include "bzlib_private.hpp"

/*---------------------------------------------------*/
/*-- A node weight packs two fields into one Int32:
     bits 8..31 carry the weight itself, bits 0..7 carry
     the depth of the subtree below the node. --*/
#define WEIGHTOF(zz0) ((zz0)&0xffffff00)
#define DEPTHOF(zz1) ((zz1)&0x000000ff)
#define MYMAX(zz2, zz3) ((zz2) > (zz3) ? (zz2) : (zz3))

/*-- Packed weight of a parent node: sum of the children's
     weights, with depth = 1 + the deeper child's depth.
     NOTE(review): the expansion has no outer parentheses,
     so it is only safe as a complete right-hand side,
     which is how it is used below. --*/
#define ADDWEIGHTS(zw1, zw2) \
    (WEIGHTOF(zw1) + WEIGHTOF(zw2)) | (1 + MYMAX(DEPTHOF(zw1), DEPTHOF(zw2)))

/*-- Sift the heap entry at index z towards the root while
     its weight is smaller than its parent's (min-heap on
     weight[]). Uses the caller's locals heap[] and weight[].
     There is no explicit bounds test: the loop ends at the
     sentinel heap[0] = 0 with weight[0] = 0. --*/
#define UPHEAP(z) \
    { \
        Int32 zz, tmp; \
        zz = z; \
        tmp = heap[zz]; \
        while (weight[tmp] < weight[heap[zz >> 1]]) { \
            heap[zz] = heap[zz >> 1]; \
            zz >>= 1; \
        } \
        heap[zz] = tmp; \
    }

/*-- Sift the heap entry at index z towards the leaves,
     always descending into the lighter child, until both
     children are at least as heavy or the bottom is reached.
     Uses the caller's locals heap[], weight[] and nHeap. --*/
#define DOWNHEAP(z) \
    { \
        Int32 zz, yy, tmp; \
        zz = z; \
        tmp = heap[zz]; \
        while (True) { \
            yy = zz << 1; \
            if (yy > nHeap) \
                break; \
            if (yy < nHeap && weight[heap[yy + 1]] < weight[heap[yy]]) \
                yy++; \
            if (weight[tmp] < weight[heap[yy]]) \
                break; \
            heap[zz] = heap[yy]; \
            zz = yy; \
        } \
        heap[zz] = tmp; \
    }

/*---------------------------------------------------*/
/*-- Computes a code length for each of the alphaSize symbols
     from their frequencies and stores it in len[0..alphaSize-1].
     freq[] is only read. If any resulting length exceeds maxLen,
     the leaf weights are roughly halved and the tree is rebuilt,
     repeating until every length fits. --*/
void BZ2_hbMakeCodeLengths(UChar *len, Int32 *freq, Int32 alphaSize, Int32 maxLen) {
    /*--
      Nodes and heap entries run from 1. Entry 0
      for both the heap and nodes is a sentinel.
    --*/
    Int32 nNodes, nHeap, n1, n2, i, j, k;
    Bool tooLong;

    Int32 heap[BZ_MAX_ALPHA_SIZE + 2];
    Int32 weight[BZ_MAX_ALPHA_SIZE * 2];
    Int32 parent[BZ_MAX_ALPHA_SIZE * 2];

    /*-- Leaf weights: a zero frequency counts as 1; the shift
         leaves the depth byte at 0. --*/
    for (i = 0; i < alphaSize; i++)
        weight[i + 1] = (freq[i] == 0 ? 1 : freq[i]) << 8;

    /*-- One iteration per tree-building attempt. --*/
    while (True) {
        nNodes = alphaSize;
        nHeap = 0;

        /*-- Sentinel entries that stop UPHEAP at the root. --*/
        heap[0] = 0;
        weight[0] = 0;
        parent[0] = -2;

        /*-- Insert every leaf into the heap. --*/
        for (i = 1; i <= alphaSize; i++) {
            parent[i] = -1;
            nHeap++;
            heap[nHeap] = i;
            UPHEAP(nHeap);
        }

        AssertH(nHeap < (BZ_MAX_ALPHA_SIZE + 2), 2001);

        /*-- Repeatedly remove the two lightest nodes and replace
             them with a new parent node until one root is left. --*/
        while (nHeap > 1) {
            n1 = heap[1];
            heap[1] = heap[nHeap];
            nHeap--;
            DOWNHEAP(1);
            n2 = heap[1];
            heap[1] = heap[nHeap];
            nHeap--;
            DOWNHEAP(1);
            nNodes++;
            parent[n1] = parent[n2] = nNodes;
            weight[nNodes] = ADDWEIGHTS(weight[n1], weight[n2]);
            parent[nNodes] = -1;
            nHeap++;
            heap[nHeap] = nNodes;
            UPHEAP(nHeap);
        }

        AssertH(nNodes < (BZ_MAX_ALPHA_SIZE * 2), 2002);

        /*-- A symbol's code length is the number of parent links
             between its leaf and the root (parent < 0). --*/
        tooLong = False;
        for (i = 1; i <= alphaSize; i++) {
            j = 0;
            k = i;
            while (parent[k] >= 0) {
                k = parent[k];
                j++;
            }
            len[i - 1] = j;
            if (j > maxLen)
                tooLong = True;
        }

        if (!tooLong)
            break;

        /* 17 Oct 04: keep-going condition for the following loop used
           to be 'i < alphaSize', which missed the last element,
           theoretically leading to the possibility of the compressor
           looping. However, this count-scaling step is only needed if
           one of the generated Huffman code words is longer than
           maxLen, which up to and including version 1.0.2 was 20 bits,
           which is extremely unlikely. In version 1.0.3 maxLen was
           changed to 17 bits, which has minimal effect on compression
           ratio, but does mean this scaling step is used from time to
           time, enough to verify that it works.

           This means that bzip2-1.0.3 and later will only produce
           Huffman codes with a maximum length of 17 bits. However, in
           order to preserve backwards compatibility with bitstreams
           produced by versions pre-1.0.3, the decompressor must still
           handle lengths of up to 20. */

        /*-- Flatten the distribution (w -> 1 + w/2, depth byte
             reset to 0) and try again. --*/
        for (i = 1; i <= alphaSize; i++) {
            j = weight[i] >> 8;
            j = 1 + (j / 2);
            weight[i] = j << 8;
        }
    }
}

/*---------------------------------------------------*/
/*-- Assigns code values from code lengths: lengths are visited
     from minLen to maxLen; within one length, symbols receive
     consecutive values in index order, and the running value is
     shifted left by one bit before moving to the next length.
     Results are written to code[0..alphaSize-1]. --*/
void BZ2_hbAssignCodes(Int32 *code, UChar *length, Int32 minLen, Int32 maxLen, Int32 alphaSize) {
    Int32 j, vec, i;

    vec = 0;
    for (j = minLen; j <= maxLen; j++) {
        for (i = 0; i < alphaSize; i++)
            if (length[i] == j) {
                code[i] = vec;
                vec++;
            };
        vec <<= 1;
    }
}

/*---------------------------------------------------*/
/*-- Builds the limit[], base[] and perm[] tables from the code
     lengths in length[0..alphaSize-1]. limit[] and base[] are
     cleared over [0, BZ_MAX_CODE_LEN) before being filled. --*/
void BZ2_hbCreateDecodeTables(Int32 *limit,
                              Int32 *base,
                              Int32 *perm,
                              UChar *length,
                              Int32 minLen,
                              Int32 maxLen,
                              Int32 alphaSize) {
    Int32 pp, i, j, vec;

    /*-- perm[]: symbol indices ordered by code length, and by
         symbol index within one length. --*/
    pp = 0;
    for (i = minLen; i <= maxLen; i++)
        for (j = 0; j < alphaSize; j++)
            if (length[j] == i) {
                perm[pp] = j;
                pp++;
            };

    /*-- base[k+1] counts the symbols of length k; the prefix sum
         then makes base[k] the number of symbols shorter than k. --*/
    for (i = 0; i < BZ_MAX_CODE_LEN; i++)
        base[i] = 0;
    for (i = 0; i < alphaSize; i++)
        base[length[i] + 1]++;

    for (i = 1; i < BZ_MAX_CODE_LEN; i++)
        base[i] += base[i - 1];

    for (i = 0; i < BZ_MAX_CODE_LEN; i++)
        limit[i] = 0;
    vec = 0;

    /*-- limit[i]: running code value after adding the symbols of
         length i, minus one; the running value doubles per length. --*/
    for (i = minLen; i <= maxLen; i++) {
        vec += (base[i + 1] - base[i]);
        limit[i] = vec - 1;
        vec <<= 1;
    }
    /*-- Rebase base[i] against the previous length's limit. --*/
    for (i = minLen + 1; i <= maxLen; i++)
        base[i] = ((limit[i - 1] + 1) << 1) - base[i];
}

/*-------------------------------------------------------------*/
/*--- end huffman.c ---*/
/*-------------------------------------------------------------*/


================================================
FILE: third-party/tbb/examples/graph/fgbzip2/randtable.cpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License. */ /*-------------------------------------------------------------*/ /*--- Table for randomising repetitive blocks ---*/ /*--- randtable.cpp ---*/ /*-------------------------------------------------------------*/ /* ------------------------------------------------------------------ The original source for this example: This file is part of bzip2/libbzip2, a program and library for lossless, block-sorting data compression. bzip2/libbzip2 version 1.0.6 of 6 September 2010 Copyright (C) 1996-2010 Julian Seward This program, "bzip2", the associated library "libbzip2", and all documentation, are copyright (C) 1996-2010 Julian R Seward. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 3. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 4. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Julian Seward, jseward@bzip.org bzip2/libbzip2 version 1.0.6 of 6 September 2010 ------------------------------------------------------------------ */ #include "bzlib_private.hpp" /*---------------------------------------------*/ Int32 BZ2_rNums[512] = { 619, 720, 127, 481, 931, 816, 813, 233, 566, 247, 985, 724, 205, 454, 863, 491, 741, 242, 949, 214, 733, 859, 335, 708, 621, 574, 73, 654, 730, 472, 419, 436, 278, 496, 867, 210, 399, 680, 480, 51, 878, 465, 811, 169, 869, 675, 611, 697, 867, 561, 862, 687, 507, 283, 482, 129, 807, 591, 733, 623, 150, 238, 59, 379, 684, 877, 625, 169, 643, 105, 170, 607, 520, 932, 727, 476, 693, 425, 174, 647, 73, 122, 335, 530, 442, 853, 695, 249, 445, 515, 909, 545, 703, 919, 874, 474, 882, 500, 594, 612, 641, 801, 220, 162, 819, 984, 589, 513, 495, 799, 161, 604, 958, 533, 221, 400, 386, 867, 600, 782, 382, 596, 414, 171, 516, 375, 682, 485, 911, 276, 98, 553, 163, 354, 666, 933, 424, 341, 533, 870, 227, 730, 475, 186, 263, 647, 537, 686, 600, 224, 469, 68, 770, 919, 190, 373, 294, 822, 808, 206, 184, 943, 795, 384, 383, 461, 404, 758, 839, 887, 715, 67, 618, 276, 204, 918, 873, 777, 604, 560, 951, 160, 578, 722, 79, 804, 96, 409, 713, 940, 652, 934, 970, 447, 318, 353, 859, 672, 112, 785, 645, 863, 803, 350, 139, 93, 354, 99, 820, 908, 609, 772, 154, 274, 580, 184, 79, 626, 630, 742, 653, 282, 762, 623, 680, 81, 927, 626, 789, 125, 411, 521, 938, 300, 821, 78, 343, 175, 128, 250, 170, 774, 972, 275, 999, 639, 495, 78, 352, 126, 857, 956, 358, 619, 
580, 124, 737, 594, 701, 612, 669, 112, 134, 694, 363, 992, 809, 743, 168, 974, 944, 375, 748, 52, 600, 747, 642, 182, 862, 81, 344, 805, 988, 739, 511, 655, 814, 334, 249, 515, 897, 955, 664, 981, 649, 113, 974, 459, 893, 228, 433, 837, 553, 268, 926, 240, 102, 654, 459, 51, 686, 754, 806, 760, 493, 403, 415, 394, 687, 700, 946, 670, 656, 610, 738, 392, 760, 799, 887, 653, 978, 321, 576, 617, 626, 502, 894, 679, 243, 440, 680, 879, 194, 572, 640, 724, 926, 56, 204, 700, 707, 151, 457, 449, 797, 195, 791, 558, 945, 679, 297, 59, 87, 824, 713, 663, 412, 693, 342, 606, 134, 108, 571, 364, 631, 212, 174, 643, 304, 329, 343, 97, 430, 751, 497, 314, 983, 374, 822, 928, 140, 206, 73, 263, 980, 736, 876, 478, 430, 305, 170, 514, 364, 692, 829, 82, 855, 953, 676, 246, 369, 970, 294, 750, 807, 827, 150, 790, 288, 923, 804, 378, 215, 828, 592, 281, 565, 555, 710, 82, 896, 831, 547, 261, 524, 462, 293, 465, 502, 56, 661, 821, 976, 991, 658, 869, 905, 758, 745, 193, 768, 550, 608, 933, 378, 286, 215, 979, 792, 961, 61, 688, 793, 644, 986, 403, 106, 366, 905, 644, 372, 567, 466, 434, 645, 210, 389, 550, 919, 135, 780, 773, 635, 389, 707, 100, 626, 958, 165, 504, 920, 176, 193, 713, 857, 265, 203, 50, 668, 108, 645, 990, 626, 197, 510, 357, 358, 850, 858, 364, 936, 638 }; /*-------------------------------------------------------------*/ /*--- end randtable.c ---*/ /*-------------------------------------------------------------*/ ================================================ FILE: third-party/tbb/examples/graph/logic_sim/CMakeLists.txt ================================================ # Copyright (c) 2020-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

cmake_minimum_required(VERSION 3.5.0...3.31.3)

project(logic_sim CXX)

include(../../common/cmake/common.cmake)

set_common_project_settings(tbb)

add_executable(logic_sim test_all.cpp)

target_link_libraries(logic_sim TBB::tbb Threads::Threads)
target_compile_options(logic_sim PRIVATE ${TBB_CXX_STD_FLAG})

# Path of the built binary, resolved at generate time so the run targets below
# work for every generator and build configuration.
set(EXECUTABLE "$<TARGET_FILE:logic_sim>")
# Arguments passed by the run target.
set(ARGS 4)
# Arguments passed by the perf_run target.
set(PERF_ARGS auto silent)

add_execution_target(run_logic_sim logic_sim ${EXECUTABLE} "${ARGS}")
add_execution_target(perf_run_logic_sim logic_sim ${EXECUTABLE} "${PERF_ARGS}")


================================================
FILE: third-party/tbb/examples/graph/logic_sim/D_latch.hpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/ #ifndef TBB_examples_logic_sim_dlatch_H #define TBB_examples_logic_sim_dlatch_H #include "basics.hpp" class D_latch : public oneapi::tbb::flow::composite_node, std::tuple> { oneapi::tbb::flow::broadcast_node D_port; oneapi::tbb::flow::broadcast_node E_port; not_gate a_not; and_gate<2> first_and; and_gate<2> second_and; nor_gate<2> first_nor; nor_gate<2> second_nor; oneapi::tbb::flow::graph& my_graph; typedef oneapi::tbb::flow::composite_node, std::tuple> base_type; public: D_latch(oneapi::tbb::flow::graph& g) : base_type(g), my_graph(g), D_port(g), E_port(g), a_not(g), first_and(g), second_and(g), first_nor(g), second_nor(g) { make_edge(D_port, input_port<0>(a_not)); make_edge(D_port, input_port<1>(second_and)); make_edge(E_port, input_port<1>(first_and)); make_edge(E_port, input_port<0>(second_and)); make_edge(a_not, input_port<0>(first_and)); make_edge(first_and, input_port<0>(first_nor)); make_edge(second_and, input_port<1>(second_nor)); make_edge(first_nor, input_port<0>(second_nor)); make_edge(second_nor, input_port<1>(first_nor)); base_type::input_ports_type input_tuple(D_port, E_port); base_type::output_ports_type output_tuple(output_port<0>(first_nor), output_port<0>(second_nor)); base_type::set_external_ports(input_tuple, output_tuple); base_type::add_visible_nodes( D_port, E_port, a_not, first_and, second_and, first_nor, second_nor); } ~D_latch() {} }; #endif /* TBB_examples_logic_sim_dlatch_H */ ================================================ FILE: third-party/tbb/examples/graph/logic_sim/README.md ================================================ # Logic_sim sample This directory contains `oneapi::tbb::flow` example that performs simplistic digital logic simulations with basic logic gates that can be easily composed to create more interesting circuits. ## Building the example ``` cmake cmake --build . ``` ## Running the sample ### Predefined make targets * `make run_logic_sim` - executes the example with predefined parameters. 
* `make perf_run_logic_sim` - executes the example with suggested parameters to measure the oneTBB performance.

### Application parameters
Usage:
```
logic_sim [#threads=value] [verbose] [silent] [-h] [#threads]
```
* `-h` - prints the help for command line options.
* `#threads` - the number of threads to use; a range of the form low[:high] where low and optional high are non-negative integers, or `auto` for a platform-specific default number.
* `verbose` - prints diagnostic output to screen.
* `silent` - limits output to timing info; overrides `verbose`.


================================================
FILE: third-party/tbb/examples/graph/logic_sim/basics.hpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/ #ifndef TBB_examples_logic_sim_basic_H #define TBB_examples_logic_sim_basic_H #include #include #include #include "oneapi/tbb/tick_count.h" #include "oneapi/tbb/flow_graph.h" #include "common/utility/utility.hpp" #ifndef _WIN32 #include #include void rt_sleep(int msec) { usleep(msec * 1000); } #else //_WIN32 #undef OLDUNIXTIME #undef STDTIME #include void rt_sleep(int msec) { Sleep(msec); } #endif /* _WIN32 */ using oneapi::tbb::flow::make_edge; using oneapi::tbb::flow::cast_to; using oneapi::tbb::flow::input_port; using oneapi::tbb::flow::output_port; typedef enum { low = 0, high, undefined } signal_t; template class gate; template <> class gate<1> : public oneapi::tbb::flow::composite_node, std::tuple> { protected: typedef oneapi::tbb::flow::indexer_node input_port_t; typedef oneapi::tbb::flow::multifunction_node> gate_fn_t; typedef gate_fn_t::output_ports_type ports_type; typedef oneapi::tbb::flow::composite_node, std::tuple> base_type; public: template gate(oneapi::tbb::flow::graph& g, Body b) : base_type(g), my_graph(g), in_ports(g), gate_fn(g, 1, b) { make_edge(in_ports, gate_fn); base_type::input_ports_type input_tuple(input_port<0>(in_ports)); base_type::output_ports_type output_tuple(output_port<0>(gate_fn)); base_type::set_external_ports(input_tuple, output_tuple); base_type::add_visible_nodes(in_ports, gate_fn); } virtual ~gate() {} gate& operator=(const gate& src) { return *this; } protected: oneapi::tbb::flow::graph& my_graph; private: input_port_t in_ports; gate_fn_t gate_fn; }; template <> class gate<2> : public oneapi::tbb::flow::composite_node, std::tuple> { protected: typedef oneapi::tbb::flow::indexer_node input_port_t; typedef oneapi::tbb::flow::multifunction_node> gate_fn_t; typedef gate_fn_t::output_ports_type ports_type; typedef oneapi::tbb::flow::composite_node, std::tuple> base_type; public: template gate(oneapi::tbb::flow::graph& g, Body b) : base_type(g), my_graph(g), in_ports(g), gate_fn(g, 1, b) { make_edge(in_ports, gate_fn); 
base_type::input_ports_type input_tuple(input_port<0>(in_ports), input_port<1>(in_ports)); base_type::output_ports_type output_tuple(output_port<0>(gate_fn)); base_type::set_external_ports(input_tuple, output_tuple); base_type::add_visible_nodes(in_ports, gate_fn); } virtual ~gate() {} gate& operator=(const gate& src) { return *this; } protected: oneapi::tbb::flow::graph& my_graph; private: input_port_t in_ports; gate_fn_t gate_fn; }; template <> class gate<3> : public oneapi::tbb::flow::composite_node, std::tuple> { protected: typedef oneapi::tbb::flow::indexer_node input_port_t; typedef oneapi::tbb::flow::multifunction_node> gate_fn_t; typedef gate_fn_t::output_ports_type ports_type; typedef oneapi::tbb::flow::composite_node, std::tuple> base_type; public: template gate(oneapi::tbb::flow::graph& g, Body b) : base_type(g), my_graph(g), in_ports(g), gate_fn(g, 1, b) { make_edge(in_ports, gate_fn); base_type::input_ports_type input_tuple( input_port<0>(in_ports), input_port<1>(in_ports), input_port<2>(in_ports)); base_type::output_ports_type output_tuple(output_port<0>(gate_fn)); base_type::set_external_ports(input_tuple, output_tuple); base_type::add_visible_nodes(in_ports, gate_fn); } virtual ~gate() {} gate& operator=(const gate& src) { return *this; } protected: oneapi::tbb::flow::graph& my_graph; private: input_port_t in_ports; gate_fn_t gate_fn; }; template <> class gate<4> : public oneapi::tbb::flow::composite_node< std::tuple, std::tuple> { protected: typedef oneapi::tbb::flow::indexer_node input_port_t; typedef oneapi::tbb::flow::multifunction_node> gate_fn_t; typedef gate_fn_t::output_ports_type ports_type; typedef oneapi::tbb::flow::composite_node, std::tuple> base_type; public: template gate(oneapi::tbb::flow::graph& g, Body b) : base_type(g), my_graph(g), in_ports(g), gate_fn(g, 1, b) { make_edge(in_ports, gate_fn); base_type::input_ports_type input_tuple(input_port<0>(in_ports), input_port<1>(in_ports), input_port<2>(in_ports), 
input_port<3>(in_ports)); base_type::output_ports_type output_tuple(output_port<0>(gate_fn)); base_type::set_external_ports(input_tuple, output_tuple); base_type::add_visible_nodes(in_ports, gate_fn); } virtual ~gate() {} gate& operator=(const gate& src) { return *this; } protected: oneapi::tbb::flow::graph& my_graph; private: input_port_t in_ports; gate_fn_t gate_fn; }; // Input devices class steady_signal { oneapi::tbb::flow::graph& my_graph; signal_t init_signal; oneapi::tbb::flow::write_once_node signal_node; public: steady_signal(oneapi::tbb::flow::graph& g, signal_t v) : my_graph(g), init_signal(v), signal_node(g) {} steady_signal(const steady_signal& src) : my_graph(src.my_graph), init_signal(src.init_signal), signal_node(src.my_graph) {} ~steady_signal() {} // Assignment is ignored steady_signal& operator=(const steady_signal& src) { return *this; } oneapi::tbb::flow::write_once_node& get_out() { return signal_node; } void activate() { signal_node.try_put(init_signal); } }; class pulse { class clock_body { std::size_t& ms; int& reps; signal_t val; public: clock_body(std::size_t& _ms, int& _reps) : ms(_ms), reps(_reps), val(low) {} signal_t operator()(oneapi::tbb::flow_control& fc) { rt_sleep((int)ms); if (reps > 0) --reps; if (val == low) val = high; else val = low; if (!(reps > 0 || reps == -1)) fc.stop(); return val; } }; oneapi::tbb::flow::graph& my_graph; std::size_t ms, init_ms; int reps, init_reps; oneapi::tbb::flow::input_node clock_node; public: pulse(oneapi::tbb::flow::graph& g, std::size_t _ms = 1000, int _reps = -1) : my_graph(g), ms(_ms), init_ms(_ms), reps(_reps), init_reps(_reps), clock_node(g, clock_body(ms, reps)) {} pulse(const pulse& src) : my_graph(src.my_graph), ms(src.init_ms), init_ms(src.init_ms), reps(src.init_reps), init_reps(src.init_reps), clock_node(src.my_graph, clock_body(ms, reps)) {} ~pulse() {} // Assignment changes the behavior of LHS to that of the RHS, but doesn't change owning graph pulse& operator=(const pulse& src) { 
ms = src.ms; init_ms = src.init_ms; reps = src.reps; init_reps = src.init_reps; return *this; } oneapi::tbb::flow::input_node& get_out() { return clock_node; } void activate() { clock_node.activate(); } void reset() { reps = init_reps; } }; class push_button { oneapi::tbb::flow::graph& my_graph; oneapi::tbb::flow::overwrite_node push_button_node; public: push_button(oneapi::tbb::flow::graph& g) : my_graph(g), push_button_node(g) { push_button_node.try_put(low); } push_button(const push_button& src) : my_graph(src.my_graph), push_button_node(src.my_graph) { push_button_node.try_put(low); } ~push_button() {} // Assignment is ignored push_button& operator=(const push_button& src) { return *this; } oneapi::tbb::flow::overwrite_node& get_out() { return push_button_node; } void press() { push_button_node.try_put(high); } void release() { push_button_node.try_put(low); } }; class toggle { oneapi::tbb::flow::graph& my_graph; signal_t state; oneapi::tbb::flow::overwrite_node toggle_node; public: toggle(oneapi::tbb::flow::graph& g) : my_graph(g), state(undefined), toggle_node(g) {} toggle(const toggle& src) : my_graph(src.my_graph), state(undefined), toggle_node(src.my_graph) {} ~toggle() {} // Assignment ignored toggle& operator=(const toggle& src) { return *this; } oneapi::tbb::flow::overwrite_node& get_out() { return toggle_node; } void flip() { if (state == high) state = low; else state = high; toggle_node.try_put(state); } void activate() { state = low; toggle_node.try_put(state); } }; // Basic gates class buffer : public gate<1> { using gate<1>::my_graph; typedef gate<1>::ports_type ports_type; class buffer_body { signal_t state; bool touched; public: buffer_body() : state(undefined), touched(false) {} void operator()(const input_port_t::output_type& v, ports_type& p) { if (!touched || state != cast_to(v)) { state = cast_to(v); std::get<0>(p).try_put(state); touched = true; } } }; public: buffer(oneapi::tbb::flow::graph& g) : gate<1>(g, buffer_body()) {} buffer(const 
buffer& src) : gate<1>(src.my_graph, buffer_body()) {} ~buffer() {} }; class not_gate : public gate<1> { using gate<1>::my_graph; typedef gate<1>::ports_type ports_type; class not_body { signal_t port; bool touched; public: not_body() : port(undefined), touched(false) {} void operator()(const input_port_t::output_type& v, ports_type& p) { if (!touched || port != cast_to(v)) { port = cast_to(v); signal_t state = low; if (port == low) state = high; std::get<0>(p).try_put(state); touched = true; } } }; public: not_gate(oneapi::tbb::flow::graph& g) : gate<1>(g, not_body()) {} not_gate(const not_gate& src) : gate<1>(src.my_graph, not_body()) {} ~not_gate() {} }; template class and_gate : public gate { using gate::my_graph; typedef typename gate::ports_type ports_type; typedef typename gate::input_port_t::output_type from_input; class and_body { signal_t* ports; signal_t state; bool touched; public: and_body() : state(undefined), touched(false) { ports = new signal_t[N]; for (int i = 0; i < N; ++i) ports[i] = undefined; } void operator()(const from_input& v, ports_type& p) { ports[v.tag()] = cast_to(v); signal_t new_state = high; std::size_t i = 0; while (i < N) { if (ports[i] == low) { new_state = low; break; } else if (ports[i] == undefined && new_state != low) { new_state = undefined; } ++i; } if (!touched || state != new_state) { state = new_state; std::get<0>(p).try_put(state); touched = true; } } }; public: and_gate(oneapi::tbb::flow::graph& g) : gate(g, and_body()) {} and_gate(const and_gate& src) : gate(src.my_graph, and_body()) {} ~and_gate() {} }; template class or_gate : public gate { using gate::my_graph; typedef typename gate::ports_type ports_type; typedef typename gate::input_port_t::output_type from_input; class or_body { signal_t* ports; signal_t state; bool touched; public: or_body() : state(undefined), touched(false) { ports = new signal_t[N]; for (int i = 0; i < N; ++i) ports[i] = undefined; } void operator()(const from_input& v, ports_type& p) { 
ports[v.tag()] = cast_to(v); signal_t new_state = low; std::size_t i = 0; while (i < N) { if (ports[i] == high) { new_state = high; break; } else if (ports[i] == undefined && new_state != high) { new_state = undefined; } ++i; } if (!touched || state != new_state) { state = new_state; std::get<0>(p).try_put(state); touched = true; } } }; public: or_gate(oneapi::tbb::flow::graph& g) : gate(g, or_body()) {} or_gate(const or_gate& src) : gate(src.my_graph, or_body()) {} ~or_gate() {} }; template class xor_gate : public gate { using gate::my_graph; typedef typename gate::ports_type ports_type; typedef typename gate::input_port_t input_port_t; class xor_body { signal_t* ports; signal_t state; bool touched; public: xor_body() : state(undefined), touched(false) { ports = new signal_t[N]; for (int i = 0; i < N; ++i) ports[i] = undefined; } void operator()(const typename input_port_t::output_type& v, ports_type& p) { ports[v.tag()] = cast_to(v); signal_t new_state = low; std::size_t i = 0, highs = 0; while (i < N) { if (ports[i] == undefined) { new_state = undefined; } else if (ports[i] == high && new_state == low) { new_state = high; ++highs; } else if (ports[i] == high && highs > 0) { new_state = low; break; } else if (ports[i] == high) { ++highs; } ++i; } if (!touched || state != new_state) { state = new_state; std::get<0>(p).try_put(state); touched = true; } } }; public: xor_gate(oneapi::tbb::flow::graph& g) : gate(g, xor_body()) {} xor_gate(const xor_gate& src) : gate(src.my_graph, xor_body()) {} ~xor_gate() {} }; template class nor_gate : public gate { using gate::my_graph; typedef typename gate::ports_type ports_type; typedef typename gate::input_port_t input_port_t; class nor_body { signal_t* ports; signal_t state; bool touched; public: nor_body() : state(undefined), touched(false) { ports = new signal_t[N]; for (int i = 0; i < N; ++i) ports[i] = undefined; } void operator()(const typename input_port_t::output_type& v, ports_type& p) { ports[v.tag()] = cast_to(v); 
signal_t new_state = low; std::size_t i = 0; while (i < N) { if (ports[i] == high) { new_state = high; break; } else if (ports[i] == undefined && new_state != high) { new_state = undefined; } ++i; } if (new_state == high) new_state = low; else if (new_state == low) new_state = high; if (!touched || state != new_state) { state = new_state; std::get<0>(p).try_put(state); touched = true; } } }; public: nor_gate(oneapi::tbb::flow::graph& g) : gate(g, nor_body()) {} nor_gate(const nor_gate& src) : gate(src.my_graph, nor_body()) {} ~nor_gate() {} }; // Output devices class led { class led_body { signal_t& state; std::string& label; bool report_changes; bool touched; public: led_body(signal_t& s, std::string& l, bool r) : state(s), label(l), report_changes(r), touched(false) {} oneapi::tbb::flow::continue_msg operator()(signal_t b) { if (!touched || b != state) { state = b; if (state != undefined && report_changes) { if (state) printf("%s: (*)\n", label.c_str()); else printf("%s: ( )\n", label.c_str()); } touched = false; } return oneapi::tbb::flow::continue_msg(); } }; oneapi::tbb::flow::graph& my_graph; std::string label; signal_t state; bool report_changes; oneapi::tbb::flow::function_node led_node; public: led(oneapi::tbb::flow::graph& g, std::string l, bool rc = false) : my_graph(g), label(l), state(undefined), report_changes(rc), led_node(g, 1, led_body(state, label, report_changes)) {} led(const led& src) : my_graph(src.my_graph), label(src.label), state(undefined), report_changes(src.report_changes), led_node(src.my_graph, 1, led_body(state, label, report_changes)) {} ~led() {} // Assignment changes the behavior of LHS to that of the RHS, but doesn't change owning graph // state is set to undefined so that next signal changes it led& operator=(const led& src) { label = src.label; state = undefined; report_changes = src.report_changes; return *this; } oneapi::tbb::flow::function_node& get_in() { return led_node; } void display() { if (state == high) printf("%s: 
(*)\n", label.c_str()); else if (state == low) printf("%s: ( )\n", label.c_str()); else printf("%s: (u)\n", label.c_str()); } signal_t get_value() { return state; } }; class digit : public gate<4> { using gate<4>::my_graph; typedef gate<4>::ports_type ports_type; typedef gate<4>::input_port_t input_port_t; class digit_body { signal_t ports[4]; static const int N = 4; unsigned int& state; std::string& label; bool& report_changes; public: digit_body(unsigned int& s, std::string& l, bool& r) : state(s), label(l), report_changes(r) { for (int i = 0; i < N; ++i) ports[i] = undefined; } void operator()(const input_port_t::output_type& v, ports_type& p) { unsigned int new_state = 0; ports[v.tag()] = cast_to(v); if (ports[0] == high) ++new_state; if (ports[1] == high) new_state += 2; if (ports[2] == high) new_state += 4; if (ports[3] == high) new_state += 8; if (state != new_state) { state = new_state; if (report_changes) { printf("%s: %x\n", label.c_str(), state); } } } }; std::string label; unsigned int state; bool report_changes; public: digit(oneapi::tbb::flow::graph& g, std::string l, bool rc = false) : gate<4>(g, digit_body(state, label, report_changes)), label(l), state(0), report_changes(rc) {} digit(const digit& src) : gate<4>(src.my_graph, digit_body(state, label, report_changes)), label(src.label), state(0), report_changes(src.report_changes) {} ~digit() {} // Assignment changes the behavior of LHS to that of the RHS, but doesn't change owning graph. 
// state is reset as in constructors digit& operator=(const digit& src) { label = src.label; state = 0; report_changes = src.report_changes; return *this; } void display() { printf("%s: %x\n", label.c_str(), state); } unsigned int get_value() { return state; } }; #endif /* TBB_examples_logic_sim_basic_H */ ================================================ FILE: third-party/tbb/examples/graph/logic_sim/four_bit_adder.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef TBB_examples_logic_sim_fba_H #define TBB_examples_logic_sim_fba_H #include "one_bit_adder.hpp" typedef oneapi::tbb::flow::composite_node< std::tuple, std::tuple> fba_base_type; class four_bit_adder : public fba_base_type { oneapi::tbb::flow::graph& my_graph; std::vector four_adders; public: four_bit_adder(oneapi::tbb::flow::graph& g) : fba_base_type(g), my_graph(g), four_adders(4, one_bit_adder(g)) { make_connections(); set_up_composite(); } four_bit_adder(const four_bit_adder& src) : fba_base_type(src.my_graph), my_graph(src.my_graph), four_adders(4, one_bit_adder(src.my_graph)) { make_connections(); set_up_composite(); } ~four_bit_adder() {} private: void make_connections() { make_edge(output_port<1>(four_adders[0]), input_port<0>(four_adders[1])); make_edge(output_port<1>(four_adders[1]), input_port<0>(four_adders[2])); make_edge(output_port<1>(four_adders[2]), input_port<0>(four_adders[3])); } void set_up_composite() { fba_base_type::input_ports_type input_tuple(input_port<0>(four_adders[0] /*CI*/), input_port<1>(four_adders[0]), input_port<2>(four_adders[0]), input_port<1>(four_adders[1]), input_port<2>(four_adders[1]), input_port<1>(four_adders[2]), input_port<2>(four_adders[2]), input_port<1>(four_adders[3]), input_port<2>(four_adders[3])); fba_base_type::output_ports_type output_tuple(output_port<0>(four_adders[0]), output_port<0>(four_adders[1]), output_port<0>(four_adders[2]), output_port<0>(four_adders[3]), output_port<1>(four_adders[3] /*CO*/)); fba_base_type::set_external_ports(input_tuple, output_tuple); } }; #endif /* TBB_examples_logic_sim_fba_H */ ================================================ FILE: third-party/tbb/examples/graph/logic_sim/one_bit_adder.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef TBB_examples_logic_sim_oba_H #define TBB_examples_logic_sim_oba_H namespace P { //input ports const int CI = 0; const int A0 = 1; const int B0 = 2; const int A1 = 3; const int B1 = 4; const int A2 = 5; const int B2 = 6; const int A3 = 7; const int B3 = 8; //output_ports const int S0 = 0; const int S1 = 1; const int S2 = 2; const int S3 = 3; #if USE_TWO_BIT_FULL_ADDER const int CO = 2; #else const int CO = 4; #endif } // namespace P #include "basics.hpp" class one_bit_adder : public oneapi::tbb::flow::composite_node, std::tuple> { oneapi::tbb::flow::broadcast_node A_port; oneapi::tbb::flow::broadcast_node B_port; oneapi::tbb::flow::broadcast_node CI_port; xor_gate<2> FirstXOR; xor_gate<2> SecondXOR; and_gate<2> FirstAND; and_gate<2> SecondAND; or_gate<2> FirstOR; oneapi::tbb::flow::graph& my_graph; typedef oneapi::tbb::flow::composite_node, std::tuple> base_type; public: one_bit_adder(oneapi::tbb::flow::graph& g) : base_type(g), my_graph(g), A_port(g), B_port(g), CI_port(g), FirstXOR(g), SecondXOR(g), FirstAND(g), SecondAND(g), FirstOR(g) { make_connections(); set_up_composite(); } one_bit_adder(const one_bit_adder& src) : base_type(src.my_graph), my_graph(src.my_graph), A_port(src.my_graph), B_port(src.my_graph), CI_port(src.my_graph), FirstXOR(src.my_graph), SecondXOR(src.my_graph), FirstAND(src.my_graph), SecondAND(src.my_graph), FirstOR(src.my_graph) { make_connections(); set_up_composite(); } ~one_bit_adder() {} private: void make_connections() { make_edge(A_port, input_port<0>(FirstXOR)); make_edge(A_port, input_port<0>(FirstAND)); make_edge(B_port, 
input_port<1>(FirstXOR)); make_edge(B_port, input_port<1>(FirstAND)); make_edge(CI_port, input_port<1>(SecondXOR)); make_edge(CI_port, input_port<1>(SecondAND)); make_edge(FirstXOR, input_port<0>(SecondXOR)); make_edge(FirstXOR, input_port<0>(SecondAND)); make_edge(SecondAND, input_port<0>(FirstOR)); make_edge(FirstAND, input_port<1>(FirstOR)); } void set_up_composite() { base_type::input_ports_type input_tuple(CI_port, A_port, B_port); base_type::output_ports_type output_tuple(output_port<0>(SecondXOR), output_port<0>(FirstOR)); base_type::set_external_ports(input_tuple, output_tuple); base_type::add_visible_nodes( A_port, B_port, CI_port, FirstXOR, SecondXOR, FirstAND, SecondAND, FirstOR); } }; #endif /* TBB_examples_logic_sim_oba_H */ ================================================ FILE: third-party/tbb/examples/graph/logic_sim/test_all.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
// Test driver for the logic simulator.
//
// For every thread count in the requested range it caps oneTBB parallelism,
// builds one flow graph, and runs a sequence of self-contained scenarios (one
// brace-delimited scope each): the basic gates, steady_signal/digit,
// push_button, the one-bit adder, the two- or four-bit adder (selected by
// USE_TWO_BIT_FULL_ADDER) and the D latch. Each scenario drives inputs, waits
// for the graph to settle with g.wait_for_all(), and asserts on the observed
// outputs. The elapsed time over all thread counts is reported at the end.
//
// NOTE(review): several calls below read `input_port(x)` / `output_port(x)`
// and `std::vector A(...)` with no template argument list; presumably the
// arguments (port constants from namespace P, element type toggle) were lost
// when this text was extracted -- confirm against the upstream file.
int main(int argc, char *argv[]) {
    utility::thread_number_range threads(utility::get_default_num_threads);
    utility::parse_cli_arguments(
        argc,
        argv,
        utility::cli_argument_pack()
            //"-h" option for displaying help is present implicitly
            .positional_arg(threads, "#threads", utility::thread_number_range_desc)
            .arg(verbose, "verbose", " print diagnostic output to screen")
            .arg(silent, "silent", " limits output to timing info; overrides verbose"));
    if (silent)
        verbose = false; // make silent override verbose
    oneapi::tbb::tick_count start = oneapi::tbb::tick_count::now();
    for (int p = threads.first; p <= threads.last; p = threads.step(p)) {
        // Limits parallelism to p for the lifetime of this loop iteration.
        oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, p);
        if (!silent)
            std::cout << "graph test running on " << p << " threads."
                      << "\n";
        // One graph is shared by all scenarios of this iteration.
        oneapi::tbb::flow::graph g;
        { // test buffer: 0, 1
            buffer b(g);
            toggle input(g);
            led output(
                g, "OUTPUT", false); // false means we will explicitly call display to see LED
            make_edge(input.get_out(), input_port<0>(b));
            make_edge(output_port<0>(b), output.get_in());
            if (!silent)
                printf("Testing buffer...\n");
            input.activate(); // 0
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == low);
            input.flip(); // 1
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == high);
        }
        { // test not_gate: 0, 1
            not_gate n(g);
            toggle input(g);
            led output(g, "OUTPUT", false);
            make_edge(input.get_out(), input_port<0>(n));
            make_edge(output_port<0>(n), output.get_in());
            if (!silent)
                printf("Testing not_gate...\n");
            input.activate(); // 0
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == high);
            input.flip(); // 1
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == low);
        }
        { // test two-input and_gate: 00, 01, 10, 11
            // The trailing comments give the inputs as "input1 input0".
            and_gate<2> a(g);
            toggle input0(g);
            toggle input1(g);
            led output(g, "OUTPUT", false);
            make_edge(input0.get_out(), input_port<0>(a));
            make_edge(input1.get_out(), input_port<1>(a));
            make_edge(output_port<0>(a), output.get_in());
            if (!silent)
                printf("Testing and_gate...\n");
            input1.activate();
            input0.activate(); // 0 0
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == low);
            input0.flip(); // 0 1
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == low);
            input1.flip();
            input0.flip(); // 1 0
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == low);
            input0.flip(); // 1 1
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == high);
        }
        { // test three-input or_gate: 000, 001, 010, 100, 011, 101, 110, 111
            // The trailing comments give the inputs as "input2 input1 input0".
            or_gate<3> o(g);
            toggle input0(g);
            toggle input1(g);
            toggle input2(g);
            led output(g, "OUTPUT", false);
            make_edge(input0.get_out(), input_port<0>(o));
            make_edge(input1.get_out(), input_port<1>(o));
            make_edge(input2.get_out(), input_port<2>(o));
            make_edge(output_port<0>(o), output.get_in());
            if (!silent)
                printf("Testing or_gate...\n");
            input2.activate();
            input1.activate();
            input0.activate(); // 0 0 0
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == low);
            input0.flip(); // 0 0 1
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == high);
            input1.flip();
            input0.flip(); // 0 1 0
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == high);
            input2.flip();
            input1.flip(); // 1 0 0
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == high);
            input2.flip();
            input1.flip();
            input0.flip(); // 0 1 1
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == high);
            input2.flip();
            input1.flip(); // 1 0 1
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == high);
            input1.flip();
            input0.flip(); // 1 1 0
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == high);
            input0.flip(); // 1 1 1
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == high);
        }
        { // test two-input xor_gate: 00, 01, 10, 11
            xor_gate<2> x(g);
            toggle input0(g);
            toggle input1(g);
            led output(g, "OUTPUT", false);
            make_edge(input0.get_out(), input_port<0>(x));
            make_edge(input1.get_out(), input_port<1>(x));
            make_edge(output_port<0>(x), output.get_in());
            if (!silent)
                printf("Testing xor_gate...\n");
            input1.activate();
            input0.activate(); // 0 0
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == low);
            input0.flip(); // 0 1
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == high);
            input1.flip();
            input0.flip(); // 1 0
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == high);
            input0.flip(); // 1 1
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == low);
        }
        { // test two-input nor_gate: 00, 01, 10, 11
            nor_gate<2> n(g);
            toggle input0(g);
            toggle input1(g);
            led output(g, "OUTPUT", false);
            make_edge(input0.get_out(), input_port<0>(n));
            make_edge(input1.get_out(), input_port<1>(n));
            make_edge(output_port<0>(n), output.get_in());
            if (!silent)
                printf("Testing nor_gate...\n");
            input1.activate();
            input0.activate(); // 0 0
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == high);
            input0.flip(); // 0 1
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == low);
            input1.flip();
            input0.flip(); // 1 0
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == low);
            input0.flip(); // 1 1
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == low);
        }
        { // test steady_signal and digit
            // A constant high and a constant low feed four gates whose outputs
            // drive the four bits of a digit: AND -> bit 0, OR -> bit 1,
            // XOR -> bit 2, NOR -> bit 3. The assert below expects the value 6.
            steady_signal input0(g, high);
            steady_signal input1(g, low);
            and_gate<2> a(g);
            or_gate<2> o(g);
            xor_gate<2> x(g);
            nor_gate<2> n(g);
            digit output(g, "OUTPUT", false);
            make_edge(input0.get_out(), input_port<0>(a));
            make_edge(input1.get_out(), input_port<1>(a));
            make_edge(output_port<0>(a), input_port<0>(output));

            make_edge(input0.get_out(), input_port<0>(o));
            make_edge(input1.get_out(), input_port<1>(o));
            make_edge(output_port<0>(o), input_port<1>(output));

            make_edge(input0.get_out(), input_port<0>(x));
            make_edge(input1.get_out(), input_port<1>(x));
            make_edge(output_port<0>(x), input_port<2>(output));

            make_edge(input0.get_out(), input_port<0>(n));
            make_edge(input1.get_out(), input_port<1>(n));
            make_edge(output_port<0>(n), input_port<3>(output));

            if (!silent)
                printf("Testing steady_signal...\n");
            input0.activate(); // 1
            input1.activate(); // 0
            g.wait_for_all();
            if (!silent)
                output.display();
            assert(output.get_value() == 6);
        }
        { // test push_button
            // No assertions here: the scenario only exercises press/release.
            push_button p(g);
            buffer b(g);
            led output(g, "OUTPUT", !silent); // true means print all LED state changes
            make_edge(p.get_out(), input_port<0>(b));
            make_edge(output_port<0>(b), output.get_in());
            if (!silent)
                printf("Testing push_button...\n");
            p.press();
            p.release();
            p.press();
            p.release();
            g.wait_for_all();
        }
        { // test one_bit_adder
            // Each step flips one or more inputs on, checks (Sum, CarryOUT),
            // then flips the same inputs off again and checks both are low.
            one_bit_adder my_adder(g);
            toggle A(g);
            toggle B(g);
            toggle CarryIN(g);
            led Sum(g, "SUM");
            led CarryOUT(g, "CarryOUT");
            make_edge(A.get_out(), input_port(my_adder));
            make_edge(B.get_out(), input_port(my_adder));
            make_edge(CarryIN.get_out(), input_port(my_adder));
            make_edge(output_port(my_adder), Sum.get_in());
            make_edge(output_port<1>(my_adder), CarryOUT.get_in());
            // Start with all three inputs low.
            A.activate();
            B.activate();
            CarryIN.activate();

            if (!silent)
                printf("A on\n");
            A.flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == high) && (CarryOUT.get_value() == low));

            if (!silent)
                printf("A off\n");
            A.flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == low) && (CarryOUT.get_value() == low));

            if (!silent)
                printf("B on\n");
            B.flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == high) && (CarryOUT.get_value() == low));
            if (!silent)
                printf("B off\n");
            B.flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == low) && (CarryOUT.get_value() == low));

            if (!silent)
                printf("CarryIN on\n");
            CarryIN.flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == high) && (CarryOUT.get_value() == low));
            if (!silent)
                printf("CarryIN off\n");
            CarryIN.flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == low) && (CarryOUT.get_value() == low));

            if (!silent)
                printf("A&B on\n");
            A.flip();
            B.flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == low) && (CarryOUT.get_value() == high));
            if (!silent)
                printf("A&B off\n");
            A.flip();
            B.flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == low) && (CarryOUT.get_value() == low));

            if (!silent)
                printf("A&CarryIN on\n");
            A.flip();
            CarryIN.flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == low) && (CarryOUT.get_value() == high));
            if (!silent)
                printf("A&CarryIN off\n");
            A.flip();
            CarryIN.flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == low) && (CarryOUT.get_value() == low));

            if (!silent)
                printf("B&CarryIN on\n");
            B.flip();
            CarryIN.flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == low) && (CarryOUT.get_value() == high));
            if (!silent)
                printf("B&CarryIN off\n");
            B.flip();
            CarryIN.flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == low) && (CarryOUT.get_value() == low));

            if (!silent)
                printf("A&B&CarryIN on\n");
            A.flip();
            B.flip();
            CarryIN.flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == high) && (CarryOUT.get_value() == high));
            if (!silent)
                printf("A&B&CarryIN off\n");
            A.flip();
            B.flip();
            CarryIN.flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == low) && (CarryOUT.get_value() == low));
        }

#if USE_TWO_BIT_FULL_ADDER
        { // test two_bit_adder
            // The two sum bits drive bits 0 and 1 of a digit; carry-out drives an LED.
            if (!silent)
                printf("testing two_bit adder\n");
            two_bit_adder two_adder(g);
            std::vector A(2, toggle(g));
            std::vector B(2, toggle(g));
            toggle CarryIN(g);
            digit Sum(g, "SUM");
            led CarryOUT(g, "CarryOUT");

            make_edge(A[0].get_out(), input_port(two_adder));
            make_edge(B[0].get_out(), input_port(two_adder));
            make_edge(output_port(two_adder), input_port<0>(Sum));

            make_edge(A[1].get_out(), input_port(two_adder));
            make_edge(B[1].get_out(), input_port(two_adder));
            make_edge(output_port(two_adder), input_port<1>(Sum));

            make_edge(CarryIN.get_out(), input_port(two_adder));
            make_edge(output_port(two_adder), CarryOUT.get_in());

            // Activate all switches at low state
            for (int i = 0; i < 2; ++i) {
                A[i].activate();
                B[i].activate();
            }
            CarryIN.activate();

            if (!silent)
                printf("1+0\n");
            A[0].flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == 1) && (CarryOUT.get_value() == low));

            if (!silent)
                printf("0+1\n");
            A[0].flip();
            B[0].flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == 1) && (CarryOUT.get_value() == low));
        }
#else
        { // test four_bit_adder
            // The four sum bits drive the four bits of a digit; carry-out drives
            // an LED. Each step flips only the switches that differ from the
            // previous step, so the steps must run in this order.
            four_bit_adder four_adder(g);
            std::vector A(4, toggle(g));
            std::vector B(4, toggle(g));
            toggle CarryIN(g);
            digit Sum(g, "SUM");
            led CarryOUT(g, "CarryOUT");

            make_edge(A[0].get_out(), input_port(four_adder));
            make_edge(B[0].get_out(), input_port(four_adder));
            make_edge(output_port(four_adder), input_port<0>(Sum));

            make_edge(A[1].get_out(), input_port(four_adder));
            make_edge(B[1].get_out(), input_port(four_adder));
            make_edge(output_port(four_adder), input_port<1>(Sum));

            make_edge(A[2].get_out(), input_port(four_adder));
            make_edge(B[2].get_out(), input_port(four_adder));
            make_edge(output_port(four_adder), input_port<2>(Sum));

            make_edge(A[3].get_out(), input_port(four_adder));
            make_edge(B[3].get_out(), input_port(four_adder));
            make_edge(output_port(four_adder), input_port<3>(Sum));

            make_edge(CarryIN.get_out(), input_port(four_adder));
            make_edge(output_port(four_adder), CarryOUT.get_in());

            // Activate all switches at low state
            for (int i = 0; i < 4; ++i) {
                A[i].activate();
                B[i].activate();
            }
            CarryIN.activate();

            if (!silent)
                printf("1+0\n");
            A[0].flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == 1) && (CarryOUT.get_value() == low));

            if (!silent)
                printf("0+1\n");
            A[0].flip();
            B[0].flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == 1) && (CarryOUT.get_value() == low));

            if (!silent)
                printf("3+4\n");
            A[0].flip();
            A[1].flip();
            B[0].flip();
            B[2].flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == 7) && (CarryOUT.get_value() == low));

            if (!silent)
                printf("6+1\n");
            A[0].flip();
            A[2].flip();
            B[0].flip();
            B[2].flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == 7) && (CarryOUT.get_value() == low));

            if (!silent)
                printf("0+0+carry\n");
            A[1].flip();
            A[2].flip();
            B[0].flip();
            CarryIN.flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == 1) && (CarryOUT.get_value() == low));

            if (!silent)
                printf("15+15+carry\n");
            A[0].flip();
            A[1].flip();
            A[2].flip();
            A[3].flip();
            B[0].flip();
            B[1].flip();
            B[2].flip();
            B[3].flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == 0xf) && (CarryOUT.get_value() == high));

            if (!silent)
                printf("8+8\n");
            A[0].flip();
            A[1].flip();
            A[2].flip();
            B[0].flip();
            B[1].flip();
            B[2].flip();
            CarryIN.flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == 0) && (CarryOUT.get_value() == high));

            if (!silent)
                printf("0+0\n");
            A[3].flip();
            B[3].flip();
            g.wait_for_all();
            if (!silent)
                Sum.display();
            if (!silent)
                CarryOUT.display();
            assert((Sum.get_value() == 0) && (CarryOUT.get_value() == low));
        }
#endif

        { // test D_latch
            // D is flipped five times; before each flip the clock E is
            // (re)activated, and after the graph settles Q must equal the new
            // D and ~Q its complement. E.reset() re-arms the clock between rounds.
            D_latch my_d_latch(g);
            toggle D(g);
            pulse E(g, 500, 4); // clock changes every 500ms; stops after 4 changes
            led Q(g, " Q", verbose); // if true, LEDs print at every state change
            led notQ(g, "~Q", verbose);
            make_edge(D.get_out(), input_port<0>(my_d_latch));
            make_edge(E.get_out(), input_port<1>(my_d_latch));
            make_edge(output_port<0>(my_d_latch), Q.get_in());
            make_edge(output_port<1>(my_d_latch), notQ.get_in());
            D.activate();

            if (!silent)
                printf("Toggling D\n");
            E.activate();
            D.flip();
            g.wait_for_all();
            if (!silent && !verbose) {
                Q.display();
                notQ.display();
            }
            assert((Q.get_value() == high) && (notQ.get_value() == low));
            E.reset();

            if (!silent)
                printf("Toggling D\n");
            E.activate();
            D.flip();
            g.wait_for_all();
            if (!silent && !verbose) {
                Q.display();
                notQ.display();
            }
            assert((Q.get_value() == low) && (notQ.get_value() == high));
            E.reset();

            if (!silent)
                printf("Toggling D\n");
            E.activate();
            D.flip();
            g.wait_for_all();
            if (!silent && !verbose) {
                Q.display();
                notQ.display();
            }
            assert((Q.get_value() == high) && (notQ.get_value() == low));
            E.reset();

            if (!silent)
                printf("Toggling D\n");
            E.activate();
            D.flip();
            g.wait_for_all();
            if (!silent && !verbose) {
                Q.display();
                notQ.display();
            }
            assert((Q.get_value() == low) && (notQ.get_value() == high));
            E.reset();

            if (!silent)
                printf("Toggling D\n");
            E.activate();
            D.flip();
            g.wait_for_all();
            if (!silent && !verbose) {
                Q.display();
                notQ.display();
            }
            assert((Q.get_value() == high) && (notQ.get_value() == low));
        }
    }
    utility::report_elapsed_time((oneapi::tbb::tick_count::now() - start).seconds());
    return 0;
}
*/ #ifndef TBB_examples_logic_sim_tba_H #define TBB_examples_logic_sim_tba_H #include "one_bit_adder.hpp" class two_bit_adder : public oneapi::tbb::flow::composite_node< std::tuple, std::tuple> { oneapi::tbb::flow::graph& my_graph; std::vector two_adders; typedef oneapi::tbb::flow::composite_node< std::tuple, std::tuple> base_type; public: two_bit_adder(oneapi::tbb::flow::graph& g) : base_type(g), my_graph(g), two_adders(2, one_bit_adder(g)) { make_connections(); set_up_composite(); } two_bit_adder(const two_bit_adder& src) : base_type(src.my_graph), my_graph(src.my_graph), two_adders(2, one_bit_adder(src.my_graph)) { make_connections(); set_up_composite(); } ~two_bit_adder() {} private: void make_connections() { make_edge(output_port<1>(two_adders[0]), input_port<0>(two_adders[1])); } void set_up_composite() { base_type::input_ports_type input_tuple(input_port<0>(two_adders[0] /*CI*/), input_port<1>(two_adders[0]), input_port<2>(two_adders[0]), input_port<1>(two_adders[1]), input_port<2>(two_adders[1])); base_type::output_ports_type output_tuple(output_port<0>(two_adders[0]), output_port<0>(two_adders[1]), output_port<1>(two_adders[1] /*CO*/)); base_type::set_external_ports(input_tuple, output_tuple); } }; #endif /* TBB_examples_logic_sim_tba_H */ ================================================ FILE: third-party/tbb/examples/graph/som/CMakeLists.txt ================================================ # Copyright (c) 2020-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
cmake_minimum_required(VERSION 3.5.0...3.31.3)

include(../../common/cmake/common.cmake)

project(som CXX)

set_common_project_settings(tbb)

add_executable(som som_graph.cpp som.cpp)

target_link_libraries(som TBB::tbb Threads::Threads)
target_compile_options(som PRIVATE ${TBB_CXX_STD_FLAG})

# Path of the built `som` binary. The generator expression had been reduced to
# a bare "$", which would hand the run targets a literal "$" as the program.
set(EXECUTABLE "$<TARGET_FILE:som>")
# run_som passes no arguments; light_test_som passes a single positional "4".
set(ARGS)
set(LIGHT_ARGS 4)

add_execution_target(run_som som ${EXECUTABLE} "${ARGS}")
add_execution_target(light_test_som som ${EXECUTABLE} "${LIGHT_ARGS}")
* `n-of-threads` - number of threads to use; a range of the form low\[:high\], where low and optional high are non-negative integers or `auto` for the oneTBB default. * `radius-fraction` - size of radius at which to start speculating. * `number-of-epochs` - number of examples used in learning phase. * `cancel-test` - test for cancel signal while finding BMU. * `debug` - additional output. * `nospeculate` - don't speculate in SOM map teaching. ================================================ FILE: third-party/tbb/examples/graph/som/som.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ // // Self-organizing map in TBB flow::graph // // we will do a color map (the simple example.) 
// // serial algorithm // // initialize map with vectors (could be random, gradient, or something else) // for some number of iterations // update radius r, weight of change L // for each example V // find the best matching unit // for each part of map within radius of BMU W // update vector: W(t+1) = W(t) + w(dist)*L*(V - W(t)) #include "oneapi/tbb/task_group.h" #include "som.hpp" std::ostream &operator<<(std::ostream &out, const SOM_element &s) { out << "("; for (int i = 0; i < (int)s.w.size(); ++i) { out << s.w[i]; if (i < (int)s.w.size() - 1) { out << ","; } } out << ")"; return out; } void remark_SOM_element(const SOM_element &s) { printf("("); for (int i = 0; i < (int)s.w.size(); ++i) { printf("%g", s.w[i]); if (i < (int)s.w.size() - 1) { printf(","); } } printf(")"); } std::ostream &operator<<(std::ostream &out, const search_result_type &s) { out << "<"; out << std::get(s); out << ", " << std::get(s); out << ", "; out << std::get(s); out << ">"; return out; } void remark_search_result_type(const search_result_type &s) { printf("<%g,%d,%d>", std::get(s), std::get(s), std::get(s)); } double randval(double lowlimit, double highlimit) { return double(rand()) / double(RAND_MAX) * (highlimit - lowlimit) + lowlimit; } void find_data_ranges(teaching_vector_type &teaching, SOM_element &max_range, SOM_element &min_range) { if (teaching.size() == 0) return; max_range = min_range = teaching[0]; for (int i = 1; i < (int)teaching.size(); ++i) { max_range.elementwise_max(teaching[i]); min_range.elementwise_min(teaching[i]); } } void add_fraction_of_difference(SOM_element &to, SOM_element const &from, double frac) { for (int i = 0; i < (int)from.size(); ++i) { to[i] += frac * (from[i] - to[i]); } } double distance_squared(SOM_element x, SOM_element y) { double rval = 0.0; for (int i = 0; i < (int)x.size(); ++i) { double diff = x[i] - y[i]; rval += diff * diff; } return rval; } void SOMap::initialize(InitializeType it, SOM_element &max_range, SOM_element &min_range) { for 
(int x = 0; x < xMax; ++x) { for (int y = 0; y < yMax; ++y) { for (int i = 0; i < (int)max_range.size(); ++i) { if (it == InitializeRandom) { my_map[x][y][i] = (randval(min_range[i], max_range[i])); } else if (it == InitializeGradient) { my_map[x][y][i] = ((double)(x + y) / (xMax + yMax) * (max_range[i] - min_range[i]) + min_range[i]); } } } } } // subsquare [low,high) double SOMap::BMU_range(const SOM_element &s, int &xval, int &yval, subsquare_type &r) { double min_distance_squared = DBL_MAX; int min_x = -1; int min_y = -1; for (int x = r.rows().begin(); x != r.rows().end(); ++x) { for (int y = r.cols().begin(); y != r.cols().end(); ++y) { double dist = distance_squared(s, my_map[x][y]); if (dist < min_distance_squared) { min_distance_squared = dist; min_x = x; min_y = y; } if (cancel_test && oneapi::tbb::is_current_task_group_canceling()) { xval = r.rows().begin(); yval = r.cols().begin(); return DBL_MAX; } } } xval = min_x; yval = min_y; return sqrt(min_distance_squared); } void SOMap::epoch_update_range(SOM_element const &s, int epoch, int min_x, int min_y, double radius, double learning_rate, oneapi::tbb::blocked_range &r) { int min_xiter = (int)((double)min_x - radius); if (min_xiter < 0) min_xiter = 0; int max_xiter = (int)((double)min_x + radius); if (max_xiter > (int)my_map.size() - 1) max_xiter = (int)my_map.size() - 1; for (int xx = r.begin(); xx <= r.end(); ++xx) { double xrsq = (xx - min_x) * (xx - min_x); double ysq = radius * radius - xrsq; // max extent of y influence double yd; if (ysq > 0) { yd = sqrt(ysq); int lb = (int)(min_y - yd); int ub = (int)(min_y + yd); for (int yy = lb; yy < ub; ++yy) { if (yy >= 0 && yy < (int)my_map[xx].size()) { // [xx, yy] is in the range of the update. 
double my_rsq = xrsq + (yy - min_y) * (yy - min_y); // distance from BMU squared double theta = exp(-(radius * radius) / (2.0 * my_rsq)); add_fraction_of_difference(my_map[xx][yy], s, theta * learning_rate); } } } } } void SOMap::teach(teaching_vector_type &in) { for (int i = 0; i < nPasses; ++i) { int j = (int)(randval(0, (double)in.size())); // this won't be reproducible. if (j == in.size()) --j; int min_x = -1; int min_y = -1; subsquare_type br2(0, (int)my_map.size(), 1, 0, (int)my_map[0].size(), 1); (void)BMU_range(in[j], min_x, min_y, br2); // just need min_x, min_y // radius of interest double radius = max_radius * exp(-(double)i * radius_decay_rate); // update circle is min_xiter to max_xiter inclusive. double learning_rate = max_learning_rate * exp(-(double)i * learning_decay_rate); epoch_update(in[j], i, min_x, min_y, radius, learning_rate); } } void SOMap::debug_output() { printf("SOMap:\n"); for (int i = 0; i < (int)(this->my_map.size()); ++i) { for (int j = 0; j < (int)(this->my_map[i].size()); ++j) { printf("map[%d, %d] == ", i, j); remark_SOM_element(this->my_map[i][j]); printf("\n"); } } } #define RED 0 #define GREEN 1 #define BLUE 2 void readInputData() { my_teaching.push_back(SOM_element()); my_teaching.push_back(SOM_element()); my_teaching.push_back(SOM_element()); my_teaching.push_back(SOM_element()); my_teaching.push_back(SOM_element()); my_teaching[0][RED] = 1.0; my_teaching[0][GREEN] = 0.0; my_teaching[0][BLUE] = 0.0; my_teaching[1][RED] = 0.0; my_teaching[1][GREEN] = 1.0; my_teaching[1][BLUE] = 0.0; my_teaching[2][RED] = 0.0; my_teaching[2][GREEN] = 0.0; my_teaching[2][BLUE] = 1.0; my_teaching[3][RED] = 0.3; my_teaching[3][GREEN] = 0.3; my_teaching[3][BLUE] = 0.0; my_teaching[4][RED] = 0.5; my_teaching[4][GREEN] = 0.5; my_teaching[4][BLUE] = 0.9; } ================================================ FILE: third-party/tbb/examples/graph/som/som.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ // // Self-organizing map // // support for self-ordering maps #ifndef TBB_examples_som_H #define TBB_examples_som_H #include #include #include #include #include #include #include #include "oneapi/tbb/blocked_range2d.h" typedef oneapi::tbb::blocked_range2d subsquare_type; typedef std::tuple search_result_type; std::ostream &operator<<(std::ostream &out, const search_result_type &s); #define RADIUS 0 // for the std::gets #define XV 1 #define YV 2 // to have single definitions of static variables, define _MAIN_C_ in the main program // #ifdef _MAIN_C_ #define DEFINE // nothing #define INIT(n) = n #else // not in main file #define DEFINE extern #define INIT(n) // nothing #endif // _MAIN_C_ DEFINE int nElements INIT(3); // length of input vectors, matching vector in map DEFINE double max_learning_rate INIT(0.8); // decays exponentially DEFINE double radius_decay_rate; DEFINE double learning_decay_rate INIT(0.005); DEFINE double max_radius; DEFINE bool extra_debug INIT(false); DEFINE bool cancel_test INIT(false); DEFINE int xMax INIT(100); DEFINE int yMax INIT(100); DEFINE int nPasses INIT(100); enum InitializeType { InitializeRandom, InitializeGradient }; #define RED 0 #define GREEN 1 #define BLUE 2 class SOM_element; void remark_SOM_element(const SOM_element &s); // all SOM_element vectors are the same length (nElements), so we do not have // to range-check the vector accesses. 
class SOM_element { std::vector w; public: friend std::ostream &operator<<(std::ostream &out, const SOM_element &s); friend void remark_SOM_element(const SOM_element &s); SOM_element() : w(nElements, 0.0) {} double &operator[](int indx) { return w.at(indx); } const double &operator[](int indx) const { return w.at(indx); } bool operator==(SOM_element const &other) const { for (std::size_t i = 0; i < size(); ++i) { if (w[i] != other.w[i]) { return false; } } return true; } bool operator!=(SOM_element const &other) const { return !operator==(other); } void elementwise_max(SOM_element const &other) { for (std::size_t i = 0; i < w.size(); ++i) if (w[i] < other.w[i]) w[i] = other.w[i]; } void elementwise_min(SOM_element const &other) { for (std::size_t i = 0; i < w.size(); ++i) if (w[i] > other.w[i]) w[i] = other.w[i]; } std::size_t size() const { return w.size(); } }; typedef std::vector teaching_vector_type; DEFINE SOM_element max_range; DEFINE SOM_element min_range; extern double randval(double lowlimit, double highlimit); extern void find_data_ranges(teaching_vector_type &teaching, SOM_element &max_range, SOM_element &min_range); extern void add_fraction_of_difference(SOM_element &to, SOM_element &from, double frac); DEFINE teaching_vector_type my_teaching; class SOMap { std::vector> my_map; public: SOMap(int xSize, int ySize) { my_map.reserve(xSize); for (int i = 0; i < xSize; ++i) { my_map.push_back(teaching_vector_type()); my_map[i].reserve(ySize); for (int j = 0; j < ySize; ++j) { my_map[i].push_back(SOM_element()); } } } std::size_t size() { return my_map.size(); } void initialize(InitializeType it, SOM_element &max_range, SOM_element &min_range); teaching_vector_type &operator[](int indx) { return my_map[indx]; } SOM_element &at(int xVal, int yVal) { return my_map[xVal][yVal]; } SOM_element &at(search_result_type const &s) { return my_map[std::get<1>(s)][std::get<2>(s)]; } void epoch_update(SOM_element const &s, int epoch, int min_x, int min_y, double radius, 
double learning_rate) { int min_xiter = (int)((double)min_x - radius); if (min_xiter < 0) min_xiter = 0; int max_xiter = (int)((double)min_x + radius); if (max_xiter > (int)my_map.size() - 1) max_xiter = (int)(my_map.size() - 1); oneapi::tbb::blocked_range br1(min_xiter, max_xiter, 1); epoch_update_range(s, epoch, min_x, min_y, radius, learning_rate, br1); } void epoch_update_range(SOM_element const &s, int epoch, int min_x, int min_y, double radius, double learning_rate, oneapi::tbb::blocked_range &r); void teach(teaching_vector_type &id); void debug_output(); // find BMU given an input, returns distance double BMU_range(const SOM_element &s, int &xval, int &yval, subsquare_type &r); double BMU(const SOM_element &s, int &xval, int &yval) { subsquare_type br(0, (int)my_map.size(), 1, 0, (int)my_map[0].size(), 1); return BMU_range(s, xval, yval, br); } }; extern double distance_squared(SOM_element x, SOM_element y); void remark_SOM_element(const SOM_element &s); extern void readInputData(); #endif // TBB_examples_som_H ================================================ FILE: third-party/tbb/examples/graph/som/som_graph.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ // // Self-organizing map in TBB flow::graph // // This is an example of the use of cancellation in a graph. After a point in searching for // the best match for an example, two examples are looked for simultaneously. 
When the // earlier example is found and the update radius is determined, the affected searches // for the subsequent example are cancelled, and after the update they are restarted. // As the update radius shrinks fewer searches are cancelled, and by the last iterations // virtually all the work done for the speculating example is useful. // // first, a simple implementation with only one example vector // at a time. // // we will do a color map (the simple example.) // // graph algorithm // // for some number of iterations // update radius r, weight of change L // for each example V // use graph to find BMU // for each part of map within radius of BMU W // update vector: W(t+1) = W(t) + w(dist)*L*(V - W(t)) #ifndef NOMINMAX #define NOMINMAX #endif // NOMINMAX #include #define _MAIN_C_ 1 #include "som.hpp" #include "oneapi/tbb/flow_graph.h" #include "oneapi/tbb/blocked_range2d.h" #include "oneapi/tbb/tick_count.h" #include "oneapi/tbb/task_arena.h" #include "oneapi/tbb/global_control.h" #include "common/utility/utility.hpp" #include "common/utility/get_default_num_threads.hpp" #define RED 0 #define GREEN 1 #define BLUE 2 static int xranges = 1; static int yranges = 1; static int xsize = -1; static int ysize = -1; static int global_i = 0; static int speculation_start; #if EXTRA_DEBUG std::vector cancel_count; std::vector extra_count; std::vector missing_count; std::vector canceled_before; #endif std::vector function_node_execs; static int xRangeMax = 3; static int yRangeMax = 3; static bool dont_speculate = false; static search_result_type last_update; class BMU_search_body { SOMap &my_map; subsquare_type my_square; int &fn_tally; public: BMU_search_body(SOMap &_m, subsquare_type &_sq, int &fnt) : my_map(_m), my_square(_sq), fn_tally(fnt) {} BMU_search_body(const BMU_search_body &other) : my_map(other.my_map), my_square(other.my_square), fn_tally(other.fn_tally) {} search_result_type operator()(const SOM_element s) { int my_x; int my_y; double min_dist = 
my_map.BMU_range(s, my_x, my_y, my_square); ++fn_tally; // count how many times this function_node executed return search_result_type(min_dist, my_x, my_y); } }; typedef oneapi::tbb::flow::function_node search_node; typedef oneapi::tbb::flow::broadcast_node b_node; typedef std::vector search_node_vector_type; typedef std::vector search_node_array_type; typedef std::vector graph_vector_type; typedef std::vector graph_array_type; #define SPECULATION_CNT 2 oneapi::tbb::flow::graph *g[SPECULATION_CNT]; // main graph; there should only be one per epoch b_node *send_to[SPECULATION_CNT]; // broadcast node to send exemplar to all function_nodes oneapi::tbb::flow::queue_node *q[SPECULATION_CNT]; // queue for function nodes to put their results in // each function_node should have its own graph search_node_array_type *s_array[SPECULATION_CNT]; // 2d array of function nodes graph_array_type *g_array[SPECULATION_CNT]; // 2d array of graphs // All graphs must locate in the same arena. oneapi::tbb::flow::graph *construct_graph(oneapi::tbb::task_arena &ta) { oneapi::tbb::flow::graph *result; ta.execute([&result] { result = new oneapi::tbb::flow::graph(); }); return result; } // build a set of SPECULATION_CNT graphs, each of which consists of a broadcast_node, // xranges x yranges function_nodes, and one queue_node for output. // once speculation starts, if i % SPECULATION_CNT is the current graph, (i+1) % SPECULATION_CNT // is the first speculation, and so on. 
void build_BMU_graph(SOMap &map1, oneapi::tbb::task_arena &ta) { // build current graph xsize = ((int)map1.size() + xranges - 1) / xranges; ysize = ((int)map1[0].size() + yranges - 1) / yranges; function_node_execs.clear(); function_node_execs.reserve(xranges * yranges + 1); for (int i = 0; i < xranges * yranges + 1; ++i) function_node_execs.push_back(0); for (int scnt = 0; scnt < SPECULATION_CNT; ++scnt) { g[scnt] = construct_graph(ta); send_to[scnt] = new b_node(*(g[scnt])); // broadcast node to the function_nodes q[scnt] = new oneapi::tbb::flow::queue_node(*(g[scnt])); // output queue // create the function_nodes, tie to the graph s_array[scnt] = new search_node_array_type; s_array[scnt]->reserve(xranges); g_array[scnt] = new graph_array_type; g_array[scnt]->reserve(xranges); for (int i = 0; i < (int)map1.size(); i += xsize) { int xindex = i / xsize; s_array[scnt]->push_back(search_node_vector_type()); #if EXTRA_DEBUG if (s_array[scnt]->size() != xindex + 1) { printf("Error; s_array[%d]->size() == %d, xindex== %d\n", scnt, (int)(s_array[scnt]->size()), xindex); } #endif (*s_array[scnt])[xindex].reserve(yranges); g_array[scnt]->push_back(graph_vector_type()); (*g_array[scnt])[xindex].reserve(yranges); for (int j = 0; j < (int)map1[0].size(); j += ysize) { int offset = (i / xsize) * yranges + (j / ysize); int xmax = (i + xsize) > (int)map1.size() ? (int)map1.size() : i + xsize; int ymax = (j + ysize) > (int)map1[0].size() ? 
(int)map1[0].size() : j + ysize; subsquare_type sst(i, xmax, 1, j, ymax, 1); BMU_search_body bb(map1, sst, function_node_execs[offset]); oneapi::tbb::flow::graph *g_local = construct_graph(ta); search_node *s = new search_node(*g_local, oneapi::tbb::flow::serial, bb); // copies Body (*g_array[scnt])[xindex].push_back(g_local); (*s_array[scnt])[xindex].push_back(s); oneapi::tbb::flow::make_edge(*(send_to[scnt]), *s); // broadcast_node -> function_node oneapi::tbb::flow::make_edge(*s, *(q[scnt])); // function_node -> queue_node } } } } // Wait for the 2D array of flow::graphs. void wait_for_all_graphs(int cIndex) { // cIndex ranges over [0 .. SPECULATION_CNT - 1] for (int x = 0; x < xranges; ++x) { for (int y = 0; y < yranges; ++y) { (*g_array[cIndex])[x][y]->wait_for_all(); } } } void destroy_BMU_graph() { for (int scnt = 0; scnt < SPECULATION_CNT; ++scnt) { for (int i = 0; i < (int)(*s_array[scnt]).size(); ++i) { for (int j = 0; j < (int)(*s_array[scnt])[i].size(); ++j) { delete (*s_array[scnt])[i][j]; delete (*g_array[scnt])[i][j]; } } (*s_array[scnt]).clear(); delete s_array[scnt]; (*g_array[scnt]).clear(); delete g_array[scnt]; delete q[scnt]; delete send_to[scnt]; delete g[scnt]; } } void find_subrange_overlap(int const &xval, int const &yval, double const &radius, int &xlow, int &xhigh, int &ylow, int &yhigh) { xlow = int((xval - radius) / xsize); xhigh = int((xval + radius) / xsize); ylow = int((yval - radius) / ysize); yhigh = int((yval + radius) / ysize); // circle may fall partly outside map if (xlow < 0) xlow = 0; if (xhigh >= xranges) xhigh = xranges - 1; if (ylow < 0) ylow = 0; if (yhigh >= yranges) yhigh = yranges - 1; #if EXTRA_DEBUG if (xlow >= xranges) printf(" Error *** xlow == %d\n", xlow); if (xhigh < 0) printf("Error *** xhigh == %d\n", xhigh); if (ylow >= yranges) printf("Error *** ylow == %d\n", ylow); if (yhigh < 0) printf("Error *** yhigh == %d\n", yhigh); #endif } bool overlap(int &xval, int &yval, search_result_type &sr) { int xlow, xhigh, 
ylow, yhigh; find_subrange_overlap( std::get(sr), std::get(sr), std::get(sr), xlow, xhigh, ylow, yhigh); return xval >= xlow && xval <= xhigh && yval >= ylow && yval <= yhigh; } void cancel_submaps(int &xval, int &yval, double &radius, int indx) { int xlow; int xhigh; int ylow; int yhigh; find_subrange_overlap(xval, yval, radius, xlow, xhigh, ylow, yhigh); for (int x = xlow; x <= xhigh; ++x) { for (int y = ylow; y <= yhigh; ++y) { (*g_array[indx])[x][y]->cancel(); } } #if EXTRA_DEBUG ++cancel_count[(xhigh - xlow + 1) * (yhigh - ylow + 1)]; #endif } void restart_submaps(int &xval, int &yval, double &radius, int indx, SOM_element &vector) { int xlow; int xhigh; int ylow; int yhigh; find_subrange_overlap(xval, yval, radius, xlow, xhigh, ylow, yhigh); for (int x = xlow; x <= xhigh; ++x) { for (int y = ylow; y <= yhigh; ++y) { // have to reset the graph (*g_array[indx])[x][y]->reset(); // and re-submit the exemplar for search. (*s_array[indx])[x][y]->try_put(vector); } } } search_result_type graph_BMU(int indx) { // indx ranges over [0 .. 
SPECULATION_CNT -1] wait_for_all_graphs(indx); // wait for the array of subgraphs (g[indx])->wait_for_all(); std::vector all_srs(xRangeMax * yRangeMax, search_result_type(DBL_MAX, -1, -1)); #if EXTRA_DEBUG int extra_computations = 0; #endif search_result_type sr; search_result_type min_sr; std::get(min_sr) = DBL_MAX; int result_count = 0; while ((q[indx])->try_get(sr)) { ++result_count; // figure which submap this came from int x = std::get(sr) / xsize; int y = std::get(sr) / ysize; #if EXTRA_DEBUG if (x < 0 || x >= xranges) printf(" ### x value out of range (%d)\n", x); if (y < 0 || y >= yranges) printf(" ### y value out of range (%d)\n", y); #endif int offset = x * yranges + y; // linearized subscript #if EXTRA_DEBUG if (std::get(all_srs[offset]) != DBL_MAX) { // we've already got a result from this subsquare ++extra_computations; } else if (std::get(all_srs[offset]) != -1) { if (extra_debug) printf("More than one cancellation of [%d,%d] iteration %d\n", x, y, global_i); } #endif all_srs[offset] = sr; if (std::get(sr) < std::get(min_sr)) min_sr = sr; else if (std::get(sr) == std::get(min_sr)) { if (std::get(sr) < std::get(min_sr)) { min_sr = sr; } else if ((std::get(sr) == std::get(min_sr) && std::get(sr) < std::get(min_sr))) { min_sr = sr; } } } #if EXTRA_DEBUG if (result_count != xranges * yranges + extra_computations) { // we are missing at least one of the expected results. Tally the missing values for (int i = 0; i < xranges * yranges; ++i) { if (std::get(all_srs[i]) == DBL_MAX) { // i == x*yranges + y int xval = i / yranges; int yval = i % yranges; bool received_cancel_result = std::get(all_srs[i]) != -1; if (overlap(xval, yval, last_update)) { // we have previously canceled this subsquare. printf("No result for [%d,%d] which was canceled(%s)\n", xval, yval, received_cancel_result ? "T" : "F"); ++canceled_before[i]; } else { printf("No result for [%d,%d] which was not canceled(%s)\n", xval, yval, received_cancel_result ? 
"T" : "F"); } ++missing_count[i]; } } } if (extra_computations) ++extra_count[extra_computations]; #endif return min_sr; // end of one epoch } void graph_teach(SOMap &map1, teaching_vector_type &in, oneapi::tbb::task_arena &ta) { build_BMU_graph(map1, ta); #if EXTRA_DEBUG cancel_count.clear(); extra_count.clear(); missing_count.clear(); canceled_before.clear(); cancel_count.reserve(xRangeMax * yRangeMax + 1); extra_count.reserve(xRangeMax * yRangeMax + 1); missing_count.reserve(xRangeMax * yRangeMax + 1); canceled_before.reserve(xRangeMax * yRangeMax + 1); for (int i = 0; i < xRangeMax * yRangeMax + 1; ++i) { cancel_count.push_back(0); extra_count.push_back(0); missing_count.push_back(0); canceled_before.push_back(0); } #endif // normally the training would pick random exemplars to teach the SOM. We need // the process to be reproducible, so we will pick the exemplars in order, [0, in.size()) int next_j = 0; for (int epoch = 0; epoch < nPasses; ++epoch) { global_i = epoch; bool canceled_submaps = false; int j = next_j; // try to make reproducible next_j = (epoch + 1) % in.size(); search_result_type min_sr; if (epoch < speculation_start) { (send_to[epoch % SPECULATION_CNT])->try_put(in[j]); } else if (epoch == speculation_start) { (send_to[epoch % SPECULATION_CNT])->try_put(in[j]); if (epoch < nPasses - 1) { (send_to[(epoch + 1) % SPECULATION_CNT])->try_put(in[next_j]); } } else if (epoch < nPasses - 1) { (send_to[(epoch + 1) % SPECULATION_CNT])->try_put(in[next_j]); } min_sr = graph_BMU(epoch % SPECULATION_CNT); //calls wait_for_all() double min_distance = std::get<0>(min_sr); double radius = max_radius * exp(-(double)epoch * radius_decay_rate); double learning_rate = max_learning_rate * exp(-(double)epoch * learning_decay_rate); if (epoch >= speculation_start && epoch < (nPasses - 1)) { // have to cancel the affected submaps cancel_submaps( std::get(min_sr), std::get(min_sr), radius, (epoch + 1) % SPECULATION_CNT); canceled_submaps = true; } map1.epoch_update( 
in[j], epoch, std::get<1>(min_sr), std::get<2>(min_sr), radius, learning_rate); ++global_i; if (canceled_submaps) { // do I have to wait for all the non-canceled speculative graph to complete first? // yes, in case a canceled task was already executing. wait_for_all_graphs((epoch + 1) % SPECULATION_CNT); // wait for the array of subgraphs restart_submaps(std::get<1>(min_sr), std::get<2>(min_sr), radius, (epoch + 1) % SPECULATION_CNT, in[next_j]); } last_update = min_sr; std::get(last_update) = radius; // not smallest value, but range of effect } destroy_BMU_graph(); } static const double serial_time_adjust = 1.25; static double radius_fraction = 3.0; int main(int argc, char *argv[]) { int l_speculation_start; utility::thread_number_range threads( utility::get_default_num_threads, utility:: get_default_num_threads() // run only the default number of threads if none specified ); utility::parse_cli_arguments( argc, argv, utility::cli_argument_pack() //"-h" option for for displaying help is present implicitly .positional_arg( threads, "n-of-threads", "number of threads to use; a range of the form low[:high], where low and optional high are non-negative integers or 'auto' for the TBB default.") // .positional_arg(InputFileName,"input-file","input file name") // .positional_arg(OutputFileName,"output-file","output file name") .positional_arg( radius_fraction, "radius-fraction", "size of radius at which to start speculating") .positional_arg( nPasses, "number-of-epochs", "number of examples used in learning phase") .arg(cancel_test, "cancel-test", "test for cancel signal while finding BMU") .arg(extra_debug, "debug", "additional output") .arg(dont_speculate, "nospeculate", "don't speculate in SOM map teaching")); readInputData(); max_radius = (xMax < yMax) ? 
yMax / 2 : xMax / 2; // need this value for the 1x1 timing below radius_decay_rate = -(log(1.0 / (double)max_radius) / (double)nPasses); find_data_ranges(my_teaching, max_range, min_range); if (extra_debug) { printf("Data range: "); remark_SOM_element(min_range); printf(" to "); remark_SOM_element(max_range); printf("\n"); } // find how much time is taken for the single function_node case. // adjust nPasses so the 1x1 time is somewhere around serial_time_adjust seconds. // make sure the example test runs for at least 0.5 second. for (;;) { // Restrict max concurrency level via task_arena interface oneapi::tbb::task_arena ta(1); SOMap map1(xMax, yMax); speculation_start = nPasses + 1; // Don't speculate xranges = 1; yranges = 1; map1.initialize(InitializeGradient, max_range, min_range); oneapi::tbb::tick_count t0 = oneapi::tbb::tick_count::now(); graph_teach(map1, my_teaching, ta); oneapi::tbb::tick_count t1 = oneapi::tbb::tick_count::now(); double nSeconds = (t1 - t0).seconds(); if (nSeconds < 0.5) { xMax *= 2; yMax *= 2; continue; } double size_adjust = sqrt(serial_time_adjust / nSeconds); xMax = (int)((double)xMax * size_adjust); yMax = (int)((double)yMax * size_adjust); max_radius = (xMax < yMax) ? yMax / 2 : xMax / 2; radius_decay_rate = log((double)max_radius) / (double)nPasses; if (extra_debug) { printf("original 1x1 case ran in %g seconds\n", nSeconds); printf(" Size of table == %d x %d\n", xMax, yMax); printf(" radius_decay_rate == %g\n", radius_decay_rate); } break; } // the "max_radius" starts at 1/2*radius_fraction the table size. To start the speculation when the radius is // 1 / n * the table size, the constant in the log below should be n / 2. so 2 == 1/4, 3 == 1/6th, // et c. if (dont_speculate) { l_speculation_start = nPasses + 1; if (extra_debug) printf("speculation will not be done\n"); } else { if (radius_fraction < 1.0) { if (extra_debug) printf("Warning: radius_fraction should be >= 1. 
Setting to 1.\n"); radius_fraction = 1.0; } l_speculation_start = (int)((double)nPasses * log(radius_fraction) / log((double)nPasses)); if (extra_debug) printf("We will start speculation at iteration %d\n", l_speculation_start); } double single_time; // for speedup calculations #if EXTRA_DEBUG // storage for the single-subrange answers, for comparing maps std::vector single_dist; single_dist.reserve(my_teaching.size()); std::vector single_xval; single_xval.reserve(my_teaching.size()); std::vector single_yval; single_yval.reserve(my_teaching.size()); #endif //TODO: Investigate how to not require mandatory concurrency for (int p = std::max(threads.first, 2); p <= std::max(threads.last, 2); ++p) { // Restrict max concurrency level via task_arena interface oneapi::tbb::global_control limit(oneapi::tbb::global_control::max_allowed_parallelism, p); oneapi::tbb::task_arena ta(p); if (extra_debug) printf(" -------------- Running with %d threads. ------------\n", p); // run the SOM build for a series of subranges for (xranges = 1; xranges <= xRangeMax; ++xranges) { for (yranges = xranges; yranges <= yRangeMax; ++yranges) { if (xranges == 1 && yranges == 1) { // don't pointlessly speculate if we're only running one subrange. 
speculation_start = nPasses + 1; } else { speculation_start = l_speculation_start; } SOMap map1(xMax, yMax); map1.initialize(InitializeGradient, max_range, min_range); if (extra_debug) printf("Start learning for [%d,%d] ----------- \n", xranges, yranges); oneapi::tbb::tick_count t0 = oneapi::tbb::tick_count::now(); graph_teach(map1, my_teaching, ta); oneapi::tbb::tick_count t1 = oneapi::tbb::tick_count::now(); if (extra_debug) printf("Done learning for [%d,%d], which took %g seconds ", xranges, yranges, (t1 - t0).seconds()); if (xranges == 1 && yranges == 1) single_time = (t1 - t0).seconds(); if (extra_debug) printf(": speedup == %g\n", single_time / (t1 - t0).seconds()); #if EXTRA_DEBUG if (extra_debug) { // number of times cancel was called, indexed by number of subranges canceled for (int i = 0; i < cancel_count.size(); ++i) { // only write output if we have a non-zero value. if (cancel_count[i] > 0) { int totalcnt = 0; printf(" cancellations: "); for (int j = 0; j < cancel_count.size(); ++j) { if (cancel_count[j]) { printf(" %d [%d]", j, cancel_count[j]); totalcnt += cancel_count[j]; } } totalcnt += speculation_start; printf(" for a total of %d\n", totalcnt); break; // from for } } // number of extra results (these occur when the subrange task starts before // cancel is received.) for (int i = 0; i < extra_count.size(); ++i) { if (extra_count[i] > 0) { int totalcnt = 0; printf("extra computations: "); for (int j = 0; j < extra_count.size(); ++j) { if (extra_count[j]) { printf(" %d[%d]", j, extra_count[j]); totalcnt += extra_count[j]; } } totalcnt += speculation_start; printf(" for a total of %d\n", totalcnt); break; // from for } } // here we count the number of times we looked for a particular subrange when fetching // the queue_node output and didn't find anything. This may occur when a function_node // is "stuck" and doesn't process some number of exemplars. 
function_node_execs is // a count of the number of times the corresponding function_node was executed (in // case the problem is dropped output in the queue_node.) for (int i = 0; i < missing_count.size(); ++i) { if (missing_count[i]) { int xval = i / yranges; int yval = i % yranges; printf(" f_node[%d,%d] missed %d values", xval, yval, missing_count[i]); if (canceled_before[i]) { printf(" canceled_before == %d", canceled_before[i]); } printf(", fn_tally == %d\n", function_node_execs[i]); } } } // check that output matches the 1x1 case for (int i = 0; i < my_teaching.size(); ++i) { int xdist; int ydist; double my_dist = map1.BMU(my_teaching[i], xdist, ydist); if (xranges == 1 && yranges == 1) { single_dist.push_back(my_dist); single_xval.push_back(xdist); single_yval.push_back(ydist); } else { if (single_dist[i] != my_dist || single_xval[i] != xdist || single_yval[i] != ydist) printf( "Error in output: expecting <%g, %d, %d>, but got <%g, %d, %d>\n", single_dist[i], single_xval[i], single_yval[i], my_dist, xdist, ydist); } } #endif } // yranges } // xranges } // #threads p printf("done\n"); return 0; } ================================================ FILE: third-party/tbb/examples/migration/README.md ================================================ # Code Samples of oneAPI Threading Building Blocks (oneTBB) Examples of migrating from TBB APIs to the oneTBB APIs. | Code sample name | Description |:--- |:--- | recursive_fibonacci | Compute Fibonacci number in recursive way. ================================================ FILE: third-party/tbb/examples/migration/recursive_fibonacci/CMakeLists.txt ================================================ # Copyright (c) 2023-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

cmake_minimum_required(VERSION 3.5.0...3.31.3)

project(recursive_fibonacci CXX)

include(../../common/cmake/common.cmake)

set_common_project_settings(tbb)

add_executable(recursive_fibonacci fibonacci.cpp)

# The generator expression expands to "rt" only when the target platform is Linux.
target_link_libraries(recursive_fibonacci TBB::tbb Threads::Threads $<$<PLATFORM_ID:Linux>:rt>) # Link "rt" library on Linux
target_compile_options(recursive_fibonacci PRIVATE ${TBB_CXX_STD_FLAG})

# Full path of the built executable, resolved at generate time.
set(EXECUTABLE "$<TARGET_FILE:recursive_fibonacci>")
# Parameters of executable N C I [T]:
# `N` - specifies the fibonacci number which would be calculated.
# `C` - cutoff that will be used to stop recursive split.
# `I` - number of iteration to measure benchmark time.
# `T` - optional; a non-zero value enables extended testing (recycle task in a loop).
set(ARGS 30 16 20 1)
set(PERF_ARGS 50 5 20)

add_execution_target(run_recursive_fibonacci recursive_fibonacci ${EXECUTABLE} "${ARGS}")
add_execution_target(perf_run_recursive_fibonacci recursive_fibonacci ${EXECUTABLE} "${PERF_ARGS}")

================================================
FILE: third-party/tbb/examples/migration/recursive_fibonacci/README.md
================================================
# Fibonacci sample
This directory contains an example that computes Fibonacci numbers using emulation for TBB Task API.

## Building the example
```
cmake <path_to_example>
cmake --build .
```

## Running the sample
### Predefined make targets
* `make run_recursive_fibonacci` - executes the example with predefined parameters (extended testing enabled).
* `make perf_run_recursive_fibonacci` - executes the example with suggested parameters to measure the oneTBB performance.

### Application parameters Usage: ``` recursive_fibonacci N C I T ``` * `N` - specifies the fibonacci number which would be calculated. * `C` - cutoff that will be used to stop recursive split. * `I` - number of iteration to measure benchmark time. * `T` - enables extended testing (recycle task in a loop). ================================================ FILE: third-party/tbb/examples/migration/recursive_fibonacci/fibonacci.cpp ================================================ /* Copyright (c) 2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "fibonacci_single_task.h" #include "fibonacci_two_tasks.h" #include #include #include int cutoff; bool testing_enabled; template std::pair measure(F&& f, int number, unsigned long ntrial) { std::vector times; unsigned long result; for (unsigned long i = 0; i < ntrial; ++i) { auto t1 = std::chrono::steady_clock::now(); result = f(number); auto t2 = std::chrono::steady_clock::now(); auto time = std::chrono::duration_cast(t2 - t1).count(); times.push_back(time); } return std::make_pair( result, static_cast(std::accumulate(times.begin(), times.end(), 0) / times.size())); } int main(int argc, char* argv[]) { int numbers = argc > 1 ? strtol(argv[1], nullptr, 0) : 50; cutoff = argc > 2 ? strtol(argv[2], nullptr, 0) : 16; unsigned long ntrial = argc > 3 ? (unsigned long)strtoul(argv[3], nullptr, 0) : 20; testing_enabled = argc > 4 ? 
(bool)strtol(argv[4], nullptr, 0) : false; auto res = measure(fibonacci_two_tasks, numbers, ntrial); std::cout << "Fibonacci two tasks impl N = " << res.first << " Avg time = " << res.second << " ms" << std::endl; res = measure(fibonacci_single_task, numbers, ntrial); std::cout << "Fibonacci single task impl N = " << res.first << " Avg time = " << res.second << " ms" << std::endl; } ================================================ FILE: third-party/tbb/examples/migration/recursive_fibonacci/fibonacci_single_task.h ================================================ /* Copyright (c) 2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef SINGLE_TASK_HEADER #define SINGLE_TASK_HEADER #include "task_emulation_layer.h" #include #include #include extern int cutoff; extern bool testing_enabled; long serial_fib_1(int n) { return n < 2 ? 
n : serial_fib_1(n - 1) + serial_fib_1(n - 2); } struct single_fib_task : task_emulation::base_task { enum class state { compute, sum }; single_fib_task(int n, int* x) : n(n), x(x), s(state::compute) {} task_emulation::base_task* execute() override { task_emulation::base_task* bypass = nullptr; switch (s) { case state::compute : { bypass = compute_impl(); break; } case state::sum : { *x = x_l + x_r; if (testing_enabled) { if (n == cutoff && num_recycles > 0) { --num_recycles; bypass = compute_impl(); } } break; } } return bypass; } task_emulation::base_task* compute_impl() { task_emulation::base_task* bypass = nullptr; if (n < cutoff) { *x = serial_fib_1(n); } else { bypass = this->allocate_child_and_increment(n - 2, &x_r); task_emulation::run_task(this->allocate_child_and_increment(n - 1, &x_l)); // Recycling this->s = state::sum; this->recycle_as_continuation(); } return bypass; } int n; int* x; state s; int x_l{ 0 }, x_r{ 0 }; int num_recycles{5}; }; int fibonacci_single_task(int n) { int sum{}; tbb::task_group tg; task_emulation::run_and_wait(tg, task_emulation::allocate_root_task(/* for root task = */ tg, n, &sum)); return sum; } #endif // SINGLE_TASK_HEADER ================================================ FILE: third-party/tbb/examples/migration/recursive_fibonacci/fibonacci_two_tasks.h ================================================ /* Copyright (c) 2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef TWO_TASKS_HEADER #define TWO_TASKS_HEADER #include "task_emulation_layer.h" #include #include #include #include extern int cutoff; long serial_fib(int n) { return n < 2 ? n : serial_fib(n - 1) + serial_fib(n - 2); } struct fib_continuation : task_emulation::base_task { fib_continuation(int& s) : sum(s) {} task_emulation::base_task* execute() override { sum = x + y; return nullptr; } int x{ 0 }, y{ 0 }; int& sum; }; struct fib_computation : task_emulation::base_task { fib_computation(int n, int* x) : n(n), x(x) {} task_emulation::base_task* execute() override { task_emulation::base_task* bypass = nullptr; if (n < cutoff) { *x = serial_fib(n); } else { // Continuation passing auto& c = *this->allocate_continuation(/* children_counter = */ 2, *x); task_emulation::run_task(c.create_child(n - 1, &c.x)); // Recycling this->recycle_as_child_of(c); n = n - 2; x = &c.y; bypass = this; } return bypass; } int n; int* x; }; int fibonacci_two_tasks(int n) { int sum{}; tbb::task_group tg; tg.run_and_wait( task_emulation::create_root_task(/* for root task = */ tg, n, &sum)); return sum; } #endif // TWO_TASKS_HEADER ================================================ FILE: third-party/tbb/examples/migration/recursive_fibonacci/task_emulation_layer.h ================================================ /* Copyright (c) 2023-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_task_emulation_layer_H #define __TBB_task_emulation_layer_H #include "tbb/task_group.h" #include "tbb/task_arena.h" #include namespace task_emulation { struct task_group_pool { task_group_pool() : pool_size(tbb::this_task_arena::max_concurrency()), task_submitters(new tbb::task_group[pool_size]) {} ~task_group_pool() { for (std::size_t i = 0; i < pool_size; ++i) { task_submitters[i].wait(); } delete [] task_submitters; } tbb::task_group& operator[] (std::size_t idx) { return task_submitters[idx]; } const std::size_t pool_size; tbb::task_group* task_submitters; }; static task_group_pool tg_pool; class base_task { public: base_task() = default; base_task(const base_task& t) : m_type(t.m_type), m_parent(t.m_parent), m_child_counter(t.m_child_counter.load()) {} virtual ~base_task() = default; void operator() () const { task_type type_snapshot = m_type; base_task* bypass = const_cast(this)->execute(); if (m_parent && m_type != task_type::recycled) { if (m_parent->remove_child_reference() == 0) { m_parent->operator()(); } } if (m_type == task_type::allocated) { delete this; } if (bypass != nullptr) { m_type = type_snapshot; // Bypass is not supported by task_emulation and next_task executed directly. // However, the old-TBB bypass behavior can be achieved with // `return task_group::defer()` (check Migration Guide). // Consider submit another task if recursion call is not acceptable // i.e. instead of Direct Body call // submit task_emulation::run_task(); bypass->operator()(); } } virtual base_task* execute() = 0; template C* allocate_continuation(std::uint64_t ref, Args&&... args) { C* continuation = new C{std::forward(args)...}; continuation->m_type = task_type::allocated; continuation->reset_parent(reset_parent()); continuation->m_child_counter = ref; return continuation; } template F create_child(Args&&... args) { return create_child_impl(std::forward(args)...); } template F create_child_and_increment(Args&&... 
args) { add_child_reference(); return create_child_impl(std::forward(args)...); } template F* allocate_child(Args&&... args) { return allocate_child_impl(std::forward(args)...); } template F* allocate_child_and_increment(Args&&... args) { add_child_reference(); return allocate_child_impl(std::forward(args)...); } template void recycle_as_child_of(C& c) { m_type = task_type::recycled; reset_parent(&c); } void recycle_as_continuation() { m_type = task_type::recycled; } void add_child_reference() { ++m_child_counter; } std::uint64_t remove_child_reference() { return --m_child_counter; } protected: enum class task_type { stack_based, allocated, recycled }; mutable task_type m_type; private: template friend F create_root_task(tbb::task_group& tg, Args&&... args); template friend F* allocate_root_task(tbb::task_group& tg, Args&&... args); template F create_child_impl(Args&&... args) { F obj{std::forward(args)...}; obj.m_type = task_type::stack_based; obj.reset_parent(this); return obj; } template F* allocate_child_impl(Args&&... args) { F* obj = new F{std::forward(args)...}; obj->m_type = task_type::allocated; obj->reset_parent(this); return obj; } base_task* reset_parent(base_task* ptr = nullptr) { auto p = m_parent; m_parent = ptr; return p; } base_task* m_parent{nullptr}; std::atomic m_child_counter{0}; }; class root_task : public base_task { public: root_task(tbb::task_group& tg) : m_tg(tg), m_callback(m_tg.defer([] { /* Create empty callback to preserve reference for wait. */})) { add_child_reference(); m_type = base_task::task_type::allocated; } private: base_task* execute() override { m_tg.run(std::move(m_callback)); return nullptr; } tbb::task_group& m_tg; tbb::task_handle m_callback; }; template F create_root_task(tbb::task_group& tg, Args&&... args) { F obj{std::forward(args)...}; obj.m_type = base_task::task_type::stack_based; obj.reset_parent(new root_task{tg}); return obj; } template F* allocate_root_task(tbb::task_group& tg, Args&&... 
args) { F* obj = new F{std::forward(args)...}; obj->m_type = base_task::task_type::allocated; obj->reset_parent(new root_task{tg}); return obj; } template void run_task(F&& f) { tg_pool[tbb::this_task_arena::current_thread_index()].run(std::forward(f)); } template void run_task(F* f) { tg_pool[tbb::this_task_arena::current_thread_index()].run(std::ref(*f)); } template void run_and_wait(tbb::task_group& tg, F* f) { tg.run_and_wait(std::ref(*f)); } } // namespace task_emulation #endif // __TBB_task_emulation_layer_H ================================================ FILE: third-party/tbb/examples/parallel_for/README.md ================================================ # Code Samples of oneAPI Threading Building Blocks (oneTBB) Examples using `parallel_for` algorithm. | Code sample name | Description |:--- |:--- | game_of_life | Game of life overlay. | polygon_overlay | polygon overlay. | seismic | Parallel seismic wave simulation. | tachyon | Parallel 2-D raytracer/renderer. ================================================ FILE: third-party/tbb/examples/parallel_for/game_of_life/Board.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef TBB_examples_game_of_life_board_H #define TBB_examples_game_of_life_board_H #define LabelPtr int* #define BoardPtr Board* struct Matrix { int width; int height; char* data; }; class Board { public: Board(int width, int height, int squareSize, LabelPtr counter); virtual ~Board(); void seed(int s); void seed(const BoardPtr s); public: Matrix* m_matrix; private: int m_width; int m_height; int m_squareSize; LabelPtr m_counter; }; #endif /* TBB_examples_game_of_life_board_H */ ================================================ FILE: third-party/tbb/examples/parallel_for/game_of_life/CMakeLists.txt ================================================ # Copyright (c) 2020-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
cmake_minimum_required(VERSION 3.5.0...3.31.3) project(game_of_life CXX) include(../../common/cmake/common.cmake) set_common_project_settings(tbb) add_executable(game_of_life Evolution.cpp Game_of_life.cpp Update_state.cpp) target_compile_definitions(game_of_life PUBLIC _CONSOLE) target_link_libraries(game_of_life TBB::tbb Threads::Threads) target_compile_options(game_of_life PRIVATE ${TBB_CXX_STD_FLAG}) set(EXECUTABLE "$") set(ARGS 2:4 -t 5) set(LIGHT_ARGS 1:2 -t 5) add_execution_target(run_game_of_life game_of_life ${EXECUTABLE} "${ARGS}") add_execution_target(light_test_game_of_life game_of_life ${EXECUTABLE} "${LIGHT_ARGS}") ================================================ FILE: third-party/tbb/examples/parallel_for/game_of_life/Evolution.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* Evolution.cpp: implementation file for evolution classes; evolution classes do looped evolution of patterns in a defined 2 dimensional space */ #include "common/utility/get_default_num_threads.hpp" #include "Evolution.hpp" #include "Board.hpp" #ifdef USE_SSE #define GRAIN_SIZE 14 #else #define GRAIN_SIZE 4000 #endif #define TIME_SLICE 330 /* Evolution */ /** Evolution::UpdateMatrix() - moves the calculated destination data to the source data block. No destination zeroing is required since it will be completely overwritten during the next calculation cycle. 
**/ void Evolution::UpdateMatrix() { memcpy(m_matrix->data, m_dest, m_size); } /* SequentialEvolution */ //! SequentialEvolution::Run - begins looped evolution void SequentialEvolution::Run(double execution_time, int nthread) { printf("Starting game (Sequential evolution)\n"); m_nIteration = 0; m_serial_time = 0; oneapi::tbb::tick_count t0 = oneapi::tbb::tick_count::now(); while (!m_done) { if (!is_paused) { oneapi::tbb::tick_count t = oneapi::tbb::tick_count::now(); Step(); oneapi::tbb::tick_count t1 = oneapi::tbb::tick_count::now(); ++m_nIteration; double work_time = (t1 - t0).seconds(); m_serial_time += work_time; } //! Let the parallel algorithm work uncontended almost the same time //! as the serial one. See ParallelEvolution::Run() as well. t0 = oneapi::tbb::tick_count::now(); if (m_serial_time > execution_time) { printf("iterations count = %d time = %g\n", m_nIteration, m_serial_time); break; } } } //! SequentialEvolution::Step() - override of step method void SequentialEvolution::Step() { if (!is_paused) { #ifdef USE_SSE UpdateState(m_matrix, m_matrix->data, 0, m_matrix->height); #else UpdateState(m_matrix, m_dest, 0, (m_matrix->width * m_matrix->height) - 1); UpdateMatrix(); #endif } } /* ParallelEvolution */ //! 
SequentialEvolution::Run - begins looped evolution void ParallelEvolution::Run(double execution_time, int nthread) { if (nthread == utility::get_default_num_threads()) printf("Starting game (Parallel evolution for automatic number of thread(s))\n"); else printf("Starting game (Parallel evolution for %d thread(s))\n", nthread); m_nIteration = 0; m_parallel_time = 0; oneapi::tbb::global_control* pGlobControl = new oneapi::tbb::global_control( oneapi::tbb::global_control::max_allowed_parallelism, nthread); double work_time = m_serial_time; oneapi::tbb::tick_count t0 = oneapi::tbb::tick_count::now(); while (!m_done) { if (!is_paused) { oneapi::tbb::tick_count t = oneapi::tbb::tick_count::now(); Step(); oneapi::tbb::tick_count t1 = oneapi::tbb::tick_count::now(); ++m_nIteration; double real_work_time = (t1 - t0).seconds(); m_parallel_time += real_work_time; } //! Let the serial algorithm work the same time as the parallel one. t0 = oneapi::tbb::tick_count::now(); if (m_parallel_time > execution_time) { printf("iterations count = %d time = %g\n", m_nIteration, m_parallel_time); delete pGlobControl; pGlobControl = nullptr; break; } } delete pGlobControl; pGlobControl = nullptr; } /** class tbb_parallel_task TBB requires a class for parallel loop implementations. The actual loop "chunks" are performed using the () operator of the class. The blocked_range contains the range to calculate. Please see the TBB documentation for more information. **/ class tbb_parallel_task { public: static void set_values(Matrix* source, char* dest) { m_source = source; m_dest = dest; return; } void operator()(const oneapi::tbb::blocked_range& r) const { int begin = (int)r.begin(); //! capture lower range number for this chunk int end = (int)r.end(); //! capture upper range number for this chunk UpdateState(m_source, m_dest, begin, end); } tbb_parallel_task() {} private: static Matrix* m_source; static char* m_dest; }; Matrix* tbb_parallel_task::m_source; char* tbb_parallel_task::m_dest; //! 
ParallelEvolution::Step() - override of Step method void ParallelEvolution::Step() { std::size_t begin = 0; //! beginning cell position #ifdef USE_SSE std::size_t end = m_matrix->height; //! ending cell position #else std::size_t end = m_size - 1; //! ending cell position #endif //! set matrix pointers tbb_parallel_task::set_values(m_matrix, m_dest); //! do calculation loop parallel_for(oneapi::tbb::blocked_range(begin, end, GRAIN_SIZE), tbb_parallel_task()); UpdateMatrix(); } ================================================ FILE: third-party/tbb/examples/parallel_for/game_of_life/Evolution.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /** Evolution.h: Header file for evolution classes; evolution classes do looped evolution of patterns in a defined 2 dimensional space **/ #ifndef TBB_examples_game_of_life_evolution_H #define TBB_examples_game_of_life_evolution_H #include #include #include #include "oneapi/tbb/blocked_range.h" #include "oneapi/tbb/parallel_for.h" #include "oneapi/tbb/tick_count.h" #include "oneapi/tbb/global_control.h" #include "Board.hpp" typedef unsigned int Int32; void UpdateState(Matrix* m_matrix, char* dest, int begin, int end); /** class Evolution - base class for SequentialEvolution and ParallelEvolution **/ class Evolution { public: Evolution(Matrix* m, //! beginning matrix including initial pattern BoardPtr board //! 
the board to update ) : m_matrix(m), m_board(board), m_size(m_matrix->height * m_matrix->width), m_done(false) { //! allocate memory for second matrix data block m_dest = new char[m_size]; is_paused = false; } virtual ~Evolution() { delete[] m_dest; } //! Run() - begins looped evolution virtual void Run(double execution_time, int nthread) = 0; //! Quit() - tell the thread to terminate virtual void Quit() { m_done = true; } //! Step() - performs a single evolutionary generation computation on the game matrix virtual void Step() = 0; //! SetPause() - change condition of variable is_paused virtual void SetPause(bool condition) { if (condition == true) is_paused = true; else is_paused = false; } protected: /** UpdateMatrix() - moves the previous destination data to the source data block and zeros out destination. **/ void UpdateMatrix(); protected: Matrix* m_matrix; //! Pointer to initial matrix char* m_dest; //! Pointer to calculation destination data BoardPtr m_board; //! The game board to update int m_size; //! size of the matrix data block volatile bool m_done; //! a flag used to terminate the thread Int32 m_nIteration; //! current calculation cycle index volatile bool is_paused; //! is needed to perform next iteration //! Calculation time of the sequential version (since the start), seconds. /** This member is updated by the sequential version and read by parallel, so no synchronization is necessary. 
**/ double m_serial_time; }; /** class SequentialEvolution - derived from Evolution - calculate life generations serially **/ class SequentialEvolution : public Evolution { public: SequentialEvolution(Matrix* m, BoardPtr board) : Evolution(m, board) {} virtual void Run(double execution_time, int nthread); virtual void Step(); }; /** class ParallelEvolution - derived from Evolution - calculate life generations in parallel using oneTBB **/ class ParallelEvolution : public Evolution { public: ParallelEvolution(Matrix* m, BoardPtr board) : Evolution(m, board), m_parallel_time(0) { // instantiate a global_control object and save a pointer to it m_pGlobControl = nullptr; } ~ParallelEvolution() { //! delete global_control object delete m_pGlobControl; m_pGlobControl = nullptr; } virtual void Run(double execution_time, int nthread); virtual void Step(); private: oneapi::tbb::global_control* m_pGlobControl; double m_parallel_time; }; #endif /* TBB_examples_game_of_life_evolution_H */ ================================================ FILE: third-party/tbb/examples/parallel_for/game_of_life/Game_of_life.cpp ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* Game_of_life.cpp : main project file. */ #include #include #include #include "common/utility/get_default_num_threads.hpp" #include "Board.hpp" #include "Evolution.hpp" #define BOARD_SQUARE_SIZE 2 int low; //! lower range limit of threads int high; //! 
high range limit of threads double execution_time; //! time for game of life iterations Board::Board(int width, int height, int squareSize, LabelPtr counter) : m_width(width), m_height(height), m_squareSize(squareSize), m_counter(counter) { m_matrix = new Matrix(); m_matrix->width = width; m_matrix->height = height; m_matrix->data = new char[width * height]; memset(m_matrix->data, 0, width * height); } Board::~Board() { delete[] m_matrix->data; delete m_matrix; } void Board::seed(int s) { srand(s); for (int j = 0; j < m_height; j++) { for (int i = 0; i < m_width; i++) { int x = rand() / (int)(((unsigned)RAND_MAX + 1) / 100); m_matrix->data[i + j * m_width] = x > 75 ? 1 : 0; // 25% occupied } } } void Board::seed(const BoardPtr src) { memcpy(m_matrix->data, src->m_matrix->data, m_height * m_width); } //! Print usage of this program void PrintUsage() { printf("Usage: game_of_life [M[:N] -t execution_time]\n" "M and N are a range of numbers of threads to be used.\n" "execution_time is a time (in sec) for execution game_of_life iterations\n"); printf("Default values:\n" "M:\t\tautomatic\n" "N:\t\tM\n" "execution_time:\t10\n"); } //! Parse command line bool ParseCommandLine(int argc, char* argv[]) { char* s = argv[1]; char* end; //! command line without parameters if (argc == 1) { low = utility::get_default_num_threads(); high = low; execution_time = 5; return true; } //! command line with parameters if (argc != 4) { PrintUsage(); return false; } if (std::string("-t") != argv[argc - 2]) //! process M[:N] parameter high = strtol(s, &end, 0); low = strtol(s, &end, 0); switch (*end) { case ':': high = strtol(end + 1, nullptr, 0); break; case '\0': break; default: PrintUsage(); return false; } if (high < low) { std::cout << "Set correct range. Current range: " << low << ":" << high << "\n"; PrintUsage(); return false; } //! 
process execution_time parameter execution_time = strtol(argv[argc - 1], &end, 0); return true; } int main(int argc, char* argv[]) { if (!ParseCommandLine(argc, argv)) return -1; SequentialEvolution* m_seq; ParallelEvolution* m_par; Board* m_board1; Board* m_board2; int* count = nullptr; int boardWidth = 300; int boardHeight = 300; m_board1 = new Board(boardWidth, boardHeight, BOARD_SQUARE_SIZE, count); m_board2 = new Board(boardWidth, boardHeight, BOARD_SQUARE_SIZE, count); time_t now = time(nullptr); printf("Generate Game of life board\n"); m_board1->seed((int)now); m_board2->seed(m_board1); m_seq = new SequentialEvolution(m_board1->m_matrix, m_board1); m_seq->Run(execution_time, 1); delete m_seq; m_par = new ParallelEvolution(m_board2->m_matrix, m_board2); for (int p = low; p <= high; ++p) { m_par->Run(execution_time, p); } delete m_par; delete m_board1; delete m_board2; return 0; } ================================================ FILE: third-party/tbb/examples/parallel_for/game_of_life/README.md ================================================ # Game_of_life sample The "Game of life" example demonstrates interoperability of oneAPI Threading Building Blocks (oneTBB) and Microsoft* .NET*. This program runs 2 simultaneous instances of the classic Conway's "Game of Life". One of these instances uses serial calculations to update the board. The other one calculates in parallel with oneTBB. The visualization is written in managed C++ and uses .NET CLR. ## Building the example ``` cmake cmake --build . ``` ## Running the sample ### Predefined make targets * `make run_game_of_life` - executes the example with predefined parameters. * `make light_test_game_of_life` - executes the example with suggested parameters to reduce execution time. ### Application parameters Usage: ``` game_of_life [M[:N] -t execution_time] [-h] ``` * `-h` - prints the help for command line options. * `M:N` - range of numbers of threads to be used. 
* `execution_time` - time (in sec) for execution `game_of_life` iterations. ================================================ FILE: third-party/tbb/examples/parallel_for/game_of_life/Update_state.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "Evolution.hpp" #ifdef USE_SSE /* Update states with SSE */ #include #include inline void create_record(char* src, unsigned* dst, unsigned width) { dst[0] |= src[width - 1]; for (unsigned i = 0; i < 31u; ++i) dst[0] |= src[i] << (i + 1); unsigned col; for (unsigned col = 31u; col < width; ++col) dst[(col + 1) / 32u] |= src[col] << ((col + 1) % 32u); dst[(col + 1) / 32u] |= src[0] << ((col + 1) % 32u); } inline void sum_offset(__m128i* X, __m128i* A, __m128i* B, __m128i* C, unsigned size_sse_ar, unsigned shift) { for (unsigned i = 0; i < size_sse_ar; ++i) { __m128i tmp = _mm_and_si128(A[i], X[shift + i]); A[i] = _mm_xor_si128(A[i], X[shift + i]); C[i] = _mm_or_si128(C[i], _mm_and_si128(B[i], tmp)); B[i] = _mm_xor_si128(B[i], tmp); } } inline void shift_left2D(__m128i* X, unsigned height, unsigned size_sse_row) { for (unsigned row = 0; row < height; ++row) { unsigned ind = row * size_sse_row; unsigned x0 = X[ind].m128i_u32[0] & 1; X[ind] = _mm_or_si128(_mm_srli_epi16(X[ind], 1), _mm_slli_epi16(_mm_srli_si128(X[ind], 2), 15)); unsigned x1 = X[ind + 1].m128i_u32[0] & 1; X[ind + 1] = _mm_or_si128(_mm_srli_epi16(X[ind + 1], 1), _mm_slli_epi16(_mm_srli_si128(X[ind 
// Shifts every packed row of X by one bit towards higher bit indices and wraps
// the last bit of the row around into bit 0.
//
//   X            - bit-packed rows
//   height       - number of rows to process
//   size_sse_row - distance, in __m128i elements, between the starts of two rows
//
// NOTE: the body hard-codes three registers per row (ind, ind + 1, ind + 2) and
// bit 301 as the last bit of a row, so it only fits the row layout those
// constants were chosen for.
// NOTE(review): m128i_u32 looks like the MSVC-specific member view of __m128i;
// confirm before building this path with another compiler.
inline void shift_right2D(__m128i* X, unsigned height, unsigned size_sse_row) {
    for (unsigned row = 0; row < height; ++row) {
        unsigned ind = row * size_sse_row;

        // Save the top bit (bit 127) of the first register before it is shifted out.
        unsigned x0 = X[ind].m128i_u32[3];
        x0 >>= 31;
        // Whole-register shift left by one bit: every 16-bit lane is shifted left
        // by 1, and the top bit of the lane below it (obtained by a 2-byte
        // register shift followed by >> 15) is OR-ed into its bit 0.
        X[ind] = _mm_or_si128(_mm_slli_epi16(X[ind], 1),
                              _mm_srli_epi16(_mm_slli_si128(X[ind], 2), 15));

        // Same for the second register; its saved top bit becomes x1 and the
        // carry x0 from the first register enters at its bit 0.
        unsigned x1 = X[ind + 1].m128i_u32[3];
        x1 >>= 31;
        X[ind + 1] = _mm_or_si128(_mm_slli_epi16(X[ind + 1], 1),
                                  _mm_srli_epi16(_mm_slli_si128(X[ind + 1], 2), 15));
        X[ind + 1].m128i_u32[0] |= x0;

        // Read bit 301 of the row (32-bit word 9, i.e. inside the third register)
        // BEFORE that register is shifted; it is the bit that wraps around.
        unsigned* dst = (unsigned*)&X[ind];
        unsigned x2 = dst[301 / 32u] & (1 << (301 % 32u));
        x2 >>= (301 % 32u);

        // Shift the third register and feed in the carry from the second one.
        X[ind + 2] = _mm_or_si128(_mm_slli_epi16(X[ind + 2], 1),
                                  _mm_srli_epi16(_mm_slli_si128(X[ind + 2], 2), 15));
        X[ind + 2].m128i_u32[0] |= x1;

        // Wrap-around: the former bit 301 becomes bit 0 of the row.
        X[ind].m128i_u32[0] |= x2;
    }
}
width]; unsigned* dst = (unsigned*)&X[(row + 1) * size_sse_row]; create_record(src, dst, width); } // create high row in X[] char* src; if (begin == 0) { src = &mas[(m_matrix->height - 1) * width]; } else { src = &mas[(begin - 1) * width]; } unsigned* dst = (unsigned*)X; create_record(src, dst, width); //create lower row in X[] if (end == m_matrix->height) { src = mas; } else { src = &mas[end * width]; } dst = (unsigned*)&X[(height + 1) * size_sse_row]; create_record(src, dst, width); //sum( C, B, A, X+offset_for_upwards ); high-left friend sum_offset(X, A, B, C, size_sse_ar, 0); //sum( C, B, A, X+offset_for_no_vertical_shift ); sum_offset(X, A, B, C, size_sse_ar, size_sse_row); //sum( C, B, A, X+offset_for_downwards ); sum_offset(X, A, B, C, size_sse_ar, 2 * size_sse_row); //shift_left( X ); (when view 2D) in our logic it is in right height = end - begin + 2; shift_left2D(X, height, size_sse_row); //sum( C, B, A, X+offset_for_upwards ); high-left friend sum_offset(X, A, B, C, size_sse_ar, 0); //sum( C, B, A, X+offset_for_downwards ); sum_offset(X, A, B, C, size_sse_ar, 2 * size_sse_row); //shift_left( X ); (view in 2D) in our logic it is right shift height = end - begin + 2; shift_left2D(X, height, size_sse_row); //sum( C, B, A, X+offset_for_upwards ); high-right friend sum_offset(X, A, B, C, size_sse_ar, 0); //sum( C, B, A, X+offset_for_no_vertical_shift ); right friend sum_offset(X, A, B, C, size_sse_ar, size_sse_row); //sum( C, B, A, X+offset_for_downwards ); right down friend sum_offset(X, A, B, C, size_sse_ar, 2 * size_sse_row); //shift_right( X ); (when view in 2D) in our case it left shift. 
// ----------------------------------------------------------------------
// GetAdjacentCellState() - returns the state (value) of one neighbour of
// the cell "cellNumber". The field is stored row by row in "source" and
// wraps around at all four edges.
//
// "cp" selects the neighbour, numbered clockwise starting at the upper left:
//
//      1 2 3
//      8 . 4
//      7 6 5
//
// Any other value of "cp" yields 0.
char GetAdjacentCellState(char* source, // pointer to source data block
                          int x, // logical width of field
                          int y, // logical height of field
                          int cellNumber, // number of cell position to examine
                          int cp // which adjacent position
) {
    const int cellCount = x * y;
    const int lastRowStart = cellCount - x; // index of the first cell of the bottom row

    // boundary flags that trigger the field-wrap logic
    const bool top = cellNumber < x;
    const bool bottom = (cellCount - cellNumber) <= x;
    const bool left = (cellNumber % x) == 0;
    const bool right = ((cellNumber + 1) % x) == 0;

    int neighbour; // index of the selected neighbour inside source

    switch (cp) {
        case 1: // upper left
            if (top && left)
                neighbour = cellCount - 1;
            else if (top)
                neighbour = lastRowStart + (cellNumber - 1);
            else if (left)
                neighbour = cellNumber - 1;
            else
                neighbour = cellNumber - (x + 1);
            break;

        case 2: // up
            neighbour = top ? (lastRowStart + cellNumber) : (cellNumber - x);
            break;

        case 3: // upper right
            if (top && right)
                neighbour = lastRowStart;
            else if (top)
                neighbour = lastRowStart + (cellNumber + 1);
            else if (right)
                neighbour = (cellNumber - (x * 2)) + 1;
            else
                neighbour = cellNumber - (x - 1);
            break;

        case 4: // right
            neighbour = right ? (cellNumber - (x - 1)) : (cellNumber + 1);
            break;

        case 5: // lower right
            if (bottom && right)
                neighbour = 0;
            else if (bottom)
                neighbour = (cellNumber - lastRowStart) + 1;
            else if (right)
                neighbour = cellNumber + 1;
            else
                neighbour = (cellNumber + x) + 1;
            break;

        case 6: // down
            neighbour = bottom ? (cellNumber - lastRowStart) : (cellNumber + x);
            break;

        case 7: // lower left
            if (bottom && left)
                neighbour = x - 1;
            else if (bottom)
                neighbour = cellNumber - lastRowStart - 1;
            else if (left)
                neighbour = cellNumber + ((x * 2) - 1);
            else
                neighbour = cellNumber + (x - 1);
            break;

        case 8: // left
            neighbour = left ? (cellNumber + (x - 1)) : (cellNumber - 1);
            break;

        default: // not a neighbour position
            return 0;
    }

    return source[neighbour];
}
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. cmake_minimum_required(VERSION 3.5.0...3.31.3) project(polygon_overlay CXX) include(../../common/cmake/common.cmake) set_common_project_settings("tbb;tbbmalloc") add_executable(polygon_overlay pover_video.cpp polymain.cpp polyover.cpp) add_subdirectory(../../common/gui gui) target_link_libraries(polygon_overlay TBB::tbb TBB::tbbmalloc Threads::Threads UI_LIB_polygon_overlay) target_compile_options(polygon_overlay PRIVATE ${TBB_CXX_STD_FLAG}) if (MSVC) target_compile_options(polygon_overlay PRIVATE -DNOMINMAX) endif() set(EXECUTABLE "$") set(ARGS "") set(LIGHT_ARGS --polys 10 --size 5x5) add_execution_target(run_polygon_overlay polygon_overlay ${EXECUTABLE} "${ARGS}") add_execution_target(light_test_polygon_overlay polygon_overlay ${EXECUTABLE} "${LIGHT_ARGS}") ================================================ FILE: third-party/tbb/examples/parallel_for/polygon_overlay/README.md ================================================ # Polygon_overlay sample Polygon Overlay example that demonstrates the use of `parallel_for`. This example is a simple implementation of polygon overlay, as described in Parallelizing the [Polygon Overlay Problem Using Orca, by H.F. Langendoen](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.46.9538). The solution was implemented in three forms: * The naive serial solution. * The naive parallel solution, by splitting list of polygons from one map and intersecting each sub-list against the entire list of polygons from the second map. 
* A parallel solution where each map is split into submaps, with each resulting submap being intersected against the corresponding submap from the other map. This solution requires some redundancy (some polygons are members of more than one submap). To prevent multiple copies of a polygon from being placed in the solution map, if both polygons are duplicated (that is, if they both appear in more than one map), they are intersected but the result is not placed in the solution map. The only optimization in each solution is that the area of the generated sub-polygons are subtracted from the original area of one of the source polygons. When the remaining area is zero, the intersection process is halted. A word about the speedup of the submap case. One may get superlinear speedup in this case (for instance a laptop with Intel® Core(TM) Duo processor got a speedup of about 20 percent over serial.) This results from two effects: * the number of threads used, and * the fact that for each submap, the number of polygons is smaller than that for the other two cases. If there are, say, 400 polygons in each map, then on average the number of intersections calculated is approximately 80,000 (400 * 200, where 200 is the average number of polygons examined before stopping.) If the maps are split into 2 submaps, the time for each submap is about 200*100, or 20,000. So even comparing the two sets of submaps serially should result in a speedup somewhere around 2. This number is affected by the number of redundant polygons being compared; this effect would eventually swamp the gain from comparing smaller numbers of polygons per submap. And remember the submaps are created by intersecting each map with a rectangular polygon covering the submap being generated, which is additional work taking about `N * O(400)` in the case above, where `N` is the number of submaps generated, that can be done in parallel. 
Running the default release pover while varying the number of submaps from 1 to 1000, the speedup on the submap case for a 2-processor system looks like ![Speedup vs Submap count](speedup.gif) One further optimization would be to sort one map, say map1, by maxY, and sort the other map (map2) by minY. For p1 in map1, start testing for intersection at the first p2 in map2 that intersected the last polygon tested in map1. This would speed up the intersection process greatly, but the optimization would apply to all the methods, and the sort would have to be accounted for in the timing. The source maps are generated pseudo-randomly in the manner described in the paper above. That is, if we need `N` polygons, then `N` "boxes" are chosen at random, then one-at-a-time the areas are expanded in one of four directions until the area hits an adjacent polygon. When this process is finished, the resulting map is inspected and any remaining unoccupied "boxes" are made into additional polygons, as large as possible in each case. So the actual number of polygons in each map will in general be larger than the number of polygons requested (sometimes by 10% or more.) One limitation of the program is that if the number of polygons in the source map is greater than the number of "boxes" (pixels in the GUI case), the maps cannot be generated. ## Building the example ``` cmake [EXAMPLES_UI_MODE=value] cmake --build . ``` ### Predefined CMake variables * `EXAMPLES_UI_MODE` - defines the GUI mode, supported values are `gdi`, `d2d`, `con` on Windows, `x`,`con` on Linux and `mac`,`con` on macOS. The default mode is `con`. See the [common page](../../README.md) to get more information. ## Running the sample ### Predefined make targets * `make run_polygon_overlay` - executes the example with predefined parameters. * `make light_test_polygon_overlay` - executes the example with suggested parameters to reduce execution time.
### Application parameters Usage: ``` polygon_overlay [threads[:threads2]] [--polys npolys] [--size nnnxnnn] [--seed nnn] [--csv filename] [--grainsize n] [--use_malloc] ``` * `-h` - prints the help for command line options. * `threads[:threads2]` - number of threads to run. * `--polys npolys` - number of polygons in each map. * `--size nnnxnnn` - size of each map (X x Y). * `--seed nnn` - initial value of random number generator. * `--csv filename` - write timing data to CSV-format file. * `--grainsize n` - set grainsize to n. * `--use_malloc` - allocate polygons with malloc instead of scalable allocator. ================================================ FILE: third-party/tbb/examples/parallel_for/polygon_overlay/gui/polygon_overlay.rc ================================================ // Microsoft Visual C++ generated resource script. // #include "resource.h" #define APSTUDIO_READONLY_SYMBOLS ///////////////////////////////////////////////////////////////////////////// // // Generated from the TEXTINCLUDE 2 resource. // #include ///////////////////////////////////////////////////////////////////////////// #undef APSTUDIO_READONLY_SYMBOLS ///////////////////////////////////////////////////////////////////////////// // English (U.S.) resources #if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU) LANGUAGE 9, 1 #pragma code_page(1252) #ifdef APSTUDIO_INVOKED ///////////////////////////////////////////////////////////////////////////// // // TEXTINCLUDE // 1 TEXTINCLUDE BEGIN "resource.h\0" END 2 TEXTINCLUDE BEGIN "#include ""windows.h""\r\n" "\0" END 3 TEXTINCLUDE BEGIN "\r\n" "\0" END #endif // APSTUDIO_INVOKED #endif // English (U.S.) resources ///////////////////////////////////////////////////////////////////////////// #ifndef APSTUDIO_INVOKED ///////////////////////////////////////////////////////////////////////////// // // Generated from the TEXTINCLUDE 3 resource. 
// ///////////////////////////////////////////////////////////////////////////// #endif // not APSTUDIO_INVOKED ================================================ FILE: third-party/tbb/examples/parallel_for/polygon_overlay/gui/resource.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ //{{NO_DEPENDENCIES}} // Microsoft Visual C++ generated include file. // Used by pover.rc // Next default values for new objects // #ifdef APSTUDIO_INVOKED #ifndef APSTUDIO_READONLY_SYMBOLS #define _APS_NEXT_RESOURCE_VALUE 101 #define _APS_NEXT_COMMAND_VALUE 40001 #define _APS_NEXT_CONTROL_VALUE 1001 #define _APS_NEXT_SYMED_VALUE 101 #endif #endif ================================================ FILE: third-party/tbb/examples/parallel_for/polygon_overlay/polymain.cpp ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ // Polygon overlay // // Don't want warnings about deprecated sscanf, getenv #ifndef _CRT_SECURE_NO_DEPRECATE #define _CRT_SECURE_NO_DEPRECATE #endif #define _MAIN_C_ 1 #include #include #include #include #include "oneapi/tbb/tick_count.h" #include "pover_global.hpp" #include "polyover.hpp" #include "pover_video.hpp" #include "polymain.hpp" #if _DEBUG const char *faceNames[] = { "North", "East", "South", "West" }; #endif /** **/ int main(int argc, char *argv[]) { pover_video poly; poly.threaded = true; gVideo = &poly; if (!initializeVideo(argc, argv)) { return -1; } gIsGraphicalVersion = poly.graphic_display(); if (argc > 1) { if (!ParseCmdLine(argc, argv)) { if (gIsGraphicalVersion) rt_sleep(10000); // if graphical, we haven't opened the console window so all the error messages we // so carefully wrote out disappeared into the ether. :( return -1; } } if (gCsvFilename != nullptr) { #define BUFLEN 1000 std::string fname_buf = gCsvFilename; fname_buf += ".csv"; gCsvFile.open(fname_buf.c_str()); } // we have gMapXSize and gMapYSize determining the number of "squares" // we have g_xwinsize and g_ywinsize the total size of the window // we also have BORDER_SIZE the size of the border between maps // we need to determine // g_polyBoxSize -- the number of pixels on each size of each square if (gIsGraphicalVersion) { int xpixelsPerMap = (g_xwinsize - 4 * BORDER_SIZE) / 3; // three maps, with borders between and outside gMapXSize = xpixelsPerMap; // make the boxes one per pixel gPolyXBoxSize = xpixelsPerMap / gMapXSize; int ypixelsPerMap = (g_ywinsize - 2 * BORDER_SIZE); // one map vertically gMapYSize = ypixelsPerMap; // one pixel per box, rather. 
gPolyYBoxSize = ypixelsPerMap / gMapYSize; if ((gPolyXBoxSize == 0) || (gPolyYBoxSize == 0)) { std::cout << "The display window is not large enough to show the maps" << "\n"; int minxSize = 4 * BORDER_SIZE + 3 * gMapXSize; int minySize = 2 * BORDER_SIZE + gMapYSize; std::cout << " Should be at least " << minxSize << " x " << minySize << "." << "\n"; return -1; } map2XLoc = 2 * BORDER_SIZE + gMapXSize * gPolyXBoxSize; maprXLoc = 3 * BORDER_SIZE + 2 * gMapXSize * gPolyXBoxSize; } else { // not gIsGraphicalVersion // gMapXSize, gMapYSize, gNPolygons defined in pover_global.h } // create two polygon maps SetRandomSeed(gMyRandomSeed); // for repeatability gVideo->main_loop(); return 0; } void Usage(int argc, char *argv[]) { char *cmdTail = strrchr(*argv, '\\'); if (cmdTail == nullptr) { cmdTail = *argv; } else { cmdTail++; } std::cout << cmdTail << " [threads[:threads2]] [--polys npolys] [--size nnnxnnn] [--seed nnn] [--csv filename] [--grainsize n] [--use_malloc]" << "\n"; std::cout << "Create polygon maps and overlay them." 
<< "\n" << "\n"; std::cout << "Parameters:" << "\n"; std::cout << " threads[:threads2] - number of threads to run" << "\n"; std::cout << " --polys npolys - number of polygons in each map" << "\n"; std::cout << " --size nnnxnnn - size of each map (X x Y)" << "\n"; std::cout << " --seed nnn - initial value of random number generator" << "\n"; std::cout << " --csv filename - write timing data to CSV-format file" << "\n"; std::cout << " --grainsize n - set grainsize to n" << "\n"; std::cout << " --use_malloc - allocate polygons with malloc instead of scalable allocator" << "\n"; std::cout << "\n"; std::cout << "npolys must be smaller than the size of the map" << "\n"; std::cout << "\n"; std::exit(-1); } bool ParseCmdLine(int argc, char *argv[]) { bool error_found = false; bool nPolysSpecified = false; bool nMapSizeSpecified = false; bool nSeedSpecified = false; bool csvSpecified = false; bool grainsizeSpecified = false; bool mallocSpecified = false; int origArgc = argc; char **origArgv = argv; unsigned int newnPolygons = gNPolygons; unsigned int newSeed = gMyRandomSeed; unsigned int newX = gMapXSize; unsigned int newY = gMapYSize; unsigned int newGrainSize = gGrainSize; argc--; argv++; if (argc > 0 && isdigit((*argv)[0])) { // first argument is one or two numbers, specifying how mny threads to run char *end; gThreadsHigh = gThreadsLow = (int)strtol(argv[0], &end, 0); switch (*end) { case ':': gThreadsHigh = (int)strtol(end + 1, nullptr, 0); break; case '\0': break; default: std::cout << "Unexpected character in thread specifier: " << *end << "\n"; break; } if (gThreadsLow > gThreadsHigh) { int t = gThreadsLow; gThreadsLow = gThreadsHigh; gThreadsHigh = t; } argv++; argc--; } while (argc > 0) { // format 1: --size nnnxnnn, where nnn in {0 .. 
9}+ -- size of map in "squares" if (!strncmp("--size", *argv, (std::size_t)6)) { if (nMapSizeSpecified) { std::cout << " Error: map size multiply specified" << "\n"; error_found = true; } else { argv++; argc--; if (argc == 0) { error_found = true; std::cout << " Error: --size must have a value" << "\n"; } if (strchr(*argv, 'x') != strrchr(*argv, 'x')) { // more than one 'x' std::cout << "Error: map size should be nnnxnnn (" << *argv << ")" << "\n"; error_found = true; } else { int rval; rval = sscanf(*argv, "%ux%u", &newX, &newY); if (rval != 2) { std::cout << "Error parsing map size (format should be nnnxnnn (" << *argv << ")" << "\n"; error_found = true; } if (newX == 0 || newY == 0) { std::cout << "Error: size of map should be greater than 0 (" << *argv << ")" << "\n"; error_found = true; } } } argc--; argv++; } // format 2: --seed nnn -- initial random number seed else if (!strncmp("--seed", *argv, (std::size_t)6)) { argv++; argc--; if (nSeedSpecified) { std::cout << "Error: new seed multiply specified" << "\n"; error_found = true; } else { nSeedSpecified = true; int rtval = sscanf(*argv, "%u", &newSeed); if (rtval == 0) { std::cout << "Error: --seed should be an unsigned number (instead of " << *argv << ")" << "\n"; error_found = true; } } argv++; argc--; } // format 3: --polys n[n] -- number of polygons in each map else if (!strncmp("--polys", *argv, (std::size_t)7)) { //unsigned int newnPolygons; argv++; argc--; if (nPolysSpecified) { std::cout << "Error: number of polygons multiply-specified" << "\n"; error_found = true; } else { int rtval = sscanf(*argv, "%u", &newnPolygons); if (newnPolygons == 0) { std::cout << "Error: number of polygons must be greater than 0 (" << *argv << ")" << "\n"; } } argv++; argc--; } // format 4: --csv -- name of CSV output file ("xxx" for "xxx.csv") else if (!strncmp("--csv", *argv, (std::size_t)5)) { argv++; argc--; if (csvSpecified) { std::cout << "Error: Multiple specification of CSV file" << "\n"; error_found = true; } else 
{ gCsvFilename = *argv; argv++; argc--; csvSpecified = true; } } else if (!strncmp("--grainsize", *argv, (std::size_t)11)) { argv++; argc--; if (grainsizeSpecified) { std::cout << "Error: Multiple specification of grainsize" << "\n"; error_found = true; } else { int grval = sscanf(*argv, "%u", &newGrainSize); grainsizeSpecified = true; if (newGrainSize == 0) { std::cout << "Error: grainsize must be greater than 0" << "\n"; error_found = true; } } argv++; argc--; } else if (!strncmp("--use_malloc", *argv, (std::size_t)12)) { argv++; argc--; if (mallocSpecified) { std::cout << "Error: --use_malloc multiply-specified" << "\n"; error_found = true; } else { mallocSpecified = true; gMBehavior = UseMalloc; } } else { std::cout << "Error: unrecognized argument: " << *argv << "\n"; error_found = true; argv++; argc--; } } if (!error_found) { if (newX * newY < newnPolygons) { error_found = true; std::cout << "Error: map size should not be smaller than the number of polygons (gNPolygons = " << newnPolygons << ", map size " << newX << "x" << newY << ")" << "\n"; } } if (!error_found) { gMapXSize = newX; gMapYSize = newY; gNPolygons = newnPolygons; gMyRandomSeed = newSeed; gGrainSize = (int)newGrainSize; } else { Usage(origArgc, origArgv); } return !error_found; } // create a polygon map with at least gNPolygons polygons. // Usually more than gNPolygons polygons will be generated, because the // process of growing the polygons results in holes. bool GenerateMap(Polygon_map_t **newMap, int xSize, int ySize, int gNPolygons, colorcomp_t maxR, colorcomp_t maxG, colorcomp_t maxB) { bool error_found = false; int *validPolys; int *validSide; int maxSides; RPolygon *newPoly; if (xSize <= 0) { std::cout << "xSize (" << xSize << ") should be > 0." << "\n"; error_found = true; } if (ySize <= 0) { std::cout << "ySize (" << ySize << ") should be > 0." 
<< "\n"; error_found = true; } if (gNPolygons > (xSize * ySize)) { std::cout << "gNPolygons (" << gNPolygons << ") should be less than " << (xSize * ySize) << "\n"; error_found = true; } if (error_found) return false; // the whole map is [xSize x ySize] squares // the way we create the map is to // 1) pick nPolygon discrete squares on an [xSize x ySize] grid // 2) while there are unused squares on the grid // 3) pick a polygon with a side that has unused squares on a side // 4) expand the polygon by 1 to occupy the unused squares // // Continue until every square on the grid is occupied by a polygon int *tempMap; tempMap = (int *)malloc(xSize * ySize * sizeof(int)); for (int i = 0; i < xSize; i++) { for (int j = 0; j < ySize; j++) { tempMap[i * ySize + j] = 0; } } // *newMap = new vector; *newMap = new Polygon_map_t; (*newMap)->reserve(gNPolygons + 1); // how much bigger does this need to be on average? (*newMap)->push_back(RPolygon(0, 0, xSize - 1, ySize - 1)); for (int i = 0; i < gNPolygons; i++) { int nX; int nY; do { // look for an empty square. nX = NextRan(xSize); nY = NextRan(ySize); } while (tempMap[nX * ySize + nY] != 0); int nR = (maxR * NextRan(1000)) / 999; int nG = (maxG * NextRan(1000)) / 999; int nB = (maxB * NextRan(1000)) / 999; (*newMap)->push_back(RPolygon(nX, nY, nX, nY, nR, nG, nB)); tempMap[nX * ySize + nY] = i + 1; // index of this polygon + 1 } // now have to grow polygons to fill the space. 
validPolys = (int *)malloc(4 * gNPolygons * sizeof(int)); validSide = (int *)malloc(4 * gNPolygons * sizeof(int)); for (int i = 0; i < gNPolygons; i++) { validPolys[4 * i] = validPolys[4 * i + 1] = validPolys[4 * i + 2] = validPolys[4 * i + 3] = i + 1; validSide[4 * i] = NORTH_SIDE; validSide[4 * i + 1] = EAST_SIDE; validSide[4 * i + 2] = SOUTH_SIDE; validSide[4 * i + 3] = WEST_SIDE; } maxSides = 4 * gNPolygons; while (maxSides > 0) { int indx = NextRan(maxSides); int polyIndx = validPolys[indx]; int checkSide = validSide[indx]; int xlow, xhigh, ylow, yhigh; int xlnew, xhnew, ylnew, yhnew; (**newMap)[polyIndx].get(&xlow, &ylow, &xhigh, &yhigh); xlnew = xlow; xhnew = xhigh; ylnew = ylow; yhnew = yhigh; // can this polygon be expanded along the chosen side? switch (checkSide) { case NORTH_SIDE: // y-1 from xlow to xhigh ylow = yhigh = (ylow - 1); ylnew--; break; case EAST_SIDE: // x+1 from ylow to yhigh xlow = xhigh = (xhigh + 1); xhnew++; break; case SOUTH_SIDE: // y+1 from xlow to xhigh ylow = yhigh = (yhigh + 1); yhnew++; break; case WEST_SIDE: // x-1 from ylow to yhigh xlow = xhigh = (xlow - 1); xlnew--; break; } bool okay_to_extend = !(((xlow < 0) || (xlow >= xSize)) || ((ylow < 0) || (ylow >= ySize))); for (int i = xlow; (i <= xhigh) && okay_to_extend; i++) { for (int j = ylow; (j <= yhigh) && okay_to_extend; j++) { okay_to_extend = tempMap[i * ySize + j] == 0; } } if (okay_to_extend) { (**newMap)[polyIndx].set(xlnew, ylnew, xhnew, yhnew); for (int i = xlow; i <= xhigh; i++) { for (int j = ylow; j <= yhigh && okay_to_extend; j++) { tempMap[i * ySize + j] = polyIndx; } } } else { // once we cannot expand along a side, we will never be able to; remove from the list. for (int i = indx + 1; i < maxSides; i++) { validPolys[i - 1] = validPolys[i]; validSide[i - 1] = validSide[i]; } maxSides--; } } // Once no polygons can be grown, look for unused squares, and fill them with polygons. 
for (int j = 0; j < ySize; j++) { for (int i = 0; i < xSize; i++) { if (tempMap[i * ySize + j] == 0) { // try to grow in the x direction, then the y direction int ilen = i; int jlen = j; while (ilen < (xSize - 1) && tempMap[(ilen + 1) * ySize + jlen] == 0) { ilen++; } bool yok = true; while (yok && jlen < (ySize - 1)) { for (int k = i; k <= ilen && yok; k++) { yok = (tempMap[k * ySize + jlen + 1] == 0); } if (yok) { jlen++; } } // create new polygon and push it on our list. int nR = (maxR * NextRan(1000)) / 999; int nG = (maxG * NextRan(1000)) / 999; int nB = (maxB * NextRan(1000)) / 999; (*newMap)->push_back(RPolygon(i, j, ilen, jlen, nR, nG, nB)); gNPolygons++; for (int k = i; k <= ilen; k++) { for (int l = j; l <= jlen; l++) { tempMap[k * ySize + l] = gNPolygons; } } } } } #if _DEBUG if (!gIsGraphicalVersion) { std::cout << "\n" << "Final Map:" << "\n"; for (int j = 0; j < ySize; j++) { std::cout << "Row " << std::setw(2) << j << ":"; for (int i = 0; i < xSize; i++) { int it = tempMap[i * ySize + j]; if (it < 10) { std::cout << std::setw(2) << it; } else { char ct = (int)'a' + it - 10; std::cout << " " << ct; } } std::cout << "\n"; } } #endif // _DEBUG free(tempMap); free(validPolys); free(validSide); return true; } void CheckPolygonMap(Polygon_map_t *checkMap) { #define indx(i, j) (i * gMapYSize + j) #define rangeCheck(str, n, limit) \ if (((n) < 0) || ((n) >= limit)) { \ std::cout << "checkMap error: " << str << " out of range (" << n << ")" \ << "\n"; \ anError = true; \ } #define xRangeCheck(str, n) rangeCheck(str, n, gMapXSize) #define yRangeCheck(str, n) rangeCheck(str, n, gMapYSize) // The first polygon is the whole map. bool anError = false; int *cArray; if (checkMap->size() <= 0) { std::cout << "checkMap error: no polygons in map" << "\n"; return; } // mapXhigh and mapYhigh are inclusive, that is, if the map is 5x5, those values would be 4. 
int mapXhigh, mapYhigh, mapLowX, mapLowY; int gMapXSize, gMapYSize; (*checkMap)[0].get(&mapLowX, &mapLowY, &mapXhigh, &mapYhigh); if ((mapLowX != 0) || (mapLowY != 0)) { std::cout << "checkMap error: map origin not (0,0) (X=" << mapLowX << ", Y=" << mapLowY << ")" << "\n"; anError = true; } if ((mapXhigh < 0) || (mapYhigh < 0)) { std::cout << "checkMap error: no area in map (X=" << mapXhigh << ", Y=" << mapYhigh << ")" << "\n"; anError = true; } if (anError) return; // bounds for array. gMapXSize = mapXhigh + 1; gMapYSize = mapYhigh + 1; cArray = (int *)malloc(sizeof(int) * (gMapXSize * gMapYSize)); for (int i = 0; i < gMapXSize; i++) { for (int j = 0; j < gMapYSize; j++) { cArray[indx(i, j)] = 0; } } int xlow, xhigh, ylow, yhigh; for (int k = 1; k < int(checkMap->size()) && !anError; k++) { (*checkMap)[k].get(&xlow, &ylow, &xhigh, &yhigh); xRangeCheck("xlow", xlow); yRangeCheck("ylow", ylow); xRangeCheck("xhigh", xhigh); yRangeCheck("yhigh", yhigh); if (xlow > xhigh) { std::cout << "checkMap error: xlow > xhigh (" << xlow << "," << xhigh << ")" << "\n"; anError = true; } if (ylow > yhigh) { std::cout << "checkMap error: ylow > yhigh (" << ylow << "," << yhigh << ")" << "\n"; anError = true; } for (int i = xlow; i <= xhigh; i++) { for (int j = ylow; j <= yhigh; j++) { if (cArray[indx(i, j)] != 0) { std::cout << "checkMap error: polygons " << cArray[indx(i, j)] << " and " << k << " intersect" << "\n"; anError = true; } cArray[indx(i, j)] = k; } } } for (int i = 0; i < gMapXSize; i++) { for (int j = 0; j < gMapYSize; j++) { if (cArray[indx(i, j)] == 0) { std::cout << "checkMap error: block(" << i << ", " << j << ") not in any polygon" << "\n"; anError = true; } } } free(cArray); } bool CompOnePolygon(RPolygon &p1, RPolygon &p2) { int xl1, xh1, yl1, yh1; int xl2, xh2, yl2, yh2; p1.get(&xl1, &yl1, &xh1, &yh1); p2.get(&xl2, &yl2, &xh2, &yh2); if (yl1 > yl2) return true; if (yl1 < yl2) return false; return (xl1 > xl2); } bool PolygonsEqual(RPolygon *p1, RPolygon *p2) { 
int xl1, xh1, yl1, yh1; int xl2, xh2, yl2, yh2; p1->get(&xl1, &yl1, &xh1, &yh1); p2->get(&xl2, &yl2, &xh2, &yh2); return ((xl1 == xl2) && (yl1 == yl2) && (xh1 == xh2) && (yh1 == yh2)); } bool ComparePolygonMaps(Polygon_map_t *map1, Polygon_map_t *map2) { // create two new polygon maps, copy the pointers from the original to these. // we have to skip the first polygon, which is the size of the whole map Polygon_map_t *t1, *t2; bool is_ok = true; t1 = new Polygon_map_t; t1->reserve(map1->size()); for (unsigned int i = 1; i < map1->size(); i++) { t1->push_back(map1->at(i)); } t2 = new Polygon_map_t; t2->reserve(map2->size()); for (unsigned int i = 1; i < map2->size(); i++) { t2->push_back(map2->at(i)); } // sort the two created maps by (xlow, ylow) sort(t1->begin(), t1->end()); sort(t2->begin(), t2->end()); // compare each element of both maps. if (t1->size() != t2->size()) { std::cout << "Error: maps not the same size ( " << int(t1->size()) << " vs " << int(t2->size()) << ")." << "\n"; } int maxSize = (int)((t1->size() < t2->size()) ? 
t1->size() : t2->size()); for (int i = 0; i < maxSize; i++) { if (!PolygonsEqual(&((*t1)[i]), &((*t2)[i]))) { std::cout << "Error: polygons unequal (" << (*t1)[i] << " vs " << (*t2)[i] << "\n"; is_ok = false; } } delete t1; delete t2; return is_ok; } void SetRandomSeed(int newSeed) { srand((unsigned)newSeed); } int NextRan(int n) { // assert(n > 1); // if we are given 1, we will just return 0 //assert(n < RAND_MAX); int rrand = rand() << 15 | rand(); if (rrand < 0) rrand = -rrand; return rrand % n; } std::ostream &operator<<(std::ostream &s, const RPolygon &p) { int xl, yl, xh, yh; p.get(&xl, &yl, &xh, &yh); return s << "[(" << xl << "," << yl << ")-(" << xh << "," << yh << ")] "; } ================================================ FILE: third-party/tbb/examples/parallel_for/polygon_overlay/polymain.hpp ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/

// polymain.hpp : globals and extern declarations for the polygon overlay driver.

#ifndef TBB_examples_polygon_overlay_polymain_H
#define TBB_examples_polygon_overlay_polymain_H

#include "pover_global.hpp" // for declaration of DEFINE and INIT

// The two input maps and the reference (serial) result map. They are defined
// in the translation unit that sets _MAIN_C_ and are extern everywhere else
// (see the DEFINE / INIT macros in pover_global.hpp).
DEFINE Polygon_map_t *gPolymap1 INIT(nullptr);
DEFINE Polygon_map_t *gPolymap2 INIT(nullptr);
DEFINE Polygon_map_t *gResultMap INIT(nullptr);

extern void Usage(int argc, char *argv[]);
extern bool ParseCmdLine(int argc, char *argv[]);
// Fills *newMap with rectangles covering an xSize by ySize grid.
extern bool GenerateMap(Polygon_map_t **newMap,
                        int xSize,
                        int ySize,
                        int gNPolygons,
                        colorcomp_t maxR,
                        colorcomp_t maxG,
                        colorcomp_t maxB);
// NOTE(review): callers treat xl/yl/xh/yh as the overlap rectangle when the
// result is true - confirm against the definition.
extern bool PolygonsOverlap(RPolygon *p1, RPolygon *p2, int &xl, int &yl, int &xh, int &yh);
// Reports structural problems of a map (range, overlap, coverage) on std::cout.
extern void CheckPolygonMap(Polygon_map_t *checkMap);
// NOTE(review): polymain.cpp defines CompOnePolygon(RPolygon &, RPolygon &);
// this pointer-taking declaration does not match that definition - confirm.
extern bool CompOnePolygon(RPolygon *p1, RPolygon *p2);
// True when both polygons have identical low and high coordinates.
extern bool PolygonsEqual(RPolygon *p1, RPolygon *p2);
// Compares two maps, ignoring element 0 (the whole-map rectangle) of each.
extern bool ComparePolygonMaps(Polygon_map_t *map1, Polygon_map_t *map2);
// Seeds the generator behind NextRan.
extern void SetRandomSeed(int newSeed);
// Pseudo-random value reduced modulo n.
extern int NextRan(int n);

#endif /* TBB_examples_polygon_overlay_polymain_H */

================================================
FILE: third-party/tbb/examples/parallel_for/polygon_overlay/polyover.cpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/ // Polygon overlay // #include #include #include #include #include #include "oneapi/tbb/tick_count.h" #include "oneapi/tbb/blocked_range.h" #include "oneapi/tbb/parallel_for.h" #include "oneapi/tbb/spin_mutex.h" #include "oneapi/tbb/global_control.h" #include "common/utility/get_default_num_threads.hpp" #include "polyover.hpp" #include "polymain.hpp" #include "pover_video.hpp" /*! * @brief intersects a polygon with a map, adding any results to output map * * @param[out] resultMap output map (must be allocated) * @param[in] polygon to be intersected * @param[in] map intersected against * @param[in] lock to use when adding output polygons to result map * */ void OverlayOnePolygonWithMap(Polygon_map_t *resultMap, RPolygon *myPoly, Polygon_map_t *map2, oneapi::tbb::spin_mutex *rMutex) { int r1, g1, b1, r2, g2, b2; int myr = 0; int myg = 0; int myb = 0; int p1Area = myPoly->area(); for (unsigned int j = 1; (j < map2->size()) && (p1Area > 0); j++) { RPolygon *p2 = &((*map2)[j]); RPolygon *pnew; int newxMin, newxMax, newyMin, newyMax; myPoly->getColor(&r1, &g1, &b1); if (PolygonsOverlap(myPoly, p2, newxMin, newyMin, newxMax, newyMax)) { p2->getColor(&r2, &g2, &b2); myr = r1 + r2; myg = g1 + g2; myb = b1 + b2; p1Area -= (newxMax - newxMin + 1) * (newyMax - newyMin + 1); if (rMutex) { oneapi::tbb::spin_mutex::scoped_lock lock(*rMutex); resultMap->push_back(RPolygon(newxMin, newyMin, newxMax, newyMax, myr, myg, myb)); } else { resultMap->push_back(RPolygon(newxMin, newyMin, newxMax, newyMax, myr, myg, myb)); } } } } /*! 
* @brief Serial version of polygon overlay * @param[out] output map * @param[in] first map (map that individual polygons are taken from) * @param[in] second map (map passed to OverlayOnePolygonWithMap) */ void SerialOverlayMaps(Polygon_map_t **resultMap, Polygon_map_t *map1, Polygon_map_t *map2) { std::cout << "SerialOverlayMaps called" << "\n"; *resultMap = new Polygon_map_t; RPolygon *p0 = &((*map1)[0]); int mapxSize, mapySize, ignore1, ignore2; p0->get(&ignore1, &ignore2, &mapxSize, &mapySize); (*resultMap)->reserve(mapxSize * mapySize); // can't be any bigger than this // push the map size as the first polygon, (*resultMap)->push_back(RPolygon(0, 0, mapxSize, mapySize)); for (unsigned int i = 1; i < map1->size(); i++) { RPolygon *p1 = &((*map1)[i]); OverlayOnePolygonWithMap(*resultMap, p1, map2, nullptr); } } /*! * @class ApplyOverlay * @brief Simple version of parallel overlay (make parallel on polygons in map1) */ class ApplyOverlay { Polygon_map_t *m_map1, *m_map2, *m_resultMap; oneapi::tbb::spin_mutex *m_rMutex; public: /*! * @brief functor to apply * @param[in] r range of polygons to intersect from map1 */ void operator()(const oneapi::tbb::blocked_range &r) const { PRINT_DEBUG("From " << r.begin() << " to " << r.end()); for (int i = r.begin(); i != r.end(); i++) { RPolygon *myPoly = &((*m_map1)[i]); OverlayOnePolygonWithMap(m_resultMap, myPoly, m_map2, m_rMutex); } } ApplyOverlay(Polygon_map_t *resultMap, Polygon_map_t *map1, Polygon_map_t *map2, oneapi::tbb::spin_mutex *rmutex) : m_resultMap(resultMap), m_map1(map1), m_map2(map2), m_rMutex(rmutex) {} }; /*! * @brief apply the parallel algorithm * @param[out] result_map generated map * @param[in] polymap1 first map to be applied (algorithm is parallel on this map) * @param[in] polymap2 second map. 
*/ void NaiveParallelOverlay(Polygon_map_t *&result_map, Polygon_map_t &polymap1, Polygon_map_t &polymap2) { // ----------------------------------- bool automatic_threadcount = false; if (gThreadsLow == THREADS_UNSET || gThreadsLow == utility::get_default_num_threads()) { gThreadsLow = gThreadsHigh = utility::get_default_num_threads(); automatic_threadcount = true; } result_map = new Polygon_map_t; RPolygon *p0 = &(polymap1[0]); int mapxSize, mapySize, ignore1, ignore2; p0->get(&ignore1, &ignore2, &mapxSize, &mapySize); result_map->reserve(mapxSize * mapySize); // can't be any bigger than this // push the map size as the first polygon, oneapi::tbb::spin_mutex *resultMutex = new oneapi::tbb::spin_mutex(); int grain_size = gGrainSize; for (int nthreads = gThreadsLow; nthreads <= gThreadsHigh; nthreads++) { oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, nthreads); if (gIsGraphicalVersion) { RPolygon *xp = new RPolygon(0, 0, gMapXSize - 1, gMapYSize - 1, 0, 0, 0); // Clear the output space delete xp; } // put size polygon in result map result_map->push_back(RPolygon(0, 0, mapxSize, mapySize)); oneapi::tbb::tick_count t0 = oneapi::tbb::tick_count::now(); oneapi::tbb::parallel_for( oneapi::tbb::blocked_range(1, (int)(polymap1.size()), grain_size), ApplyOverlay(result_map, &polymap1, &polymap2, resultMutex)); oneapi::tbb::tick_count t1 = oneapi::tbb::tick_count::now(); double naiveParallelTime = (t1 - t0).seconds() * 1000; std::cout << "Naive parallel with spin lock and "; if (automatic_threadcount) std::cout << "automatic"; else std::cout << nthreads; std::cout << ((nthreads == 1) ? 
" thread" : " threads"); std::cout << " took " << naiveParallelTime << " msec : speedup over serial " << (gSerialTime / naiveParallelTime) << "\n"; if (gCsvFile.is_open()) { gCsvFile << "," << naiveParallelTime; } #if _DEBUG CheckPolygonMap(result_map); ComparePolygonMaps(result_map, gResultMap); #endif result_map->clear(); } delete resultMutex; if (gCsvFile.is_open()) { gCsvFile << "\n"; } // ----------------------------------- } template void split_at(Flagged_map_t &in_map, Flagged_map_t &left_out, Flagged_map_t &right_out, const T median) { left_out.reserve(in_map.size()); right_out.reserve(in_map.size()); for (Flagged_map_t::iterator i = in_map.begin(); i != in_map.end(); ++i) { RPolygon *p = i->p(); if (p->xmax() < median) { // in left map left_out.push_back(*i); } else if (p->xmin() >= median) { right_out.push_back(*i); // in right map } else { // in both maps. left_out.push_back(*i); right_out.push_back(RPolygon_flagged(p, true)); } } } // range that splits the maps as well as the range. the flagged_map_t are // vectors of pointers, and each range owns its maps (has to free them on destruction.) template class blocked_range_with_maps { typedef oneapi::tbb::blocked_range my_range_type; private: my_range_type my_range; Flagged_map_t my_map1; Flagged_map_t my_map2; public: blocked_range_with_maps(T begin, T end, typename my_range_type::size_type my_grainsize, Polygon_map_t *p1, Polygon_map_t *p2) : my_range(begin, end, my_grainsize) { my_map1.reserve(p1->size()); my_map2.reserve(p2->size()); for (int i = 1; i < p1->size(); ++i) { my_map1.push_back(RPolygon_flagged(&((*p1)[i]), false)); } for (int i = 1; i < p2->size(); ++i) { my_map2.push_back(RPolygon_flagged(&(p2->at(i)), false)); } } // copy-constructor required for deep copy of flagged maps. One copy is done at the start of the // parallel for. 
blocked_range_with_maps(const blocked_range_with_maps &other) : my_range(other.my_range), my_map1(other.my_map1), my_map2(other.my_map2) {} bool empty() const { return my_range.empty(); } bool is_divisible() const { return my_range.is_divisible(); } #if _DEBUG void check_my_map() { assert(my_range.begin() <= my_range.end()); for (Flagged_map_t::iterator i = my_map1.begin(); i != my_map1.end(); ++i) { RPolygon *rp = i->p(); assert(rp->xmax() >= my_range.begin()); assert(rp->xmin() < my_range.end()); } for (Flagged_map_t::iterator i = my_map2.begin(); i != my_map2.end(); ++i) { RPolygon *rp = i->p(); assert(rp->xmax() >= my_range.begin()); assert(rp->xmin() < my_range.end()); } } void dump_map(Flagged_map_t &mapx) { std::cout << " ** MAP **\n"; for (Flagged_map_t::iterator i = mapx.begin(); i != mapx.end(); ++i) { std::cout << *(i->p()); if (i->isDuplicate()) { std::cout << " -- is_duplicate"; } std::cout << "\n"; } std::cout << "\n"; } #endif blocked_range_with_maps(blocked_range_with_maps &lhs_r, oneapi::tbb::split) : my_range(my_range_type(lhs_r.my_range, oneapi::tbb::split())) { // lhs_r.my_range makes my_range from [median, high) and rhs_r.my_range from [low, median) Flagged_map_t original_map1 = lhs_r.my_map1; Flagged_map_t original_map2 = lhs_r.my_map2; lhs_r.my_map1.clear(); lhs_r.my_map2.clear(); split_at(original_map1, lhs_r.my_map1, my_map1, my_range.begin()); split_at(original_map2, lhs_r.my_map2, my_map2, my_range.begin()); #if _DEBUG this->check_my_map(); lhs_r.check_my_map(); #endif } const my_range_type &range() const { return my_range; } Flagged_map_t &map1() { return my_map1; } Flagged_map_t &map2() { return my_map2; } }; /*! * @class ApplySplitOverlay * @brief parallel by columnar strip */ class ApplySplitOverlay { Polygon_map_t *m_map1, *m_map2, *m_resultMap; oneapi::tbb::spin_mutex *m_rMutex; public: /*! 
* @brief functor for columnar parallel version * @param[in] r range of map to be operated on */ void operator()(/*const*/ blocked_range_with_maps &r) const { #ifdef _DEBUG // if we are debugging, serialize the method. That way we can // see what is happening in each strip without the interleaving // confusing things. oneapi::tbb::spin_mutex::scoped_lock lock(*m_rMutex); std::cout << std::unitbuf << "From " << r.range().begin() << " to " << r.range().end() - 1 << "\n"; #endif // get yMapSize int r1, g1, b1, r2, g2, b2; int myr = -1; int myg = -1; int myb = -1; int i1, i2, i3, yMapSize; (*m_map1)[0].get(&i1, &i2, &i3, &yMapSize); Flagged_map_t &fmap1 = r.map1(); Flagged_map_t &fmap2 = r.map2(); // When intersecting polygons from fmap1 and fmap2, if BOTH are flagged // as duplicate, don't add the result to the output map. We can still // intersect them, because we are keeping track of how much of the polygon // is left over from intersecting, and quitting when the polygon is // used up. for (unsigned int i = 0; i < fmap1.size(); i++) { RPolygon *p1 = fmap1[i].p(); bool is_dup = fmap1[i].isDuplicate(); int parea = p1->area(); p1->getColor(&r1, &g1, &b1); for (unsigned int j = 0; (j < fmap2.size()) && (parea > 0); j++) { int xl, yl, xh, yh; RPolygon *p2 = fmap2[j].p(); if (PolygonsOverlap(p1, p2, xl, yl, xh, yh)) { if (!(is_dup && fmap2[j].isDuplicate())) { p2->getColor(&r2, &g2, &b2); myr = r1 + r2; myg = g1 + g2; myb = b1 + b2; #ifdef _DEBUG #else oneapi::tbb::spin_mutex::scoped_lock lock(*m_rMutex); #endif (*m_resultMap).push_back(RPolygon(xl, yl, xh, yh, myr, myg, myb)); } parea -= (xh - xl + 1) * (yh - yl + 1); } } } } ApplySplitOverlay(Polygon_map_t *resultMap, Polygon_map_t *map1, Polygon_map_t *map2, oneapi::tbb::spin_mutex *rmutex) : m_resultMap(resultMap), m_map1(map1), m_map2(map2), m_rMutex(rmutex) {} }; /*! 
* @brief intersects two maps strip-wise * * @param[out] resultMap output map (must be allocated) * @param[in] polymap1 map to be intersected * @param[in] polymap2 map to be intersected */ void SplitParallelOverlay(Polygon_map_t **result_map, Polygon_map_t *polymap1, Polygon_map_t *polymap2) { int nthreads; bool automatic_threadcount = false; double domainSplitParallelTime; oneapi::tbb::tick_count t0, t1; oneapi::tbb::spin_mutex *resultMutex; if (gThreadsLow == THREADS_UNSET || gThreadsLow == utility::get_default_num_threads()) { gThreadsLow = gThreadsHigh = utility::get_default_num_threads(); automatic_threadcount = true; } *result_map = new Polygon_map_t; RPolygon *p0 = &((*polymap1)[0]); int mapxSize, mapySize, ignore1, ignore2; p0->get(&ignore1, &ignore2, &mapxSize, &mapySize); (*result_map)->reserve(mapxSize * mapySize); // can't be any bigger than this resultMutex = new oneapi::tbb::spin_mutex(); int grain_size; #ifdef _DEBUG grain_size = gMapXSize / 4; #else grain_size = gGrainSize; #endif for (nthreads = gThreadsLow; nthreads <= gThreadsHigh; nthreads++) { oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, nthreads); if (gIsGraphicalVersion) { RPolygon *xp = new RPolygon(0, 0, gMapXSize - 1, gMapYSize - 1, 0, 0, 0); // Clear the output space delete xp; } // push the map size as the first polygon, (*result_map)->push_back(RPolygon(0, 0, mapxSize, mapySize)); t0 = oneapi::tbb::tick_count::now(); oneapi::tbb::parallel_for( blocked_range_with_maps(0, (int)(mapxSize + 1), grain_size, polymap1, polymap2), ApplySplitOverlay((*result_map), polymap1, polymap2, resultMutex)); t1 = oneapi::tbb::tick_count::now(); domainSplitParallelTime = (t1 - t0).seconds() * 1000; std::cout << "Splitting parallel with spin lock and "; if (automatic_threadcount) std::cout << "automatic"; else std::cout << nthreads; std::cout << ((nthreads == 1) ? 
" thread" : " threads"); std::cout << " took " << domainSplitParallelTime << " msec : speedup over serial " << (gSerialTime / domainSplitParallelTime) << "\n"; if (gCsvFile.is_open()) { gCsvFile << "," << domainSplitParallelTime; } #if _DEBUG CheckPolygonMap(*result_map); ComparePolygonMaps(*result_map, gResultMap); #endif (*result_map)->clear(); } delete resultMutex; if (gCsvFile.is_open()) { gCsvFile << "\n"; } } class ApplySplitOverlayCV { Polygon_map_t *m_map1, *m_map2; concurrent_Polygon_map_t *m_resultMap; public: /*! * @brief functor for columnar parallel version * @param[in] r range of map to be operated on */ void operator()(blocked_range_with_maps &r) const { // get yMapSize int r1, g1, b1, r2, g2, b2; int myr = -1; int myg = -1; int myb = -1; int i1, i2, i3, yMapSize; (*m_map1)[0].get(&i1, &i2, &i3, &yMapSize); Flagged_map_t &fmap1 = r.map1(); Flagged_map_t &fmap2 = r.map2(); // When intersecting polygons from fmap1 and fmap2, if BOTH are flagged // as duplicate, don't add the result to the output map. We can still // intersect them, because we are keeping track of how much of the polygon // is left over from intersecting, and quitting when the polygon is // used up. for (unsigned int i = 0; i < fmap1.size(); i++) { RPolygon *p1 = fmap1[i].p(); bool is_dup = fmap1[i].isDuplicate(); int parea = p1->area(); p1->getColor(&r1, &g1, &b1); for (unsigned int j = 0; (j < fmap2.size()) && (parea > 0); j++) { int xl, yl, xh, yh; RPolygon *p2 = fmap2[j].p(); if (PolygonsOverlap(p1, p2, xl, yl, xh, yh)) { if (!(is_dup && fmap2[j].isDuplicate())) { p2->getColor(&r2, &g2, &b2); myr = r1 + r2; myg = g1 + g2; myb = b1 + b2; (*m_resultMap).push_back(RPolygon(xl, yl, xh, yh, myr, myg, myb)); } parea -= (xh - xl + 1) * (yh - yl + 1); } } } } ApplySplitOverlayCV(concurrent_Polygon_map_t *resultMap, Polygon_map_t *map1, Polygon_map_t *map2) : m_resultMap(resultMap), m_map1(map1), m_map2(map2) {} }; /*! 
* @brief intersects two maps strip-wise, accumulating into a concurrent_vector * * @param[out] resultMap output map (must be allocated) * @param[in] polymap1 map to be intersected * @param[in] polymap2 map to be intersected */ void SplitParallelOverlayCV(concurrent_Polygon_map_t **result_map, Polygon_map_t *polymap1, Polygon_map_t *polymap2) { int nthreads; bool automatic_threadcount = false; double domainSplitParallelTime; oneapi::tbb::tick_count t0, t1; if (gThreadsLow == THREADS_UNSET || gThreadsLow == utility::get_default_num_threads()) { gThreadsLow = gThreadsHigh = utility::get_default_num_threads(); automatic_threadcount = true; } *result_map = new concurrent_Polygon_map_t; RPolygon *p0 = &((*polymap1)[0]); int mapxSize, mapySize, ignore1, ignore2; p0->get(&ignore1, &ignore2, &mapxSize, &mapySize); // (*result_map)->reserve(mapxSize*mapySize); // can't be any bigger than this int grain_size; #ifdef _DEBUG grain_size = gMapXSize / 4; #else grain_size = gGrainSize; #endif for (nthreads = gThreadsLow; nthreads <= gThreadsHigh; nthreads++) { oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, nthreads); if (gIsGraphicalVersion) { RPolygon *xp = new RPolygon(0, 0, gMapXSize - 1, gMapYSize - 1, 0, 0, 0); // Clear the output space delete xp; } // push the map size as the first polygon, (*result_map)->push_back(RPolygon(0, 0, mapxSize, mapySize)); t0 = oneapi::tbb::tick_count::now(); oneapi::tbb::parallel_for( blocked_range_with_maps(0, (int)(mapxSize + 1), grain_size, polymap1, polymap2), ApplySplitOverlayCV((*result_map), polymap1, polymap2)); t1 = oneapi::tbb::tick_count::now(); domainSplitParallelTime = (t1 - t0).seconds() * 1000; std::cout << "Splitting parallel with concurrent_vector and "; if (automatic_threadcount) std::cout << "automatic"; else std::cout << nthreads; std::cout << ((nthreads == 1) ? 
" thread" : " threads"); std::cout << " took " << domainSplitParallelTime << " msec : speedup over serial " << (gSerialTime / domainSplitParallelTime) << "\n"; if (gCsvFile.is_open()) { gCsvFile << "," << domainSplitParallelTime; } #if _DEBUG { Polygon_map_t s_result_map; for (concurrent_Polygon_map_t::const_iterator i = (*result_map)->begin(); i != (*result_map)->end(); ++i) { s_result_map.push_back(*i); } CheckPolygonMap(&s_result_map); ComparePolygonMaps(&s_result_map, gResultMap); } #endif (*result_map)->clear(); } if (gCsvFile.is_open()) { gCsvFile << "\n"; } } // ------------------------------------------------------ class ApplySplitOverlayETS { Polygon_map_t *m_map1, *m_map2; ETS_Polygon_map_t *m_resultMap; public: /*! * @brief functor for columnar parallel version * @param[in] r range of map to be operated on */ void operator()(blocked_range_with_maps &r) const { // get yMapSize int r1, g1, b1, r2, g2, b2; int myr = -1; int myg = -1; int myb = -1; int i1, i2, i3, yMapSize; (*m_map1)[0].get(&i1, &i2, &i3, &yMapSize); Flagged_map_t &fmap1 = r.map1(); Flagged_map_t &fmap2 = r.map2(); // When intersecting polygons from fmap1 and fmap2, if BOTH are flagged // as duplicate, don't add the result to the output map. We can still // intersect them, because we are keeping track of how much of the polygon // is left over from intersecting, and quitting when the polygon is // used up. 
for (unsigned int i = 0; i < fmap1.size(); i++) { RPolygon *p1 = fmap1[i].p(); bool is_dup = fmap1[i].isDuplicate(); int parea = p1->area(); p1->getColor(&r1, &g1, &b1); for (unsigned int j = 0; (j < fmap2.size()) && (parea > 0); j++) { int xl, yl, xh, yh; RPolygon *p2 = fmap2[j].p(); if (PolygonsOverlap(p1, p2, xl, yl, xh, yh)) { if (!(is_dup && fmap2[j].isDuplicate())) { p2->getColor(&r2, &g2, &b2); myr = r1 + r2; myg = g1 + g2; myb = b1 + b2; (*m_resultMap).local().push_back(RPolygon(xl, yl, xh, yh, myr, myg, myb)); } parea -= (xh - xl + 1) * (yh - yl + 1); } } } } ApplySplitOverlayETS(ETS_Polygon_map_t *resultMap, Polygon_map_t *map1, Polygon_map_t *map2) : m_resultMap(resultMap), m_map1(map1), m_map2(map2) {} }; /*! * @brief intersects two maps strip-wise, accumulating into an ets variable * * @param[out] resultMap output map (must be allocated) * @param[in] polymap1 map to be intersected * @param[in] polymap2 map to be intersected */ void SplitParallelOverlayETS(ETS_Polygon_map_t **result_map, Polygon_map_t *polymap1, Polygon_map_t *polymap2) { int nthreads; bool automatic_threadcount = false; double domainSplitParallelTime; oneapi::tbb::tick_count t0, t1; if (gThreadsLow == THREADS_UNSET || gThreadsLow == utility::get_default_num_threads()) { gThreadsLow = gThreadsHigh = utility::get_default_num_threads(); automatic_threadcount = true; } *result_map = new ETS_Polygon_map_t; RPolygon *p0 = &((*polymap1)[0]); int mapxSize, mapySize, ignore1, ignore2; p0->get(&ignore1, &ignore2, &mapxSize, &mapySize); // (*result_map)->reserve(mapxSize*mapySize); // can't be any bigger than this int grain_size; #ifdef _DEBUG grain_size = gMapXSize / 4; #else grain_size = gGrainSize; #endif for (nthreads = gThreadsLow; nthreads <= gThreadsHigh; nthreads++) { oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, nthreads); if (gIsGraphicalVersion) { RPolygon *xp = new RPolygon(0, 0, gMapXSize - 1, gMapYSize - 1, 0, 0, 0); // Clear the output space 
delete xp; } // push the map size as the first polygon, // This polygon needs to be first, so we can push it at the start of a combine. // (*result_map)->local.push_back(RPolygon(0,0,mapxSize, mapySize)); t0 = oneapi::tbb::tick_count::now(); oneapi::tbb::parallel_for( blocked_range_with_maps(0, (int)(mapxSize + 1), grain_size, polymap1, polymap2), ApplySplitOverlayETS((*result_map), polymap1, polymap2)); t1 = oneapi::tbb::tick_count::now(); domainSplitParallelTime = (t1 - t0).seconds() * 1000; std::cout << "Splitting parallel with ETS and "; if (automatic_threadcount) std::cout << "automatic"; else std::cout << nthreads; std::cout << ((nthreads == 1) ? " thread" : " threads"); std::cout << " took " << domainSplitParallelTime << " msec : speedup over serial " << (gSerialTime / domainSplitParallelTime) << "\n"; if (gCsvFile.is_open()) { gCsvFile << "," << domainSplitParallelTime; } #if _DEBUG { Polygon_map_t s_result_map; oneapi::tbb::flattened2d psv = flatten2d(**result_map); s_result_map.push_back(RPolygon(0, 0, mapxSize, mapySize)); for (oneapi::tbb::flattened2d::const_iterator ci = psv.begin(); ci != psv.end(); ++ci) { s_result_map.push_back(*ci); } CheckPolygonMap(&s_result_map); ComparePolygonMaps(&s_result_map, gResultMap); } #endif (*result_map)->clear(); } if (gCsvFile.is_open()) { gCsvFile << "\n"; } } ================================================ FILE: third-party/tbb/examples/parallel_for/polygon_overlay/polyover.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ /*! * polyover.h : extern declarations for polyover.cpp */ #ifndef TBB_examples_polygon_overlay_polyover_H #define TBB_examples_polygon_overlay_polyover_H #include "oneapi/tbb/spin_mutex.h" #include "rpolygon.hpp" extern void OverlayOnePolygonWithMap(Polygon_map_t *resultMap, RPolygon *myPoly, Polygon_map_t *map2, oneapi::tbb::spin_mutex *rMutex); extern void SerialOverlayMaps(Polygon_map_t **resultMap, Polygon_map_t *map1, Polygon_map_t *map2); // extern void NaiveParallelOverlay(Polygon_map_t **result_map, Polygon_map_t *polymap1, Polygon_map_t *polymap2); extern void NaiveParallelOverlay(Polygon_map_t *&result_map, Polygon_map_t &polymap1, Polygon_map_t &polymap2); extern void SplitParallelOverlay(Polygon_map_t **result_map, Polygon_map_t *polymap1, Polygon_map_t *polymap2); extern void SplitParallelOverlayCV(concurrent_Polygon_map_t **result_map, Polygon_map_t *polymap1, Polygon_map_t *polymap2); extern void SplitParallelOverlayETS(ETS_Polygon_map_t **result_map, Polygon_map_t *polymap1, Polygon_map_t *polymap2); extern void CheckPolygonMap(Polygon_map_t *checkMap); extern bool ComparePolygonMaps(Polygon_map_t *map1, Polygon_map_t *map2); #endif /* TBB_examples_polygon_overlay_polyover_H */ ================================================ FILE: third-party/tbb/examples/parallel_for/polygon_overlay/pover_global.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ // // pover_global.h // #ifndef TBB_examples_polygon_overlay_pover_global_H #define TBB_examples_polygon_overlay_pover_global_H #ifdef _MAIN_C_ #define DEFINE // nothing #define STATIC static #define INIT(n) = n #else // not in main file #define DEFINE extern #define STATIC // nothing #define INIT(n) // nothing #endif // _MAIN_C_ #include #include #ifdef _WINDOWS #include #endif // this Polygon class only supports rectangles DEFINE int gDrawXOffset INIT(0); // used for drawing polygons DEFINE int gDrawYOffset INIT(0); DEFINE int gPolyXBoxSize INIT(0); // number of pixels orresponding to one "square" (x) DEFINE int gPolyYBoxSize INIT(0); // number of pixels orresponding to one "square" (y) DEFINE bool gDoDraw INIT(false); // render the boxes #define THREADS_UNSET 0 DEFINE int gThreadsLow INIT(THREADS_UNSET); DEFINE int gThreadsHigh INIT(THREADS_UNSET); DEFINE std::ofstream gCsvFile; DEFINE double gSerialTime; DEFINE char *gCsvFilename INIT(nullptr); #define BORDER_SIZE 10 // number of pixels between maps // The map size and the number of polygons depends on the version we are compiling. // If DEBUG then it is small; else it is large. 
#ifdef _DEBUG DEFINE int gNPolygons INIT(30); // default number of polygons in map DEFINE int gMapXSize INIT(30); DEFINE int gMapYSize INIT(30); DEFINE int gGrainSize INIT(5); #else DEFINE int gNPolygons INIT(50000); // default number of polygons in map DEFINE int gMapXSize INIT(1000); DEFINE int gMapYSize INIT(1000); DEFINE int gGrainSize INIT(20); #endif DEFINE int gMyRandomSeed INIT(2453185); DEFINE bool gIsGraphicalVersion INIT(false); typedef enum { NORTH_SIDE, EAST_SIDE, SOUTH_SIDE, WEST_SIDE } allSides; #if _DEBUG #define PRINT_DEBUG(x) (std::cout << x << "\n") #else #define PRINT_DEBUG(x) #endif #endif // TBB_examples_polygon_overlay_pover_global_H ================================================ FILE: third-party/tbb/examples/parallel_for/polygon_overlay/pover_video.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ // Support for GUI display for Polygon overlay demo #define VIDEO_WINMAIN_ARGS #include #include "oneapi/tbb/tick_count.h" #include "common/utility/get_default_num_threads.hpp" #include "polyover.hpp" #include "polymain.hpp" #include "pover_video.hpp" #ifndef _WIN32 #include #include void rt_sleep(int msec) { usleep(msec * 1000); } #else //_WIN32 #undef OLDUNIXTIME #undef STDTIME #include void rt_sleep(int msec) { Sleep(msec); } #endif /* _WIN32 */ bool g_next_frame() { if (++n_next_frame_calls >= frame_skips) { // the data race here is benign n_next_frame_calls = 0; return gVideo->next_frame(); } return gVideo->running; } bool g_last_frame() { if (n_next_frame_calls) return gVideo->next_frame(); return gVideo->running; } bool initializeVideo(int argc, char *argv[]) { //pover_video *l_video = new pover_video(); //gVideo = l_video; gVideo->init_console(); // don't check return code. gVideo->title = g_windowTitle; g_useGraphics = gVideo->init_window(g_xwinsize, g_ywinsize); return true; } void pover_video::on_process() { oneapi::tbb::tick_count t0, t1; double naiveParallelTime, domainSplitParallelTime; // create map1 These could be done in parallel, if the pseudorandom number generator were re-seeded. 
GenerateMap( &gPolymap1, gMapXSize, gMapYSize, gNPolygons, /*red*/ 255, /*green*/ 0, /*blue*/ 127); // create map2 GenerateMap( &gPolymap2, gMapXSize, gMapYSize, gNPolygons, /*red*/ 0, /*green*/ 255, /*blue*/ 127); // // Draw source maps gDrawXOffset = map1XLoc; gDrawYOffset = map1YLoc; for (int i = 0; i < int(gPolymap1->size()); i++) { (*gPolymap1)[i].drawPoly(); } gDrawXOffset = map2XLoc; gDrawYOffset = map2YLoc; for (int i = 0; i < int(gPolymap2->size()); i++) { (*gPolymap2)[i].drawPoly(); } gDoDraw = true; // run serial map generation gDrawXOffset = maprXLoc; gDrawYOffset = maprYLoc; { RPolygon *xp = new RPolygon(0, 0, gMapXSize - 1, gMapYSize - 1, 0, 0, 0); // Clear the output space delete xp; t0 = oneapi::tbb::tick_count::now(); SerialOverlayMaps(&gResultMap, gPolymap1, gPolymap2); t1 = oneapi::tbb::tick_count::now(); std::cout << "Serial overlay took " << (t1 - t0).seconds() * 1000 << " msec" << "\n"; gSerialTime = (t1 - t0).seconds() * 1000; #if _DEBUG CheckPolygonMap(gResultMap); // keep the map for comparison purposes. 
#else delete gResultMap; #endif if (gCsvFile.is_open()) { gCsvFile << "Serial Time," << gSerialTime << "\n"; gCsvFile << "Threads,"; if (gThreadsLow == THREADS_UNSET || gThreadsLow == utility::get_default_num_threads()) { gCsvFile << "Threads,Automatic"; } else { for (int i = gThreadsLow; i <= gThreadsHigh; i++) { gCsvFile << i; if (i < gThreadsHigh) gCsvFile << ","; } } gCsvFile << "\n"; } if (gIsGraphicalVersion) rt_sleep(2000); } // run naive parallel map generation { Polygon_map_t *resultMap; if (gCsvFile.is_open()) { gCsvFile << "Naive Time"; } NaiveParallelOverlay(resultMap, *gPolymap1, *gPolymap2); delete resultMap; if (gIsGraphicalVersion) rt_sleep(2000); } // run split map generation { Polygon_map_t *resultMap; if (gCsvFile.is_open()) { gCsvFile << "Split Time"; } SplitParallelOverlay(&resultMap, gPolymap1, gPolymap2); delete resultMap; if (gIsGraphicalVersion) rt_sleep(2000); } // split, accumulating into concurrent vector { concurrent_Polygon_map_t *cresultMap; if (gCsvFile.is_open()) { gCsvFile << "Split CV time"; } SplitParallelOverlayCV(&cresultMap, gPolymap1, gPolymap2); delete cresultMap; if (gIsGraphicalVersion) rt_sleep(2000); } // split, accumulating into ETS { ETS_Polygon_map_t *cresultMap; if (gCsvFile.is_open()) { gCsvFile << "Split ETS time"; } SplitParallelOverlayETS(&cresultMap, gPolymap1, gPolymap2); delete cresultMap; if (gIsGraphicalVersion) rt_sleep(2000); } if (gIsGraphicalVersion) rt_sleep(8000); delete gPolymap1; delete gPolymap2; #if _DEBUG delete gResultMap; #endif } ================================================ FILE: third-party/tbb/examples/parallel_for/polygon_overlay/pover_video.hpp ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

// support for GUI for polygon overlay demo
//
#ifndef TBB_examples_polygon_overlay_pover_video_H
#define TBB_examples_polygon_overlay_pover_video_H

#include "common/gui/video.hpp"
#include "pover_global.hpp" // for declaration of DEFINE and INIT

// Global video object used by the drawing code, plus the counters used by
// g_next_frame() to forward only every frame_skips-th call to next_frame().
DEFINE class video *gVideo INIT(nullptr);
DEFINE int n_next_frame_calls INIT(0);
DEFINE int frame_skips INIT(10);
extern bool g_next_frame();
extern bool g_last_frame();

// Video front end of the polygon overlay demo; on_process() runs the whole demo.
class pover_video : public video {
    void on_process();

public:
#ifdef _WINDOWS
    // True when a Windows instance handle is available.
    bool graphic_display() {
        return video::win_hInstance != (HINSTANCE) nullptr;
    }
#else
    bool graphic_display() {
        return true;
    } // fix this for Linux
#endif
    //void on_key(int key);
};

// Window size and the top-left offsets at which map 1, map 2 and the result map are drawn.
DEFINE int g_xwinsize INIT(1024);
DEFINE int g_ywinsize INIT(768);

DEFINE int map1XLoc INIT(10);
DEFINE int map1YLoc INIT(10);
DEFINE int map2XLoc INIT(270);
DEFINE int map2YLoc INIT(10);
DEFINE int maprXLoc INIT(530);
DEFINE int maprYLoc INIT(10);

DEFINE const char *g_windowTitle INIT("Polygon Overlay");
// Set by initializeVideo() to the result of init_window().
DEFINE bool g_useGraphics INIT(true);

extern bool initializeVideo(int argc, char *argv[]);

// Sleep for the given number of milliseconds.
extern void rt_sleep(int msec);

#endif // TBB_examples_polygon_overlay_pover_video_H

================================================
FILE: third-party/tbb/examples/parallel_for/polygon_overlay/rpolygon.hpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ // rpolygon.h // #ifndef TBB_examples_polygon_overlay_rpolygon_H #define TBB_examples_polygon_overlay_rpolygon_H #include #include #include "oneapi/tbb/scalable_allocator.h" #include "oneapi/tbb/concurrent_vector.h" #include "oneapi/tbb/enumerable_thread_specific.h" #include "pover_video.hpp" class RPolygon; typedef oneapi::tbb::scalable_allocator RPolygon_allocator; DEFINE RPolygon_allocator rAlloc; enum MallocBehavior { UseMalloc, UseScalableAllocator }; DEFINE MallocBehavior gMBehavior INIT(UseScalableAllocator); class RPolygon { public: RPolygon() { m_XMin = m_YMin = m_XMax = m_YMax = 0; m_r = m_g = m_b = 0; } RPolygon(int xMin, int yMin, int xMax, int yMax, int r = -1, int g = -1, int b = -1) : m_XMin(xMin), m_YMin(yMin), m_XMax(xMax), m_YMax(yMax) { if (r >= 0) { m_r = (colorcomp_t)r; m_g = (colorcomp_t)g; m_b = (colorcomp_t)b; if (gDoDraw) drawPoly(); } } void set_nodraw(int xMin, int yMin, int xMax, int yMax) { m_XMin = xMin; m_YMin = yMin; m_XMax = xMax; m_YMax = yMax; } RPolygon &intersect(RPolygon &otherPoly); void set(int xMin, int yMin, int xMax, int yMax) { set_nodraw(xMin, yMin, xMax, yMax); if (gDoDraw) { drawPoly(); } } void get(int *xMin, int *yMin, int *xMax, int *yMax) const { *xMin = m_XMin; *yMin = m_YMin; *xMax = m_XMax; *yMax = m_YMax; } int xmax() const { return m_XMax; } int xmin() const { return m_XMin; } int ymax() const { return m_YMax; } int ymin() const { return m_YMin; } void setColor(colorcomp_t newr, colorcomp_t newg, colorcomp_t newb) { m_r = newr; m_g = newg; m_b = newb; } void getColor(int *myr, int *myg, int *myb) { *myr = m_r; *myg = m_g; 
*myb = m_b; } color_t myColor() { return gVideo->get_color(m_r, m_g, m_b); } void drawPoly() { if (gVideo->running) { if (g_next_frame()) { // Shouldn't call next_frame each time drawing_area ldrawing(gDrawXOffset + m_XMin * gPolyXBoxSize, //x gDrawYOffset + m_YMin * gPolyYBoxSize, //y (m_XMax - m_XMin + 1) * gPolyXBoxSize, //sizex (m_YMax - m_YMin + 1) * gPolyYBoxSize); //sizey for (int y = 0; y < ldrawing.size_y; y++) { ldrawing.set_pos(0, y); color_t my_color = myColor(); for (int x = 0; x < ldrawing.size_x; x++) { ldrawing.put_pixel(my_color); } } } } } int area() { return ((m_XMax - m_XMin + 1) * (m_YMax - m_YMin + 1)); } void print(int i) { std::cout << "RPolygon " << i << " (" << m_XMin << ", " << m_YMin << ")-(" << m_XMax << ", " << m_YMax << ") " << "\n"; fflush(stdout); } private: int m_XMin; int m_YMin; int m_XMax; int m_YMax; colorcomp_t m_r; colorcomp_t m_g; colorcomp_t m_b; }; #if _MAIN_C_ bool operator<(const RPolygon &a, const RPolygon &b) { if (a.ymin() > b.ymin()) return false; if (a.ymin() < b.ymin()) return true; return a.xmin() < b.xmin(); } #else extern bool operator<(const RPolygon &a, const RPolygon &b); #endif extern std::ostream &operator<<(std::ostream &s, const RPolygon &p); class RPolygon_flagged { RPolygon *myPoly; bool is_duplicate; public: RPolygon_flagged() { myPoly = nullptr; is_duplicate = false; } RPolygon_flagged(RPolygon *_p, bool _is_duplicate) : myPoly(_p), is_duplicate(_is_duplicate) {} bool isDuplicate() { return is_duplicate; } void setDuplicate(bool newValue) { is_duplicate = newValue; } RPolygon *p() { return myPoly; } void setp(RPolygon *newp) { myPoly = newp; } }; typedef class std::vector Polygon_map_t; typedef class oneapi::tbb::concurrent_vector concurrent_Polygon_map_t; typedef class oneapi::tbb::enumerable_thread_specific ETS_Polygon_map_t; typedef class std::vector> Flagged_map_t; // we'll make shallow copies inline bool PolygonsOverlap(RPolygon *p1, RPolygon *p2, int &xl, int &yl, int &xh, int &yh) { int xl1, 
yl1, xh1, yh1, xl2, yl2, xh2, yh2; #if _DEBUG rt_sleep(1); // slow down the process so we can see it. #endif p1->get(&xl1, &yl1, &xh1, &yh1); p2->get(&xl2, &yl2, &xh2, &yh2); if (xl1 > xh2) return false; if (xh1 < xl2) return false; if (yl1 > yh2) return false; if (yh1 < yl2) return false; xl = (xl1 < xl2) ? xl2 : xl1; xh = (xh1 < xh2) ? xh1 : xh2; yl = (yl1 < yl2) ? yl2 : yl1; yh = (yh1 < yh2) ? yh1 : yh2; return true; } #endif // TBB_examples_polygon_overlay_rpolygon_H ================================================ FILE: third-party/tbb/examples/parallel_for/seismic/CMakeLists.txt ================================================ # Copyright (c) 2020-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
cmake_minimum_required(VERSION 3.5.0...3.31.3) project(seismic CXX) include(../../common/cmake/common.cmake) set_common_project_settings(tbb) add_executable(seismic universe.cpp seismic_video.cpp main.cpp) add_subdirectory(../../common/gui gui) target_link_libraries(seismic PUBLIC TBB::tbb Threads::Threads UI_LIB_seismic) target_compile_options(seismic PRIVATE ${TBB_CXX_STD_FLAG}) if (EXAMPLES_UI_MODE STREQUAL "con") target_compile_definitions(seismic PRIVATE _CONSOLE) endif() set(EXECUTABLE "$") set(ARGS auto 0) set(PERF_ARGS auto 10000 silent) set(LIGHT_ARGS 1:2 100) add_execution_target(run_seismic seismic ${EXECUTABLE} "${ARGS}") add_execution_target(perf_run_seismic seismic ${EXECUTABLE} "${PERF_ARGS}") add_execution_target(light_test_seismic seismic ${EXECUTABLE} "${LIGHT_ARGS}") ================================================ FILE: third-party/tbb/examples/parallel_for/seismic/README.md ================================================ # Seismic sample Parallel seismic wave simulation that demonstrates use of `parallel_for` and `affinity_partitioner`. ## Building the example ``` cmake [EXAMPLES_UI_MODE=value] cmake --build . ``` ### Predefined CMake variables * `EXAMPLES_UI_MODE` - defines the GUI mode, supported values are `gdi`, `d2d`, `con` on Windows, `x`,`con` on Linux and `mac`,`con` on macOS. The default mode is `con`. See the [common page](../../README.md) to get more information. ## Running the sample ### Predefined make targets * `make run_seismic` - executes the example with predefined parameters. * `make perf_run_seismic` - executes the example with suggested parameters to measure the oneTBB performance. * `make light_test_seismic` - executes the example with suggested parameters to reduce execution time. ### Application parameters Usage: ``` seismic [n-of-threads=value] [n-of-frames=value] [silent] [serial] [-h] [n-of-threads [n-of-frames]] ``` * `-h` - prints the help for command line options.
* `n-of-threads` - the number of threads to use; a range of the form low\[:high\], where low and optional high are non-negative integers or `auto` for a platform-specific default number. * `n-of-frames` - number of frames the example processes internally. * `silent` - no output except elapsed time. * `serial` - in GUI mode start with serial version of algorithm. ### Interactive graphical user interface The following hot keys can be used in interactive execution mode when the example is compiled with the graphical user interface: * `left mouse button` - starts new seismic wave in place specified by mouse cursor. * `space` - toggles between parallel and serial execution modes. * `p` - enables parallel execution mode. * `s` - enables serial execution mode. * `e` - enables screen updates. * `d` - disables screen updates (strongly recommended when measuring performance or scalability; see note below). * `esc` - stop execution. ================================================ FILE: third-party/tbb/examples/parallel_for/seismic/gui/resource.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ //{{NO_DEPENDENCIES}} // Microsoft Visual C++ generated include file. 
// Used by SeismicSimulation.rc // #define IDC_MYICON 2 #define IDD_SEISMICSIMULATION_DIALOG 102 #define IDS_APP_TITLE 103 #define IDD_ABOUTBOX 103 #define IDM_ABOUT 104 #define IDM_EXIT 105 #define IDI_SEISMICSIMULATION 107 #define IDI_SMALL 108 #define IDC_SEISMICSIMULATION 109 #define IDR_MAINFRAME 128 #define ID_FILE_PARALLEL 32771 #define ID_FILE_SERIAL 32772 #define IDM_PARALLEL 32773 #define ID_FILE_ENABLEGUI 32774 #define ID_FILE_DISABLEGUI 32775 #define IDC_STATIC -1 // Next default values for new objects // #ifdef APSTUDIO_INVOKED #ifndef APSTUDIO_READONLY_SYMBOLS #define _APS_NO_MFC 1 #define _APS_NEXT_RESOURCE_VALUE 129 #define _APS_NEXT_COMMAND_VALUE 32782 #define _APS_NEXT_CONTROL_VALUE 1000 #define _APS_NEXT_SYMED_VALUE 110 #endif #endif ================================================ FILE: third-party/tbb/examples/parallel_for/seismic/gui/seismic.rc ================================================ // Microsoft Visual C++ generated resource script. // #include "resource.h" #define APSTUDIO_READONLY_SYMBOLS ///////////////////////////////////////////////////////////////////////////// // // Generated from the TEXTINCLUDE 2 resource. // #define APSTUDIO_HIDDEN_SYMBOLS #include "windows.h" #undef APSTUDIO_HIDDEN_SYMBOLS ///////////////////////////////////////////////////////////////////////////// #undef APSTUDIO_READONLY_SYMBOLS ///////////////////////////////////////////////////////////////////////////// // English (U.S.) resources #if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU) #ifdef _WIN32 LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US #pragma code_page(1252) #endif //_WIN32 ///////////////////////////////////////////////////////////////////////////// // // Icon // // Icon with lowest ID value placed first to ensure application icon // remains consistent on all systems. 
IDI_SEISMICSIMULATION ICON "SeismicSimulation.ico" IDI_SMALL ICON "small.ico" ///////////////////////////////////////////////////////////////////////////// // // Menu // IDC_SEISMICSIMULATION MENU BEGIN POPUP "&File" BEGIN MENUITEM "&Parallel", ID_FILE_PARALLEL MENUITEM "&Serial", ID_FILE_SERIAL MENUITEM SEPARATOR MENUITEM "&Enable GUI", ID_FILE_ENABLEGUI MENUITEM "&Disable GUI", ID_FILE_DISABLEGUI MENUITEM SEPARATOR MENUITEM "E&xit", IDM_EXIT END POPUP "&Help" BEGIN MENUITEM "&About ...", IDM_ABOUT END END ///////////////////////////////////////////////////////////////////////////// // // Accelerator // IDC_SEISMICSIMULATION ACCELERATORS BEGIN VK_OEM_2, IDM_ABOUT, VIRTKEY, ALT, NOINVERT "P", ID_FILE_PARALLEL, VIRTKEY, ALT, NOINVERT "S", ID_FILE_SERIAL, VIRTKEY, ALT, NOINVERT "D", ID_FILE_DISABLEGUI, VIRTKEY, ALT, NOINVERT "E", ID_FILE_ENABLEGUI, VIRTKEY, ALT, NOINVERT END ///////////////////////////////////////////////////////////////////////////// // // Dialog // IDD_ABOUTBOX DIALOG 22, 17, 230, 75 STYLE DS_SETFONT | DS_MODALFRAME | WS_CAPTION | WS_SYSMENU CAPTION "About" FONT 8, "System" BEGIN ICON IDI_SEISMICSIMULATION,IDC_MYICON,14,9,16,16 LTEXT "SeismicSimulation Version 1.1",IDC_STATIC,49,10,119,8,SS_NOPREFIX LTEXT "Copyright (C) 2005-2008",IDC_STATIC,49,20,119,8 DEFPUSHBUTTON "OK",IDOK,195,6,30,11,WS_GROUP END #ifdef APSTUDIO_INVOKED ///////////////////////////////////////////////////////////////////////////// // // TEXTINCLUDE // 1 TEXTINCLUDE BEGIN "resource.h\0" END 2 TEXTINCLUDE BEGIN "#define APSTUDIO_HIDDEN_SYMBOLS\r\n" "#include ""windows.h""\r\n" "#undef APSTUDIO_HIDDEN_SYMBOLS\r\n" "\0" END 3 TEXTINCLUDE BEGIN "\r\n" "\0" END #endif // APSTUDIO_INVOKED ///////////////////////////////////////////////////////////////////////////// // // String Table // STRINGTABLE BEGIN IDS_APP_TITLE "SeismicSimulation" IDC_SEISMICSIMULATION "SEISMICSIMULATION" END #endif // English (U.S.) 
resources ///////////////////////////////////////////////////////////////////////////// #ifndef APSTUDIO_INVOKED ///////////////////////////////////////////////////////////////////////////// // // Generated from the TEXTINCLUDE 3 resource. // ///////////////////////////////////////////////////////////////////////////// #endif // not APSTUDIO_INVOKED ================================================ FILE: third-party/tbb/examples/parallel_for/seismic/main.cpp ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #define VIDEO_WINMAIN_ARGS #include #include "oneapi/tbb/tick_count.h" #include "oneapi/tbb/global_control.h" #include "common/utility/utility.hpp" #include "common/utility/get_default_num_threads.hpp" #include "seismic_video.hpp" #include "universe.hpp" Universe u; struct RunOptions { //! It is used for console mode for test with different number of threads and also has //! meaning for GUI: threads.first - use separate event/updating loop thread (>0) or not (0). //! 
threads.second - initialization value for scheduler utility::thread_number_range threads; int numberOfFrames; int numberOfIterations; bool silent; bool parallel; RunOptions(utility::thread_number_range threads_, int number_of_frames_, int number_of_iterations_, bool silent_, bool parallel_) : threads(threads_), numberOfFrames(number_of_frames_), numberOfIterations(number_of_iterations_), silent(silent_), parallel(parallel_) {} }; RunOptions ParseCommandLine(int argc, char *argv[]) { // zero number of threads means to run serial version utility::thread_number_range threads( utility::get_default_num_threads, 0, utility::get_default_num_threads()); int numberOfFrames = 0; int numberOfIterations = 0; bool silent = false; bool serial = false; utility::parse_cli_arguments( argc, argv, utility::cli_argument_pack() //"-h" option for displaying help is present implicitly .positional_arg(threads, "n-of-threads", utility::thread_number_range_desc) .positional_arg(numberOfFrames, "n-of-frames", "number of frames the example processes internally (0 means unlimited)") .positional_arg(numberOfIterations, "n-of-iterations", "number of iterations the example runs internally") .arg(silent, "silent", "no output except elapsed time") .arg(serial, "serial", "in GUI mode start with serial version of algorithm")); return RunOptions(threads, numberOfFrames, numberOfIterations, silent, !serial); } int main(int argc, char *argv[]) { oneapi::tbb::tick_count mainStartTime = oneapi::tbb::tick_count::now(); RunOptions options = ParseCommandLine(argc, argv); SeismicVideo video(u, options.numberOfFrames, options.threads.last, options.parallel); double rel_error; // video layer init if (video.init_window(u.UniverseWidth, u.UniverseHeight)) { video.calc_fps = true; video.threaded = options.threads.first > 0; // video is ok, init Universe u.InitializeUniverse(video); // main loop video.main_loop(); } else if (video.init_console()) { // do console mode if (options.numberOfFrames == 0) { 
options.numberOfFrames = 1000; std::cout << "Substituting 1000 for unlimited frames because not running interactively" << "\n"; } // TODO : Extend utility::cli_argument_pack() to allow specifying the default value. if (options.numberOfIterations <= 0) { options.numberOfIterations = 10; std::cout << "Setting the number of iterations = 10 default" << "\n"; } for (int p = options.threads.first; p <= options.threads.last; p = options.threads.step(p)) { oneapi::tbb::tick_count xwayParallelismStartTime = oneapi::tbb::tick_count::now(); u.InitializeUniverse(video); int numberOfFrames = options.numberOfFrames; assert(options.numberOfIterations > 0 && "Number of iterations cannot be <= 0"); unsigned numberOfIterations = unsigned(options.numberOfIterations); if (p == 0) { //run a serial version for (int i = 0; i < numberOfFrames; ++i) { u.SerialUpdateUniverse(); } } else { oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, p); utility::measurements mu(numberOfIterations); for (int iter = 0; iter < numberOfIterations; ++iter) { mu.start(); for (int i = 0; i < numberOfFrames; ++i) { u.ParallelUpdateUniverse(); } mu.stop(); } rel_error = mu.computeRelError(); } if (!options.silent) { double fps = options.numberOfFrames / ((oneapi::tbb::tick_count::now() - xwayParallelismStartTime).seconds()); std::cout << fps << " frame per sec with "; if (p == 0) { std::cout << "serial code" << "\n"; } else { std::cout << p << " way parallelism" << "\n"; } } } } video.terminate(); utility::report_elapsed_time((oneapi::tbb::tick_count::now() - mainStartTime).seconds()); utility::report_relative_error(rel_error); return 0; } ================================================ FILE: third-party/tbb/examples/parallel_for/seismic/resource.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ //{{NO_DEPENDENCIES}} // Microsoft Visual C++ generated include file. // Used by SeismicSimulation.rc // #define IDC_MYICON 2 #define IDD_SEISMICSIMULATION_DIALOG 102 #define IDS_APP_TITLE 103 #define IDD_ABOUTBOX 103 #define IDM_ABOUT 104 #define IDM_EXIT 105 #define IDI_SEISMICSIMULATION 107 #define IDI_SMALL 108 #define IDC_SEISMICSIMULATION 109 #define IDR_MAINFRAME 128 #define ID_FILE_PARALLEL 32771 #define ID_FILE_SERIAL 32772 #define IDM_PARALLEL 32773 #define ID_FILE_ENABLEGUI 32774 #define ID_FILE_DISABLEGUI 32775 #define IDC_STATIC -1 // Next default values for new objects // #ifdef APSTUDIO_INVOKED #ifndef APSTUDIO_READONLY_SYMBOLS #define _APS_NO_MFC 1 #define _APS_NEXT_RESOURCE_VALUE 129 #define _APS_NEXT_COMMAND_VALUE 32782 #define _APS_NEXT_CONTROL_VALUE 1000 #define _APS_NEXT_SYMED_VALUE 110 #endif #endif ================================================ FILE: third-party/tbb/examples/parallel_for/seismic/seismic_video.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "oneapi/tbb/global_control.h" #include "seismic_video.hpp" #include "universe.hpp" const char *const SeismicVideo::titles[2] = { "Seismic Simulation: Serial", "Seismic Simulation: Parallel" }; void SeismicVideo::on_mouse(int x, int y, int key) { if (key == 1) { u_.TryPutNewPulseSource(x, y); } } void SeismicVideo::on_key(int key) { key &= 0xff; if (char(key) == ' ') initIsParallel = !initIsParallel; else if (char(key) == 'p') initIsParallel = true; else if (char(key) == 's') initIsParallel = false; else if (char(key) == 'e') updating = true; else if (char(key) == 'd') updating = false; else if (key == 27) running = false; title = titles[initIsParallel ? 1 : 0]; } void SeismicVideo::on_process() { oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, threadsHigh); for (int frames = 0; numberOfFrames_ == 0 || frames < numberOfFrames_; ++frames) { if (initIsParallel) u_.ParallelUpdateUniverse(); else u_.SerialUpdateUniverse(); if (!next_frame()) break; } } #if defined(_WINDOWS) && !defined(_CONSOLE) #include "resource.hpp" LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam); SeismicVideo *gVideo = nullptr; #endif SeismicVideo::SeismicVideo(Universe &u, int number_of_frames, int threads_high, bool init_is_parallel) : numberOfFrames_(number_of_frames), initIsParallel(init_is_parallel), u_(u), threadsHigh(threads_high) { title = titles[initIsParallel ? 
1 : 0]; #if defined(_WINDOWS) && !defined(_CONSOLE) gVideo = this; LoadStringA(video::win_hInstance, IDC_SEISMICSIMULATION, szWindowClass, MAX_LOADSTRING); memset(&wcex, 0, sizeof(wcex)); wcex.lpfnWndProc = (WNDPROC)WndProc; wcex.hIcon = LoadIcon(video::win_hInstance, MAKEINTRESOURCE(IDI_SEISMICSIMULATION)); wcex.hCursor = LoadCursor(nullptr, IDC_ARROW); wcex.hbrBackground = (HBRUSH)(COLOR_WINDOW + 1); wcex.lpszMenuName = LPCTSTR(IDC_SEISMICSIMULATION); wcex.lpszClassName = szWindowClass; wcex.hIconSm = LoadIcon(video::win_hInstance, MAKEINTRESOURCE(IDI_SMALL)); win_set_class(wcex); // ascii convention here win_load_accelerators(IDC_SEISMICSIMULATION); #endif } #if defined(_WINDOWS) && !defined(_CONSOLE) // FUNCTION: WndProc(HWND, unsigned, WORD, LONG) // PURPOSE: Processes messages for the main window. // WM_COMMAND - process the application menu // WM_PAINT - Paint the main window // WM_DESTROY - post a quit message and return LRESULT CALLBACK About(HWND hDlg, UINT message, WPARAM wParam, LPARAM lParam) { switch (message) { case WM_INITDIALOG: return TRUE; case WM_COMMAND: if (LOWORD(wParam) == IDOK || LOWORD(wParam) == IDCANCEL) { EndDialog(hDlg, LOWORD(wParam)); return TRUE; } break; } return FALSE; } LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam) { int wmId; switch (message) { case WM_COMMAND: wmId = LOWORD(wParam); // Parse the menu selections: switch (wmId) { case IDM_ABOUT: DialogBox( video::win_hInstance, MAKEINTRESOURCE(IDD_ABOUTBOX), hWnd, (DLGPROC)About); break; case IDM_EXIT: PostQuitMessage(0); break; case ID_FILE_PARALLEL: gVideo->on_key('p'); break; case ID_FILE_SERIAL: gVideo->on_key('s'); break; case ID_FILE_ENABLEGUI: gVideo->on_key('e'); break; case ID_FILE_DISABLEGUI: gVideo->on_key('d'); break; default: return DefWindowProc(hWnd, message, wParam, lParam); } break; default: return DefWindowProc(hWnd, message, wParam, lParam); } return 0; } #endif ================================================ FILE: 
third-party/tbb/examples/parallel_for/seismic/seismic_video.hpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef TBB_examples_seismic_video_H
#define TBB_examples_seismic_video_H

#include "common/gui/video.hpp"

class Universe;

// Video front end of the seismic example: forwards mouse and key events to the
// Universe and runs the update loop in on_process().
class SeismicVideo : public video {
#if defined(_WINDOWS) && !defined(_CONSOLE)
#define MAX_LOADSTRING 100
    TCHAR szWindowClass[MAX_LOADSTRING]; // the main window class name
    WNDCLASSEX wcex;
#endif
    // Window titles, indexed by serial (0) / parallel (1) mode.
    static const char *const titles[2];

    // Current mode; despite the name it is also changed at run time by on_key().
    bool initIsParallel;

    Universe &u_;
    int numberOfFrames_; // 0 means forever, positive means number of frames, negative is undefined
    // Passed to global_control as max_allowed_parallelism in on_process().
    int threadsHigh;

private:
    void on_mouse(int x, int y, int key);
    void on_process();

    // on_key is public only in Windows GUI builds, where WndProc calls it directly.
#if defined(_WINDOWS) && !defined(_CONSOLE)
public:
#endif
    void on_key(int key);

public:
    SeismicVideo(Universe &u, int numberOfFrames, int threadsHigh, bool initIsParallel = true);
};
#endif /* TBB_examples_seismic_video_H */

================================================
FILE: third-party/tbb/examples/parallel_for/seismic/universe.cpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include #include "oneapi/tbb/blocked_range.h" #include "oneapi/tbb/parallel_for.h" #include "common/gui/video.hpp" #ifdef _MSC_VER // warning C4068: unknown pragma #pragma warning(disable : 4068) // warning C4351: new behavior: elements of array 'array' will be default initialized #pragma warning(disable : 4351) #endif #include "universe.hpp" const colorcomp_t MaterialColor[4][3] = { // BGR { 96, 0, 0 }, // WATER { 0, 48, 48 }, // SANDSTONE { 32, 32, 23 } // SHALE }; void Universe::InitializeUniverse(video const& colorizer) { pulseCounter = pulseTime = 100; pulseX = UniverseWidth / 3; pulseY = UniverseHeight / 4; // Initialize V, S, and T to slightly non-zero values, in order to avoid denormal waves. for (int i = 0; i < UniverseHeight; ++i) #pragma ivdep for (int j = 0; j < UniverseWidth; ++j) { T[i][j] = S[i][j] = V[i][j] = ValueType(1.0E-6); } for (int i = 1; i < UniverseHeight - 1; ++i) { for (int j = 1; j < UniverseWidth - 1; ++j) { float x = float(j - UniverseWidth / 2) / (UniverseWidth / 2); ValueType t = (ValueType)i / (ValueType)UniverseHeight; MaterialType m; D[i][j] = 1.0; // Coefficient values are fictitious, and chosen to visually exaggerate // physical effects such as Rayleigh waves. The fabs/exp line generates // a shale layer with a gentle upwards slope and an anticline. 
if (t < 0.3f) { m = WATER; M[i][j] = 0.125; L[i][j] = 0.125; } else if (fabs(t - 0.7 + 0.2 * exp(-8 * x * x) + 0.025 * x) <= 0.1) { m = SHALE; M[i][j] = 0.5; L[i][j] = 0.6; } else { m = SANDSTONE; M[i][j] = 0.3; L[i][j] = 0.4; } material[i][j] = m; } } ValueType scale = 2.0f / (ValueType)ColorMapSize; for (int k = 0; k < 4; ++k) { for (int i = 0; i < ColorMapSize; ++i) { colorcomp_t c[3]; ValueType t = (i - ColorMapSize / 2) * scale; ValueType r = t > 0 ? t : 0; ValueType b = t < 0 ? -t : 0; ValueType g = 0.5f * fabs(t); memcpy(c, MaterialColor[k], sizeof(c)); c[2] = colorcomp_t(r * (255 - c[2]) + c[2]); c[1] = colorcomp_t(g * (255 - c[1]) + c[1]); c[0] = colorcomp_t(b * (255 - c[0]) + c[0]); ColorMap[k][i] = colorizer.get_color(c[2], c[1], c[0]); } } // Set damping coefficients around border to reduce reflections from boundaries. ValueType d = 1.0; for (int k = DamperSize - 1; k > 0; --k) { d *= 1 - 1.0f / (DamperSize * DamperSize); for (int j = 1; j < UniverseWidth - 1; ++j) { D[k][j] *= d; D[UniverseHeight - 1 - k][j] *= d; } for (int i = 1; i < UniverseHeight - 1; ++i) { D[i][k] *= d; D[i][UniverseWidth - 1 - k] *= d; } } drawingMemory = colorizer.get_drawing_memory(); } void Universe::UpdatePulse() { if (pulseCounter > 0) { ValueType t = (pulseCounter - pulseTime / 2) * 0.05f; V[pulseY][pulseX] += 64 * sqrt(M[pulseY][pulseX]) * exp(-t * t); --pulseCounter; } } struct Universe::Rectangle { struct std::pair xRange; struct std::pair yRange; Rectangle(int startX, int startY, int width, int height) : xRange(startX, width), yRange(startY, height) {} int StartX() const { return xRange.first; } int StartY() const { return yRange.first; } int Width() const { return xRange.second; } int Height() const { return yRange.second; } int EndX() const { return xRange.first + xRange.second; } int EndY() const { return yRange.first + yRange.second; } }; void Universe::UpdateStress(Rectangle const& r) { drawing_area drawing(r.StartX(), r.StartY(), r.Width(), r.Height(), 
drawingMemory); for (int i = r.StartY(); i < r.EndY(); ++i) { drawing.set_pos(1, i - r.StartY()); #pragma ivdep for (int j = r.StartX(); j < r.EndX(); ++j) { S[i][j] += M[i][j] * (V[i][j + 1] - V[i][j]); T[i][j] += M[i][j] * (V[i + 1][j] - V[i][j]); int index = (int)(V[i][j] * (ColorMapSize / 2)) + ColorMapSize / 2; if (index < 0) index = 0; if (index >= ColorMapSize) index = ColorMapSize - 1; color_t* c = ColorMap[material[i][j]]; drawing.put_pixel(c[index]); } } } void Universe::SerialUpdateStress() { Rectangle area(0, 0, UniverseWidth - 1, UniverseHeight - 1); UpdateStress(area); } struct UpdateStressBody { Universe& u_; UpdateStressBody(Universe& u) : u_(u) {} void operator()(const oneapi::tbb::blocked_range& range) const { Universe::Rectangle area(0, range.begin(), u_.UniverseWidth - 1, range.size()); u_.UpdateStress(area); } }; void Universe::ParallelUpdateStress(oneapi::tbb::affinity_partitioner& affinity) { oneapi::tbb::parallel_for( oneapi::tbb::blocked_range(0, UniverseHeight - 1), // Index space for loop UpdateStressBody(*this), // Body of loop affinity); // Affinity hint } void Universe::UpdateVelocity(Rectangle const& r) { for (int i = r.StartY(); i < r.EndY(); ++i) #pragma ivdep for (int j = r.StartX(); j < r.EndX(); ++j) V[i][j] = D[i][j] * (V[i][j] + L[i][j] * (S[i][j] - S[i][j - 1] + T[i][j] - T[i - 1][j])); } void Universe::SerialUpdateVelocity() { UpdateVelocity(Rectangle(1, 1, UniverseWidth - 1, UniverseHeight - 1)); } struct UpdateVelocityBody { Universe& u_; UpdateVelocityBody(Universe& u) : u_(u) {} void operator()(const oneapi::tbb::blocked_range& y_range) const { u_.UpdateVelocity( Universe::Rectangle(1, y_range.begin(), u_.UniverseWidth - 1, y_range.size())); } }; void Universe::ParallelUpdateVelocity(oneapi::tbb::affinity_partitioner& affinity) { oneapi::tbb::parallel_for( oneapi::tbb::blocked_range(1, UniverseHeight), // Index space for loop UpdateVelocityBody(*this), // Body of loop affinity); // Affinity hint } void 
Universe::SerialUpdateUniverse() { UpdatePulse(); SerialUpdateStress(); SerialUpdateVelocity(); } void Universe::ParallelUpdateUniverse() { /** Affinity is an argument to parallel_for to hint that an iteration of a loop is best replayed on the same processor for each execution of the loop. It is a static object because it must remember where the iterations happened in previous executions. */ static oneapi::tbb::affinity_partitioner affinity; UpdatePulse(); ParallelUpdateStress(affinity); ParallelUpdateVelocity(affinity); } bool Universe::TryPutNewPulseSource(int x, int y) { if (pulseCounter == 0) { pulseCounter = pulseTime; pulseX = x; pulseY = y; return true; } return false; } void Universe::SetDrawingMemory(const drawing_memory& dmem) { drawingMemory = dmem; } ================================================ FILE: third-party/tbb/examples/parallel_for/seismic/universe.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef TBB_examples_seismic_universe_H #define TBB_examples_seismic_universe_H #ifndef UNIVERSE_WIDTH #define UNIVERSE_WIDTH 1024 #endif #ifndef UNIVERSE_HEIGHT #define UNIVERSE_HEIGHT 512 #endif #include "oneapi/tbb/partitioner.h" #include "common/gui/video.hpp" class Universe { public: enum { UniverseWidth = UNIVERSE_WIDTH, UniverseHeight = UNIVERSE_HEIGHT }; private: //in order to avoid performance degradation due to cache aliasing issue //some padding is needed after each row in array, and between array themselves. //the padding is achieved by adjusting number of rows and columns. //as the compiler is forced to place class members of the same clause in order of the //declaration this seems to be the right way of padding. //magic constants added below are chosen experimentally for 1024x512. enum { MaxWidth = UniverseWidth + 1, MaxHeight = UniverseHeight + 3 }; typedef float ValueType; //! Horizontal stress ValueType S[MaxHeight][MaxWidth]; //! Velocity at each grid point ValueType V[MaxHeight][MaxWidth]; //! Vertical stress ValueType T[MaxHeight][MaxWidth]; //! Coefficient related to modulus ValueType M[MaxHeight][MaxWidth]; //! Damping coefficients ValueType D[MaxHeight][MaxWidth]; //! Coefficient related to lightness ValueType L[MaxHeight][MaxWidth]; enum { ColorMapSize = 1024 }; color_t ColorMap[4][ColorMapSize]; enum MaterialType { WATER = 0, SANDSTONE = 1, SHALE = 2 }; //! Values are MaterialType, cast to an unsigned char to save space. 
unsigned char material[MaxHeight][MaxWidth]; private: enum { DamperSize = 32 }; int pulseTime; int pulseCounter; int pulseX; int pulseY; drawing_memory drawingMemory; public: void InitializeUniverse(video const& colorizer); void SerialUpdateUniverse(); void ParallelUpdateUniverse(); bool TryPutNewPulseSource(int x, int y); void SetDrawingMemory(const drawing_memory& dmem); private: struct Rectangle; void UpdatePulse(); void UpdateStress(Rectangle const& r); void SerialUpdateStress(); friend struct UpdateStressBody; friend struct UpdateVelocityBody; void ParallelUpdateStress(oneapi::tbb::affinity_partitioner& affinity); void UpdateVelocity(Rectangle const& r); void SerialUpdateVelocity(); void ParallelUpdateVelocity(oneapi::tbb::affinity_partitioner& affinity); }; #endif /* TBB_examples_seismic_universe_H */ ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/CMakeLists.txt ================================================ # Copyright (c) 2020-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
cmake_minimum_required(VERSION 3.5.0...3.31.3) project(tachyon CXX) include(../../common/cmake/common.cmake) set_common_project_settings(tbb) set(TACHYON_VERSION tbb CACHE STRING "Defines the version of the tachyon example") add_executable( tachyon src/trace.${TACHYON_VERSION}.cpp src/main.cpp src/pthread.cpp src/tachyon_video.cpp src/api.cpp src/apigeom.cpp src/apitrigeom.cpp src/bndbox.cpp src/box.cpp src/camera.cpp src/coordsys.cpp src/cylinder.cpp src/extvol.cpp src/global.cpp src/grid.cpp src/imageio.cpp src/imap.cpp src/intersect.cpp src/light.cpp src/objbound.cpp src/parse.cpp src/plane.cpp src/ppm.cpp src/quadric.cpp src/render.cpp src/ring.cpp src/shade.cpp src/sphere.cpp src/texture.cpp src/tgafile.cpp src/trace_rest.cpp src/triangle.cpp src/ui.cpp src/util.cpp src/vector.cpp src/vol.cpp ) add_subdirectory(../../common/gui gui) target_link_libraries(tachyon TBB::tbb Threads::Threads UI_LIB_tachyon) target_compile_options(tachyon PRIVATE ${TBB_CXX_STD_FLAG}) if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL IntelLLVM) target_compile_options(tachyon PRIVATE -D_CRT_SECURE_NO_WARNINGS) endif() set(EXECUTABLE "$<TARGET_FILE:tachyon>") set(ARGS ${CMAKE_CURRENT_SOURCE_DIR}/dat/balls.dat) set(PERF_ARGS silent ${CMAKE_CURRENT_SOURCE_DIR}/dat/balls3.dat) set(LIGHT_ARGS ${CMAKE_CURRENT_SOURCE_DIR}/dat/model2.dat) add_execution_target(run_tachyon tachyon ${EXECUTABLE} "${ARGS}") add_execution_target(perf_run_tachyon tachyon ${EXECUTABLE} "${PERF_ARGS}") add_execution_target(light_test_tachyon tachyon ${EXECUTABLE} "${LIGHT_ARGS}") ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/README.md ================================================ # Tachyon sample Parallel raytracer / renderer that demonstrates the use of parallel_for. *This example includes software developed by John E. Stone.* This example is a 2-D raytracer/renderer that visually shows different parallel scheduling methods and their resulting speedup.
The code was parallelized by speculating that each pixel could be rendered in parallel. The resulting parallel code was then checked for correctness by using Intel® Thread Checker, which pointed out where synchronization was needed. Minimal synchronization was then inserted into the parallel code. The resulting parallel code exhibits good speedup. ## Building the example ``` cmake [EXAMPLES_UI_MODE=value] <path_to_example> cmake --build . ``` ### Predefined CMake variables * `EXAMPLES_UI_MODE` - defines the GUI mode, supported values are `gdi`, `d2d`, `con` on Windows, `x`,`con` on Linux and `mac`,`con` on macOS. The default mode is `con`. See the [common page](../../README.md) to get more information. * `TACHYON_VERSION` - this example contains several versions that may be selected via the `TACHYON_VERSION` CMake variable. * **serial** - Original sequential version. * **tbb1d** - Parallel version that uses oneAPI Threading Building Blocks (oneTBB) and `blocked_range` to parallelize over tasks that are groups of scan-lines. * **tbb** (Default) - Parallel version that uses oneTBB and `blocked_range2d` to parallelize over tasks that are rectangular sub-areas. ## Running the sample ### Predefined make targets * `make run_tachyon` - executes the example with predefined parameters. * `make perf_run_tachyon` - executes the example with suggested parameters to measure the oneTBB performance. * `make light_test_tachyon` - executes the example with suggested parameters to reduce execution time. ### Application parameters Usage: ``` tachyon [dataset=value] [boundthresh=value] [no-display-updating] [no-bounding] [silent] [-h] [dataset [boundthresh]] ``` * `-h` - Prints the help for command line options. * `dataset` - path/name of one of the *.dat files in the dat directory for the example. * `boundthresh` - bounding threshold value. * `no-display-updating` - disable run-time display updating. * `no-bounding` - disable bounding technique.
### Environment variables The `tbb` and `tbb1d` versions of the example have the following settings that can be controlled through environment variables: * By default, these versions use one thread per available processor. To change this default, set the `TBB_NUM_THREADS` environment variable to the desired number of threads before running. * These versions use `auto_partitioner` by default. To change this default, set the `TBB_PARTITIONER` environment variable to the `aff` value to use `affinity_partitioner` and to `simp` to use `simple_partitioner`. * These versions use a reasonable task grain size by default. To change this default, set the `TBB_GRAINSIZE` environment variable to the desired grain size before running. The grain size corresponds to the number of pixels (in the `X` or `Y` direction, for a rectangular sub-area) in each parallel task. ### Interactive graphical user interface The following hot keys can be used in interactive execution mode when the example is compiled with the graphical user interface: * `any key` - enable repetition of rendering after the pause. Press ESC to stop the application. * `space` - toggle run-time display updating mode while rendering (see no-display-updating above). * `p` - holds the picture after rendering completion. Press 'p' again to continue. * `esc` - stop execution. ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/gui/resource.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License. */ #define IDC_MYICON 2 #define IDD_GUI 102 #define IDS_APP_TITLE 103 #define IDI_GUI 107 #define IDI_SMALL 108 #define IDC_GUI 109 #define IDR_MAINFRAME 128 #define IDC_STATIC -1 ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/gui/tachyon.rc ================================================ // Microsoft Visual C++ generated resource script. // #include "resource.h" #define APSTUDIO_READONLY_SYMBOLS ///////////////////////////////////////////////////////////////////////////// // // Generated from the TEXTINCLUDE 2 resource. // #define APSTUDIO_HIDDEN_SYMBOLS #include "windows.h" #undef APSTUDIO_HIDDEN_SYMBOLS ///////////////////////////////////////////////////////////////////////////// #undef APSTUDIO_READONLY_SYMBOLS ///////////////////////////////////////////////////////////////////////////// // English (U.S.) resources #if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU) #ifdef _WIN32 LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US #pragma code_page(1252) #endif //_WIN32 ///////////////////////////////////////////////////////////////////////////// // // Icon // // Icon with lowest ID value placed first to ensure application icon // remains consistent on all systems. IDI_GUI ICON "gui.ico" IDI_SMALL ICON "small.ico" #ifdef APSTUDIO_INVOKED ///////////////////////////////////////////////////////////////////////////// // // TEXTINCLUDE // 1 TEXTINCLUDE BEGIN "resource.h\0" END 2 TEXTINCLUDE BEGIN "#define APSTUDIO_HIDDEN_SYMBOLS\r\n" "#include ""windows.h""\r\n" "#undef APSTUDIO_HIDDEN_SYMBOLS\r\n" "\0" END 3 TEXTINCLUDE BEGIN "\r\n" "\0" END #endif // APSTUDIO_INVOKED ///////////////////////////////////////////////////////////////////////////// // // String Table // STRINGTABLE BEGIN IDS_APP_TITLE "gui" IDC_GUI "GUI" END #endif // English (U.S.) 
resources ///////////////////////////////////////////////////////////////////////////// #ifndef APSTUDIO_INVOKED ///////////////////////////////////////////////////////////////////////////// // // Generated from the TEXTINCLUDE 3 resource. // ///////////////////////////////////////////////////////////////////////////// #endif // not APSTUDIO_INVOKED ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/api.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * api.cpp - This file contains all of the API calls that are defined for * external driver code to use. */ #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "box.hpp" #include "cylinder.hpp" #include "plane.hpp" #include "quadric.hpp" #include "ring.hpp" #include "sphere.hpp" #include "triangle.hpp" #include "vol.hpp" #include "extvol.hpp" #include "texture.hpp" #include "light.hpp" #include "render.hpp" #include "camera.hpp" #include "vector.hpp" #include "intersect.hpp" #include "shade.hpp" #include "util.hpp" #include "imap.hpp" #include "global.hpp" #include "tachyon_video.hpp" typedef void *SceneHandle; #include "api.hpp" vector rt_vector(apiflt x, apiflt y, apiflt z) { vector v; v.x = x; v.y = y; v.z = z; return v; } color rt_color(apiflt r, apiflt g, apiflt b) { color c; c.r = r; c.g = g; c.b = b; return c; } void rt_initialize() { rpcmsg msg; reset_object(); reset_lights(); InitTextures(); if (!parinitted) { parinitted = 1; msg.type = 1; /* setup a ping message */ } } void rt_renderscene(SceneHandle voidscene) { scenedef *scene = (scenedef *)voidscene; renderscene(*scene); } void rt_camerasetup(SceneHandle voidscene, apiflt zoom, apiflt aspectratio, int antialiasing, int raydepth, vector camcent, vector viewvec, vector upvec) { scenedef 
*scene = (scenedef *)voidscene; vector newupvec; vector newviewvec; vector newrightvec; VCross((vector *)&upvec, &viewvec, &newrightvec); VNorm(&newrightvec); VCross((vector *)&viewvec, &newrightvec, &newupvec); VNorm(&newupvec); newviewvec = viewvec; VNorm(&newviewvec); scene->camzoom = zoom; scene->aspectratio = aspectratio; scene->antialiasing = antialiasing; scene->raydepth = raydepth; scene->camcent = camcent; scene->camviewvec = newviewvec; scene->camrightvec = newrightvec; scene->camupvec = newupvec; } void rt_outputfile(SceneHandle voidscene, const char *outname) { scenedef *scene = (scenedef *)voidscene; strcpy((char *)&scene->outfilename, outname); } void rt_resolution(SceneHandle voidscene, int hres, int vres) { scenedef *scene = (scenedef *)voidscene; scene->hres = hres; scene->vres = vres; } void rt_verbose(SceneHandle voidscene, int v) { scenedef *scene = (scenedef *)voidscene; scene->verbosemode = v; } void rt_rawimage(SceneHandle voidscene, unsigned char *rawimage) { scenedef *scene = (scenedef *)voidscene; scene->rawimage = rawimage; } void rt_background(SceneHandle voidscene, color col) { scenedef *scene = (scenedef *)voidscene; scene->background.r = col.r; scene->background.g = col.g; scene->background.b = col.b; } void rt_boundmode(SceneHandle voidscene, int mode) { scenedef *scene = (scenedef *)voidscene; scene->boundmode = mode; } void rt_boundthresh(SceneHandle voidscene, int threshold) { scenedef *scene = (scenedef *)voidscene; if (threshold > 1) { scene->boundthresh = threshold; } else { rtmesg("Ignoring out-of-range automatic bounding threshold.\n"); rtmesg("Automatic bounding threshold reset to default.\n"); scene->boundthresh = MAXOCTNODES; } } void rt_displaymode(SceneHandle voidscene, int mode) { scenedef *scene = (scenedef *)voidscene; scene->displaymode = mode; } void rt_scenesetup(SceneHandle voidscene, char *outname, int hres, int vres, int verbose) { rt_outputfile(voidscene, outname); rt_resolution(voidscene, hres, vres); 
rt_verbose(voidscene, verbose); } SceneHandle rt_newscene(void) { scenedef *scene; SceneHandle voidscene; scene = (scenedef *)malloc(sizeof(scenedef)); memset(scene, 0, sizeof(scenedef)); /* clear all valuas to 0 */ voidscene = (SceneHandle)scene; rt_outputfile(voidscene, "/dev/null"); /* default output file (.tga) */ rt_resolution(voidscene, 512, 512); /* 512x512 resolution */ rt_verbose(voidscene, 0); /* verbose messages off */ rt_rawimage(voidscene, nullptr); /* raw image output off */ rt_boundmode(voidscene, RT_BOUNDING_ENABLED); /* spatial subdivision on */ rt_boundthresh(voidscene, MAXOCTNODES); /* default threshold */ rt_displaymode(voidscene, RT_DISPLAY_ENABLED); /* video output on */ rt_camerasetup(voidscene, 1.0, 1.0, 0, 6, rt_vector(0.0, 0.0, 0.0), rt_vector(0.0, 0.0, 1.0), rt_vector(0.0, 1.0, 0.0)); return scene; } void rt_deletescene(SceneHandle scene) { if (scene != nullptr) free(scene); } void apitextotex(apitexture *apitex, texture *tex) { switch (apitex->texturefunc) { case 0: tex->texfunc = (color(*)(void *, void *, void *))(standard_texture); break; case 1: tex->texfunc = (color(*)(void *, void *, void *))(checker_texture); break; case 2: tex->texfunc = (color(*)(void *, void *, void *))(grit_texture); break; case 3: tex->texfunc = (color(*)(void *, void *, void *))(marble_texture); break; case 4: tex->texfunc = (color(*)(void *, void *, void *))(wood_texture); break; case 5: tex->texfunc = (color(*)(void *, void *, void *))(gnoise_texture); break; case 6: tex->texfunc = (color(*)(void *, void *, void *))(cyl_checker_texture); break; case 7: tex->texfunc = (color(*)(void *, void *, void *))(image_sphere_texture); tex->img = AllocateImage((char *)apitex->imap); break; case 8: tex->texfunc = (color(*)(void *, void *, void *))(image_cyl_texture); tex->img = AllocateImage((char *)apitex->imap); break; case 9: tex->texfunc = (color(*)(void *, void *, void *))(image_plane_texture); tex->img = AllocateImage((char *)apitex->imap); break; default: 
tex->texfunc = (color(*)(void *, void *, void *))(standard_texture); break; } tex->ctr = apitex->ctr; tex->rot = apitex->rot; tex->scale = apitex->scale; tex->uaxs = apitex->uaxs; tex->vaxs = apitex->vaxs; tex->ambient = apitex->ambient; tex->diffuse = apitex->diffuse; tex->specular = apitex->specular; tex->opacity = apitex->opacity; tex->col = apitex->col; tex->islight = 0; tex->shadowcast = 1; tex->phong = 0.0; tex->phongexp = 0.0; tex->phongtype = 0; } void *rt_texture(apitexture *apitex) { texture *tex; tex = (texture *)rt_getmem(sizeof(texture)); apitextotex(apitex, tex); return (tex); } void rt_tex_color(void *voidtex, color col) { texture *tex = (texture *)voidtex; tex->col = col; } void rt_tex_phong(void *voidtex, apiflt phong, apiflt phongexp, int type) { texture *tex = (texture *)voidtex; tex->phong = phong; tex->phongexp = phongexp; tex->phongtype = type; } void rt_light(void *tex, vector ctr, apiflt rad) { point_light *li; li = newlight(tex, (vector)ctr, rad); li->tex->islight = 1; li->tex->shadowcast = 1; li->tex->diffuse = 0.0; li->tex->specular = 0.0; li->tex->opacity = 1.0; add_light(li); add_object((object *)li); } void rt_scalarvol(void *tex, vector min, vector max, int xs, int ys, int zs, char *fname, void *invol) { add_object((object *)newscalarvol( tex, (vector)min, (vector)max, xs, ys, zs, fname, (scalarvol *)invol)); } void rt_extvol(void *tex, vector min, vector max, int samples, flt (*evaluator)(flt, flt, flt)) { add_object((object *)newextvol(tex, (vector)min, (vector)max, samples, evaluator)); } void rt_box(void *tex, vector min, vector max) { add_object((object *)newbox(tex, (vector)min, (vector)max)); } void rt_cylinder(void *tex, vector ctr, vector axis, apiflt rad) { add_object(newcylinder(tex, (vector)ctr, (vector)axis, rad)); } void rt_fcylinder(void *tex, vector ctr, vector axis, apiflt rad) { add_object(newfcylinder(tex, (vector)ctr, (vector)axis, rad)); } void rt_plane(void *tex, vector ctr, vector norm) { 
add_object(newplane(tex, (vector)ctr, (vector)norm)); } void rt_ring(void *tex, vector ctr, vector norm, apiflt a, apiflt b) { add_object(newring(tex, (vector)ctr, (vector)norm, a, b)); } void rt_sphere(void *tex, vector ctr, apiflt rad) { add_object(newsphere(tex, (vector)ctr, rad)); } void rt_tri(void *tex, vector v0, vector v1, vector v2) { object *trn; trn = newtri(tex, (vector)v0, (vector)v1, (vector)v2); if (trn != nullptr) { add_object(trn); } } void rt_stri(void *tex, vector v0, vector v1, vector v2, vector n0, vector n1, vector n2) { object *trn; trn = newstri(tex, (vector)v0, (vector)v1, (vector)v2, (vector)n0, (vector)n1, (vector)n2); if (trn != nullptr) { add_object(trn); } } void rt_quadsphere(void *tex, vector ctr, apiflt rad) { quadric *q; flt factor; q = (quadric *)newquadric(); factor = 1.0 / (rad * rad); q->tex = (texture *)tex; q->ctr = ctr; q->mat.a = factor; q->mat.b = 0.0; q->mat.c = 0.0; q->mat.d = 0.0; q->mat.e = factor; q->mat.f = 0.0; q->mat.g = 0.0; q->mat.h = factor; q->mat.i = 0.0; q->mat.j = -1.0; add_object((object *)q); } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/api.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /***************************************************************************** * api.h - The declarations and prototypes needed so that 3rd party driver * * code can run the raytracer. Third party driver code should * * only use the functions in this header file to interface with * * the rendering engine. 
* *************************************************************************** */ /* * $Id: api.h,v 1.2 2007-02-22 17:54:15 Exp $ */ /********************************************/ /* Types defined for use with the API calls */ /********************************************/ #ifdef USESINGLEFLT typedef float apiflt; /* generic floating point number */ #else typedef double apiflt; /* generic floating point number */ #endif typedef void *SceneHandle; typedef struct { int texturefunc; /* which texture function to use (see the table below) */ color col; /* base object color */ int shadowcast; /* does the object cast a shadow; apitextotex() in api.cpp does not read this field and always sets the texture's shadowcast to 1 */ apiflt ambient; /* ambient lighting */ apiflt diffuse; /* diffuse reflection */ apiflt specular; /* specular reflection */ apiflt opacity; /* how opaque the object is */ vector ctr; /* origin of texture */ vector rot; /* rotation of texture around origin */ vector scale; /* scale of texture in x,y,z */ vector uaxs; /* planar map u axis */ vector vaxs; /* planar map v axis */ char imap[96]; /* name of image map */ } apitexture; /******************************************************************* * NOTE: The value passed in apitexture.texturefunc corresponds to * the meanings given in this table (kept in step with the switch * in apitextotex() in api.cpp; any other value falls back to 0): * * 0 - No texture function is applied other than standard lighting. * 1 - 3D checkerboard texture. Red & Blue checkers through 3d space. * 2 - Grit texture, roughens up the surface of the object a bit. * 3 - 3D marble texture. Makes a 3D swirl pattern through the object. * 4 - 3D wood texture. Makes a 3D wood pattern through the object. * 5 - 3D gradient noise function. * 6 - Cylindrical checker texture. * 7 - Spherical Image Map **** IMAGE MAPS REQUIRE the filename * 8 - Cylindrical Image Map of the image be put in imap[] * 9 - Planar Image Map part of the texture... * planar requires uaxs, and vaxs..
* *******************************************************************/ /********************************************/ /* Functions implemented to provide the API */ /********************************************/ vector rt_vector(apiflt x, apiflt y, apiflt z); /* helper to make vectors */ color rt_color(apiflt r, apiflt g, apiflt b); /* helper to make colors */ void rt_initialize(); /* reset raytracer, memory deallocation */ void rt_finalize(void); /* close down for good.. */ SceneHandle rt_newscene(void); /* allocate new scene */ void rt_deletescene(SceneHandle); /* delete a scene */ void rt_renderscene(SceneHandle); /* raytrace the current scene */ void rt_outputfile(SceneHandle, const char *outname); void rt_resolution(SceneHandle, int hres, int vres); void rt_verbose(SceneHandle, int v); void rt_rawimage(SceneHandle, unsigned char *rawimage); void rt_background(SceneHandle, color); /* Parameter values for rt_boundmode() */ #define RT_BOUNDING_DISABLED 0 #define RT_BOUNDING_ENABLED 1 void rt_boundmode(SceneHandle, int); void rt_boundthresh(SceneHandle, int); /* Parameter values for rt_displaymode() */ #define RT_DISPLAY_DISABLED 0 #define RT_DISPLAY_ENABLED 1 void rt_displaymode(SceneHandle, int); void rt_scenesetup(SceneHandle, char *, int, int, int); /* scene, output filename, horizontal resolution, vertical resolution, verbose mode */ void rt_camerasetup(SceneHandle, apiflt, apiflt, int, int, vector, vector, vector); /* camera parms: scene, zoom, aspectratio, antialiasing, raydepth, camera center, view direction, up direction */ void *rt_texture(apitexture *); /* pointer to the texture struct that would have been passed to each object() call in older revisions.. 
*/ void rt_light(void *, vector, apiflt); /* add a light */ /* light parms: texture, center, radius */ void rt_sphere(void *, vector, apiflt); /* add a sphere */ /* sphere parms: texture, center, radius */ void rt_scalarvol(void *, vector, vector, int, int, int, char *, void *); void rt_extvol(void *, vector, vector, int, apiflt (*evaluator)(apiflt, apiflt, apiflt)); void rt_box(void *, vector, vector); /* box parms: texture, min, max */ void rt_plane(void *, vector, vector); /* plane parms: texture, center, normal */ void rt_ring(void *, vector, vector, apiflt, apiflt); /* ring parms: texture, center, normal, inner, outer */ void rt_tri(void *, vector, vector, vector); /* tri parms: texture, vertex 0, vertex 1, vertex 2 */ void rt_stri(void *, vector, vector, vector, vector, vector, vector); /* stri parms: texture, vertex 0, vertex 1, vertex 2, norm 0, norm 1, norm 2 */ void rt_heightfield(void *, vector, int, int, apiflt *, apiflt, apiflt); /* field parms: texture, center, m, n, field, wx, wy */ void rt_landscape(void *, int, int, vector, apiflt, apiflt); void rt_quadsphere(void *, vector, apiflt); /* add quadric sphere */ /* sphere parms: texture, center, radius */ void rt_cylinder(void *, vector, vector, apiflt); void rt_fcylinder(void *, vector, vector, apiflt); void rt_polycylinder(void *, vector *, int, apiflt); /* new texture handling routines */ void rt_tex_color(void *voidtex, color col); #define RT_PHONG_PLASTIC 0 #define RT_PHONG_METAL 1 void rt_tex_phong(void *voidtex, apiflt phong, apiflt phongexp, int type); ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/apigeom.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * api.cpp - This file contains all of the API calls that are defined for * external driver code to use. 
*/ #include "machine.hpp" #include "types.hpp" #include "api.hpp" #include "macros.hpp" #include "vector.hpp" #define MyVNorm(a) VNorm((vector *)a) void rt_polycylinder(void *tex, vector *points, int numpts, apiflt rad) { vector a; int i; if ((points == nullptr) || (numpts == 0)) { return; } if (numpts > 0) { rt_sphere(tex, points[0], rad); if (numpts > 1) { for (i = 1; i < numpts; i++) { a.x = points[i].x - points[i - 1].x; a.y = points[i].y - points[i - 1].y; a.z = points[i].z - points[i - 1].z; rt_fcylinder(tex, points[i - 1], a, rad); rt_sphere(tex, points[i], rad); } } } } void rt_heightfield(void *tex, vector ctr, int m, int n, apiflt *field, apiflt wx, apiflt wy) { int xx, yy; vector v0, v1, v2; apiflt xoff, yoff, zoff; xoff = ctr.x - (wx / 2.0); yoff = ctr.z - (wy / 2.0); zoff = ctr.y; for (yy = 0; yy < (n - 1); yy++) { for (xx = 0; xx < (m - 1); xx++) { v0.x = wx * (xx) / (m * 1.0) + xoff; v0.y = field[(yy)*m + (xx)] + zoff; v0.z = wy * (yy) / (n * 1.0) + yoff; v1.x = wx * (xx + 1) / (m * 1.0) + xoff; v1.y = field[(yy)*m + (xx + 1)] + zoff; v1.z = wy * (yy) / (n * 1.0) + yoff; v2.x = wx * (xx + 1) / (m * 1.0) + xoff; v2.y = field[(yy + 1) * m + (xx + 1)] + zoff; v2.z = wy * (yy + 1) / (n * 1.0) + yoff; rt_tri(tex, v1, v0, v2); v0.x = wx * (xx) / (m * 1.0) + xoff; v0.y = field[(yy)*m + (xx)] + zoff; v0.z = wy * (yy) / (n * 1.0) + yoff; v1.x = wx * (xx) / (m * 1.0) + xoff; v1.y = field[(yy + 1) * m + (xx)] + zoff; v1.z = wy * (yy + 1) / (n * 1.0) + yoff; v2.x = wx * (xx + 1) / (m * 1.0) + xoff; v2.y = field[(yy + 1) * m + (xx + 1)] + zoff; v2.z = wy * (yy + 1) / (n * 1.0) + yoff; rt_tri(tex, v0, v1, v2); } } } /* end of heightfield */ static void rt_sheightfield(void *tex, vector ctr, int m, int n, apiflt *field, apiflt wx, apiflt wy) { vector *vertices; vector *normals; vector offset; apiflt xinc, yinc; int x, y, addr; vertices = (vector *)malloc(m * n * sizeof(vector)); normals = (vector *)malloc(m * n * sizeof(vector)); offset.x = ctr.x - (wx / 2.0); 
offset.y = ctr.z - (wy / 2.0); offset.z = ctr.y; xinc = wx / ((apiflt)m); yinc = wy / ((apiflt)n); /* build vertex list */ for (y = 0; y < n; y++) { for (x = 0; x < m; x++) { addr = y * m + x; vertices[addr] = rt_vector(x * xinc + offset.x, field[addr] + offset.z, y * yinc + offset.y); } } /* build normals from vertex list */ for (x = 1; x < m; x++) { normals[x] = normals[(n - 1) * m + x] = rt_vector(0.0, 1.0, 0.0); } for (y = 1; y < n; y++) { normals[y * m] = normals[y * m + (m - 1)] = rt_vector(0.0, 1.0, 0.0); } for (y = 1; y < (n - 1); y++) { for (x = 1; x < (m - 1); x++) { addr = y * m + x; normals[addr] = rt_vector(-(field[addr + 1] - field[addr - 1]) / (2.0 * xinc), 1.0, -(field[addr + m] - field[addr - m]) / (2.0 * yinc)); MyVNorm(&normals[addr]); } } /* generate actual triangles */ for (y = 0; y < (n - 1); y++) { for (x = 0; x < (m - 1); x++) { addr = y * m + x; rt_stri(tex, vertices[addr], vertices[addr + 1 + m], vertices[addr + 1], normals[addr], normals[addr + 1 + m], normals[addr + 1]); rt_stri(tex, vertices[addr], vertices[addr + m], vertices[addr + 1 + m], normals[addr], normals[addr + m], normals[addr + 1 + m]); } } free(normals); free(vertices); } /* end of smoothed heightfield */ static void adjust(apiflt *base, int xres, int yres, apiflt wx, apiflt wy, int xa, int ya, int x, int y, int xb, int yb) { apiflt d, v; if (base[x + (xres * y)] == 0.0) { d = (abs(xa - xb) / (xres * 1.0)) * wx + (abs(ya - yb) / (yres * 1.0)) * wy; v = (base[xa + (xres * ya)] + base[xb + (xres * yb)]) / 2.0 + (((((rand() % 1000) - 500.0) / 500.0) * d) / 8.0); if (v < 0.0) v = 0.0; if (v > (xres + yres)) v = (xres + yres); base[x + (xres * y)] = v; } } static void subdivide(apiflt *base, int xres, int yres, apiflt wx, apiflt wy, int x1, int y1, int x2, int y2) { long x, y; if (((x2 - x1) < 2) && ((y2 - y1) < 2)) { return; } x = (x1 + x2) / 2; y = (y1 + y2) / 2; adjust(base, xres, yres, wx, wy, x1, y1, x, y1, x2, y1); adjust(base, xres, yres, wx, wy, x2, y1, x2, y, x2, y2); 
adjust(base, xres, yres, wx, wy, x1, y2, x, y2, x2, y2); adjust(base, xres, yres, wx, wy, x1, y1, x1, y, x1, y2); if (base[x + xres * y] == 0.0) { base[x + (xres * y)] = (base[x1 + xres * y1] + base[x2 + xres * y1] + base[x2 + xres * y2] + base[x1 + xres * y2]) / 4.0; } subdivide(base, xres, yres, wx, wy, x1, y1, x, y); subdivide(base, xres, yres, wx, wy, x, y1, x2, y); subdivide(base, xres, yres, wx, wy, x, y, x2, y2); subdivide(base, xres, yres, wx, wy, x1, y, x, y2); } void rt_landscape(void *tex, int m, int n, vector ctr, apiflt wx, apiflt wy) { int totalsize, x, y; apiflt *field; totalsize = m * n; srand(totalsize); field = (apiflt *)malloc(totalsize * sizeof(apiflt)); for (y = 0; y < n; y++) { for (x = 0; x < m; x++) { field[x + y * m] = 0.0; } } field[0 + 0] = 1.0 + (rand() % 100) / 100.0; field[m - 1] = 1.0 + (rand() % 100) / 100.0; field[0 + m * (n - 1)] = 1.0 + (rand() % 100) / 100.0; field[m - 1 + m * (n - 1)] = 1.0 + (rand() % 100) / 100.0; subdivide(field, m, n, wx, wy, 0, 0, m - 1, n - 1); rt_sheightfield(tex, ctr, m, n, field, wx, wy); free(field); } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/apitrigeom.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * apitrigeom.cpp - This file contains code for generating triangle tessellated * geometry, for use with OpenGL, XGL, etc. 
*/ #include "machine.hpp" #include "types.hpp" #include "api.hpp" #include "macros.hpp" #include "vector.hpp" #define MyVNorm(a) VNorm((vector *)a) #define MyVCross(a, b, c) VCross((vector *)a, (vector *)b, (vector *)c) #define MyVAddS(x, a, b, c) VAddS((flt)x, (vector *)a, (vector *)b, (vector *)c) #define CYLFACETS 36 #define RINGFACETS 36 #define SPHEREFACETS 25 void rt_tri_fcylinder(void *tex, vector ctr, vector axis, apiflt rad) { vector x, y, z, tmp; double u, v, u2, v2; int j; vector p1, p2, p3, p4; vector n1, n2; z = axis; MyVNorm(&z); tmp.x = z.y - 2.1111111; tmp.y = -z.z + 3.14159267; tmp.z = z.x - 3.915292342341; MyVNorm(&z); MyVNorm(&tmp); MyVCross(&z, &tmp, &x); MyVNorm(&x); MyVCross(&x, &z, &y); MyVNorm(&y); for (j = 0; j < CYLFACETS; j++) { u = rad * sin((6.28 * j) / (CYLFACETS - 1.0)); v = rad * cos((6.28 * j) / (CYLFACETS - 1.0)); u2 = rad * sin((6.28 * (j + 1.0)) / (CYLFACETS - 1.0)); v2 = rad * cos((6.28 * (j + 1.0)) / (CYLFACETS - 1.0)); p1.x = p1.y = p1.z = 0.0; p4 = p3 = p2 = p1; MyVAddS(u, &x, &p1, &p1); MyVAddS(v, &y, &p1, &p1); n1 = p1; MyVNorm(&n1); MyVAddS(1.0, &ctr, &p1, &p1); MyVAddS(u2, &x, &p2, &p2); MyVAddS(v2, &y, &p2, &p2); n2 = p2; MyVNorm(&n2); MyVAddS(1.0, &ctr, &p2, &p2); MyVAddS(1.0, &axis, &p1, &p3); MyVAddS(1.0, &axis, &p2, &p4); rt_stri(tex, p1, p2, p3, n1, n2, n1); rt_stri(tex, p3, p2, p4, n1, n2, n2); } } void rt_tri_cylinder(void *tex, vector ctr, vector axis, apiflt rad) { rt_fcylinder(tex, ctr, axis, rad); } void rt_tri_ring(void *tex, vector ctr, vector norm, apiflt a, apiflt b) { vector x, y, z, tmp; double u, v, u2, v2; int j; vector p1, p2, p3, p4; vector n1, n2; z = norm; MyVNorm(&z); tmp.x = z.y - 2.1111111; tmp.y = -z.z + 3.14159267; tmp.z = z.x - 3.915292342341; MyVNorm(&z); MyVNorm(&tmp); MyVCross(&z, &tmp, &x); MyVNorm(&x); MyVCross(&x, &z, &y); MyVNorm(&y); for (j = 0; j < RINGFACETS; j++) { u = sin((6.28 * j) / (RINGFACETS - 1.0)); v = cos((6.28 * j) / (RINGFACETS - 1.0)); u2 = sin((6.28 * (j + 1.0)) / 
(RINGFACETS - 1.0)); v2 = cos((6.28 * (j + 1.0)) / (RINGFACETS - 1.0)); p1.x = p1.y = p1.z = 0.0; p4 = p3 = p2 = p1; MyVAddS(u, &x, &p1, &p1); MyVAddS(v, &y, &p1, &p1); n1 = p1; MyVNorm(&n1); MyVAddS(a, &n1, &ctr, &p1); MyVAddS(b, &n1, &ctr, &p3); MyVAddS(u2, &x, &p2, &p2); MyVAddS(v2, &y, &p2, &p2); n2 = p2; MyVNorm(&n2); MyVAddS(a, &n2, &ctr, &p2); MyVAddS(b, &n2, &ctr, &p4); rt_stri(tex, p1, p2, p3, norm, norm, norm); rt_stri(tex, p3, p2, p4, norm, norm, norm); } } void rt_tri_box(void *tex, vector min, vector max) { /* -XY face */ rt_tri(tex, rt_vector(min.x, min.y, min.z), rt_vector(min.x, max.y, min.z), rt_vector(max.x, max.y, min.z)); rt_tri(tex, rt_vector(min.x, min.y, min.z), rt_vector(max.x, max.y, min.z), rt_vector(max.x, min.y, min.z)); /* +XY face */ rt_tri(tex, rt_vector(min.x, min.y, max.z), rt_vector(max.x, max.y, max.z), rt_vector(min.x, max.y, max.z)); rt_tri(tex, rt_vector(min.x, min.y, max.z), rt_vector(max.x, min.y, max.z), rt_vector(max.x, max.y, max.z)); /* -YZ face */ rt_tri(tex, rt_vector(min.x, min.y, min.z), rt_vector(min.x, max.y, max.z), rt_vector(min.x, min.y, max.z)); rt_tri(tex, rt_vector(min.x, min.y, min.z), rt_vector(min.x, max.y, min.z), rt_vector(min.x, max.y, max.z)); /* +YZ face */ rt_tri(tex, rt_vector(max.x, min.y, min.z), rt_vector(max.x, min.y, max.z), rt_vector(max.x, max.y, max.z)); rt_tri(tex, rt_vector(max.x, min.y, min.z), rt_vector(max.x, max.y, max.z), rt_vector(max.x, max.y, min.z)); /* -XZ face */ rt_tri(tex, rt_vector(min.x, min.y, min.z), rt_vector(min.x, min.y, max.z), rt_vector(max.x, min.y, max.z)); rt_tri(tex, rt_vector(min.x, min.y, min.z), rt_vector(max.x, min.y, max.z), rt_vector(max.x, min.y, min.z)); /* +XZ face */ rt_tri(tex, rt_vector(min.x, max.y, min.z), rt_vector(max.x, max.y, max.z), rt_vector(min.x, max.y, max.z)); rt_tri(tex, rt_vector(min.x, max.y, min.z), rt_vector(max.x, max.y, min.z), rt_vector(max.x, max.y, max.z)); } void rt_tri_sphere(void *tex, vector ctr, apiflt rad) {} void 
rt_tri_plane(void *tex, vector ctr, vector norm) { rt_tri_ring(tex, ctr, norm, 0.0, 10000.0); } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/apitrigeom.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
    HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
    STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
    IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    POSSIBILITY OF SUCH DAMAGE.
*/

/*
 * apitrigeom.h - header for functions to generate triangle tessellated
 *                geometry for use with OpenGL, XGL, etc.
 *
 */

/*
 * All functions take the texture handle as their first argument and add
 * geometry to the scene through the rt_tri() / rt_stri() API calls
 * (rt_tri_cylinder is the exception, see below).
 */

/* finite cylinder: base point, axis vector, radius */
void rt_tri_fcylinder(void* tex, vector ctr, vector axis, apiflt rad);

/* cylinder: same parameters; the implementation forwards to rt_fcylinder() */
void rt_tri_cylinder(void* tex, vector ctr, vector axis, apiflt rad);

/* ring: center, normal, and the two radii a and b */
void rt_tri_ring(void* tex, vector ctr, vector norm, apiflt a, apiflt b);

/* plane: center and normal; implemented as a ring with radii 0.0 and 10000.0 */
void rt_tri_plane(void* tex, vector ctr, vector norm);

/* axis-aligned box: minimum and maximum corner */
void rt_tri_box(void* tex, vector min, vector max);

================================================
FILE: third-party/tbb/examples/parallel_for/tachyon/src/bndbox.cpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

/*
    The original source for this example is
    Copyright (c) 1994-2008 John E. Stone
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:
    1.
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * bndbox.cpp - This file contains the functions for dealing with bounding boxes. 
*/

#include "machine.hpp"
#include "types.hpp"
#include "macros.hpp"
#include "vector.hpp"
#include "intersect.hpp"
#include "util.hpp"

/* BNDBOX_PRIVATE makes bndbox.hpp declare the static helpers defined below */
#define BNDBOX_PRIVATE
#include "bndbox.hpp"

/*
 * Method table shared by every bounding box: intersect, normal (none),
 * bounding-box query and destructor, in that order.
 */
static object_methods bndbox_methods = { (void (*)(void *, void *))(bndbox_intersect),
                                         (void (*)(void *, void *, void *, void *))(nullptr),
                                         bndbox_bbox,
                                         free_bndbox };

/*
 * newbndbox - allocates a bounding box spanning min..max.
 * The object is zero-filled, given the bndbox method table, and starts with
 * an empty object list, no texture and no successor.
 * NOTE(review): the result of rt_getmem() is used without a null check;
 * presumably rt_getmem() does not return on failure - confirm in util.cpp.
 */
bndbox *newbndbox(vector min, vector max) {
    bndbox *b;

    b = (bndbox *)rt_getmem(sizeof(bndbox));
    memset(b, 0, sizeof(bndbox));
    b->min = min;
    b->max = max;
    b->methods = &bndbox_methods;

    b->objlist = nullptr;
    b->tex = nullptr;
    b->nextobj = nullptr;
    return b;
}

/*
 * bndbox_bbox - reports the extent of the box through *min / *max.
 * Always returns 1.
 */
static int bndbox_bbox(void *obj, vector *min, vector *max) {
    bndbox *b = (bndbox *)obj;

    *min = b->min;
    *max = b->max;

    return 1;
}

/*
 * free_bndbox - releases the objects enclosed by the box, then the box.
 */
static void free_bndbox(void *v) {
    bndbox *b = (bndbox *)v;

    free_objects(b->objlist);

    free(b);
}

/*
 * bndbox_intersect - tests a ray against the box and, if the ray may hit
 * it, forwards a bounded copy of the ray to every enclosed object.
 *   bx - the bounding box
 *   ry - the incoming ray; it is read but not modified
 * The per-axis tests narrow the parametric interval [tnear, tfar] in which
 * the ray is inside the box; the function returns early as soon as the
 * interval is empty or lies entirely behind the ray origin.
 */
static void bndbox_intersect(bndbox *bx, ray *ry) {
    flt a, tx1, tx2, ty1, ty2, tz1, tz2;
    flt tnear, tfar;
    object *obj;
    ray newray;

    /* eliminate bounded rays whose bounds do not intersect  */
    /* the bounds of the box..                               */
    if (ry->flags & RT_RAY_BOUNDED) {
        /* reject when both ray bounds s and e lie beyond the same box face */
        if ((ry->s.x > bx->max.x) && (ry->e.x > bx->max.x))
            return;
        if ((ry->s.x < bx->min.x) && (ry->e.x < bx->min.x))
            return;

        if ((ry->s.y > bx->max.y) && (ry->e.y > bx->max.y))
            return;
        if ((ry->s.y < bx->min.y) && (ry->e.y < bx->min.y))
            return;

        if ((ry->s.z > bx->max.z) && (ry->e.z > bx->max.z))
            return;
        if ((ry->s.z < bx->min.z) && (ry->e.z < bx->min.z))
            return;
    }

    tnear = -FHUGE;
    tfar = FHUGE;

    /* x axis */
    if (ry->d.x == 0.0) {
        /* ray has no x component: it misses unless the origin is in range */
        if ((ry->o.x < bx->min.x) || (ry->o.x > bx->max.x))
            return;
    }
    else {
        tx1 = (bx->min.x - ry->o.x) / ry->d.x;
        tx2 = (bx->max.x - ry->o.x) / ry->d.x;
        /* order the two plane distances so that tx1 <= tx2 */
        if (tx1 > tx2) {
            a = tx1;
            tx1 = tx2;
            tx2 = a;
        }
        if (tx1 > tnear)
            tnear = tx1;
        if (tx2 < tfar)
            tfar = tx2;
    }
    if (tnear > tfar)
        return;
    if (tfar < 0.0)
        return;

    /* y axis */
    if (ry->d.y == 0.0) {
        if ((ry->o.y < bx->min.y) || (ry->o.y > bx->max.y))
            return;
    }
    else {
        ty1 = (bx->min.y - ry->o.y) / ry->d.y;
        ty2 = (bx->max.y - ry->o.y) / ry->d.y;
        if (ty1 > ty2) {
            a = ty1;
            ty1 = ty2;
            ty2 = a;
        }
        if (ty1 > tnear)
            tnear = ty1;
        if (ty2 < tfar)
            tfar = ty2;
    }
    if (tnear > tfar)
        return;
    if (tfar < 0.0)
        return;

    /* z axis */
    if (ry->d.z == 0.0) {
        if ((ry->o.z < bx->min.z) || (ry->o.z > bx->max.z))
            return;
    }
    else {
        tz1 = (bx->min.z - ry->o.z) / ry->d.z;
        tz2 = (bx->max.z - ry->o.z) / ry->d.z;
        if (tz1 > tz2) {
            a = tz1;
            tz1 = tz2;
            tz2 = a;
        }
        if (tz1 > tnear)
            tnear = tz1;
        if (tz2 < tfar)
            tfar = tz2;
    }
    if (tnear > tfar)
        return;
    if (tfar < 0.0)
        return;

    /* intersect all of the enclosed objects */
    newray = *ry;
    newray.flags |= RT_RAY_BOUNDED;

    /* bound the copy by the points at tnear and slightly past tfar */
    /* NOTE(review): presumably RAYPNT(p, r, t) stores the point at */
    /* parameter t along r into p - confirm in macros.hpp           */
    RAYPNT(newray.s, (*ry), tnear);
    RAYPNT(newray.e, (*ry), (tfar + EPSILON));

    obj = bx->objlist;
    while (obj != nullptr) {
        obj->methods->intersect(obj, &newray);
        obj = (object *)obj->nextobj;
    }
}

================================================
FILE: third-party/tbb/examples/parallel_for/tachyon/src/bndbox.hpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except
in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * bndbox.h - This file contains the defines for bounding boxes etc. 
* * $Id: bndbox.h,v 1.2 2007-02-22 17:54:15 Exp $ */

/*
 * bndbox - an axis-aligned bounding box that owns a list of enclosed
 * objects.  The first four members mirror the members found at the start
 * of the box struct in box.hpp.
 * NOTE(review): presumably this is the common object header from
 * types.hpp - confirm there.
 */
typedef struct {
    unsigned int id; /* Unique Object serial number    */
    void *nextobj; /* pointer to next object in list */
    object_methods *methods; /* this object's methods          */
    texture *tex; /* object texture                 */
    vector min; /* minimum corner of the box      */
    vector max; /* maximum corner of the box      */
    object *objlist; /* head of the list of enclosed objects */
} bndbox;

/* allocates an empty bounding box spanning min..max */
bndbox *newbndbox(vector min, vector max);

/*
 * The static helpers are only declared for a translation unit that defines
 * BNDBOX_PRIVATE before including this header (bndbox.cpp does so).
 */
#ifdef BNDBOX_PRIVATE

static int bndbox_bbox(void *obj, vector *min, vector *max);
static void free_bndbox(void *v);
static void bndbox_intersect(bndbox *, ray *);

#endif

================================================
FILE: third-party/tbb/examples/parallel_for/tachyon/src/box.cpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

/*
    The original source for this example is
    Copyright (c) 1994-2008 John E. Stone
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:
    1. Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
    2. Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
    3. The name of the author may not be used to endorse or promote products
       derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * box.cpp - This file contains the functions for dealing with boxes. */ #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "box.hpp" #include "vector.hpp" #include "intersect.hpp" #include "util.hpp" int box_bbox(void *obj, vector *min, vector *max) { box *b = (box *)obj; *min = b->min; *max = b->max; return 1; } static object_methods box_methods = { (void (*)(void *, void *))(box_intersect), (void (*)(void *, void *, void *, void *))(box_normal), box_bbox, free }; box *newbox(void *tex, vector min, vector max) { box *b; b = (box *)rt_getmem(sizeof(box)); memset(b, 0, sizeof(box)); b->methods = &box_methods; b->tex = (texture *)tex; b->min = min; b->max = max; return b; } void box_intersect(box *bx, ray *ry) { flt a, tx1, tx2, ty1, ty2, tz1, tz2; flt tnear, tfar; tnear = -FHUGE; tfar = FHUGE; if (ry->d.x == 0.0) { if ((ry->o.x < bx->min.x) || (ry->o.x > bx->max.x)) return; } else { tx1 = (bx->min.x - ry->o.x) / ry->d.x; tx2 = (bx->max.x - ry->o.x) / ry->d.x; if (tx1 > tx2) { a = tx1; tx1 = tx2; tx2 = a; } if (tx1 > tnear) tnear = tx1; if (tx2 < tfar) tfar = tx2; } if (tnear > tfar) return; if (tfar < 0.0) return; if (ry->d.y == 0.0) { if ((ry->o.y < bx->min.y) || (ry->o.y > bx->max.y)) return; } else { ty1 = (bx->min.y - ry->o.y) / ry->d.y; ty2 
= (bx->max.y - ry->o.y) / ry->d.y; if (ty1 > ty2) { a = ty1; ty1 = ty2; ty2 = a; } if (ty1 > tnear) tnear = ty1; if (ty2 < tfar) tfar = ty2; } if (tnear > tfar) return; if (tfar < 0.0) return; if (ry->d.z == 0.0) { if ((ry->o.z < bx->min.z) || (ry->o.z > bx->max.z)) return; } else { tz1 = (bx->min.z - ry->o.z) / ry->d.z; tz2 = (bx->max.z - ry->o.z) / ry->d.z; if (tz1 > tz2) { a = tz1; tz1 = tz2; tz2 = a; } if (tz1 > tnear) tnear = tz1; if (tz2 < tfar) tfar = tz2; } if (tnear > tfar) return; if (tfar < 0.0) return; add_intersection(tnear, (object *)bx, ry); add_intersection(tfar, (object *)bx, ry); } void box_normal(box *bx, vector *pnt, ray *incident, vector *N) { vector a, b, c; flt t; c.x = (bx->max.x + bx->min.x) / 2.0; c.y = (bx->max.y + bx->min.y) / 2.0; c.z = (bx->max.z + bx->min.z) / 2.0; VSub((vector *)pnt, &c, N); b = (*N); a.x = fabs(N->x); a.y = fabs(N->y); a.z = fabs(N->z); N->x = 0.0; N->y = 0.0; N->z = 0.0; t = MYMAX(a.x, MYMAX(a.y, a.z)); if (t == a.x) N->x = b.x; if (t == a.y) N->y = b.y; if (t == a.z) N->z = b.z; VNorm(N); } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/box.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * box.h - This file contains the defines for boxes etc. 
* * $Id: box.h,v 1.2 2007-02-22 17:54:15 Exp $ */

/*
 * box - an axis-aligned box primitive described by its two extreme corners.
 * The first four members mirror the members found at the start of the
 * bndbox struct in bndbox.hpp.
 * NOTE(review): presumably this is the common object header from
 * types.hpp - confirm there.
 */
typedef struct {
    unsigned int id; /* Unique Object serial number    */
    void *nextobj; /* pointer to next object in list */
    object_methods *methods; /* this object's methods          */
    texture *tex; /* object texture                 */
    vector min; /* minimum corner of the box      */
    vector max; /* maximum corner of the box      */
} box;

/* allocates a box with the given texture and corners */
box *newbox(void *tex, vector min, vector max);

/* intersects a ray with a box and records the entry and exit distances */
void box_intersect(box *, ray *);

/* computes the surface normal at a point on the box into the last argument */
void box_normal(box *, vector *, ray *incident, vector *);

================================================
FILE: third-party/tbb/examples/parallel_for/tachyon/src/camera.cpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

/*
    The original source for this example is
    Copyright (c) 1994-2008 John E. Stone
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:
    1. Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
    2. Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
    3. The name of the author may not be used to endorse or promote products
       derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * camera.cpp - This file contains all of the functions for doing camera work. */ #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "vector.hpp" #include "camera.hpp" #include "util.hpp" ray camray(scenedef *scene, int x, int y) { ray ray1, newray; vector projcent; vector projpixel; flt px, py, sx, sy; sx = (flt)scene->hres; sy = (flt)scene->vres; /* calculate the width and height of the image plane given */ /* the aspect ratio, image resolution, and zoom factor */ px = ((sx / sy) / scene->aspectratio) / scene->camzoom; py = 1.0 / scene->camzoom; /* assuming viewvec is a unit vector, then the center of the */ /* image plane is the camera center + vievec */ projcent.x = scene->camcent.x + scene->camviewvec.x; projcent.y = scene->camcent.y + scene->camviewvec.y; projcent.z = scene->camcent.z + scene->camviewvec.z; /* starting from the center of the image plane, we move the */ /* center of the pel we're calculating, to */ /* projcent + (rightvec * x distance) */ ray1.o = projcent; ray1.d = scene->camrightvec; projpixel = Raypnt(&ray1, ((x * px / sx) - (px / 2.0))); /* starting from the horizontally translated pel, we move the */ /* center of the pel we're calculating, to */ /* projcent + (upvec * y distance) */ ray1.o = projpixel; ray1.d = 
scene->camupvec; projpixel = Raypnt(&ray1, ((y * py / sy) - (py / 2.0))); /* now that we have the exact pel center in the image plane */ /* we create the real primary ray that will be used by the */ /* rest of the system. */ /* The ray is expected to be re-normalized elsewhere, we're */ /* only really concerned about getting its direction right. */ newray.o = scene->camcent; VSub(&projpixel, &scene->camcent, &newray.d); newray.depth = scene->raydepth; newray.flags = RT_RAY_REGULAR; /* camera only generates primary rays */ return newray; } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/camera.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
 * camera.h - This file contains the defines for camera routines etc.
 *
 * $Id: camera.h,v 1.2 2007-02-22 17:54:15 Exp $
 */

/*
 * Builds the primary ray for one pixel.  The definition in camera.cpp names
 * the arguments (scene, x, y): the scene description followed by the pixel
 * coordinates.  The returned ray starts at the camera center; its direction
 * is not normalized by camray itself.
 */
ray camray(scenedef *, int, int);

================================================
FILE: third-party/tbb/examples/parallel_for/tachyon/src/coordsys.cpp
================================================
/* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */
/* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * coordsys.cpp - Routines to translate from one coordinate system to another. 
*/ #include "machine.hpp" #include "types.hpp" #include "coordsys.hpp" void xytopolar(flt x, flt y, flt rad, flt* u, flt* v) { flt r1; r1 = x * x + y * y; *v = sqrt(r1 / (rad * rad)); if (y < 0.0) *u = 1.0 - acos(x / sqrt(r1)) / TWOPI; else *u = acos(x / sqrt(r1)) / TWOPI; } void xyztocyl(vector pnt, flt height, flt* u, flt* v) { flt r1; r1 = pnt.x * pnt.x + pnt.y * pnt.y; *v = pnt.z / height; if (pnt.y < 0.0) *u = 1.0 - acos(pnt.x / sqrt(r1)) / TWOPI; else *u = acos(pnt.x / sqrt(r1)) / TWOPI; } void xyztospr(vector pnt, flt* u, flt* v) { flt r1, phi, theta; r1 = sqrt(pnt.x * pnt.x + pnt.y * pnt.y + pnt.z * pnt.z); phi = acos(-pnt.y / r1); *v = phi / 3.1415926; theta = acos((pnt.x / r1) / sin(phi)) / TWOPI; if (pnt.z > 0.0) *u = theta; else *u = 1 - theta; } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/coordsys.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
 * coordsys.h - defines for coordinate system routines.
 *
 * $Id: coordsys.h,v 1.2 2007-02-22 17:54:15 Exp $
 */

/* 2*pi to eight significant digits; coordsys.cpp divides acos() results by */
/* it to express an angle as a fraction of a full turn                      */
#define TWOPI 6.2831853

/* (x, y, rad, u, v): writes the polar coordinates of the planar point     */
/* (x, y) to *u (angle fraction) and *v (distance from the origin / rad)   */
void xytopolar(flt, flt, flt, flt *, flt *);

/* (pnt, height, u, v): writes cylindrical coordinates of pnt to *u (angle */
/* fraction around Z) and *v (pnt.z / height)                              */
void xyztocyl(vector, flt, flt *, flt *);

/* (pnt, u, v): writes spherical coordinates of pnt to *u (azimuth         */
/* fraction) and *v (polar angle measured from -Y, divided by pi)          */
void xyztospr(vector, flt *, flt *);

================================================
FILE: third-party/tbb/examples/parallel_for/tachyon/src/cylinder.cpp
================================================
/* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * cylinder.cpp - This file contains the functions for dealing with cylinders. 
*/ #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "vector.hpp" #include "intersect.hpp" #include "util.hpp" #define CYLINDER_PRIVATE #include "cylinder.hpp" static object_methods cylinder_methods = { (void (*)(void *, void *))(cylinder_intersect), (void (*)(void *, void *, void *, void *))( cylinder_normal), cylinder_bbox, free }; static object_methods fcylinder_methods = { (void (*)(void *, void *))(fcylinder_intersect), (void (*)(void *, void *, void *, void *))( cylinder_normal), fcylinder_bbox, free }; object *newcylinder(void *tex, vector ctr, vector axis, flt rad) { cylinder *c; c = (cylinder *)rt_getmem(sizeof(cylinder)); memset(c, 0, sizeof(cylinder)); c->methods = &cylinder_methods; c->tex = (texture *)tex; c->ctr = ctr; c->axis = axis; c->rad = rad; return (object *)c; } static int cylinder_bbox(void *obj, vector *min, vector *max) { return 0; /* infinite / unbounded object */ } static void cylinder_intersect(cylinder *cyl, ray *ry) { vector rc, n, D, O; flt t, s, tin, tout, ln, d; rc.x = ry->o.x - cyl->ctr.x; rc.y = ry->o.y - cyl->ctr.y; rc.z = ry->o.z - cyl->ctr.z; VCross(&ry->d, &cyl->axis, &n); VDOT(ln, n, n); ln = sqrt(ln); /* finish length calculation */ if (ln == 0.0) { /* ray is parallel to the cylinder.. */ VDOT(d, rc, cyl->axis); D.x = rc.x - d * cyl->axis.x; D.y = rc.y - d * cyl->axis.y; D.z = rc.z - d * cyl->axis.z; VDOT(d, D, D); d = sqrt(d); tin = -FHUGE; tout = FHUGE; /* if (d <= cyl->rad) then ray is inside cylinder.. else outside */ } VNorm(&n); VDOT(d, rc, n); d = fabs(d); if (d <= cyl->rad) { /* ray intersects cylinder.. 
*/ VCross(&rc, &cyl->axis, &O); VDOT(t, O, n); t = -t / ln; VCross(&n, &cyl->axis, &O); VNorm(&O); VDOT(s, ry->d, O); s = fabs(sqrt(cyl->rad * cyl->rad - d * d) / s); tin = t - s; add_intersection(tin, (object *)cyl, ry); tout = t + s; add_intersection(tout, (object *)cyl, ry); } } static void cylinder_normal(cylinder *cyl, vector *pnt, ray *incident, vector *N) { vector a, b, c; flt t; VSub((vector *)pnt, &(cyl->ctr), &a); c = cyl->axis; VNorm(&c); VDOT(t, a, c); b.x = c.x * t + cyl->ctr.x; b.y = c.y * t + cyl->ctr.y; b.z = c.z * t + cyl->ctr.z; VSub(pnt, &b, N); VNorm(N); if (VDot(N, &(incident->d)) > 0.0) { /* make cylinder double sided */ N->x = -N->x; N->y = -N->y; N->z = -N->z; } } object *newfcylinder(void *tex, vector ctr, vector axis, flt rad) { cylinder *c; c = (cylinder *)rt_getmem(sizeof(cylinder)); memset(c, 0, sizeof(cylinder)); c->methods = &fcylinder_methods; c->tex = (texture *)tex; c->ctr = ctr; c->axis = axis; c->rad = rad; return (object *)c; } static int fcylinder_bbox(void *obj, vector *min, vector *max) { cylinder *c = (cylinder *)obj; vector mintmp, maxtmp; mintmp.x = c->ctr.x; mintmp.y = c->ctr.y; mintmp.z = c->ctr.z; maxtmp.x = c->ctr.x + c->axis.x; maxtmp.y = c->ctr.y + c->axis.y; maxtmp.z = c->ctr.z + c->axis.z; min->x = MYMIN(mintmp.x, maxtmp.x); min->y = MYMIN(mintmp.y, maxtmp.y); min->z = MYMIN(mintmp.z, maxtmp.z); min->x -= c->rad; min->y -= c->rad; min->z -= c->rad; max->x = MYMAX(mintmp.x, maxtmp.x); max->y = MYMAX(mintmp.y, maxtmp.y); max->z = MYMAX(mintmp.z, maxtmp.z); max->x += c->rad; max->y += c->rad; max->z += c->rad; return 1; } static void fcylinder_intersect(cylinder *cyl, ray *ry) { vector rc, n, O, hit, tmp2, ctmp4; flt t, s, tin, tout, ln, d, tmp, tmp3; rc.x = ry->o.x - cyl->ctr.x; rc.y = ry->o.y - cyl->ctr.y; rc.z = ry->o.z - cyl->ctr.z; VCross(&ry->d, &cyl->axis, &n); VDOT(ln, n, n); ln = sqrt(ln); /* finish length calculation */ if (ln == 0.0) { /* ray is parallel to the cylinder.. 
*/ return; /* in this case, we want to miss or go through the "hole" */ } VNorm(&n); VDOT(d, rc, n); d = fabs(d); if (d <= cyl->rad) { /* ray intersects cylinder.. */ VCross(&rc, &cyl->axis, &O); VDOT(t, O, n); t = -t / ln; VCross(&n, &cyl->axis, &O); VNorm(&O); VDOT(s, ry->d, O); s = fabs(sqrt(cyl->rad * cyl->rad - d * d) / s); tin = t - s; RAYPNT(hit, (*ry), tin); ctmp4 = cyl->axis; VNorm(&ctmp4); tmp2.x = hit.x - cyl->ctr.x; tmp2.y = hit.y - cyl->ctr.y; tmp2.z = hit.z - cyl->ctr.z; VDOT(tmp, tmp2, ctmp4); VDOT(tmp3, cyl->axis, cyl->axis); if ((tmp > 0.0) && (tmp < sqrt(tmp3))) add_intersection(tin, (object *)cyl, ry); tout = t + s; RAYPNT(hit, (*ry), tout); tmp2.x = hit.x - cyl->ctr.x; tmp2.y = hit.y - cyl->ctr.y; tmp2.z = hit.z - cyl->ctr.z; VDOT(tmp, tmp2, ctmp4); VDOT(tmp3, cyl->axis, cyl->axis); if ((tmp > 0.0) && (tmp < sqrt(tmp3))) add_intersection(tout, (object *)cyl, ry); } } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/cylinder.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * cylinder.h - This file contains the defines for cylinders etc. 
* * $Id: cylinder.h,v 1.2 2007-02-22 17:54:15 Exp $ */

/* Public constructors: infinite cylinder and finite cylinder.            */
/* Arguments, as named by the definitions in cylinder.cpp: texture, a     */
/* point on the axis (ctr), the axis vector, and the radius.              */
object *newcylinder(void *, vector, vector, flt);
object *newfcylinder(void *, vector, vector, flt);

/* Everything below is only visible to a translation unit that defines    */
/* CYLINDER_PRIVATE before including this header (cylinder.cpp does so).  */
#ifdef CYLINDER_PRIVATE

/* Cylinder object record; the same record is used for the infinite and   */
/* the finite variant, which differ only in the method table installed.   */
typedef struct {
    unsigned int id; /* Unique Object serial number */
    void *nextobj; /* pointer to next object in list */
    object_methods *methods; /* this object's methods */
    texture *tex; /* object texture */
    vector ctr; /* point on the axis; the finite variant extends from here to ctr + axis */
    vector axis; /* axis vector (cylinder.cpp normalizes a copy where it needs a unit vector) */
    flt rad; /* cylinder radius */
} cylinder;

/* Forward declarations of the file-local method implementations that     */
/* cylinder.cpp places in its object_methods tables.                      */
static void cylinder_intersect(cylinder *, ray *);
static void fcylinder_intersect(cylinder *, ray *);
static int cylinder_bbox(void *obj, vector *min, vector *max);
static int fcylinder_bbox(void *obj, vector *min, vector *max);
static void cylinder_normal(cylinder *, vector *, ray *, vector *);

#endif

================================================
FILE: third-party/tbb/examples/parallel_for/tachyon/src/extvol.cpp
================================================
/* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */
/* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * extvol.cpp - Volume rendering helper routines etc. 
*/ #include #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "vector.hpp" #include "util.hpp" #include "box.hpp" #include "extvol.hpp" #include "trace.hpp" #include "sphere.hpp" #include "light.hpp" #include "shade.hpp" #include "global.hpp" int extvol_bbox(void *obj, vector *min, vector *max) { box *b = (box *)obj; *min = b->min; *max = b->max; return 1; } static object_methods extvol_methods = { (void (*)(void *, void *))(box_intersect), (void (*)(void *, void *, void *, void *))(box_normal), extvol_bbox, free }; extvol *newextvol(void *voidtex, vector min, vector max, int samples, flt (*evaluator)(flt, flt, flt)) { extvol *xvol; texture *tex; tex = (texture *)voidtex; xvol = (extvol *)rt_getmem(sizeof(extvol)); memset(xvol, 0, sizeof(extvol)); xvol->methods = &extvol_methods; xvol->min = min; xvol->max = max; xvol->evaluator = evaluator; xvol->ambient = tex->ambient; xvol->diffuse = tex->diffuse; xvol->opacity = tex->opacity; xvol->samples = samples; xvol->tex = (texture *)rt_getmem(sizeof(texture)); memset(xvol->tex, 0, sizeof(texture)); xvol->tex->ctr.x = 0.0; xvol->tex->ctr.y = 0.0; xvol->tex->ctr.z = 0.0; xvol->tex->rot = xvol->tex->ctr; xvol->tex->scale = xvol->tex->ctr; xvol->tex->uaxs = xvol->tex->ctr; xvol->tex->vaxs = xvol->tex->ctr; xvol->tex->islight = 0; xvol->tex->shadowcast = 0; xvol->tex->col = tex->col; xvol->tex->ambient = 1.0; xvol->tex->diffuse = 0.0; xvol->tex->specular = 0.0; xvol->tex->opacity = 1.0; xvol->tex->img = nullptr; xvol->tex->texfunc = (color(*)(void *, void *, void *))(ext_volume_texture); xvol->tex->obj = (void *)xvol; /* XXX hack! 
*/ return xvol; } color ExtVoxelColor(flt scalar) { color col; if (scalar > 1.0) scalar = 1.0; if (scalar < 0.0) scalar = 0.0; if (scalar < 0.5) { col.g = 0.0; } else { col.g = (scalar - 0.5) * 2.0; } col.r = scalar; col.b = 1.0 - (scalar / 2.0); return col; } color ext_volume_texture(vector *hit, texture *tex, ray *ry) { color col, col2; box *bx; extvol *xvol; flt a, tx1, tx2, ty1, ty2, tz1, tz2; flt tnear, tfar; flt t, tdist, dt, ddt, sum, tt; vector pnt, bln; flt scalar, transval; int i; point_light *li; color diffint; vector N, L; flt inten; col.r = 0.0; col.g = 0.0; col.b = 0.0; bx = (box *)tex->obj; xvol = (extvol *)tex->obj; tnear = -FHUGE; tfar = FHUGE; if (ry->d.x == 0.0) { if ((ry->o.x < bx->min.x) || (ry->o.x > bx->max.x)) return col; } else { tx1 = (bx->min.x - ry->o.x) / ry->d.x; tx2 = (bx->max.x - ry->o.x) / ry->d.x; if (tx1 > tx2) { a = tx1; tx1 = tx2; tx2 = a; } if (tx1 > tnear) tnear = tx1; if (tx2 < tfar) tfar = tx2; } if (tnear > tfar) return col; if (tfar < 0.0) return col; if (ry->d.y == 0.0) { if ((ry->o.y < bx->min.y) || (ry->o.y > bx->max.y)) return col; } else { ty1 = (bx->min.y - ry->o.y) / ry->d.y; ty2 = (bx->max.y - ry->o.y) / ry->d.y; if (ty1 > ty2) { a = ty1; ty1 = ty2; ty2 = a; } if (ty1 > tnear) tnear = ty1; if (ty2 < tfar) tfar = ty2; } if (tnear > tfar) return col; if (tfar < 0.0) return col; if (ry->d.z == 0.0) { if ((ry->o.z < bx->min.z) || (ry->o.z > bx->max.z)) return col; } else { tz1 = (bx->min.z - ry->o.z) / ry->d.z; tz2 = (bx->max.z - ry->o.z) / ry->d.z; if (tz1 > tz2) { a = tz1; tz1 = tz2; tz2 = a; } if (tz1 > tnear) tnear = tz1; if (tz2 < tfar) tfar = tz2; } if (tnear > tfar) return col; if (tfar < 0.0) return col; if (tnear < 0.0) tnear = 0.0; tdist = xvol->samples; tt = (xvol->opacity / tdist); bln.x = fabs(bx->min.x - bx->max.x); bln.y = fabs(bx->min.y - bx->max.y); bln.z = fabs(bx->min.z - bx->max.z); dt = 1.0 / tdist; sum = 0.0; /* Accumulate color as the ray passes through the voxels */ for (t = tnear; t <= tfar; t 
+= dt) { if (sum < 1.0) { pnt.x = ((ry->o.x + (ry->d.x * t)) - bx->min.x) / bln.x; pnt.y = ((ry->o.y + (ry->d.y * t)) - bx->min.y) / bln.y; pnt.z = ((ry->o.z + (ry->d.z * t)) - bx->min.z) / bln.z; /* call external evaluator assume 0.0 -> 1.0 range.. */ scalar = xvol->evaluator(pnt.x, pnt.y, pnt.z); transval = tt * scalar; sum += transval; col2 = ExtVoxelColor(scalar); col.r += transval * col2.r * xvol->ambient; col.g += transval * col2.g * xvol->ambient; col.b += transval * col2.b * xvol->ambient; ddt = dt; /* Add in diffuse shaded light sources (no shadows) */ if (xvol->diffuse > 0.0) { /* Calculate the Volume gradient at the voxel */ N.x = (xvol->evaluator(pnt.x - ddt, pnt.y, pnt.z) - xvol->evaluator(pnt.x + ddt, pnt.y, pnt.z)) * 8.0 * tt; N.y = (xvol->evaluator(pnt.x, pnt.y - ddt, pnt.z) - xvol->evaluator(pnt.x, pnt.y + ddt, pnt.z)) * 8.0 * tt; N.z = (xvol->evaluator(pnt.x, pnt.y, pnt.z - ddt) - xvol->evaluator(pnt.x, pnt.y, pnt.z + ddt)) * 8.0 * tt; /* only light surfaces with enough of a normal.. */ if ((N.x * N.x + N.y * N.y + N.z * N.z) > 0.0) { diffint.r = 0.0; diffint.g = 0.0; diffint.b = 0.0; /* add the contribution of each of the lights.. */ for (i = 0; i < numlights; i++) { li = lightlist[i]; VSUB(li->ctr, (*hit), L) VNorm(&L); VDOT(inten, N, L) /* only add light if its from the front of the surface */ /* could add back-lighting if we wanted to later.. 
*/ if (inten > 0.0) { diffint.r += inten * li->tex->col.r; diffint.g += inten * li->tex->col.g; diffint.b += inten * li->tex->col.b; } } col.r += col2.r * diffint.r * xvol->diffuse; col.g += col2.g * diffint.g * xvol->diffuse; col.b += col2.b * diffint.b * xvol->diffuse; } } } else { sum = 1.0; } } /* Add in transmitted ray from outside environment */ if (sum < 1.0) { /* spawn transmission rays / refraction */ color transcol; transcol = shade_transmission(ry, hit, 1.0 - sum); col.r += transcol.r; /* add the transmitted ray */ col.g += transcol.g; /* to the diffuse and */ col.b += transcol.b; /* transmission total.. */ } return col; } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/extvol.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
 * extvol.h - Volume rendering definitions etc.
 *
 * $Id: extvol.h,v 1.2 2007-02-22 17:54:15 Exp $
 */

/*
 * External volume object record.  Everything up to and including `max`
 * matches the layout of the box record, which is what lets extvol.cpp read
 * the same object through a box pointer.
 */
typedef struct {
    unsigned int id; /* Unique Object serial number */
    void *nextobj; /* pointer to next object in list */
    object_methods *methods; /* this object's methods */
    texture *tex; /* object texture */
    vector min; /* box corner with the lower coordinates */
    vector max; /* box corner with the upper coordinates */
    flt ambient; /* ambient factor copied from the texture given to newextvol() */
    flt diffuse; /* diffuse factor copied from that texture; 0 disables the gradient lighting */
    flt opacity; /* opacity copied from that texture; divided by samples for each march step */
    int samples; /* ray-march step divisor: extvol.cpp uses dt = 1 / samples */
    flt (*evaluator)(flt, flt, flt); /* callback returning the scalar field at box-relative (x, y, z) */
} extvol;

/* Creates an external volume spanning [min, max] that samples `evaluator`. */
extvol *newextvol(void *voidtex,
                  vector min,
                  vector max,
                  int samples,
                  flt (*evaluator)(flt, flt, flt));

/* Texture function installed by newextvol(); arguments are the hit point, */
/* the volume's texture and the ray being shaded.                          */
color ext_volume_texture(vector *, texture *, ray *);

================================================
FILE: third-party/tbb/examples/parallel_for/tachyon/src/global.cpp
================================================
/* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

/*
 * global.cpp - any/all global data items etc should be in this file
 */

#include "types.hpp"
#include "machine.hpp"
#include "sphere.hpp"
#include "light.hpp"

/* stuff moved from intersect.c */
object* rootobj = nullptr; /* starts out empty. */
/* Fixed-capacity table of light pointers (MAXLIGHTS slots). */
/* NOTE(review): presumably only the first numlights entries are in use -- confirm against the code that adds lights */
point_light* lightlist[MAXLIGHTS];
int numlights = 0;
unsigned int numobjects = 0; /* used to assign unique object ID's */

/* used in util.c */
unsigned int rt_mem_in_use = 0;

/* used in api.c */
int parinitted = 0;
int graphicswindowopen = 0;
================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/global.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
 * global.hpp - any/all global data items etc should be in this file
 *
 *  $Id: global.h,v 1.2 2007-02-22 17:54:15  Exp $
 *
 */

/*
 * Declarations only; every name below is declared extern here and needs a
 * definition in exactly one translation unit.
 */

/* stuff moved from intersect.c */
extern object* rootobj;
extern point_light* lightlist[MAXLIGHTS];
extern int numlights;
extern unsigned int numobjects;
extern unsigned int rt_mem_in_use;
extern int parinitted;
extern int graphicswindowopen;
================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/grid.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * grid.cpp - spatial subdivision efficiency structures */ #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "vector.hpp" #include "intersect.hpp" #include "util.hpp" #define GRID_PRIVATE #include "grid.hpp" #ifndef cbrt #define cbrt(x) \ ((x) > 0.0 ? pow((double)(x), 1.0 / 3.0) : ((x) < 0.0 ? -pow((double)-(x), 1.0 / 3.0) : 0.0)) #define qbrt(x) \ ((x) > 0.0 ? pow((double)(x), 1.0 / 4.0) : ((x) < 0.0 ? 
-pow((double)-(x), 1.0 / 4.0) : 0.0)) #endif static object_methods grid_methods = { (void (*)(void *, void *))(grid_intersect), (void (*)(void *, void *, void *, void *))(nullptr), grid_bbox, grid_free }; extern bool silent_mode; object *newgrid(int xsize, int ysize, int zsize, vector min, vector max) { grid *g; g = (grid *)rt_getmem(sizeof(grid)); memset(g, 0, sizeof(grid)); g->methods = &grid_methods; g->id = new_objectid(); g->xsize = xsize; g->ysize = ysize; g->zsize = zsize; g->min = min; g->max = max; VSub(&g->max, &g->min, &g->voxsize); g->voxsize.x /= (flt)g->xsize; g->voxsize.y /= (flt)g->ysize; g->voxsize.z /= (flt)g->zsize; g->cells = (objectlist **)rt_getmem(xsize * ysize * zsize * sizeof(objectlist *)); memset(g->cells, 0, xsize * ysize * zsize * sizeof(objectlist *)); /* fprintf(stderr, "New grid, size: %8d %8d %8d\n", g->xsize, g->ysize, g->zsize); */ return (object *)g; } static int grid_bbox(void *obj, vector *min, vector *max) { grid *g = (grid *)obj; *min = g->min; *max = g->max; return 1; } static void grid_free(void *v) { int i, numvoxels; grid *g = (grid *)v; /* loop through all voxels and free the object lists */ numvoxels = g->xsize * g->ysize * g->zsize; for (i = 0; i < numvoxels; i++) { objectlist *lcur, *lnext; lcur = g->cells[i]; while (lcur != nullptr) { lnext = lcur->next; free(lcur); } } /* free the grid cells */ free(g->cells); /* free all objects on the grid object list */ free_objects(g->objects); free(g); } static void globalbound(object **rootlist, vector *gmin, vector *gmax) { vector min, max; object *cur; if (*rootlist == nullptr) /* don't bound non-existent objects */ return; gmin->x = FHUGE; gmin->y = FHUGE; gmin->z = FHUGE; gmax->x = -FHUGE; gmax->y = -FHUGE; gmax->z = -FHUGE; cur = *rootlist; while (cur != nullptr) { /* Go! 
*/ min.x = -FHUGE; min.y = -FHUGE; min.z = -FHUGE; max.x = FHUGE; max.y = FHUGE; max.z = FHUGE; if (cur->methods->bbox((void *)cur, &min, &max)) { gmin->x = MYMIN(gmin->x, min.x); gmin->y = MYMIN(gmin->y, min.y); gmin->z = MYMIN(gmin->z, min.z); gmax->x = MYMAX(gmax->x, max.x); gmax->y = MYMAX(gmax->y, max.y); gmax->z = MYMAX(gmax->z, max.z); } cur = (object *)cur->nextobj; } } static int cellbound(grid *g, gridindex *index, vector *cmin, vector *cmax) { vector min, max, cellmin, cellmax; objectlist *cur; int numinbounds = 0; cur = g->cells[index->z * g->xsize * g->ysize + index->y * g->xsize + index->x]; if (cur == nullptr) /* don't bound non-existent objects */ return 0; cellmin.x = voxel2x(g, index->x); cellmin.y = voxel2y(g, index->y); cellmin.z = voxel2z(g, index->z); cellmax.x = cellmin.x + g->voxsize.x; cellmax.y = cellmin.y + g->voxsize.y; cellmax.z = cellmin.z + g->voxsize.z; cmin->x = FHUGE; cmin->y = FHUGE; cmin->z = FHUGE; cmax->x = -FHUGE; cmax->y = -FHUGE; cmax->z = -FHUGE; while (cur != nullptr) { /* Go! 
*/ min.x = -FHUGE; min.y = -FHUGE; min.z = -FHUGE; max.x = FHUGE; max.y = FHUGE; max.z = FHUGE; if (cur->obj->methods->bbox((void *)cur->obj, &min, &max)) { if ((min.x >= cellmin.x) && (max.x <= cellmax.x) && (min.y >= cellmin.y) && (max.y <= cellmax.y) && (min.z >= cellmin.z) && (max.z <= cellmax.z)) { cmin->x = MYMIN(cmin->x, min.x); cmin->y = MYMIN(cmin->y, min.y); cmin->z = MYMIN(cmin->z, min.z); cmax->x = MYMAX(cmax->x, max.x); cmax->y = MYMAX(cmax->y, max.y); cmax->z = MYMAX(cmax->z, max.z); numinbounds++; } } cur = cur->next; } /* in case we get a 0.0 sized axis on the cell bounds, we'll */ /* use the original cell bounds */ if ((cmax->x - cmin->x) < EPSILON) { cmax->x += EPSILON; cmin->x -= EPSILON; } if ((cmax->y - cmin->y) < EPSILON) { cmax->y += EPSILON; cmin->y -= EPSILON; } if ((cmax->z - cmin->z) < EPSILON) { cmax->z += EPSILON; cmin->z -= EPSILON; } return numinbounds; } static int countobj(object *root) { object *cur; /* counts the number of objects on a list */ int numobj; numobj = 0; cur = root; while (cur != nullptr) { cur = (object *)cur->nextobj; numobj++; } return numobj; } static int countobjlist(objectlist *root) { objectlist *cur; int numobj; numobj = 0; cur = root; while (cur != nullptr) { cur = cur->next; numobj++; } return numobj; } int engrid_scene(object **list) { grid *g; int numobj, numcbrt; vector gmin, gmax; gridindex index; if (*list == nullptr) return 0; numobj = countobj(*list); if (!silent_mode) fprintf(stderr, "Scene contains %d bounded objects.\n", numobj); if (numobj > 16) { numcbrt = (int)cbrt(4 * numobj); globalbound(list, &gmin, &gmax); g = (grid *)newgrid(numcbrt, numcbrt, numcbrt, gmin, gmax); engrid_objlist(g, list); numobj = countobj(*list); g->nextobj = *list; *list = (object *)g; /* now create subgrids.. 
*/ for (index.z = 0; index.z < g->zsize; index.z++) { for (index.y = 0; index.y < g->ysize; index.y++) { for (index.x = 0; index.x < g->xsize; index.x++) { engrid_cell(g, &index); } } } } return 1; } void engrid_objlist(grid *g, object **list) { object *cur, *next, **prev; if (*list == nullptr) return; prev = list; cur = *list; while (cur != nullptr) { next = (object *)cur->nextobj; if (engrid_object(g, cur)) *prev = next; else prev = (object **)&cur->nextobj; cur = next; } } static int engrid_cell(grid *gold, gridindex *index) { vector gmin, gmax, gsize; flt len; int numobj, numcbrt, xs, ys, zs; grid *g; objectlist **list; objectlist *newobj; list = &gold->cells[index->z * gold->xsize * gold->ysize + index->y * gold->xsize + index->x]; if (*list == nullptr) return 0; numobj = cellbound(gold, index, &gmin, &gmax); VSub(&gmax, &gmin, &gsize); len = 1.0 / (MYMAX(MYMAX(gsize.x, gsize.y), gsize.z)); gsize.x *= len; gsize.y *= len; gsize.z *= len; if (numobj > 16) { numcbrt = (int)cbrt(2 * numobj); xs = (int)((flt)numcbrt * gsize.x); if (xs < 1) xs = 1; ys = (int)((flt)numcbrt * gsize.y); if (ys < 1) ys = 1; zs = (int)((flt)numcbrt * gsize.z); if (zs < 1) zs = 1; g = (grid *)newgrid(xs, ys, zs, gmin, gmax); engrid_objectlist(g, list); newobj = (objectlist *)rt_getmem(sizeof(objectlist)); newobj->obj = (object *)g; newobj->next = *list; *list = newobj; g->nextobj = gold->objects; gold->objects = (object *)g; } return 1; } static int engrid_objectlist(grid *g, objectlist **list) { objectlist *cur, *next, **prev; int numsucceeded = 0; if (*list == nullptr) return 0; prev = list; cur = *list; while (cur != nullptr) { next = cur->next; if (engrid_object(g, cur->obj)) { *prev = next; free(cur); numsucceeded++; } else { prev = &cur->next; } cur = next; } return numsucceeded; } static int engrid_object(grid *g, object *obj) { vector omin, omax; gridindex low, high; int x, y, z, zindex, yindex, voxindex; objectlist *tmp; if (obj->methods->bbox(obj, &omin, &omax)) { if 
(!pos2grid(g, &omin, &low) || !pos2grid(g, &omax, &high)) { return 0; /* object is not wholly contained in the grid */ } } else { return 0; /* object is unbounded */ } /* add the object to the complete list of objects in the grid */ obj->nextobj = g->objects; g->objects = obj; /* add this object to all voxels it inhabits */ for (z = low.z; z <= high.z; z++) { zindex = z * g->xsize * g->ysize; for (y = low.y; y <= high.y; y++) { yindex = y * g->xsize; for (x = low.x; x <= high.x; x++) { voxindex = x + yindex + zindex; tmp = (objectlist *)rt_getmem(sizeof(objectlist)); tmp->next = g->cells[voxindex]; tmp->obj = obj; g->cells[voxindex] = tmp; } } } return 1; } static int pos2grid(grid *g, vector *pos, gridindex *index) { index->x = (int)((pos->x - g->min.x) / g->voxsize.x); index->y = (int)((pos->y - g->min.y) / g->voxsize.y); index->z = (int)((pos->z - g->min.z) / g->voxsize.z); if (index->x == g->xsize) index->x--; if (index->y == g->ysize) index->y--; if (index->z == g->zsize) index->z--; if (index->x < 0 || index->x > g->xsize || index->y < 0 || index->y > g->ysize || index->z < 0 || index->z > g->zsize) return 0; if (pos->x < g->min.x || pos->x > g->max.x || pos->y < g->min.y || pos->y > g->max.y || pos->z < g->min.z || pos->z > g->max.z) return 0; return 1; } /* the real thing */ static void grid_intersect(grid *g, ray *ry) { flt tnear, tfar, offset; vector curpos, tmax, tdelta, pdeltaX, pdeltaY, pdeltaZ, nXp, nYp, nZp; gridindex curvox, step, out; int voxindex; objectlist *cur; if (ry->flags & RT_RAY_FINISHED) return; if (!grid_bounds_intersect(g, ry, &tnear, &tfar)) return; if (ry->maxdist < tnear) return; curpos = Raypnt(ry, tnear); pos2grid(g, &curpos, &curvox); offset = tnear; /* Setup X iterator stuff */ if (fabs(ry->d.x) < EPSILON) { tmax.x = FHUGE; tdelta.x = 0.0; step.x = 0; out.x = 0; /* never goes out of bounds on this axis */ } else if (ry->d.x < 0.0) { tmax.x = offset + ((voxel2x(g, curvox.x) - curpos.x) / ry->d.x); tdelta.x = g->voxsize.x / 
-ry->d.x; step.x = out.x = -1; } else { tmax.x = offset + ((voxel2x(g, curvox.x + 1) - curpos.x) / ry->d.x); tdelta.x = g->voxsize.x / ry->d.x; step.x = 1; out.x = g->xsize; } /* Setup Y iterator stuff */ if (fabs(ry->d.y) < EPSILON) { tmax.y = FHUGE; tdelta.y = 0.0; step.y = 0; out.y = 0; /* never goes out of bounds on this axis */ } else if (ry->d.y < 0.0) { tmax.y = offset + ((voxel2y(g, curvox.y) - curpos.y) / ry->d.y); tdelta.y = g->voxsize.y / -ry->d.y; step.y = out.y = -1; } else { tmax.y = offset + ((voxel2y(g, curvox.y + 1) - curpos.y) / ry->d.y); tdelta.y = g->voxsize.y / ry->d.y; step.y = 1; out.y = g->ysize; } /* Setup Z iterator stuff */ if (fabs(ry->d.z) < EPSILON) { tmax.z = FHUGE; tdelta.z = 0.0; step.z = 0; out.z = 0; /* never goes out of bounds on this axis */ } else if (ry->d.z < 0.0) { tmax.z = offset + ((voxel2z(g, curvox.z) - curpos.z) / ry->d.z); tdelta.z = g->voxsize.z / -ry->d.z; step.z = out.z = -1; } else { tmax.z = offset + ((voxel2z(g, curvox.z + 1) - curpos.z) / ry->d.z); tdelta.z = g->voxsize.z / ry->d.z; step.z = 1; out.z = g->zsize; } pdeltaX = ry->d; VScale(&pdeltaX, tdelta.x); pdeltaY = ry->d; VScale(&pdeltaY, tdelta.y); pdeltaZ = ry->d; VScale(&pdeltaZ, tdelta.z); nXp = Raypnt(ry, tmax.x); nYp = Raypnt(ry, tmax.y); nZp = Raypnt(ry, tmax.z); voxindex = curvox.z * g->xsize * g->ysize + curvox.y * g->xsize + curvox.x; while (1) { if (tmax.x < tmax.y && tmax.x < tmax.z) { cur = g->cells[voxindex]; while (cur != nullptr) { if (ry->mbox[cur->obj->id] != ry->serial) { ry->mbox[cur->obj->id] = ry->serial; cur->obj->methods->intersect(cur->obj, ry); } cur = cur->next; } curvox.x += step.x; if (ry->maxdist < tmax.x || curvox.x == out.x) break; voxindex += step.x; tmax.x += tdelta.x; curpos = nXp; nXp.x += pdeltaX.x; nXp.y += pdeltaX.y; nXp.z += pdeltaX.z; } else if (tmax.z < tmax.y) { cur = g->cells[voxindex]; while (cur != nullptr) { if (ry->mbox[cur->obj->id] != ry->serial) { ry->mbox[cur->obj->id] = ry->serial; 
cur->obj->methods->intersect(cur->obj, ry); } cur = cur->next; } curvox.z += step.z; if (ry->maxdist < tmax.z || curvox.z == out.z) break; voxindex += step.z * g->xsize * g->ysize; tmax.z += tdelta.z; curpos = nZp; nZp.x += pdeltaZ.x; nZp.y += pdeltaZ.y; nZp.z += pdeltaZ.z; } else { cur = g->cells[voxindex]; while (cur != nullptr) { if (ry->mbox[cur->obj->id] != ry->serial) { ry->mbox[cur->obj->id] = ry->serial; cur->obj->methods->intersect(cur->obj, ry); } cur = cur->next; } curvox.y += step.y; if (ry->maxdist < tmax.y || curvox.y == out.y) break; voxindex += step.y * g->xsize; tmax.y += tdelta.y; curpos = nYp; nYp.x += pdeltaY.x; nYp.y += pdeltaY.y; nYp.z += pdeltaY.z; } if (ry->flags & RT_RAY_FINISHED) break; } } static void voxel_intersect(grid *g, ray *ry, int voxindex) { objectlist *cur; cur = g->cells[voxindex]; while (cur != nullptr) { cur->obj->methods->intersect(cur->obj, ry); cur = cur->next; } } static int grid_bounds_intersect(grid *g, ray *ry, flt *nr, flt *fr) { flt a, tx1, tx2, ty1, ty2, tz1, tz2; flt tnear, tfar; tnear = -FHUGE; tfar = FHUGE; if (ry->d.x == 0.0) { if ((ry->o.x < g->min.x) || (ry->o.x > g->max.x)) return 0; } else { tx1 = (g->min.x - ry->o.x) / ry->d.x; tx2 = (g->max.x - ry->o.x) / ry->d.x; if (tx1 > tx2) { a = tx1; tx1 = tx2; tx2 = a; } if (tx1 > tnear) tnear = tx1; if (tx2 < tfar) tfar = tx2; } if (tnear > tfar) return 0; if (tfar < 0.0) return 0; if (ry->d.y == 0.0) { if ((ry->o.y < g->min.y) || (ry->o.y > g->max.y)) return 0; } else { ty1 = (g->min.y - ry->o.y) / ry->d.y; ty2 = (g->max.y - ry->o.y) / ry->d.y; if (ty1 > ty2) { a = ty1; ty1 = ty2; ty2 = a; } if (ty1 > tnear) tnear = ty1; if (ty2 < tfar) tfar = ty2; } if (tnear > tfar) return 0; if (tfar < 0.0) return 0; if (ry->d.z == 0.0) { if ((ry->o.z < g->min.z) || (ry->o.z > g->max.z)) return 0; } else { tz1 = (g->min.z - ry->o.z) / ry->d.z; tz2 = (g->max.z - ry->o.z) / ry->d.z; if (tz1 > tz2) { a = tz1; tz1 = tz2; tz2 = a; } if (tz1 > tnear) tnear = tz1; if (tz2 < tfar) tfar 
= tz2; } if (tnear > tfar) return 0; if (tfar < 0.0) return 0; *nr = tnear; *fr = tfar; return 1; } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/grid.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
 * grid.hpp - spatial subdivision efficiency structures
 *
 * $Id: grid.h,v 1.2 2007-02-22 17:54:15  Exp $
 *
 */

/* Public entry points; everything under GRID_PRIVATE is only declared when that macro is defined before inclusion. */
int engrid_scene(object **list);
object *newgrid(int xsize, int ysize, int zsize, vector min, vector max);

#ifdef GRID_PRIVATE

/* Singly linked list node referencing one object. */
typedef struct objectlist {
    struct objectlist *next; /* next link in the list */
    object *obj; /* the actual object */
} objectlist;

typedef struct {
    unsigned int id; /* Unique Object serial number    */
    void *nextobj; /* pointer to next object in list */
    object_methods *methods; /* this object's methods          */
    texture *tex; /* object texture                 */
    int xsize; /* number of cells along the X direction */
    int ysize; /* number of cells along the Y direction */
    int zsize; /* number of cells along the Z direction */
    vector min; /* the minimum coords for the box containing the grid */
    vector max; /* the maximum coords for the box containing the grid */
    vector voxsize; /* the size of a grid cell/voxel */
    object *objects; /* all objects contained in the grid */
    objectlist **cells; /* the grid cells themselves */
} grid;

typedef struct {
    int x; /* Voxel X address */
    int y; /* Voxel Y address */
    int z; /* Voxel Z address */
} gridindex;

/*
 * Convert from voxel number along X/Y/Z to corresponding coordinate.
 * Note: the g argument is expanded without parentheses, so pass a plain
 * pointer expression.
 */
#define voxel2x(g, X) ((X) * (g->voxsize.x) + (g->min.x))
#define voxel2y(g, Y) ((Y) * (g->voxsize.y) + (g->min.y))
#define voxel2z(g, Z) ((Z) * (g->voxsize.z) + (g->min.z))

/*
 * And vice-versa.
 * The result is not truncated to an integer by the macro itself.
 */
#define x2voxel(g, x) (((x)-g->min.x) / g->voxsize.x)
#define y2voxel(g, y) (((y)-g->min.y) / g->voxsize.y)
#define z2voxel(g, z) (((z)-g->min.z) / g->voxsize.z)

/* Prototypes for the helpers of the including translation unit; all but engrid_objlist are static. */
static int grid_bbox(void *obj, vector *min, vector *max);
static void grid_free(void *v);

static int cellbound(grid *g, gridindex *index, vector *cmin, vector *cmax);

void engrid_objlist(grid *g, object **list);
static int engrid_object(grid *g, object *obj);

static int engrid_objectlist(grid *g, objectlist **list);
static int engrid_cell(grid *, gridindex *);

static int pos2grid(grid *g, vector *pos, gridindex *index);
static void grid_intersect(grid *, ray *);
static void voxel_intersect(grid *g, ray *ry, int voxaddr);
static int grid_bounds_intersect(grid *g, ray *ry, flt *near, flt *far);

#endif
================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/imageio.cpp ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * imageio.cpp - This file deals with reading/writing image files */ /* For our purposes, we're interested only in the 3 byte per pixel 24 bit * truecolor sort of file.. */ #include #include "machine.hpp" #include "types.hpp" #include "util.hpp" #include "imageio.hpp" #include "ppm.hpp" /* PPM files */ #include "tgafile.hpp" /* Truevision Targa files */ static int fakeimage(char *name, int *xres, int *yres, unsigned char **imgdata) { int i, imgsize; fprintf(stderr, "Error loading image %s. 
Faking it.\n", name); *xres = 2; *yres = 2; imgsize = 3 * (*xres) * (*yres); *imgdata = (unsigned char *)rt_getmem(imgsize); for (i = 0; i < imgsize; i++) { (*imgdata)[i] = 255; } return IMAGENOERR; } int readimage(rawimage *img) { int rc; int xres, yres; unsigned char *imgdata = nullptr; char *name = img->name; if (strstr(name, ".ppm")) { rc = readppm(name, &xres, &yres, &imgdata); } else if (strstr(name, ".tga")) { rc = readtga(name, &xres, &yres, &imgdata); } else if (strstr(name, ".jpg")) { rc = IMAGEUNSUP; } else if (strstr(name, ".gif")) { rc = IMAGEUNSUP; } else if (strstr(name, ".png")) { rc = IMAGEUNSUP; } else if (strstr(name, ".tiff")) { rc = IMAGEUNSUP; } else if (strstr(name, ".rgb")) { rc = IMAGEUNSUP; } else if (strstr(name, ".xpm")) { rc = IMAGEUNSUP; } else { rc = readppm(name, &xres, &yres, &imgdata); } switch (rc) { case IMAGEREADERR: fprintf(stderr, "Short read encountered while loading image %s\n", name); rc = IMAGENOERR; /* remap to non-fatal error */ break; case IMAGEUNSUP: fprintf(stderr, "Cannot read unsupported image format for image %s\n", name); break; } /* If the image load failed, create a tiny white colored image to fake it */ /* this allows a scene to render even when a file can't be loaded */ if (rc != IMAGENOERR) { rc = fakeimage(name, &xres, &yres, &imgdata); } /* If we succeeded in loading the image, return it. */ if (rc == IMAGENOERR) { img->xres = xres; img->yres = yres; img->bpp = 3; img->data = imgdata; } return rc; } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/imageio.hpp ================================================ /* Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

/*
 *  imageio.hpp - This file deals with reading/writing image files
 *
 *  $Id: imageio.h,v 1.2 2007-02-22 17:54:15  Exp $
 */

/* For our purposes, we're interested only in the 3 byte per pixel 24 bit
   truecolor sort of file.. */

/* Status codes shared by the image readers. */
#define IMAGENOERR    0 /* no error */
#define IMAGEBADFILE  1 /* can't find or can't open the file */
#define IMAGEUNSUP    2 /* the image file is an unsupported format */
#define IMAGEALLOCERR 3 /* not enough remaining memory to load this image */
#define IMAGEREADERR  4 /* failed read, short reads etc */

/* Takes the image record to load and returns an int status. */
/* NOTE(review): presumably one of the IMAGE* codes above -- confirm against the definition */
int readimage(rawimage *);
================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/imap.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3.
The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * imap.cpp - This file contains code for doing image map type things. */ #include "machine.hpp" #include "types.hpp" #include "imap.hpp" #include "util.hpp" #include "imageio.hpp" rawimage *imagelist[MAXIMGS]; int numimages; void ResetImages(void) { int i; numimages = 0; for (i = 0; i < MAXIMGS; i++) { imagelist[i] = nullptr; } } void LoadImage(rawimage *image) { if (!image->loaded) { readimage(image); image->loaded = 1; } } color ImageMap(rawimage *image, flt u, flt v) { color col, colx, colx2; flt x, y, px, py; int x1, x2, y1, y2; unsigned char *ptr; unsigned char *ptr2; if (!image->loaded) { LoadImage(image); image->loaded = 1; } if ((u <= 1.0) && (u >= 0.0) && (v <= 1.0) && (v >= 0.0)) { x = (image->xres - 1.0) * u; /* floating point X location */ y = (image->yres - 1.0) * v; /* floating point Y location */ px = x - ((int)x); py = y - ((int)y); x1 = (int)x; x2 = x1 + 1; y1 = (int)y; y2 = y1 + 1; ptr = image->data + ((image->xres * y1) + x1) * 3; ptr2 = image->data + ((image->xres * y1) + x2) * 3; colx.r = (flt)((flt)ptr[0] + px * ((flt)ptr2[0] - (flt)ptr[0])) / 255.0; colx.g = (flt)((flt)ptr[1] + px * ((flt)ptr2[1] - 
(flt)ptr[1])) / 255.0; colx.b = (flt)((flt)ptr[2] + px * ((flt)ptr2[2] - (flt)ptr[2])) / 255.0; ptr = image->data + ((image->xres * y2) + x1) * 3; ptr2 = image->data + ((image->xres * y2) + x2) * 3; colx2.r = ((flt)ptr[0] + px * ((flt)ptr2[0] - (flt)ptr[0])) / 255.0; colx2.g = ((flt)ptr[1] + px * ((flt)ptr2[1] - (flt)ptr[1])) / 255.0; colx2.b = ((flt)ptr[2] + px * ((flt)ptr2[2] - (flt)ptr[2])) / 255.0; col.r = colx.r + py * (colx2.r - colx.r); col.g = colx.g + py * (colx2.g - colx.g); col.b = colx.b + py * (colx2.b - colx.b); } else { col.r = 0.0; col.g = 0.0; col.b = 0.0; } return col; } rawimage *AllocateImage(char *filename) { rawimage *newimage = nullptr; int i, intable; std::size_t len; intable = 0; if (numimages != 0) { for (i = 0; i < numimages; i++) { if (!strcmp(filename, imagelist[i]->name)) { newimage = imagelist[i]; intable = 1; } } } if (!intable) { newimage = (rawimage *)rt_getmem(sizeof(rawimage)); newimage->loaded = 0; newimage->xres = 0; newimage->yres = 0; newimage->bpp = 0; newimage->data = nullptr; len = strlen(filename); if (len > 80) rtbomb("Filename too long in image map!!"); strcpy(newimage->name, filename); imagelist[numimages] = newimage; /* add new one to the table */ numimages++; /* increment the number of images */ } return newimage; } void DeallocateImage(rawimage *image) { image->loaded = 0; rt_freemem(image->data); } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/imap.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * imap.h - This file contains defines etc for doing image map type things. 
* * $Id: imap.h,v 1.2 2007-02-22 17:54:15 Exp $ */ void ResetImage(void); void LoadImage(rawimage *); color ImageMap(rawimage *, flt, flt); rawimage *AllocateImage(char *); void DeallocateImage(rawimage *); void ResetImages(void); ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/intersect.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * intersect.cpp - This file contains code for CSG and intersection routines. */ #include "machine.hpp" #include "types.hpp" #include "intersect.hpp" #include "light.hpp" #include "util.hpp" #include "global.hpp" unsigned int new_objectid(void) { return numobjects++; /* global used to generate unique object ID's */ } unsigned int max_objectid(void) { return numobjects; } void add_object(object *obj) { object *objtemp; if (obj == nullptr) return; obj->id = new_objectid(); objtemp = rootobj; rootobj = obj; obj->nextobj = objtemp; } void free_objects(object *start) { object *cur; object *cur2; cur = start; while (cur->nextobj != nullptr) { cur2 = (object *)cur->nextobj; cur->methods->free(cur); cur = cur2; } free(cur); } void reset_object(void) { if (rootobj != nullptr) free_objects(rootobj); rootobj = nullptr; numobjects = 0; /* set number of objects back to 0 */ } void intersect_objects(ray *intray) { object *cur; object temp; temp.nextobj = rootobj; /* setup the initial object pointers.. 
*/ cur = &temp; /* ready, set */ while ((cur = (object *)cur->nextobj) != nullptr) cur->methods->intersect(cur, intray); } void reset_intersection(intersectstruct *intstruct) { intstruct->num = 0; intstruct->list[0].t = FHUGE; intstruct->list[0].obj = nullptr; intstruct->list[1].t = FHUGE; intstruct->list[1].obj = nullptr; } void add_intersection(flt t, object *obj, ray *ry) { intersectstruct *intstruct = ry->intstruct; if (t > EPSILON) { /* if we hit something before maxdist update maxdist */ if (t < ry->maxdist) { ry->maxdist = t; /* if we hit *anything* before maxdist, and we're firing a */ /* shadow ray, then we are finished ray tracing the shadow */ if (ry->flags & RT_RAY_SHADOW) ry->flags |= RT_RAY_FINISHED; } intstruct->num++; intstruct->list[intstruct->num].obj = obj; intstruct->list[intstruct->num].t = t; } } int closest_intersection(flt *t, object **obj, intersectstruct *intstruct) { int i; *t = FHUGE; for (i = 1; i <= intstruct->num; i++) { if (intstruct->list[i].t < *t) { *t = intstruct->list[i].t; *obj = intstruct->list[i].obj; } } return intstruct->num; } int shadow_intersection(intersectstruct *intstruct, flt maxdist) { int i; if (intstruct->num > 0) { for (i = 1; i <= intstruct->num; i++) { if ((intstruct->list[i].t < maxdist) && (intstruct->list[i].obj->tex->shadowcast == 1)) { return 1; } } } return 0; } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/intersect.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * intersect.h - This file contains the declarations and defines for the * functions that manage intersection, bounding and CSG.. 
* * $Id: intersect.h,v 1.2 2007-02-22 17:54:15 Exp $ */ unsigned int new_objectid(void); unsigned int max_objectid(void); void add_object(object *); void reset_object(void); void free_objects(object *); void intersect_objects(ray *); void reset_intersection(intersectstruct *); void add_intersection(flt, object *, ray *); int closest_intersection(flt *, object **, intersectstruct *); int next_intersection(object **, object *, intersectstruct *); int shadow_intersection(intersectstruct *intstruct, flt maxdist); ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/jpeg.cpp ================================================ /* Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * jpeg.cpp - This file deals with JPEG format image files (reading/writing) */ /* * This code requires support from the Independent JPEG Group's libjpeg. * For our purposes, we're interested only in the 3 byte per pixel 24 bit * RGB output. Probably won't implement any decent checking at this point. */ #include #include "machine.hpp" #include "types.hpp" #include "util.hpp" #include "imageio.hpp" /* error codes etc */ #include "jpeg.hpp" /* the protos for this file */ #if !defined(USEJPEG) int readjpeg(char *name, int *xres, int *yres, unsigned char **imgdata) { return IMAGEUNSUP; } #else #include "jpeglib.hpp" /* the IJG jpeg library headers */ int readjpeg(char *name, int *xres, int *yres, unsigned char **imgdata) { FILE *ifp; struct jpeg_decompress_struct cinfo; /* JPEG decompression struct */ struct jpeg_error_mgr jerr; /* JPEG Error handler */ JSAMPROW row_pointer[1]; /* output row buffer */ int row_stride; /* physical row width in output buf */ /* open input file before doing any JPEG decompression setup */ if ((ifp = fopen(name, "rb")) == nullptr) return IMAGEBADFILE; /* Could not open image, return error */ /* * Note: The Independent JPEG Group's library does not have a way * of returning errors without the use of setjmp/longjmp. 
* This is a problem in multi-threaded environment, since setjmp * and longjmp are declared thread-unsafe by many vendors currently. * For now, JPEG decompression errors will result in the "default" * error handling provided by the JPEG library, which is an error * message and a fatal call to exit(). I'll have to work around this * or find a reasonably thread-safe way of doing setjmp/longjmp.. */ cinfo.err = jpeg_std_error(&jerr); /* Set JPEG error handler to default */ jpeg_create_decompress(&cinfo); /* Create decompression context */ jpeg_stdio_src(&cinfo, ifp); /* Set input mechanism to stdio type */ jpeg_read_header(&cinfo, TRUE); /* Read the JPEG header for info */ jpeg_start_decompress(&cinfo); /* Prepare for actual decompression */ *xres = cinfo.output_width; /* set returned image width */ *yres = cinfo.output_height; /* set returned image height */ /* Calculate the size of a row in the image */ row_stride = cinfo.output_width * cinfo.output_components; /* Allocate the image buffer which will be returned to the ray tracer */ *imgdata = (unsigned char *)malloc(row_stride * cinfo.output_height); /* decompress the JPEG, one scanline at a time into the buffer */ while (cinfo.output_scanline < cinfo.output_height) { row_pointer[0] = &((*imgdata)[(cinfo.output_scanline) * row_stride]); jpeg_read_scanlines(&cinfo, row_pointer, 1); } jpeg_finish_decompress(&cinfo); /* Tell the JPEG library to cleanup */ jpeg_destroy_decompress(&cinfo); /* Destroy JPEG decompression context */ fclose(ifp); /* Close the input file */ return IMAGENOERR; /* No fatal errors */ } #endif ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/jpeg.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ /* * jpeg.h - This file deals with JPEG format image files (reading/writing) * * $Id: jpeg.h,v 1.2 2007-02-22 17:54:15 Exp $ */ int readjpeg(char *name, int *xres, int *yres, unsigned char **imgdata); ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/light.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * light.cpp - This file contains declarations and defines for light sources. */ #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "vector.hpp" #include "intersect.hpp" #include "util.hpp" #define LIGHT_PRIVATE #include "light.hpp" static object_methods light_methods = { (void (*)(void *, void *))(light_intersect), (void (*)(void *, void *, void *, void *))(light_normal), light_bbox, free }; point_light *newlight(void *tex, vector ctr, flt rad) { point_light *l; l = (point_light *)rt_getmem(sizeof(point_light)); memset(l, 0, sizeof(point_light)); l->methods = &light_methods; l->tex = (texture *)tex; l->ctr = ctr; l->rad = rad; return l; } static int light_bbox(void *obj, vector *min, vector *max) { return 0; /* lights are unbounded currently */ } static void light_intersect(point_light *l, ray *ry) { flt b, disc, t1, t2, temp; vector V; /* Lights do not cast shadows.. 
*/ if (ry->flags & RT_RAY_SHADOW) return; VSUB(l->ctr, ry->o, V); VDOT(b, V, ry->d); VDOT(temp, V, V); disc = b * b + l->rad * l->rad - temp; if (disc <= 0.0) return; disc = sqrt(disc); t2 = b + disc; if (t2 <= SPEPSILON) return; add_intersection(t2, (object *)l, ry); t1 = b - disc; if (t1 > SPEPSILON) add_intersection(t1, (object *)l, ry); } static void light_normal(point_light *l, vector *pnt, ray *incident, vector *N) { VSub((vector *)pnt, &(l->ctr), N); VNorm(N); if (VDot(N, &(incident->d)) > 0.0) { N->x = -N->x; N->y = -N->y; N->z = -N->z; } } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/light.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
 * light.h - this file includes declarations and defines for light sources.
 *
 * $Id: light.h,v 1.2 2007-02-22 17:54:15 Exp $
 */

/*
 * A point light, intersected as a sphere of radius `rad` around `ctr`.
 * NOTE(review): the first four members presumably mirror the common object
 * header, since light.cpp casts point_light pointers to object pointers -
 * confirm against the object declaration before reordering them.
 */
typedef struct {
    unsigned int id; /* Unique Object serial number */
    void *nextobj; /* pointer to next object in list */
    object_methods *methods; /* this object's methods */
    texture *tex; /* object texture */
    vector ctr; /* centre of the light */
    flt rad; /* radius of the light */
} point_light;

/* Create a point light from a texture, a centre and a radius. */
point_light *newlight(void *, vector, flt);

/* Implementation details, visible only to the file that defines */
/* LIGHT_PRIVATE before including this header. */
#ifdef LIGHT_PRIVATE
static int light_bbox(void *obj, vector *min, vector *max);
static void light_intersect(point_light *, ray *);
static void light_normal(point_light *, vector *, ray *, vector *);
#endif

================================================
FILE: third-party/tbb/examples/parallel_for/tachyon/src/machine.hpp
================================================
/* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ /* * machine.h - This is the machine specific include file * * $Id: machine.h,v 1.2 2007-02-22 17:54:15 Exp $ */ #include #include #include #include #include #include #include #define STDTIME ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/macros.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * macros.h - This file contains macro versions of functions that would be best * used as inlined code rather than function calls. * * $Id: macros.h,v 1.2 2007-02-22 17:54:15 Exp $ */ #define MYMAX(a, b) ((a) > (b) ? (a) : (b)) #define MYMIN(a, b) ((a) < (b) ? (a) : (b)) #define VDOT(return, a, b) return = (a.x * b.x + a.y * b.y + a.z * b.z); #define RAYPNT(c, a, b) \ c.x = a.o.x + (a.d.x * b); \ c.y = a.o.y + (a.d.y * b); \ c.z = a.o.z + (a.d.z * b); #define VSUB(a, b, c) \ c.x = (a.x - b.x); \ c.y = (a.y - b.y); \ c.z = (a.z - b.z); #define VCROSS(a, b, c) \ c->x = (a->y * b->z) - (a->z * b->y); \ c->y = (a->z * b->x) - (a->x * b->z); \ c->z = (a->x * b->y) - (a->y * b->x); ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/main.cpp ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. 
Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include #include #include #define VIDEO_WINMAIN_ARGS #include "types.hpp" #include "api.hpp" /* The ray tracing library API */ #include "parse.hpp" /* Support for my own file format */ #include "ui.hpp" #include "util.hpp" #include "tachyon_video.hpp" #include "common/utility/utility.hpp" #if WIN8UI_EXAMPLE #include "oneapi/tbb.h" volatile long global_startTime = 0; volatile long global_elapsedTime = 0; volatile bool global_isCancelled = false; volatile int global_number_of_threads; #endif SceneHandle global_scene; int global_xsize; /* size of graphic image rendered in window (from hres, vres) */ int global_ysize; int global_xwinsize; /* size of window (may be larger than above) */ int global_ywinsize; char *global_window_title; bool global_usegraphics; bool silent_mode = false; /* silent mode */ class tachyon_video *video = nullptr; typedef struct { int foundfilename; /* was a model file name found in the args? */ char filename[1024]; /* model file to render */ int useoutfilename; /* command line override of output filename */ char outfilename[1024]; /* name of output image file */ int verbosemode; /* verbose flags */ int antialiasing; /* antialiasing setting */ int displaymode; /* display mode */ int boundmode; /* bounding mode */ int boundthresh; /* bounding threshold */ int usecamfile; /* use camera file */ char camfilename[1024]; /* camera filename */ } argoptions; void initoptions(argoptions *opt) { memset(opt, 0, sizeof(argoptions)); opt->foundfilename = -1; opt->useoutfilename = -1; opt->verbosemode = -1; opt->antialiasing = -1; opt->displaymode = -1; opt->boundmode = -1; opt->boundthresh = -1; opt->usecamfile = -1; } #if WIN8UI_EXAMPLE int CreateScene() { char *filename = "Assets/balls.dat"; global_scene = rt_newscene(); rt_initialize(); if (readmodel(filename, global_scene) != 0) { rt_finalize(); return -1; } // need these early for create_graphics_window() so grab these here... 
scenedef *scene = (scenedef *)global_scene; // scene->hres and scene->vres should be equal to screen resolution scene->hres = global_xwinsize = global_xsize; scene->vres = global_ywinsize = global_ysize; return 0; } unsigned int __stdcall example_main(void *) { if (CreateScene() != 0) std::exit(-1); tachyon_video tachyon; tachyon.threaded = true; tachyon.init_console(); // always using window even if(!global_usegraphics) global_usegraphics = tachyon.init_window(global_xwinsize, global_ywinsize); if (!tachyon.running) std::exit(-1); video = &tachyon; for (;;) { global_elapsedTime = 0; global_startTime = (long)time(nullptr); global_isCancelled = false; if (video) video->running = true; oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, global_number_of_threads); memset(g_pImg, 0, sizeof(unsigned int) * global_xsize * global_ysize); tachyon.main_loop(); global_elapsedTime = (long)(time(nullptr) - global_startTime); video->running = false; //The timer to restart drawing then it is complete. int timer = 50; while ((!global_isCancelled && (timer--) > 0)) { rt_sleep(100); } } return nullptr; } #elif __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ #include "oneapi/tbb.h" #include "CoreFoundation/CoreFoundation.hpp" extern "C" void get_screen_resolution(int *x, int *y); int CreateScene() { CFURLRef balls_dat_url = CFBundleCopyResourceURL(CFBundleGetMainBundle(), CFSTR("balls"), CFSTR("dat"), nullptr); char filename[1024]; CFURLGetFileSystemRepresentation( balls_dat_url, true, (UInt8 *)filename, (CFIndex)sizeof(filename)); CFRelease(balls_dat_url); global_scene = rt_newscene(); rt_initialize(); if (readmodel(filename, global_scene) != 0) { rt_finalize(); return -1; } // need these early for create_graphics_window() so grab these here... 
scenedef *scene = (scenedef *)global_scene; get_screen_resolution(&global_xsize, &global_ysize); // scene->hres and scene->vres should be equal to screen resolution scene->hres = global_xwinsize = global_xsize; scene->vres = global_ywinsize = global_ysize; return 0; } int main(int argc, char *argv[]) { if (CreateScene() != 0) return -1; tachyon_video tachyon; tachyon.threaded = true; tachyon.init_console(); global_usegraphics = tachyon.init_window(global_xwinsize, global_ywinsize); if (!tachyon.running) return -1; //TODO: add a demo loop. video = &tachyon; if (video) video->running = true; memset(g_pImg, 0, sizeof(unsigned int) * global_xsize * global_ysize); tachyon.main_loop(); video->running = false; return 0; } #else static char *window_title_string(int argc, const char *argv[]) { int i; char *name; name = (char *)malloc(8192); char *title = getenv("TITLE"); if (title) strcpy(name, title); else { if (strrchr(argv[0], '\\')) strcpy(name, strrchr(argv[0], '\\') + 1); else if (strrchr(argv[0], '/')) strcpy(name, strrchr(argv[0], '/') + 1); else strcpy(name, *argv[0] ? 
argv[0] : "Tachyon"); } for (i = 1; i < argc; i++) { strcat(name, " "); strcat(name, argv[i]); } #ifdef _DEBUG strcat(name, " (DEBUG BUILD)"); #endif return name; } int useoptions(argoptions *opt, SceneHandle scene) { if (opt->useoutfilename == 1) { rt_outputfile(scene, opt->outfilename); } if (opt->verbosemode == 1) { rt_verbose(scene, 1); } if (opt->antialiasing != -1) { /* need new api code for this */ } if (opt->displaymode != -1) { rt_displaymode(scene, opt->displaymode); } if (opt->boundmode != -1) { rt_boundmode(scene, opt->boundmode); } if (opt->boundthresh != -1) { rt_boundthresh(scene, opt->boundthresh); } return 0; } argoptions ParseCommandLine(int argc, const char *argv[]) { argoptions opt; initoptions(&opt); bool nobounding = false; bool nodisp = false; std::string filename; utility::parse_cli_arguments( argc, argv, utility::cli_argument_pack() .positional_arg(filename, "dataset", "Model file") .positional_arg(opt.boundthresh, "boundthresh", "bounding threshold value") .arg(nodisp, "no-display-updating", "disable run-time display updating") .arg(nobounding, "no-bounding", "disable bounding technique") .arg(silent_mode, "silent", "no output except elapsed time")); strcpy(opt.filename, filename.c_str()); opt.displaymode = nodisp ? RT_DISPLAY_DISABLED : RT_DISPLAY_ENABLED; opt.boundmode = nobounding ? 
RT_BOUNDING_DISABLED : RT_BOUNDING_ENABLED; return opt; } int CreateScene(argoptions &opt) { char *filename; global_scene = rt_newscene(); rt_initialize(); /* process command line overrides */ useoptions(&opt, global_scene); #ifdef DEFAULT_MODELFILE #if _WIN32 || _WIN64 #define _GLUE_FILENAME(x) "..\\dat\\" #x #else #define _GLUE_FILENAME(x) #x #endif #define GLUE_FILENAME(x) _GLUE_FILENAME(x) if (opt.foundfilename == -1) filename = GLUE_FILENAME(DEFAULT_MODELFILE); else #endif //DEFAULT_MODELFILE filename = opt.filename; if (readmodel(filename, global_scene) != 0) { fprintf(stderr, "Parser returned a non-zero error code reading %s\n", filename); fprintf(stderr, "Aborting Render...\n"); rt_finalize(); return -1; } // need these early for create_graphics_window() so grab these here... scenedef *scene = (scenedef *)global_scene; global_xsize = scene->hres; global_ysize = scene->vres; global_xwinsize = global_xsize; global_ywinsize = global_ysize; // add some here to leave extra blank space on bottom for status etc. 
return 0; } int main(int argc, char *argv[]) { timer mainStartTime = gettimer(); global_window_title = window_title_string(argc, (const char **)argv); argoptions opt = ParseCommandLine(argc, (const char **)argv); if (CreateScene(opt) != 0) return -1; tachyon_video tachyon; tachyon.threaded = true; tachyon.init_console(); tachyon.title = global_window_title; // always using window even if(!global_usegraphics) global_usegraphics = tachyon.init_window(global_xwinsize, global_ywinsize); if (!tachyon.running) return -1; video = &tachyon; tachyon.main_loop(); utility::report_elapsed_time(timertime(mainStartTime, gettimer())); return 0; } #endif ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/objbound.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * objbound.cpp - This file contains the functions to find bounding boxes * for the various primitives */ #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "bndbox.hpp" #define OBJBOUND_PRIVATE #include "objbound.hpp" static void globalbound(object **rootlist, vector *gmin, vector *gmax) { vector min, max; object *cur; if (*rootlist == nullptr) /* don't bound non-existent objects */ return; gmin->x = FHUGE; gmin->y = FHUGE; gmin->z = FHUGE; gmax->x = -FHUGE; gmax->y = -FHUGE; gmax->z = -FHUGE; cur = *rootlist; while (cur != nullptr) { /* Go! 
*/ min.x = -FHUGE; min.y = -FHUGE; min.z = -FHUGE; max.x = FHUGE; max.y = FHUGE; max.z = FHUGE; cur->methods->bbox((void *)cur, &min, &max); gmin->x = MYMIN(gmin->x, min.x); gmin->y = MYMIN(gmin->y, min.y); gmin->z = MYMIN(gmin->z, min.z); gmax->x = MYMAX(gmax->x, max.x); gmax->y = MYMAX(gmax->y, max.y); gmax->z = MYMAX(gmax->z, max.z); cur = (object *)cur->nextobj; } } static int objinside(object *obj, vector *min, vector *max) { vector omin, omax; if (obj == nullptr) /* non-existent object, shouldn't get here */ return 0; if (obj->methods->bbox((void *)obj, &omin, &omax)) { if ((min->x <= omin.x) && (min->y <= omin.y) && (min->z <= omin.z) && (max->x >= omax.x) && (max->y >= omax.y) && (max->z >= omax.z)) { return 1; } } return 0; } static int countobj(object *root) { object *cur; /* counts the number of objects on a list */ int numobj; numobj = 0; cur = root; while (cur != nullptr) { cur = (object *)cur->nextobj; numobj++; } return numobj; } static void movenextobj(object *thisobj, object **root) { object *cur, *tmp; /* move the object after thisobj to the front of the object list */ /* headed by root */ if (thisobj != nullptr) { if (thisobj->nextobj != nullptr) { cur = (object *)thisobj->nextobj; /* the object to be moved */ thisobj->nextobj = cur->nextobj; /* link around the moved obj */ tmp = *root; /* store the root node */ cur->nextobj = tmp; /* attach root to cur */ *root = cur; /* make cur, the new root */ } } } static void octreespace(object **rootlist, int maxoctnodes) { object *cur; vector gmin, gmax, gctr; vector cmin1, cmin2, cmin3, cmin4, cmin5, cmin6, cmin7, cmin8; vector cmax1, cmax2, cmax3, cmax4, cmax5, cmax6, cmax7, cmax8; bndbox *box1, *box2, *box3, *box4; bndbox *box5, *box6, *box7, *box8; int skipobj; if (*rootlist == nullptr) /* don't subdivide non-existent data */ return; skipobj = 0; globalbound(rootlist, &gmin, &gmax); /* find global min and max */ gctr.x = ((gmax.x - gmin.x) / 2.0) + gmin.x; gctr.y = ((gmax.y - gmin.y) / 2.0) + gmin.y; 
gctr.z = ((gmax.z - gmin.z) / 2.0) + gmin.z; cmin1 = gmin; cmax1 = gctr; box1 = newbndbox(cmin1, cmax1); cmin2 = gmin; cmin2.x = gctr.x; cmax2 = gmax; cmax2.y = gctr.y; cmax2.z = gctr.z; box2 = newbndbox(cmin2, cmax2); cmin3 = gmin; cmin3.y = gctr.y; cmax3 = gmax; cmax3.x = gctr.x; cmax3.z = gctr.z; box3 = newbndbox(cmin3, cmax3); cmin4 = gmin; cmin4.x = gctr.x; cmin4.y = gctr.y; cmax4 = gmax; cmax4.z = gctr.z; box4 = newbndbox(cmin4, cmax4); cmin5 = gmin; cmin5.z = gctr.z; cmax5 = gctr; cmax5.z = gmax.z; box5 = newbndbox(cmin5, cmax5); cmin6 = gctr; cmin6.y = gmin.y; cmax6 = gmax; cmax6.y = gctr.y; box6 = newbndbox(cmin6, cmax6); cmin7 = gctr; cmin7.x = gmin.x; cmax7 = gctr; cmax7.y = gmax.y; cmax7.z = gmax.z; box7 = newbndbox(cmin7, cmax7); cmin8 = gctr; cmax8 = gmax; box8 = newbndbox(cmin8, cmax8); cur = *rootlist; while (cur != nullptr) { if (objinside((object *)cur->nextobj, &cmin1, &cmax1)) { movenextobj(cur, &box1->objlist); } else if (objinside((object *)cur->nextobj, &cmin2, &cmax2)) { movenextobj(cur, &box2->objlist); } else if (objinside((object *)cur->nextobj, &cmin3, &cmax3)) { movenextobj(cur, &box3->objlist); } else if (objinside((object *)cur->nextobj, &cmin4, &cmax4)) { movenextobj(cur, &box4->objlist); } else if (objinside((object *)cur->nextobj, &cmin5, &cmax5)) { movenextobj(cur, &box5->objlist); } else if (objinside((object *)cur->nextobj, &cmin6, &cmax6)) { movenextobj(cur, &box6->objlist); } else if (objinside((object *)cur->nextobj, &cmin7, &cmax7)) { movenextobj(cur, &box7->objlist); } else if (objinside((object *)cur->nextobj, &cmin8, &cmax8)) { movenextobj(cur, &box8->objlist); } else { skipobj++; cur = (object *)cur->nextobj; } } /* new scope, for redefinition of cur, and old */ { bndbox *cur, *old; old = box1; cur = box2; if (countobj(cur->objlist) > 0) { old->nextobj = cur; globalbound(&cur->objlist, &cur->min, &cur->max); old = cur; } cur = box3; if (countobj(cur->objlist) > 0) { old->nextobj = cur; globalbound(&cur->objlist, 
&cur->min, &cur->max); old = cur; } cur = box4; if (countobj(cur->objlist) > 0) { old->nextobj = cur; globalbound(&cur->objlist, &cur->min, &cur->max); old = cur; } cur = box5; if (countobj(cur->objlist) > 0) { old->nextobj = cur; globalbound(&cur->objlist, &cur->min, &cur->max); old = cur; } cur = box6; if (countobj(cur->objlist) > 0) { old->nextobj = cur; globalbound(&cur->objlist, &cur->min, &cur->max); old = cur; } cur = box7; if (countobj(cur->objlist) > 0) { old->nextobj = cur; globalbound(&cur->objlist, &cur->min, &cur->max); old = cur; } cur = box8; if (countobj(cur->objlist) > 0) { old->nextobj = cur; globalbound(&cur->objlist, &cur->min, &cur->max); old = cur; } old->nextobj = *rootlist; if (countobj(box1->objlist) > 0) { globalbound(&box1->objlist, &box1->min, &box1->max); *rootlist = (object *)box1; } else { *rootlist = (object *)box1->nextobj; } } /**** end of special cur and old scope */ if (countobj(box1->objlist) > maxoctnodes) { octreespace(&box1->objlist, maxoctnodes); } if (countobj(box2->objlist) > maxoctnodes) { octreespace(&box2->objlist, maxoctnodes); } if (countobj(box3->objlist) > maxoctnodes) { octreespace(&box3->objlist, maxoctnodes); } if (countobj(box4->objlist) > maxoctnodes) { octreespace(&box4->objlist, maxoctnodes); } if (countobj(box5->objlist) > maxoctnodes) { octreespace(&box5->objlist, maxoctnodes); } if (countobj(box6->objlist) > maxoctnodes) { octreespace(&box6->objlist, maxoctnodes); } if (countobj(box7->objlist) > maxoctnodes) { octreespace(&box7->objlist, maxoctnodes); } if (countobj(box8->objlist) > maxoctnodes) { octreespace(&box8->objlist, maxoctnodes); } } void dividespace(int maxoctnodes, object **toplist) { bndbox *gbox; vector gmin, gmax; if (countobj(*toplist) > maxoctnodes) { globalbound(toplist, &gmin, &gmax); octreespace(toplist, maxoctnodes); gbox = newbndbox(gmin, gmax); gbox->objlist = nullptr; gbox->tex = nullptr; gbox->nextobj = nullptr; gbox->objlist = *toplist; *toplist = (object *)gbox; } } 
================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/objbound.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * objbound.h - defines for object bounding code. * * $Id: objbound.h,v 1.2 2007-02-22 17:54:15 Exp $ */ void dividespace(int, object **); #ifdef OBJBOUND_PRIVATE static void globalbound(object **, vector *, vector *); static int objinside(object *obj, vector *min, vector *max); static int countobj(object *); static void movenextobj(object *, object **); static void octreespace(object **, int); #endif ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/parse.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * parse.cpp - an UltraLame (tm) parser for simple data files... */ // Try preventing lots of GCC warnings about ignored results of fscanf etc. #ifdef __GNUC__ // Starting from 4.5, GCC has a suppression option #pragma GCC diagnostic ignored "-Wunused-result" #endif #include #include #include #include #include /* needed for toupper(), macro.. 
*/ #include "types.hpp" #include "api.hpp" /* rendering API */ #define PARSE_INTERNAL #include "parse.hpp" /* self protos */ #undef PARSE_INTERNAL static texentry textable[NUMTEXS]; /* texture lookup table */ static texentry defaulttex; /* The default texture when a lookup fails */ static int numtextures; /* number of TEXDEF textures */ static int numobjectsparsed; /* total number of objects parsed so far */ static color scenebackcol; /* scene background color */ static int stringcmp(const char *a, const char *b) { std::size_t i, s, l; s = strlen(a); l = strlen(b); if (s != l) return 1; for (i = 0; i < s; i++) { if (toupper(a[i]) != toupper(b[i])) { return 1; } } return 0; } static void reset_tex_table(void) { apitexture apitex; numtextures = 0; memset(&textable, 0, sizeof(textable)); apitex.col.r = 1.0; apitex.col.g = 1.0; apitex.col.b = 1.0; apitex.ambient = 0.1; apitex.diffuse = 0.9; apitex.specular = 0.0; apitex.opacity = 1.0; apitex.texturefunc = 0; defaulttex.tex = rt_texture(&apitex); } static errcode add_texture(void *tex, char name[TEXNAMELEN]) { textable[numtextures].tex = tex; strcpy(textable[numtextures].name, name); numtextures++; if (numtextures > NUMTEXS) { fprintf(stderr, "Parse: %d textures allocated, texture slots full!\n", numtextures); numtextures--; /* keep writing over last texture if we've run out.. */ return PARSEALLOCERR; } return PARSENOERR; } static void *find_texture(char name[TEXNAMELEN]) { int i; for (i = 0; i < numtextures; i++) { if (strcmp(name, textable[i].name) == 0) return textable[i].tex; } fprintf(stderr, "Undefined texture '%s', using default. 
\n", name); return (defaulttex.tex); } apiflt degtorad(apiflt deg) { apiflt tmp; tmp = deg * 3.1415926 / 180.0; return tmp; } static void degvectoradvec(vector *degvec) { vector tmp; tmp.x = degtorad(degvec->x); tmp.y = degtorad(degvec->y); tmp.z = degtorad(degvec->z); *degvec = tmp; } static void InitRot3d(RotMat *rot, apiflt x, apiflt y, apiflt z) { rot->rx1 = cos(y) * cos(z); rot->rx2 = sin(x) * sin(y) * cos(z) - cos(x) * sin(z); rot->rx3 = sin(x) * sin(z) + cos(x) * cos(z) * sin(y); rot->ry1 = cos(y) * sin(z); rot->ry2 = cos(x) * cos(z) + sin(x) * sin(y) * sin(z); rot->ry3 = cos(x) * sin(y) * sin(z) - sin(x) * cos(z); rot->rz1 = sin(y); rot->rz2 = sin(x) * cos(y); rot->rz3 = cos(x) * cos(y); } static void Rotate3d(RotMat *rot, vector *vec) { vector tmp; tmp.x = (vec->x * (rot->rx1) + vec->y * (rot->rx2) + vec->z * (rot->rx3)); tmp.y = (vec->x * (rot->ry1) + vec->y * (rot->ry2) + vec->z * (rot->ry3)); tmp.z = (vec->x * (rot->rz1) + vec->y * (rot->rz2) + vec->z * (rot->rz3)); *vec = tmp; } static void Scale3d(vector *scale, vector *vec) { vec->x = vec->x * scale->x; vec->y = vec->y * scale->y; vec->z = vec->z * scale->z; } static void Trans3d(vector *trans, vector *vec) { vec->x += trans->x; vec->y += trans->y; vec->z += trans->z; } static errcode GetString(FILE *dfile, const char *string) { char data[255]; fscanf(dfile, "%s", data); if (stringcmp(data, string) != 0) { fprintf(stderr, "parse: Expected %s, got %s \n", string, data); fprintf(stderr, "parse: Error while parsing object: %d \n", numobjectsparsed); return PARSEBADSYNTAX; } return PARSENOERR; } unsigned int readmodel(char *modelfile, SceneHandle scene) { FILE *dfile; errcode rc; reset_tex_table(); dfile = nullptr; dfile = fopen(modelfile, "r"); if (dfile == nullptr) { return PARSEBADFILE; } rc = GetScenedefs(dfile, scene); if (rc != PARSENOERR) { fclose(dfile); return rc; } scenebackcol.r = 0.0; /* default background is black */ scenebackcol.g = 0.0; scenebackcol.b = 0.0; numobjectsparsed = 0; while 
((rc = GetObject(dfile, scene)) == PARSENOERR) { numobjectsparsed++; } fclose(dfile); if (rc == PARSEEOF) rc = PARSENOERR; rt_background(scene, scenebackcol); return rc; } static errcode GetScenedefs(FILE *dfile, SceneHandle scene) { vector Ccenter, Cview, Cup; apiflt zoom, aspectratio; int raydepth, antialiasing; char outfilename[200]; int xres, yres, verbose; float a, b, c; errcode rc = PARSENOERR; rc |= GetString(dfile, "BEGIN_SCENE"); rc |= GetString(dfile, "OUTFILE"); fscanf(dfile, "%s", outfilename); #ifdef _WIN32 if (strcmp(outfilename, "/dev/null") == 0) { strcpy(outfilename, "NUL:"); } #endif rc |= GetString(dfile, "RESOLUTION"); fscanf(dfile, "%d %d", &xres, &yres); rc |= GetString(dfile, "VERBOSE"); fscanf(dfile, "%d", &verbose); rt_scenesetup(scene, outfilename, xres, yres, verbose); rc |= GetString(dfile, "CAMERA"); rc |= GetString(dfile, "ZOOM"); fscanf(dfile, "%f", &a); zoom = a; rc |= GetString(dfile, "ASPECTRATIO"); fscanf(dfile, "%f", &b); aspectratio = b; rc |= GetString(dfile, "ANTIALIASING"); fscanf(dfile, "%d", &antialiasing); rc |= GetString(dfile, "RAYDEPTH"); fscanf(dfile, "%d", &raydepth); rc |= GetString(dfile, "CENTER"); fscanf(dfile, "%f %f %f", &a, &b, &c); Ccenter.x = a; Ccenter.y = b; Ccenter.z = c; rc |= GetString(dfile, "VIEWDIR"); fscanf(dfile, "%f %f %f", &a, &b, &c); Cview.x = a; Cview.y = b; Cview.z = c; rc |= GetString(dfile, "UPDIR"); fscanf(dfile, "%f %f %f", &a, &b, &c); Cup.x = a; Cup.y = b; Cup.z = c; rc |= GetString(dfile, "END_CAMERA"); rt_camerasetup(scene, zoom, aspectratio, antialiasing, raydepth, Ccenter, Cview, Cup); return rc; } static errcode GetObject(FILE *dfile, SceneHandle scene) { char objtype[80]; fscanf(dfile, "%s", objtype); if (!stringcmp(objtype, "END_SCENE")) { return PARSEEOF; /* end parsing */ } if (!stringcmp(objtype, "TEXDEF")) { return GetTexDef(dfile); } if (!stringcmp(objtype, "TEXALIAS")) { return GetTexAlias(dfile); } if (!stringcmp(objtype, "BACKGROUND")) { return GetBackGnd(dfile); } if 
(!stringcmp(objtype, "CYLINDER")) { return GetCylinder(dfile); } if (!stringcmp(objtype, "FCYLINDER")) { return GetFCylinder(dfile); } if (!stringcmp(objtype, "POLYCYLINDER")) { return GetPolyCylinder(dfile); } if (!stringcmp(objtype, "SPHERE")) { return GetSphere(dfile); } if (!stringcmp(objtype, "PLANE")) { return GetPlane(dfile); } if (!stringcmp(objtype, "RING")) { return GetRing(dfile); } if (!stringcmp(objtype, "BOX")) { return GetBox(dfile); } if (!stringcmp(objtype, "SCALARVOL")) { return GetVol(dfile); } if (!stringcmp(objtype, "TRI")) { return GetTri(dfile); } if (!stringcmp(objtype, "STRI")) { return GetSTri(dfile); } if (!stringcmp(objtype, "LIGHT")) { return GetLight(dfile); } if (!stringcmp(objtype, "SCAPE")) { return GetLandScape(dfile); } if (!stringcmp(objtype, "TPOLYFILE")) { return GetTPolyFile(dfile); } fprintf(stderr, "Found bad token: %s expected an object type\n", objtype); return PARSEBADSYNTAX; } static errcode GetVector(FILE *dfile, vector *v1) { float a, b, c; fscanf(dfile, "%f %f %f", &a, &b, &c); v1->x = a; v1->y = b; v1->z = c; return PARSENOERR; } static errcode GetColor(FILE *dfile, color *c1) { float r, g, b; int rc; rc = GetString(dfile, "COLOR"); fscanf(dfile, "%f %f %f", &r, &g, &b); c1->r = r; c1->g = g; c1->b = b; return rc; } static errcode GetTexDef(FILE *dfile) { char texname[TEXNAMELEN]; fscanf(dfile, "%s", texname); add_texture(GetTexBody(dfile), texname); return PARSENOERR; } static errcode GetTexAlias(FILE *dfile) { char texname[TEXNAMELEN]; char aliasname[TEXNAMELEN]; fscanf(dfile, "%s", texname); fscanf(dfile, "%s", aliasname); add_texture(find_texture(aliasname), texname); return PARSENOERR; } static errcode GetTexture(FILE *dfile, void **tex) { char tmp[255]; errcode rc = PARSENOERR; fscanf(dfile, "%s", tmp); if (!stringcmp("TEXTURE", tmp)) { *tex = GetTexBody(dfile); } else *tex = find_texture(tmp); return rc; } void *GetTexBody(FILE *dfile) { char tmp[255]; float a, b, c, d, phong, phongexp, phongtype; apitexture 
tex; void *voidtex; errcode rc; rc = GetString(dfile, "AMBIENT"); fscanf(dfile, "%f", &a); tex.ambient = a; rc |= GetString(dfile, "DIFFUSE"); fscanf(dfile, "%f", &b); tex.diffuse = b; rc |= GetString(dfile, "SPECULAR"); fscanf(dfile, "%f", &c); tex.specular = c; rc |= GetString(dfile, "OPACITY"); fscanf(dfile, "%f", &d); tex.opacity = d; fscanf(dfile, "%s", tmp); if (!stringcmp("PHONG", tmp)) { fscanf(dfile, "%s", tmp); if (!stringcmp("METAL", tmp)) { phongtype = RT_PHONG_METAL; } else if (!stringcmp("PLASTIC", tmp)) { phongtype = RT_PHONG_PLASTIC; } else { phongtype = RT_PHONG_PLASTIC; } fscanf(dfile, "%f", &phong); GetString(dfile, "PHONG_SIZE"); fscanf(dfile, "%f", &phongexp); fscanf(dfile, "%s", tmp); } else { phong = 0.0; phongexp = 100.0; phongtype = RT_PHONG_PLASTIC; } fscanf(dfile, "%f %f %f", &a, &b, &c); tex.col.r = a; tex.col.g = b; tex.col.b = c; rc |= GetString(dfile, "TEXFUNC"); fscanf(dfile, "%d", &tex.texturefunc); if (tex.texturefunc >= 7) { /* if its an image map, we need a filename */ fscanf(dfile, "%s", tex.imap); } if (tex.texturefunc != 0) { rc |= GetString(dfile, "CENTER"); rc |= GetVector(dfile, &tex.ctr); rc |= GetString(dfile, "ROTATE"); rc |= GetVector(dfile, &tex.rot); rc |= GetString(dfile, "SCALE"); rc |= GetVector(dfile, &tex.scale); } if (tex.texturefunc == 9) { rc |= GetString(dfile, "UAXIS"); rc |= GetVector(dfile, &tex.uaxs); rc |= GetString(dfile, "VAXIS"); rc |= GetVector(dfile, &tex.vaxs); } voidtex = rt_texture(&tex); rt_tex_phong(voidtex, phong, phongexp, (int)phongtype); return voidtex; } static errcode GetLight(FILE *dfile) { apiflt rad; vector ctr; apitexture tex; float a; errcode rc; memset(&tex, 0, sizeof(apitexture)); rc = GetString(dfile, "CENTER"); rc |= GetVector(dfile, &ctr); rc |= GetString(dfile, "RAD"); fscanf(dfile, "%f", &a); /* read in radius */ rad = a; rc |= GetColor(dfile, &tex.col); rt_light(rt_texture(&tex), ctr, rad); return rc; } static errcode GetBackGnd(FILE *dfile) { float r, g, b; fscanf(dfile, "%f 
%f %f", &r, &g, &b); scenebackcol.r = r; scenebackcol.g = g; scenebackcol.b = b; return PARSENOERR; } static errcode GetCylinder(FILE *dfile) { apiflt rad; vector ctr, axis; void *tex; float a; errcode rc; rc = GetString(dfile, "CENTER"); rc |= GetVector(dfile, &ctr); rc |= GetString(dfile, "AXIS"); rc |= GetVector(dfile, &axis); rc |= GetString(dfile, "RAD"); fscanf(dfile, "%f", &a); rad = a; rc |= GetTexture(dfile, &tex); rt_cylinder(tex, ctr, axis, rad); return rc; } static errcode GetFCylinder(FILE *dfile) { apiflt rad; vector ctr, axis; vector pnt1, pnt2; void *tex; float a; errcode rc; rc = GetString(dfile, "BASE"); rc |= GetVector(dfile, &pnt1); rc |= GetString(dfile, "APEX"); rc |= GetVector(dfile, &pnt2); ctr = pnt1; axis.x = pnt2.x - pnt1.x; axis.y = pnt2.y - pnt1.y; axis.z = pnt2.z - pnt1.z; rc |= GetString(dfile, "RAD"); fscanf(dfile, "%f", &a); rad = a; rc |= GetTexture(dfile, &tex); rt_fcylinder(tex, ctr, axis, rad); return rc; } static errcode GetPolyCylinder(FILE *dfile) { apiflt rad; vector *temp; void *tex; float a; int numpts, i; errcode rc; rc = GetString(dfile, "POINTS"); fscanf(dfile, "%d", &numpts); temp = (vector *)malloc(numpts * sizeof(vector)); for (i = 0; i < numpts; i++) { rc |= GetVector(dfile, &temp[i]); } rc |= GetString(dfile, "RAD"); fscanf(dfile, "%f", &a); rad = a; rc |= GetTexture(dfile, &tex); rt_polycylinder(tex, temp, numpts, rad); free(temp); return rc; } static errcode GetSphere(FILE *dfile) { apiflt rad; vector ctr; void *tex; float a; errcode rc; rc = GetString(dfile, "CENTER"); rc |= GetVector(dfile, &ctr); rc |= GetString(dfile, "RAD"); fscanf(dfile, "%f", &a); rad = a; rc |= GetTexture(dfile, &tex); rt_sphere(tex, ctr, rad); return rc; } static errcode GetPlane(FILE *dfile) { vector normal; vector ctr; void *tex; errcode rc; rc = GetString(dfile, "CENTER"); rc |= GetVector(dfile, &ctr); rc |= GetString(dfile, "NORMAL"); rc |= GetVector(dfile, &normal); rc |= GetTexture(dfile, &tex); rt_plane(tex, ctr, normal); return 
rc; }

/*
 * GetVol - read a scalar volume (MIN and MAX vectors, three DIM integers,
 * a FILE name and a texture) and add it with rt_scalarvol().
 *
 * Returns the OR of the status codes of the individual reads;
 * PARSEBADSYNTAX is added when the dimensions or the file name cannot be
 * read. The file name is limited to 254 characters so it always fits the
 * local buffer.
 */
static errcode GetVol(FILE *dfile) {
    vector minpt, maxpt;
    int x = 0, y = 0, z = 0;
    char fname[255] = "";
    void *tex;
    errcode rc;

    rc = GetString(dfile, "MIN");
    rc |= GetVector(dfile, &minpt);
    rc |= GetString(dfile, "MAX");
    rc |= GetVector(dfile, &maxpt);
    rc |= GetString(dfile, "DIM");
    if (fscanf(dfile, "%d %d %d ", &x, &y, &z) != 3) {
        rc |= PARSEBADSYNTAX;
    }
    rc |= GetString(dfile, "FILE");
    /* The field width keeps an over-long token from overrunning fname. */
    if (fscanf(dfile, "%254s", fname) != 1) {
        rc |= PARSEBADSYNTAX;
    }
    rc |= GetTexture(dfile, &tex);

    rt_scalarvol(tex, minpt, maxpt, x, y, z, fname, nullptr);
    return rc;
}

/*
 * GetBox - read a box (MIN vector, MAX vector, texture) and add it with
 * rt_box(). Returns the OR of the status codes of the individual reads.
 */
static errcode GetBox(FILE *dfile) {
    vector minpt, maxpt;
    void *tex;
    errcode rc;

    rc = GetString(dfile, "MIN");
    rc |= GetVector(dfile, &minpt);
    rc |= GetString(dfile, "MAX");
    rc |= GetVector(dfile, &maxpt);
    rc |= GetTexture(dfile, &tex);

    rt_box(tex, minpt, maxpt);
    return rc;
}

/*
 * GetRing - read a ring (CENTER vector, NORMAL vector, INNER and OUTER
 * values, texture) and add it with rt_ring().
 *
 * Returns the OR of the status codes of the individual reads;
 * PARSEBADSYNTAX is added when either radius cannot be read.
 */
static errcode GetRing(FILE *dfile) {
    vector normal;
    vector ctr;
    void *tex;
    float inner = 0.0f, outer = 0.0f;
    errcode rc;

    rc = GetString(dfile, "CENTER");
    rc |= GetVector(dfile, &ctr);
    rc |= GetString(dfile, "NORMAL");
    rc |= GetVector(dfile, &normal);
    rc |= GetString(dfile, "INNER");
    if (fscanf(dfile, " %f ", &inner) != 1) {
        rc |= PARSEBADSYNTAX;
    }
    rc |= GetString(dfile, "OUTER");
    if (fscanf(dfile, " %f ", &outer) != 1) {
        rc |= PARSEBADSYNTAX;
    }
    rc |= GetTexture(dfile, &tex);

    rt_ring(tex, ctr, normal, inner, outer);
    return rc;
}

/*
 * GetTri - read a triangle (vertices V0, V1, V2 and a texture) and add it
 * with rt_tri(). Returns the OR of the status codes of the individual reads.
 */
static errcode GetTri(FILE *dfile) {
    vector v0, v1, v2;
    void *tex;
    errcode rc;

    rc = GetString(dfile, "V0");
    rc |= GetVector(dfile, &v0);
    rc |= GetString(dfile, "V1");
    rc |= GetVector(dfile, &v1);
    rc |= GetString(dfile, "V2");
    rc |= GetVector(dfile, &v2);
    rc |= GetTexture(dfile, &tex);

    rt_tri(tex, v0, v1, v2);
    return rc;
}

static errcode GetSTri(FILE *dfile) { vector v0, v1, v2, n0, n1, n2; void *tex; errcode rc; rc = GetString(dfile, "V0"); rc |= GetVector(dfile, &v0); rc |= GetString(dfile, "V1"); rc |= GetVector(dfile, &v1); rc |= GetString(dfile, "V2"); rc |= GetVector(dfile, &v2); rc |= GetString(dfile, "N0"); rc |= GetVector(dfile, &n0); rc |= GetString(dfile, "N1"); rc |= GetVector(dfile, &n1); rc |= GetString(dfile, "N2"); rc |= GetVector(dfile, &n2); rc |= GetTexture(dfile, &tex); rt_stri(tex, v0, v1, v2, n0, n1, n2); return 
rc; } static errcode GetLandScape(FILE *dfile) { void *tex; vector ctr; apiflt wx, wy; int m, n; float a, b; errcode rc; rc = GetString(dfile, "RES"); fscanf(dfile, "%d %d", &m, &n); rc |= GetString(dfile, "SCALE"); fscanf(dfile, "%f %f", &a, &b); wx = a; wy = b; rc |= GetString(dfile, "CENTER"); rc |= GetVector(dfile, &ctr); rc |= GetTexture(dfile, &tex); rt_landscape(tex, m, n, ctr, wx, wy); return rc; } static errcode GetTPolyFile(FILE *dfile) { void *tex; vector ctr, rot, scale; vector v1, v2, v0; char ifname[255]; FILE *ifp; int v, totalpolys; RotMat RotA; errcode rc; totalpolys = 0; rc = GetString(dfile, "SCALE"); rc |= GetVector(dfile, &scale); rc |= GetString(dfile, "ROT"); rc |= GetVector(dfile, &rot); degvectoradvec(&rot); InitRot3d(&RotA, rot.x, rot.y, rot.z); rc |= GetString(dfile, "CENTER"); rc |= GetVector(dfile, &ctr); rc |= GetString(dfile, "FILE"); fscanf(dfile, "%s", ifname); rc |= GetTexture(dfile, &tex); if ((ifp = fopen(ifname, "r")) == nullptr) { fprintf(stderr, "Can't open data file %s for input!! Aborting...\n", ifname); return PARSEBADSUBFILE; } while (!feof(ifp)) { fscanf(ifp, "%d", &v); if (v != 3) { break; } totalpolys++; v = 0; rc |= GetVector(ifp, &v0); rc |= GetVector(ifp, &v1); rc |= GetVector(ifp, &v2); Scale3d(&scale, &v0); Scale3d(&scale, &v1); Scale3d(&scale, &v2); Rotate3d(&RotA, &v0); Rotate3d(&RotA, &v1); Rotate3d(&RotA, &v2); Trans3d(&ctr, &v0); Trans3d(&ctr, &v1); Trans3d(&ctr, &v2); rt_tri(tex, v1, v0, v2); } fclose(ifp); return rc; } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/parse.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * parse.h - this file contains defines for model file reading. 
* * $Id: parse.h,v 1.2 2007-02-22 17:54:16 Exp $ */

/* Parser status codes. Every non-zero code is a distinct bit, so several
   of them can be accumulated into one value with bitwise OR. */
#define PARSENOERR 0
#define PARSEBADFILE 1
#define PARSEBADSUBFILE 2
#define PARSEBADSYNTAX 4
#define PARSEEOF 8
#define PARSEALLOCERR 16

/* Public entry point of the model reader. NOTE(review): presumably takes
   the model file name and returns an OR of the PARSE* codes above -
   confirm against the definition in parse.cpp. */
unsigned int readmodel(char *, SceneHandle);

/* Everything below is only declared for translation units that define
   PARSE_INTERNAL before including this header. */
#ifdef PARSE_INTERNAL

/* NOTE(review): NUMTEXS looks like the capacity of the named-texture
   table - confirm. TEXNAMELEN is the size of a texture name buffer. */
#define NUMTEXS 32768
#define TEXNAMELEN 24

/* Nine doubles in three groups (rx*, ry*, rz*); presumably the rows of a
   3x3 rotation matrix - confirm against InitRot3d/Rotate3d. */
typedef struct {
    double rx1;
    double rx2;
    double rx3;
    double ry1;
    double ry2;
    double ry3;
    double rz1;
    double rz2;
    double rz3;
} RotMat;

/* A texture handle together with the name it is stored under. */
typedef struct {
    char name[TEXNAMELEN];
    void *tex;
} texentry;

/* When _ERRCODE_DEFINED is set, the typedef below actually declares
   errcode_t; presumably this avoids a clash with a platform-provided
   `errcode` - confirm. */
#ifdef _ERRCODE_DEFINED
#define errcode errcode_t
#endif //_ERRCODE_DEFINED
typedef unsigned int errcode;

/* File-local parser helpers; each returns an errcode. */
static errcode add_texture(void *tex, char name[TEXNAMELEN]);
static errcode GetString(FILE *, const char *);
static errcode GetScenedefs(FILE *, SceneHandle);
static errcode GetColor(FILE *, color *);
static errcode GetVector(FILE *, vector *);
static errcode GetTexDef(FILE *);
static errcode GetTexAlias(FILE *);
static errcode GetTexture(FILE *, void **);
/* Unlike its neighbours this one has external linkage and returns a
   texture handle rather than an errcode. */
void *GetTexBody(FILE *);
static errcode GetBackGnd(FILE *);
static errcode GetCylinder(FILE *);
static errcode GetFCylinder(FILE *);
static errcode GetPolyCylinder(FILE *);
static errcode GetSphere(FILE *);
static errcode GetPlane(FILE *);
static errcode GetRing(FILE *);
static errcode GetBox(FILE *);
static errcode GetVol(FILE *);
static errcode GetTri(FILE *);
static errcode GetSTri(FILE *);
static errcode GetLight(FILE *);
static errcode GetLandScape(FILE *);
static errcode GetTPolyFile(FILE *);
static errcode GetMGFFile(FILE *, SceneHandle);
static errcode GetObject(FILE *, SceneHandle);

#endif

================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/plane.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * plane.cpp - This file contains the functions for dealing with planes. 
*/ #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "vector.hpp" #include "intersect.hpp" #include "util.hpp" #define PLANE_PRIVATE #include "plane.hpp" static object_methods plane_methods = { (void (*)(void *, void *))(plane_intersect), (void (*)(void *, void *, void *, void *))(plane_normal), plane_bbox, free }; object *newplane(void *tex, vector ctr, vector norm) { plane *p; p = (plane *)rt_getmem(sizeof(plane)); memset(p, 0, sizeof(plane)); p->methods = &plane_methods; p->tex = (texture *)tex; p->norm = norm; VNorm(&p->norm); p->d = -VDot(&ctr, &p->norm); return (object *)p; } static int plane_bbox(void *obj, vector *min, vector *max) { return 0; } static void plane_intersect(plane *pln, ray *ry) { flt t, td; t = -(pln->d + VDot(&pln->norm, &ry->o)); td = VDot(&pln->norm, &ry->d); if (td != 0.0) { t /= td; if (t > 0.0) add_intersection(t, (object *)pln, ry); } } static void plane_normal(plane *pln, vector *pnt, ray *incident, vector *N) { *N = pln->norm; } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/plane.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * plane.h - This file contains the defines for planes etc. 
* * $Id: plane.h,v 1.2 2007-02-22 17:54:16 Exp $ */

/* Create a plane object from a texture handle, a point and a normal.
   Returns the new object as a generic object pointer. */
object *newplane(void *tex, vector ctr, vector norm);

/* The layout and the method prototypes are only visible to translation
   units that define PLANE_PRIVATE before including this header. */
#ifdef PLANE_PRIVATE

typedef struct {
    unsigned int id; /* Unique Object serial number */
    void *nextobj; /* pointer to next object in list */
    object_methods *methods; /* this object's methods */
    texture *tex; /* object texture */
    flt d; /* constant term of the plane equation */
    vector norm; /* plane normal */
} plane;

/* Ray/plane intersection test. */
static void plane_intersect(plane *, ray *);
/* Bounding-box query; takes the object and the two corners to fill. */
static int plane_bbox(void *obj, vector *min, vector *max);
/* Surface normal lookup; the result is written through the last argument. */
static void plane_normal(plane *, vector *, ray *incident, vector *);

#endif

================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/ppm.cpp ================================================ /* Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * ppm.cpp - This file deals with PPM format image files (reading/writing) */ /* For our purposes, we're interested only in the 3 byte per pixel 24 bit truecolor sort of file.. Probably won't implement any decent checking at this point, probably choke on things like the # comments.. */ // Try preventing lots of GCC warnings about ignored results of fscanf etc. 
#ifdef __GNUC__ #pragma GCC diagnostic ignored "-Wunused-result" #endif #include #include "machine.hpp" #include "types.hpp" #include "util.hpp" #include "imageio.hpp" /* error codes etc */ #include "ppm.hpp" static int getint(FILE *dfile) { char ch[200]; int i; int num; num = 0; while (num == 0) { fscanf(dfile, "%s", ch); while (ch[0] == '#') { fgets(ch, 200, dfile); } num = sscanf(ch, "%d", &i); } return i; } int readppm(char *name, int *xres, int *yres, unsigned char **imgdata) { char data[200]; FILE *ifp; int i; std::size_t bytesread; int datasize; ifp = fopen(name, "r"); if (ifp == nullptr) { return IMAGEBADFILE; /* couldn't open the file */ } fscanf(ifp, "%s", data); if (strcmp(data, "P6")) { fclose(ifp); return IMAGEUNSUP; /* not a format we support */ } *xres = getint(ifp); *yres = getint(ifp); i = getint(ifp); /* eat the maxval number */ fread(&i, 1, 1, ifp); /* eat the newline */ datasize = 3 * (*xres) * (*yres); *imgdata = (unsigned char *)rt_getmem(datasize); bytesread = fread(*imgdata, 1, datasize, ifp); fclose(ifp); if (bytesread != datasize) return IMAGEREADERR; return IMAGENOERR; } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/ppm.hpp ================================================ /* Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * ppm.h - This file deals with PPM format image files (reading/writing) * * $Id: ppm.h,v 1.2 2007-02-22 17:54:16 Exp $ */ /* For our purposes, we're interested only in the 3 byte per pixel 24 bit truecolor sort of file.. Probably won't implement any decent checking at this point, probably choke on things like the # comments.. 
*/

/*
 * Load the "P6" PPM file `name`. On success the width and height are
 * stored in *xres and *yres, and *imgdata receives a buffer of
 * 3 * width * height bytes. Returns IMAGENOERR on success, IMAGEBADFILE
 * when the file cannot be opened, IMAGEUNSUP when the magic number is not
 * "P6", and IMAGEREADERR when the pixel data is shorter than expected.
 */
int readppm(char *name, int *xres, int *yres, unsigned char **imgdata);

================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/pthread.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef EMULATE_PTHREADS #include #include "pthread_w.hpp" /* Basics */ int pthread_create(pthread_t *thread, pthread_attr_t *attr, void *(*start_routine)(void *), void *arg) { pthread_t th; if (thread == nullptr) return EINVAL; *thread = nullptr; if (start_routine == nullptr) return EINVAL; th = (pthread_t)malloc(sizeof(pthread_s)); memset(th, 0, sizeof(pthread_s)); th->winthread_handle = CreateThread(nullptr, 0, (LPTHREAD_START_ROUTINE)start_routine, arg, 0, &th->winthread_id); if (th->winthread_handle == nullptr) return EAGAIN; /* GetLastError() */ *thread = th; return 0; } int pthread_join(pthread_t th, void **thread_return) { BOOL b_ret; DWORD dw_ret; if (thread_return) *thread_return = nullptr; if ((th == nullptr) || (th->winthread_handle == nullptr)) return EINVAL; dw_ret = WaitForSingleObject(th->winthread_handle, INFINITE); if (dw_ret != WAIT_OBJECT_0) return ERROR_PTHREAD; /* dw_ret == WAIT_FAILED; GetLastError() */ if (thread_return) { BOOL e_ret; DWORD exit_val; e_ret = GetExitCodeThread(th->winthread_handle, &exit_val); if (!e_ret) return ERROR_PTHREAD; /* GetLastError() */ *thread_return = (void *)(std::size_t)exit_val; } b_ret = CloseHandle(th->winthread_handle); if (!b_ret) return ERROR_PTHREAD; /* GetLastError() */ memset(th, 0, sizeof(pthread_s)); free(th); th = nullptr; return 0; } void pthread_exit(void *retval) { /* specific to PTHREAD_TO_WINTHREAD */ ExitThread((DWORD)( (std::size_t)retval)); /* thread becomes signalled so its death can be waited upon */ 
/*NOTREACHED*/ assert(0); return; /* void fnc; can't return an error code */ } /* Mutex */ int pthread_mutex_init(pthread_mutex_t *mutex, pthread_mutexattr_t *mutex_attr) { InitializeCriticalSection(&mutex->critsec); return 0; } int pthread_mutex_destroy(pthread_mutex_t *mutex) { return 0; } int pthread_mutex_lock(pthread_mutex_t *mutex) { EnterCriticalSection(&mutex->critsec); return 0; } int pthread_mutex_unlock(pthread_mutex_t *mutex) { LeaveCriticalSection(&mutex->critsec); return 0; } #endif /* EMULATE_PTHREADS */ ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/pthread_w.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* Minimal pthread look-alike on top of Win32 threads and critical
   sections; compiled only when EMULATE_PTHREADS is defined. */
#ifdef EMULATE_PTHREADS

#ifndef _PTHREAD_H_DEFINED
#define _PTHREAD_H_DEFINED

/* NOTE(review): the header names of the bare #include directives in this
   file are missing in this copy. */
#include
#include

/* Fall back to EPERM when the platform has no ENOTSUP. */
#ifndef ENOTSUP
#define ENOTSUP EPERM
#endif

/* just need on Windows to get std::size_t defined */
#include

/* Error codes specific to this emulation layer. */
#define ERROR_PTHREAD 1000
#define ERROR_MODE 1001
#define ERROR_UNIMPL 1002

/* Basics */

/* Per-thread record: the Win32 thread handle and thread id. */
struct pthread_s {
    HANDLE winthread_handle;
    DWORD winthread_id;
};

typedef struct pthread_s *pthread_t; /* one of the few types that's pointer, not struct */

/* Placeholder: thread attributes carry no information yet. */
typedef struct {
    int i; /* not yet defined... */
} pthread_attr_t;

/* Mutex */

/* Placeholder: mutex attributes carry no information yet. */
typedef struct {
    int i; /* not yet defined... */
} pthread_mutexattr_t;

/* A mutex is a thin wrapper around a Win32 critical section. */
typedef struct {
    CRITICAL_SECTION critsec;
} pthread_mutex_t;

/* Function prototypes */

/* Start a thread running start_routine(arg); returns 0 on success,
   EINVAL for null thread/start_routine, EAGAIN if no thread is created. */
extern int pthread_create(pthread_t *thread,
                          pthread_attr_t *attr,
                          void *(*start_routine)(void *),
                          void *arg);

/* Wait for a thread to finish; optionally reports its exit code. */
extern int pthread_join(pthread_t th, void **thread_return);

/* End the calling thread with the given exit value. */
extern void pthread_exit(void *retval);

/* Initialize a mutex; always returns 0. */
extern int pthread_mutex_init(pthread_mutex_t *mutex, pthread_mutexattr_t *mutex_attr);

/* Destroy a mutex; always returns 0. */
extern int pthread_mutex_destroy(pthread_mutex_t *mutex);

/* Lock a mutex; always returns 0. */
extern int pthread_mutex_lock(pthread_mutex_t *mutex);

/* Unlock a mutex; always returns 0. */
extern int pthread_mutex_unlock(pthread_mutex_t *mutex);

#endif /* _PTHREAD_H_DEFINED */

#endif /* EMULATE_PTHREADS */

================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/quadric.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * quadric.cpp - This file contains the functions for dealing with quadrics. */ #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "quadric.hpp" #include "vector.hpp" #include "intersect.hpp" #include "util.hpp" int quadric_bbox(void *obj, vector *min, vector *max) { return 0; } static object_methods quadric_methods = { (void (*)(void *, void *))(quadric_intersect), (void (*)(void *, void *, void *, void *))( quadric_normal), quadric_bbox, free }; quadric *newquadric() { quadric *q; q = (quadric *)rt_getmem(sizeof(quadric)); memset(q, 0, sizeof(quadric)); q->ctr.x = 0.0; q->ctr.y = 0.0; q->ctr.z = 0.0; q->methods = &quadric_methods; return q; } void quadric_intersect(quadric *q, ray *ry) { flt Aq, Bq, Cq; flt t1, t2; flt disc; vector rd; vector ro; rd = ry->d; VNorm(&rd); ro.x = ry->o.x - q->ctr.x; ro.y = ry->o.y - q->ctr.y; ro.z = ry->o.z - q->ctr.z; Aq = (q->mat.a * (rd.x * rd.x)) + (2.0 * q->mat.b * rd.x * rd.y) + (2.0 * q->mat.c * rd.x * rd.z) + (q->mat.e * (rd.y * rd.y)) + (2.0 * q->mat.f * rd.y * rd.z) + (q->mat.h * (rd.z * rd.z)); Bq = 2.0 * ((q->mat.a * ro.x * rd.x) + (q->mat.b * ((ro.x * rd.y) + 
(rd.x * ro.y))) + (q->mat.c * ((ro.x * rd.z) + (rd.x * ro.z))) + (q->mat.d * rd.x) + (q->mat.e * ro.y * rd.y) + (q->mat.f * ((ro.y * rd.z) + (rd.y * ro.z))) + (q->mat.g * rd.y) + (q->mat.h * ro.z * rd.z) + (q->mat.i * rd.z)); Cq = (q->mat.a * (ro.x * ro.x)) + (2.0 * q->mat.b * ro.x * ro.y) + (2.0 * q->mat.c * ro.x * ro.z) + (2.0 * q->mat.d * ro.x) + (q->mat.e * (ro.y * ro.y)) + (2.0 * q->mat.f * ro.y * ro.z) + (2.0 * q->mat.g * ro.y) + (q->mat.h * (ro.z * ro.z)) + (2.0 * q->mat.i * ro.z) + q->mat.j; if (Aq == 0.0) { t1 = -Cq / Bq; add_intersection(t1, (object *)q, ry); } else { disc = (Bq * Bq - 4.0 * Aq * Cq); if (disc > 0.0) { disc = sqrt(disc); t1 = (-Bq + disc) / (2.0 * Aq); t2 = (-Bq - disc) / (2.0 * Aq); add_intersection(t1, (object *)q, ry); add_intersection(t2, (object *)q, ry); } } } void quadric_normal(quadric *q, vector *pnt, ray *incident, vector *N) { N->x = (q->mat.a * (pnt->x - q->ctr.x) + q->mat.b * (pnt->y - q->ctr.y) + q->mat.c * (pnt->z - q->ctr.z) + q->mat.d); N->y = (q->mat.b * (pnt->x - q->ctr.x) + q->mat.e * (pnt->y - q->ctr.y) + q->mat.f * (pnt->z - q->ctr.z) + q->mat.g); N->z = (q->mat.c * (pnt->x - q->ctr.x) + q->mat.f * (pnt->y - q->ctr.y) + q->mat.h * (pnt->z - q->ctr.z) + q->mat.i); VNorm(N); if (VDot(N, &(incident->d)) > 0.0) { N->x = -N->x; N->y = -N->y; N->z = -N->z; } } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/quadric.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * quadric.h - This file contains the defines for quadrics. 
*
 * $Id: quadric.h,v 1.2 2007-02-22 17:54:16 Exp $
 */

/*
 * Coefficients of a general quadric surface.
 *
 * As used by quadric_intersect() in quadric.cpp, with (x, y, z) measured
 * relative to the owning quadric's ctr, the surface is the set of points
 * satisfying
 *
 *   a*x*x + 2*b*x*y + 2*c*x*z + 2*d*x
 *         +   e*y*y + 2*f*y*z + 2*g*y
 *                   +   h*z*z + 2*i*z + j = 0
 */
typedef struct {
    flt a;
    flt b;
    flt c;
    flt d;
    flt e;
    flt f;
    flt g;
    flt h;
    flt i;
    flt j;
} quadmatrix;

/*
 * A quadric scene object. The first four members appear in the same order
 * in the ring and sphere structs, and quadric.cpp casts quadric pointers
 * to (object *); presumably they mirror a common object header declared
 * elsewhere -- confirm before reordering.
 */
typedef struct {
    unsigned int id; /* Unique Object serial number    */
    void *nextobj; /* pointer to next object in list */
    object_methods *methods; /* this object's methods          */
    texture *tex; /* object texture                 */
    vector ctr; /* centre; the matrix is evaluated relative to this point */
    quadmatrix mat; /* surface coefficients, see quadmatrix */
} quadric;

/* Allocates a zero-filled quadric centred at the origin with its method
   table attached; the caller fills in tex and mat. */
quadric *newquadric(void);
/* Reports the ray/quadric hits through add_intersection(). */
void quadric_intersect(quadric *, ray *);
/* Writes the unit normal at the given point into the last argument,
   flipped so it does not point along the incident ray direction. */
void quadric_normal(quadric *, vector *, ray *, vector *);

================================================
FILE: third-party/tbb/examples/parallel_for/tachyon/src/render.cpp
================================================
/*
    Copyright (c) 2005-2022 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

/*
    The original source for this example is
    Copyright (c) 1994-2008 John E. Stone
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:
    1. Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
    2. Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
    3. The name of the author may not be used to endorse or promote products
       derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * render.cpp - This file contains the main program and driver for the raytracer. */ #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "tgafile.hpp" #include "trace.hpp" #include "render.hpp" #include "util.hpp" #include "light.hpp" #include "global.hpp" #include "ui.hpp" #include "tachyon_video.hpp" #include "objbound.hpp" #include "grid.hpp" /* how many pieces to divide each scanline into */ #define NUMHORZDIV 1 void renderscene(scenedef scene) { //char msgtxt[2048]; //void * outfile; /* Grid based accerlation scheme */ if (scene.boundmode == RT_BOUNDING_ENABLED) engrid_scene(&rootobj); /* grid */ /* Not used now if (scene.verbosemode) { sprintf(msgtxt, "Opening %s for output.", scene.outfilename); rt_ui_message(MSG_0, msgtxt); } createtgafile(scene.outfilename, (unsigned short) scene.hres, (unsigned short) scene.vres); outfile = opentgafile(scene.outfilename); */ trace_region(scene, nullptr /*outfile*/, 0, 0, scene.hres, scene.vres); //fclose((FILE *)outfile); } /* end of renderscene() */ ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/render.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 
(the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

/*
 * render.h - This file contains the defines for the top level functions
 *
 * $Id: render.h,v 1.2 2007-02-22 17:54:16 Exp $
 */

/* Renders the scene passed by value: render.cpp builds the grid
   acceleration scheme when scene.boundmode is RT_BOUNDING_ENABLED and
   then calls trace_region() with 0, 0, scene.hres, scene.vres. */
void renderscene(scenedef);

================================================
FILE: third-party/tbb/examples/parallel_for/tachyon/src/ring.cpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

/*
    The original source for this example is
    Copyright (c) 1994-2008 John E. Stone
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:
    1. Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
    2. Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
    3. The name of the author may not be used to endorse or promote products
       derived from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
    WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * ring.cpp - This file contains the functions for dealing with rings. */ #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "vector.hpp" #include "intersect.hpp" #include "util.hpp" #define RING_PRIVATE #include "ring.hpp" static object_methods ring_methods = { (void (*)(void *, void *))(ring_intersect), (void (*)(void *, void *, void *, void *))(ring_normal), ring_bbox, free }; object *newring(void *tex, vector ctr, vector norm, flt inrad, flt outrad) { ring *r; r = (ring *)rt_getmem(sizeof(ring)); memset(r, 0, sizeof(ring)); r->methods = &ring_methods; r->tex = (texture *)tex; r->ctr = ctr; r->norm = norm; r->inrad = inrad; r->outrad = outrad; return (object *)r; } static int ring_bbox(void *obj, vector *min, vector *max) { ring *r = (ring *)obj; min->x = r->ctr.x - r->outrad; min->y = r->ctr.y - r->outrad; min->z = r->ctr.z - r->outrad; max->x = r->ctr.x + r->outrad; max->y = r->ctr.y + r->outrad; max->z = r->ctr.z + r->outrad; return 1; } static void ring_intersect(ring *rng, ray *ry) { flt d; flt t, td; vector hit, pnt; d = -VDot(&(rng->ctr), &(rng->norm)); t = -(d + VDot(&(rng->norm), &(ry->o))); td = VDot(&(rng->norm), &(ry->d)); if (td != 0.0) { t = t / td; if (t >= 0.0) { hit = Raypnt(ry, t); VSUB(hit, rng->ctr, pnt); VDOT(td, pnt, pnt); td = sqrt(td); if ((td > rng->inrad) && (td < rng->outrad)) add_intersection(t, (object *)rng, ry); } } } static void ring_normal(ring *rng, vector *pnt, ray *incident, vector *N) { *N = rng->norm; VNorm(N); if 
(VDot(N, &(incident->d)) > 0.0) { N->x = -N->x; N->y = -N->y; N->z = -N->z; } } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/ring.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
    OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
    ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
 * ring.h - This file contains the defines for rings etc.
 *
 * $Id: ring.h,v 1.2 2007-02-22 17:54:16 Exp $
 */

/* Creates a ring from a texture, centre, plane normal and inner/outer
   radius, returned as a generic object pointer. */
object *newring(void *tex, vector ctr, vector norm, flt in, flt out);

/* Everything below is only visible to a translation unit that defines
   RING_PRIVATE before including this header (ring.cpp does so). */
#ifdef RING_PRIVATE
typedef struct {
    unsigned int id; /* Unique Object serial number    */
    void *nextobj; /* pointer to next object in list */
    object_methods *methods; /* this object's methods          */
    texture *tex; /* object texture                 */
    vector ctr; /* centre of the ring */
    vector norm; /* normal of the plane the ring lies in; normalized on use by ring_normal() */
    flt inrad; /* inner radius: hits at or inside this distance are rejected */
    flt outrad; /* outer radius: hits at or beyond this distance are rejected */
} ring;

/* Fills min/max with a cube of half-width outrad around ctr; returns 1. */
static int ring_bbox(void *obj, vector *min, vector *max);
/* Reports a ray/ring hit through add_intersection(). */
static void ring_intersect(ring *, ray *);
/* Writes the unit plane normal, flipped so it does not point along the
   incident ray direction. */
static void ring_normal(ring *, vector *, ray *incident, vector *);
#endif

================================================
FILE: third-party/tbb/examples/parallel_for/tachyon/src/shade.cpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

/*
    The original source for this example is
    Copyright (c) 1994-2008 John E. Stone
    All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * shade.cpp - This file contains the functions that perform surface shading. 
*/

#include "machine.hpp"
#include "types.hpp"
#include "macros.hpp"
#include "light.hpp"
#include "intersect.hpp"
#include "vector.hpp"
#include "trace.hpp"
#include "global.hpp"
#include "shade.hpp"

/* Forget every registered light by resetting the global light count. */
void reset_lights(void) {
    numlights = 0;
}

/*
 * Append a light to the global light list.
 *
 * NOTE(review): there is no bounds check against the capacity of
 * lightlist, which is declared outside this file -- confirm that callers
 * cannot register more lights than it holds.
 */
void add_light(point_light* li) {
    lightlist[numlights] = li;
    numlights++;
}

/*
 * shader - compute the colour seen along the incident ray.
 *
 * Picks the closest intersection already recorded for the ray, evaluates
 * the object's texture function, accumulates diffuse and phong
 * contributions from every registered light that is not shadowed, and
 * finally adds reflection and transmission colours when the texture asks
 * for them. Returns the scene background when nothing was hit and the
 * texture colour alone when the hit object is itself a light.
 *
 * Side effects: incident->serial is advanced for each shadow ray, and the
 * intersection list in incident->intstruct is reset and reused for the
 * shadow tests.
 */
color shader(ray* incident) {
    color col, diffuse, phongcol;
    vector N, L, hit;
    ray shadowray;
    flt inten, t, Llen;
    object* obj;
    int numints, i;
    point_light* li;

    numints = closest_intersection(&t, &obj, incident->intstruct);
    /* find the number of intersections */
    /* and return the closest one.      */

    if (numints < 1) {
        /* if there weren't any object intersections then return the */
        /* background color for the pixel color.                     */
        return incident->scene->background;
    }

    if (obj->tex->islight) { /* if the current object is a light, then we  */
        return obj->tex->col; /* will only use the objects ambient color    */
    }

    /* RAYPNT, VSUB and VDOT below are statement macros used without a
       trailing semicolon. */
    RAYPNT(hit, (*incident), t) /* find the point of intersection from t */

    obj->methods->normal(obj, &hit, incident, &N); /* find the surface normal */

    /* execute the object's texture function */
    col = obj->tex->texfunc(&hit, obj->tex, incident);

    diffuse.r = 0.0;
    diffuse.g = 0.0;
    diffuse.b = 0.0;
    phongcol = diffuse;

    /* lights only matter when the texture has a diffuse or phong term */
    if ((obj->tex->diffuse > 0.0) || (obj->tex->phong > 0.0)) {
        for (i = 0; i < numlights; i++) { /* loop for light contributions */
            li = lightlist[i]; /* set li=to the current light  */
            VSUB(li->ctr, hit, L) /* find the light vector        */

            /* calculate the distance to the light from the hit point */
            /* (EPSILON is added, which keeps the divisions below away from zero) */
            Llen = sqrt(L.x * L.x + L.y * L.y + L.z * L.z) + EPSILON;

            L.x /= Llen; /* normalize the light direction vector */
            L.y /= Llen;
            L.z /= Llen;

            VDOT(inten, N, L) /* light intensity              */

            /* add in diffuse lighting for this light if we're facing it */
            if (inten > 0.0) {
                /* test for a shadow */
                /* the shadow ray starts at the hit point, heads towards
                   the light and is bounded by the distance to it */
                shadowray.intstruct = incident->intstruct;
                shadowray.flags = RT_RAY_SHADOW | RT_RAY_BOUNDED;
                incident->serial++;
                shadowray.serial = incident->serial;
                shadowray.mbox = incident->mbox;
                shadowray.o = hit;
                shadowray.d = L;
                shadowray.maxdist = Llen;
                shadowray.s = hit;
                shadowray.e = li->ctr;
                shadowray.scene = incident->scene;
                /* the incident ray's intersection list is cleared and
                   reused for the shadow test */
                reset_intersection(incident->intstruct);
                intersect_objects(&shadowray);

                if (!shadow_intersection(incident->intstruct, Llen)) {
                    /* XXX now that opacity is in the code, have to be more careful */
                    ColorAddS(&diffuse, &li->tex->col, inten);

                    /* phong type specular highlights */
                    if (obj->tex->phong > 0.0) {
                        flt phongval;
                        phongval = shade_phong(incident, &hit, &N, &L, obj->tex->phongexp);
                        /* phongtype selects the highlight colour: the
                           surface colour when non-zero, the light's
                           colour otherwise */
                        if (obj->tex->phongtype)
                            ColorAddS(&phongcol, &col, phongval);
                        else
                            ColorAddS(&phongcol, &(li->tex->col), phongval);
                    }
                }
            }
        }
    }

    ColorScale(&diffuse, obj->tex->diffuse);

    col.r *= (diffuse.r + obj->tex->ambient); /* do a product of the */
    col.g *= (diffuse.g + obj->tex->ambient); /* diffuse intensity with */
    col.b *= (diffuse.b + obj->tex->ambient); /* object color + ambient */

    if (obj->tex->phong > 0.0) {
        ColorAccum(&col, &phongcol);
    }

    /* spawn reflection rays if necessary */
    /* note: this will overwrite the old intersection list */
    if (obj->tex->specular > 0.0) {
        color specol;
        specol = shade_reflection(incident, &hit, &N, obj->tex->specular);
        ColorAccum(&col, &specol);
    }

    /* spawn transmission rays / refraction */
    /* note: this will overwrite the old intersection list */
    if (obj->tex->opacity < 1.0) {
        color transcol;
        transcol = shade_transmission(incident, &hit, 1.0 - obj->tex->opacity);
        ColorAccum(&col, &transcol);
    }

    return col; /* return the color of the shaded pixel... */
}

/*
 * shade_reflection - trace a secondary ray for the mirror reflection at
 * hit and return its colour scaled by specular.
 *
 * The new ray inherits the incident ray's intstruct, mbox and scene, has
 * depth one lower and serial one higher, and starts EPSILON along its own
 * direction from the hit point. incident->serial is updated afterwards
 * from the traced ray's serial.
 */
color shade_reflection(ray* incident, vector* hit, vector* N, flt specular) {
    ray specray;
    color col;
    vector R;

    /* presumably R = d - 2*(d . N)*N, with VAddS acting as a
       scale-and-add -- confirm against vector.hpp */
    VAddS(-2.0 * (incident->d.x * N->x + incident->d.y * N->y + incident->d.z * N->z), N, &incident->d, &R);

    specray.intstruct = incident->intstruct; /* what thread are we   */
    specray.depth = incident->depth - 1; /* go up a level in recursion depth */
    specray.flags = RT_RAY_REGULAR; /* infinite ray, to start with */
    specray.serial = incident->serial + 1; /* next serial number */
    specray.mbox = incident->mbox;
    specray.o = *hit;
    specray.d = R; /* reflect incident ray about normal */
    specray.o = Raypnt(&specray, EPSILON); /* avoid numerical precision bugs */
    specray.maxdist = FHUGE; /* take any intersection */
    specray.scene = incident->scene; /* global scenedef info */
    col = trace(&specray); /* trace specular reflection ray */

    incident->serial = specray.serial; /* update the serial number */

    ColorScale(&col, specular);

    return col;
}

/*
 * shade_transmission - trace a secondary ray that continues through the
 * surface along the incident direction (no bending is applied) and return
 * its colour scaled by trans.
 *
 * Ray setup and serial bookkeeping mirror shade_reflection().
 */
color shade_transmission(ray* incident, vector* hit, flt trans) {
    ray transray;
    color col;

    transray.intstruct = incident->intstruct; /* what thread are we   */
    transray.depth = incident->depth - 1; /* go up a level in recursion depth */
    transray.flags = RT_RAY_REGULAR; /* infinite ray, to start with */
    transray.serial = incident->serial + 1; /* update serial number */
    transray.mbox = incident->mbox;
    transray.o = *hit;
    transray.d = incident->d; /* ray continues along incident path */
    transray.o = Raypnt(&transray, EPSILON); /* avoid numerical precision bugs */
    transray.maxdist = FHUGE; /* take any intersection */
    transray.scene = incident->scene; /* global scenedef info */
    col = trace(&transray); /* trace transmission ray */

    incident->serial = transray.serial;

    ColorScale(&col, trans);

    return col;
}

/*
 * shade_phong - phong highlight intensity for one light.
 *
 * Builds H from the reversed incident direction and the light direction L
 * (their sum, halved and normalized), then returns (N . H) raised to
 * specpower, or 0 when N . H is not positive. The hit argument is not used.
 */
flt shade_phong(ray* incident, vector* hit, vector* N, vector* L, flt specpower) {
    vector H, V;
    flt inten;

    V = incident->d;
    VScale(&V, -1.0);
    VAdd(&V, L, &H);
    VScale(&H, 0.5);
    VNorm(&H);
    inten = VDot(N, &H);
    if (inten > 0.0)
        inten = pow(inten, specpower);
    else
        inten = 0.0;

    return inten;
}

================================================
FILE: third-party/tbb/examples/parallel_for/tachyon/src/shade.hpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

/*
    The original source for this example is
    Copyright (c) 1994-2008 John E. Stone
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:
    1. Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
    2. Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
    3. The name of the author may not be used to endorse or promote products
       derived from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
    WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
    OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
    ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
 * shade.h - This file contains declarations and definitions for the shader.
 *
 * $Id: shade.h,v 1.2 2007-02-22 17:54:16 Exp $
 */

/* Resets the global light count to zero. */
void reset_lights(void);
/* Appends a light to the global light list (no capacity check). */
void add_light(point_light *);
/* Returns the shaded colour for the closest intersection recorded on the ray. */
color shader(ray *);
/* Traces a reflected ray at the hit point (second argument) about the
   normal (third argument); the result is scaled by the last argument. */
color shade_reflection(ray *, vector *, vector *, flt);
/* Traces a ray continuing along the incident direction from the hit
   point; the result is scaled by the last argument. */
color shade_transmission(ray *, vector *, flt);
/* Returns the phong highlight intensity for light direction L, or 0 when
   the surface faces away from the half-way vector. */
flt shade_phong(ray *incident, vector *hit, vector *N, vector *L, flt specpower);

================================================
FILE: third-party/tbb/examples/parallel_for/tachyon/src/sphere.cpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

/*
    The original source for this example is
    Copyright (c) 1994-2008 John E. Stone
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:
    1. Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
    2.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * sphere.cpp - This file contains the functions for dealing with spheres. 
*/ #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "vector.hpp" #include "intersect.hpp" #include "util.hpp" #define SPHERE_PRIVATE #include "sphere.hpp" static object_methods sphere_methods = { (void (*)(void *, void *))(sphere_intersect), (void (*)(void *, void *, void *, void *))(sphere_normal), sphere_bbox, free }; object *newsphere(void *tex, vector ctr, flt rad) { sphere *s; s = (sphere *)rt_getmem(sizeof(sphere)); memset(s, 0, sizeof(sphere)); s->methods = &sphere_methods; s->tex = (texture *)tex; s->ctr = ctr; s->rad = rad; return (object *)s; } static int sphere_bbox(void *obj, vector *min, vector *max) { sphere *s = (sphere *)obj; min->x = s->ctr.x - s->rad; min->y = s->ctr.y - s->rad; min->z = s->ctr.z - s->rad; max->x = s->ctr.x + s->rad; max->y = s->ctr.y + s->rad; max->z = s->ctr.z + s->rad; return 1; } static void sphere_intersect(sphere *spr, ray *ry) { flt b, disc, t1, t2, temp; vector V; VSUB(spr->ctr, ry->o, V); VDOT(b, V, ry->d); VDOT(temp, V, V); disc = b * b + spr->rad * spr->rad - temp; if (disc <= 0.0) return; disc = sqrt(disc); t2 = b + disc; if (t2 <= SPEPSILON) return; add_intersection(t2, (object *)spr, ry); t1 = b - disc; if (t1 > SPEPSILON) add_intersection(t1, (object *)spr, ry); } static void sphere_normal(sphere *spr, vector *pnt, ray *incident, vector *N) { VSub((vector *)pnt, &(spr->ctr), N); VNorm(N); if (VDot(N, &(incident->d)) > 0.0) { N->x = -N->x; N->y = -N->y; N->z = -N->z; } } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/sphere.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * sphere.h - This file contains the defines for spheres etc. 
*
 * $Id: sphere.h,v 1.2 2007-02-22 17:54:16 Exp $
 */

/* Creates a sphere from a texture, centre and radius, returned as a
   generic object pointer. */
object *newsphere(void *, vector, flt);

/* Everything below is only visible to a translation unit that defines
   SPHERE_PRIVATE before including this header (sphere.cpp does so). */
#ifdef SPHERE_PRIVATE

typedef struct {
    unsigned int id; /* Unique Object serial number    */
    void *nextobj; /* pointer to next object in list */
    object_methods *methods; /* this object's methods          */
    texture *tex; /* object texture                 */
    vector ctr; /* centre of the sphere */
    flt rad; /* radius of the sphere */
} sphere;

/* Fills min/max with centre minus/plus radius on every axis; returns 1. */
static int sphere_bbox(void *obj, vector *min, vector *max);
/* Reports the ray/sphere hits through add_intersection(). */
static void sphere_intersect(sphere *, ray *);
/* Writes the unit normal at the given point, flipped so it does not point
   along the incident ray direction. */
static void sphere_normal(sphere *, vector *, ray *, vector *);
#endif /* SPHERE_PRIVATE */

================================================
FILE: third-party/tbb/examples/parallel_for/tachyon/src/tachyon_video.cpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

/*
    The original source for this example is
    Copyright (c) 1994-2008 John E. Stone
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:
    1. Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
    2. Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
    3.
The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include "types.hpp" #include "api.hpp" /* The ray tracing library API */ #include "ui.hpp" #include "util.hpp" #include "tachyon_video.hpp" extern SceneHandle global_scene; extern char *global_window_title; extern bool global_usegraphics; void tachyon_video::on_process() { char buf[8192]; flt runtime; scenedef *scene = (scenedef *)global_scene; updating_mode = scene->displaymode == RT_DISPLAY_ENABLED; recycling = false; pausing = false; do { updating = updating_mode; timer start_timer = gettimer(); rt_renderscene(global_scene); timer end_timer = gettimer(); runtime = timertime(start_timer, end_timer); sprintf(buf, "%s: %.3f seconds", global_window_title, runtime); rt_ui_message(MSG_0, buf); title = buf; show_title(); // show time spent for rendering if (!updating) { updating = true; drawing_memory dm = get_drawing_memory(); drawing_area drawing(0, 0, dm.sizex, dm.sizey); // invalidate whole screen } rt_finalize(); title = global_window_title; show_title(); // reset title to default } while (recycling && running); } void tachyon_video::on_key(int key) { key &= 0xff; recycling = true; if (key == esc_key) 
running = false; else if (key == ' ') { if (!updating) { updating = true; drawing_memory dm = get_drawing_memory(); drawing_area drawing(0, 0, dm.sizex, dm.sizey); // invalidate whole screen } updating = updating_mode = !updating_mode; } else if (key == 'p') { pausing = !pausing; if (pausing) { title = "Press ESC to exit or 'p' to continue after rendering completion"; show_title(); } } } void rt_finalize(void) { timer t0, t1; t0 = gettimer(); if (global_usegraphics) do { rt_sleep(1); t1 = gettimer(); } while ((timertime(t0, t1) < 10 || video->pausing) && video->next_frame()); #ifdef _WINDOWS else rt_sleep(10000); #endif } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/tachyon_video.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
/* GUI front end of the tachyon example: adds renderer-specific state flags
 * and callbacks on top of the shared `video` base class. */
class tachyon_video : public video {
public:
    bool updating_mode; /* display-update setting; on_process() copies it into `updating` before each render */
    bool recycling; /* set by on_key(); on_process() renders again while this and `running` hold */
    bool pausing; /* toggled by the 'p' key; rt_finalize() keeps waiting while it is set */
    void on_process(); /* render loop callback */
    void on_key(int key); /* keyboard callback */
};

/* Pointer to the active instance; the definition is not in this header. */
extern class tachyon_video *video;
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * texture.cpp - This file contains functions for implementing textures. 
*/ #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "texture.hpp" #include "coordsys.hpp" #include "imap.hpp" #include "vector.hpp" #include "box.hpp" /* plain vanilla texture solely based on object color */ color standard_texture(vector *hit, texture *tex, ray *ry) { return tex->col; } /* cylindrical image map */ color image_cyl_texture(vector *hit, texture *tex, ray *ry) { vector rh; flt u, v; rh.x = hit->x - tex->ctr.x; rh.z = hit->y - tex->ctr.y; rh.y = hit->z - tex->ctr.z; xyztocyl(rh, 1.0, &u, &v); u = u * tex->scale.x; u = u + tex->rot.x; u = fmod(u, 1.0); if (u < 0.0) u += 1.0; v = v * tex->scale.y; v = v + tex->rot.y; v = fmod(v, 1.0); if (v < 0.0) v += 1.0; return ImageMap((rawimage *)tex->img, u, v); } /* spherical image map */ color image_sphere_texture(vector *hit, texture *tex, ray *ry) { vector rh; flt u, v; rh.x = hit->x - tex->ctr.x; rh.y = hit->y - tex->ctr.y; rh.z = hit->z - tex->ctr.z; xyztospr(rh, &u, &v); u = u * tex->scale.x; u = u + tex->rot.x; u = fmod(u, 1.0); if (u < 0.0) u += 1.0; v = v * tex->scale.y; v = v + tex->rot.y; v = fmod(v, 1.0); if (v < 0.0) v += 1.0; return ImageMap((rawimage *)tex->img, u, v); } /* planar image map */ color image_plane_texture(vector *hit, texture *tex, ray *ry) { vector pnt; flt u, v; pnt.x = hit->x - tex->ctr.x; pnt.y = hit->y - tex->ctr.y; pnt.z = hit->z - tex->ctr.z; VDOT(u, tex->uaxs, pnt); /* VDOT(len, tex->uaxs, tex->uaxs); u = u / sqrt(len); */ VDOT(v, tex->vaxs, pnt); /* VDOT(len, tex->vaxs, tex->vaxs); v = v / sqrt(len); */ u = u * tex->scale.x; u = u + tex->rot.x; u = fmod(u, 1.0); if (u < 0.0) u += 1.0; v = v * tex->scale.y; v = v + tex->rot.y; v = fmod(v, 1.0); if (v < 0.0) v += 1.0; return ImageMap((rawimage *)tex->img, u, v); } color grit_texture(vector *hit, texture *tex, ray *ry) { int rnum; flt fnum; color col; rnum = rand() % 4096; fnum = (rnum / 4096.0 * 0.2) + 0.8; col.r = tex->col.r * fnum; col.g = tex->col.g * fnum; col.b = tex->col.b * fnum; return col; } 
color checker_texture(vector *hit, texture *tex, ray *ry) { long x, y, z; flt xh, yh, zh; color col; xh = hit->x - tex->ctr.x; x = (long)((fabs(xh) * 3) + 0.5); x = x % 2; yh = hit->y - tex->ctr.y; y = (long)((fabs(yh) * 3) + 0.5); y = y % 2; zh = hit->z - tex->ctr.z; z = (long)((fabs(zh) * 3) + 0.5); z = z % 2; if (((x + y + z) % 2) == 1) { col.r = 1.0; col.g = 0.2; col.b = 0.0; } else { col.r = 0.0; col.g = 0.2; col.b = 1.0; } return col; } color cyl_checker_texture(vector *hit, texture *tex, ray *ry) { long x, y; vector rh; flt u, v; color col; rh.x = hit->x - tex->ctr.x; rh.y = hit->y - tex->ctr.y; rh.z = hit->z - tex->ctr.z; xyztocyl(rh, 1.0, &u, &v); x = (long)(fabs(u) * 18.0); x = x % 2; y = (long)(fabs(v) * 10.0); y = y % 2; if (((x + y) % 2) == 1) { col.r = 1.0; col.g = 0.2; col.b = 0.0; } else { col.r = 0.0; col.g = 0.2; col.b = 1.0; } return col; } color wood_texture(vector *hit, texture *tex, ray *ry) { flt radius, angle; int grain; color col; flt x, y, z; x = (hit->x - tex->ctr.x) * 1000; y = (hit->y - tex->ctr.y) * 1000; z = (hit->z - tex->ctr.z) * 1000; radius = sqrt(x * x + z * z); if (z == 0.0) angle = 3.1415926 / 2.0; else angle = atan(x / z); radius = radius + 3.0 * sin(20 * angle + y / 150.0); grain = ((int)(radius + 0.5)) % 60; if (grain < 40) { col.r = 0.8; col.g = 1.0; col.b = 0.2; } else { col.r = 0.0; col.g = 0.0; col.b = 0.0; } return col; } #define NMAX 28 short int NoiseMatrix[NMAX][NMAX][NMAX]; void InitNoise(void) { byte_t x, y, z, i, j, k; for (x = 0; x < NMAX; x++) { for (y = 0; y < NMAX; y++) { for (z = 0; z < NMAX; z++) { NoiseMatrix[x][y][z] = rand() % 12000; if (x == NMAX - 1) i = 0; else i = x; if (y == NMAX - 1) j = 0; else j = y; if (z == NMAX - 1) k = 0; else k = z; NoiseMatrix[x][y][z] = NoiseMatrix[i][j][k]; } } } } int Noise(flt x, flt y, flt z) { byte_t ix, iy, iz; flt ox, oy, oz; int p000, p001, p010, p011; int p100, p101, p110, p111; int p00, p01, p10, p11; int p0, p1; int d00, d01, d10, d11; int d0, d1, d; x = fabs(x); 
y = fabs(y); z = fabs(z); ix = ((int)x) % (NMAX - 1); iy = ((int)y) % (NMAX - 1); iz = ((int)z) % (NMAX - 1); ox = (x - ((int)x)); oy = (y - ((int)y)); oz = (z - ((int)z)); p000 = NoiseMatrix[ix][iy][iz]; p001 = NoiseMatrix[ix][iy][iz + 1]; p010 = NoiseMatrix[ix][iy + 1][iz]; p011 = NoiseMatrix[ix][iy + 1][iz + 1]; p100 = NoiseMatrix[ix + 1][iy][iz]; p101 = NoiseMatrix[ix + 1][iy][iz + 1]; p110 = NoiseMatrix[ix + 1][iy + 1][iz]; p111 = NoiseMatrix[ix + 1][iy + 1][iz + 1]; d00 = p100 - p000; d01 = p101 - p001; d10 = p110 - p010; d11 = p111 - p011; p00 = (int)((int)d00 * ox) + p000; p01 = (int)((int)d01 * ox) + p001; p10 = (int)((int)d10 * ox) + p010; p11 = (int)((int)d11 * ox) + p011; d0 = p10 - p00; d1 = p11 - p01; p0 = (int)((int)d0 * oy) + p00; p1 = (int)((int)d1 * oy) + p01; d = p1 - p0; return (int)((int)d * oz) + p0; } color marble_texture(vector *hit, texture *tex, ray *ry) { flt i, d; flt x, y, z; color col; x = hit->x; y = hit->y; z = hit->z; x = x * 1.0; d = x + 0.0006 * Noise(x, (y * 1.0), (z * 1.0)); d = d * (((int)d) % 25); i = 0.0 + 0.10 * fabs(d - 10.0 - 20.0 * ((int)d * 0.05)); if (i > 1.0) i = 1.0; if (i < 0.0) i = 0.0; /* col.r=i * tex->col.r; col.g=i * tex->col.g; col.b=i * tex->col.b; */ col.r = (1.0 + sin(i * 6.28)) / 2.0; col.g = (1.0 + sin(i * 16.28)) / 2.0; col.b = (1.0 + cos(i * 30.28)) / 2.0; return col; } color gnoise_texture(vector *hit, texture *tex, ray *ry) { color col; flt f; f = Noise((hit->x - tex->ctr.x), (hit->y - tex->ctr.y), (hit->z - tex->ctr.z)); if (f < 0.01) f = 0.01; if (f > 1.0) f = 1.0; col.r = tex->col.r * f; col.g = tex->col.g * f; col.b = tex->col.b * f; return col; } void InitTextures(void) { InitNoise(); ResetImages(); } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/texture.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file 
except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * texture.h This file contains all of the includes and defines for the texture * mapping part of the shader. 
/* One-time setup of the texture subsystem (noise table and image table). */
void InitTextures(void);

/* Texture evaluators.  All share the same signature (hit point, texture,
 * ray) and return the surface color at the hit point. */
color standard_texture(vector *, texture *, ray *); /* plain texture color */
color image_cyl_texture(vector *, texture *, ray *); /* cylindrical image map */
color image_sphere_texture(vector *, texture *, ray *); /* spherical image map */
color image_plane_texture(vector *, texture *, ray *); /* planar image map */
color checker_texture(vector *, texture *, ray *); /* 3-D checkerboard */
color cyl_checker_texture(vector *, texture *, ray *); /* cylindrical checkerboard */
color grit_texture(vector *, texture *, ray *); /* randomly dimmed texture color */
color wood_texture(vector *, texture *, ray *); /* ring pattern */
color marble_texture(vector *, texture *, ray *); /* noise-perturbed bands */
color gnoise_texture(vector *, texture *, ray *); /* texture color scaled by noise */

/* Interpolated lattice noise at the given coordinates. */
int Noise(flt, flt, flt);

/* NOTE(review): duplicate of the first declaration in this header; harmless
 * but redundant. */
void InitTextures(void);
The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * tgafile.cpp - This file contains the code to write 24 bit targa files... */ #include "machine.hpp" #include "types.hpp" #include "util.hpp" #include "ui.hpp" #include "imageio.hpp" #include "tgafile.hpp" void createtgafile(char *name, unsigned short width, unsigned short height) { int filesize; FILE *ofp; filesize = 3 * width * height + 18 - 10; if (name == nullptr) std::exit(-1); else { ofp = fopen(name, "w+b"); if (ofp == nullptr) { char msgtxt[2048]; sprintf(msgtxt, "Cannot create %s for output!", name); rt_ui_message(MSG_ERR, msgtxt); rt_ui_message(MSG_ABORT, "Rendering Aborted."); std::exit(-1); } fputc(0, ofp); /* IdLength */ fputc(0, ofp); /* ColorMapType */ fputc(2, ofp); /* ImageTypeCode */ fputc(0, ofp); /* ColorMapOrigin, low byte */ fputc(0, ofp); /* ColorMapOrigin, high byte */ fputc(0, ofp); /* ColorMapLength, low byte */ fputc(0, ofp); /* ColorMapLength, high byte */ fputc(0, ofp); /* ColorMapEntrySize */ fputc(0, ofp); /* XOrigin, low byte */ fputc(0, ofp); /* XOrigin, high byte */ fputc(0, ofp); /* YOrigin, low byte */ fputc(0, ofp); /* YOrigin, high byte */ fputc((width & 0xff), ofp); /* Width, low byte */ 
fputc(((width >> 8) & 0xff), ofp); /* Width, high byte */ fputc((height & 0xff), ofp); /* Height, low byte */ fputc(((height >> 8) & 0xff), ofp); /* Height, high byte */ fputc(24, ofp); /* ImagePixelSize */ fputc(0x20, ofp); /* ImageDescriptorByte 0x20 == flip vertically */ fseek(ofp, filesize, 0); fprintf(ofp, "9876543210"); fclose(ofp); } } void *opentgafile(char *filename) { FILE *ofp; ofp = fopen(filename, "r+b"); if (ofp == nullptr) { char msgtxt[2048]; sprintf(msgtxt, "Cannot open %s for output!", filename); rt_ui_message(MSG_ERR, msgtxt); rt_ui_message(MSG_ABORT, "Rendering Aborted."); std::exit(-1); } return ofp; } void writetgaregion(void *voidofp, int iwidth, int iheight, int startx, int starty, int stopx, int stopy, char *buffer) { int y, totalx, totaly; char *bufpos; long filepos; std::size_t numbytes; FILE *ofp = (FILE *)voidofp; totalx = stopx - startx + 1; totaly = stopy - starty + 1; for (y = 0; y < totaly; y++) { bufpos = buffer + (totalx * 3) * (totaly - y - 1); filepos = 18 + iwidth * 3 * (iheight - starty - totaly + y + 1) + (startx - 1) * 3; if (filepos >= 18) { fseek(ofp, filepos, 0); numbytes = fwrite(bufpos, 3, totalx, ofp); if (numbytes != totalx) { char msgtxt[256]; sprintf(msgtxt, "File write problem, %d bytes written.", (int)numbytes); rt_ui_message(MSG_ERR, msgtxt); } } else { rt_ui_message(MSG_ERR, "writetgaregion: file ptr out of range!!!\n"); return; /* don't try to continue */ } } } int readtga(char *name, int *xres, int *yres, unsigned char **imgdata) { int format, width, height, w1, w2, h1, h2, depth, flags; int imgsize, i, tmp; std::size_t bytesread; FILE *ifp; ifp = fopen(name, "r"); if (ifp == nullptr) { return IMAGEBADFILE; /* couldn't open the file */ } /* read the targa header */ getc(ifp); /* ID length */ getc(ifp); /* colormap type */ format = getc(ifp); /* image type */ getc(ifp); /* color map origin */ getc(ifp); /* color map origin */ getc(ifp); /* color map length */ getc(ifp); /* color map length */ getc(ifp); /* 
color map entry size */ getc(ifp); /* x origin */ getc(ifp); /* x origin */ getc(ifp); /* y origin */ getc(ifp); /* y origin */ w1 = getc(ifp); /* width (low) */ w2 = getc(ifp); /* width (hi) */ h1 = getc(ifp); /* height (low) */ h2 = getc(ifp); /* height (hi) */ depth = getc(ifp); /* image pixel size */ flags = getc(ifp); /* image descriptor byte */ if ((format != 2) || (depth != 24)) { fclose(ifp); return IMAGEUNSUP; /* unsupported targa format */ } width = ((w2 << 8) | w1); height = ((h2 << 8) | h1); imgsize = 3 * width * height; *imgdata = (unsigned char *)rt_getmem(imgsize); bytesread = fread(*imgdata, 1, imgsize, ifp); fclose(ifp); /* flip image vertically */ if (flags == 0x20) { int rowsize = 3 * width; unsigned char *copytmp; copytmp = (unsigned char *)malloc(rowsize); for (i = 0; i < height / 2; i++) { memcpy(copytmp, &((*imgdata)[rowsize * i]), rowsize); memcpy(&(*imgdata)[rowsize * i], &(*imgdata)[rowsize * (height - 1 - i)], rowsize); memcpy(&(*imgdata)[rowsize * (height - 1 - i)], copytmp, rowsize); } free(copytmp); } /* convert from BGR order to RGB order */ for (i = 0; i < imgsize; i += 3) { tmp = (*imgdata)[i]; /* Blue */ (*imgdata)[i] = (*imgdata)[i + 2]; /* Red */ (*imgdata)[i + 2] = tmp; /* Blue */ } *xres = width; *yres = height; if (bytesread != imgsize) return IMAGEREADERR; return IMAGENOERR; } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/tgafile.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* declare other functions */

/* Create an empty 24-bit Targa file: (file name, width, height). */
void createtgafile(char *, unsigned short, unsigned short);

/* Open an existing Targa file for updating; returns an opaque file handle. */
void *opentgafile(char *);

/* Write a pixel region into an open file:
 * (handle, image width, image height, startx, starty, stopx, stopy, pixels). */
void writetgaregion(void *, int, int, int, int, int, int, char *);

/* Load a 24-bit Targa image; returns one of the IMAGE* status codes and
 * hands back the dimensions and the pixel buffer through the out-parameters. */
int readtga(char *name, int *xres, int *yres, unsigned char **imgdata);
/* Output buffer shared with other translation units; defined elsewhere. */
extern char *global_buffer;

/* Parameters handed to thread_trace(). */
typedef struct {
    int tid; /* NOTE(review): presumably the worker's thread index -- confirm */
    int nthr; /* NOTE(review): presumably the total number of threads -- confirm */
    scenedef scene; /* scene to render (copied by value) */
    char *buffer; /* NOTE(review): presumably the destination pixel buffer -- confirm */
    int startx; /* first column to render */
    int stopx; /* column bound; the OpenMP tracer treats it as exclusive */
    int starty; /* first row to render */
    int stopy; /* row bound; the OpenMP tracer treats it as exclusive */
} thr_parms;

/* Rectangular sub-region of the image. */
typedef struct {
    int startx;
    int stopx;
    int starty;
    int stopy;
} patch;

/* Parameters for the I/O thread; the fields mirror the arguments of
 * writetgaregion(). */
typedef struct {
    void *tga; /* open Targa file handle */
    int iwidth; /* full image width */
    int iheight; /* full image height */
    int startx;
    int starty;
    int stopx;
    int stopy;
    char *buffer; /* pixel data for the region */
} thr_io_parms;

/* Trace a single ray and return its color. */
color trace(ray *);

/* Render the region described by `parms`; each tracer variant (OpenMP,
 * serial, ...) supplies its own definition. */
void *thread_trace(thr_parms *parms);
void thread_trace1(thr_parms *, patch *, int depth);
void thread_trace2(thr_parms *, patch *);
void *thread_io(void *);
void trace_shm(scenedef, /*char *,*/ int, int, int, int);
void trace_region(scenedef, void *, int, int, int, int);
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "vector.hpp" #include "tgafile.hpp" #include "trace.hpp" #include "light.hpp" #include "shade.hpp" #include "camera.hpp" #include "util.hpp" #include "intersect.hpp" #include "global.hpp" #include "ui.hpp" #include "tachyon_video.hpp" // shared but read-only so could be private too static thr_parms *all_parms; static scenedef scene; static int startx; static int stopx; static int starty; static int stopy; static flt jitterscale; static int totaly; static color_t render_one_pixel(int x, int y, unsigned int *local_mbox, unsigned int &serial, int startx, int stopx, int starty, int stopy) { /* private vars moved inside loop */ ray primary, sample; color col, avcol; int R, G, B; intersectstruct local_intersections; int alias; /* end private */ primary = camray(&scene, x, y); primary.intstruct = &local_intersections; primary.flags = RT_RAY_REGULAR; serial++; primary.serial = serial; primary.mbox = local_mbox; primary.maxdist = FHUGE; primary.scene = &scene; col = trace(&primary); serial = primary.serial; /* perform antialiasing if enabled.. */ if (scene.antialiasing > 0) { for (alias = 0; alias < scene.antialiasing; alias++) { serial++; /* increment serial number */ sample = primary; /* copy the regular primary ray to start with */ sample.serial = serial; #pragma omp critical { sample.d.x += ((rand() % 100) - 50) / jitterscale; sample.d.y += ((rand() % 100) - 50) / jitterscale; sample.d.z += ((rand() % 100) - 50) / jitterscale; } avcol = trace(&sample); serial = sample.serial; /* update our overall serial # */ col.r += avcol.r; col.g += avcol.g; col.b += avcol.b; } col.r /= (scene.antialiasing + 1.0); col.g /= (scene.antialiasing + 1.0); col.b /= (scene.antialiasing + 1.0); } /* Handle overexposure and underexposure here... 
*/ R = (int)(col.r * 255); if (R > 255) R = 255; else if (R < 0) R = 0; G = (int)(col.g * 255); if (G > 255) G = 255; else if (G < 0) G = 0; B = (int)(col.b * 255); if (B > 255) B = 255; else if (B < 0) B = 0; return video->get_color(R, G, B); } static void parallel_thread(void) { // thread-local storage unsigned int serial = 1; unsigned int mboxsize = sizeof(unsigned int) * (max_objectid() + 20); unsigned int *local_mbox = (unsigned int *)alloca(mboxsize); memset(local_mbox, 0, mboxsize); #pragma omp for nowait schedule(runtime) for (int y = starty; y < stopy; y++) { if (!video->running) continue; drawing_area drawing(startx, totaly - y, stopx - startx, 1); for (int x = startx; x < stopx; x++) { color_t c = render_one_pixel(x, y, local_mbox, serial, startx, stopx, starty, stopy); drawing.put_pixel(c); } video->next_frame(); } } void *thread_trace(thr_parms *parms) { // shared but read-only so could be private too all_parms = parms; scene = parms->scene; startx = parms->startx; stopx = parms->stopx; starty = parms->starty; stopy = parms->stopy; jitterscale = 40.0 * (scene.hres + scene.vres); totaly = parms->scene.vres - 1; #pragma omp parallel parallel_thread(); return (nullptr); } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/trace.serial.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "vector.hpp" #include "tgafile.hpp" #include "trace.hpp" #include "light.hpp" #include "shade.hpp" #include "camera.hpp" #include "util.hpp" #include "intersect.hpp" #include "global.hpp" #include "ui.hpp" #include "tachyon_video.hpp" // shared but read-only so could be private too static thr_parms *all_parms; static scenedef scene; static int startx; static int stopx; static int starty; static int stopy; static flt jitterscale; static int totaly; static color_t render_one_pixel(int x, int y, unsigned int *local_mbox, unsigned int &serial, int startx, int stopx, int starty, int stopy) { /* private vars moved inside loop */ ray primary, sample; color col, avcol; int R, G, B; intersectstruct local_intersections; int alias; /* end private */ primary = camray(&scene, x, y); primary.intstruct = &local_intersections; primary.flags = RT_RAY_REGULAR; serial++; primary.serial = serial; primary.mbox = local_mbox; primary.maxdist = FHUGE; primary.scene = &scene; col = trace(&primary); serial = primary.serial; /* perform antialiasing if enabled.. */ if (scene.antialiasing > 0) { for (alias = 0; alias < scene.antialiasing; alias++) { serial++; /* increment serial number */ sample = primary; /* copy the regular primary ray to start with */ sample.serial = serial; { sample.d.x += ((std::rand() % 100) - 50) / jitterscale; sample.d.y += ((std::rand() % 100) - 50) / jitterscale; sample.d.z += ((std::rand() % 100) - 50) / jitterscale; } avcol = trace(&sample); serial = sample.serial; /* update our overall serial # */ col.r += avcol.r; col.g += avcol.g; col.b += avcol.b; } col.r /= (scene.antialiasing + 1.0); col.g /= (scene.antialiasing + 1.0); col.b /= (scene.antialiasing + 1.0); } /* Handle overexposure and underexposure here... 
*/ R = (int)(col.r * 255); if (R > 255) R = 255; else if (R < 0) R = 0; G = (int)(col.g * 255); if (G > 255) G = 255; else if (G < 0) G = 0; B = (int)(col.b * 255); if (B > 255) B = 255; else if (B < 0) B = 0; return video->get_color(R, G, B); } static void parallel_thread(void) { // thread-local storage unsigned int serial = 1; unsigned int mboxsize = sizeof(unsigned int) * (max_objectid() + 20); unsigned int *local_mbox = (unsigned int *)alloca(mboxsize); memset(local_mbox, 0, mboxsize); for (int y = starty; y < stopy; y++) { { drawing_area drawing(startx, totaly - y, stopx - startx, 1); for (int x = startx; x < stopx; x++) { color_t c = render_one_pixel(x, y, local_mbox, serial, startx, stopx, starty, stopy); drawing.put_pixel(c); } } if (!video->next_frame()) return; } } void *thread_trace(thr_parms *parms) { // shared but read-only so could be private too all_parms = parms; scene = parms->scene; startx = parms->startx; stopx = parms->stopx; starty = parms->starty; stopy = parms->stopy; jitterscale = 40.0 * (scene.hres + scene.vres); totaly = parms->scene.vres - 1; parallel_thread(); return (nullptr); } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/trace.simple.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "vector.hpp" #include "tgafile.hpp" #include "trace.hpp" #include "light.hpp" #include "shade.hpp" #include "camera.hpp" #include "util.hpp" #include "intersect.hpp" #include "global.hpp" #include "ui.hpp" #include "tachyon_video.hpp" // shared but read-only so could be private too static thr_parms *all_parms; static scenedef scene; static int startx; static int stopx; static int starty; static int stopy; static flt jitterscale; static int totaly; static color_t render_one_pixel(int x, int y, unsigned int *local_mbox, unsigned int &serial, int startx, int stopx, int starty, int stopy) { /* private vars moved inside loop */ ray primary; color col; int R, G, B; intersectstruct local_intersections; /* end private */ primary = camray(&scene, x, y); primary.intstruct = &local_intersections; primary.flags = RT_RAY_REGULAR; serial++; primary.serial = serial; primary.mbox = local_mbox; primary.maxdist = FHUGE; primary.scene = &scene; col = trace(&primary); serial = primary.serial; /* Handle overexposure and underexposure here... 
*/ R = (int)(col.r * 255); if (R > 255) R = 255; else if (R < 0) R = 0; G = (int)(col.g * 255); if (G > 255) G = 255; else if (G < 0) G = 0; B = (int)(col.b * 255); if (B > 255) B = 255; else if (B < 0) B = 0; return video->get_color(R, G, B); } #if DO_ITT_NOTIFY #include "ittnotify.h" #endif #define RUNTIME_SERIAL 1 #define RUNTIME_OPENMP 2 #define RUNTIME_TBB 3 #ifndef RUNTIME #define RUNTIME RUNTIME_TBB #endif #if RUNTIME == RUNTIME_OPENMP #include #elif RUNTIME == RUNTIME_TBB #include "oneapi/tbb.h" #endif static void parallel_thread(void) { unsigned int mboxsize = sizeof(unsigned int) * (max_objectid() + 20); #if RUNTIME == RUNTIME_SERIAL for (int y = starty; y < stopy; y++) #elif RUNTIME == RUNTIME_OPENMP #pragma omp parallel for for (int y = starty; y < stopy; y++) #elif RUNTIME == RUNTIME_TBB oneapi::tbb::parallel_for(starty, stopy, [mboxsize] (int y) #endif { unsigned int serial = 1; unsigned int local_mbox[mboxsize]; memset(local_mbox, 0, mboxsize); drawing_area drawing(startx, totaly - y, stopx - startx, 1); for (int x = startx; x < stopx; x++) { color_t c = render_one_pixel(x, y, local_mbox, serial, startx, stopx, starty, stopy); drawing.put_pixel(c); } video->next_frame(); } #if RUNTIME == RUNTIME_TBB ); #endif } void *thread_trace(thr_parms *parms) { // shared but read-only so could be private too all_parms = parms; scene = parms->scene; startx = parms->startx; stopx = parms->stopx; starty = parms->starty; stopy = parms->stopy; jitterscale = 40.0 * (scene.hres + scene.vres); totaly = parms->scene.vres - 1; #if DO_ITT_NOTIFY __itt_resume(); #endif parallel_thread(); #if DO_ITT_NOTIFY __itt_pause(); #endif return (nullptr); } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/trace.taskq.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with 
the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include #include #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "vector.hpp" #include "tgafile.hpp" #include "trace.hpp" #include "light.hpp" #include "shade.hpp" #include "camera.hpp" #include "util.hpp" #include "intersect.hpp" #include "global.hpp" #include "ui.hpp" #include "tachyon_video.hpp" // shared but read-only so could be private too static thr_parms *all_parms; static scenedef scene; static int startx; static int stopx; static int starty; static int stopy; static flt jitterscale; static int totaly, totalx; static int grain_size = 50; const int DIVFACTOR = 2; #define MIN(a, b) ((a) < (b) ? (a) : (b)) static color_t render_one_pixel(int x, int y, unsigned int *local_mbox, unsigned int &serial, int startx, int stopx, int starty, int stopy) { /* private vars moved inside loop */ ray primary, sample; color col, avcol; int R, G, B; intersectstruct local_intersections; int alias; /* end private */ primary = camray(&scene, x, y); primary.intstruct = &local_intersections; primary.flags = RT_RAY_REGULAR; serial++; primary.serial = serial; primary.mbox = local_mbox; primary.maxdist = FHUGE; primary.scene = &scene; col = trace(&primary); serial = primary.serial; /* perform antialiasing if enabled.. */ if (scene.antialiasing > 0) { for (alias = 0; alias < scene.antialiasing; alias++) { serial++; /* increment serial number */ sample = primary; /* copy the regular primary ray to start with */ sample.serial = serial; #pragma omp critical { sample.d.x += ((rand() % 100) - 50) / jitterscale; sample.d.y += ((rand() % 100) - 50) / jitterscale; sample.d.z += ((rand() % 100) - 50) / jitterscale; } avcol = trace(&sample); serial = sample.serial; /* update our overall serial # */ col.r += avcol.r; col.g += avcol.g; col.b += avcol.b; } col.r /= (scene.antialiasing + 1.0); col.g /= (scene.antialiasing + 1.0); col.b /= (scene.antialiasing + 1.0); } /* Handle overexposure and underexposure here... 
/*
 * Recursively render the rectangular patch described by pchin.
 *
 * A patch whose width or height is at least grain_size is split into a grid
 * of sub-patches (step = extent / DIVFACTOR + 1) and each sub-patch is handed
 * to a recursive call issued under an Intel taskq "task" pragma. Smaller
 * patches are ray traced directly into a drawing_area.
 *
 *   pchin - patch bounds (startx/stopx/starty/stopy) to render
 *   depth - recursion depth; incremented here and passed on to sub-patches
 */
static void parallel_thread(patch *pchin, int depth) {
    // Depth-derived grey level. It is computed but not read again anywhere in
    // this function.
    unsigned char col[3];
    col[0] = col[1] = col[2] = (32 * depth) % 256;

    depth++;

#pragma intel omp taskq firstprivate(depth)
    {
        // These locals shadow the file-level statics of the same names.
        int startx, stopx, starty, stopy;
        int xs, ys;

        startx = pchin->startx;
        stopx = pchin->stopx;
        starty = pchin->starty;
        stopy = pchin->stopy;

        if (((stopx - startx) >= grain_size) || ((stopy - starty) >= grain_size)) {
            // Too large in at least one dimension: subdivide.
            int xpatchsize = (stopx - startx) / DIVFACTOR + 1;
            int ypatchsize = (stopy - starty) / DIVFACTOR + 1;
            // NOTE(review): both bounds are inclusive (<=), so a start that
            // lands exactly on the stop edge yields a zero-width or
            // zero-height sub-patch.
            for (ys = starty; ys <= stopy; ys += ypatchsize)
                for (xs = startx; xs <= stopx; xs += xpatchsize) {
                    patch pch;
                    pch.startx = xs;
                    pch.starty = ys;
                    // clip the far edges to the parent patch
                    pch.stopx = MIN(xs + xpatchsize, stopx);
                    pch.stopy = MIN(ys + ypatchsize, stopy);

#pragma intel omp task
                    parallel_thread(&pch, depth);
                }
        }
        else {
            /* just trace this patch */
            // zeroed mailbox buffer for the rays of this patch
            unsigned int mboxsize = sizeof(unsigned int) * (max_objectid() + 20);
            unsigned int *local_mbox = (unsigned int *)alloca(mboxsize);
            memset(local_mbox, 0, mboxsize);

            drawing_area drawing(startx, totaly - stopy, stopx - startx, stopy - starty);
            for (int i = 1, y = starty; y < stopy; ++y, i++) {
                // once the video layer stops running, skip the remaining rows
                if (!video->running)
                    continue;
                // row i counted from the bottom of the drawing area
                drawing.set_pos(0, drawing.size_y - i);
                // serial is reset to the same patch-derived value for every row
                unsigned int serial = 5 * ((stopx - startx) + (stopy - starty) * totalx);
                for (int x = startx; x < stopx; x++) {
                    color_t c =
                        render_one_pixel(x, y, local_mbox, serial, startx, stopx, starty, stopy);
                    drawing.put_pixel(c);
                }
            }
            video->next_frame();
        }
    }
}
parms->scene.vres; patch pch; pch.startx = startx; pch.stopx = stopx; pch.starty = starty; pch.stopy = stopy; int g; char *grain_str = getenv("TASKQ_GRAINSIZE"); if (grain_str && (sscanf(grain_str, "%d", &g) > 0) && (g > 0)) grain_size = g; #pragma omp parallel parallel_thread(&pch, 0); return (nullptr); } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/trace.tbb.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "vector.hpp" #include "tgafile.hpp" #include "trace.hpp" #include "light.hpp" #include "shade.hpp" #include "camera.hpp" #include "util.hpp" #include "intersect.hpp" #include "global.hpp" #include "ui.hpp" #include "tachyon_video.hpp" // shared but read-only so could be private too static thr_parms *all_parms; static scenedef scene; static int startx; static int stopx; static int starty; static int stopy; static flt jitterscale; static int totaly; #ifdef MARK_RENDERING_AREA // rgb colors list for coloring image by each thread static const float inner_alpha = 0.3; static const float border_alpha = 0.5; #define NUM_COLORS 24 static int colors[NUM_COLORS][3] = { { 255, 110, 0 }, { 220, 254, 0 }, { 102, 254, 0 }, { 0, 21, 254 }, { 97, 0, 254 }, { 254, 30, 0 }, { 20, 41, 8 }, { 144, 238, 38 }, { 184, 214, 139 }, { 28, 95, 20 }, { 139, 173, 148 }, { 188, 228, 183 }, { 145, 47, 56 }, { 204, 147, 193 }, { 45, 202, 143 }, { 204, 171, 143 }, { 143, 160, 204 }, { 220, 173, 3 }, { 1, 152, 231 }, { 79, 235, 237 }, { 52, 193, 72 }, { 67, 136, 151 }, { 78, 87, 179 }, { 143, 255, 9 }, }; #include #include "oneapi/tbb/enumerable_thread_specific.h" // storage and counter for thread numbers in order of first task run typedef oneapi::tbb::enumerable_thread_specific thread_id_t; thread_id_t thread_ids(-1); std::atomic thread_number; #endif #include "oneapi/tbb/parallel_for.h" #include "oneapi/tbb/spin_mutex.h" #include 
"oneapi/tbb/blocked_range2d.h" #include "oneapi/tbb/global_control.h" #include "common/utility/get_default_num_threads.hpp" static oneapi::tbb::spin_mutex MyMutex, MyMutex2; static color_t render_one_pixel(int x, int y, unsigned int *local_mbox, unsigned int &serial, int startx, int stopx, int starty, int stopy #ifdef MARK_RENDERING_AREA , int *blend, float alpha #endif ) { /* private vars moved inside loop */ ray primary, sample; color col, avcol; int R, G, B; intersectstruct local_intersections; int alias; /* end private */ primary = camray(&scene, x, y); primary.intstruct = &local_intersections; primary.flags = RT_RAY_REGULAR; serial++; primary.serial = serial; primary.mbox = local_mbox; primary.maxdist = FHUGE; primary.scene = &scene; col = trace(&primary); serial = primary.serial; /* perform antialiasing if enabled.. */ if (scene.antialiasing > 0) { for (alias = 0; alias < scene.antialiasing; alias++) { serial++; /* increment serial number */ sample = primary; /* copy the regular primary ray to start with */ sample.serial = serial; { oneapi::tbb::spin_mutex::scoped_lock lock(MyMutex); sample.d.x += ((rand() % 100) - 50) / jitterscale; sample.d.y += ((rand() % 100) - 50) / jitterscale; sample.d.z += ((rand() % 100) - 50) / jitterscale; } avcol = trace(&sample); serial = sample.serial; /* update our overall serial # */ col.r += avcol.r; col.g += avcol.g; col.b += avcol.b; } col.r /= (scene.antialiasing + 1.0); col.g /= (scene.antialiasing + 1.0); col.b /= (scene.antialiasing + 1.0); } /* Handle overexposure and underexposure here... 
*/ R = (int)(col.r * 255); if (R > 255) R = 255; else if (R < 0) R = 0; G = (int)(col.g * 255); if (G > 255) G = 255; else if (G < 0) G = 0; B = (int)(col.b * 255); if (B > 255) B = 255; else if (B < 0) B = 0; #ifdef MARK_RENDERING_AREA R = int((1.0 - alpha) * R + alpha * blend[0]); G = int((1.0 - alpha) * G + alpha * blend[1]); B = int((1.0 - alpha) * B + alpha * blend[2]); #endif return video->get_color(R, G, B); } class parallel_task { public: void operator()(const oneapi::tbb::blocked_range2d &r) const { // task-local storage unsigned int serial = 1; unsigned int mboxsize = sizeof(unsigned int) * (max_objectid() + 20); unsigned int *local_mbox = (unsigned int *)alloca(mboxsize); memset(local_mbox, 0, mboxsize); #ifdef MARK_RENDERING_AREA // compute thread number while first task run thread_id_t::reference thread_id = thread_ids.local(); if (thread_id == -1) thread_id = thread_number++; // choose thread color int pos = thread_id % NUM_COLORS; if (video->running) { drawing_area drawing(r.cols().begin(), totaly - r.rows().end(), r.cols().end() - r.cols().begin(), r.rows().end() - r.rows().begin()); for (int i = 1, y = r.rows().begin(); y != r.rows().end(); ++y, i++) { drawing.set_pos(0, drawing.size_y - i); for (int x = r.cols().begin(); x != r.cols().end(); x++) { int d = (y % 3 == 0) ? 2 : 1; drawing.put_pixel(video->get_color( colors[pos][0] / d, colors[pos][1] / d, colors[pos][2] / d)); } } } #endif if (video->next_frame()) { drawing_area drawing(r.cols().begin(), totaly - r.rows().end(), r.cols().end() - r.cols().begin(), r.rows().end() - r.rows().begin()); for (int i = 1, y = r.rows().begin(); y != r.rows().end(); ++y, i++) { drawing.set_pos(0, drawing.size_y - i); for (int x = r.cols().begin(); x != r.cols().end(); x++) { #ifdef MARK_RENDERING_AREA float alpha = y == r.rows().begin() || y == r.rows().end() - 1 || x == r.cols().begin() || x == r.cols().end() - 1 ? 
border_alpha : inner_alpha; color_t c = render_one_pixel( x, y, local_mbox, serial, startx, stopx, starty, stopy, colors[pos], alpha); #else color_t c = render_one_pixel(x, y, local_mbox, serial, startx, stopx, starty, stopy); #endif drawing.put_pixel(c); } } } } parallel_task() {} }; void *thread_trace(thr_parms *parms) { #if !WIN8UI_EXAMPLE int n, nthreads = utility::get_default_num_threads(); char *nthreads_str = getenv("TBB_NUM_THREADS"); if (nthreads_str && (sscanf(nthreads_str, "%d", &n) > 0) && (n > 0)) nthreads = n; oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, nthreads); #endif // shared but read-only so could be private too all_parms = parms; scene = parms->scene; startx = parms->startx; stopx = parms->stopx; starty = parms->starty; stopy = parms->stopy; jitterscale = 40.0 * (scene.hres + scene.vres); totaly = parms->scene.vres; #ifdef MARK_RENDERING_AREA thread_ids.clear(); #endif int grain_size = 8; //WIN8UI does not support getenv() function so using auto_partitioner unconditionally #if !WIN8UI_EXAMPLE int g; char *grain_str = getenv("TBB_GRAINSIZE"); if (grain_str && (sscanf(grain_str, "%d", &g) > 0) && (g > 0)) grain_size = g; char *sched_str = getenv("TBB_PARTITIONER"); static oneapi::tbb::affinity_partitioner g_ap; // reused across calls to thread_trace if (sched_str && !strncmp(sched_str, "aff", 3)) oneapi::tbb::parallel_for( oneapi::tbb::blocked_range2d(starty, stopy, grain_size, startx, stopx, grain_size), parallel_task(), g_ap); else if (sched_str && !strncmp(sched_str, "simp", 4)) oneapi::tbb::parallel_for( oneapi::tbb::blocked_range2d(starty, stopy, grain_size, startx, stopx, grain_size), parallel_task(), oneapi::tbb::simple_partitioner()); else #endif oneapi::tbb::parallel_for( oneapi::tbb::blocked_range2d(starty, stopy, grain_size, startx, stopx, grain_size), parallel_task(), oneapi::tbb::auto_partitioner()); return (nullptr); } ================================================ FILE: 
third-party/tbb/examples/parallel_for/tachyon/src/trace.tbb1d.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "vector.hpp" #include "tgafile.hpp" #include "trace.hpp" #include "light.hpp" #include "shade.hpp" #include "camera.hpp" #include "util.hpp" #include "intersect.hpp" #include "global.hpp" #include "ui.hpp" #include "tachyon_video.hpp" // shared but read-only so could be private too static thr_parms *all_parms; static scenedef scene; static int startx; static int stopx; static int starty; static int stopy; static flt jitterscale; static int totaly; #include "oneapi/tbb/parallel_for.h" #include "oneapi/tbb/spin_mutex.h" #include "oneapi/tbb/blocked_range.h" #include "oneapi/tbb/global_control.h" #include "common/utility/get_default_num_threads.hpp" static oneapi::tbb::spin_mutex MyMutex, MyMutex2; static color_t render_one_pixel(int x, int y, unsigned int *local_mbox, unsigned int &serial, int startx, int stopx, int starty, int stopy) { /* private vars moved inside loop */ ray primary, sample; color col, avcol; int R, G, B; intersectstruct local_intersections; int alias; /* end private */ primary = camray(&scene, x, y); primary.intstruct = &local_intersections; primary.flags = RT_RAY_REGULAR; serial++; primary.serial = serial; primary.mbox = local_mbox; primary.maxdist = FHUGE; primary.scene = &scene; col = trace(&primary); serial = primary.serial; /* perform antialiasing if enabled.. 
*/ if (scene.antialiasing > 0) { for (alias = 0; alias < scene.antialiasing; alias++) { serial++; /* increment serial number */ sample = primary; /* copy the regular primary ray to start with */ sample.serial = serial; { oneapi::tbb::spin_mutex::scoped_lock lock(MyMutex); sample.d.x += ((rand() % 100) - 50) / jitterscale; sample.d.y += ((rand() % 100) - 50) / jitterscale; sample.d.z += ((rand() % 100) - 50) / jitterscale; } avcol = trace(&sample); serial = sample.serial; /* update our overall serial # */ col.r += avcol.r; col.g += avcol.g; col.b += avcol.b; } col.r /= (scene.antialiasing + 1.0); col.g /= (scene.antialiasing + 1.0); col.b /= (scene.antialiasing + 1.0); } /* Handle overexposure and underexposure here... */ R = (int)(col.r * 255); if (R > 255) R = 255; else if (R < 0) R = 0; G = (int)(col.g * 255); if (G > 255) G = 255; else if (G < 0) G = 0; B = (int)(col.b * 255); if (B > 255) B = 255; else if (B < 0) B = 0; return video->get_color(R, G, B); } class parallel_task { public: void operator()(const oneapi::tbb::blocked_range &r) const { // task-local storage unsigned int serial = 1; unsigned int mboxsize = sizeof(unsigned int) * (max_objectid() + 20); unsigned int *local_mbox = (unsigned int *)alloca(mboxsize); memset(local_mbox, 0, mboxsize); for (int y = r.begin(); y != r.end(); ++y) { { drawing_area drawing(startx, totaly - y, stopx - startx, 1); for (int x = startx; x < stopx; x++) { color_t c = render_one_pixel(x, y, local_mbox, serial, startx, stopx, starty, stopy); drawing.put_pixel(c); } } if (!video->next_frame()) return; } } parallel_task() {} }; void *thread_trace(thr_parms *parms) { int n, nthreads = utility::get_default_num_threads(); char *nthreads_str = getenv("TBB_NUM_THREADS"); if (nthreads_str && (sscanf(nthreads_str, "%d", &n) > 0) && (n > 0)) nthreads = n; oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, nthreads); // shared but read-only so could be private too all_parms = parms; scene = 
parms->scene; startx = parms->startx; stopx = parms->stopx; starty = parms->starty; stopy = parms->stopy; jitterscale = 40.0 * (scene.hres + scene.vres); totaly = parms->scene.vres - 1; int g, grain_size = 1; char *grain_str = getenv("TBB_GRAINSIZE"); if (grain_str && (sscanf(grain_str, "%d", &g) > 0) && (g > 0)) grain_size = g; char *sched_str = getenv("TBB_PARTITIONER"); static oneapi::tbb::affinity_partitioner g_ap; if (sched_str && !strncmp(sched_str, "aff", 3)) oneapi::tbb::parallel_for( oneapi::tbb::blocked_range(starty, stopy, grain_size), parallel_task(), g_ap); else if (sched_str && !strncmp(sched_str, "simp", 4)) oneapi::tbb::parallel_for(oneapi::tbb::blocked_range(starty, stopy, grain_size), parallel_task(), oneapi::tbb::simple_partitioner()); else oneapi::tbb::parallel_for(oneapi::tbb::blocked_range(starty, stopy, grain_size), parallel_task(), oneapi::tbb::auto_partitioner()); return (nullptr); } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/trace.threads.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "vector.hpp" #include "tgafile.hpp" #include "trace.hpp" #include "light.hpp" #include "shade.hpp" #include "camera.hpp" #include "util.hpp" #include "intersect.hpp" #include "global.hpp" #include "ui.hpp" #include "tachyon_video.hpp" // shared but read-only so could be private too static thr_parms *all_parms; static scenedef scene; static int startx; static int stopx; static int starty; static int stopy; static flt jitterscale; static int totaly; static int nthreads; static int grain_size = 50; #ifdef _WIN32 #include #include "pthread_w.hpp" #else #include #endif static pthread_mutex_t MyMutex, MyMutex2, MyMutex3; static color_t render_one_pixel(int x, int y, unsigned int *local_mbox, unsigned int &serial, int startx, int stopx, int starty, int stopy) { /* private vars moved inside loop */ ray primary, sample; color col, avcol; int R, G, B; intersectstruct local_intersections; int alias; /* end private */ primary = camray(&scene, x, y); primary.intstruct = &local_intersections; primary.flags = RT_RAY_REGULAR; serial++; primary.serial = serial; primary.mbox = local_mbox; primary.maxdist = FHUGE; primary.scene = &scene; col = trace(&primary); serial = primary.serial; /* perform antialiasing if enabled.. 
*/ if (scene.antialiasing > 0) { for (alias = 0; alias < scene.antialiasing; alias++) { serial++; /* increment serial number */ sample = primary; /* copy the regular primary ray to start with */ sample.serial = serial; { pthread_mutex_lock(&MyMutex); sample.d.x += ((rand() % 100) - 50) / jitterscale; sample.d.y += ((rand() % 100) - 50) / jitterscale; sample.d.z += ((rand() % 100) - 50) / jitterscale; pthread_mutex_unlock(&MyMutex); } avcol = trace(&sample); serial = sample.serial; /* update our overall serial # */ col.r += avcol.r; col.g += avcol.g; col.b += avcol.b; } col.r /= (scene.antialiasing + 1.0); col.g /= (scene.antialiasing + 1.0); col.b /= (scene.antialiasing + 1.0); } /* Handle overexposure and underexposure here... */ R = (int)(col.r * 255); if (R > 255) R = 255; else if (R < 0) R = 0; G = (int)(col.g * 255); if (G > 255) G = 255; else if (G < 0) G = 0; B = (int)(col.b * 255); if (B > 255) B = 255; else if (B < 0) B = 0; return video->get_color(R, G, B); } // need this so threads can self-schedule work; returns true (and bounds of work) if more work to do #define MIN(a, b) (((a) < (b)) ? 
(a) : (b)) static int sched_nexty; static bool schedule_thread_work(int &y1, int &y2) { pthread_mutex_lock(&MyMutex3); #ifdef STATIC_EVEN_SCHEDULING // optional static-even scheduling y1 = sched_nexty; sched_nexty += ((stopy - starty + 1) / nthreads); y2 = MIN(sched_nexty, stopy); #else // dynamic-chunk scheduling with specified grain_size y1 = sched_nexty; sched_nexty += grain_size; y2 = MIN(sched_nexty, stopy); #endif pthread_mutex_unlock(&MyMutex3); return (y1 <= stopy); } static void parallel_thread(void *arg) { // thread-local storage unsigned int serial = 1; unsigned int mboxsize = sizeof(unsigned int) * (max_objectid() + 20); unsigned int *local_mbox = (unsigned int *)alloca(mboxsize); memset(local_mbox, 0, mboxsize); // int thread_no = (int) arg; int y1, y2; while (schedule_thread_work(y1, y2)) { for (int y = y1; y < y2; y++) { { drawing_area drawing(startx, totaly - y, stopx - startx, 1); for (int x = startx; x < stopx; x++) { color_t c = render_one_pixel(x, y, local_mbox, serial, startx, stopx, starty, stopy); drawing.put_pixel(c); } } if (!video->next_frame()) pthread_exit(arg); } } pthread_exit(arg); } // need this (for each platform) so we can create the right number of threads, to work efficiently #if defined(_WIN32) static int get_num_cpus(void) { SYSTEM_INFO si; GetNativeSystemInfo(&si); return (int)si.dwNumberOfProcessors; } #elif defined(__APPLE__) #include "sys/types.hpp" #include "sys/sysctl.hpp" static int get_num_cpus(void) { int name[2] = { CTL_HW, HW_NCPU }; int ncpu; std::size_t size = sizeof(ncpu); sysctl(name, 2, &ncpu, &size, nullptr, 0); return ncpu; } #else /* Linux */ #include static int get_num_cpus(void) { return get_nprocs(); } #endif void *thread_trace(thr_parms *parms) { // shared but read-only so could be private too all_parms = parms; scene = parms->scene; startx = parms->startx; stopx = parms->stopx; starty = parms->starty; stopy = parms->stopy; jitterscale = 40.0 * (scene.hres + scene.vres); totaly = parms->scene.vres - 1; 
int n; nthreads = get_num_cpus(); char *nthreads_str = getenv("THR_NUM_THREADS"); if (nthreads_str && (sscanf(nthreads_str, "%d", &n) > 0) && (n > 0)) nthreads = n; char *grain_str = getenv("THR_GRAINSIZE"); if (grain_str && (sscanf(grain_str, "%d", &n) > 0) && (n > 0)) grain_size = n; pthread_t *threads = (pthread_t *)alloca(nthreads * sizeof(pthread_t)); pthread_mutex_init(&MyMutex, nullptr); pthread_mutex_init(&MyMutex2, nullptr); pthread_mutex_init(&MyMutex3, nullptr); sched_nexty = starty; // initialize schedule_thread_work() self-scheduler for (int i = 0; i < nthreads; i++) { pthread_create( &threads[i], nullptr, (void *(*)(void *))parallel_thread, (void *)((std::size_t)i)); } for (int i = 0; i < nthreads; i++) { void *exit_val; pthread_join(threads[i], &exit_val); // expect i = (int) exit_val } return (nullptr); } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/trace.threads2d.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "vector.hpp" #include "tgafile.hpp" #include "trace.hpp" #include "light.hpp" #include "shade.hpp" #include "camera.hpp" #include "util.hpp" #include "intersect.hpp" #include "global.hpp" #include "ui.hpp" #include "tachyon_video.hpp" // shared but read-only so could be private too static thr_parms *all_parms; static scenedef scene; static int startx; static int stopx; static int starty; static int stopy; static flt jitterscale; static int totaly; static int nthreads; static int grain_size = 50; const int DIVFACTOR = 2; #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) #ifdef _WIN32 #include #include "pthread_w.hpp" #else #include #endif static pthread_mutex_t MyMutex, MyMutex2, MyMutex3; static color_t render_one_pixel(int x, int y, unsigned int *local_mbox, unsigned int &serial, int startx, int stopx, int starty, int stopy) { /* private vars moved inside loop */ ray primary, sample; color col, avcol; int R, G, B; intersectstruct local_intersections; int alias; /* end private */ primary = camray(&scene, x, y); primary.intstruct = &local_intersections; primary.flags = RT_RAY_REGULAR; serial++; primary.serial = serial; primary.mbox = local_mbox; primary.maxdist = FHUGE; primary.scene = &scene; col = trace(&primary); serial = primary.serial; /* perform antialiasing if enabled.. */ if (scene.antialiasing > 0) { for (alias = 0; alias < scene.antialiasing; alias++) { serial++; /* increment serial number */ sample = primary; /* copy the regular primary ray to start with */ sample.serial = serial; { pthread_mutex_lock(&MyMutex); sample.d.x += ((rand() % 100) - 50) / jitterscale; sample.d.y += ((rand() % 100) - 50) / jitterscale; sample.d.z += ((rand() % 100) - 50) / jitterscale; pthread_mutex_unlock(&MyMutex); } avcol = trace(&sample); serial = sample.serial; /* update our overall serial # */ col.r += avcol.r; col.g += avcol.g; col.b += avcol.b; } col.r /= (scene.antialiasing + 1.0); col.g /= (scene.antialiasing + 1.0); col.b /= (scene.antialiasing + 1.0); } /* Handle overexposure and underexposure here... 
*/ R = (int)(col.r * 255); if (R > 255) R = 255; else if (R < 0) R = 0; G = (int)(col.g * 255); if (G > 255) G = 255; else if (G < 0) G = 0; B = (int)(col.b * 255); if (B > 255) B = 255; else if (B < 0) B = 0; return video->get_color(R, G, B); } // need this so threads can self-schedule work; returns true (and bounds of work) if more work to do typedef struct work_queue_entry_s { patch pch; struct work_queue_entry_s *next; } work_queue_entry_t; static work_queue_entry_t *work_queue_head = nullptr; static work_queue_entry_t *work_queue_tail = nullptr; static void generate_work(patch *pchin) { int startx, stopx, starty, stopy; int xs, ys; startx = pchin->startx; stopx = pchin->stopx; starty = pchin->starty; stopy = pchin->stopy; if (((stopx - startx) >= grain_size) || ((stopy - starty) >= grain_size)) { int xpatchsize = (stopx - startx) / DIVFACTOR + 1; int ypatchsize = (stopy - starty) / DIVFACTOR + 1; for (ys = starty; ys <= stopy; ys += ypatchsize) for (xs = startx; xs <= stopx; xs += xpatchsize) { patch pch; pch.startx = xs; pch.starty = ys; pch.stopx = MIN(xs + xpatchsize, stopx); pch.stopy = MIN(ys + ypatchsize, stopy); generate_work(&pch); } } else { /* just trace this patch */ work_queue_entry_t *q = (work_queue_entry_t *)malloc(sizeof(work_queue_entry_t)); q->pch.starty = starty; q->pch.stopy = stopy; q->pch.startx = startx; q->pch.stopx = stopx; q->next = nullptr; if (work_queue_head == nullptr) { work_queue_head = q; } else { work_queue_tail->next = q; } work_queue_tail = q; } } static void generate_worklist(void) { patch pch; pch.startx = startx; pch.stopx = stopx; pch.starty = starty; pch.stopy = stopy; generate_work(&pch); } static bool schedule_thread_work(patch &pch) { pthread_mutex_lock(&MyMutex3); work_queue_entry_t *q = work_queue_head; if (q != nullptr) { pch = q->pch; work_queue_head = work_queue_head->next; } pthread_mutex_unlock(&MyMutex3); return (q != nullptr); } static void parallel_thread(void *arg) { // thread-local storage unsigned int 
serial = 1; unsigned int mboxsize = sizeof(unsigned int) * (max_objectid() + 20); unsigned int *local_mbox = (unsigned int *)alloca(mboxsize); memset(local_mbox, 0, mboxsize); // int thread_no = (int) arg; patch pch; while (schedule_thread_work(pch)) { { drawing_area drawing( pch.startx, totaly - pch.stopy, pch.stopx - pch.startx, pch.stopy - pch.starty); for (int i = 1, y = pch.starty; y < pch.stopy; ++y, i++) { drawing.set_pos(0, drawing.size_y - i); for (int x = pch.startx; x < pch.stopx; x++) { color_t c = render_one_pixel(x, y, local_mbox, serial, startx, stopx, starty, stopy); drawing.put_pixel(c); } } } if (!video->next_frame()) pthread_exit(arg); } pthread_exit(arg); } // need this (for each platform) so we can create the right number of threads, to work efficiently #if defined(_WIN32) static int get_num_cpus(void) { SYSTEM_INFO si; GetNativeSystemInfo(&si); return (int)si.dwNumberOfProcessors; } #elif defined(__APPLE__) #include "sys/types.hpp" #include "sys/sysctl.hpp" static int get_num_cpus(void) { int name[2] = { CTL_HW, HW_NCPU }; int ncpu; std::size_t size = sizeof(ncpu); sysctl(name, 2, &ncpu, &size, nullptr, 0); return ncpu; } #else /* Linux */ #include static int get_num_cpus(void) { return get_nprocs(); } #endif void *thread_trace(thr_parms *parms) { // shared but read-only so could be private too all_parms = parms; scene = parms->scene; startx = parms->startx; stopx = parms->stopx; starty = parms->starty; stopy = parms->stopy; jitterscale = 40.0 * (scene.hres + scene.vres); totaly = parms->scene.vres; int n; nthreads = get_num_cpus(); char *nthreads_str = getenv("THR_NUM_THREADS"); if (nthreads_str && (sscanf(nthreads_str, "%d", &n) > 0) && (n > 0)) nthreads = n; char *grain_str = getenv("THR_GRAINSIZE"); if (grain_str && (sscanf(grain_str, "%d", &n) > 0) && (n > 0)) grain_size = n; pthread_t *threads = (pthread_t *)alloca(nthreads * sizeof(pthread_t)); pthread_mutex_init(&MyMutex, nullptr); pthread_mutex_init(&MyMutex2, nullptr); 
pthread_mutex_init(&MyMutex3, nullptr); generate_worklist(); // initialize schedule_thread_work() self-scheduler for (int i = 0; i < nthreads; i++) { pthread_create( &threads[i], nullptr, (void *(*)(void *))parallel_thread, (void *)((std::size_t)i)); } for (int i = 0; i < nthreads; i++) { void *exit_val; pthread_join(threads[i], &exit_val); // expect i = (int) exit_val } return (nullptr); } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/trace_rest.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * trace.cpp - This file contains the functions for firing primary rays * and handling subsequent calculations */ #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "vector.hpp" #include "tgafile.hpp" #include "trace.hpp" #include "light.hpp" #include "shade.hpp" #include "camera.hpp" #include "util.hpp" #include "intersect.hpp" #include "global.hpp" #include "ui.hpp" #include "tachyon_video.hpp" color trace(ray *primary) { if (primary->depth > 0) { VNorm(&primary->d); reset_intersection(primary->intstruct); intersect_objects(primary); return shader(primary); } /* if ray is truncated, return the background as its color */ return primary->scene->background; } void *thread_io(void *parms) { thr_io_parms p; p = *((thr_io_parms *)parms); writetgaregion(p.tga, p.iwidth, p.iheight, p.startx, p.starty, p.stopx, p.stopy, p.buffer); free(p.buffer); /* free the buffer once we are done with it.. 
*/ free(parms); return (nullptr); } void trace_shm(scenedef scene, /*char * buffer, */ int startx, int stopx, int starty, int stopy) { thr_parms *parms; parms = (thr_parms *)rt_getmem(sizeof(thr_parms)); parms->tid = 0; parms->nthr = 1; parms->scene = scene; parms->startx = startx; parms->stopx = stopx; parms->starty = starty; parms->stopy = stopy; thread_trace(parms); rt_freemem(parms); } void trace_region(scenedef scene, void *tga, int startx, int starty, int stopx, int stopy) { if (scene.verbosemode) { char msgtxt[2048]; sprintf(msgtxt, "Node %3d tracing region %4d, %4d ---> %4d, %4d \n", 0, startx, starty, stopx, stopy); rt_ui_message(MSG_0, msgtxt); } trace_shm(scene, /*buffer,*/ startx, stopx, starty, stopy); /* not used now writetgaregion(tga, scene.hres, scene.vres, startx, starty, stopx, stopy, global_buffer); if (scene.rawimage != nullptr) { int x, y; int totalx = stopx - startx + 1; for (y=starty; y<=stopy; y++) { for (x=0; x= EPSILON) && (VLength(&edge2) >= EPSILON) && (VLength(&edge3) >= EPSILON)) { t = (tri *)rt_getmem(sizeof(tri)); t->nextobj = nullptr; t->methods = &tri_methods; t->tex = (texture *)tex; t->v0 = v0; t->edge1 = edge1; t->edge2 = edge2; return (object *)t; } return nullptr; /* was a degenerate triangle */ } object *newstri(void *tex, vector v0, vector v1, vector v2, vector n0, vector n1, vector n2) { stri *t; vector edge1, edge2, edge3; VSub(&v1, &v0, &edge1); VSub(&v2, &v0, &edge2); VSub(&v2, &v1, &edge3); /* check to see if this will be a degenerate triangle before creation */ if ((VLength(&edge1) >= EPSILON) && (VLength(&edge2) >= EPSILON) && (VLength(&edge3) >= EPSILON)) { t = (stri *)rt_getmem(sizeof(stri)); t->nextobj = nullptr; t->methods = &stri_methods; t->tex = (texture *)tex; t->v0 = v0; t->edge1 = edge1; t->edge2 = edge2; t->n0 = n0; t->n1 = n1; t->n2 = n2; return (object *)t; } return nullptr; /* was a degenerate triangle */ } #define CROSS(dest, v1, v2) \ dest.x = v1.y * v2.z - v1.z * v2.y; \ dest.y = v1.z * v2.x - v1.x * 
v2.z; \ dest.z = v1.x * v2.y - v1.y * v2.x; #define DOT(v1, v2) (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z) #define SUB(dest, v1, v2) \ dest.x = v1.x - v2.x; \ dest.y = v1.y - v2.y; \ dest.z = v1.z - v2.z; static int tri_bbox(void *obj, vector *min, vector *max) { tri *t = (tri *)obj; vector v1, v2; VAdd(&t->v0, &t->edge1, &v1); VAdd(&t->v0, &t->edge2, &v2); min->x = MYMIN(t->v0.x, MYMIN(v1.x, v2.x)); min->y = MYMIN(t->v0.y, MYMIN(v1.y, v2.y)); min->z = MYMIN(t->v0.z, MYMIN(v1.z, v2.z)); max->x = MYMAX(t->v0.x, MYMAX(v1.x, v2.x)); max->y = MYMAX(t->v0.y, MYMAX(v1.y, v2.y)); max->z = MYMAX(t->v0.z, MYMAX(v1.z, v2.z)); return 1; } static void tri_intersect(tri *trn, ray *ry) { vector tvec, pvec, qvec; flt det, inv_det, t, u, v; /* begin calculating determinant - also used to calculate U parameter */ CROSS(pvec, ry->d, trn->edge2); /* if determinant is near zero, ray lies in plane of triangle */ det = DOT(trn->edge1, pvec); if (det > -EPSILON && det < EPSILON) return; inv_det = 1.0 / det; /* calculate distance from vert0 to ray origin */ SUB(tvec, ry->o, trn->v0); /* calculate U parameter and test bounds */ u = DOT(tvec, pvec) * inv_det; if (u < 0.0 || u > 1.0) return; /* prepare to test V parameter */ CROSS(qvec, tvec, trn->edge1); /* calculate V parameter and test bounds */ v = DOT(ry->d, qvec) * inv_det; if (v < 0.0 || u + v > 1.0) return; /* calculate t, ray intersects triangle */ t = DOT(trn->edge2, qvec) * inv_det; add_intersection(t, (object *)trn, ry); } static void tri_normal(tri *trn, vector *pnt, ray *incident, vector *N) { CROSS((*N), trn->edge1, trn->edge2); VNorm(N); if (VDot(N, &(incident->d)) > 0.0) { N->x = -N->x; N->y = -N->y; N->z = -N->z; } } static void stri_normal(stri *trn, vector *pnt, ray *incident, vector *N) { flt U, V, W, lensqr; vector P, tmp, norm; CROSS(norm, trn->edge1, trn->edge2); lensqr = DOT(norm, norm); VSUB((*pnt), trn->v0, P); CROSS(tmp, P, trn->edge2); U = DOT(tmp, norm) / lensqr; CROSS(tmp, trn->edge1, P); V = DOT(tmp, norm) / 
lensqr; W = 1.0 - (U + V); N->x = W * trn->n0.x + U * trn->n1.x + V * trn->n2.x; N->y = W * trn->n0.y + U * trn->n1.y + V * trn->n2.y; N->z = W * trn->n0.z + U * trn->n1.z + V * trn->n2.z; VNorm(N); } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/triangle.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * triangle.h - This file contains the defines for triangles etc. * * $Id: triangle.h,v 1.2 2007-02-22 17:54:16 Exp $ */ object *newtri(void *, vector, vector, vector); object *newstri(void *, vector, vector, vector, vector, vector, vector); #ifdef TRIANGLE_PRIVATE #define TRIXMAJOR 0 #define TRIYMAJOR 1 #define TRIZMAJOR 2 typedef struct { unsigned int id; /* Unique Object serial number */ void *nextobj; /* pointer to next object in list */ object_methods *methods; /* this object's methods */ texture *tex; /* object texture */ vector edge2; vector edge1; vector v0; } tri; typedef struct { unsigned int id; /* Unique Object serial number */ void *nextobj; /* pointer to next object in list */ object_methods *methods; /* this object's methods */ texture *tex; /* object texture */ vector edge2; vector edge1; vector v0; vector n0; vector n1; vector n2; } stri; static int tri_bbox(void *obj, vector *min, vector *max); static void tri_intersect(tri *, ray *); static void tri_normal(tri *, vector *, ray *, vector *); static void stri_normal(stri *, vector *, ray *, vector *); #endif ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/types.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #if __MINGW32__ #include #elif _WIN32 #include #define alloca _alloca #elif __FreeBSD__ || __NetBSD__ || __OpenBSD__ #include #else #include #endif /* * types.h - This file contains all of the type definitions for the raytracer * * $Id: types.h,v 1.2 2007-02-22 17:54:16 Exp $ */ #define MAXOCTNODES 25 /* subdivide octants /w > # of children */ #define SPEPSILON 0.000001 /* amount to crawl down a ray */ #define EPSILON 0.000001 /* amount to crawl down a ray */ #define TWOPI 6.2831853 /* guess */ #define FHUGE 1e18 /* biggest fp number we can represent */ /* Maximum internal table sizes */ /* Use prime numbers for best memory system performance */ #define INTTBSIZE 1024 /* maximum intersections we can hold */ #define MAXLIGHTS 39 /* maximum number of lights in a scene */ #define MAXIMGS 39 /* maximum number of distinct images */ #define RPCQSIZE 113 /* number of RPC messages to queue */ /* Parameter values for rt_boundmode() */ #define RT_BOUNDING_DISABLED 0 /* spatial subdivision/bounding disabled */ #define RT_BOUNDING_ENABLED 1 /* spatial subdivision/bounding enabled */ /* Parameter values for rt_displaymode() */ #define RT_DISPLAY_DISABLED 0 /* video output enabled */ #define RT_DISPLAY_ENABLED 1 /* video output disabled */ /* Ray flags */ #define RT_RAY_REGULAR 1 #define RT_RAY_SHADOW 2 #define RT_RAY_BOUNDED 4 #define RT_RAY_FINISHED 8 #ifdef USESINGLEFLT typedef float flt; /* generic floating point number, using float */ #else typedef double flt; /* generic floating point number, using double */ #endif typedef unsigned char byte_t; /* 1 byte */ typedef signed int word_t; /* 32 bit integer */ typedef struct { flt x; /* X coordinate value */ flt y; /* Y coordinate value */ flt z; /* Z coordinate value */ } vector; typedef struct { flt r; /* Red component */ flt g; /* Green component */ flt b; /* Blue component */ } color; typedef struct { byte_t r; /* Red component */ byte_t g; /* Green component */ byte_t b; /* Blue component */ } bytecolor; typedef struct { 
/* Raw 24 bit image structure, for tga, ppm etc */
    int loaded; /* image memory residence flag */
    int xres; /* image X axis size */
    int yres; /* image Y axis size */
    int bpp; /* image bits per pixel */
    char name[96]; /* image filename (with path) */
    unsigned char *data; /* pointer to raw byte image data */
} rawimage;

/* One byte-per-voxel volume data set plus the file it is loaded from */
typedef struct { /* Scalar Volume Data */
    int loaded; /* Volume data memory residence flag */
    int xres; /* volume X axis size */
    int yres; /* volume Y axis size */
    int zres; /* volume Z axis size */
    flt opacity; /* opacity per unit length */
    char name[96]; /* Volume data filename */
    unsigned char *data; /* pointer to raw byte volume data */
} scalarvol;

/* Surface description referenced by an object: a texture callback plus */
/* lighting coefficients and texture-mapping parameters                 */
typedef struct {
    color (*texfunc)(void *, void *, void *); /* texture function ptr, returns a color */
    int shadowcast; /* does the object cast a shadow */
    int islight; /* light flag... */
    color col; /* base object color */
    flt ambient; /* ambient lighting */
    flt diffuse; /* diffuse reflection */
    flt phong; /* phong specular highlights */
    flt phongexp; /* phong exponent/shininess factor */
    int phongtype; /* phong type: 0 == plastic, nonzero == metal */
    flt specular; /* specular reflection */
    flt opacity; /* how opaque the object is */
    vector ctr; /* origin of texture */
    vector rot; /* rotation of texture about origin */
    vector scale; /* scale of texture in x,y,z */
    vector uaxs; /* planar map U axis */
    vector vaxs; /* planar map V axis */
    void *img; /* pointer to image for image mapping */
    void *obj; /* object ptr, hack for volume shaders for now */
} texture;

/* Table of function pointers implementing one kind of object */
typedef struct {
    void (*intersect)(void *, void *); /* intersection func ptr */
    void (*normal)(void *, void *, void *, void *); /* normal function ptr */
    int (*bbox)(void *, vector *, vector *); /* return the object bbox */
    void (*free)(void *); /* free the object */
} object_methods;

/* Generic object record: serial number, list link, methods and texture */
typedef struct {
    unsigned int id; /* Unique Object serial number */
    void *nextobj; /* pointer to next object in list */
    object_methods *methods; /* this object's methods */
    texture *tex; /* object texture */
} object;

/* A single ray/object hit */
typedef struct {
    object *obj; /* pointer to the object we hit */
    flt t; /* distance along the ray to the hit point */
} intersection;

/* Per-ray collection of hits, with room for INTTBSIZE entries */
typedef struct {
    int num; /* number of intersections */
    intersection closest; /* closest intersection > 0.0 */
    intersection list[INTTBSIZE]; /* list of all intersections */
} intersectstruct;

/* Global scene parameters: output image, render options and camera */
typedef struct {
    char outfilename[200]; /* name of the output image */
    unsigned char *rawimage; /* pointer to a raw rgb image to be stored */
    int hres; /* horizontal output image resolution */
    int vres; /* vertical output image resolution */
    flt aspectratio; /* aspect ratio of output image */
    int raydepth; /* maximum recursion depth */
    int antialiasing; /* number of antialiasing rays to fire */
    int verbosemode; /* verbose reporting flag */
    int boundmode; /* automatic spatial subdivision flag */
    int boundthresh; /* threshold number of subobjects */
    int displaymode; /* run-time X11 display flag */
    vector camcent; /* center of the camera in world coords */
    vector camviewvec; /* view direction of the camera (Z axis) */
    vector camrightvec; /* right axis for the camera (X axis) */
    vector camupvec; /* up axis for the camera (Y axis) */
    flt camzoom; /* zoom factor for the camera */
    color background; /* scene background color */
} scenedef;

/* State carried by one ray while it is traced */
typedef struct {
    intersectstruct *intstruct; /* ptr to thread's intersection data */
    unsigned int depth; /* levels left to recurse.. (maxdepth - curdepth) */
    unsigned int flags; /* ray flags, any special treatment needed etc */
    unsigned int serial; /* serial number of the ray */
    unsigned int *mbox; /* mailbox array for optimizing intersections */
    vector o; /* origin of the ray X,Y,Z */
    vector d; /* normalized direction of the ray */
    flt maxdist; /* maximum distance to search for intersections */
    vector s; /* startpoint of the ray (may differ from origin) */
    vector e; /* endpoint of the ray if bounded */
    scenedef *scene; /* pointer to the scene, for global parms such as */
    /* background colors etc */
} ray;

/* Header describing one RPC message */
typedef struct {
    int type; /* RPC call type */
    int from; /* Sending processor */
    int len; /* length of parms in bytes */
    void *parms; /* Parameters to RPC */
} rpcmsg;

================================================
FILE: third-party/tbb/examples/parallel_for/tachyon/src/ui.cpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

/*
    The original source for this example is
    Copyright (c) 1994-2008 John E. Stone
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:
    1. Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
    2.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ /* * ui.cpp - Contains functions for dealing with user interfaces */ #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "util.hpp" #include "ui.hpp" static void (*rt_static_ui_message)(int, const char *) = nullptr; static void (*rt_static_ui_progress)(int) = nullptr; static int (*rt_static_ui_checkaction)(void) = nullptr; extern bool silent_mode; void set_rt_ui_message(void (*func)(int, const char *)) { rt_static_ui_message = func; } void set_rt_ui_progress(void (*func)(int)) { rt_static_ui_progress = func; } void rt_ui_message(int level, const char *msg) { if (rt_static_ui_message == nullptr) { if (!silent_mode) { fprintf(stderr, "%s\n", msg); fflush(stderr); } } else { rt_static_ui_message(level, msg); } } void rt_ui_progress(int percent) { if (rt_static_ui_progress != nullptr) rt_static_ui_progress(percent); else { if (!silent_mode) { fprintf(stderr, "\r %3d%% Complete \r", percent); fflush(stderr); } } } int rt_ui_checkaction(void) { if (rt_static_ui_checkaction != nullptr) return rt_static_ui_checkaction(); else return 0; } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/ui.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

/*
 * ui.h - defines for user interface functions
 *
 * $Id: ui.h,v 1.2 2007-02-22 17:54:16 Exp $
 */

/* Different types of message, for levels of verbosity etc */
#define MSG_0 100
#define MSG_1 101
#define MSG_2 102
#define MSG_3 103
#define MSG_4 104
#define MSG_5 105
#define MSG_ERR 200
#define MSG_ABORT 300

/* Deliver a text message; the int is one of the MSG_* codes above */
void rt_ui_message(int, const char *);

/* Report rendering progress as a percentage */
void rt_ui_progress(int);

/* Query the user interface for a pending action; returns an int code */
int rt_ui_checkaction(void);

================================================
FILE: third-party/tbb/examples/parallel_for/tachyon/src/util.cpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

/*
    The original source for this example is
    Copyright (c) 1994-2008 John E. Stone
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:
    1. Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
    2. Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
    3. The name of the author may not be used to endorse or promote products
       derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * util.cpp - Contains all of the timing functions for various platforms. */ #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "util.hpp" #include "light.hpp" #include "global.hpp" #include "ui.hpp" void rt_finalize(void); #if !defined(_WIN32) #include #include void rt_sleep(int msec) { usleep(msec * 1000); } #else //_WIN32 #undef OLDUNIXTIME #undef STDTIME void rt_sleep(int msec) { #if !WIN8UI_EXAMPLE Sleep(msec); #else std::chrono::milliseconds sleep_time(msec); std::this_thread::sleep_for(sleep_time); #endif } timer gettimer(void) { return GetTickCount(); } flt timertime(timer st, timer fn) { double ttime, start, end; start = ((double)st) / ((double)1000.00); end = ((double)fn) / ((double)1000.00); ttime = end - start; return ttime; } #endif /* _WIN32 */ /* if we're on a Unix with gettimeofday() we'll use newer timers */ #if defined(STDTIME) struct timezone tz; timer gettimer(void) { timer t; gettimeofday(&t, &tz); return t; } flt timertime(timer st, timer fn) { double ttime, start, end; start = (st.tv_sec + 1.0 * st.tv_usec / 1000000.0); end = (fn.tv_sec + 1.0 * fn.tv_usec / 1000000.0); ttime = end - start; return ttime; } #endif /* STDTIME */ /* use the old fashioned Unix time functions */ #if defined(OLDUNIXTIME) timer gettimer(void) { 
return time(nullptr); } flt timertime(timer st, timer fn) { return difftime(fn, st); ; } #endif /* OLDUNIXTIME */ /* random other helper utility functions */ int rt_meminuse(void) { return rt_mem_in_use; } void* rt_getmem(unsigned int bytes) { void* mem; mem = malloc(bytes); if (mem != nullptr) { rt_mem_in_use += bytes; } else { rtbomb("No more memory!!!!"); } return mem; } unsigned int rt_freemem(void* addr) { unsigned int bytes; free(addr); bytes = 0; rt_mem_in_use -= bytes; return bytes; } void rtbomb(const char* msg) { rt_ui_message(MSG_ERR, msg); rt_ui_message(MSG_ABORT, "Rendering Aborted."); rt_finalize(); std::exit(-1); } void rtmesg(const char* msg) { rt_ui_message(MSG_0, msg); } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/util.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * util.h - This file contains defines for the timer functions... 
*
* $Id: util.h,v 1.3 2007-02-22 17:54:17 Exp $
*/

#include "machine.hpp"

/*
 * Select the `timer` type for the platform.
 * NOTE(review): the bare #include directives below carry no header name;
 * the names look like they were lost from this copy - confirm against the
 * upstream file before building.
 */
#if defined(_WIN32)
#include
#if defined(WINAPI_FAMILY) && (WINAPI_FAMILY == WINAPI_FAMILY_APP)
#define WIN8UI_EXAMPLE 1
#include
typedef ULONGLONG timer;
/* route GetTickCount() calls to the 64-bit variant in this configuration */
#ifdef GetTickCount
#undef GetTickCount
#endif
#define GetTickCount GetTickCount64
#else
typedef DWORD timer;
#endif
#else
#include
#include
#if defined(STDTIME)
typedef timeval timer;
#elif defined(OLDUNIXTIME)
typedef time_t timer;
#endif /* OLDUNIXTIME */ /* STDTIME */
#endif /* _WIN32 */

/* Sample the current time */
timer gettimer(void);

/* Elapsed time between samples st (start) and fn (finish) */
flt timertime(timer st, timer fn);

/* Sleep for the given number of milliseconds */
void rt_sleep(int);

/* Memory helpers: allocation counter, allocate, free */
int rt_meminuse(void);
void *rt_getmem(unsigned int);
unsigned int rt_freemem(void *);

/* Report a fatal error message and abort rendering */
void rtbomb(const char *);

/* Report an informational message */
void rtmesg(const char *);

================================================
FILE: third-party/tbb/examples/parallel_for/tachyon/src/vector.cpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

/*
    The original source for this example is
    Copyright (c) 1994-2008 John E. Stone
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:
    1. Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
    2.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * vector.cpp - This file contains all of the vector arithmetic functions. 
*/ #include "machine.hpp" #include "types.hpp" #include "macros.hpp" flt VDot(vector *a, vector *b) { return (a->x * b->x + a->y * b->y + a->z * b->z); } void VCross(vector *a, vector *b, vector *c) { c->x = (a->y * b->z) - (a->z * b->y); c->y = (a->z * b->x) - (a->x * b->z); c->z = (a->x * b->y) - (a->y * b->x); } flt VLength(vector *a) { return (flt)sqrt((a->x * a->x) + (a->y * a->y) + (a->z * a->z)); } void VNorm(vector *a) { flt len; len = sqrt((a->x * a->x) + (a->y * a->y) + (a->z * a->z)); if (len != 0.0) { a->x /= len; a->y /= len; a->z /= len; } } void VAdd(vector *a, vector *b, vector *c) { c->x = (a->x + b->x); c->y = (a->y + b->y); c->z = (a->z + b->z); } void VSub(vector *a, vector *b, vector *c) { c->x = (a->x - b->x); c->y = (a->y - b->y); c->z = (a->z - b->z); } void VAddS(flt a, vector *A, vector *B, vector *C) { C->x = (a * A->x) + B->x; C->y = (a * A->y) + B->y; C->z = (a * A->z) + B->z; } vector Raypnt(ray *a, flt t) { vector temp; temp.x = a->o.x + (a->d.x * t); temp.y = a->o.y + (a->d.y * t); temp.z = a->o.z + (a->d.z * t); return temp; } void VScale(vector *a, flt s) { a->x *= s; a->y *= s; a->z *= s; } void ColorAddS(color *a, color *b, flt s) { a->r += b->r * s; a->g += b->g * s; a->b += b->b * s; } void ColorAccum(color *a, color *b) { a->r += b->r; a->g += b->g; a->b += b->b; } void ColorScale(color *a, flt s) { a->r *= s; a->g *= s; a->b *= s; } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/vector.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. The name of the author may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

/*
 * vector.h - This file contains declarations of vector functions
 *
 * $Id: vector.h,v 1.2 2007-02-22 17:54:17 Exp $
 */

flt VDot(vector *, vector *); /* dot product of the two vectors */
void VCross(vector *, vector *, vector *); /* third = first x second */
flt VLength(vector *); /* Euclidean length */
void VNorm(vector *); /* normalize in place; zero-length vectors are left unchanged */
void VAdd(vector *, vector *, vector *); /* third = first + second */
void VSub(vector *, vector *, vector *); /* third = first - second */
void VAddS(flt, vector *, vector *, vector *); /* fourth = scalar * second + third */
vector Raypnt(ray *, flt); /* point at origin + direction * t */
void VScale(vector *a, flt s); /* a *= s */
void ColorAddS(color *a, color *b, flt s); /* a += b * s */
void ColorAccum(color *a, color *b); /* a += b */
void ColorScale(color *a, flt s); /* a *= s */

================================================
FILE: third-party/tbb/examples/parallel_for/tachyon/src/vol.cpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

/*
    The original source for this example is
    Copyright (c) 1994-2008 John E. Stone
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:
    1. Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
    2. Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
    3. The name of the author may not be used to endorse or promote products
       derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * vol.cpp - Volume rendering helper routines etc. */ #include #include "machine.hpp" #include "types.hpp" #include "macros.hpp" #include "vector.hpp" #include "util.hpp" #include "vol.hpp" #include "box.hpp" #include "trace.hpp" #include "ui.hpp" #include "light.hpp" #include "shade.hpp" int scalarvol_bbox(void *obj, vector *min, vector *max) { box *b = (box *)obj; *min = b->min; *max = b->max; return 1; } void *newscalarvol(void *intex, vector min, vector max, int xs, int ys, int zs, char *fname, scalarvol *invol) { box *bx; texture *tx, *tex; scalarvol *vol; tex = (texture *)intex; tex->shadowcast = 0; /* doesn't cast a shadow */ tx = (texture *)rt_getmem(sizeof(texture)); /* is the volume data already loaded? 
*/ if (invol == nullptr) { vol = (scalarvol *)rt_getmem(sizeof(scalarvol)); vol->loaded = 0; vol->data = nullptr; } else vol = invol; vol->opacity = tex->opacity; vol->xres = xs; vol->yres = ys; vol->zres = zs; strcpy(vol->name, fname); tx->ctr.x = 0.0; tx->ctr.y = 0.0; tx->ctr.z = 0.0; tx->rot = tx->ctr; tx->scale = tx->ctr; tx->uaxs = tx->ctr; tx->vaxs = tx->ctr; tx->islight = 0; tx->shadowcast = 0; /* doesn't cast a shadow */ tx->col = tex->col; tx->ambient = 1.0; tx->diffuse = 0.0; tx->specular = 0.0; tx->opacity = 1.0; tx->img = vol; tx->texfunc = (color(*)(void *, void *, void *))(scalar_volume_texture); bx = newbox(tx, min, max); tx->obj = (void *)bx; /* XXX hack! */ return (void *)bx; } color VoxelColor(flt scalar) { color col; if (scalar > 1.0) scalar = 1.0; if (scalar < 0.0) scalar = 0.0; if (scalar < 0.25) { col.r = scalar * 4.0; col.g = 0.0; col.b = 0.0; } else { if (scalar < 0.75) { col.r = 1.0; col.g = (scalar - 0.25) * 2.0; col.b = 0.0; } else { col.r = 1.0; col.g = 1.0; col.b = (scalar - 0.75) * 4.0; } } return col; } color scalar_volume_texture(vector *hit, texture *tex, ray *ry) { color col, col2; box *bx; flt a, tx1, tx2, ty1, ty2, tz1, tz2; flt tnear, tfar; flt t, tdist, dt, sum, tt; vector pnt, bln; scalarvol *vol; flt scalar, transval; int x, y, z; unsigned char *ptr; bx = (box *)tex->obj; vol = (scalarvol *)bx->tex->img; col.r = 0.0; col.g = 0.0; col.b = 0.0; tnear = -FHUGE; tfar = FHUGE; if (ry->d.x == 0.0) { if ((ry->o.x < bx->min.x) || (ry->o.x > bx->max.x)) return col; } else { tx1 = (bx->min.x - ry->o.x) / ry->d.x; tx2 = (bx->max.x - ry->o.x) / ry->d.x; if (tx1 > tx2) { a = tx1; tx1 = tx2; tx2 = a; } if (tx1 > tnear) tnear = tx1; if (tx2 < tfar) tfar = tx2; } if (tnear > tfar) return col; if (tfar < 0.0) return col; if (ry->d.y == 0.0) { if ((ry->o.y < bx->min.y) || (ry->o.y > bx->max.y)) return col; } else { ty1 = (bx->min.y - ry->o.y) / ry->d.y; ty2 = (bx->max.y - ry->o.y) / ry->d.y; if (ty1 > ty2) { a = ty1; ty1 = ty2; ty2 = a; } if 
(ty1 > tnear) tnear = ty1; if (ty2 < tfar) tfar = ty2; } if (tnear > tfar) return col; if (tfar < 0.0) return col; if (ry->d.z == 0.0) { if ((ry->o.z < bx->min.z) || (ry->o.z > bx->max.z)) return col; } else { tz1 = (bx->min.z - ry->o.z) / ry->d.z; tz2 = (bx->max.z - ry->o.z) / ry->d.z; if (tz1 > tz2) { a = tz1; tz1 = tz2; tz2 = a; } if (tz1 > tnear) tnear = tz1; if (tz2 < tfar) tfar = tz2; } if (tnear > tfar) return col; if (tfar < 0.0) return col; if (tnear < 0.0) tnear = 0.0; tdist = sqrt((flt)(vol->xres * vol->xres + vol->yres * vol->yres + vol->zres * vol->zres)); tt = (vol->opacity / tdist); bln.x = fabs(bx->min.x - bx->max.x); bln.y = fabs(bx->min.y - bx->max.y); bln.z = fabs(bx->min.z - bx->max.z); dt = sqrt(bln.x * bln.x + bln.y * bln.y + bln.z * bln.z) / tdist; sum = 0.0; /* move the volume residency check out of loop.. */ if (!vol->loaded) { LoadVol(vol); vol->loaded = 1; } for (t = tnear; t <= tfar; t += dt) { pnt.x = ((ry->o.x + (ry->d.x * t)) - bx->min.x) / bln.x; pnt.y = ((ry->o.y + (ry->d.y * t)) - bx->min.y) / bln.y; pnt.z = ((ry->o.z + (ry->d.z * t)) - bx->min.z) / bln.z; x = (int)((vol->xres - 1.5) * pnt.x + 0.5); y = (int)((vol->yres - 1.5) * pnt.y + 0.5); z = (int)((vol->zres - 1.5) * pnt.z + 0.5); ptr = vol->data + ((vol->xres * vol->yres * z) + (vol->xres * y) + x); scalar = (flt)((flt)1.0 * ((int)ptr[0])) / 255.0; sum += tt * scalar; transval = tt * scalar; col2 = VoxelColor(scalar); if (sum < 1.0) { col.r += transval * col2.r; col.g += transval * col2.g; col.b += transval * col2.b; if (sum < 0.0) sum = 0.0; } else { sum = 1.0; } } if (sum < 1.0) { /* spawn transmission rays / refraction */ color transcol; transcol = shade_transmission(ry, hit, 1.0 - sum); col.r += transcol.r; /* add the transmitted ray */ col.g += transcol.g; /* to the diffuse and */ col.b += transcol.b; /* transmission total.. 
*/ } return col; } void LoadVol(scalarvol *vol) { FILE *dfile; std::size_t status; char msgtxt[2048]; dfile = fopen(vol->name, "r"); if (dfile == nullptr) { char msgtxt[2048]; sprintf(msgtxt, "Vol: can't open %s for input!!! Aborting\n", vol->name); rt_ui_message(MSG_ERR, msgtxt); rt_ui_message(MSG_ABORT, "Rendering Aborted."); std::exit(-1); } sprintf( msgtxt, "loading %dx%dx%d volume set from %s", vol->xres, vol->yres, vol->zres, vol->name); rt_ui_message(MSG_0, msgtxt); vol->data = (unsigned char *)rt_getmem(vol->xres * vol->yres * vol->zres); status = fread(vol->data, 1, (vol->xres * vol->yres * vol->zres), dfile); fclose(dfile); } ================================================ FILE: third-party/tbb/examples/parallel_for/tachyon/src/vol.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* The original source for this example is Copyright (c) 1994-2008 John E. Stone All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. 
The name of the author may not be used to endorse or promote products
       derived from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
    OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
    DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
    DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
    IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
    IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
 * vol.h - Volume rendering definitions etc.
 *
 *
 * $Id: vol.h,v 1.2 2007-02-22 17:54:17 Exp $
 */

/* Create a box between min and max shaded as an xs * ys * zs scalar volume */
/* stored in file fname; invol may supply an existing volume record         */
void *newscalarvol(void *intex,
                   vector min,
                   vector max,
                   int xs,
                   int ys,
                   int zs,
                   char *fname,
                   scalarvol *invol);

/* Read the volume's raw bytes from its file into memory */
void LoadVol(scalarvol *);

/* Texture function that accumulates voxel colors along the ray */
color scalar_volume_texture(vector *, texture *, ray *);

================================================
FILE: third-party/tbb/examples/parallel_for_each/README.md
================================================
# Code Samples of oneAPI Threading Building Blocks (oneTBB)
Examples using `parallel_for_each` algorithm.

| Code sample name | Description
|:--- |:---
| parallel_preorder | Parallel preorder traversal of a graph.

================================================
FILE: third-party/tbb/examples/parallel_for_each/parallel_preorder/CMakeLists.txt
================================================
# Copyright (c) 2020-2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

cmake_minimum_required(VERSION 3.5.0...3.31.3)

project(parallel_preorder CXX)

include(../../common/cmake/common.cmake)

set_common_project_settings(tbb)

add_executable(parallel_preorder parallel_preorder.cpp Graph.cpp main.cpp)

target_link_libraries(parallel_preorder TBB::tbb Threads::Threads)
target_compile_options(parallel_preorder PRIVATE ${TBB_CXX_STD_FLAG})

# Full path of the built example binary, resolved at generate time and
# passed as the command to every execution target below.
set(EXECUTABLE "$<TARGET_FILE:parallel_preorder>")
set(ARGS "")
set(PERF_ARGS auto silent 500000 100)
set(LIGHT_ARGS 1:auto:+4 n-of-traversals=50)

add_execution_target(run_parallel_preorder parallel_preorder ${EXECUTABLE} "${ARGS}")
add_execution_target(perf_run_parallel_preorder parallel_preorder ${EXECUTABLE} "${PERF_ARGS}")
add_execution_target(light_test_parallel_preorder parallel_preorder ${EXECUTABLE} "${LIGHT_ARGS}")

================================================
FILE: third-party/tbb/examples/parallel_for_each/parallel_preorder/Graph.cpp
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/ #include #include #include "Graph.hpp" Cell::Cell(const Cell& other) : op(other.op), value(other.value), successor(other.successor) { ref_count = other.ref_count.load(); input[0] = other.input[0]; input[1] = other.input[1]; } void Graph::create_random_dag(std::size_t number_of_nodes) { my_vertex_set.resize(number_of_nodes); for (std::size_t k = 0; k < number_of_nodes; ++k) { Cell& c = my_vertex_set[k]; int op = int((rand() >> 8) % 5u); if (op > int(k)) op = int(k); switch (op) { default: c.op = OP_VALUE; c.value = Cell::value_type((float)k); break; case 1: c.op = OP_NEGATE; break; case 2: c.op = OP_SUB; break; case 3: c.op = OP_ADD; break; case 4: c.op = OP_MUL; break; } for (int j = 0; j < ArityOfOp[c.op]; ++j) { Cell& input = my_vertex_set[rand() % k]; c.input[j] = &input; } } } void Graph::print() { for (std::size_t k = 0; k < my_vertex_set.size(); ++k) { std::cout << "Cell " << k << ":"; for (std::size_t j = 0; j < my_vertex_set[k].successor.size(); ++j) std::cout << " " << int(my_vertex_set[k].successor[j] - &my_vertex_set[0]); std::cout << "\n"; } } void Graph::get_root_set(std::vector& root_set) { for (std::size_t k = 0; k < my_vertex_set.size(); ++k) { my_vertex_set[k].successor.clear(); } root_set.clear(); for (std::size_t k = 0; k < my_vertex_set.size(); ++k) { Cell& c = my_vertex_set[k]; c.ref_count = ArityOfOp[c.op]; for (int j = 0; j < ArityOfOp[c.op]; ++j) { c.input[j]->successor.push_back(&c); } if (ArityOfOp[c.op] == 0) root_set.push_back(&my_vertex_set[k]); } } void Cell::update() { switch (op) { case OP_VALUE: break; case OP_NEGATE: value = -(input[0]->value); break; case OP_ADD: value = input[0]->value + input[1]->value; break; case OP_SUB: value = input[0]->value - input[1]->value; break; case OP_MUL: value = input[0]->value * input[1]->value; break; } } ================================================ FILE: third-party/tbb/examples/parallel_for_each/parallel_preorder/Graph.hpp ================================================ /* Copyright (c) 
2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef TBB_examples_parallel_preorder_graph_H #define TBB_examples_parallel_preorder_graph_H #include #include #include "Matrix.hpp" enum OpKind { // Use Cell's value OP_VALUE, // Unary negation OP_NEGATE, // Addition OP_ADD, // Subtraction OP_SUB, // Multiplication OP_MUL }; static const int ArityOfOp[] = { 0, 1, 2, 2, 2 }; class Cell { public: //! Operation for this cell OpKind op; //! Inputs to this cell Cell* input[2]; //! Type of value stored in a Cell typedef Matrix value_type; //! Value associated with this Cell value_type value; //! Set of cells that use this Cell as an input std::vector successor; //! Reference count of number of inputs that are not yet updated. std::atomic ref_count; //! Update the Cell's value. void update(); //! Default constructor Cell() {} //! Copy constructor Cell(const Cell& other); }; //! A directed graph where the vertices are Cells. class Graph { std::vector my_vertex_set; public: //! Create a random acyclic directed graph void create_random_dag(std::size_t number_of_nodes); //! Print the graph void print(); //! Get set of cells that have no inputs. 
//! Square matrix of floats with a fixed dimension of 20.
class Matrix {
    static const int n = 20;
    float array[n][n];

public:
    //! Leaves all elements uninitialized.
    Matrix() {}

    //! Builds z times the identity: z on the main diagonal, zero elsewhere.
    Matrix(float z) {
        for (int row = 0; row < n; ++row) {
            for (int col = 0; col < n; ++col) {
                array[row][col] = 0;
            }
            array[row][row] = z;
        }
    }

    //! Element-wise negation.
    friend Matrix operator-(const Matrix& x) {
        Matrix negated;
        for (int row = 0; row < n; ++row) {
            const float* src = x.array[row];
            float* dst = negated.array[row];
            for (int col = 0; col < n; ++col) {
                dst[col] = -src[col];
            }
        }
        return negated;
    }

    //! Element-wise sum.
    friend Matrix operator+(const Matrix& x, const Matrix& y) {
        Matrix sum;
        for (int row = 0; row < n; ++row) {
            const float* lhs = x.array[row];
            const float* rhs = y.array[row];
            float* dst = sum.array[row];
            for (int col = 0; col < n; ++col) {
                dst[col] = lhs[col] + rhs[col];
            }
        }
        return sum;
    }

    //! Element-wise difference.
    friend Matrix operator-(const Matrix& x, const Matrix& y) {
        Matrix difference;
        for (int row = 0; row < n; ++row) {
            const float* lhs = x.array[row];
            const float* rhs = y.array[row];
            float* dst = difference.array[row];
            for (int col = 0; col < n; ++col) {
                dst[col] = lhs[col] - rhs[col];
            }
        }
        return difference;
    }

    //! Matrix product, accumulated in row / inner / column order.
    friend Matrix operator*(const Matrix& x, const Matrix& y) {
        Matrix product(0);
        for (int row = 0; row < n; ++row) {
            float* dst = product.array[row];
            for (int inner = 0; inner < n; ++inner) {
                const float scale = x.array[row][inner];
                const float* rhs = y.array[inner];
                for (int col = 0; col < n; ++col) {
                    dst[col] += scale * rhs[col];
                }
            }
        }
        return product;
    }
};
The times printed are for the traversal and update, and do not include the time for computing the `root_set`. The example uses custom synchronization via the atomic `ref_count` variable. Correctness checking tools might not take this into account, and may report data races between different tasks that are actually synchronized. **Note:** It is important to understand that this example is unlikely to show speedup if the cell values are changed to type `float`. The reason is twofold. * The smaller value type causes each `Cell` to be significantly smaller than a cache line, which leads to false sharing conflicts. * The time to update the cells becomes very small, and consequently the overhead of `parallel_for_each` swamps the useful work. ## Building the example ``` cmake <path_to_example> cmake --build . ``` ## Running the sample ### Predefined make targets * `make run_parallel_preorder` - executes the example with predefined parameters * `make perf_run_parallel_preorder` - executes the example with suggested parameters to measure the oneTBB performance * `make light_test_parallel_preorder` - executes the example with suggested parameters to reduce execution time. ### Application parameters Usage: ``` parallel_preorder [n-of-threads=value] [n-of-nodes=value] [n-of-traversals=value] [silent] [-h] [n-of-threads [n-of-nodes [n-of-traversals]]] ``` * `-h` - prints the help for command line options. * `n-of-threads` - the number of threads to use; a range of the form low\[:high\], where low and optional high are non-negative integers or `auto` for a platform-specific default number. * `n-of-nodes` - the number of nodes in the graph. Default value is 1000. * `n-of-traversals` - the number of times to evaluate the graph. Default value is 500. * `silent` - no output except elapsed time.
================================================ FILE: third-party/tbb/examples/parallel_for_each/parallel_preorder/main.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* Example program that shows how to use parallel_for_each to do parallel preorder traversal of a directed acyclic graph. */ #include #include #include #include "oneapi/tbb/tick_count.h" #include "oneapi/tbb/global_control.h" #include "common/utility/utility.hpp" #include "common/utility/get_default_num_threads.hpp" #include "Graph.hpp" // some forward declarations class Cell; void ParallelPreorderTraversal(const std::vector& root_set); //------------------------------------------------------------------------ // Test driver //------------------------------------------------------------------------ static unsigned nodes = 1000; static unsigned traversals = 500; static bool SilentFlag = false; //! Parse the command line. static void ParseCommandLine(int argc, char* argv[], utility::thread_number_range& threads) { utility::parse_cli_arguments( argc, argv, utility::cli_argument_pack() //"-h" option for displaying help is present implicitly .positional_arg(threads, "n-of-threads", utility::thread_number_range_desc) .positional_arg(nodes, "n-of-nodes", "number of nodes in the graph.") .positional_arg( traversals, "n-of-traversals", "number of times to evaluate the graph. Reduce it (e.g. 
to 100) to shorten example run time\n") .arg(SilentFlag, "silent", "no output except elapsed time ")); } int main(int argc, char* argv[]) { utility::thread_number_range threads(utility::get_default_num_threads); oneapi::tbb::tick_count main_start = oneapi::tbb::tick_count::now(); ParseCommandLine(argc, argv, threads); // Start scheduler with given number of threads. for (int p = threads.first; p <= threads.last; p = threads.step(p)) { oneapi::tbb::tick_count t0 = oneapi::tbb::tick_count::now(); oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, p); srand(2); std::size_t root_set_size = 0; { Graph g; g.create_random_dag(nodes); std::vector root_set; g.get_root_set(root_set); root_set_size = root_set.size(); for (unsigned int trial = 0; trial < traversals; ++trial) { ParallelPreorderTraversal(root_set); } } oneapi::tbb::tick_count::interval_t interval = oneapi::tbb::tick_count::now() - t0; if (!SilentFlag) { std::cout << interval.seconds() << " seconds using " << p << " threads (" << root_set_size << " nodes in root_set)\n"; } } utility::report_elapsed_time((oneapi::tbb::tick_count::now() - main_start).seconds()); return 0; } ================================================ FILE: third-party/tbb/examples/parallel_for_each/parallel_preorder/parallel_preorder.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include #include "oneapi/tbb/parallel_for_each.h" #include "Graph.hpp" class Body { public: Body(){}; //------------------------------------------------------------------------ // Following signatures are required by parallel_for_each //------------------------------------------------------------------------ typedef Cell* argument_type; void operator()(Cell* c, oneapi::tbb::feeder& feeder) const { c->update(); // Restore ref_count in preparation for subsequent traversal. c->ref_count = ArityOfOp[c->op]; for (std::size_t k = 0; k < c->successor.size(); ++k) { Cell* successor = c->successor[k]; // ref_count is used for inter-task synchronization. // Correctness checking tools might not take this into account, and report // data races between different tasks, that are actually synchronized. if (0 == --(successor->ref_count)) { feeder.add(successor); } } } }; void ParallelPreorderTraversal(const std::vector& root_set) { oneapi::tbb::parallel_for_each(root_set.begin(), root_set.end(), Body()); } ================================================ FILE: third-party/tbb/examples/parallel_pipeline/README.md ================================================ # Code Samples of oneAPI Threading Building Blocks (oneTBB) Examples using `parallel_pipeline` algorithm. | Code sample name | Description |:--- |:--- | square | Another string transformation example that squares numbers read from a file. ================================================ FILE: third-party/tbb/examples/parallel_pipeline/square/CMakeLists.txt ================================================ # Copyright (c) 2020-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. cmake_minimum_required(VERSION 3.5.0...3.31.3) project(square CXX) include(../../common/cmake/common.cmake) set_common_project_settings(tbb) add_executable(square gen_input.cpp square.cpp) target_link_libraries(square TBB::tbb Threads::Threads) target_compile_options(square PRIVATE ${TBB_CXX_STD_FLAG}) if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL IntelLLVM) target_compile_options(square PRIVATE -D_CRT_SECURE_NO_WARNINGS) endif() set(EXECUTABLE "$") set(ARGS 0 input.txt output.txt) set(PERF_ARGS auto input.txt output.txt silent) add_execution_target(run_square square ${EXECUTABLE} "${ARGS}") add_execution_target(perf_run_square square ${EXECUTABLE} "${PERF_ARGS}") ================================================ FILE: third-party/tbb/examples/parallel_pipeline/square/README.md ================================================ # Square sample Text filter that demonstrates the use of `parallel_pipeline`. Example program reads a file containing decimal integers in text format, and changes each to its square. ## Building the example ``` cmake cmake --build . ``` ## Running the sample ### Predefined make targets * `make run_square` - executes the example with predefined parameters * `make perf_run_square` - executes the example with suggested parameters to measure the oneTBB performance * `make light_test_square` - executes the example with suggested parameters to reduce execution time. 
### Application parameters Usage: ``` square [n-of-threads=value] [input-file=value] [output-file=value] [max-slice-size=value] [silent] [-h] [n-of-threads [input-file [output-file [max-slice-size]]]] ``` * `-h` - prints the help for command line options. * `n-of-threads` - the number of threads to use; a range of the form low\[:high\], where low and optional high are non-negative integers or `auto` for a platform-specific default number. * `input-file` - the input file name. * `output-file` - the output file name. * `max-slice-size` - the maximum number of characters in one slice. * `silent` - no output except elapsed time. ================================================ FILE: third-party/tbb/examples/parallel_pipeline/square/gen_input.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include #include #if _WIN32 #include #ifndef F_OK #define F_OK 0 #endif #define access _access #else #include #endif const long INPUT_SIZE = 1000000; //!
Generates sample input for square.cpp void gen_input(const char *fname) { long num = INPUT_SIZE; FILE *fptr = fopen(fname, "w"); if (!fptr) { throw std::runtime_error("Could not open file for generating input"); } int a = 0; int b = 1; for (long j = 0; j < num; ++j) { fprintf(fptr, "%u\n", a); b += a; a = (b - a) % 10000; if (a < 0) a = -a; } if (fptr) { fclose(fptr); } } void generate_if_needed(const char *fname) { if (access(fname, F_OK) != 0) gen_input(fname); } ================================================ FILE: third-party/tbb/examples/parallel_pipeline/square/square.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ // // Example program that reads a file of decimal integers in text format // and changes each to its square. // #include #include #include #include #include "oneapi/tbb/parallel_pipeline.h" #include "oneapi/tbb/tick_count.h" #include "oneapi/tbb/tbb_allocator.h" #include "oneapi/tbb/global_control.h" #include "common/utility/utility.hpp" #include "common/utility/get_default_num_threads.hpp" extern void generate_if_needed(const char*); //! Holds a slice of text. /** Instances *must* be allocated/freed using methods herein, because the C++ declaration represents only the header of a much larger object in memory. */ class TextSlice { //! Pointer to one past last character in sequence char* logical_end; //! Pointer to one past last available byte in sequence. char* physical_end; public: //! 
Allocate a TextSlice object that can hold up to max_size characters. static TextSlice* allocate(std::size_t max_size) { // +1 leaves room for a terminating null character. TextSlice* t = (TextSlice*)oneapi::tbb::tbb_allocator().allocate(sizeof(TextSlice) + max_size + 1); t->logical_end = t->begin(); t->physical_end = t->begin() + max_size; return t; } //! Free a TextSlice object void free() { oneapi::tbb::tbb_allocator().deallocate( (char*)this, sizeof(TextSlice) + (physical_end - begin()) + 1); } //! Pointer to beginning of sequence char* begin() { return (char*)(this + 1); } //! Pointer to one past last character in sequence char* end() { return logical_end; } //! Length of sequence std::size_t size() const { return logical_end - (char*)(this + 1); } //! Maximum number of characters that can be appended to sequence std::size_t avail() const { return physical_end - logical_end; } //! Append sequence [first,last) to this sequence. void append(char* first, char* last) { memcpy(logical_end, first, last - first); logical_end += last - first; } //! Set end() to given value. void set_end(char* p) { logical_end = p; } }; std::size_t MAX_CHAR_PER_INPUT_SLICE = 4000; std::string InputFileName = "input.txt"; std::string OutputFileName = "output.txt"; TextSlice* next_slice = nullptr; class MyInputFunc { public: MyInputFunc(FILE* input_file_); MyInputFunc(const MyInputFunc& f) : input_file(f.input_file) {} ~MyInputFunc(); TextSlice* operator()(oneapi::tbb::flow_control& fc) const; private: FILE* input_file; }; MyInputFunc::MyInputFunc(FILE* input_file_) : input_file(input_file_) {} MyInputFunc::~MyInputFunc() {} TextSlice* MyInputFunc::operator()(oneapi::tbb::flow_control& fc) const { // Read characters into space that is available in the next slice. 
if (!next_slice) next_slice = TextSlice::allocate(MAX_CHAR_PER_INPUT_SLICE); std::size_t m = next_slice->avail(); std::size_t n = fread(next_slice->end(), 1, m, input_file); if (!n && next_slice->size() == 0) { // No more characters to process fc.stop(); return nullptr; } else { // Have more characters to process. TextSlice* t = next_slice; next_slice = TextSlice::allocate(MAX_CHAR_PER_INPUT_SLICE); char* p = t->end() + n; if (n == m) { // Might have read partial number. // If so, transfer characters of partial number to next slice. while (p > t->begin() && isdigit(p[-1])) --p; assert(p > t->begin()); // Number too large to fit in buffer next_slice->append(p, t->end() + n); } t->set_end(p); return t; } } // Functor that changes each decimal number to its square. class MyTransformFunc { public: TextSlice* operator()(TextSlice* input) const; }; TextSlice* MyTransformFunc::operator()(TextSlice* input) const { // Add terminating null so that strtol works right even if number is at end of the input. *input->end() = '\0'; char* p = input->begin(); TextSlice* out = TextSlice::allocate(2 * MAX_CHAR_PER_INPUT_SLICE); char* q = out->begin(); for (;;) { while (p < input->end() && !isdigit(*p)) *q++ = *p++; if (p == input->end()) break; long x = strtol(p, &p, 10); // Note: no overflow checking is needed here, as we have twice the // input string length, but the square of a non-negative integer n // cannot have more than twice as many digits as n. long y = x * x; sprintf(q, "%ld", y); q = strchr(q, 0); } out->set_end(q); input->free(); return out; } // Functor that writes a TextSlice to a file. 
class MyOutputFunc { FILE* my_output_file; public: MyOutputFunc(FILE* output_file); void operator()(TextSlice* item) const; }; MyOutputFunc::MyOutputFunc(FILE* output_file) : my_output_file(output_file) {} void MyOutputFunc::operator()(TextSlice* out) const { std::size_t n = fwrite(out->begin(), 1, out->size(), my_output_file); if (n != out->size()) { fprintf(stderr, "Can't write into file '%s'\n", OutputFileName.c_str()); std::exit(-1); } out->free(); } bool silent = false; int run_pipeline(int nthreads) { FILE* input_file = fopen(InputFileName.c_str(), "r"); if (!input_file) { throw std::invalid_argument(("Invalid input file name: " + InputFileName).c_str()); return 0; } FILE* output_file = fopen(OutputFileName.c_str(), "w"); if (!output_file) { throw std::invalid_argument(("Invalid output file name: " + OutputFileName).c_str()); return 0; } oneapi::tbb::tick_count t0 = oneapi::tbb::tick_count::now(); // Need more than one token in flight per thread to keep all threads // busy; 2-4 works oneapi::tbb::parallel_pipeline( nthreads * 4, oneapi::tbb::make_filter(oneapi::tbb::filter_mode::serial_in_order, MyInputFunc(input_file)) & oneapi::tbb::make_filter(oneapi::tbb::filter_mode::parallel, MyTransformFunc()) & oneapi::tbb::make_filter(oneapi::tbb::filter_mode::serial_in_order, MyOutputFunc(output_file))); oneapi::tbb::tick_count t1 = oneapi::tbb::tick_count::now(); fclose(output_file); fclose(input_file); if (!silent) printf("time = %g\n", (t1 - t0).seconds()); return 1; } int main(int argc, char* argv[]) { oneapi::tbb::tick_count mainStartTime = oneapi::tbb::tick_count::now(); // The 1st argument is the function to obtain 'auto' value; the 2nd is the default value // The example interprets 0 threads as "run serially, then fully subscribed" utility::thread_number_range threads(utility::get_default_num_threads, 0); utility::parse_cli_arguments( argc, argv, utility::cli_argument_pack() //"-h" option for displaying help is present implicitly .positional_arg(threads, 
"n-of-threads", utility::thread_number_range_desc) .positional_arg(InputFileName, "input-file", "input file name") .positional_arg(OutputFileName, "output-file", "output file name") .positional_arg(MAX_CHAR_PER_INPUT_SLICE, "max-slice-size", "the maximum number of characters in one slice") .arg(silent, "silent", "no output except elapsed time")); generate_if_needed(InputFileName.c_str()); if (threads.first) { for (int p = threads.first; p <= threads.last; p = threads.step(p)) { if (!silent) printf("threads = %d ", p); oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, p); if (!run_pipeline(p)) return -1; } } else { // Number of threads wasn't set explicitly. Run serial and parallel version { // serial run if (!silent) printf("serial run "); oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, 1); if (!run_pipeline(1)) return -1; } { // parallel run (number of threads is selected automatically) if (!silent) printf("parallel run "); oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, utility::get_default_num_threads()); if (!run_pipeline(utility::get_default_num_threads())) return -1; } } utility::report_elapsed_time((oneapi::tbb::tick_count::now() - mainStartTime).seconds()); return 0; } ================================================ FILE: third-party/tbb/examples/parallel_reduce/README.md ================================================ # Code Samples of oneAPI Threading Building Blocks (oneTBB) Examples using `parallel_reduce` algorithm. | Code sample name | Description |:--- |:--- | convex_hull | Parallel version of convex hull algorithm (quick hull). | pi | Parallel version of calculating π by numerical integration. | primes | Parallel version of the Sieve of Eratosthenes. 
================================================ FILE: third-party/tbb/examples/parallel_reduce/convex_hull/CMakeLists.txt ================================================ # Copyright (c) 2020-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. cmake_minimum_required(VERSION 3.5.0...3.31.3) project(convex_hull_bench CXX) project(convex_hull_sample CXX) include(../../common/cmake/common.cmake) set_common_project_settings(tbb) add_executable(convex_hull_bench convex_hull_bench.cpp) add_executable(convex_hull_sample convex_hull_sample.cpp) target_link_libraries(convex_hull_bench TBB::tbb Threads::Threads) target_link_libraries(convex_hull_sample TBB::tbb Threads::Threads) target_compile_options(convex_hull_bench PRIVATE ${TBB_CXX_STD_FLAG}) target_compile_options(convex_hull_sample PRIVATE ${TBB_CXX_STD_FLAG}) add_custom_target(convex_hull) add_dependencies(convex_hull convex_hull_bench convex_hull_sample) set(EXECUTABLE "$") set(PERF_EXECUTABLE "$") set(ARGS "") set(LIGHT_ARGS 4 400) set(PERF_ARGS silent auto 40000000) add_execution_target(run_convex_hull convex_hull ${EXECUTABLE} "${ARGS}") add_execution_target(light_test_convex_hull convex_hull ${EXECUTABLE} "${LIGHT_ARGS}") add_execution_target(perf_run_convex_hull convex_hull ${PERF_EXECUTABLE} "${PERF_ARGS}") ================================================ FILE: third-party/tbb/examples/parallel_reduce/convex_hull/README.md ================================================ # Convex_hull sample Parallel 
version of convex hull algorithm (quick hull). ## Building the example ``` cmake cmake --build . ``` This sample contains two additional predefined build targets: - `convex_hull_sample` - builds parallel version of the example which uses `parallel_reduce`, `parallel_for` and `concurrent_vector`. - `convex_hull_bench` - build version of the example that compares serial and parallel buffered and unbuffered implementations. ## Running the sample ### Predefined make targets * `make run_convex_hull` - executes the example with predefined parameters. * `make perf_run_convex_hull` - executes the example with suggested parameters to measure the oneTBB performance. * `make light_test_convex_hull` - executes the example with suggested parameters to reduce execution time. ### Application parameters Usage: ``` convex_hull_sample [n-of-threads=value] [n-of-points=value] [silent] [verbose] [-h] [n-of-threads [n-of-points]] convex_hull_bench [n-of-threads=value] [n-of-points=value] [silent] [verbose] [-h] [n-of-threads [n-of-points]] ``` * `-h` - prints the help for command line options. * `n-of-threads` - the number of threads to use; a range of the form low\[:high\], where low and optional high are non-negative integers or `auto` for a platform-specific default number. * `n-of-points` - number of points. * `silent` - no output except elapsed time. * `verbose` - turns verbose ON. ================================================ FILE: third-party/tbb/examples/parallel_reduce/convex_hull/convex_hull.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef TBB_examples_convex_hull_H #define TBB_examples_convex_hull_H #include #include #include #include #include #include #include #include #include #include #include #include "oneapi/tbb/tick_count.h" #include "oneapi/tbb/global_control.h" #include "common/utility/utility.hpp" #include "common/utility/get_default_num_threads.hpp" #include "common/utility/fast_random.hpp" namespace cfg { // convex hull problem user set parameters long numberOfPoints = 5000000; // problem size // convex hull grain sizes for 3 subproblems. Be sure 16*GS < 512Kb const std::size_t generateGrainSize = 25000; const std::size_t findExtremumGrainSize = 25000; const std::size_t divideGrainSize = 25000; }; // namespace cfg namespace util { bool silent = false; bool verbose = false; std::vector OUTPUT; // utility functionality void ParseInputArgs(int argc, char* argv[], utility::thread_number_range& threads) { utility::parse_cli_arguments( argc, argv, utility::cli_argument_pack() //"-h" option for displaying help is present implicitly .positional_arg(threads, "n-of-threads", utility::thread_number_range_desc) .positional_arg(cfg::numberOfPoints, "n-of-points", "number of points") .arg(silent, "silent", "no output except elapsed time") .arg(verbose, "verbose", "turns verbose ON")); //disabling verbose if silent is specified if (silent) verbose = false; ; } template struct point { T x; T y; //According to subparagraph 4 of paragraph 12.6.2 "Initializing bases and members" [class.base.init] //of ANSI-ISO-IEC C++ 2003 standard, POD members will _not_ be initialized if they are not mentioned //in the 
base-member initializer list. //For more details why this needed please see comment in FillRNDPointsVector_buf point() {} point(T _x, T _y) : x(_x), y(_y) {} }; std::ostream& operator<<(std::ostream& o, point const& p) { return o << "(" << p.x << "," << p.y << ")"; } struct rng { static const std::size_t max_rand = USHRT_MAX; utility::FastRandom my_fast_random; rng(std::size_t seed) : my_fast_random(seed) {} unsigned short operator()() { return my_fast_random.get(); } unsigned short operator()(std::size_t& seed) { return my_fast_random.get(seed); } }; template point GenerateRNDPoint(std::size_t& count, rng_functor_type random, std::size_t rand_max) { /* generates random points on 2D plane so that the cluster is somewhat circle shaped */ const std::size_t maxsize = 500; T x = random() * 2.0 / (double)rand_max - 1; T y = random() * 2.0 / (double)rand_max - 1; T r = (x * x + y * y); if (r > 1) { count++; if (count > 10) { if (random() / (double)rand_max > 0.5) x /= r; if (random() / (double)rand_max > 0.5) y /= r; count = 0; } else { x /= r; y /= r; } } x = (x + 1) * 0.5 * maxsize; y = (y + 1) * 0.5 * maxsize; return point(x, y); } template struct edge { Index start; Index end; edge(Index _p1, Index _p2) : start(_p1), end(_p2){}; }; template std::ostream& operator<<(std::ostream& _ostr, point _p) { return _ostr << '(' << _p.x << ',' << _p.y << ')'; } template std::istream& operator>>(std::istream& _istr, point _p) { return _istr >> _p.x >> _p.y; } template bool operator==(point p1, point p2) { return (p1.x == p2.x && p1.y == p2.y); } template bool operator!=(point p1, point p2) { return !(p1 == p2); } template double cross_product(const point& start, const point& end1, const point& end2) { return ((end1.x - start.x) * (end2.y - start.y) - (end2.x - start.x) * (end1.y - start.y)); } // Timing functions are based on TBB to always obtain wall-clock time typedef oneapi::tbb::tick_count my_time_t; my_time_t gettime() { return oneapi::tbb::tick_count::now(); } double 
time_diff(my_time_t start, my_time_t end) { return (end - start).seconds(); } void WriteResults(int nthreads, double initTime, double calcTime) { if (verbose) { std::cout << " Step by step hull construction:" << "\n"; for (std::size_t i = 0; i < OUTPUT.size(); ++i) std::cout << OUTPUT[i] << "\n"; } if (!silent) { std::cout << " Number of nodes:" << cfg::numberOfPoints << " Number of threads:" << nthreads << " Initialization time:" << std::setw(10) << std::setprecision(3) << initTime << " Calculation time:" << std::setw(10) << std::setprecision(3) << calcTime << "\n"; } } }; // namespace util #endif /* TBB_examples_convex_hull_H */ ================================================ FILE: third-party/tbb/examples/parallel_reduce/convex_hull/convex_hull_bench.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* This file contains a few implementations, so it may look overly complicated. 
The most efficient implementation is also separated into convex_hull_sample.cpp */ #include "convex_hull.hpp" typedef util::point point_t; #ifndef USETBB #define USETBB 1 #endif #ifndef USECONCVEC #define USECONCVEC 1 #endif #if !USETBB // Serial implementation of Quick Hull algorithm typedef std::vector pointVec_t; void serial_initialize(pointVec_t &points); class FindXExtremum : public std::unary_function { public: typedef enum { minX, maxX } extremumType; FindXExtremum(const point_t &frstPoint, extremumType exType_) : extrXPoint(frstPoint), exType(exType_) {} void operator()(const point_t &p) { if (closerToExtremum(p)) extrXPoint = p; } operator point_t() { return extrXPoint; } private: const extremumType exType; point_t extrXPoint; bool closerToExtremum(const point_t &p) const { switch (exType) { case minX: return p.x < extrXPoint.x; break; case maxX: return p.x > extrXPoint.x; break; } return false; // avoid warning } }; template point_t extremum(const pointVec_t &points) { assert(!points.empty()); return std::for_each(points.begin(), points.end(), FindXExtremum(points[0], type)); } class SplitByCP : public std::unary_function { pointVec_t &reducedSet; point_t p1, p2; point_t farPoint; double howFar; public: SplitByCP(point_t _p1, point_t _p2, pointVec_t &_reducedSet) : p1(_p1), p2(_p2), reducedSet(_reducedSet), howFar(0), farPoint(p1) {} void operator()(const point_t &p) { double cp; if ((p != p1) && (p != p2)) { cp = util::cross_product(p1, p2, p); if (cp > 0) { reducedSet.push_back(p); if (cp > howFar) { farPoint = p; howFar = cp; } } } } operator point_t() { return farPoint; } }; point_t divide(const pointVec_t &P, pointVec_t &P_reduced, const point_t &p1, const point_t &p2) { SplitByCP splitByCP(p1, p2, P_reduced); point_t farPoint = std::for_each(P.begin(), P.end(), splitByCP); if (util::verbose) { std::stringstream ss; ss << P.size() << " nodes in bucket" << ", " << "dividing by: [ " << p1 << ", " << p2 << " ], " << "farthest node: " << farPoint; 
util::OUTPUT.push_back(ss.str()); } return farPoint; } void divide_and_conquer(const pointVec_t &P, pointVec_t &H, point_t p1, point_t p2) { assert(P.size() >= 2); pointVec_t P_reduced; pointVec_t H1, H2; point_t p_far = divide(P, P_reduced, p1, p2); if (P_reduced.size() < 2) { H.push_back(p1); H.insert(H.end(), P_reduced.begin(), P_reduced.end()); } else { divide_and_conquer(P_reduced, H1, p1, p_far); divide_and_conquer(P_reduced, H2, p_far, p2); H.insert(H.end(), H1.begin(), H1.end()); H.insert(H.end(), H2.begin(), H2.end()); } } void quickhull(const pointVec_t &points, pointVec_t &hull) { if (points.size() < 2) { hull.insert(hull.end(), points.begin(), points.end()); return; } point_t p_maxx = extremum(points); point_t p_minx = extremum(points); pointVec_t H; divide_and_conquer(points, hull, p_maxx, p_minx); divide_and_conquer(points, H, p_minx, p_maxx); hull.insert(hull.end(), H.begin(), H.end()); } int main(int argc, char *argv[]) { utility::thread_number_range single_thread([] { return -1; }); util::ParseInputArgs(argc, argv, single_thread); pointVec_t points; pointVec_t hull; util::my_time_t tm_init, tm_start, tm_end; std::cout << "Starting serial version of QUICK HULL algorithm" << "\n"; tm_init = util::gettime(); serial_initialize(points); tm_start = util::gettime(); std::cout << "Init time: " << util::time_diff(tm_init, tm_start) << " Points in input: " << points.size() << "\n"; tm_start = util::gettime(); quickhull(points, hull); tm_end = util::gettime(); std::cout << "Serial time: " << util::time_diff(tm_start, tm_end) << " Points in hull: " << hull.size() << "\n"; return 0; } #else // USETBB - parallel version of Quick Hull algorithm #include "oneapi/tbb/parallel_for.h" #include "oneapi/tbb/parallel_reduce.h" #include "oneapi/tbb/blocked_range.h" typedef oneapi::tbb::blocked_range range_t; #if USECONCVEC #include "oneapi/tbb/concurrent_vector.h" typedef oneapi::tbb::concurrent_vector pointVec_t; void appendVector(const point_t *src, std::size_t 
srcSize, pointVec_t &dest) { std::copy(src, src + srcSize, dest.grow_by(srcSize)); } void appendVector(const pointVec_t &src, pointVec_t &dest) { std::copy(src.begin(), src.end(), dest.grow_by(src.size())); } void grow_vector_to_at_least(pointVec_t &vect, std::size_t size) { vect.grow_to_at_least(size); } #else // USE STD::VECTOR - include spin_mutex.h and lock vector operations #include "oneapi/tbb/spin_mutex.h" typedef oneapi::tbb::spin_mutex mutex_t; typedef std::vector pointVec_t; void appendVector(mutex_t &insertMutex, const pointVec_t &src, pointVec_t &dest) { mutex_t::scoped_lock lock(insertMutex); dest.insert(dest.end(), src.begin(), src.end()); } void appendVector(mutex_t &insertMutex, const point_t *src, std::size_t srcSize, pointVec_t &dest) { mutex_t::scoped_lock lock(insertMutex); dest.insert(dest.end(), src, src + srcSize); } void grow_vector_to_at_least(mutex_t &mutex, pointVec_t &vect, std::size_t size) { mutex_t::scoped_lock lock(mutex); if (vect.size() < size) { vect.resize(size); } } #endif // USECONCVEC class FillRNDPointsVector { pointVec_t &points; public: static const std::size_t grainSize = cfg::generateGrainSize; #if !USECONCVEC static mutex_t pushBackMutex; #endif // USECONCVEC explicit FillRNDPointsVector(pointVec_t &_points) : points(_points) {} void operator()(const range_t &range) const { util::rng the_rng(range.begin()); const std::size_t i_end = range.end(); std::size_t count = 0; #if USECONCVEC points.grow_to_at_least(i_end); #else // Locked enlarge to a not thread-safe STD::VECTOR grow_vector_to_at_least(pushBackMutex, points, i_end); #endif // USECONCVEC for (std::size_t i = range.begin(); i != i_end; ++i) { points[i] = util::GenerateRNDPoint(count, the_rng, util::rng::max_rand); } } }; class FillRNDPointsVector_buf { pointVec_t &points; public: static const std::size_t grainSize = cfg::generateGrainSize; #if !USECONCVEC static mutex_t insertMutex; #endif // USECONCVEC explicit FillRNDPointsVector_buf(pointVec_t &_points) : 
points(_points) {} void operator()(const range_t &range) const { util::rng the_rng(range.begin()); const std::size_t i_end = range.end(); std::size_t count = 0, j = 0; point_t tmp_vec[grainSize]; for (std::size_t i = range.begin(); i != i_end; ++i) { tmp_vec[j++] = util::GenerateRNDPoint(count, the_rng, util::rng::max_rand); } #if USECONCVEC grow_vector_to_at_least(points, range.end()); #else // USE STD::VECTOR grow_vector_to_at_least(insertMutex, points, range.end()); #endif // USECONCVEC std::copy(tmp_vec, tmp_vec + j, points.begin() + range.begin()); } }; #if !USECONCVEC mutex_t FillRNDPointsVector::pushBackMutex{}; mutex_t FillRNDPointsVector_buf::insertMutex{}; #endif template void initialize(pointVec_t &points) { //This function generate the same series of point on every call. //Reproducibility is needed for benchmarking to produce reliable results. //It is achieved through the following points: // - FillRNDPointsVector_buf instance has its own local instance // of random number generator, which in turn does not use any global data // - oneapi::tbb::simple_partitioner produce the same set of ranges on every call to // oneapi::tbb::parallel_for // - local RNG instances are seeded by the starting indexes of corresponding ranges // - grow_to_at_least() enables putting points into the resulting vector in deterministic order // (unlike concurrent push_back or grow_by). // In the buffered version, a temporary storage for as much as grainSize elements // is allocated inside the body. Since auto_partitioner may increase effective // range size which would cause a crash, simple partitioner has to be used. 
oneapi::tbb::parallel_for(range_t(0, cfg::numberOfPoints, BodyType::grainSize), BodyType(points), oneapi::tbb::simple_partitioner()); } class FindXExtremum { public: typedef enum { minX, maxX } extremumType; static const std::size_t grainSize = cfg::findExtremumGrainSize; FindXExtremum(const pointVec_t &points_, extremumType exType_) : points(points_), exType(exType_), extrXPoint(points[0]) {} FindXExtremum(const FindXExtremum &fxex, oneapi::tbb::split) : points(fxex.points), exType(fxex.exType), extrXPoint(fxex.extrXPoint) {} void operator()(const range_t &range) { const std::size_t i_end = range.end(); if (!range.empty()) { for (std::size_t i = range.begin(); i != i_end; ++i) { if (closerToExtremum(points[i])) { extrXPoint = points[i]; } } } } void join(const FindXExtremum &rhs) { if (closerToExtremum(rhs.extrXPoint)) { extrXPoint = rhs.extrXPoint; } } point_t extremeXPoint() { return extrXPoint; } private: const pointVec_t &points; const extremumType exType; point_t extrXPoint; bool closerToExtremum(const point_t &p) const { switch (exType) { case minX: return p.x < extrXPoint.x; break; case maxX: return p.x > extrXPoint.x; break; } return false; // avoid warning } }; template point_t extremum(const pointVec_t &P) { FindXExtremum fxBody(P, type); oneapi::tbb::parallel_reduce(range_t(0, P.size(), FindXExtremum::grainSize), fxBody); return fxBody.extremeXPoint(); } class SplitByCP { const pointVec_t &initialSet; pointVec_t &reducedSet; point_t p1, p2; point_t farPoint; double howFar; public: static const std::size_t grainSize = cfg::divideGrainSize; #if !USECONCVEC static mutex_t pushBackMutex; #endif // USECONCVEC SplitByCP(point_t _p1, point_t _p2, const pointVec_t &_initialSet, pointVec_t &_reducedSet) : p1(_p1), p2(_p2), initialSet(_initialSet), reducedSet(_reducedSet), howFar(0), farPoint(p1) {} SplitByCP(SplitByCP &sbcp, oneapi::tbb::split) : p1(sbcp.p1), p2(sbcp.p2), initialSet(sbcp.initialSet), reducedSet(sbcp.reducedSet), howFar(0), farPoint(p1) {} void 
operator()(const range_t &range) { const std::size_t i_end = range.end(); double cp; for (std::size_t i = range.begin(); i != i_end; ++i) { if ((initialSet[i] != p1) && (initialSet[i] != p2)) { cp = util::cross_product(p1, p2, initialSet[i]); if (cp > 0) { #if USECONCVEC reducedSet.push_back(initialSet[i]); #else // Locked push_back to a not thread-safe STD::VECTOR { mutex_t::scoped_lock lock(pushBackMutex); reducedSet.push_back(initialSet[i]); } #endif // USECONCVEC if (cp > howFar) { farPoint = initialSet[i]; howFar = cp; } } } } } void join(const SplitByCP &rhs) { if (rhs.howFar > howFar) { howFar = rhs.howFar; farPoint = rhs.farPoint; } } point_t farthestPoint() const { return farPoint; } }; class SplitByCP_buf { const pointVec_t &initialSet; pointVec_t &reducedSet; point_t p1, p2; point_t farPoint; double howFar; public: static const std::size_t grainSize = cfg::divideGrainSize; #if !USECONCVEC static mutex_t insertMutex; #endif // USECONCVEC SplitByCP_buf(point_t _p1, point_t _p2, const pointVec_t &_initialSet, pointVec_t &_reducedSet) : p1(_p1), p2(_p2), initialSet(_initialSet), reducedSet(_reducedSet), howFar(0), farPoint(p1) {} SplitByCP_buf(SplitByCP_buf &sbcp, oneapi::tbb::split) : p1(sbcp.p1), p2(sbcp.p2), initialSet(sbcp.initialSet), reducedSet(sbcp.reducedSet), howFar(0), farPoint(p1) {} void operator()(const range_t &range) { const std::size_t i_end = range.end(); std::size_t j = 0; double cp; point_t tmp_vec[grainSize]; for (std::size_t i = range.begin(); i != i_end; ++i) { if ((initialSet[i] != p1) && (initialSet[i] != p2)) { cp = util::cross_product(p1, p2, initialSet[i]); if (cp > 0) { tmp_vec[j++] = initialSet[i]; if (cp > howFar) { farPoint = initialSet[i]; howFar = cp; } } } } #if USECONCVEC appendVector(tmp_vec, j, reducedSet); #else // USE STD::VECTOR appendVector(insertMutex, tmp_vec, j, reducedSet); #endif // USECONCVEC } void join(const SplitByCP_buf &rhs) { if (rhs.howFar > howFar) { howFar = rhs.howFar; farPoint = rhs.farPoint; } } 
point_t farthestPoint() const { return farPoint; } }; #if !USECONCVEC mutex_t SplitByCP::pushBackMutex{}; mutex_t SplitByCP_buf::insertMutex{}; #endif template point_t divide(const pointVec_t &P, pointVec_t &P_reduced, const point_t &p1, const point_t &p2) { BodyType body(p1, p2, P, P_reduced); // Must use simple_partitioner (see the comment in initialize() above) oneapi::tbb::parallel_reduce( range_t(0, P.size(), BodyType::grainSize), body, oneapi::tbb::simple_partitioner()); if (util::verbose) { std::stringstream ss; ss << P.size() << " nodes in bucket" << ", " << "dividing by: [ " << p1 << ", " << p2 << " ], " << "farthest node: " << body.farthestPoint(); util::OUTPUT.push_back(ss.str()); } return body.farthestPoint(); } void divide_and_conquer(const pointVec_t &P, pointVec_t &H, point_t p1, point_t p2, bool buffered) { assert(P.size() >= 2); pointVec_t P_reduced; pointVec_t H1, H2; point_t p_far; if (buffered) { p_far = divide(P, P_reduced, p1, p2); } else { p_far = divide(P, P_reduced, p1, p2); } if (P_reduced.size() < 2) { H.push_back(p1); #if USECONCVEC appendVector(P_reduced, H); #else // insert into STD::VECTOR H.insert(H.end(), P_reduced.begin(), P_reduced.end()); #endif } else { divide_and_conquer(P_reduced, H1, p1, p_far, buffered); divide_and_conquer(P_reduced, H2, p_far, p2, buffered); #if USECONCVEC appendVector(H1, H); appendVector(H2, H); #else // insert into STD::VECTOR H.insert(H.end(), H1.begin(), H1.end()); H.insert(H.end(), H2.begin(), H2.end()); #endif } } void quickhull(const pointVec_t &points, pointVec_t &hull, bool buffered) { if (points.size() < 2) { #if USECONCVEC appendVector(points, hull); #else // STD::VECTOR hull.insert(hull.end(), points.begin(), points.end()); #endif // USECONCVEC return; } point_t p_maxx = extremum(points); point_t p_minx = extremum(points); pointVec_t H; divide_and_conquer(points, hull, p_maxx, p_minx, buffered); divide_and_conquer(points, H, p_minx, p_maxx, buffered); #if USECONCVEC appendVector(H, hull); #else 
// STD::VECTOR hull.insert(hull.end(), H.begin(), H.end()); #endif // USECONCVEC } int main(int argc, char *argv[]) { utility::thread_number_range threads(utility::get_default_num_threads); util::ParseInputArgs(argc, argv, threads); int nthreads; util::my_time_t tm_init, tm_start, tm_end; #if USECONCVEC std::cout << "Starting TBB unbuffered push_back version of QUICK HULL algorithm" << "\n"; #else std::cout << "Starting STL locked unbuffered push_back version of QUICK HULL algorithm" << "\n"; #endif // USECONCVEC for (nthreads = threads.first; nthreads <= threads.last; nthreads = threads.step(nthreads)) { pointVec_t points; pointVec_t hull; oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, nthreads); tm_init = util::gettime(); initialize(points); tm_start = util::gettime(); std::cout << "Parallel init time on " << nthreads << " threads: " << util::time_diff(tm_init, tm_start) << " Points in input: " << points.size() << "\n"; tm_start = util::gettime(); quickhull(points, hull, false); tm_end = util::gettime(); std::cout << "Time on " << nthreads << " threads: " << util::time_diff(tm_start, tm_end) << " Points in hull: " << hull.size() << "\n"; } #if USECONCVEC std::cout << "Starting TBB buffered version of QUICK HULL algorithm" << "\n"; #else std::cout << "Starting STL locked buffered version of QUICK HULL algorithm" << "\n"; #endif for (nthreads = threads.first; nthreads <= threads.last; nthreads = threads.step(nthreads)) { pointVec_t points; pointVec_t hull; oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, nthreads); tm_init = util::gettime(); initialize(points); tm_start = util::gettime(); std::cout << "Init time on " << nthreads << " threads: " << util::time_diff(tm_init, tm_start) << " Points in input: " << points.size() << "\n"; tm_start = util::gettime(); quickhull(points, hull, true); tm_end = util::gettime(); std::cout << "Time on " << nthreads << " threads: " << util::time_diff(tm_start, 
tm_end) << " Points in hull: " << hull.size() << "\n"; } return 0; } #endif // USETBB void serial_initialize(pointVec_t &points) { points.reserve(cfg::numberOfPoints); unsigned int rseed = 1; for (std::size_t i = 0, count = 0; long(i) < cfg::numberOfPoints; ++i) { points.push_back(util::GenerateRNDPoint(count, &std::rand, RAND_MAX)); } } ================================================ FILE: third-party/tbb/examples/parallel_reduce/convex_hull/convex_hull_sample.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* This file contains the TBB-based implementation of convex hull algorithm. 
It corresponds to the following settings in convex_hull_bench.cpp: - USETBB defined to 1 - USECONCVEC defined to 1 - INIT_ONCE defined to 0 - only buffered version is used */ #include "oneapi/tbb/parallel_for.h" #include "oneapi/tbb/parallel_reduce.h" #include "oneapi/tbb/blocked_range.h" #include "oneapi/tbb/tick_count.h" #include "oneapi/tbb/concurrent_vector.h" #include "convex_hull.hpp" typedef util::point point_t; typedef oneapi::tbb::concurrent_vector pointVec_t; typedef oneapi::tbb::blocked_range range_t; void appendVector(const point_t *src, std::size_t srcSize, pointVec_t &dest) { std::copy(src, src + srcSize, dest.grow_by(srcSize)); } void appendVector(const pointVec_t &src, pointVec_t &dest) { std::copy(src.begin(), src.end(), dest.grow_by(src.size())); } class FillRNDPointsVector_buf { pointVec_t &points; public: static const std::size_t grainSize = cfg::generateGrainSize; explicit FillRNDPointsVector_buf(pointVec_t &_points) : points(_points) {} void operator()(const range_t &range) const { util::rng the_rng(range.begin()); const std::size_t i_end = range.end(); std::size_t count = 0, j = 0; point_t tmp_vec[grainSize]; for (std::size_t i = range.begin(); i != i_end; ++i) { tmp_vec[j++] = util::GenerateRNDPoint(count, the_rng, util::rng::max_rand); } //Here we have race condition. Elements being written to may be still under construction. //For C++ 2003 it is workarounded by vector element type which default constructor does not touch memory, //it being constructed on. See comments near default ctor of point class for more details. //Strictly speaking it is UB. //TODO: need to find more reliable/correct way points.grow_to_at_least(range.end()); std::copy(tmp_vec, tmp_vec + j, points.begin() + range.begin()); } }; void initialize(pointVec_t &points) { //This function generate the same series of point on every call. //Reproducibility is needed for benchmarking to produce reliable results. 
//It is achieved through the following points: // - FillRNDPointsVector_buf instance has its own local instance // of random number generator, which in turn does not use any global data // - oneapi::tbb::simple_partitioner produce the same set of ranges on every call to // oneapi::tbb::parallel_for // - local RNG instances are seeded by the starting indexes of corresponding ranges // - grow_to_at_least() enables putting points into the resulting vector in deterministic order // (unlike concurrent push_back or grow_by). // In the buffered version, a temporary storage for as much as grainSize elements // is allocated inside the body. Since auto_partitioner may increase effective // range size which would cause a crash, simple partitioner has to be used. oneapi::tbb::parallel_for(range_t(0, cfg::numberOfPoints, FillRNDPointsVector_buf::grainSize), FillRNDPointsVector_buf(points), oneapi::tbb::simple_partitioner()); } class FindXExtremum { public: typedef enum { minX, maxX } extremumType; static const std::size_t grainSize = cfg::findExtremumGrainSize; FindXExtremum(const pointVec_t &points_, extremumType exType_) : points(points_), exType(exType_), extrXPoint(points[0]) {} FindXExtremum(const FindXExtremum &fxex, oneapi::tbb::split) // Can run in parallel with fxex.operator()() or fxex.join(). // The data race reported by tools is harmless. 
: points(fxex.points), exType(fxex.exType), extrXPoint(fxex.extrXPoint) {} void operator()(const range_t &range) { const std::size_t i_end = range.end(); if (!range.empty()) { for (std::size_t i = range.begin(); i != i_end; ++i) { if (closerToExtremum(points[i])) { extrXPoint = points[i]; } } } } void join(const FindXExtremum &rhs) { if (closerToExtremum(rhs.extrXPoint)) { extrXPoint = rhs.extrXPoint; } } point_t extremeXPoint() { return extrXPoint; } private: const pointVec_t &points; const extremumType exType; point_t extrXPoint; bool closerToExtremum(const point_t &p) const { switch (exType) { case minX: return p.x < extrXPoint.x; break; case maxX: return p.x > extrXPoint.x; break; } return false; // avoid warning } }; template point_t extremum(const pointVec_t &P) { FindXExtremum fxBody(P, type); oneapi::tbb::parallel_reduce(range_t(0, P.size(), FindXExtremum::grainSize), fxBody); return fxBody.extremeXPoint(); } class SplitByCP_buf { const pointVec_t &initialSet; pointVec_t &reducedSet; point_t p1, p2; point_t farPoint; double howFar; public: static const std::size_t grainSize = cfg::divideGrainSize; SplitByCP_buf(point_t _p1, point_t _p2, const pointVec_t &_initialSet, pointVec_t &_reducedSet) : p1(_p1), p2(_p2), initialSet(_initialSet), reducedSet(_reducedSet), howFar(0), farPoint(p1) {} SplitByCP_buf(SplitByCP_buf &sbcp, oneapi::tbb::split) : p1(sbcp.p1), p2(sbcp.p2), initialSet(sbcp.initialSet), reducedSet(sbcp.reducedSet), howFar(0), farPoint(p1) {} void operator()(const range_t &range) { const std::size_t i_end = range.end(); std::size_t j = 0; double cp; point_t tmp_vec[grainSize]; for (std::size_t i = range.begin(); i != i_end; ++i) { if ((initialSet[i] != p1) && (initialSet[i] != p2)) { cp = util::cross_product(p1, p2, initialSet[i]); if (cp > 0) { tmp_vec[j++] = initialSet[i]; if (cp > howFar) { farPoint = initialSet[i]; howFar = cp; } } } } appendVector(tmp_vec, j, reducedSet); } void join(const SplitByCP_buf &rhs) { if (rhs.howFar > howFar) { 
howFar = rhs.howFar; farPoint = rhs.farPoint; } } point_t farthestPoint() const { return farPoint; } }; point_t divide(const pointVec_t &P, pointVec_t &P_reduced, const point_t &p1, const point_t &p2) { SplitByCP_buf sbcpb(p1, p2, P, P_reduced); // Must use simple_partitioner (see the comment in initialize() above) oneapi::tbb::parallel_reduce( range_t(0, P.size(), SplitByCP_buf::grainSize), sbcpb, oneapi::tbb::simple_partitioner()); if (util::verbose) { std::stringstream ss; ss << P.size() << " nodes in bucket" << ", " << "dividing by: [ " << p1 << ", " << p2 << " ], " << "farthest node: " << sbcpb.farthestPoint(); util::OUTPUT.push_back(ss.str()); } return sbcpb.farthestPoint(); } void divide_and_conquer(const pointVec_t &P, pointVec_t &H, point_t p1, point_t p2) { assert(P.size() >= 2); pointVec_t P_reduced; pointVec_t H1, H2; point_t p_far = divide(P, P_reduced, p1, p2); if (P_reduced.size() < 2) { H.push_back(p1); appendVector(P_reduced, H); } else { divide_and_conquer(P_reduced, H1, p1, p_far); divide_and_conquer(P_reduced, H2, p_far, p2); appendVector(H1, H); appendVector(H2, H); } } void quickhull(const pointVec_t &points, pointVec_t &hull) { if (points.size() < 2) { appendVector(points, hull); return; } point_t p_maxx = extremum(points); point_t p_minx = extremum(points); pointVec_t H; divide_and_conquer(points, hull, p_maxx, p_minx); divide_and_conquer(points, H, p_minx, p_maxx); appendVector(H, hull); } int main(int argc, char *argv[]) { utility::thread_number_range threads(utility::get_default_num_threads); util::my_time_t tm_main_begin = util::gettime(); util::ParseInputArgs(argc, argv, threads); pointVec_t points; pointVec_t hull; int nthreads; points.reserve(cfg::numberOfPoints); if (!util::silent) { std::cout << "Starting TBB-buffered version of QUICK HULL algorithm" << "\n"; } for (nthreads = threads.first; nthreads <= threads.last; nthreads = threads.step(nthreads)) { oneapi::tbb::global_control 
c(oneapi::tbb::global_control::max_allowed_parallelism, nthreads); points.clear(); util::my_time_t tm_init = util::gettime(); initialize(points); util::my_time_t tm_start = util::gettime(); if (!util::silent) { std::cout << "Init time on " << nthreads << " threads: " << util::time_diff(tm_init, tm_start) << " Points in input: " << points.size() << "\n"; } tm_start = util::gettime(); quickhull(points, hull); util::my_time_t tm_end = util::gettime(); if (!util::silent) { std::cout << "Time on " << nthreads << " threads: " << util::time_diff(tm_start, tm_end) << " Points in hull: " << hull.size() << "\n"; } hull.clear(); } utility::report_elapsed_time(util::time_diff(tm_main_begin, util::gettime())); return 0; } ================================================ FILE: third-party/tbb/examples/parallel_reduce/pi/CMakeLists.txt ================================================ # Copyright (c) 2023-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
cmake_minimum_required(VERSION 3.5.0...3.31.3) project(pi CXX) include(../../common/cmake/common.cmake) set_common_project_settings(tbb) add_executable(pi main.cpp pi.cpp) target_link_libraries(pi TBB::tbb Threads::Threads) target_compile_options(pi PRIVATE ${TBB_CXX_STD_FLAG}) set(EXECUTABLE "$") set(ARGS "") set(PERF_ARGS auto 100000000000) add_execution_target(run_pi pi ${EXECUTABLE} "${ARGS}") add_execution_target(perf_run_pi pi ${EXECUTABLE} "${PERF_ARGS}") ================================================ FILE: third-party/tbb/examples/parallel_reduce/pi/README.md ================================================
# Pi Sample
Parallel version of calculating π by numerical integration.

## Build
To build the sample, run the following commands:
```
cmake <path_to_example>
cmake --build .
```

## Run
### Predefined Make Targets
* `make run_pi` - executes the example with predefined parameters.
* `make perf_run_pi` - executes the example with suggested parameters to measure the oneTBB performance.

### Application Parameters
You can use the following application parameters:
```
pi [n-of-threads=value] [n-of-intervals=value] [silent] [-h] [n-of-threads [n-of-intervals]]
```
* `-h` - prints the help for command-line options.
* `n-of-threads` - the number of threads to use. This number is specified in the low\[:high\] range format, where both ``low`` and, optionally, ``high`` are non-negative integers. You can also use ``auto`` to let the system choose a default number of threads suitable for the platform.
* `n-of-intervals` - the number of intervals to subdivide into. Must be a positive integer.
* `silent` - no output except the elapsed time.

================================================ FILE: third-party/tbb/examples/parallel_reduce/pi/common.h ================================================ /* Copyright (c) 2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef TBB_examples_pi_H #define TBB_examples_pi_H #include typedef std::size_t number_t; typedef double pi_t; extern const number_t chunk_size; extern number_t num_intervals; extern pi_t step; extern bool silent; inline pi_t pi_kernel(number_t i) { pi_t dx = (pi_t(i) + pi_t(0.5)) * step; return pi_t(4.0) / (pi_t(1.0) + dx * dx); } inline double pi_slice_kernel(number_t slice, number_t slice_size = chunk_size) { pi_t pi = pi_t(0.0); for (number_t i = slice; i < slice + slice_size; ++i) { pi += pi_kernel(i); } return pi; } struct threading { threading(int p); ~threading(); }; double compute_pi_parallel(); #endif // TBB_examples_pi_H ================================================ FILE: third-party/tbb/examples/parallel_reduce/pi/main.cpp ================================================ /* Copyright (c) 2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "oneapi/tbb/tick_count.h" #include "common/utility/get_default_num_threads.hpp" #include "common/utility/utility.hpp" #include "common.h" const number_t chunk_size = 4096; // Multiple of 16, to fit float datatype to a vector register. 
// number of intervals number_t num_intervals = 1000000000; pi_t step = pi_t(0.0); bool silent = false; double compute_pi_serial() { double ret = 0; step = pi_t(1.0) / num_intervals; number_t tail = num_intervals % chunk_size; number_t last = num_intervals - tail; for (number_t slice = 0; slice < last; slice += chunk_size) { ret += pi_slice_kernel(slice); } ret += pi_slice_kernel(last, tail); ret *= step; return ret; } int main(int argc, char* argv[]) { try { tbb::tick_count main_start_time = tbb::tick_count::now(); // zero number of threads means to run serial version utility::thread_number_range threads(utility::get_default_num_threads, 0); utility::parse_cli_arguments( argc, argv, utility::cli_argument_pack() //"-h" option for for displaying help is present implicitly .positional_arg(threads, "n-of-threads", utility::thread_number_range_desc) .positional_arg(num_intervals, "n-of-intervals", "number of intervals") .arg(silent, "silent", "no output except time elapsed")); for (int p = threads.first; p <= threads.last; p = threads.step(p)) { pi_t pi; double compute_time; if (p == 0) { //run a serial version tbb::tick_count compute_start_time = tbb::tick_count::now(); pi = compute_pi_serial(); compute_time = (tbb::tick_count::now() - compute_start_time).seconds(); } else { //run a parallel version threading tp(p); tbb::tick_count compute_start_time = tbb::tick_count::now(); pi = compute_pi_parallel(); compute_time = (tbb::tick_count::now() - compute_start_time).seconds(); } if (!silent) { if (p == 0) { std::cout << "Serial run:\tpi = " << pi << "\tcompute time = " << compute_time << " sec\n"; } else { std::cout << "Parallel run:\tpi = " << pi << "\tcompute time = " << compute_time << " sec\t on " << p << " threads\n"; } } } utility::report_elapsed_time((tbb::tick_count::now() - main_start_time).seconds()); return 0; } catch (std::exception& e) { std::cerr << "error occurred. 
error text is :\"" << e.what() << "\"\n"; return 1; } } ================================================ FILE: third-party/tbb/examples/parallel_reduce/pi/pi.cpp ================================================ /* Copyright (c) 2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "common.h" #include "oneapi/tbb/blocked_range.h" #include "oneapi/tbb/global_control.h" #include "oneapi/tbb/parallel_reduce.h" struct reduce_body { double my_pi; reduce_body() : my_pi(0) {} reduce_body(reduce_body& x, tbb::split) : my_pi(0) {} void operator()(const tbb::blocked_range& r) { my_pi += pi_slice_kernel(r.begin(), r.size()); } void join(const reduce_body& y) { my_pi += y.my_pi; } }; double compute_pi_parallel() { step = pi_t(1.0) / num_intervals; double ret = 0.0; reduce_body body; tbb::parallel_reduce(tbb::blocked_range(0, num_intervals), body); ret = body.my_pi * step; return ret; } static std::unique_ptr gc; threading::threading(int p) { gc.reset(new tbb::global_control(tbb::global_control::max_allowed_parallelism, p)); } threading::~threading() { gc.reset(); } ================================================ FILE: third-party/tbb/examples/parallel_reduce/primes/CMakeLists.txt ================================================ # Copyright (c) 2020-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. cmake_minimum_required(VERSION 3.5.0...3.31.3) project(primes CXX) include(../../common/cmake/common.cmake) set_common_project_settings(tbb) add_executable(primes main.cpp primes.cpp) target_link_libraries(primes TBB::tbb Threads::Threads) target_compile_options(primes PRIVATE ${TBB_CXX_STD_FLAG}) set(EXECUTABLE "$") set(ARGS "") set(PERF_ARGS silent auto 1000000000 1000 20) add_execution_target(run_primes primes ${EXECUTABLE} "${ARGS}") add_execution_target(perf_run_primes primes ${EXECUTABLE} "${PERF_ARGS}") ================================================ FILE: third-party/tbb/examples/parallel_reduce/primes/README.md ================================================ # Primes sample Parallel version of the Sieve of Eratosthenes. ## Building the example ``` cmake <path_to_example> cmake --build . ``` ## Running the sample ### Predefined make targets * `make run_primes` - executes the example with predefined parameters * `make perf_run_primes` - executes the example with suggested parameters to measure the oneTBB performance ### Application parameters Usage: ``` primes [n-of-threads=value] [number=value] [grain-size=value] [n-of-repeats=value] [silent] [-h] [n-of-threads [number [grain-size [n-of-repeats]]]] ``` * `-h` - prints the help for command line options. * `n-of-threads` - the number of threads to use; a range of the form low\[:high\], where low and optional high are non-negative integers or `auto` for a platform-specific default number. * `number` - the upper bound of range to search primes in, must be a positive integer.
//! Settings for one invocation of the primes example, as filled in by ParseCommandLine.
struct RunOptions {
    //! Range of thread counts to try (main iterates from threads.first to threads.last).
    utility::thread_number_range threads;
    //! Whether to suppress the per-iteration output.
    bool silentFlag;
    //! Upper bound of the range to search primes in.
    NumberType n;
    //! Grain size parameter
    NumberType grainSize;
    //! Number of times to repeat the calculation.
    NumberType repeatNumber;

    //! Note the parameter order (grainSize_ before n_) differs from the member order.
    RunOptions(utility::thread_number_range threads_,
               NumberType grainSize_,
               NumberType n_,
               bool silentFlag_,
               NumberType repeatNumber_)
            : threads(threads_),
              silentFlag(silentFlag_),
              n(n_),
              grainSize(grainSize_),
              repeatNumber(repeatNumber_) {}
};
//! Parse the command line.
/** Starts from the built-in defaults, lets utility::parse_cli_arguments
    overwrite whichever values the user supplied, and packs the result
    into a RunOptions. */
static RunOptions ParseCommandLine(int argc, char* argv[]) {
    // Defaults used when the corresponding argument is absent.
    NumberType upperBound = 100000000;
    NumberType grain = 1000;
    NumberType repeats = 1;
    bool quiet = false;
    utility::thread_number_range threadRange(
        utility::get_default_num_threads, 0, utility::get_default_num_threads());

    utility::parse_cli_arguments(
        argc,
        argv,
        utility::cli_argument_pack()
            //"-h" option for displaying help is present implicitly
            .positional_arg(threadRange, "n-of-threads", utility::thread_number_range_desc)
            .positional_arg(upperBound,
                            "number",
                            "upper bound of range to search primes in, must be a positive integer")
            .positional_arg(grain, "grain-size", "must be a positive integer")
            .positional_arg(
                repeats,
                "n-of-repeats",
                "repeat the calculation this number of times, must be a positive integer")
            .arg(quiet, "silent", "no output except elapsed time"));

    return RunOptions(threadRange, grain, upperBound, quiet, repeats);
}
<< options.n << "] = " << count << " (" << (iterationEndMark - iterationBeginMark).seconds() << " sec with "; if (0 != p) std::cout << p << "-way parallelism"; else std::cout << "serial code"; std::cout << ")\n"; } } } utility::report_elapsed_time((oneapi::tbb::tick_count::now() - mainBeginMark).seconds()); return 0; } ================================================ FILE: third-party/tbb/examples/parallel_reduce/primes/primes.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ // Example program that computes number of prime numbers up to n, // where n is a command line argument. The algorithm here is a // fairly efficient version of the sieve of Eratosthenes. // The parallel version demonstrates how to use parallel_reduce, // and in particular how to exploit lazy splitting. #include #include #include #include #include #include #include #include "oneapi/tbb/parallel_reduce.h" #include "oneapi/tbb/global_control.h" #include "primes.hpp" //! If true, then print primes on stdout. static bool printPrimes = false; class Multiples { inline NumberType strike(NumberType start, NumberType limit, NumberType stride) { // Hoist "my_is_composite" into register for sake of speed. bool* is_composite = my_is_composite; assert(stride >= 2); for (; start < limit; start += stride) is_composite[start] = true; return start; } //! Window into conceptual sieve bool* my_is_composite; //! 
Indexes into window /** my_striker[k] is an index into my_composite corresponding to an odd multiple multiple of my_factor[k]. */ NumberType* my_striker; //! Prime numbers less than m. NumberType* my_factor; public: //! NumberType of factors in my_factor. NumberType n_factor; NumberType m; Multiples(NumberType n) { m = NumberType(sqrt(double(n))); // Round up to even m += m & 1; my_is_composite = new bool[m / 2]; my_striker = new NumberType[m / 2]; my_factor = new NumberType[m / 2]; n_factor = 0; memset(my_is_composite, 0, m / 2); for (NumberType i = 3; i < m; i += 2) { if (!my_is_composite[i / 2]) { if (printPrimes) printf("%d\n", (int)i); my_striker[n_factor] = strike(i / 2, m / 2, i); my_factor[n_factor++] = i; } } } //! Find primes in range [start,window_size), advancing my_striker as we go. /** Returns number of primes found. */ NumberType find_primes_in_window(NumberType start, NumberType window_size) { bool* is_composite = my_is_composite; memset(is_composite, 0, window_size / 2); for (std::size_t k = 0; k < n_factor; ++k) my_striker[k] = strike(my_striker[k] - m / 2, window_size / 2, my_factor[k]); NumberType count = 0; for (NumberType k = 0; k < window_size / 2; ++k) { if (!is_composite[k]) { if (printPrimes) printf("%ld\n", long(start + 2 * k + 1)); ++count; } } return count; } ~Multiples() { delete[] my_factor; delete[] my_striker; delete[] my_is_composite; } //------------------------------------------------------------------------ // Begin extra members required by parallel version //------------------------------------------------------------------------ // Splitting constructor Multiples(const Multiples& f, oneapi::tbb::split) : n_factor(f.n_factor), m(f.m), my_is_composite(nullptr), my_striker(nullptr), my_factor(f.my_factor) {} bool is_initialized() const { return my_is_composite != nullptr; } void initialize(NumberType start) { assert(start >= 1); my_is_composite = new bool[m / 2]; my_striker = new NumberType[m / 2]; for (std::size_t k = 0; k < 
n_factor; ++k) { NumberType f = my_factor[k]; NumberType p = (start - 1) / f * f % m; my_striker[k] = (p & 1 ? p + 2 * f : p + f) / 2; assert(m / 2 <= my_striker[k]); } } // Move other to *this. void move(Multiples& other) { // The swap moves the contents of other to *this and causes the old contents // of *this to be deleted later when other is destroyed. std::swap(my_striker, other.my_striker); std::swap(my_is_composite, other.my_is_composite); // other.my_factor is a shared pointer that was copied by the splitting constructor. // Set it to nullptr to prevent premature deletion by the destructor of ~other. assert(my_factor == other.my_factor); other.my_factor = nullptr; } //------------------------------------------------------------------------ // End extra methods required by parallel version //------------------------------------------------------------------------ }; //! Count number of primes between 0 and n /** This is the serial version. */ NumberType SerialCountPrimes(NumberType n) { // Two is special case NumberType count = n >= 2; if (n >= 3) { Multiples multiples(n); count += multiples.n_factor; if (printPrimes) printf("---\n"); NumberType window_size = multiples.m; for (NumberType j = multiples.m; j <= n; j += window_size) { if (j + window_size > n + 1) window_size = n + 1 - j; count += multiples.find_primes_in_window(j, window_size); } } return count; } //! Range of a sieve window. class SieveRange { //! Width of full-size window into sieve. const NumberType my_stride; //! Always multiple of my_stride NumberType my_begin; //! One past last number in window. NumberType my_end; //! Width above which it is worth forking. 
const NumberType my_grainsize; bool assert_okay() const { assert(my_begin % my_stride == 0); assert(my_begin <= my_end); assert(my_stride <= my_grainsize); return true; } public: //------------------------------------------------------------------------ // Begin signatures required by parallel_reduce //------------------------------------------------------------------------ bool is_divisible() const { return my_end - my_begin > my_grainsize; } bool empty() const { return my_end <= my_begin; } SieveRange(SieveRange& r, oneapi::tbb::split) : my_stride(r.my_stride), my_grainsize(r.my_grainsize), my_end(r.my_end) { assert(r.is_divisible()); assert(r.assert_okay()); NumberType middle = r.my_begin + (r.my_end - r.my_begin + r.my_stride - 1) / 2; middle = middle / my_stride * my_stride; my_begin = middle; r.my_end = middle; assert(assert_okay()); assert(r.assert_okay()); } //------------------------------------------------------------------------ // End of signatures required by parallel_reduce //------------------------------------------------------------------------ NumberType begin() const { return my_begin; } NumberType end() const { return my_end; } SieveRange(NumberType begin, NumberType end, NumberType stride, NumberType grainsize) : my_begin(begin), my_end(end), my_stride(stride), my_grainsize(grainsize < stride ? stride : grainsize) { assert(assert_okay()); } }; //! Loop body for parallel_reduce. /** parallel_reduce splits the sieve into subsieves. Each subsieve handles a subrange of [0..n]. */ class Sieve { public: //! Prime Multiples to consider, and working storage for this subsieve. ::Multiples multiples; //! NumberType of primes found so far by this subsieve. NumberType count; //! Construct Sieve for counting primes in [0..n]. 
Sieve(NumberType n) : multiples(n), count(0) {} //------------------------------------------------------------------------ // Begin signatures required by parallel_reduce //------------------------------------------------------------------------ void operator()(const SieveRange& r) { NumberType m = multiples.m; if (multiples.is_initialized()) { // Simply reuse "Multiples" structure from previous window // This works because parallel_reduce always applies // *this from left to right. } else { // Need to initialize "Multiples" because *this is a forked copy // that needs to be set up to start at r.begin(). multiples.initialize(r.begin()); } NumberType window_size = m; for (NumberType j = r.begin(); j < r.end(); j += window_size) { assert(j % multiples.m == 0); if (j + window_size > r.end()) window_size = r.end() - j; count += multiples.find_primes_in_window(j, window_size); } } void join(Sieve& other) { count += other.count; // Final value of multiples needs to final value of other multiples, // so that *this can correctly process next window to right. multiples.move(other.multiples); } Sieve(Sieve& other, oneapi::tbb::split) : multiples(other.multiples, oneapi::tbb::split()), count(0) {} //------------------------------------------------------------------------ // End of signatures required by parallel_reduce //------------------------------------------------------------------------ }; //! Count number of primes between 0 and n /** This is the parallel version. 
*/ NumberType ParallelCountPrimes(NumberType n, int number_of_threads, NumberType grain_size) { oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, number_of_threads); // Two is special case NumberType count = n >= 2; if (n >= 3) { Sieve s(n); count += s.multiples.n_factor; if (printPrimes) printf("---\n"); // Explicit grain size and simple_partitioner() used here instead of automatic grainsize // determination because we want SieveRange to be decomposed down to grainSize or smaller. // Doing so improves odds that the working set fits in cache when evaluating Sieve::operator(). oneapi::tbb::parallel_reduce(SieveRange(s.multiples.m, n, s.multiples.m, grain_size), s, oneapi::tbb::simple_partitioner()); count += s.count; } return count; } ================================================ FILE: third-party/tbb/examples/parallel_reduce/primes/primes.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef TBB_examples_primes_H #define TBB_examples_primes_H #include #include "common/utility/get_default_num_threads.hpp" typedef std::size_t NumberType; //! Count number of primes between 0 and n /** This is the serial version. */ NumberType SerialCountPrimes(NumberType n); //! Count number of primes between 0 and n /** This is the parallel version. 
*/ NumberType ParallelCountPrimes(NumberType n, int numberOfThreads = utility::get_default_num_threads(), NumberType grainSize = 1000); #endif /* TBB_examples_primes_H */ ================================================ FILE: third-party/tbb/examples/task_arena/README.md ================================================ # Code Samples of oneAPI Threading Building Blocks (oneTBB) Examples using the `task_arena` feature. | Code sample name | Description |:--- |:--- | fractal |The example calculates two classical Mandelbrot fractals with different concurrency limits. ================================================ FILE: third-party/tbb/examples/task_arena/fractal/CMakeLists.txt ================================================ # Copyright (c) 2020-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
cmake_minimum_required(VERSION 3.5.0...3.31.3) project(fractal CXX) include(../../common/cmake/common.cmake) set_common_project_settings(tbb) add_executable(fractal fractal.cpp main.cpp ) add_subdirectory(../../common/gui gui) target_link_libraries(fractal PUBLIC TBB::tbb Threads::Threads UI_LIB_fractal) target_compile_options(fractal PRIVATE ${TBB_CXX_STD_FLAG}) set(EXECUTABLE "$") set(ARGS auto) set(PERF_ARGS auto 1 1000000 silent) set(LIGHT_ARGS auto 1 1000) add_execution_target(run_fractal fractal ${EXECUTABLE} "${ARGS}") add_execution_target(perf_run_fractal fractal ${EXECUTABLE} "${PERF_ARGS}") add_execution_target(light_test_fractal fractal ${EXECUTABLE} "${LIGHT_ARGS}") ================================================ FILE: third-party/tbb/examples/task_arena/fractal/README.md ================================================ # Fractal sample The example calculates two classical Mandelbrot fractals with different concurrency levels. The application window is divided into two areas where fractals are rendered. The example also has a console mode. ## Building the example ``` cmake [EXAMPLES_UI_MODE=value] <path_to_example> cmake --build . ``` ### Predefined CMake variables * `EXAMPLES_UI_MODE` - defines the GUI mode, supported values are `gdi`, `d2d`, `con` on Windows, `x`,`con` on Linux and `mac`,`con` on macOS. The default mode is `con`. See the [common page](../../README.md) to get more information. ## Running the sample ### Predefined make targets * `make run_fractal` - executes the example with predefined parameters. * `make perf_run_fractal` - executes the example with suggested parameters to measure the oneTBB performance. * `make light_test_fractal` - executes the example with suggested parameters to reduce execution time.
### Application parameters Usage: ``` fractal [n-of-threads=value] [n-of-frames=value] [max-of-iterations=value] [grain-size=value] [use-auto-partitioner] [silent] [single] [-h] [n-of-threads [n-of-frames [max-of-iterations [grain-size]]]] ``` * `-h` - prints the help for command line options. * `n-of-threads` - the number of threads to use; a range of the form low\[:high\], where low and optional high are non-negative integers or `auto` for a platform-specific default number. * `n-of-frames` - the number of frames the example processes internally. * `max-of-iterations` - the maximum number of the fractal iterations. * `grain-size` - the optional grain size, must be a positive integer. * `use-auto-partitioner` - use oneapi::tbb::auto_partitioner. * `silent` - no output except elapsed time. * `single` - process only one fractal. ### Interactive graphical user interface The following hot keys can be used in interactive execution mode when the example is compiled with the graphical user interface: * `left mouse button` - make the fractal active. * `w` - move the active fractal up. * `a` - move the active fractal to the left. * `s` - move the active fractal down. * `d` - move the active fractal to the right. * `q` - zoom in the active fractal. * `e` - zoom out the active fractal. * `r` - increase quality (count of iterations for each pixel) of the active fractal. * `f` - decrease quality (count of iterations for each pixel) of the active fractal. * `esc` - stop execution. ================================================ FILE: third-party/tbb/examples/task_arena/fractal/fractal.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include #include "oneapi/tbb/parallel_for.h" #include "oneapi/tbb/blocked_range2d.h" #include "oneapi/tbb/tick_count.h" #include "fractal.hpp" video *v; extern bool silent; extern bool schedule_auto; extern int grain_size; color_t fractal::calc_one_pixel(int x0, int y0) const { unsigned int iter; double fx0, fy0, xtemp, x, y, mu; color_t color; fx0 = (double)x0 - (double)size_x / 2.0; fy0 = (double)y0 - (double)size_y / 2.0; fx0 = fx0 / magn + cx; fy0 = fy0 / magn + cy; iter = 0; x = 0; y = 0; mu = 0; while (((x * x + y * y) <= 4) && (iter < max_iterations)) { xtemp = x * x - y * y + fx0; y = 2 * x * y + fy0; x = xtemp; mu += exp(-sqrt(x * x + y * y)); iter++; } if (iter == max_iterations) { // point corresponds to the mandelbrot set color = v->get_color(255, 255, 255); return color; } int b = (int)(256 * mu); int g = (b / 8); int r = (g / 16); b = b > 255 ? 255 : b; g = g > 255 ? 255 : g; r = r > 255 ? 255 : r; color = v->get_color(r, g, b); return color; } void fractal::clear() { drawing_area area(off_x, off_y, size_x, size_y, dm); // fill the rendering area with black color for (int y = 0; y < size_y; ++y) { area.set_pos(0, y); for (int x = 0; x < size_x; ++x) { area.put_pixel(v->get_color(0, 0, 0)); } } } void fractal::draw_border(bool is_active) { color_t color = is_active ? 
v->get_color(0, 255, 0) // green color : v->get_color(96, 128, 96); // green-gray color // top border drawing_area area0(off_x - 1, off_y - 1, size_x + 2, 1, dm); for (int i = -1; i < size_x + 1; ++i) area0.put_pixel(color); // bottom border drawing_area area1(off_x - 1, off_y + size_y, size_x + 2, 1, dm); for (int i = -1; i < size_x + 1; ++i) area1.put_pixel(color); // left border drawing_area area2(off_x - 1, off_y, 1, size_y + 2, dm); for (int i = 0; i < size_y; ++i) area2.set_pixel(0, i, color); // right border drawing_area area3(size_x + off_x, off_y, 1, size_y + 2, dm); for (int i = 0; i < size_y; ++i) area3.set_pixel(0, i, color); } void fractal::render_rect(int x0, int y0, int x1, int y1) const { // render the specified rectangle area drawing_area area(off_x + x0, off_y + y0, x1 - x0, y1 - y0, dm); for (int y = y0; y < y1; ++y) { area.set_pos(0, y - y0); for (int x = x0; x < x1; ++x) { area.put_pixel(calc_one_pixel(x, y)); } } } class fractal_body { fractal &f; public: void operator()(oneapi::tbb::blocked_range2d &r) const { if (v->next_frame()) f.render_rect(r.cols().begin(), r.rows().begin(), r.cols().end(), r.rows().end()); } fractal_body(fractal &_f) : f(_f) {} }; void fractal::render(oneapi::tbb::task_group_context &context) { // Make copy of fractal object and render fractal with parallel_for with // the provided context and partitioner chosen by schedule_auto. // Updates to fractal are not reflected in the render. 
fractal f = *this; fractal_body body(f); if (schedule_auto) oneapi::tbb::parallel_for( oneapi::tbb::blocked_range2d(0, size_y, grain_size, 0, size_x, grain_size), body, oneapi::tbb::auto_partitioner(), context); else oneapi::tbb::parallel_for( oneapi::tbb::blocked_range2d(0, size_y, grain_size, 0, size_x, grain_size), body, oneapi::tbb::simple_partitioner(), context); } void fractal::run(oneapi::tbb::task_group_context &context) { clear(); context.reset(); render(context); } bool fractal::check_point(int x, int y) const { return x >= off_x && x <= off_x + size_x && y >= off_y && y <= off_y + size_y; } void fractal_group::calc_fractal(int num) { // calculate the fractal fractal &f = num ? f1 : f0; oneapi::tbb::tick_count t0 = oneapi::tbb::tick_count::now(); while (v->next_frame() && num_frames[num] != 0) { f.run(context[num]); if (num_frames[num] > 0) num_frames[num] -= 1; } oneapi::tbb::tick_count t1 = oneapi::tbb::tick_count::now(); if (!silent) { printf(" %s fractal finished. Time: %g\n", num ? 
"Second" : "First", (t1 - t0).seconds()); } } void fractal_group::switch_active(int new_active) { if (new_active != -1) active = new_active; else active = 1 - active; // assumes 'active' is only 0 or 1 draw_borders(); } void fractal_group::set_num_frames_at_least(int n) { if (num_frames[0] < n) num_frames[0] = n; if (num_frames[1] < n) num_frames[1] = n; } void fractal_group::run(bool create_second_fractal) { // First argument of arenas construntor is used to restrict concurrency arenas[0].initialize(num_threads); arenas[1].initialize(num_threads / 2); draw_borders(); // the second fractal is calculating on separated thread if (create_second_fractal) { arenas[1].execute([&] { groups[1].run([&] { calc_fractal(1); }); }); } arenas[0].execute([&] { groups[0].run([&] { calc_fractal(0); }); }); if (create_second_fractal) { arenas[1].execute([&] { groups[1].wait(); }); } arenas[0].execute([&] { groups[0].wait(); }); } void fractal_group::draw_borders() { f0.draw_border(active == 0); f1.draw_border(active == 1); } fractal_group::fractal_group(const drawing_memory &_dm, int _num_threads, unsigned int _max_iterations, int _num_frames) : f0(_dm), f1(_dm), num_threads(_num_threads) { // set rendering areas f0.size_x = f1.size_x = _dm.sizex / 2 - 4; f0.size_y = f1.size_y = _dm.sizey - 4; f0.off_x = f0.off_y = f1.off_y = 2; f1.off_x = f0.size_x + 4 + 2; // set fractals parameters f0.cx = -0.6f; f0.cy = 0.0f; f0.magn = 200.0f; f1.cx = -0.6f; f1.cy = 0.0f; f1.magn = 200.0f; f0.max_iterations = f1.max_iterations = _max_iterations; // initially the first fractal is active active = 0; num_frames[0] = num_frames[1] = _num_frames; } void fractal_group::mouse_click(int x, int y) { // assumption that the point is not inside any fractal area int new_active = -1; if (f0.check_point(x, y)) { // the point is inside the first fractal area new_active = 0; } else if (f1.check_point(x, y)) { // the point is inside the second fractal area new_active = 1; } if (new_active != -1 && new_active != 
active) { switch_active(new_active); } } ================================================ FILE: third-party/tbb/examples/task_arena/fractal/fractal.hpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef TBB_examples_fractal_H #define TBB_examples_fractal_H #include #include "oneapi/tbb/task_arena.h" #include "oneapi/tbb/task_group.h" #include "common/gui/video.hpp" #include "common/utility/get_default_num_threads.hpp" //! Fractal class class fractal { //! Left corner of the fractal area int off_x, off_y; //! Size of the fractal area int size_x, size_y; //! Fractal properties float cx, cy; float magn; float step; unsigned int max_iterations; //! Drawing memory object for rendering const drawing_memory &dm; //! One pixel calculation routine color_t calc_one_pixel(int x, int y) const; //! Clears the fractal area void clear(); //! Draws the border around the fractal area void draw_border(bool is_active); //! Renders the fractal void render(oneapi::tbb::task_group_context &context); //! Check if the point is inside the fractal area bool check_point(int x, int y) const; public: //! Constructor fractal(const drawing_memory &dm) : step(0.2), dm(dm) { #if _MSC_VER && _WIN64 && !__INTEL_COMPILER // Workaround for MSVC x64 compiler issue volatile int i = 0; #endif } //! Runs the fractal calculation void run(oneapi::tbb::task_group_context &context); //! 
Renders the fractal rectangular area void render_rect(int x0, int y0, int x1, int y1) const; void move_up() { cy += step; } void move_down() { cy -= step; } void move_left() { cx += step; } void move_right() { cx -= step; } void zoom_in() { magn *= 2.; step /= 2.; } void zoom_out() { magn /= 2.; step *= 2.; } void quality_inc() { max_iterations += max_iterations / 2; } void quality_dec() { max_iterations -= max_iterations / 2; } friend class fractal_group; }; //! The group of fractals class fractal_group { //! Fractals definition fractal f0, f1; //! Number of frames to calculate std::atomic num_frames[2]; //! Contexts, arenas and groups for concurrent computation oneapi::tbb::task_group_context context[2]; oneapi::tbb::task_arena arenas[2]; oneapi::tbb::task_group groups[2]; //! Border type enumeration enum BORDER_TYPE { BORDER_INACTIVE = 0, BORDER_ACTIVE }; //! The number of the threads int num_threads; //! The active (high priority) fractal number int active; //! Draws the borders around the fractals void draw_borders(); //! Sets priorities for fractals calculations void set_priorities(); public: //! Constructor fractal_group(const drawing_memory &_dm, int num_threads = utility::get_default_num_threads(), unsigned int max_iterations = 100000, int num_frames = 1); //! Run calculation void run(bool create_second_fractal = true); //! Mouse event handler void mouse_click(int x, int y); //! Fractal calculation routine void calc_fractal(int num); //! Get number of threads int get_num_threads() const { return num_threads; } //! Reset the number of frames to be not less than the given value void set_num_frames_at_least(int n); //! Switch active fractal void switch_active(int new_active = -1); //! Get active fractal fractal &get_active_fractal() { return active ? 
f1 : f0; } void active_fractal_zoom_in() { get_active_fractal().zoom_in(); context[active].cancel_group_execution(); } void active_fractal_zoom_out() { get_active_fractal().zoom_out(); context[active].cancel_group_execution(); } void active_fractal_quality_inc() { get_active_fractal().quality_inc(); context[active].cancel_group_execution(); } void active_fractal_quality_dec() { get_active_fractal().quality_dec(); context[active].cancel_group_execution(); } void active_fractal_move_up() { get_active_fractal().move_up(); context[active].cancel_group_execution(); } void active_fractal_move_down() { get_active_fractal().move_down(); context[active].cancel_group_execution(); } void active_fractal_move_left() { get_active_fractal().move_left(); context[active].cancel_group_execution(); } void active_fractal_move_right() { get_active_fractal().move_right(); context[active].cancel_group_execution(); } }; #endif /* TBB_examples_fractal_H */ ================================================ FILE: third-party/tbb/examples/task_arena/fractal/fractal_video.hpp ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef TBB_examples_fractal_video_H #define TBB_examples_fractal_video_H #include "common/gui/video.hpp" #include "fractal.hpp" extern video *v; extern bool single; class fractal_video : public video { fractal_group *fg; private: void on_mouse(int x, int y, int key) { if (key == 1) { if (fg) { fg->set_num_frames_at_least(20); fg->mouse_click(x, y); } } } void on_key(int key) { switch (key & 0xff) { case esc_key: running = false; break; case 'q': if (fg) fg->active_fractal_zoom_in(); break; case 'e': if (fg) fg->active_fractal_zoom_out(); break; case 'r': if (fg) fg->active_fractal_quality_inc(); break; case 'f': if (fg) fg->active_fractal_quality_dec(); break; case 'w': if (fg) fg->active_fractal_move_up(); break; case 'a': if (fg) fg->active_fractal_move_left(); break; case 's': if (fg) fg->active_fractal_move_down(); break; case 'd': if (fg) fg->active_fractal_move_right(); break; } if (fg) fg->set_num_frames_at_least(20); } void on_process() { if (fg) { fg->run(!single); } } public: fractal_video() : fg(nullptr) { title = "oneTBB: Fractal Example"; v = this; } void set_fractal_group(fractal_group &_fg) { fg = &_fg; } }; #endif /* TBB_examples_fractal_video_H */ ================================================ FILE: third-party/tbb/examples/task_arena/fractal/gui/fractal.rc ================================================ // Microsoft Visual C++ generated resource script. // #include "resource.h" #define APSTUDIO_READONLY_SYMBOLS ///////////////////////////////////////////////////////////////////////////// // // Generated from the TEXTINCLUDE 2 resource. // #define APSTUDIO_HIDDEN_SYMBOLS #include "windows.h" #undef APSTUDIO_HIDDEN_SYMBOLS ///////////////////////////////////////////////////////////////////////////// #undef APSTUDIO_READONLY_SYMBOLS ///////////////////////////////////////////////////////////////////////////// // English (U.S.) 
resources #if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU) #ifdef _WIN32 LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US #pragma code_page(1252) #endif //_WIN32 ///////////////////////////////////////////////////////////////////////////// // // Icon // // Icon with lowest ID value placed first to ensure application icon // remains consistent on all systems. IDI_GUI ICON "gui.ico" IDI_SMALL ICON "small.ico" #ifdef APSTUDIO_INVOKED ///////////////////////////////////////////////////////////////////////////// // // TEXTINCLUDE // 1 TEXTINCLUDE BEGIN "resource.h\0" END 2 TEXTINCLUDE BEGIN "#define APSTUDIO_HIDDEN_SYMBOLS\r\n" "#include ""windows.h""\r\n" "#undef APSTUDIO_HIDDEN_SYMBOLS\r\n" "\0" END 3 TEXTINCLUDE BEGIN "\r\n" "\0" END #endif // APSTUDIO_INVOKED ///////////////////////////////////////////////////////////////////////////// // // String Table // STRINGTABLE BEGIN IDS_APP_TITLE "gui" IDC_GUI "GUI" END #endif // English (U.S.) resources ///////////////////////////////////////////////////////////////////////////// #ifndef APSTUDIO_INVOKED ///////////////////////////////////////////////////////////////////////////// // // Generated from the TEXTINCLUDE 3 resource. // ///////////////////////////////////////////////////////////////////////////// #endif // not APSTUDIO_INVOKED ================================================ FILE: third-party/tbb/examples/task_arena/fractal/gui/resource.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ #define IDC_MYICON 2 #define IDD_GUI 102 #define IDS_APP_TITLE 103 #define IDI_GUI 107 #define IDI_SMALL 108 #define IDC_GUI 109 #define IDR_MAINFRAME 128 #define IDC_STATIC -1 ================================================ FILE: third-party/tbb/examples/task_arena/fractal/main.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #define VIDEO_WINMAIN_ARGS #include #include #include "oneapi/tbb/tick_count.h" #include "common/utility/utility.hpp" #include "fractal.hpp" #include "fractal_video.hpp" bool silent = false; bool single = false; bool schedule_auto = false; int grain_size = 8; int main(int argc, char *argv[]) { oneapi::tbb::tick_count mainStartTime = oneapi::tbb::tick_count::now(); // It is used for console mode for test with different number of threads and also has // meaning for GUI: threads.first - use separate event/updating loop thread (>0) or not (0). 
// threads.second - initialization value for scheduler utility::thread_number_range threads(utility::get_default_num_threads); int num_frames = -1; int max_iterations = 1000000; // command line parsing utility::parse_cli_arguments( argc, argv, utility::cli_argument_pack() //"-h" option for displaying help is present implicitly .positional_arg(threads, "n-of-threads", utility::thread_number_range_desc) .positional_arg( num_frames, "n-of-frames", "number of frames the example processes internally") .positional_arg( max_iterations, "max-of-iterations", "maximum number of the fractal iterations") .positional_arg(grain_size, "grain-size", "the grain size value") .arg(schedule_auto, "use-auto-partitioner", "use oneapi::tbb::auto_partitioner") .arg(silent, "silent", "no output except elapsed time") .arg(single, "single", "process only one fractal")); fractal_video video; // video layer init if (video.init_window(1024, 512)) { video.calc_fps = false; video.threaded = threads.first > 0; // initialize fractal group fractal_group fg(video.get_drawing_memory(), threads.last, max_iterations, num_frames); video.set_fractal_group(fg); // main loop video.main_loop(); } else if (video.init_console()) { // in console mode we always have limited number of frames num_frames = num_frames < 0 ? 1 : num_frames; for (int p = threads.first; p <= threads.last; p = threads.step(p)) { if (!silent) printf("Threads = %d\n", p); fractal_group fg(video.get_drawing_memory(), p, max_iterations, num_frames); fg.run(!single); } } video.terminate(); utility::report_elapsed_time((oneapi::tbb::tick_count::now() - mainStartTime).seconds()); return 0; } ================================================ FILE: third-party/tbb/examples/task_group/README.md ================================================ # Code Samples of oneAPI Threading Building Blocks (oneTBB) Examples using `task_group` interface. | Code sample name | Description |:--- |:--- | sudoku | Compute all solutions for a Sudoku board. 
================================================ FILE: third-party/tbb/examples/task_group/sudoku/CMakeLists.txt ================================================ # Copyright (c) 2020-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. cmake_minimum_required(VERSION 3.5.0...3.31.3) project(sudoku CXX) include(../../common/cmake/common.cmake) set_common_project_settings(tbb) add_executable(sudoku sudoku.cpp) target_link_libraries(sudoku TBB::tbb Threads::Threads) target_compile_options(sudoku PRIVATE ${TBB_CXX_STD_FLAG}) if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL IntelLLVM) target_compile_options(sudoku PRIVATE -D_CRT_SECURE_NO_WARNINGS) endif() set(EXECUTABLE "$") set(ARGS 4 ${CMAKE_CURRENT_SOURCE_DIR}/input1 verbose) set(PERF_ARGS auto ${CMAKE_CURRENT_SOURCE_DIR}/input1 silent) add_execution_target(run_sudoku sudoku ${EXECUTABLE} "${ARGS}") add_execution_target(perf_run_sudoku sudoku ${EXECUTABLE} "${PERF_ARGS}") ================================================ FILE: third-party/tbb/examples/task_group/sudoku/README.md ================================================ # Sudoku sample This directory contains an example that finds all solutions to a Sudoku board. It uses a straightforward state-space search algorithm that exhibits OR-parallelism. It can be optionally run until it obtains just the first solution. The point of the example is to teach how to use the `task_group` interface. ## Building the example ``` cmake cmake --build . 
``` ## Running the sample ### Predefined make targets * `make run_sudoku` - executes the example with predefined parameters. * `make perf_run_sudoku` - executes the example with suggested parameters to measure the oneTBB performance. ### Application parameters Usage: ``` sudoku [n-of-threads=value] [filename=value] [verbose] [silent] [find-one] [-h] [n-of-threads [filename]] ``` * `-h` - prints the help for command line options. * `n-of-threads` - the number of threads to use; a range of the form low\[:high\], where low and optional high are non-negative integers or `auto` for a platform-specific default number. * `filename` - the input filename. * `verbose` - prints the first solution. * `silent` - no output except elapsed time. * `find-one` - stops after finding the first solution. The example directory contains the following files, any of which may be used as the input file: `input1` - Sample input file with a modest number of solutions. `input2` - Sample input file with a small number of solutions. `input3` - Sample input file with a larger number of solutions. `input4` - Sample input file with a very large number of solutions.
================================================ FILE: third-party/tbb/examples/task_group/sudoku/input1 ================================================ 1 0 0 9 0 0 0 8 0 0 8 0 2 0 0 0 0 0 0 0 5 0 0 0 7 0 0 0 5 2 1 0 0 4 0 0 0 0 0 0 0 5 0 0 7 4 0 0 7 0 0 0 3 0 0 3 0 0 0 2 0 0 5 0 0 0 0 0 0 1 0 0 5 0 0 0 1 0 0 0 0 1 0 0 9 0 0 0 8 0 0 8 0 2 0 0 0 0 0 0 0 5 0 0 0 7 0 0 0 5 2 1 0 0 4 0 0 0 0 0 0 0 5 0 0 7 4 0 0 7 0 0 0 3 0 0 3 0 0 0 2 0 0 5 0 0 0 0 0 0 1 0 0 5 0 0 0 1 0 0 0 0 ================================================ FILE: third-party/tbb/examples/task_group/sudoku/input2 ================================================ 2 0 1 0 0 0 0 8 0 0 8 0 2 1 9 6 0 0 0 0 5 0 0 0 7 0 0 0 5 2 1 0 0 4 0 0 0 0 0 0 0 5 0 0 7 4 0 0 7 0 0 0 3 0 0 3 0 0 0 2 0 0 5 0 0 0 0 3 0 1 0 0 5 0 0 0 8 0 0 0 6 2 0 1 0 0 0 0 8 0 0 8 0 2 1 9 6 0 0 0 0 5 0 0 0 7 0 0 0 5 2 1 0 0 4 0 0 0 0 0 0 0 5 0 0 7 4 0 0 7 0 0 0 3 0 0 3 0 0 0 2 0 0 5 0 0 0 0 3 0 1 0 0 5 0 0 0 8 0 0 0 6 ================================================ FILE: third-party/tbb/examples/task_group/sudoku/input3 ================================================ 1 0 0 9 0 0 0 8 0 0 0 0 2 0 0 0 0 0 0 0 5 0 0 0 7 0 0 0 5 2 6 0 0 4 0 0 0 0 0 0 0 5 0 0 7 4 0 0 7 0 0 0 3 0 0 3 0 0 0 2 0 0 5 0 0 0 0 0 0 1 0 0 5 0 0 0 1 0 0 0 0 1 0 0 9 0 0 0 8 0 0 0 0 2 0 0 0 0 0 0 0 5 0 0 0 7 0 0 0 5 2 6 0 0 4 0 0 0 0 0 0 0 5 0 0 7 4 0 0 7 0 0 0 3 0 0 3 0 0 0 2 0 0 5 0 0 0 0 0 0 1 0 0 5 0 0 0 1 0 0 0 0 ================================================ FILE: third-party/tbb/examples/task_group/sudoku/input4 ================================================ 1 0 0 9 0 0 0 8 0 0 0 0 2 0 0 0 0 0 0 0 5 0 0 0 7 0 0 0 0 2 6 0 0 0 0 0 0 0 0 0 0 5 0 0 7 4 0 0 0 0 0 0 3 0 0 3 0 0 0 2 0 0 5 0 0 0 0 0 0 1 0 0 5 0 0 0 1 0 0 0 0 1 0 0 9 0 0 0 8 0 0 0 0 2 0 0 0 0 0 0 0 5 0 0 0 7 0 0 0 0 2 6 0 0 0 0 0 0 0 0 0 0 5 0 0 7 4 0 0 0 0 0 0 3 0 0 3 0 0 0 2 0 0 5 0 0 0 0 0 0 1 0 0 5 0 0 0 1 0 0 0 0 ================================================ FILE: 
third-party/tbb/examples/task_group/sudoku/sudoku.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include #include #include #include "oneapi/tbb/tick_count.h" #include "oneapi/tbb/task_group.h" #include "oneapi/tbb/global_control.h" #include "common/utility/utility.hpp" #include "common/utility/get_default_num_threads.hpp" #pragma warning(disable : 4996) const unsigned BOARD_SIZE = 81; const unsigned BOARD_DIM = 9; std::atomic nSols; bool find_one = false; bool verbose = false; unsigned short init_values[BOARD_SIZE] = { 1, 0, 0, 9, 0, 0, 0, 8, 0, 0, 8, 0, 2, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0, 5, 2, 1, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 7, 4, 0, 0, 7, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 2, 0, 0, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 5, 0, 0, 0, 1, 0, 0, 0, 0 }; double solve_time; typedef struct { unsigned short solved_element; unsigned potential_set; } board_element; void read_board(const char* filename) { FILE* fp; int input; fp = fopen(filename, "r"); if (!fp) { fprintf(stderr, "sudoku: Could not open input file '%s'.\n", filename); std::exit(-1); } for (unsigned i = 0; i < BOARD_SIZE; ++i) { if (fscanf(fp, "%d", &input)) init_values[i] = input; else { fprintf(stderr, "sudoku: Error in input file at entry %d, assuming 0.\n", i); init_values[i] = 0; } } fclose(fp); } void print_board(const std::vector& b) { for (unsigned row = 0; row < BOARD_DIM; ++row) { for (unsigned col = 0; col < BOARD_DIM; 
++col) { printf(" %d", b[row * BOARD_DIM + col].solved_element); if (col == 2 || col == 5) printf(" |"); } printf("\n"); if (row == 2 || row == 5) printf(" ---------------------\n"); } } void print_potential_board(const std::vector& b) { for (unsigned row = 0; row < BOARD_DIM; ++row) { for (unsigned col = 0; col < BOARD_DIM; ++col) { if (b[row * BOARD_DIM + col].solved_element) printf(" %4d ", b[row * BOARD_DIM + col].solved_element); else printf(" [%4d]", b[row * BOARD_DIM + col].potential_set); if (col == 2 || col == 5) printf(" |"); } printf("\n"); if (row == 2 || row == 5) printf(" ------------------------------------------------------------------\n"); } } void init_board(std::vector& b) { for (unsigned i = 0; i < BOARD_SIZE; ++i) b[i].solved_element = b[i].potential_set = 0; } void init_board(std::vector& b, unsigned short arr[BOARD_SIZE]) { for (unsigned i = 0; i < BOARD_SIZE; ++i) { b[i].solved_element = arr[i]; b[i].potential_set = 0; } } void init_potentials(std::vector& b) { for (unsigned i = 0; i < BOARD_SIZE; ++i) b[i].potential_set = 0; } bool fixed_board(const std::vector& b) { for (int i = BOARD_SIZE - 1; i >= 0; --i) if (b[i].solved_element == 0) return false; return true; } bool in_row(const std::vector& b, unsigned row, unsigned col, unsigned short p) { for (unsigned c = 0; c < BOARD_DIM; ++c) if (c != col && b[row * BOARD_DIM + c].solved_element == p) return true; return false; } bool in_col(const std::vector& b, unsigned row, unsigned col, unsigned short p) { for (unsigned r = 0; r < BOARD_DIM; ++r) if (r != row && b[r * BOARD_DIM + col].solved_element == p) return true; return false; } bool in_block(const std::vector& b, unsigned row, unsigned col, unsigned short p) { unsigned b_row = row / 3 * 3, b_col = col / 3 * 3; for (unsigned i = b_row; i < b_row + 3; ++i) for (unsigned j = b_col; j < b_col + 3; ++j) if (!(i == row && j == col) && b[i * BOARD_DIM + j].solved_element == p) return true; return false; } void calculate_potentials(std::vector& 
b) { for (unsigned i = 0; i < BOARD_SIZE; ++i) { b[i].potential_set = 0; if (!b[i].solved_element) { // element is not yet fixed unsigned row = i / BOARD_DIM, col = i % BOARD_DIM; for (unsigned potential = 1; potential <= BOARD_DIM; ++potential) { if (!in_row(b, row, col, potential) && !in_col(b, row, col, potential) && !in_block(b, row, col, potential)) b[i].potential_set |= 1 << (potential - 1); } } } } bool valid_board(const std::vector& b) { bool success = true; for (unsigned i = 0; i < BOARD_SIZE; ++i) { if (success && b[i].solved_element) { // element is fixed unsigned row = i / BOARD_DIM, col = i % BOARD_DIM; if (in_row(b, row, col, b[i].solved_element) || in_col(b, row, col, b[i].solved_element) || in_block(b, row, col, b[i].solved_element)) success = false; } } return success; } bool examine_potentials(std::vector& b, bool& progress) { bool singletons = false; for (unsigned i = 0; i < BOARD_SIZE; ++i) { if (b[i].solved_element == 0 && b[i].potential_set == 0) // empty set return false; switch (b[i].potential_set) { case 1: { b[i].solved_element = 1; singletons = true; break; } case 2: { b[i].solved_element = 2; singletons = true; break; } case 4: { b[i].solved_element = 3; singletons = true; break; } case 8: { b[i].solved_element = 4; singletons = true; break; } case 16: { b[i].solved_element = 5; singletons = true; break; } case 32: { b[i].solved_element = 6; singletons = true; break; } case 64: { b[i].solved_element = 7; singletons = true; break; } case 128: { b[i].solved_element = 8; singletons = true; break; } case 256: { b[i].solved_element = 9; singletons = true; break; } } } progress = singletons; return valid_board(b); } void partial_solve(oneapi::tbb::task_group& g, std::vector& b, unsigned first_potential_set) { if (fixed_board(b)) { if (find_one) g.cancel(); if (++nSols == 1 && verbose) { print_board(b); } return; } calculate_potentials(b); bool progress = true; bool success = examine_potentials(b, progress); if (success && progress) { 
partial_solve(g, b, first_potential_set); } else if (success && !progress) { while (b[first_potential_set].solved_element != 0) ++first_potential_set; for (unsigned short potential = 1; potential <= BOARD_DIM; ++potential) { if (1 << (potential - 1) & b[first_potential_set].potential_set) { g.run([&g, b /*make a copy of the board*/, first_potential_set, potential]() { //as task_group treat passed in functor as const - const_cast is needed //to allow modification of the copy auto& new_board = const_cast&>(b); new_board[first_potential_set].solved_element = potential; partial_solve(g, new_board, first_potential_set); }); } } } } unsigned solve(int p) { oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, p); nSols = 0; std::vector start_board(BOARD_SIZE); init_board(start_board, init_values); oneapi::tbb::task_group g; oneapi::tbb::tick_count t0 = oneapi::tbb::tick_count::now(); partial_solve(g, start_board, 0); g.wait(); solve_time = (oneapi::tbb::tick_count::now() - t0).seconds(); return nSols; } int main(int argc, char* argv[]) { oneapi::tbb::tick_count mainStartTime = oneapi::tbb::tick_count::now(); utility::thread_number_range threads(utility::get_default_num_threads); std::string filename = ""; bool silent = false; utility::parse_cli_arguments( argc, argv, utility::cli_argument_pack() //"-h" option for displaying help is present implicitly .positional_arg(threads, "n-of-threads", utility::thread_number_range_desc) .positional_arg(filename, "filename", "input filename") .arg(verbose, "verbose", "prints the first solution") .arg(silent, "silent", "no output except elapsed time") .arg(find_one, "find-one", "stops after finding first solution\n")); if (silent) verbose = false; if (!filename.empty()) read_board(filename.c_str()); // otherwise (if file name not specified), the default statically initialized board will be used. 
for (int p = threads.first; p <= threads.last; p = threads.step(p)) { unsigned number = solve(p); if (!silent) { if (find_one) { printf("Sudoku: Time to find first solution on %d threads: %6.6f seconds.\n", p, solve_time); } else { printf("Sudoku: Time to find all %u solutions on %d threads: %6.6f seconds.\n", number, p, solve_time); } } } utility::report_elapsed_time((oneapi::tbb::tick_count::now() - mainStartTime).seconds()); return 0; }; ================================================ FILE: third-party/tbb/examples/test_all/README.md ================================================ # Code Samples of oneAPI Threading Building Blocks (oneTBB) Examples that test various components of oneTBB. | Code sample name | Description |:--- |:--- | fibonacci | Compute Fibonacci numbers in different ways. ================================================ FILE: third-party/tbb/examples/test_all/fibonacci/CMakeLists.txt ================================================ # Copyright (c) 2019-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
cmake_minimum_required(VERSION 3.5.0...3.31.3) project(fibonacci CXX) include(../../common/cmake/common.cmake) set_common_project_settings(tbb) add_executable(fibonacci fibonacci.cpp) target_link_libraries(fibonacci TBB::tbb Threads::Threads $<$:rt>) # Link "rt" library on Linux target_compile_options(fibonacci PRIVATE ${TBB_CXX_STD_FLAG}) set(EXECUTABLE "$") set(ARGS "") add_execution_target(run_fibonacci fibonacci ${EXECUTABLE} "${ARGS}") ================================================ FILE: third-party/tbb/examples/test_all/fibonacci/README.md ================================================ # Fibonacci sample This directory contains an example that computes Fibonacci numbers in several different ways. The purpose of the example is to exercise every include file and class in Intel® oneAPI Threading Building Blocks. Most of the computations are deliberately silly and not expected to show any speedup on multiprocessors. ## Building the example ``` cmake <path_to_example> cmake --build . ``` ## Running the sample ### Predefined make targets * `make run_fibonacci` - executes the example with predefined parameters. ### Application parameters Usage: ``` fibonacci K [M[:N]] [R] ``` * `K` - the index of the Fibonacci number to calculate. * `[M[:N]]` - a range of numbers of threads to be used. * `R` - the number of times to repeat the calculation. ================================================ FILE: third-party/tbb/examples/test_all/fibonacci/fibonacci.cpp ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License. */ /* Example program that computes Fibonacci numbers in different ways. Arguments are: [ Number [Threads [Repeats]]] The defaults are Number=500 Threads=1:4 Repeats=1. The point of this program is to check that the library is working properly. Most of the computations are deliberately silly and not expected to show any speedup on multiprocessors. */ // enable assertions #ifdef NDEBUG #undef NDEBUG #endif #include #include #include #include #include #include #include #include "oneapi/tbb/tick_count.h" #include "oneapi/tbb/blocked_range.h" #include "oneapi/tbb/concurrent_vector.h" #include "oneapi/tbb/concurrent_queue.h" #include "oneapi/tbb/concurrent_hash_map.h" #include "oneapi/tbb/parallel_for.h" #include "oneapi/tbb/parallel_reduce.h" #include "oneapi/tbb/parallel_scan.h" #include "oneapi/tbb/parallel_pipeline.h" #include "oneapi/tbb/spin_mutex.h" #include "oneapi/tbb/queuing_mutex.h" #include "oneapi/tbb/global_control.h" //! type used for Fibonacci number computations typedef long long value; //! Matrix 2x2 class struct Matrix2x2 { //! Array of values value v[2][2]; Matrix2x2() {} Matrix2x2(value v00, value v01, value v10, value v11) { v[0][0] = v00; v[0][1] = v01; v[1][0] = v10; v[1][1] = v11; } Matrix2x2 operator*(const Matrix2x2 &to) const; //< Multiply two Matrices }; //! Identity matrix static const Matrix2x2 MatrixIdentity(1, 0, 0, 1); //! Default matrix to multiply static const Matrix2x2 Matrix1110(1, 1, 1, 0); //! Raw arrays matrices multiply void Matrix2x2Multiply(const value a[2][2], const value b[2][2], value c[2][2]); /////////////////////// Serial methods //////////////////////// //! Plain serial sum value SerialFib(int n) { if (n < 2) return n; value a = 0, b = 1, sum; int i; for (i = 2; i <= n; i++) { // n is really index of Fibonacci number sum = a + b; a = b; b = sum; } return sum; } //! 
//! Serial n-1 matrices multiplication
/** `a` permanently holds the matrix {{1,1},{1,0}}; `b` and `c` are used alternately as
    the input and the output of Matrix2x2Multiply, so an intermediate result never has
    to be copied back.
    NOTE(review): assumes Matrix2x2Multiply(x, y, z) stores the product of x and y in z -
    confirm against its definition. */
value SerialMatrixFib(int n) {
    // c is deliberately not initialized: the first iteration (i == 2) writes it
    // before any iteration reads it
    value c[2][2], a[2][2] = { { 1, 1 }, { 1, 0 } }, b[2][2] = { { 1, 1 }, { 1, 0 } };
    // declared outside the loop because its final value is needed to pick the result buffer
    int i;
    for (i = 2; i < n; i++) {
        // Using condition to prevent copying of values
        if (i & 1)
            Matrix2x2Multiply(a, c, b); // odd i: c is the input, b receives the result
        else
            Matrix2x2Multiply(a, b, c); // even i: b is the input, c receives the result
    }
    // After the loop i is odd exactly when the last executed iteration was even, i.e. wrote c.
    // If the loop body never ran (n <= 2), i is still 2 and the initial b is used.
    return (i & 1) ? c[0][0] : b[0][0]; // get result from upper left cell
}
//! Hash comparer
/** Supplies the two operations the hash map needs for `int` keys:
    an identity hash and plain equality. */
struct IntHashCompare {
    //! The key converted to std::size_t is its own hash value
    std::size_t hash(const int k) const {
        return static_cast<std::size_t>(k);
    }
    //! Two keys are equal when they have the same value
    bool equal(const int j, const int k) const {
        return k == j;
    }
};
Root function value ConcurrentHashSerialFib(int n) { NumbersTable Fib; bool okay; okay = Fib.insert(std::make_pair(0, 0)); assert(okay); // assign initial values okay = Fib.insert(std::make_pair(1, 1)); assert(okay); // task_list list; oneapi::tbb::task_group tg; // allocate tasks tg.run(ConcurrentHashSerialFibTask(Fib, n)); tg.run(ConcurrentHashSerialFibTask(Fib, n)); tg.wait(); NumbersTable::const_accessor fresult; okay = Fib.find(fresult, n); assert(okay); return fresult->second; } // *** Queue with parallel_pipeline *** // typedef oneapi::tbb::concurrent_queue queue_t; namespace parallel_pipeline_ns { std::atomic N; //< index of Fibonacci number minus 1 queue_t Queue; } // namespace parallel_pipeline_ns //! functor to fills queue struct InputFunc { InputFunc() {} queue_t *operator()(oneapi::tbb::flow_control &fc) const { using namespace parallel_pipeline_ns; int n = --N; if (n <= 0) { fc.stop(); return nullptr; } Queue.push(Matrix1110); return &Queue; } }; //! functor to process queue struct MultiplyFunc { MultiplyFunc() {} void operator()(queue_t *queue) const { //concurrent_queue &Queue = *static_cast *>(p); Matrix2x2 m1, m2; // get two elements while (!queue->try_pop(m1)) yield(); while (!queue->try_pop(m2)) yield(); m1 = m1 * m2; // process them queue->push(m1); // and push back } }; //! Root function value ParallelPipeFib(int n) { using namespace parallel_pipeline_ns; N = n - 1; Queue.push(Matrix1110); oneapi::tbb::parallel_pipeline( n, oneapi::tbb::make_filter(oneapi::tbb::filter_mode::parallel, InputFunc()) & oneapi::tbb::make_filter(oneapi::tbb::filter_mode::parallel, MultiplyFunc())); assert(Queue.unsafe_size() == 1); Matrix2x2 M; bool result = Queue.try_pop(M); // get last element assert(result); value res = M.v[0][0]; // get value Queue.clear(); return res; } // *** parallel_reduce *** // //! Functor for parallel_reduce struct parallel_reduceFibBody { Matrix2x2 sum; int split_flag; //< flag to make one less operation for split bodies //! 
Constructor fills sum with initial matrix parallel_reduceFibBody() : sum(Matrix1110), split_flag(0) {} //! Splitting constructor parallel_reduceFibBody(parallel_reduceFibBody &other, oneapi::tbb::split) : sum(Matrix1110), split_flag(1 /*note that it is split*/) {} //! Join point void join(parallel_reduceFibBody &s) { sum = sum * s.sum; } //! Process multiplications void operator()(const oneapi::tbb::blocked_range &r) { for (int k = r.begin() + split_flag; k < r.end(); ++k) sum = sum * Matrix1110; split_flag = 0; // reset flag, because this method can be reused for next range } }; //! Root function value parallel_reduceFib(int n) { parallel_reduceFibBody b; oneapi::tbb::parallel_reduce(oneapi::tbb::blocked_range(2, n, 3), b); // do parallel reduce on range [2, n) for b return b.sum.v[0][0]; } // *** parallel_scan *** // //! Functor for parallel_scan struct parallel_scanFibBody { /** Though parallel_scan is usually used to accumulate running sums, it can be used to accumulate running products too. */ Matrix2x2 product; /** Pointer to output sequence */ value *const output; //! Constructor sets product to identity matrix parallel_scanFibBody(value *output_) : product(MatrixIdentity), output(output_) {} //! Splitting constructor parallel_scanFibBody(parallel_scanFibBody &b, oneapi::tbb::split) : product(MatrixIdentity), output(b.output) {} //! Method for merging summary information from a, which was split off from *this, into *this. void reverse_join(parallel_scanFibBody &a) { // When using non-commutative reduction operation, reverse_join // should put argument "a" on the left side of the operation. // The reversal from the argument order is why the method is // called "reverse_join" instead of "join". product = a.product * product; } //! Method for assigning final result back to original body. void assign(parallel_scanFibBody &b) { product = b.product; } //! Compute matrix running product. 
/** Tag indicates whether is is the final scan over the range, or just a helper "prescan" that is computing a partial reduction. */ template void operator()(const oneapi::tbb::blocked_range &r, Tag tag) { for (int k = r.begin(); k < r.end(); ++k) { // Code performs an "exclusive" scan, which outputs a value *before* updating the product. // For an "inclusive" scan, output the value after the update. if (tag.is_final_scan()) output[k] = product.v[0][1]; product = product * Matrix1110; } } }; //! Root function value parallel_scanFib(int n) { value *output = new value[n]; parallel_scanFibBody b(output); oneapi::tbb::parallel_scan(oneapi::tbb::blocked_range(0, n, 3), b); // output[0..n-1] now contains the Fibonacci sequence (modulo integer wrap-around). // Check the last two values for correctness. assert(n < 2 || output[n - 2] + output[n - 1] == b.product.v[0][1]); delete[] output; return b.product.v[0][1]; } /////////////////////////// Main //////////////////////////////////////////////////// //! A closed range of int. struct IntRange { int low; int high; void set_from_string(const char *s); IntRange(int low_, int high_) : low(low_), high(high_) {} }; void IntRange::set_from_string(const char *s) { char *end; high = low = strtol(s, &end, 0); switch (*end) { case ':': high = strtol(end + 1, nullptr, 0); break; case '\0': break; default: printf("unexpected character = %c\n", *end); } } //! Tick count for start static oneapi::tbb::tick_count t0; //! Verbose output flag static bool Verbose = false; typedef value (*MeasureFunc)(int); //! Measure ticks count in loop [2..n] value Measure(const char *name, MeasureFunc func, int n) { value result; if (Verbose) printf("%s", name); t0 = oneapi::tbb::tick_count::now(); for (int number = 2; number <= n; number++) result = func(number); if (Verbose) printf("\t- in %f msec\n", (oneapi::tbb::tick_count::now() - t0).seconds() * 1000); return result; } //! 
program entry int main(int argc, char *argv[]) { if (argc > 1) Verbose = true; int NumbersCount = argc > 1 ? strtol(argv[1], nullptr, 0) : 500; IntRange NThread(1, 4); // Number of threads to use. if (argc > 2) NThread.set_from_string(argv[2]); unsigned long ntrial = argc > 3 ? (unsigned long)strtoul(argv[3], nullptr, 0) : 1; value result, sum; if (Verbose) printf("Fibonacci numbers example. Generating %d numbers..\n", NumbersCount); result = Measure("Serial loop", SerialFib, NumbersCount); sum = Measure("Serial matrix", SerialMatrixFib, NumbersCount); assert(result == sum); sum = Measure("Serial vector", SerialVectorFib, NumbersCount); assert(result == sum); sum = Measure("Serial queue", SerialQueueFib, NumbersCount); assert(result == sum); // now in parallel for (unsigned long i = 0; i < ntrial; ++i) { for (int threads = NThread.low; threads <= NThread.high; threads *= 2) { oneapi::tbb::global_control c(oneapi::tbb::global_control::max_allowed_parallelism, threads); if (Verbose) printf("\nThreads number is %d\n", threads); sum = Measure("Shared serial (mutex)\t", SharedSerialFib, NumbersCount); assert(result == sum); sum = Measure("Shared serial (spin_mutex)", SharedSerialFib, NumbersCount); assert(result == sum); sum = Measure("Shared serial (queuing_mutex)", SharedSerialFib, NumbersCount); assert(result == sum); sum = Measure("Shared serial (Conc.HashTable)", ConcurrentHashSerialFib, NumbersCount); assert(result == sum); sum = Measure("Parallel pipe/queue\t", ParallelPipeFib, NumbersCount); assert(result == sum); sum = Measure("Parallel reduce\t\t", parallel_reduceFib, NumbersCount); assert(result == sum); sum = Measure("Parallel scan\t\t", parallel_scanFib, NumbersCount); assert(result == sum); } #ifdef __GNUC__ if (Verbose) printf("Fibonacci number #%d modulo 2^64 is %lld\n\n", NumbersCount, result); #else if (Verbose) printf("Fibonacci number #%d modulo 2^64 is %I64d\n\n", NumbersCount, result); #endif } if (!Verbose) printf("TEST PASSED\n"); // flush to 
prevent bufferization on exit fflush(stdout); return 0; } // Utils void Matrix2x2Multiply(const value a[2][2], const value b[2][2], value c[2][2]) { for (int i = 0; i <= 1; i++) for (int j = 0; j <= 1; j++) c[i][j] = a[i][0] * b[0][j] + a[i][1] * b[1][j]; } Matrix2x2 Matrix2x2::operator*(const Matrix2x2 &to) const { Matrix2x2 result; Matrix2x2Multiply(v, to.v, result.v); return result; } ================================================ FILE: third-party/tbb/include/oneapi/tbb/blocked_nd_range.h ================================================ /* Copyright (c) 2017-2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_blocked_nd_range_H #define __TBB_blocked_nd_range_H #include // std::any_of #include #include #include // std::is_same, std::enable_if #include "detail/_config.h" #include "detail/_template_helpers.h" // index_sequence, make_index_sequence #include "detail/_namespace_injection.h" #include "detail/_range_common.h" #include "blocked_range.h" namespace tbb { namespace detail { namespace d1 { /* The blocked_nd_range_impl uses make_index_sequence to automatically generate a ctor with exactly N arguments of the type tbb::blocked_range. Such ctor provides an opportunity to use braced-init-list parameters to initialize each dimension. Use of parameters, whose representation is a braced-init-list, but they're not std::initializer_list or a reference to one, produces a non-deduced context within template argument deduction. 
NOTE: blocked_nd_range must be exactly a templated alias to the blocked_nd_range_impl (and not e.g. a derived class), otherwise it would need to declare its own ctor facing the same problem that the impl class solves. */ template> class blocked_nd_range_impl; template class blocked_nd_range_impl> { public: //! Type of a value. using value_type = Value; //! Type of a dimension range. using dim_range_type = tbb::blocked_range; //! Type for the size of a range. using size_type = typename dim_range_type::size_type; blocked_nd_range_impl() = delete; //! Constructs N-dimensional range over N half-open intervals each represented as tbb::blocked_range. blocked_nd_range_impl(const indexed_t&... args) : my_dims{ {args...} } {} #if __clang__ && __TBB_CLANG_VERSION < 140000 // On clang prior to version 14.0.0, passing a single braced init list to the constructor of blocked_nd_range // matches better on the C array constructor and generates compile-time error because of unexpected size // Adding constraints for this constructor to force the compiler to drop it from overload resolution if the size is unexpected template ::type> blocked_nd_range_impl(const value_type (&size)[M], size_type grainsize = 1) : #else blocked_nd_range_impl(const value_type (&size)[N], size_type grainsize = 1) : #endif my_dims { dim_range_type(0, size[Is], grainsize)... } {} //! Dimensionality of a range. static constexpr unsigned int dim_count() { return N; } //! Range in certain dimension. const dim_range_type& dim(unsigned int dimension) const { __TBB_ASSERT(dimension < N, "out of bound"); return my_dims[dimension]; } //------------------------------------------------------------------------ // Methods that implement Range concept //------------------------------------------------------------------------ //! True if at least one dimension is empty. bool empty() const { return std::any_of(my_dims.begin(), my_dims.end(), [](const dim_range_type& d) { return d.empty(); }); } //! 
True if at least one dimension is divisible. bool is_divisible() const { return std::any_of(my_dims.begin(), my_dims.end(), [](const dim_range_type& d) { return d.is_divisible(); }); } blocked_nd_range_impl(blocked_nd_range_impl& r, proportional_split proportion) : my_dims(r.my_dims) { do_split(r, proportion); } blocked_nd_range_impl(blocked_nd_range_impl& r, split proportion) : my_dims(r.my_dims) { do_split(r, proportion); } private: static_assert(N != 0, "zero dimensional blocked_nd_range can't be constructed"); //! Ranges in each dimension. std::array my_dims; template void do_split(blocked_nd_range_impl& r, split_type proportion) { static_assert((std::is_same::value || std::is_same::value), "type of split object is incorrect"); __TBB_ASSERT(r.is_divisible(), "can't split not divisible range"); auto my_it = std::max_element(my_dims.begin(), my_dims.end(), [](const dim_range_type& first, const dim_range_type& second) { return (first.size() * double(second.grainsize()) < second.size() * double(first.grainsize())); }); auto r_it = r.my_dims.begin() + (my_it - my_dims.begin()); my_it->my_begin = dim_range_type::do_split(*r_it, proportion); // (!(my_it->my_begin < r_it->my_end) && !(r_it->my_end < my_it->my_begin)) equals to // (my_it->my_begin == r_it->my_end), but we can't use operator== due to Value concept __TBB_ASSERT(!(my_it->my_begin < r_it->my_end) && !(r_it->my_end < my_it->my_begin), "blocked_range has been split incorrectly"); } }; template __TBB_requires(blocked_range_value) class blocked_nd_range : public blocked_nd_range_impl { using base = blocked_nd_range_impl; // Making constructors of base class visible using base::base; }; #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT && __TBB_PREVIEW_BLOCKED_ND_RANGE_DEDUCTION_GUIDES // blocked_nd_range(const dim_range_type& dim0, const dim_range_type& dim1, ...) 
// while the arguments are passed as braced-init-lists // Works only for 2 and more arguments since the deduction from // single braced-init-list or single C-array argument prefers the multi-dimensional range // Only braced-init-lists of size 2 and 3 are allowed since dim_range_type may only // be constructed from 2 or 3 arguments template = 2>, typename = std::enable_if_t<(... && (Ns == 2 || Ns == 3))>> blocked_nd_range(const Value (&... dim)[Ns]) -> blocked_nd_range; // blocked_nd_range(const dim_range_type& dim0, const dim_range_type& dim1, ...) // while the arguments are passed as blocked_range objects of the same type template )>> blocked_nd_range(blocked_range, blocked_range...) -> blocked_nd_range; // blocked_nd_range(const value_type (&size)[N], size_type grainsize = 1) template blocked_nd_range(const Value (&)[N], typename blocked_nd_range::size_type = 1) -> blocked_nd_range; // blocked_nd_range(blocked_nd_range&, oneapi::tbb::split) template blocked_nd_range(blocked_nd_range, oneapi::tbb::split) -> blocked_nd_range; // blocked_nd_range(blocked_nd_range&, oneapi::tbb::proportional_split) template blocked_nd_range(blocked_nd_range, oneapi::tbb::proportional_split) -> blocked_nd_range; #endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT && __TBB_PREVIEW_BLOCKED_ND_RANGE_DEDUCTION_GUIDES } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::blocked_nd_range; } // namespace v1 } // namespace tbb #endif /* __TBB_blocked_nd_range_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/blocked_range.h ================================================ /* Copyright (c) 2005-2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_blocked_range_H #define __TBB_blocked_range_H #include #include "detail/_range_common.h" #include "detail/_namespace_injection.h" #include "version.h" namespace tbb { namespace detail { namespace d1 { /** \page range_req Requirements on range concept Class \c R implementing the concept of range must define: - \code R::R( const R& ); \endcode Copy constructor - \code R::~R(); \endcode Destructor - \code bool R::is_divisible() const; \endcode True if range can be partitioned into two subranges - \code bool R::empty() const; \endcode True if range is empty - \code R::R( R& r, split ); \endcode Split range \c r into two subranges. **/ //! A range over which to iterate. /** @ingroup algorithms */ template __TBB_requires(blocked_range_value) class blocked_range { public: //! Type of a value /** Called a const_iterator for sake of algorithms that need to treat a blocked_range as an STL container. */ using const_iterator = Value; //! Type for size of a range using size_type = std::size_t; //! Construct range over half-open interval [begin,end), with the given grainsize. blocked_range( Value begin_, Value end_, size_type grainsize_=1 ) : my_end(end_), my_begin(begin_), my_grainsize(grainsize_) { __TBB_ASSERT( my_grainsize>0, "grainsize must be positive" ); } //! Beginning of range. const_iterator begin() const { return my_begin; } //! One past last value in range. const_iterator end() const { return my_end; } //! 
Size of the range /** Unspecified if end() __TBB_requires(blocked_range_value && blocked_range_value) friend class blocked_range2d; template __TBB_requires(blocked_range_value && blocked_range_value && blocked_range_value) friend class blocked_range3d; template friend class blocked_nd_range_impl; }; } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::blocked_range; // Split types using detail::split; using detail::proportional_split; } // namespace v1 } // namespace tbb #endif /* __TBB_blocked_range_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/blocked_range2d.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_blocked_range2d_H #define __TBB_blocked_range2d_H #include #include "detail/_config.h" #include "detail/_namespace_injection.h" #include "detail/_range_common.h" #include "blocked_range.h" namespace tbb { namespace detail { namespace d1 { //! A 2-dimensional range that models the Range concept. /** @ingroup algorithms */ template __TBB_requires(blocked_range_value && blocked_range_value) class blocked_range2d { public: //! 
Type for size of an iteration range using row_range_type = blocked_range; using col_range_type = blocked_range; private: row_range_type my_rows; col_range_type my_cols; public: blocked_range2d( RowValue row_begin, RowValue row_end, typename row_range_type::size_type row_grainsize, ColValue col_begin, ColValue col_end, typename col_range_type::size_type col_grainsize ) : my_rows(row_begin,row_end,row_grainsize), my_cols(col_begin,col_end,col_grainsize) {} blocked_range2d( RowValue row_begin, RowValue row_end, ColValue col_begin, ColValue col_end ) : my_rows(row_begin,row_end), my_cols(col_begin,col_end) {} //! True if range is empty bool empty() const { // Range is empty if at least one dimension is empty. return my_rows.empty() || my_cols.empty(); } //! True if range is divisible into two pieces. bool is_divisible() const { return my_rows.is_divisible() || my_cols.is_divisible(); } blocked_range2d( blocked_range2d& r, split ) : my_rows(r.my_rows), my_cols(r.my_cols) { split split_obj; do_split(r, split_obj); } blocked_range2d( blocked_range2d& r, proportional_split& proportion ) : my_rows(r.my_rows), my_cols(r.my_cols) { do_split(r, proportion); } //! The rows of the iteration space const row_range_type& rows() const { return my_rows; } //! 
The columns of the iteration space const col_range_type& cols() const { return my_cols; } private: template void do_split( blocked_range2d& r, Split& split_obj ) { if ( my_rows.size()*double(my_cols.grainsize()) < my_cols.size()*double(my_rows.grainsize()) ) { my_cols.my_begin = col_range_type::do_split(r.my_cols, split_obj); } else { my_rows.my_begin = row_range_type::do_split(r.my_rows, split_obj); } } }; } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::blocked_range2d; } // namespace v1 } // namespace tbb #endif /* __TBB_blocked_range2d_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/blocked_range3d.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_blocked_range3d_H #define __TBB_blocked_range3d_H #include #include "detail/_config.h" #include "detail/_namespace_injection.h" #include "blocked_range.h" namespace tbb { namespace detail { namespace d1 { //! A 3-dimensional range that models the Range concept. /** @ingroup algorithms */ template __TBB_requires(blocked_range_value && blocked_range_value && blocked_range_value) class blocked_range3d { public: //! 
Type for size of an iteration range using page_range_type = blocked_range; using row_range_type = blocked_range; using col_range_type = blocked_range; private: page_range_type my_pages; row_range_type my_rows; col_range_type my_cols; public: blocked_range3d( PageValue page_begin, PageValue page_end, RowValue row_begin, RowValue row_end, ColValue col_begin, ColValue col_end ) : my_pages(page_begin,page_end), my_rows(row_begin,row_end), my_cols(col_begin,col_end) {} blocked_range3d( PageValue page_begin, PageValue page_end, typename page_range_type::size_type page_grainsize, RowValue row_begin, RowValue row_end, typename row_range_type::size_type row_grainsize, ColValue col_begin, ColValue col_end, typename col_range_type::size_type col_grainsize ) : my_pages(page_begin,page_end,page_grainsize), my_rows(row_begin,row_end,row_grainsize), my_cols(col_begin,col_end,col_grainsize) {} //! True if range is empty bool empty() const { // Range is empty if at least one dimension is empty. return my_pages.empty() || my_rows.empty() || my_cols.empty(); } //! True if range is divisible into two pieces. bool is_divisible() const { return my_pages.is_divisible() || my_rows.is_divisible() || my_cols.is_divisible(); } blocked_range3d( blocked_range3d& r, split split_obj ) : my_pages(r.my_pages), my_rows(r.my_rows), my_cols(r.my_cols) { do_split(r, split_obj); } blocked_range3d( blocked_range3d& r, proportional_split& proportion ) : my_pages(r.my_pages), my_rows(r.my_rows), my_cols(r.my_cols) { do_split(r, proportion); } //! The pages of the iteration space const page_range_type& pages() const { return my_pages; } //! The rows of the iteration space const row_range_type& rows() const { return my_rows; } //! 
The columns of the iteration space const col_range_type& cols() const { return my_cols; } private: template void do_split( blocked_range3d& r, Split& split_obj) { if ( my_pages.size()*double(my_rows.grainsize()) < my_rows.size()*double(my_pages.grainsize()) ) { if ( my_rows.size()*double(my_cols.grainsize()) < my_cols.size()*double(my_rows.grainsize()) ) { my_cols.my_begin = col_range_type::do_split(r.my_cols, split_obj); } else { my_rows.my_begin = row_range_type::do_split(r.my_rows, split_obj); } } else { if ( my_pages.size()*double(my_cols.grainsize()) < my_cols.size()*double(my_pages.grainsize()) ) { my_cols.my_begin = col_range_type::do_split(r.my_cols, split_obj); } else { my_pages.my_begin = page_range_type::do_split(r.my_pages, split_obj); } } } }; } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::blocked_range3d; } // namespace v1 } // namespace tbb #endif /* __TBB_blocked_range3d_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/blocked_rangeNd.h ================================================ /* Copyright (c) 2017-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_blocked_rangeNd_H #define __TBB_blocked_rangeNd_H #if !TBB_PREVIEW_BLOCKED_RANGE_ND #error Set TBB_PREVIEW_BLOCKED_RANGE_ND to include blocked_rangeNd.h #endif #include // std::any_of #include #include #include // std::is_same, std::enable_if #include "detail/_config.h" #include "detail/_template_helpers.h" // index_sequence, make_index_sequence #include "detail/_range_common.h" #include "blocked_range.h" namespace tbb { namespace detail { namespace d1 { /* The blocked_rangeNd_impl uses make_index_sequence to automatically generate a ctor with exactly N arguments of the type tbb::blocked_range. Such ctor provides an opportunity to use braced-init-list parameters to initialize each dimension. Use of parameters, whose representation is a braced-init-list, but they're not std::initializer_list or a reference to one, produces a non-deduced context within template argument deduction. NOTE: blocked_rangeNd must be exactly a templated alias to the blocked_rangeNd_impl (and not e.g. a derived class), otherwise it would need to declare its own ctor facing the same problem that the impl class solves. */ template> __TBB_requires(blocked_range_value) class blocked_rangeNd_impl; template __TBB_requires(blocked_range_value) class blocked_rangeNd_impl> { public: //! Type of a value. using value_type = Value; private: //! Helper type to construct range with N tbb::blocked_range objects. template using dim_type_helper = tbb::blocked_range; public: blocked_rangeNd_impl() = delete; //! Constructs N-dimensional range over N half-open intervals each represented as tbb::blocked_range. blocked_rangeNd_impl(const dim_type_helper&... args) : my_dims{ {args...} } {} //! Dimensionality of a range. static constexpr unsigned int ndims() { return N; } //! Range in certain dimension. 
const tbb::blocked_range& dim(unsigned int dimension) const { __TBB_ASSERT(dimension < N, "out of bound"); return my_dims[dimension]; } //------------------------------------------------------------------------ // Methods that implement Range concept //------------------------------------------------------------------------ //! True if at least one dimension is empty. bool empty() const { return std::any_of(my_dims.begin(), my_dims.end(), [](const tbb::blocked_range& d) { return d.empty(); }); } //! True if at least one dimension is divisible. bool is_divisible() const { return std::any_of(my_dims.begin(), my_dims.end(), [](const tbb::blocked_range& d) { return d.is_divisible(); }); } blocked_rangeNd_impl(blocked_rangeNd_impl& r, proportional_split proportion) : my_dims(r.my_dims) { do_split(r, proportion); } blocked_rangeNd_impl(blocked_rangeNd_impl& r, split proportion) : my_dims(r.my_dims) { do_split(r, proportion); } private: static_assert(N != 0, "zero dimensional blocked_rangeNd can't be constructed"); //! Ranges in each dimension. 
std::array, N> my_dims; template void do_split(blocked_rangeNd_impl& r, split_type proportion) { static_assert((std::is_same::value || std::is_same::value), "type of split object is incorrect"); __TBB_ASSERT(r.is_divisible(), "can't split not divisible range"); auto my_it = std::max_element(my_dims.begin(), my_dims.end(), [](const tbb::blocked_range& first, const tbb::blocked_range& second) { return (first.size() * second.grainsize() < second.size() * first.grainsize()); }); auto r_it = r.my_dims.begin() + (my_it - my_dims.begin()); my_it->my_begin = tbb::blocked_range::do_split(*r_it, proportion); // (!(my_it->my_begin < r_it->my_end) && !(r_it->my_end < my_it->my_begin)) equals to // (my_it->my_begin == r_it->my_end), but we can't use operator== due to Value concept __TBB_ASSERT(!(my_it->my_begin < r_it->my_end) && !(r_it->my_end < my_it->my_begin), "blocked_range has been split incorrectly"); } }; template using blocked_rangeNd = blocked_rangeNd_impl; } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::blocked_rangeNd; } // namespace v1 } // namespace tbb #endif /* __TBB_blocked_rangeNd_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/cache_aligned_allocator.h ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_cache_aligned_allocator_H #define __TBB_cache_aligned_allocator_H #include "detail/_utils.h" #include "detail/_namespace_injection.h" #include #include #if __TBB_CPP17_MEMORY_RESOURCE_PRESENT #include #endif namespace tbb { namespace detail { namespace r1 { TBB_EXPORT void* __TBB_EXPORTED_FUNC cache_aligned_allocate(std::size_t size); TBB_EXPORT void __TBB_EXPORTED_FUNC cache_aligned_deallocate(void* p); TBB_EXPORT std::size_t __TBB_EXPORTED_FUNC cache_line_size(); } namespace d1 { template class cache_aligned_allocator { public: using value_type = T; using propagate_on_container_move_assignment = std::true_type; //! Always defined for TBB containers (supported since C++17 for std containers) using is_always_equal = std::true_type; cache_aligned_allocator() = default; template cache_aligned_allocator(const cache_aligned_allocator&) noexcept {} //! Allocate space for n objects, starting on a cache/sector line. __TBB_nodiscard T* allocate(std::size_t n) { return static_cast(r1::cache_aligned_allocate(n * sizeof(value_type))); } //! Free block of memory that starts on a cache line void deallocate(T* p, std::size_t) { r1::cache_aligned_deallocate(p); } //! Largest value for which method allocate might succeed. std::size_t max_size() const noexcept { return (~std::size_t(0) - r1::cache_line_size()) / sizeof(value_type); } #if TBB_ALLOCATOR_TRAITS_BROKEN using pointer = value_type*; using const_pointer = const value_type*; using reference = value_type&; using const_reference = const value_type&; using difference_type = std::ptrdiff_t; using size_type = std::size_t; template struct rebind { using other = cache_aligned_allocator; }; template void construct(U *p, Args&&... 
args) { ::new (p) U(std::forward(args)...); } void destroy(pointer p) { p->~value_type(); } pointer address(reference x) const { return &x; } const_pointer address(const_reference x) const { return &x; } #endif // TBB_ALLOCATOR_TRAITS_BROKEN }; #if TBB_ALLOCATOR_TRAITS_BROKEN template<> class cache_aligned_allocator { public: using pointer = void*; using const_pointer = const void*; using value_type = void; template struct rebind { using other = cache_aligned_allocator; }; }; #endif template bool operator==(const cache_aligned_allocator&, const cache_aligned_allocator&) noexcept { return true; } #if !__TBB_CPP20_COMPARISONS_PRESENT template bool operator!=(const cache_aligned_allocator&, const cache_aligned_allocator&) noexcept { return false; } #endif #if __TBB_CPP17_MEMORY_RESOURCE_PRESENT //! C++17 memory resource wrapper to ensure cache line size alignment class cache_aligned_resource : public std::pmr::memory_resource { public: cache_aligned_resource() : cache_aligned_resource(std::pmr::get_default_resource()) {} explicit cache_aligned_resource(std::pmr::memory_resource* upstream) : m_upstream(upstream) {} std::pmr::memory_resource* upstream_resource() const { return m_upstream; } private: //! It is not known which upstream memory resource is set.
Use padding to guarantee alignment void* do_allocate(std::size_t bytes, std::size_t alignment) override { // TODO: make it common with tbb_allocator.cpp std::size_t cache_line_alignment = correct_alignment(alignment); std::size_t space = correct_size(bytes) + cache_line_alignment; std::uintptr_t base = reinterpret_cast(m_upstream->allocate(space)); __TBB_ASSERT(base != 0, "Upstream resource returned nullptr."); // Round up to the next cache line (align the base address) std::uintptr_t result = (base + cache_line_alignment) & ~(cache_line_alignment - 1); __TBB_ASSERT((result - base) >= sizeof(std::uintptr_t), "Can`t store a base pointer to the header"); __TBB_ASSERT(space - (result - base) >= bytes, "Not enough space for the storage"); // Record where the block actually starts. (reinterpret_cast(result))[-1] = base; return reinterpret_cast(result); } void do_deallocate(void* ptr, std::size_t bytes, std::size_t alignment) override { if (ptr) { // Recover where the block actually starts std::uintptr_t base = (reinterpret_cast(ptr))[-1]; m_upstream->deallocate(reinterpret_cast(base), correct_size(bytes) + correct_alignment(alignment)); } } bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override { if (this == &other) { return true; } #if __TBB_USE_OPTIONAL_RTTI const cache_aligned_resource* other_res = dynamic_cast(&other); return other_res && (upstream_resource() == other_res->upstream_resource()); #else return false; #endif } std::size_t correct_alignment(std::size_t alignment) { __TBB_ASSERT(tbb::detail::is_power_of_two(alignment), "Alignment is not a power of 2"); #if __TBB_CPP17_HW_INTERFERENCE_SIZE_PRESENT std::size_t cache_line_size = std::hardware_destructive_interference_size; #else std::size_t cache_line_size = r1::cache_line_size(); #endif return alignment < cache_line_size ? cache_line_size : alignment; } std::size_t correct_size(std::size_t bytes) { // Handle the case when a small size is requested.
There could be not // enough space to store the original pointer. return bytes < sizeof(std::uintptr_t) ? sizeof(std::uintptr_t) : bytes; } std::pmr::memory_resource* m_upstream; }; #endif // __TBB_CPP17_MEMORY_RESOURCE_PRESENT } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::cache_aligned_allocator; #if __TBB_CPP17_MEMORY_RESOURCE_PRESENT using detail::d1::cache_aligned_resource; #endif } // namespace v1 } // namespace tbb #endif /* __TBB_cache_aligned_allocator_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/collaborative_call_once.h ================================================ /* Copyright (c) 2021-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_collaborative_call_once_H #define __TBB_collaborative_call_once_H #include "task_arena.h" #include "task_group.h" #include namespace tbb { namespace detail { namespace d1 { #if _MSC_VER && !defined(__INTEL_COMPILER) // Suppress warning: structure was padded due to alignment specifier #pragma warning (push) #pragma warning (disable: 4324) #endif template class collaborative_call_stack_task : public task { const F& m_func; wait_context& m_wait_ctx; void finalize() { m_wait_ctx.release(); } task* execute(d1::execution_data&) override { task* res = d2::task_ptr_or_nullptr(m_func); finalize(); return res; } task* cancel(d1::execution_data&) override { finalize(); return nullptr; } public: collaborative_call_stack_task(const F& f, wait_context& wctx) : m_func(f), m_wait_ctx(wctx) {} }; constexpr std::uintptr_t collaborative_once_max_references = max_nfs_size; constexpr std::uintptr_t collaborative_once_references_mask = collaborative_once_max_references-1; class alignas(max_nfs_size) collaborative_once_runner : no_copy { struct storage_t { task_arena m_arena{ task_arena::attach{} }; wait_context m_wait_context{1}; }; std::atomic m_ref_count{0}; std::atomic m_is_ready{false}; // Storage with task_arena and wait_context must be initialized only by winner thread union { storage_t m_storage; }; template void isolated_execute(Fn f) { auto func = [f] { f(); // delegate_base requires bool returning functor while isolate_within_arena ignores the result return true; }; delegated_function delegate(func); r1::isolate_within_arena(delegate, reinterpret_cast(this)); } public: class lifetime_guard : no_copy { collaborative_once_runner& m_runner; public: lifetime_guard(collaborative_once_runner& r) : m_runner(r) { m_runner.m_ref_count++; } ~lifetime_guard() { m_runner.m_ref_count--; } }; collaborative_once_runner() {} ~collaborative_once_runner() { spin_wait_until_eq(m_ref_count, 0, std::memory_order_acquire); if (m_is_ready.load(std::memory_order_relaxed)) { 
m_storage.~storage_t(); } } std::uintptr_t to_bits() { return reinterpret_cast(this); } static collaborative_once_runner* from_bits(std::uintptr_t bits) { __TBB_ASSERT( (bits & collaborative_once_references_mask) == 0, "invalid pointer, last log2(max_nfs_size) bits must be zero" ); return reinterpret_cast(bits); } template void run_once(F&& f) { __TBB_ASSERT(!m_is_ready.load(std::memory_order_relaxed), "storage with task_arena and wait_context is already initialized"); // Initialize internal state new(&m_storage) storage_t(); m_storage.m_arena.execute([&] { isolated_execute([&] { task_group_context context{ task_group_context::bound, task_group_context::default_traits | task_group_context::concurrent_wait }; collaborative_call_stack_task t{ std::forward(f), m_storage.m_wait_context }; // Set the ready flag after entering the execute body to prevent // moonlighting threads from occupying all slots inside the arena. m_is_ready.store(true, std::memory_order_release); execute_and_wait(t, context, m_storage.m_wait_context, context); }); }); } void assist() noexcept { // Do not join the arena until the winner thread takes the slot spin_wait_while_eq(m_is_ready, false); m_storage.m_arena.execute([&] { isolated_execute([&] { // We do not want to get an exception from the user functor on moonlighting threads. // The exception is handled by the winner thread task_group_context stub_context; wait(m_storage.m_wait_context, stub_context); }); }); } }; class collaborative_once_flag : no_copy { enum state : std::uintptr_t { uninitialized, done, #if TBB_USE_ASSERT dead #endif }; std::atomic m_state{ state::uninitialized }; template friend void collaborative_call_once(collaborative_once_flag& flag, Fn&& f, Args&&...
args); void set_completion_state(std::uintptr_t runner_bits, std::uintptr_t desired) { std::uintptr_t expected = runner_bits; do { expected = runner_bits; // Possible inefficiency: when we start waiting, // some moonlighting threads might keep arriving, which prolongs our waiting. // Fortunately, there is a limited number of threads on the system, so the wait time is limited. spin_wait_until_eq(m_state, expected); } while (!m_state.compare_exchange_strong(expected, desired)); } template void do_collaborative_call_once(Fn&& f) { std::uintptr_t expected = m_state.load(std::memory_order_acquire); collaborative_once_runner runner; do { if (expected == state::uninitialized && m_state.compare_exchange_strong(expected, runner.to_bits())) { // Winner thread runner.run_once([&] { try_call([&] { std::forward(f)(); }).on_exception([&] { // Reset the state to uninitialized to allow other threads to try initialization again set_completion_state(runner.to_bits(), state::uninitialized); }); // We successfully executed the functor set_completion_state(runner.to_bits(), state::done); }); break; } else { // Moonlighting thread: we need to add a reference to the state to prolong runner lifetime. // However, the maximum number of references is limited by the runner alignment. // So, we use a CAS loop and spin_wait to guarantee that references never exceed "max_value". do { auto max_value = expected | collaborative_once_references_mask; expected = spin_wait_while_eq(m_state, max_value); // "expected > state::done" prevents storing values when the state is uninitialized or done } while (expected > state::done && !m_state.compare_exchange_strong(expected, expected + 1)); if (auto shared_runner = collaborative_once_runner::from_bits(expected & ~collaborative_once_references_mask)) { collaborative_once_runner::lifetime_guard guard{*shared_runner}; m_state.fetch_sub(1); // The moonlighting threads are not expected to handle exceptions from user functor.
// Therefore, no exception is expected from assist(). shared_runner->assist(); } } __TBB_ASSERT(m_state.load(std::memory_order_relaxed) != state::dead, "collaborative_once_flag has been prematurely destroyed"); } while (expected != state::done); } #if TBB_USE_ASSERT public: ~collaborative_once_flag() { m_state.store(state::dead, std::memory_order_relaxed); } #endif }; template void collaborative_call_once(collaborative_once_flag& flag, Fn&& fn, Args&&... args) { __TBB_ASSERT(flag.m_state.load(std::memory_order_relaxed) != collaborative_once_flag::dead, "collaborative_once_flag has been prematurely destroyed"); if (flag.m_state.load(std::memory_order_acquire) != collaborative_once_flag::done) { #if __TBB_GCC_PARAMETER_PACK_IN_LAMBDAS_BROKEN // Using stored_pack to suppress bug in GCC 4.8 // with parameter pack expansion in lambda auto stored_pack = save_pack(std::forward(args)...); auto func = [&] { call(std::forward(fn), std::move(stored_pack)); }; #else auto func = [&] { fn(std::forward(args)...); }; #endif flag.do_collaborative_call_once(func); } } #if _MSC_VER && !defined(__INTEL_COMPILER) #pragma warning (pop) // 4324 warning #endif } // namespace d1 } // namespace detail using detail::d1::collaborative_call_once; using detail::d1::collaborative_once_flag; } // namespace tbb #endif // __TBB_collaborative_call_once_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/combinable.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_combinable_H #define __TBB_combinable_H #include "detail/_namespace_injection.h" #include "enumerable_thread_specific.h" #include "cache_aligned_allocator.h" namespace tbb { namespace detail { namespace d1 { /** \name combinable **/ //@{ //! Thread-local storage with optional reduction /** @ingroup containers */ template class combinable { using my_alloc = typename tbb::cache_aligned_allocator; using my_ets_type = typename tbb::enumerable_thread_specific; my_ets_type my_ets; public: combinable() = default; template explicit combinable(Finit _finit) : my_ets(_finit) { } void clear() { my_ets.clear(); } T& local() { return my_ets.local(); } T& local(bool& exists) { return my_ets.local(exists); } // CombineFunc has signature T(T,T) or T(const T&, const T&) template T combine(CombineFunc f_combine) { return my_ets.combine(f_combine); } // CombineFunc has signature void(T) or void(const T&) template void combine_each(CombineFunc f_combine) { my_ets.combine_each(f_combine); } }; } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::combinable; } // inline namespace v1 } // namespace tbb #endif /* __TBB_combinable_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/concurrent_hash_map.h ================================================ /* Copyright (c) 2005-2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_concurrent_hash_map_H #define __TBB_concurrent_hash_map_H #include "detail/_namespace_injection.h" #include "detail/_utils.h" #include "detail/_assert.h" #include "detail/_allocator_traits.h" #include "detail/_containers_helpers.h" #include "detail/_template_helpers.h" #include "detail/_hash_compare.h" #include "detail/_range_common.h" #include "tbb_allocator.h" #include "spin_rw_mutex.h" #include #include #include #include #include // Need std::pair #include // Need std::memset namespace tbb { namespace detail { namespace d2 { #if __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS && __TBB_CPP20_CONCEPTS_PRESENT template concept ch_map_rw_scoped_lockable = rw_scoped_lockable && requires(const typename Mutex::scoped_lock& sl) { { sl.is_writer() } -> std::convertible_to; }; #endif template struct hash_map_node_base : no_copy { using mutex_type = MutexType; // Scoped lock type for mutex using scoped_type = typename MutexType::scoped_lock; // Next node in chain hash_map_node_base* next; mutex_type mutex; }; // Incompleteness flag value static void* const rehash_req_flag = reinterpret_cast(std::size_t(3)); // Rehashed empty bucket flag static void* const empty_rehashed_flag = reinterpret_cast(std::size_t(0)); template bool rehash_required( hash_map_node_base* node_ptr ) { return reinterpret_cast(node_ptr) == rehash_req_flag; } #if TBB_USE_ASSERT template bool empty_rehashed( hash_map_node_base* node_ptr ) { return reinterpret_cast(node_ptr) == empty_rehashed_flag; } #endif // base class of concurrent_hash_map template class hash_map_base { public: using size_type = std::size_t; using hashcode_type = std::size_t; using segment_index_type = std::size_t; using node_base = hash_map_node_base; struct bucket : no_copy { using mutex_type = MutexType; using scoped_type = typename mutex_type::scoped_lock; bucket() : node_list(nullptr) {} bucket( node_base* ptr ) : 
node_list(ptr) {} mutex_type mutex; std::atomic node_list; }; using allocator_type = Allocator; using allocator_traits_type = tbb::detail::allocator_traits; using bucket_allocator_type = typename allocator_traits_type::template rebind_alloc; using bucket_allocator_traits = tbb::detail::allocator_traits; // Count of segments stored in the embedded block static constexpr size_type embedded_block = 1; // Count of buckets in the embedded segments static constexpr size_type embedded_buckets = 1 << embedded_block; // Count of segments in the first block static constexpr size_type first_block = 8; //including embedded_block. perfect with bucket size 16, so the allocations are power of 4096 // Size of the segment pointer table (number of bits in segment_index_type) static constexpr size_type pointers_per_table = sizeof(segment_index_type) * 8; // one segment per bit using segment_ptr_type = bucket*; using atomic_segment_type = std::atomic; using segments_table_type = atomic_segment_type[pointers_per_table]; hash_map_base( const allocator_type& alloc ) : my_allocator(alloc), my_mask(embedded_buckets - 1), my_size(0) { for (size_type i = 0; i != embedded_buckets; ++i) { my_embedded_segment[i].node_list.store(nullptr, std::memory_order_relaxed); } for (size_type segment_index = 0; segment_index < pointers_per_table; ++segment_index) { auto argument = segment_index < embedded_block ?
my_embedded_segment + segment_base(segment_index) : nullptr; my_table[segment_index].store(argument, std::memory_order_relaxed); } __TBB_ASSERT( embedded_block <= first_block, "The first block number must include embedded blocks"); } // segment index of given index in the array static segment_index_type segment_index_of( size_type index ) { return segment_index_type(tbb::detail::log2( index|1 )); } // the first array index of given segment static segment_index_type segment_base( segment_index_type k ) { return (segment_index_type(1) << k & ~segment_index_type(1)); } // segment size except for k == 0 static size_type segment_size( segment_index_type k ) { return size_type(1) << k; // fake value for k==0 } // true if ptr is valid pointer static bool is_valid( void* ptr ) { return reinterpret_cast(ptr) > uintptr_t(63); } template void init_buckets_impl( segment_ptr_type ptr, size_type sz, const Args&... args ) { for (size_type i = 0; i < sz; ++i) { bucket_allocator_traits::construct(my_allocator, ptr + i, args...); } } // Initialize buckets void init_buckets( segment_ptr_type ptr, size_type sz, bool is_initial ) { if (is_initial) { init_buckets_impl(ptr, sz); } else { init_buckets_impl(ptr, sz, reinterpret_cast(rehash_req_flag)); } } // Add node n to bucket b static void add_to_bucket( bucket* b, node_base* n ) { __TBB_ASSERT(!rehash_required(b->node_list.load(std::memory_order_relaxed)), nullptr); n->next = b->node_list.load(std::memory_order_relaxed); b->node_list.store(n, std::memory_order_relaxed); // its under lock and flag is set } const bucket_allocator_type& get_allocator() const { return my_allocator; } bucket_allocator_type& get_allocator() { return my_allocator; } // Enable segment void enable_segment( segment_index_type k, bool is_initial = false ) { __TBB_ASSERT( k, "Zero segment must be embedded" ); size_type sz; __TBB_ASSERT( !is_valid(my_table[k].load(std::memory_order_relaxed)), "Wrong concurrent assignment"); if (k >= first_block) { sz = 
segment_size(k); segment_ptr_type ptr = nullptr; try_call( [&] { ptr = bucket_allocator_traits::allocate(my_allocator, sz); } ).on_exception( [&] { my_table[k].store(nullptr, std::memory_order_relaxed); }); __TBB_ASSERT(ptr, nullptr); init_buckets(ptr, sz, is_initial); my_table[k].store(ptr, std::memory_order_release); sz <<= 1;// double it to get entire capacity of the container } else { // the first block __TBB_ASSERT( k == embedded_block, "Wrong segment index" ); sz = segment_size(first_block); segment_ptr_type ptr = nullptr; try_call( [&] { ptr = bucket_allocator_traits::allocate(my_allocator, sz - embedded_buckets); } ).on_exception( [&] { my_table[k].store(nullptr, std::memory_order_relaxed); }); __TBB_ASSERT(ptr, nullptr); init_buckets(ptr, sz - embedded_buckets, is_initial); ptr -= segment_base(embedded_block); for(segment_index_type i = embedded_block; i < first_block; i++) // calc the offsets my_table[i].store(ptr + segment_base(i), std::memory_order_release); } my_mask.store(sz-1, std::memory_order_release); } void delete_segment( segment_index_type s ) { segment_ptr_type buckets_ptr = my_table[s].load(std::memory_order_relaxed); size_type sz = segment_size( s ? 
s : 1 ); size_type deallocate_size = 0; if (s >= first_block) { // the first segment or the next deallocate_size = sz; } else if (s == embedded_block && embedded_block != first_block) { deallocate_size = segment_size(first_block) - embedded_buckets; } for (size_type i = 0; i < deallocate_size; ++i) { bucket_allocator_traits::destroy(my_allocator, buckets_ptr + i); } if (deallocate_size != 0) { bucket_allocator_traits::deallocate(my_allocator, buckets_ptr, deallocate_size); } if (s >= embedded_block) my_table[s].store(nullptr, std::memory_order_relaxed); } // Get bucket by (masked) hashcode bucket *get_bucket( hashcode_type h ) const noexcept { segment_index_type s = segment_index_of( h ); h -= segment_base(s); segment_ptr_type seg = my_table[s].load(std::memory_order_acquire); __TBB_ASSERT( is_valid(seg), "hashcode must be cut by valid mask for allocated segments" ); return &seg[h]; } // detail serial rehashing helper void mark_rehashed_levels( hashcode_type h ) noexcept { segment_index_type s = segment_index_of( h ); while (segment_ptr_type seg = my_table[++s].load(std::memory_order_relaxed)) if (rehash_required(seg[h].node_list.load(std::memory_order_relaxed))) { seg[h].node_list.store(reinterpret_cast(empty_rehashed_flag), std::memory_order_relaxed); mark_rehashed_levels( h + ((hashcode_type)1<node_list.load(std::memory_order_acquire))) { return true; } } return false; } // Insert a node and check for load factor. @return segment index to enable. 
segment_index_type insert_new_node( bucket *b, node_base *n, hashcode_type mask ) { size_type sz = ++my_size; // prefix form is to enforce allocation after the first item inserted add_to_bucket( b, n ); // check load factor if( sz >= mask ) { // TODO: add custom load_factor segment_index_type new_seg = tbb::detail::log2( mask+1 ); //optimized segment_index_of __TBB_ASSERT( is_valid(my_table[new_seg-1].load(std::memory_order_relaxed)), "new allocations must not publish new mask until segment has allocated"); static const segment_ptr_type is_allocating = segment_ptr_type(2); segment_ptr_type disabled = nullptr; if (!(my_table[new_seg].load(std::memory_order_acquire)) && my_table[new_seg].compare_exchange_strong(disabled, is_allocating)) return new_seg; // The value must be processed } return 0; } // Prepare enough segments for number of buckets void reserve(size_type buckets) { if( !buckets-- ) return; bool is_initial = !my_size.load(std::memory_order_relaxed); for (size_type m = my_mask.load(std::memory_order_relaxed); buckets > m; m = my_mask.load(std::memory_order_relaxed)) { enable_segment( segment_index_of( m+1 ), is_initial ); } } // Swap hash_map_bases void internal_swap_content(hash_map_base &table) { using std::swap; swap_atomics_relaxed(my_mask, table.my_mask); swap_atomics_relaxed(my_size, table.my_size); for(size_type i = 0; i < embedded_buckets; i++) { auto temp = my_embedded_segment[i].node_list.load(std::memory_order_relaxed); my_embedded_segment[i].node_list.store(table.my_embedded_segment[i].node_list.load(std::memory_order_relaxed), std::memory_order_relaxed); table.my_embedded_segment[i].node_list.store(temp, std::memory_order_relaxed); } for(size_type i = embedded_block; i < pointers_per_table; i++) { auto temp = my_table[i].load(std::memory_order_relaxed); my_table[i].store(table.my_table[i].load(std::memory_order_relaxed), std::memory_order_relaxed); table.my_table[i].store(temp, std::memory_order_relaxed); } } void internal_move(hash_map_base&& 
other) { /* Copies the mask, size, embedded bucket lists and non-embedded segment pointers from other into *this, then resets other's mask to embedded_buckets - 1, its size to 0, and its bucket lists and non-embedded segment pointers to nullptr. Every atomic access uses memory_order_relaxed. */ my_mask.store(other.my_mask.load(std::memory_order_relaxed), std::memory_order_relaxed); other.my_mask.store(embedded_buckets - 1, std::memory_order_relaxed); my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); other.my_size.store(0, std::memory_order_relaxed); for (size_type i = 0; i < embedded_buckets; ++i) { /* Explicit relaxed load: passing the atomic itself would go through its implicit conversion, which is a seq_cst load, unlike every other access in this function. */ my_embedded_segment[i].node_list.store(other.my_embedded_segment[i].node_list.load(std::memory_order_relaxed), std::memory_order_relaxed); other.my_embedded_segment[i].node_list.store(nullptr, std::memory_order_relaxed); } for (size_type i = embedded_block; i < pointers_per_table; ++i) { my_table[i].store(other.my_table[i].load(std::memory_order_relaxed), std::memory_order_relaxed); other.my_table[i].store(nullptr, std::memory_order_relaxed); } } protected: bucket_allocator_type my_allocator; // Hash mask = sum of allocated segment sizes - 1 std::atomic my_mask; // Size of container in stored items std::atomic my_size; // It must be in separate cache line from my_mask due to performance effects // Zero segment bucket my_embedded_segment[embedded_buckets]; // Segment pointers table. Also prevents false sharing between my_mask and my_size /* NOTE(review): my_table is declared after both my_mask and my_size here, so it does not sit between them; confirm whether the false-sharing remark is stale. */ segments_table_type my_table; }; template class hash_map_range; // Meets requirements of a forward iterator for STL // Value is either the T or const T type of the container.
template class hash_map_iterator { using map_type = Container; using node = typename Container::node; using map_base = typename Container::base_type; using node_base = typename map_base::node_base; using bucket = typename map_base::bucket; public: using value_type = Value; using size_type = typename Container::size_type; using difference_type = typename Container::difference_type; using pointer = value_type*; using reference = value_type&; using iterator_category = std::forward_iterator_tag; // Construct undefined iterator hash_map_iterator(): my_map(), my_index(), my_bucket(), my_node() {} hash_map_iterator( const hash_map_iterator& other ) : my_map(other.my_map), my_index(other.my_index), my_bucket(other.my_bucket), my_node(other.my_node) {} hash_map_iterator& operator=( const hash_map_iterator& other ) { my_map = other.my_map; my_index = other.my_index; my_bucket = other.my_bucket; my_node = other.my_node; return *this; } Value& operator*() const { __TBB_ASSERT( map_base::is_valid(my_node), "iterator uninitialized or at end of container?" 
); return my_node->value(); } Value* operator->() const {return &operator*();} hash_map_iterator& operator++() { my_node = static_cast( my_node->next ); if( !my_node ) advance_to_next_bucket(); return *this; } // Post increment hash_map_iterator operator++(int) { hash_map_iterator old(*this); operator++(); return old; } private: template friend bool operator==( const hash_map_iterator& i, const hash_map_iterator& j ); template friend bool operator!=( const hash_map_iterator& i, const hash_map_iterator& j ); template friend ptrdiff_t operator-( const hash_map_iterator& i, const hash_map_iterator& j ); template friend class hash_map_iterator; template friend class hash_map_range; void advance_to_next_bucket() { // TODO?: refactor to iterator_base class size_t k = my_index+1; __TBB_ASSERT( my_bucket, "advancing an invalid iterator?"); while (k <= my_map->my_mask.load(std::memory_order_relaxed)) { // Following test uses 2's-complement wizardry if( k&(k-2) ) // not the beginning of a segment ++my_bucket; else my_bucket = my_map->get_bucket( k ); node_base *n = my_bucket->node_list.load(std::memory_order_relaxed); if( map_base::is_valid(n) ) { my_node = static_cast(n); my_index = k; return; } ++k; } my_bucket = nullptr; my_node = nullptr; my_index = k; // the end } template __TBB_requires(tbb::detail::hash_compare && ch_map_rw_scoped_lockable) #else > __TBB_requires(tbb::detail::hash_compare) #endif friend class concurrent_hash_map; hash_map_iterator( const Container &map, std::size_t index, const bucket *b, node_base *n ) : my_map(&map), my_index(index), my_bucket(b), my_node(nullptr) { // Cannot directly initialize to n, because it could be an invalid node pointer (e.g., when // setting a midpoint for a 1-element range). If it is, try one from a subsequent bucket. if( map_base::is_valid(n) ) my_node = static_cast(n); else if( b ) advance_to_next_bucket(); } // concurrent_hash_map over which we are iterating. 
const Container *my_map; // Index in hash table for current item size_t my_index; // Pointer to bucket const bucket* my_bucket; // Pointer to node that has current item node* my_node; }; template bool operator==( const hash_map_iterator& i, const hash_map_iterator& j ) { return i.my_node == j.my_node && i.my_map == j.my_map; } template bool operator!=( const hash_map_iterator& i, const hash_map_iterator& j ) { return i.my_node != j.my_node || i.my_map != j.my_map; } // Range class used with concurrent_hash_map template class hash_map_range { using map_type = typename Iterator::map_type; public: // Type for size of a range using size_type = std::size_t; using value_type = typename Iterator::value_type; using reference = typename Iterator::reference; using difference_type = typename Iterator::difference_type; using iterator = Iterator; // True if range is empty. bool empty() const { return my_begin == my_end; } // True if range can be partitioned into two subranges. bool is_divisible() const { return my_midpoint != my_end; } // Split range. hash_map_range( hash_map_range& r, split ) : my_end(r.my_end), my_grainsize(r.my_grainsize) { r.my_end = my_begin = r.my_midpoint; __TBB_ASSERT( !empty(), "Splitting despite the range is not divisible" ); __TBB_ASSERT( !r.empty(), "Splitting despite the range is not divisible" ); set_midpoint(); r.set_midpoint(); } // Init range with container and grainsize specified hash_map_range( const map_type &map, size_type grainsize_ = 1 ) : my_begin( Iterator( map, 0, map.my_embedded_segment, map.my_embedded_segment->node_list.load(std::memory_order_relaxed) ) ), my_end( Iterator( map, map.my_mask.load(std::memory_order_relaxed) + 1, nullptr, nullptr ) ), my_grainsize( grainsize_ ) { __TBB_ASSERT( grainsize_>0, "grainsize must be positive" ); set_midpoint(); } Iterator begin() const { return my_begin; } Iterator end() const { return my_end; } // The grain size for this range. 
size_type grainsize() const { return my_grainsize; } private: Iterator my_begin; Iterator my_end; mutable Iterator my_midpoint; size_t my_grainsize; // Set my_midpoint to point approximately half way between my_begin and my_end. void set_midpoint() const; template friend class hash_map_range; }; template void hash_map_range::set_midpoint() const { // Split by groups of nodes size_t m = my_end.my_index-my_begin.my_index; if( m > my_grainsize ) { m = my_begin.my_index + m/2u; auto b = my_begin.my_map->get_bucket(m); my_midpoint = Iterator(*my_begin.my_map,m,b,b->node_list.load(std::memory_order_relaxed)); } else { my_midpoint = my_end; } __TBB_ASSERT( my_begin.my_index <= my_midpoint.my_index, "my_begin is after my_midpoint" ); __TBB_ASSERT( my_midpoint.my_index <= my_end.my_index, "my_midpoint is after my_end" ); __TBB_ASSERT( my_begin != my_midpoint || my_begin == my_end, "[my_begin, my_midpoint) range should not be empty" ); } template , typename Allocator = tbb_allocator> #if __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS , typename MutexType = spin_rw_mutex > __TBB_requires(tbb::detail::hash_compare && ch_map_rw_scoped_lockable) #else > __TBB_requires(tbb::detail::hash_compare) #endif class concurrent_hash_map #if __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS : protected hash_map_base #else : protected hash_map_base #endif { template friend class hash_map_iterator; template friend class hash_map_range; using allocator_traits_type = tbb::detail::allocator_traits; #if __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS using base_type = hash_map_base; #else using base_type = hash_map_base; #endif public: using key_type = Key; using mapped_type = T; // type_identity is needed to disable implicit deduction guides for std::initializer_list constructors // and copy/move constructor with explicit allocator argument using allocator_type = tbb::detail::type_identity_t; using hash_compare_type = tbb::detail::type_identity_t; using value_type = std::pair; using size_type = typename 
base_type::size_type;
// NOTE(review): several template parameter/argument lists in this part of the class are
// empty (bare `template`, `static_cast(n)`, `std::is_same::value`, ...). They look like
// they were lost when this text was extracted - confirm against upstream oneTBB.
using difference_type = std::ptrdiff_t;
#if __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS
using mutex_type = MutexType;
#endif
using pointer = typename allocator_traits_type::pointer;
using const_pointer = typename allocator_traits_type::const_pointer;
using reference = value_type&;
using const_reference = const value_type&;
using iterator = hash_map_iterator;
using const_iterator = hash_map_iterator;
using range_type = hash_map_range;
using const_range_type = hash_map_range;

protected:
static_assert(std::is_same::value,
    "value_type of the container must be the same as its allocator's");

friend class const_accessor;
class node;
using segment_index_type = typename base_type::segment_index_type;
using segment_ptr_type = typename base_type::segment_ptr_type;
using node_base = typename base_type::node_base;
using bucket = typename base_type::bucket;
using hashcode_type = typename base_type::hashcode_type;
using bucket_allocator_type = typename base_type::bucket_allocator_type;
using node_allocator_type = typename base_type::allocator_traits_type::template rebind_alloc;
using node_allocator_traits = tbb::detail::allocator_traits;
// Hash/equality functor used for every key operation in this class.
hash_compare_type my_hash_compare;

// List node holding one value_type.
// The value lives in an anonymous union, so constructing/destroying a node does not
// construct/destroy the value; create_node() and delete_node() do that explicitly.
class node : public node_base {
public:
    node() {}
    ~node() {}
    // Address of the (possibly not yet constructed) value.
    pointer storage() { return &my_value; }
    value_type& value() { return *storage(); }
private:
    union {
        value_type my_value;
    };
};

// Destroy the stored value, then the node object itself, and deallocate the node
// through a node allocator built from this container's allocator.
void delete_node( node_base *n ) {
    node_allocator_type node_allocator(this->get_allocator());
    node_allocator_traits::destroy(node_allocator, static_cast(n)->storage());
    node_allocator_traits::destroy(node_allocator, static_cast(n));
    node_allocator_traits::deallocate(node_allocator, static_cast(n), 1);
}

// Allocate one node, construct the node object and then its value from args.
// Returns the new node; the guard's cleanup (destroy + deallocate) is dismissed only
// after both constructions finished.
// NOTE(review): make_raii_guard is not defined here - presumably it runs the lambda on
// scope exit unless dismiss() was called; confirm.
template
static node* create_node(bucket_allocator_type& allocator, Args&&... args) {
    node_allocator_type node_allocator(allocator);
    node* node_ptr = node_allocator_traits::allocate(node_allocator, 1);
    auto guard = make_raii_guard([&] {
        node_allocator_traits::destroy(node_allocator, node_ptr);
        node_allocator_traits::deallocate(node_allocator, node_ptr, 1);
    });

    node_allocator_traits::construct(node_allocator, node_ptr);
    node_allocator_traits::construct(node_allocator, node_ptr->storage(), std::forward(args)...);
    guard.dismiss();
    return node_ptr;
}

// Node factory: value is built from (key, copy of *t).
static node* allocate_node_copy_construct(bucket_allocator_type& allocator, const Key &key, const T * t){
    return create_node(allocator, key, *t);
}

// Node factory: value is built from (key, *t moved-from); constness of t is cast away.
static node* allocate_node_move_construct(bucket_allocator_type& allocator, const Key &key, const T * t){
    return create_node(allocator, key, std::move(*const_cast(t)));
}

// Node factory: value is built from key and a default-constructed mapped object.
// The T pointer argument is ignored.
template
static node* allocate_node_default_construct(bucket_allocator_type& allocator, const K &key, const T * ){
    // Emplace construct an empty T object inside the pair
    return create_node(allocator, std::piecewise_construct,
                       std::forward_as_tuple(key), std::forward_as_tuple());
}

// Placeholder factory passed to lookup() by operations that never insert; asserts if called.
static node* do_not_allocate_node(bucket_allocator_type& , const Key &, const T * ){
    __TBB_ASSERT(false,"this dummy function should not be called");
    return nullptr;
}

// Walk bucket b's node list until a node whose key compares equal to `key` is found or
// the list ends (is_valid() becomes false). Returns the pointer the walk stopped at.
// No locking is done here.
template
node *search_bucket( const K &key, bucket *b ) const {
    node *n = static_cast( b->node_list.load(std::memory_order_relaxed) );
    while (this->is_valid(n) && !my_hash_compare.equal(key, n->value().first))
        n = static_cast( n->next );
    __TBB_ASSERT(!rehash_required(n), "Search can be executed only for rehashed bucket");
    return n;
}

// bucket accessor is to find, rehash, acquire a lock, and access a bucket
class bucket_accessor : public bucket::scoped_type {
    bucket *my_b;
public:
    bucket_accessor( concurrent_hash_map *base, const hashcode_type h, bool writer = false ) { acquire( base, h, writer ); }
    // find a bucket by masked hashcode, optionally rehash, and acquire the lock
    inline void acquire( concurrent_hash_map *base, const hashcode_type h, bool writer = false ) {
        my_b = base->get_bucket( h );
        // TODO: actually, notification is unnecessary here, just hiding double-check
        // If the bucket is still marked for rehashing and the write lock could be taken via
        // try_acquire, re-check the flag under the lock and rehash; in that case the lock
        // stays held in write mode regardless of `writer`.
        if (rehash_required(my_b->node_list.load(std::memory_order_acquire))
            && bucket::scoped_type::try_acquire( my_b->mutex, /*write=*/true ) )
        {
            if (rehash_required(my_b->node_list.load(std::memory_order_relaxed))) base->rehash_bucket(my_b, h); // recursive rehashing
        }
        else bucket::scoped_type::acquire( my_b->mutex, writer );
        __TBB_ASSERT(!rehash_required(my_b->node_list.load(std::memory_order_relaxed)), nullptr);
    }

    // get bucket pointer
    bucket *operator() () { return my_b; }
};

// TODO refactor to hash_base
// Fill b_new (bucket index `hash`) from its parent bucket: every node of the parent whose
// hash masked with the new bucket's full mask equals `hash` is unlinked and added to b_new.
void rehash_bucket( bucket *b_new, const hashcode_type hash ) {
    __TBB_ASSERT( hash > 1, "The lowermost buckets can't be rehashed" );
    b_new->node_list.store(reinterpret_cast(empty_rehashed_flag), std::memory_order_release); // mark rehashed
    hashcode_type mask = (hashcode_type(1) << tbb::detail::log2(hash)) - 1; // get parent mask from the topmost bit
    // Locks the parent bucket (read mode by default; may itself trigger rehashing of the parent).
    bucket_accessor b_old( this, hash & mask );

    mask = (mask<<1) | 1; // get full mask for new bucket
    __TBB_ASSERT( (mask&(mask+1))==0 && (hash & mask) == hash, nullptr );
restart:
    node_base* prev = nullptr;
    node_base* curr = b_old()->node_list.load(std::memory_order_acquire);
    while (this->is_valid(curr)) {
        hashcode_type curr_node_hash = my_hash_compare.hash(static_cast(curr)->value().first);

        if ((curr_node_hash & mask) == hash) {
            // The list is modified only under the write lock on the parent bucket.
            if (!b_old.is_writer()) {
                if (!b_old.upgrade_to_writer()) {
                    goto restart; // node ptr can be invalid due to concurrent erase
                }
            }
            node_base* next = curr->next;
            // exclude from b_old
            if (prev == nullptr) {
                b_old()->node_list.store(curr->next, std::memory_order_relaxed);
            } else {
                prev->next = curr->next;
            }
            this->add_to_bucket(b_new, curr);
            curr = next;
        } else {
            prev = curr;
            curr = curr->next;
        }
    }
}

template
using hash_compare_is_transparent = dependent_bool, U>;

public:

class accessor;
// Combines data access, locking, and garbage collection.
class const_accessor : private node::scoped_type /*which derived from no_copy*/ {
    // NOTE(review): both branches of the #if below read identically here; the template
    // arguments that presumably distinguished them seem to have been lost in extraction.
#if __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS
    friend class concurrent_hash_map;
#else
    friend class concurrent_hash_map;
#endif
    friend class accessor;
public:
    // Type of value
    using value_type = const typename concurrent_hash_map::value_type;

    // True if result is empty.
    bool empty() const { return !my_node; }

    // Release the scoped lock (only if an item is referenced) and set the accessor to empty.
    void release() {
        if( my_node ) {
            node::scoped_type::release();
            my_node = nullptr;
        }
    }

    // Return reference to associated value in hash table.
    // Asserts that the accessor is not empty.
    const_reference operator*() const {
        __TBB_ASSERT( my_node, "attempt to dereference empty accessor" );
        return my_node->value();
    }

    // Return pointer to associated value in hash table.
    const_pointer operator->() const {
        return &operator*();
    }

    // Create empty result
    const_accessor() : my_node(nullptr), my_hash() {}

    // Destroy result after releasing the underlying reference.
    ~const_accessor() {
        my_node = nullptr; // scoped lock's release() is called in its destructor
    }
protected:
    bool is_writer() { return node::scoped_type::is_writer(); }
    // Node currently referenced, or nullptr when the accessor is empty.
    node *my_node;
    // Hash code stored by lookup() together with my_node; read back by exclude().
    hashcode_type my_hash;
};

// Allows write access to elements and combines data access, locking, and garbage collection.
class accessor: public const_accessor {
public:
    // Type of value
    using value_type = typename concurrent_hash_map::value_type;

    // Return reference to associated value in hash table.
    // Asserts that the accessor is not empty.
    reference operator*() const {
        __TBB_ASSERT( this->my_node, "attempt to dereference empty accessor" );
        return this->my_node->value();
    }

    // Return pointer to associated value in hash table.
    pointer operator->() const {
        return &operator*();
    }
};

// Construct empty table using the given hash/compare object and allocator.
explicit concurrent_hash_map( const hash_compare_type& compare, const allocator_type& a = allocator_type() )
    : base_type(a)
    , my_hash_compare(compare)
{}

// Construct empty table with default-constructed hash/compare object and allocator.
concurrent_hash_map() : concurrent_hash_map(hash_compare_type()) {}

// Construct empty table with the given allocator.
explicit concurrent_hash_map( const allocator_type& a )
    : concurrent_hash_map(hash_compare_type(), a)
{}

// Construct empty table with n preallocated buckets. This number serves also as initial concurrency level.
concurrent_hash_map( size_type n, const allocator_type &a = allocator_type() )
    : concurrent_hash_map(a)
{
    this->reserve(n);
}

// Same as above, with an explicit hash/compare object.
concurrent_hash_map( size_type n, const hash_compare_type& compare, const allocator_type& a = allocator_type() )
    : concurrent_hash_map(compare, a)
{
    this->reserve(n);
}

// Copy constructor
// The allocator comes from select_on_container_copy_construction; if copying throws,
// the partially built table is cleared.
// NOTE(review): my_hash_compare is default-constructed by the delegated constructor, not
// copied from `table` - confirm this is intended.
concurrent_hash_map( const concurrent_hash_map &table )
    : concurrent_hash_map(node_allocator_traits::select_on_container_copy_construction(table.get_allocator()))
{
    try_call( [&] {
        internal_copy(table);
    }).on_exception( [&] {
        this->clear();
    });
}

// Copy constructor with an explicit allocator; clears the table if copying throws.
concurrent_hash_map( const concurrent_hash_map &table, const allocator_type &a)
    : concurrent_hash_map(a)
{
    try_call( [&] {
        internal_copy(table);
    }).on_exception( [&] {
        this->clear();
    });
}

// Move constructor
concurrent_hash_map( concurrent_hash_map &&table )
    : concurrent_hash_map(std::move(table.get_allocator()))
{
    this->internal_move(std::move(table));
}

// Move constructor with an explicit allocator.
// Dispatches on is_always_equal to internal_move_construct_with_allocator().
concurrent_hash_map( concurrent_hash_map &&table, const allocator_type &a )
    : concurrent_hash_map(a)
{
    using is_equal_type = typename node_allocator_traits::is_always_equal;
    internal_move_construct_with_allocator(std::move(table), a, is_equal_type());
}

// Construction with copying iteration range and given allocator instance
// std::distance(first, last) is used as the reservation hint; the table is cleared if
// copying throws.
template
concurrent_hash_map( I first, I last, const allocator_type &a = allocator_type() )
    : concurrent_hash_map(a)
{
    try_call( [&] {
        internal_copy(first, last, std::distance(first, last));
    }).on_exception( [&] {
        this->clear();
    });
}
// Construction with copying iteration range, given hash/compare object and allocator.
// The table is cleared if copying throws.
// NOTE(review): the template parameter list after `template` is empty here - apparently
// lost in extraction; confirm against upstream oneTBB.
template
concurrent_hash_map( I first, I last, const hash_compare_type& compare, const allocator_type& a = allocator_type() )
    : concurrent_hash_map(compare, a)
{
    try_call( [&] {
        internal_copy(first, last, std::distance(first, last));
    }).on_exception( [&] {
        this->clear();
    });
}

// Construction from an initializer list; the table is cleared if copying throws.
concurrent_hash_map( std::initializer_list il, const hash_compare_type& compare = hash_compare_type(), const allocator_type& a = allocator_type() )
    : concurrent_hash_map(compare, a)
{
    try_call( [&] {
        internal_copy(il.begin(), il.end(), il.size());
    }).on_exception( [&] {
        this->clear();
    });
}

// Construction from an initializer list with a default-constructed hash/compare object.
concurrent_hash_map( std::initializer_list il, const allocator_type& a )
    : concurrent_hash_map(il, hash_compare_type(), a) {}

// Copy assignment
// Self-assignment is a no-op. Otherwise the table is cleared, the allocator is handled by
// copy_assign_allocators(), and the items of `table` are copied in.
concurrent_hash_map& operator=( const concurrent_hash_map &table ) {
    if( this != &table ) {
        clear();
        copy_assign_allocators(this->my_allocator, table.my_allocator);
        internal_copy(table);
    }
    return *this;
}

// Move Assignment
// Self-assignment is a no-op. The internal_move_assign() overload is selected by a
// disjunction of allocator traits.
// NOTE(review): disjunction() has no visible arguments here - presumably it combined
// pocma_type and is_equal_type declared just above; confirm.
concurrent_hash_map& operator=( concurrent_hash_map &&table ) {
    if( this != &table ) {
        using pocma_type = typename node_allocator_traits::propagate_on_container_move_assignment;
        using is_equal_type = typename node_allocator_traits::is_always_equal;
        move_assign_allocators(this->my_allocator, table.my_allocator);
        internal_move_assign(std::move(table), tbb::detail::disjunction());
    }
    return *this;
}

// Assignment from an initializer list: clears the table, then copies the list's items.
concurrent_hash_map& operator=( std::initializer_list il ) {
    clear();
    internal_copy(il.begin(), il.end(), il.size());
    return *this;
}

// Rehashes and optionally resizes the whole table.
/** Useful to optimize performance before or after concurrent operations.
    Also enables using of find() and count() concurrent methods in serial context. */
void rehash(size_type sz = 0) {
    this->reserve(sz); // TODO: add reduction of number of buckets as well
    hashcode_type mask = this->my_mask.load(std::memory_order_relaxed);
    hashcode_type b = (mask+1)>>1; // size or first index of the last segment
    __TBB_ASSERT((b&(b-1))==0, nullptr); // zero or power of 2
    bucket *bp = this->get_bucket( b ); // only the last segment should be scanned for rehashing
    for(; b <= mask; b++, bp++ ) {
        node_base *n = bp->node_list.load(std::memory_order_relaxed);
        __TBB_ASSERT( this->is_valid(n) || empty_rehashed(n) || rehash_required(n), "Broken internal structure" );
        // The bucket mutex is expected to be unlocked: no locks are taken in this function.
        __TBB_ASSERT( *reinterpret_cast(&bp->mutex) == 0, "concurrent or unexpectedly terminated operation during rehash() execution" );
        if (rehash_required(n)) { // rehash bucket, conditional because rehashing of a previous bucket may affect this one
            hashcode_type h = b;
            bucket *b_old = bp;
            // Climb towards the parent buckets until one that is already rehashed is found.
            do {
                __TBB_ASSERT( h > 1, "The lowermost buckets can't be rehashed" );
                hashcode_type m = ( hashcode_type(1) << tbb::detail::log2( h ) ) - 1; // get parent mask from the topmost bit
                b_old = this->get_bucket( h &= m );
            } while( rehash_required(b_old->node_list.load(std::memory_order_relaxed)) );
            // now h - is index of the root rehashed bucket b_old
            this->mark_rehashed_levels( h ); // mark all non-rehashed children recursively across all segments
            node_base* prev = nullptr;
            node_base* curr = b_old->node_list.load(std::memory_order_relaxed);
            while (this->is_valid(curr)) {
                hashcode_type curr_node_hash = my_hash_compare.hash(static_cast(curr)->value().first);

                if ((curr_node_hash & mask) != h) { // should be rehashed
                    node_base* next = curr->next;
                    // exclude from b_old
                    if (prev == nullptr) {
                        b_old->node_list.store(curr->next, std::memory_order_relaxed);
                    } else {
                        prev->next = curr->next;
                    }
                    bucket *b_new = this->get_bucket(curr_node_hash & mask);
                    __TBB_ASSERT(!rehash_required(b_new->node_list.load(std::memory_order_relaxed)), "hash() function changed for key in table or internal error");
                    this->add_to_bucket(b_new, curr);
                    curr = next;
                } else {
                    prev = curr;
                    curr = curr->next;
                }
            }
        }
    }
}

// Clear table
// Deletes every node, deletes the segments from the highest index down to 0, sets the size
// to 0 and resets the mask to the embedded bucket count. No locks are taken here.
void clear() {
    hashcode_type m = this->my_mask.load(std::memory_order_relaxed);
    __TBB_ASSERT((m&(m+1))==0, "data structure is invalid");
    this->my_size.store(0, std::memory_order_relaxed);
    segment_index_type s = this->segment_index_of( m );
    __TBB_ASSERT( s+1 == this->pointers_per_table || !this->my_table[s+1].load(std::memory_order_relaxed), "wrong mask or concurrent grow" );
    do {
        __TBB_ASSERT(this->is_valid(this->my_table[s].load(std::memory_order_relaxed)), "wrong mask or concurrent grow" );
        segment_ptr_type buckets_ptr = this->my_table[s].load(std::memory_order_relaxed);
        size_type sz = this->segment_size( s ? s : 1 );
        for( segment_index_type i = 0; i < sz; i++ )
            // Pop nodes off the head of the bucket's list until no valid node remains.
            for( node_base *n = buckets_ptr[i].node_list.load(std::memory_order_relaxed);
                this->is_valid(n); n = buckets_ptr[i].node_list.load(std::memory_order_relaxed) )
            {
                buckets_ptr[i].node_list.store(n->next, std::memory_order_relaxed);
                delete_node( n );
            }
        this->delete_segment(s);
    } while(s-- > 0);
    this->my_mask.store(this->embedded_buckets - 1, std::memory_order_relaxed);
}

// Clear table and destroy it.
~concurrent_hash_map() { clear(); }

//------------------------------------------------------------------------
// Parallel algorithm support
//------------------------------------------------------------------------
// Range objects over the whole table, built from *this and the given grainsize.
range_type range( size_type grainsize=1 ) {
    return range_type( *this, grainsize );
}
const_range_type range( size_type grainsize=1 ) const {
    return const_range_type( *this, grainsize );
}

//------------------------------------------------------------------------
// STL support - not thread-safe methods
//------------------------------------------------------------------------
// begin()/cbegin(): iterator built from bucket index 0, the embedded segment and the head
// of that bucket's node list.
iterator begin() { return iterator( *this, 0, this->my_embedded_segment, this->my_embedded_segment->node_list.load(std::memory_order_relaxed) ); }
const_iterator begin() const { return const_iterator( *this, 0, this->my_embedded_segment, this->my_embedded_segment->node_list.load(std::memory_order_relaxed) ); }
const_iterator cbegin() const { return const_iterator( *this, 0, this->my_embedded_segment, this->my_embedded_segment->node_list.load(std::memory_order_relaxed) ); }
// end()/cend(): iterator built with null bucket and node pointers.
iterator end() { return iterator( *this, 0, nullptr, nullptr ); }
const_iterator end() const { return const_iterator( *this, 0, nullptr, nullptr ); }
const_iterator cend() const { return const_iterator( *this, 0, nullptr, nullptr ); }
// Range of items matching key, as computed by internal_equal_range(); (end, end) if absent.
// NOTE(review): the std::pair / std::enable_if argument lists below are incomplete -
// apparently lost in extraction; confirm against upstream oneTBB.
std::pair equal_range( const Key& key ) { return internal_equal_range( key, end() ); }
std::pair equal_range( const Key& key ) const { return internal_equal_range( key, end() ); }

// Overloads for keys of another type K, constrained through std::enable_if.
template
typename std::enable_if::value,
                        std::pair>::type equal_range( const K& key ) {
    return internal_equal_range(key, end());
}

template
typename std::enable_if::value,
                        std::pair>::type equal_range( const K& key ) const {
    return internal_equal_range(key, end());
}

// Number of items in table.
size_type size() const { return this->my_size.load(std::memory_order_acquire); }

// True if size()==0.
__TBB_nodiscard bool empty() const { return size() == 0; }

// Upper bound on size.
size_type max_size() const {
    return allocator_traits_type::max_size(base_type::get_allocator());
}

// Returns the current number of buckets
size_type bucket_count() const { return this->my_mask.load(std::memory_order_relaxed) + 1; }

// return allocator object
allocator_type get_allocator() const { return base_type::get_allocator(); }

// swap two instances. Iterators are invalidated
// NOTE(review): disjunction() has no visible arguments here - presumably it combined
// pocs_type and is_equal_type declared just above; confirm against upstream oneTBB.
void swap(concurrent_hash_map& table) {
    using pocs_type = typename node_allocator_traits::propagate_on_container_swap;
    using is_equal_type = typename node_allocator_traits::is_always_equal;
    swap_allocators(this->my_allocator, table.my_allocator);
    internal_swap(table, tbb::detail::disjunction());
}

//------------------------------------------------------------------------
// concurrent map operations
//------------------------------------------------------------------------

// Return count of items (0 or 1)
// Implemented as a lookup() without accessor; constness is cast away because lookup()
// is a non-const member.
size_type count( const Key &key ) const {
    return const_cast(this)->lookup(key, nullptr, nullptr, /*write=*/false, &do_not_allocate_node);
}

// Same as above for keys of another type K, constrained through std::enable_if.
template
typename std::enable_if::value,
                        size_type>::type count( const K& key ) const {
    return const_cast(this)->lookup(key, nullptr, nullptr, /*write=*/false, &do_not_allocate_node);
}

// Find item and acquire a read lock on the item.
/** Return true if item is found, false otherwise. */
// Any item previously held by `result` is released first.
bool find( const_accessor &result, const Key &key ) const {
    result.release();
    return const_cast(this)->lookup(key, nullptr, &result, /*write=*/false, &do_not_allocate_node );
}

// Find item and acquire a write lock on the item.
/** Return true if item is found, false otherwise. */
bool find( accessor &result, const Key &key ) {
    result.release();
    return lookup(key, nullptr, &result, /*write=*/true, &do_not_allocate_node);
}

// find() overloads for keys of another type K, constrained through std::enable_if.
// Read-lock variant. NOTE(review): unlike the Key overload above, this one is not const.
template
typename std::enable_if::value,
                        bool>::type find( const_accessor& result, const K& key ) {
    result.release();
    return lookup(key, nullptr, &result, /*write=*/false, &do_not_allocate_node);
}

// Write-lock variant.
template
typename std::enable_if::value,
                        bool>::type find( accessor& result, const K& key ) {
    result.release();
    return lookup(key, nullptr, &result, /*write=*/true, &do_not_allocate_node);
}

// Insert item (if not already present) and acquire a read lock on the item.
/** Returns true if item is new. */
// A new item gets a default-constructed mapped value (allocate_node_default_construct).
bool insert( const_accessor &result, const Key &key ) {
    result.release();
    return lookup(key, nullptr, &result, /*write=*/false, &allocate_node_default_construct<>);
}

// Insert item (if not already present) and acquire a write lock on the item.
/** Returns true if item is new. */
bool insert( accessor &result, const Key &key ) {
    result.release();
    return lookup(key, nullptr, &result, /*write=*/true, &allocate_node_default_construct<>);
}

// insert() overloads for keys of another type K; additionally constrained by
// std::is_constructible. Read-lock variant.
template
typename std::enable_if::value &&
                        std::is_constructible::value,
                        bool>::type insert( const_accessor& result, const K& key ) {
    result.release();
    return lookup(key, nullptr, &result, /*write=*/false, &allocate_node_default_construct);
}

// Write-lock variant.
template
typename std::enable_if::value &&
                        std::is_constructible::value,
                        bool>::type insert( accessor& result, const K& key ) {
    result.release();
    return lookup(key, nullptr, &result, /*write=*/true, &allocate_node_default_construct);
}

// Insert item by copying if there is no such key present already and acquire a read lock on the item.
/** Returns true if item is new. */
bool insert( const_accessor &result, const value_type &value ) {
    result.release();
    return lookup(value.first, &value.second, &result, /*write=*/false, &allocate_node_copy_construct);
}

// Insert item by copying if there is no such key present already and acquire a write lock on the item.
/** Returns true if item is new. */
bool insert( accessor &result, const value_type &value ) {
    result.release();
    return lookup(value.first, &value.second, &result, /*write=*/true, &allocate_node_copy_construct);
}

// Insert item by copying if there is no such key present already
/** Returns true if item is inserted. */
// No accessor is used, so no item lock is acquired.
bool insert( const value_type &value ) {
    return lookup(value.first, &value.second, nullptr, /*write=*/false, &allocate_node_copy_construct);
}

// Insert item by moving if there is no such key present already and acquire a read lock on the item.
/** Returns true if item is new. */
bool insert( const_accessor &result, value_type && value ) {
    return generic_move_insert(result, std::move(value));
}

// Insert item by moving if there is no such key present already and acquire a write lock on the item.
/** Returns true if item is new. */
bool insert( accessor &result, value_type && value ) {
    return generic_move_insert(result, std::move(value));
}

// Insert item by moving if there is no such key present already
/** Returns true if item is inserted. */
bool insert( value_type && value ) {
    return generic_move_insert(accessor_not_used(), std::move(value));
}

// Insert item constructed in place from args if there is no such key present already
// and acquire a read lock on the item.
/** Returns true if item is new. */
// NOTE(review): the template parameter lists and the std::forward arguments in this block
// are empty - apparently lost in extraction; confirm against upstream oneTBB.
template
bool emplace( const_accessor &result, Args&&... args ) {
    return generic_emplace(result, std::forward(args)...);
}

// Insert item constructed in place from args if there is no such key present already
// and acquire a write lock on the item.
/** Returns true if item is new. */
template
bool emplace( accessor &result, Args&&... args ) {
    return generic_emplace(result, std::forward(args)...);
}

// Insert item constructed in place from args if there is no such key present already
/** Returns true if item is inserted. */
template
bool emplace( Args&&... args ) {
    return generic_emplace(accessor_not_used(), std::forward(args)...);
}

// Insert range [first, last)
// Each element is passed to the single-argument insert(); results are ignored.
template
void insert( I first, I last ) {
    for ( ; first != last; ++first )
        insert( *first );
}

// Insert initializer list
void insert( std::initializer_list il ) {
    insert( il.begin(), il.end() );
}

// Erase item.
/** Return true if item was erased by particularly this call. */
bool erase( const Key &key ) {
    return internal_erase(key);
}

// Same as above for keys of another type K, constrained through std::enable_if.
template
typename std::enable_if::value,
                        bool>::type erase( const K& key ) {
    return internal_erase(key);
}

// Erase item by const_accessor.
/** Return true if item was erased by particularly this call. */
bool erase( const_accessor& item_accessor ) {
    return exclude( item_accessor );
}

// Erase item by accessor.
/** Return true if item was erased by particularly this call. */
bool erase( accessor& item_accessor ) {
    return exclude( item_accessor );
}

protected:
// Tag-dispatched helper for lookup(): with std::true_type it calls the supplied node
// factory on this container's allocator ...
template
node* allocate_node_helper( const K& key, const T* t, AllocateNodeType allocate_node, std::true_type ) {
    return allocate_node(base_type::get_allocator(), key, t);
}

// ... with std::false_type it must never be reached and only asserts.
template
node* allocate_node_helper( const K&, const T*, AllocateNodeType, std::false_type ) {
    __TBB_ASSERT(false, "allocate_node_helper with std::false_type should never been called");
    return nullptr;
}

// Insert or find item and optionally acquire a lock on the item.
// Parameters:
//   key           - key to search for
//   t             - optional source for the mapped value, forwarded to allocate_node
//   result        - accessor that receives the node and its lock, or nullptr for no lock
//   write         - lock mode requested for the item
//   allocate_node - node factory used when inserting
//   tmp_n         - optional pre-built node to insert; deleted here if it is not inserted
// Returns true if an item was inserted (insert mode) or found (find/count mode).
// NOTE(review): OpInsert is not declared in the visible text - presumably a template
// parameter lost in extraction together with the other empty template lists; confirm.
template
bool lookup( const K &key, const T *t, const_accessor *result, bool write, AllocateNodeType allocate_node, node *tmp_n = nullptr)
{
    __TBB_ASSERT( !result || !result->my_node, nullptr );
    bool return_value;
    hashcode_type const h = my_hash_compare.hash( key );
    hashcode_type m = this->my_mask.load(std::memory_order_acquire);
    segment_index_type grow_segment = 0;
    node *n;
    restart:
    {//lock scope
        __TBB_ASSERT((m&(m+1))==0, "data structure is invalid");
        return_value = false;
        // get bucket
        bucket_accessor b( this, h & m );
        // find a node
        n = search_bucket( key, b() );
        if( OpInsert ) {
            // [opt] insert a key
            if( !n ) {
                if( !tmp_n ) {
                    tmp_n = allocate_node_helper(key, t, allocate_node, std::integral_constant{});
                }
                while ( !b.is_writer() && !b.upgrade_to_writer() ) { // TODO: improved insertion
                    // Rerun search list, in case another thread inserted the item during the upgrade
                    n = search_bucket(key, b());
                    if (this->is_valid(n)) { // unfortunately, it did
                        if (!b.downgrade_to_reader()) {
                            // If the lock was downgraded with reacquiring the mutex
                            // Rerun search list in case another thread removed the item during the downgrade
                            n = search_bucket(key, b());
                            if (!this->is_valid(n)) {
                                // Unfortunately, it did
                                // We need to try upgrading to writer again
                                continue;
                            }
                        }
                        goto exists;
                    }
                }

                if( this->check_mask_race(h, m) )
                    goto restart; // b.release() is done in ~b().
                // insert and set flag to grow the container
                grow_segment = this->insert_new_node( b(), n = tmp_n, m );
                tmp_n = nullptr;
                return_value = true;
            }
        } else { // find or count
            if( !n ) {
                if( this->check_mask_race( h, m ) )
                    goto restart; // b.release() is done in ~b(). TODO: replace by continue
                return false;
            }
            return_value = true;
        }
    exists:
        if( !result ) goto check_growth;
        // TODO: the following seems as generic/regular operation
        // acquire the item
        if( !result->try_acquire( n->mutex, write ) ) {
            for( tbb::detail::atomic_backoff backoff(true);; ) {
                if( result->try_acquire( n->mutex, write ) ) break;
                if( !backoff.bounded_pause() ) {
                    // the wait takes really long, restart the operation
                    b.release();
                    __TBB_ASSERT( !OpInsert || !return_value, "Can't acquire new item in locked bucket?" );
                    yield();
                    m = this->my_mask.load(std::memory_order_acquire);
                    goto restart;
                }
            }
        }
    }//lock scope
    result->my_node = n;
    result->my_hash = h;
check_growth:
    // [opt] grow the container
    if( grow_segment ) {
        this->enable_segment( grow_segment );
    }
    if( tmp_n ) // if OpInsert only
        delete_node( tmp_n );
    return return_value;
}

// Stand-in passed where no accessor is wanted; release() does nothing.
struct accessor_not_used { void release(){}};
// Map an accessor argument to the const_accessor* expected by lookup() (nullptr for none).
friend const_accessor* accessor_location( accessor_not_used const& ){ return nullptr;}
friend const_accessor* accessor_location( const_accessor & a ) { return &a;}

// Lock mode requested from lookup(): write only for a (non-const) accessor.
friend bool is_write_access_needed( accessor const& ) { return true;}
friend bool is_write_access_needed( const_accessor const& ) { return false;}
friend bool is_write_access_needed( accessor_not_used const& ) { return false;}

// Common implementation of the insert(value_type&&) overloads: releases the accessor and
// runs lookup() with the move-constructing node factory.
template
bool generic_move_insert( Accessor && result, value_type && value ) {
    result.release();
    return lookup(value.first, &value.second, accessor_location(result), is_write_access_needed(result), &allocate_node_move_construct);
}

// Common implementation of the emplace() overloads: the node is built from args up front
// and handed to lookup() as tmp_n.
template
bool generic_emplace( Accessor && result, Args &&... args ) {
    result.release();
    node * node_ptr = create_node(base_type::get_allocator(), std::forward(args)...);
    return lookup(node_ptr->value().first, nullptr, accessor_location(result), is_write_access_needed(result), &do_not_allocate_node, node_ptr);
}

// delete item by accessor
// Returns true if this call unlinked and deleted the node; false if the node was no longer
// in its bucket. The accessor is released in both cases.
bool exclude( const_accessor &item_accessor ) {
    __TBB_ASSERT( item_accessor.my_node, nullptr );
    node_base *const exclude_node = item_accessor.my_node;
    hashcode_type const hash = item_accessor.my_hash;
    hashcode_type mask = this->my_mask.load(std::memory_order_acquire);
    do {
        // get bucket
        bucket_accessor b( this, hash & mask, /*writer=*/true );
        node_base* prev = nullptr;
        node_base* curr = b()->node_list.load(std::memory_order_relaxed);

        while (curr && curr != exclude_node) {
            prev = curr;
            curr = curr->next;
        }

        if (curr == nullptr) { // someone else was first
            if (this->check_mask_race(hash, mask))
                continue;
            item_accessor.release();
            return false;
        }
        __TBB_ASSERT( curr == exclude_node, nullptr );
        // remove from container
        if (prev == nullptr) {
            b()->node_list.store(curr->next, std::memory_order_relaxed);
        } else {
            prev->next = curr->next;
        }

        this->my_size--;
        break;
    } while(true);
    if (!item_accessor.is_writer()) { // need to get exclusive lock
        item_accessor.upgrade_to_writer(); // return value means nothing here
    }

    item_accessor.release();
    delete_node(exclude_node); // Only one thread can delete it
    return true;
}

// Erase the item with the given key.
// Returns true if this call unlinked and deleted a node, false if no matching node was found.
template
bool internal_erase( const K& key ) {
    node_base *erase_node;
    hashcode_type const hash = my_hash_compare.hash(key);
    hashcode_type mask = this->my_mask.load(std::memory_order_acquire);
restart:
    {//lock scope
        // get bucket
        bucket_accessor b( this, hash & mask );
    search:
        node_base* prev = nullptr;
        erase_node = b()->node_list.load(std::memory_order_relaxed);
        while (this->is_valid(erase_node) && !my_hash_compare.equal(key, static_cast(erase_node)->value().first ) ) {
            prev = erase_node;
            erase_node = erase_node->next;
        }

        if (erase_node == nullptr) { // not found, but mask could be changed
            if (this->check_mask_race(hash, mask))
                goto restart;
            return false;
        } else if (!b.is_writer() && !b.upgrade_to_writer()) {
            if (this->check_mask_race(hash, mask)) // contended upgrade, check mask
                goto restart;
            goto search;
        }

        // remove from container
        if (prev == nullptr) {
            b()->node_list.store(erase_node->next, std::memory_order_relaxed);
        } else {
            prev->next = erase_node->next;
        }
        this->my_size--;
    }
    // Take and immediately drop the write lock on the item itself before deleting it.
    {
        typename node::scoped_type item_locker( erase_node->mutex, /*write=*/true );
    }
    // note: there should be no threads pretending to acquire this mutex again, do not try to upgrade const_accessor!
    delete_node(erase_node); // Only one thread can delete it due to write lock on the bucket
    return true;
}

// Returns the pair (iterator to the item with the given key, iterator following it),
// or (end_, end_) if there is no such item. No locks are taken.
template
std::pair internal_equal_range( const K& key, I end_ ) const {
    hashcode_type h = my_hash_compare.hash( key );
    hashcode_type m = this->my_mask.load(std::memory_order_relaxed);
    __TBB_ASSERT((m&(m+1))==0, "data structure is invalid");
    h &= m;
    bucket *b = this->get_bucket( h );
    // A bucket that is not rehashed yet keeps its items in a parent bucket; climb to it.
    while (rehash_required(b->node_list.load(std::memory_order_relaxed))) {
        m = ( hashcode_type(1) << tbb::detail::log2( h ) ) - 1; // get parent mask from the topmost bit
        b = this->get_bucket( h &= m );
    }
    node *n = search_bucket( key, b );
    if( !n )
        return std::make_pair(end_, end_);
    iterator lower(*this, h, b, n), upper(lower);
    return std::make_pair(lower, ++upper);
}

// Copy "source" to *this, where *this must start out empty.
void internal_copy( const concurrent_hash_map& source ) {
    hashcode_type mask = source.my_mask.load(std::memory_order_relaxed);
    if( this->my_mask.load(std::memory_order_relaxed) == mask ) { // optimized version
        // Same bucket mask: copy bucket by bucket without rehashing each key.
        this->reserve(source.my_size.load(std::memory_order_relaxed)); // TODO: load_factor?
        bucket *dst = nullptr, *src = nullptr;
        bool rehashing_required = false;
        for( hashcode_type k = 0; k <= mask; k++ ) {
            if( k & (k-2) ) ++dst,src++; // not the beginning of a segment
            else { dst = this->get_bucket( k ); src = source.get_bucket( k ); }
            __TBB_ASSERT(!rehash_required(dst->node_list.load(std::memory_order_relaxed)), "Invalid bucket in destination table");
            node *n = static_cast( src->node_list.load(std::memory_order_relaxed) );
            if (rehash_required(n)) { // source is not rehashed, items are in previous buckets
                rehashing_required = true;
                dst->node_list.store(reinterpret_cast(rehash_req_flag), std::memory_order_relaxed);
            } else for(; n; n = static_cast( n->next ) ) {
                node* node_ptr = create_node(base_type::get_allocator(), n->value().first, n->value().second);
                this->add_to_bucket( dst, node_ptr);
                this->my_size.fetch_add(1, std::memory_order_relaxed);
            }
        }
        if( rehashing_required ) rehash();
    } else internal_copy(source.begin(), source.end(), source.my_size.load(std::memory_order_relaxed));
}

// Copy the items of [first, last) into *this after reserving room for reserve_size items.
// No check for keys already present is made here.
template
void internal_copy( I first, I last, size_type reserve_size ) {
    this->reserve(reserve_size); // TODO: load_factor?
    hashcode_type m = this->my_mask.load(std::memory_order_relaxed);
    for(; first != last; ++first) {
        hashcode_type h = my_hash_compare.hash( (*first).first );
        bucket *b = this->get_bucket( h & m );
        __TBB_ASSERT(!rehash_required(b->node_list.load(std::memory_order_relaxed)), "Invalid bucket in destination table");
        node* node_ptr = create_node(base_type::get_allocator(), (*first).first, (*first).second);
        this->add_to_bucket( b, node_ptr );
        ++this->my_size; // TODO: replace by non-atomic op
    }
}

// Move construction with an explicit allocator, allocators always equal: plain move.
void internal_move_construct_with_allocator( concurrent_hash_map&& other, const allocator_type&,
                                            /*is_always_equal=*/std::true_type )
{
    this->internal_move(std::move(other));
}

// Move construction with an explicit allocator that may differ: move the whole table if
// the allocators compare equal, otherwise move element by element; the table is cleared
// if the element-wise path throws.
void internal_move_construct_with_allocator( concurrent_hash_map&& other, const allocator_type& a,
                                            /*is_always_equal=*/std::false_type )
{
    if (a == other.get_allocator()){
        this->internal_move(std::move(other));
    } else {
        try_call( [&] {
            internal_copy(std::make_move_iterator(other.begin()), std::make_move_iterator(other.end()),
                other.size());
        }).on_exception( [&] {
            this->clear();
        });
    }
}

// Move assignment when the table can always be taken over as a whole.
void internal_move_assign( concurrent_hash_map&& other,
    /*is_always_equal || POCMA = */std::true_type)
{
    this->internal_move(std::move(other));
}

// Move assignment when allocators may differ: whole-table move if they compare equal,
// otherwise per-element move.
// NOTE(review): internal_copy() is documented as requiring *this to start out empty, and
// no clear() is visible on this path or in the calling operator= - confirm.
void internal_move_assign(concurrent_hash_map&& other, /*is_always_equal=*/ std::false_type) {
    if (this->my_allocator == other.my_allocator) {
        this->internal_move(std::move(other));
    } else {
        //do per element move
        internal_copy(std::make_move_iterator(other.begin()), std::make_move_iterator(other.end()),
            other.size());
    }
}

// Swap contents; the std::false_type overload additionally asserts equal allocators.
void internal_swap(concurrent_hash_map& other, /*is_always_equal || POCS = */ std::true_type) {
    this->internal_swap_content(other);
}

void internal_swap(concurrent_hash_map& other, /*is_always_equal || POCS = */ std::false_type) {
    __TBB_ASSERT(this->my_allocator == other.my_allocator, nullptr);
    this->internal_swap_content(other);
}

// Fast find when no concurrent erasure is used. For internal use inside TBB only!
/** Return pointer to item with given key, or nullptr if no such item exists.
    Must not be called concurrently with erasure operations. */
const_pointer internal_fast_find( const Key& key ) const {
    hashcode_type h = my_hash_compare.hash( key );
    hashcode_type m = this->my_mask.load(std::memory_order_acquire);
    node *n;
restart:
    __TBB_ASSERT((m&(m+1))==0, "data structure is invalid");
    bucket *b = this->get_bucket( h & m );
    // TODO: actually, notification is unnecessary here, just hiding double-check
    // A bucket lock is taken only while the bucket still needs rehashing; the search below
    // runs without holding it.
    if (rehash_required(b->node_list.load(std::memory_order_acquire)))
    {
        typename bucket::scoped_type lock;
        if( lock.try_acquire( b->mutex, /*write=*/true ) ) {
            if (rehash_required(b->node_list.load(std::memory_order_relaxed)))
                const_cast(this)->rehash_bucket( b, h & m ); //recursive rehashing
        }
        else lock.acquire( b->mutex, /*write=*/false );
        __TBB_ASSERT(!rehash_required(b->node_list.load(std::memory_order_relaxed)), nullptr);
    }
    n = search_bucket( key, b );
    if( n )
        return n->storage();
    else if( this->check_mask_race( h, m ) )
        goto restart;
    return nullptr;
}
};

// Deduction guides for construction from an iterator pair or an initializer list.
// NOTE(review): the template parameter lists below are visibly truncated (text in angle
// brackets seems to have been lost in extraction); confirm against upstream oneTBB.
#if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT
template >,
          typename Alloc = tbb_allocator>,
          typename = std::enable_if_t>,
          typename = std::enable_if_t>,
          typename = std::enable_if_t>>
concurrent_hash_map( It, It, HashCompare = HashCompare(), Alloc = Alloc() )
-> concurrent_hash_map, iterator_mapped_t, HashCompare, Alloc>;

template >,
          typename = std::enable_if_t>>
concurrent_hash_map( It, It, Alloc )
-> concurrent_hash_map, iterator_mapped_t, d1::tbb_hash_compare>, Alloc>;

template >,
          typename Alloc = tbb_allocator>,
          typename = std::enable_if_t>,
          typename = std::enable_if_t>>
concurrent_hash_map( std::initializer_list>, HashCompare = HashCompare(), Alloc = Alloc() )
-> concurrent_hash_map, T, HashCompare, Alloc>;

template >>
concurrent_hash_map( std::initializer_list>, Alloc )
-> concurrent_hash_map, T, d1::tbb_hash_compare>, Alloc>;

#endif /* __TBB_CPP17_DEDUCTION_GUIDES_PRESENT */

// Two tables are equal when they have the same size and every item of `a` has a
// counterpart in `b` with the same key (per b.equal_range) and an equal mapped value.
template
inline bool operator==(const concurrent_hash_map &a, const concurrent_hash_map &b) {
    if(a.size() != b.size()) return false;
    typename concurrent_hash_map::const_iterator i(a.begin()), i_end(a.end());
    typename concurrent_hash_map::const_iterator j, j_end(b.end());
    for(; i != i_end; ++i) {
        j = b.equal_range(i->first).first;
        if( j == j_end || !(i->second == j->second) ) return false;
    }
    return true;
}

#if !__TBB_CPP20_COMPARISONS_PRESENT
template
inline bool operator!=(const concurrent_hash_map &a, const concurrent_hash_map &b)
{    return !(a == b); }
#endif // !__TBB_CPP20_COMPARISONS_PRESENT

// Non-member swap; forwards to the member swap.
template
inline void swap(concurrent_hash_map &a, concurrent_hash_map &b)
{    a.swap( b ); }

} // namespace d2
} // namespace detail

inline namespace v1 {
    using detail::split;
    using detail::d2::concurrent_hash_map;
    using detail::d1::tbb_hash_compare;
} // namespace v1

} // namespace tbb

#endif /* __TBB_concurrent_hash_map_H */

================================================
FILE: third-party/tbb/include/oneapi/tbb/concurrent_lru_cache.h
================================================
/*
    Copyright (c) 2005-2022 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef __TBB_concurrent_lru_cache_H
#define __TBB_concurrent_lru_cache_H

#if ! TBB_PREVIEW_CONCURRENT_LRU_CACHE
    #error Set TBB_PREVIEW_CONCURRENT_LRU_CACHE to include concurrent_lru_cache.h
#endif

#include "detail/_assert.h"
#include "detail/_aggregator.h"

// NOTE(review): the header names are missing from the five #include directives below
// (presumably the standard headers named by the trailing comments); they appear to have
// been lost in extraction - confirm against upstream oneTBB.
#include // for std::map
#include // for std::list
#include // for std::make_pair
#include // for std::find
#include // for std::atomic

namespace tbb {

namespace detail {
namespace d1 {

//-----------------------------------------------------------------------------
// Concurrent LRU cache
//-----------------------------------------------------------------------------

template
class concurrent_lru_cache : no_assign {
// encapsulated helper classes
private:
    struct handle_object;
    struct storage_map_value_type;

    struct aggregator_operation;
    struct retrieve_aggregator_operation;
    struct signal_end_of_usage_aggregator_operation;

// typedefs
public:
    using key_type = KeyT;
    using value_type = ValT;
    using pointer = ValT*;
    using reference = ValT&;
    using const_pointer = const ValT*;
    using const_reference = const ValT&;

    using value_function_type = KeyToValFunctorT;
    using handle = handle_object;

private:
    using lru_cache_type = concurrent_lru_cache;

    using storage_map_type = std::map;
    using storage_map_iterator_type = typename storage_map_type::iterator;
    using storage_map_pointer_type = typename storage_map_type::pointer;
    using storage_map_reference_type = typename storage_map_type::reference;

    using history_list_type = std::list;
    using history_list_iterator_type = typename history_list_type::iterator;

    using aggregator_operation_type = aggregator_operation;
    using aggregator_function_type = aggregating_functor;
    using aggregator_type = aggregator;

    friend class aggregating_functor;

// fields
private:
    value_function_type my_value_function;
    aggregator_type my_aggregator;

    storage_map_type my_storage_map;            // storage map for used objects
    history_list_type my_history_list;          // history list for unused objects
    const std::size_t my_history_list_capacity; // history list's allowed capacity

// interface
public:
concurrent_lru_cache(value_function_type value_function, std::size_t cache_capacity) : my_value_function(value_function), my_history_list_capacity(cache_capacity) { my_aggregator.initialize_handler(aggregator_function_type(this)); } handle operator[](key_type key) { retrieve_aggregator_operation op(key); my_aggregator.execute(&op); if (op.is_new_value_needed()) { op.result().second.my_value = my_value_function(key); op.result().second.my_is_ready.store(true, std::memory_order_release); } else { spin_wait_while_eq(op.result().second.my_is_ready, false); } return handle(*this, op.result()); } private: void handle_operations(aggregator_operation* op_list) { while (op_list) { op_list->cast_and_handle(*this); aggregator_operation* prev_op = op_list; op_list = op_list->next; (prev_op->status).store(1, std::memory_order_release); } } void signal_end_of_usage(storage_map_reference_type map_record_ref) { signal_end_of_usage_aggregator_operation op(map_record_ref); my_aggregator.execute(&op); } void signal_end_of_usage_serial(storage_map_reference_type map_record_ref) { storage_map_iterator_type map_it = my_storage_map.find(map_record_ref.first); __TBB_ASSERT(map_it != my_storage_map.end(), "cache should not return past-end iterators to outer world"); __TBB_ASSERT(&(*map_it) == &map_record_ref, "dangling reference has been returned to outside world: data race?"); __TBB_ASSERT(std::find(my_history_list.begin(), my_history_list.end(), map_it) == my_history_list.end(), "object in use should not be in list of unused objects "); // if it was the last reference, put it to the LRU history if (! 
--(map_it->second.my_ref_counter)) { // if the LRU history is full, evict the oldest items to get space if (my_history_list.size() >= my_history_list_capacity) { if (my_history_list_capacity == 0) { // Since LRU history capacity is zero, there is no need to keep the element in history my_storage_map.erase(map_it); return; } std::size_t number_of_elements_to_evict = 1 + my_history_list.size() - my_history_list_capacity; for (std::size_t i = 0; i < number_of_elements_to_evict; ++i) { storage_map_iterator_type map_it_to_evict = my_history_list.back(); __TBB_ASSERT(map_it_to_evict->second.my_ref_counter == 0, "item to be evicted should not have a live references"); // TODO: can we use forward_list instead of list? pop_front / insert_after last my_history_list.pop_back(); my_storage_map.erase(map_it_to_evict); } } // TODO: can we use forward_list instead of list? pop_front / insert_after last my_history_list.push_front(map_it); map_it->second.my_history_list_iterator = my_history_list.begin(); } } storage_map_reference_type retrieve_serial(key_type key, bool& is_new_value_needed) { storage_map_iterator_type map_it = my_storage_map.find(key); if (map_it == my_storage_map.end()) { map_it = my_storage_map.emplace_hint( map_it, std::piecewise_construct, std::make_tuple(key), std::make_tuple(value_type(), 0, my_history_list.end(), false)); is_new_value_needed = true; } else { history_list_iterator_type list_it = map_it->second.my_history_list_iterator; if (list_it != my_history_list.end()) { __TBB_ASSERT(map_it->second.my_ref_counter == 0, "item to be evicted should not have a live references"); // Item is going to be used. Therefore it is not a subject for eviction, // so we remove it from LRU history. 
my_history_list.erase(list_it); map_it->second.my_history_list_iterator = my_history_list.end(); } } ++(map_it->second.my_ref_counter); return *map_it; } }; //----------------------------------------------------------------------------- // Value type for storage map in concurrent LRU cache //----------------------------------------------------------------------------- template struct concurrent_lru_cache::storage_map_value_type { //typedefs public: using ref_counter_type = std::size_t; // fields public: value_type my_value; ref_counter_type my_ref_counter; history_list_iterator_type my_history_list_iterator; std::atomic my_is_ready; // interface public: storage_map_value_type( value_type const& value, ref_counter_type ref_counter, history_list_iterator_type history_list_iterator, bool is_ready) : my_value(value), my_ref_counter(ref_counter), my_history_list_iterator(history_list_iterator), my_is_ready(is_ready) {} }; //----------------------------------------------------------------------------- // Handle object for operator[] in concurrent LRU cache //----------------------------------------------------------------------------- template struct concurrent_lru_cache::handle_object { // fields private: lru_cache_type* my_lru_cache_ptr; storage_map_pointer_type my_map_record_ptr; // interface public: handle_object() : my_lru_cache_ptr(nullptr), my_map_record_ptr(nullptr) {} handle_object(lru_cache_type& lru_cache_ref, storage_map_reference_type map_record_ref) : my_lru_cache_ptr(&lru_cache_ref), my_map_record_ptr(&map_record_ref) {} handle_object(handle_object&) = delete; void operator=(handle_object&) = delete; handle_object(handle_object&& other) : my_lru_cache_ptr(other.my_lru_cache_ptr), my_map_record_ptr(other.my_map_record_ptr) { __TBB_ASSERT( (other.my_lru_cache_ptr != nullptr && other.my_map_record_ptr != nullptr) || (other.my_lru_cache_ptr == nullptr && other.my_map_record_ptr == nullptr), "invalid state of moving object?"); other.my_lru_cache_ptr = nullptr; 
other.my_map_record_ptr = nullptr; } handle_object& operator=(handle_object&& other) { __TBB_ASSERT( (other.my_lru_cache_ptr != nullptr && other.my_map_record_ptr != nullptr) || (other.my_lru_cache_ptr == nullptr && other.my_map_record_ptr == nullptr), "invalid state of moving object?"); if (my_lru_cache_ptr) my_lru_cache_ptr->signal_end_of_usage(*my_map_record_ptr); my_lru_cache_ptr = other.my_lru_cache_ptr; my_map_record_ptr = other.my_map_record_ptr; other.my_lru_cache_ptr = nullptr; other.my_map_record_ptr = nullptr; return *this; } ~handle_object() { if (my_lru_cache_ptr) my_lru_cache_ptr->signal_end_of_usage(*my_map_record_ptr); } operator bool() const { return (my_lru_cache_ptr && my_map_record_ptr); } value_type& value() { __TBB_ASSERT(my_lru_cache_ptr, "get value from already moved object?"); __TBB_ASSERT(my_map_record_ptr, "get value from an invalid or already moved object?"); return my_map_record_ptr->second.my_value; } }; //----------------------------------------------------------------------------- // Aggregator operation for aggregator type in concurrent LRU cache //----------------------------------------------------------------------------- template struct concurrent_lru_cache::aggregator_operation : aggregated_operation { // incapsulated helper classes public: enum class op_type { retrieve, signal_end_of_usage }; // fields private: op_type my_op; // interface public: aggregator_operation(op_type op) : my_op(op) {} // TODO: aggregator_operation can be implemented // - as a statically typed variant type or CRTP? 
(static, dependent on the use case) // - or use pointer to function and apply_visitor (dynamic) // - or use virtual functions (dynamic) void cast_and_handle(lru_cache_type& lru_cache_ref) { if (my_op == op_type::retrieve) static_cast(this)->handle(lru_cache_ref); else static_cast(this)->handle(lru_cache_ref); } }; template struct concurrent_lru_cache::retrieve_aggregator_operation : aggregator_operation, private no_assign { public: key_type my_key; storage_map_pointer_type my_map_record_ptr; bool my_is_new_value_needed; public: retrieve_aggregator_operation(key_type key) : aggregator_operation(aggregator_operation::op_type::retrieve), my_key(key), my_map_record_ptr(nullptr), my_is_new_value_needed(false) {} void handle(lru_cache_type& lru_cache_ref) { my_map_record_ptr = &lru_cache_ref.retrieve_serial(my_key, my_is_new_value_needed); } storage_map_reference_type result() { __TBB_ASSERT(my_map_record_ptr, "Attempt to call result() before calling handle()"); return *my_map_record_ptr; } bool is_new_value_needed() { return my_is_new_value_needed; } }; template struct concurrent_lru_cache::signal_end_of_usage_aggregator_operation : aggregator_operation, private no_assign { private: storage_map_reference_type my_map_record_ref; public: signal_end_of_usage_aggregator_operation(storage_map_reference_type map_record_ref) : aggregator_operation(aggregator_operation::op_type::signal_end_of_usage), my_map_record_ref(map_record_ref) {} void handle(lru_cache_type& lru_cache_ref) { lru_cache_ref.signal_end_of_usage_serial(my_map_record_ref); } }; // TODO: if we have guarantees that KeyToValFunctorT always have // ValT as a return type and KeyT as an argument type // we can deduce template parameters of concurrent_lru_cache // by pattern matching on KeyToValFunctorT } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::concurrent_lru_cache; } // inline namespace v1 } // namespace tbb #endif // __TBB_concurrent_lru_cache_H 
================================================ FILE: third-party/tbb/include/oneapi/tbb/concurrent_map.h ================================================ /* Copyright (c) 2019-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_concurrent_map_H #define __TBB_concurrent_map_H #include "detail/_namespace_injection.h" #include "detail/_concurrent_skip_list.h" #include "tbb_allocator.h" #include #include #include namespace tbb { namespace detail { namespace d2 { template struct map_traits { static constexpr std::size_t max_level = RandomGenerator::max_level; using random_level_generator_type = RandomGenerator; using key_type = Key; using mapped_type = Value; using compare_type = KeyCompare; using value_type = std::pair; using reference = value_type&; using const_reference = const value_type&; using allocator_type = Allocator; static constexpr bool allow_multimapping = AllowMultimapping; class value_compare { public: bool operator()(const value_type& lhs, const value_type& rhs) const { return comp(lhs.first, rhs.first); } protected: value_compare(compare_type c) : comp(c) {} friend struct map_traits; compare_type comp; }; static value_compare value_comp(compare_type comp) { return value_compare(comp); } static const key_type& get_key(const_reference val) { return val.first; } }; // struct map_traits template class concurrent_multimap; template , typename Allocator = tbb::tbb_allocator>> class concurrent_map : public concurrent_skip_list, Allocator, false>> { using 
base_type = concurrent_skip_list, Allocator, false>>; public: using key_type = Key; using mapped_type = Value; using value_type = typename base_type::value_type; using size_type = typename base_type::size_type; using difference_type = typename base_type::difference_type; using key_compare = Compare; using value_compare = typename base_type::value_compare; using allocator_type = Allocator; using reference = typename base_type::reference; using const_reference = typename base_type::const_reference; using pointer = typename base_type::pointer; using const_pointer = typename base_type::const_pointer; using iterator = typename base_type::iterator; using const_iterator = typename base_type::const_iterator; using node_type = typename base_type::node_type; // Include constructors of base type using base_type::base_type; // Required for implicit deduction guides concurrent_map() = default; concurrent_map( const concurrent_map& ) = default; concurrent_map( const concurrent_map& other, const allocator_type& alloc ) : base_type(other, alloc) {} concurrent_map( concurrent_map&& ) = default; concurrent_map( concurrent_map&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} // Required to respect the rule of 5 concurrent_map& operator=( const concurrent_map& ) = default; concurrent_map& operator=( concurrent_map&& ) = default; concurrent_map& operator=( std::initializer_list il ) { base_type::operator= (il); return *this; } // Observers mapped_type& at(const key_type& key) { iterator it = this->find(key); if (it == this->end()) { throw_exception(exception_id::invalid_key); } return it->second; } const mapped_type& at(const key_type& key) const { return const_cast(this)->at(key); } mapped_type& operator[](const key_type& key) { iterator it = this->find(key); if (it == this->end()) { it = this->emplace(std::piecewise_construct, std::forward_as_tuple(key), std::tuple<>()).first; } return it->second; } mapped_type& operator[](key_type&& key) { iterator it = 
this->find(key); if (it == this->end()) { it = this->emplace(std::piecewise_construct, std::forward_as_tuple(std::move(key)), std::tuple<>()).first; } return it->second; } using base_type::insert; template typename std::enable_if::value, std::pair>::type insert( P&& value ) { return this->emplace(std::forward

(value)); } template typename std::enable_if::value, iterator>::type insert( const_iterator hint, P&& value ) { return this->emplace_hint(hint, std::forward

(value)); } template void merge(concurrent_map& source) { this->internal_merge(source); } template void merge(concurrent_map&& source) { this->internal_merge(std::move(source)); } template void merge(concurrent_multimap& source) { this->internal_merge(source); } template void merge(concurrent_multimap&& source) { this->internal_merge(std::move(source)); } }; // class concurrent_map #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT template >, typename Alloc = tbb::tbb_allocator>, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_map( It, It, Comp = Comp(), Alloc = Alloc() ) -> concurrent_map, iterator_mapped_t, Comp, Alloc>; template >, typename Alloc = tbb::tbb_allocator>, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_map( std::initializer_list>, Comp = Comp(), Alloc = Alloc() ) -> concurrent_map, T, Comp, Alloc>; template >, typename = std::enable_if_t>> concurrent_map( It, It, Alloc ) -> concurrent_map, iterator_mapped_t, std::less>, Alloc>; template >> concurrent_map( std::initializer_list>, Alloc ) -> concurrent_map, T, std::less>, Alloc>; #endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT template void swap( concurrent_map& lhs, concurrent_map& rhs ) { lhs.swap(rhs); } template , typename Allocator = tbb::tbb_allocator>> class concurrent_multimap : public concurrent_skip_list, Allocator, true>> { using base_type = concurrent_skip_list, Allocator, true>>; public: using key_type = Key; using mapped_type = Value; using value_type = typename base_type::value_type; using size_type = typename base_type::size_type; using difference_type = typename base_type::difference_type; using key_compare = Compare; using value_compare = typename base_type::value_compare; using allocator_type = Allocator; using reference = typename base_type::reference; using const_reference = typename base_type::const_reference; using pointer = typename base_type::pointer; using const_pointer = typename base_type::const_pointer; 
using iterator = typename base_type::iterator; using const_iterator = typename base_type::const_iterator; using node_type = typename base_type::node_type; // Include constructors of base_type using base_type::base_type; using base_type::insert; // Required for implicit deduction guides concurrent_multimap() = default; concurrent_multimap( const concurrent_multimap& ) = default; concurrent_multimap( const concurrent_multimap& other, const allocator_type& alloc ) : base_type(other, alloc) {} concurrent_multimap( concurrent_multimap&& ) = default; concurrent_multimap( concurrent_multimap&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} // Required to respect the rule of 5 concurrent_multimap& operator=( const concurrent_multimap& ) = default; concurrent_multimap& operator=( concurrent_multimap&& ) = default; concurrent_multimap& operator=( std::initializer_list il ) { base_type::operator= (il); return *this; } template typename std::enable_if::value, std::pair>::type insert( P&& value ) { return this->emplace(std::forward

(value)); } template typename std::enable_if::value, iterator>::type insert( const_iterator hint, P&& value ) { return this->emplace_hint(hint, std::forward

(value)); } template void merge(concurrent_multimap& source) { this->internal_merge(source); } template void merge(concurrent_multimap&& source) { this->internal_merge(std::move(source)); } template void merge(concurrent_map& source) { this->internal_merge(source); } template void merge(concurrent_map&& source) { this->internal_merge(std::move(source)); } }; // class concurrent_multimap #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT template >, typename Alloc = tbb::tbb_allocator>, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_multimap( It, It, Comp = Comp(), Alloc = Alloc() ) -> concurrent_multimap, iterator_mapped_t, Comp, Alloc>; template >, typename Alloc = tbb::tbb_allocator>, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_multimap( std::initializer_list>, Comp = Comp(), Alloc = Alloc() ) -> concurrent_multimap, T, Comp, Alloc>; template >, typename = std::enable_if_t>> concurrent_multimap( It, It, Alloc ) -> concurrent_multimap, iterator_mapped_t, std::less>, Alloc>; template >> concurrent_multimap( std::initializer_list>, Alloc ) -> concurrent_multimap, T, std::less>, Alloc>; #endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT template void swap( concurrent_multimap& lhs, concurrent_multimap& rhs ) { lhs.swap(rhs); } } // namespace d2 } // namespace detail inline namespace v1 { using detail::d2::concurrent_map; using detail::d2::concurrent_multimap; using detail::split; } // inline namespace v1 } // namespace tbb #endif // __TBB_concurrent_map_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/concurrent_priority_queue.h ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_concurrent_priority_queue_H #define __TBB_concurrent_priority_queue_H #include "detail/_namespace_injection.h" #include "detail/_aggregator.h" #include "detail/_template_helpers.h" #include "detail/_allocator_traits.h" #include "detail/_range_common.h" #include "detail/_exception.h" #include "detail/_utils.h" #include "detail/_containers_helpers.h" #include "cache_aligned_allocator.h" #include #include #include #include #include #include namespace tbb { namespace detail { namespace d1 { template , typename Allocator = cache_aligned_allocator> class concurrent_priority_queue { public: using value_type = T; using reference = T&; using const_reference = const T&; using size_type = std::size_t; using difference_type = std::ptrdiff_t; using allocator_type = Allocator; concurrent_priority_queue() : concurrent_priority_queue(allocator_type{}) {} explicit concurrent_priority_queue( const allocator_type& alloc ) : mark(0), my_size(0), my_compare(), data(alloc) { my_aggregator.initialize_handler(functor{this}); } explicit concurrent_priority_queue( const Compare& compare, const allocator_type& alloc = allocator_type() ) : mark(0), my_size(0), my_compare(compare), data(alloc) { my_aggregator.initialize_handler(functor{this}); } explicit concurrent_priority_queue( size_type init_capacity, const allocator_type& alloc = allocator_type() ) : mark(0), my_size(0), my_compare(), data(alloc) { data.reserve(init_capacity); my_aggregator.initialize_handler(functor{this}); } explicit concurrent_priority_queue( size_type init_capacity, const Compare& compare, const allocator_type& alloc = 
allocator_type() ) : mark(0), my_size(0), my_compare(compare), data(alloc) { data.reserve(init_capacity); my_aggregator.initialize_handler(functor{this}); } template concurrent_priority_queue( InputIterator begin, InputIterator end, const Compare& compare, const allocator_type& alloc = allocator_type() ) : mark(0), my_compare(compare), data(begin, end, alloc) { my_aggregator.initialize_handler(functor{this}); heapify(); my_size.store(data.size(), std::memory_order_relaxed); } template concurrent_priority_queue( InputIterator begin, InputIterator end, const allocator_type& alloc = allocator_type() ) : concurrent_priority_queue(begin, end, Compare(), alloc) {} concurrent_priority_queue( std::initializer_list init, const Compare& compare, const allocator_type& alloc = allocator_type() ) : concurrent_priority_queue(init.begin(), init.end(), compare, alloc) {} concurrent_priority_queue( std::initializer_list init, const allocator_type& alloc = allocator_type() ) : concurrent_priority_queue(init, Compare(), alloc) {} concurrent_priority_queue( const concurrent_priority_queue& other ) : mark(other.mark), my_size(other.my_size.load(std::memory_order_relaxed)), my_compare(other.my_compare), data(other.data) { my_aggregator.initialize_handler(functor{this}); } concurrent_priority_queue( const concurrent_priority_queue& other, const allocator_type& alloc ) : mark(other.mark), my_size(other.my_size.load(std::memory_order_relaxed)), my_compare(other.my_compare), data(other.data, alloc) { my_aggregator.initialize_handler(functor{this}); } concurrent_priority_queue( concurrent_priority_queue&& other ) : mark(other.mark), my_size(other.my_size.load(std::memory_order_relaxed)), my_compare(other.my_compare), data(std::move(other.data)) { my_aggregator.initialize_handler(functor{this}); } concurrent_priority_queue( concurrent_priority_queue&& other, const allocator_type& alloc ) : mark(other.mark), my_size(other.my_size.load(std::memory_order_relaxed)), my_compare(other.my_compare), 
data(std::move(other.data), alloc) { my_aggregator.initialize_handler(functor{this}); } concurrent_priority_queue& operator=( const concurrent_priority_queue& other ) { if (this != &other) { data = other.data; mark = other.mark; my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); } return *this; } concurrent_priority_queue& operator=( concurrent_priority_queue&& other ) { if (this != &other) { // TODO: check if exceptions from std::vector::operator=(vector&&) should be handled separately data = std::move(other.data); mark = other.mark; my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); } return *this; } concurrent_priority_queue& operator=( std::initializer_list init ) { assign(init.begin(), init.end()); return *this; } template void assign( InputIterator begin, InputIterator end ) { data.assign(begin, end); mark = 0; my_size.store(data.size(), std::memory_order_relaxed); heapify(); } void assign( std::initializer_list init ) { assign(init.begin(), init.end()); } /* Returned value may not reflect results of pending operations. This operation reads shared data and will trigger a race condition. */ __TBB_nodiscard bool empty() const { return size() == 0; } // Returns the current number of elements contained in the queue /* Returned value may not reflect results of pending operations. This operation reads shared data and will trigger a race condition. */ size_type size() const { return my_size.load(std::memory_order_relaxed); } /* This operation can be safely used concurrently with other push, try_pop or emplace operations. */ void push( const value_type& value ) { cpq_operation op_data(value, PUSH_OP); my_aggregator.execute(&op_data); if (op_data.status == FAILED) throw_exception(exception_id::bad_alloc); } /* This operation can be safely used concurrently with other push, try_pop or emplace operations. 
*/ void push( value_type&& value ) { cpq_operation op_data(value, PUSH_RVALUE_OP); my_aggregator.execute(&op_data); if (op_data.status == FAILED) throw_exception(exception_id::bad_alloc); } /* This operation can be safely used concurrently with other push, try_pop or emplace operations. */ template void emplace( Args&&... args ) { // TODO: support uses allocator construction in this place push(value_type(std::forward(args)...)); } // Gets a reference to and removes highest priority element /* If a highest priority element was found, sets elem and returns true, otherwise returns false. This operation can be safely used concurrently with other push, try_pop or emplace operations. */ bool try_pop( value_type& value ) { cpq_operation op_data(value, POP_OP); my_aggregator.execute(&op_data); return op_data.status == SUCCEEDED; } // This operation affects the whole container => it is not thread-safe void clear() { data.clear(); mark = 0; my_size.store(0, std::memory_order_relaxed); } // This operation affects the whole container => it is not thread-safe void swap( concurrent_priority_queue& other ) { if (this != &other) { using std::swap; swap(data, other.data); swap(mark, other.mark); size_type sz = my_size.load(std::memory_order_relaxed); my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); other.my_size.store(sz, std::memory_order_relaxed); } } allocator_type get_allocator() const { return data.get_allocator(); } private: enum operation_type {INVALID_OP, PUSH_OP, POP_OP, PUSH_RVALUE_OP}; enum operation_status {WAIT = 0, SUCCEEDED, FAILED}; class cpq_operation : public aggregated_operation { public: operation_type type; union { value_type* elem; size_type sz; }; cpq_operation( const value_type& value, operation_type t ) : type(t), elem(const_cast(&value)) {} }; // class cpq_operation class functor { concurrent_priority_queue* my_cpq; public: functor() : my_cpq(nullptr) {} functor( concurrent_priority_queue* cpq ) : my_cpq(cpq) {} void 
operator()(cpq_operation* op_list) { __TBB_ASSERT(my_cpq != nullptr, "Invalid functor"); my_cpq->handle_operations(op_list); } }; // class functor void handle_operations( cpq_operation* op_list ) { call_itt_notify(acquired, this); cpq_operation* tmp, *pop_list = nullptr; __TBB_ASSERT(mark == data.size(), nullptr); // First pass processes all constant (amortized; reallocation may happen) time pushes and pops. while(op_list) { // ITT note: &(op_list->status) tag is used to cover accesses to op_list // node. This thread is going to handle the operation, and so will acquire it // and perform the associated operation w/o triggering a race condition; the // thread that created the operation is waiting on the status field, so when // this thread is done with the operation, it will perform a // store_with_release to give control back to the waiting thread in // aggregator::insert_operation. // TODO: enable call_itt_notify(acquired, &(op_list->status)); __TBB_ASSERT(op_list->type != INVALID_OP, nullptr); tmp = op_list; op_list = op_list->next.load(std::memory_order_relaxed); if (tmp->type == POP_OP) { if (mark < data.size() && my_compare(data[0], data.back())) { // there are newly pushed elems and the last one is higher than top *(tmp->elem) = std::move(data.back()); my_size.store(my_size.load(std::memory_order_relaxed) - 1, std::memory_order_relaxed); tmp->status.store(uintptr_t(SUCCEEDED), std::memory_order_release); data.pop_back(); __TBB_ASSERT(mark <= data.size(), nullptr); } else { // no convenient item to pop; postpone tmp->next.store(pop_list, std::memory_order_relaxed); pop_list = tmp; } } else { // PUSH_OP or PUSH_RVALUE_OP __TBB_ASSERT(tmp->type == PUSH_OP || tmp->type == PUSH_RVALUE_OP, "Unknown operation"); #if TBB_USE_EXCEPTIONS try #endif { if (tmp->type == PUSH_OP) { push_back_helper(*(tmp->elem)); } else { data.push_back(std::move(*(tmp->elem))); } my_size.store(my_size.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed); 
tmp->status.store(uintptr_t(SUCCEEDED), std::memory_order_release); } #if TBB_USE_EXCEPTIONS catch(...) { tmp->status.store(uintptr_t(FAILED), std::memory_order_release); } #endif } } // Second pass processes pop operations while(pop_list) { tmp = pop_list; pop_list = pop_list->next.load(std::memory_order_relaxed); __TBB_ASSERT(tmp->type == POP_OP, nullptr); if (data.empty()) { tmp->status.store(uintptr_t(FAILED), std::memory_order_release); } else { __TBB_ASSERT(mark <= data.size(), nullptr); if (mark < data.size() && my_compare(data[0], data.back())) { // there are newly pushed elems and the last one is higher than top *(tmp->elem) = std::move(data.back()); my_size.store(my_size.load(std::memory_order_relaxed) - 1, std::memory_order_relaxed); tmp->status.store(uintptr_t(SUCCEEDED), std::memory_order_release); data.pop_back(); } else { // extract top and push last element down heap *(tmp->elem) = std::move(data[0]); my_size.store(my_size.load(std::memory_order_relaxed) - 1, std::memory_order_relaxed); tmp->status.store(uintptr_t(SUCCEEDED), std::memory_order_release); reheap(); } } } // heapify any leftover pushed elements before doing the next // batch of operations if (mark < data.size()) heapify(); __TBB_ASSERT(mark == data.size(), nullptr); call_itt_notify(releasing, this); } // Merge unsorted elements into heap void heapify() { if (!mark && data.size() > 0) mark = 1; for (; mark < data.size(); ++mark) { // for each unheapified element under size size_type cur_pos = mark; value_type to_place = std::move(data[mark]); do { // push to_place up the heap size_type parent = (cur_pos - 1) >> 1; if (!my_compare(data[parent], to_place)) break; data[cur_pos] = std::move(data[parent]); cur_pos = parent; } while(cur_pos); data[cur_pos] = std::move(to_place); } } // Re-heapify after an extraction // Re-heapify by pushing last element down the heap from the root. 
void reheap() { size_type cur_pos = 0, child = 1; while(child < mark) { size_type target = child; if (child + 1 < mark && my_compare(data[child], data[child + 1])) ++target; // target now has the higher priority child if (my_compare(data[target], data.back())) break; data[cur_pos] = std::move(data[target]); cur_pos = target; child = (cur_pos << 1) + 1; } if (cur_pos != data.size() - 1) data[cur_pos] = std::move(data.back()); data.pop_back(); if (mark > data.size()) mark = data.size(); } void push_back_helper( const T& value ) { push_back_helper_impl(value, std::is_copy_constructible{}); } void push_back_helper_impl( const T& value, /*is_copy_constructible = */std::true_type ) { data.push_back(value); } void push_back_helper_impl( const T&, /*is_copy_constructible = */std::false_type ) { __TBB_ASSERT(false, "error: calling tbb::concurrent_priority_queue.push(const value_type&) for move-only type"); } using aggregator_type = aggregator; aggregator_type my_aggregator; // Padding added to avoid false sharing char padding1[max_nfs_size - sizeof(aggregator_type)]; // The point at which unsorted elements begin size_type mark; std::atomic my_size; Compare my_compare; // Padding added to avoid false sharing char padding2[max_nfs_size - (2*sizeof(size_type)) - sizeof(Compare)]; //! Storage for the heap of elements in queue, plus unheapified elements /** data has the following structure: binary unheapified heap elements ____|_______|____ | | | v v v [_|...|_|_|...|_| |...| ] 0 ^ ^ ^ | | |__capacity | |__my_size |__mark Thus, data stores the binary heap starting at position 0 through mark-1 (it may be empty). Then there are 0 or more elements that have not yet been inserted into the heap, in positions mark through my_size-1. 
*/ using vector_type = std::vector; vector_type data; friend bool operator==( const concurrent_priority_queue& lhs, const concurrent_priority_queue& rhs ) { return lhs.data == rhs.data; } #if !__TBB_CPP20_COMPARISONS_PRESENT friend bool operator!=( const concurrent_priority_queue& lhs, const concurrent_priority_queue& rhs ) { return !(lhs == rhs); } #endif }; // class concurrent_priority_queue #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT template >, typename Alloc = tbb::cache_aligned_allocator>, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_priority_queue( It, It, Comp = Comp(), Alloc = Alloc() ) -> concurrent_priority_queue, Comp, Alloc>; template >, typename = std::enable_if_t>> concurrent_priority_queue( It, It, Alloc ) -> concurrent_priority_queue, std::less>, Alloc>; template , typename Alloc = tbb::cache_aligned_allocator, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_priority_queue( std::initializer_list, Comp = Comp(), Alloc = Alloc() ) -> concurrent_priority_queue; template >> concurrent_priority_queue( std::initializer_list, Alloc ) -> concurrent_priority_queue, Alloc>; #endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT template void swap( concurrent_priority_queue& lhs, concurrent_priority_queue& rhs ) { lhs.swap(rhs); } } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::concurrent_priority_queue; } // inline namespace v1 } // namespace tbb #endif // __TBB_concurrent_priority_queue_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/concurrent_queue.h ================================================ /* Copyright (c) 2005-2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_concurrent_queue_H #define __TBB_concurrent_queue_H #include "detail/_namespace_injection.h" #include "detail/_concurrent_queue_base.h" #include "detail/_allocator_traits.h" #include "detail/_exception.h" #include "detail/_containers_helpers.h" #include "cache_aligned_allocator.h" namespace tbb { namespace detail { namespace d2 { template std::pair internal_try_pop_impl(void* dst, QueueRep& queue, Allocator& alloc ) { ticket_type ticket{}; do { // Basically, we need to read `head_counter` before `tail_counter`. To achieve it we build happens-before on `head_counter` ticket = queue.head_counter.load(std::memory_order_acquire); do { if (static_cast(queue.tail_counter.load(std::memory_order_relaxed) - ticket) <= 0) { // queue is empty // Queue is empty return { false, ticket }; } // Queue had item with ticket k when we looked. Attempt to get that item. // Another thread snatched the item, retry. } while (!queue.head_counter.compare_exchange_strong(ticket, ticket + 1)); } while (!queue.choose(ticket).pop(dst, ticket, queue, alloc)); return { true, ticket }; } // A high-performance thread-safe non-blocking concurrent queue. // Multiple threads may each push and pop concurrently. 
template > class concurrent_queue { using allocator_traits_type = tbb::detail::allocator_traits; using queue_representation_type = concurrent_queue_rep; using queue_allocator_type = typename allocator_traits_type::template rebind_alloc; using queue_allocator_traits = tbb::detail::allocator_traits; public: using size_type = std::size_t; using value_type = T; using reference = T&; using const_reference = const T&; using difference_type = std::ptrdiff_t; using allocator_type = Allocator; using pointer = typename allocator_traits_type::pointer; using const_pointer = typename allocator_traits_type::const_pointer; using iterator = concurrent_queue_iterator; using const_iterator = concurrent_queue_iterator; concurrent_queue() : concurrent_queue(allocator_type()) {} explicit concurrent_queue(const allocator_type& a) : my_allocator(a), my_queue_representation(nullptr) { my_queue_representation = static_cast(r1::cache_aligned_allocate(sizeof(queue_representation_type))); queue_allocator_traits::construct(my_allocator, my_queue_representation); __TBB_ASSERT(is_aligned(my_queue_representation, max_nfs_size), "alignment error" ); __TBB_ASSERT(is_aligned(&my_queue_representation->head_counter, max_nfs_size), "alignment error" ); __TBB_ASSERT(is_aligned(&my_queue_representation->tail_counter, max_nfs_size), "alignment error" ); __TBB_ASSERT(is_aligned(&my_queue_representation->array, max_nfs_size), "alignment error" ); } template concurrent_queue(InputIterator begin, InputIterator end, const allocator_type& a = allocator_type()) : concurrent_queue(a) { for (; begin != end; ++begin) push(*begin); } concurrent_queue( std::initializer_list init, const allocator_type& alloc = allocator_type() ) : concurrent_queue(init.begin(), init.end(), alloc) {} concurrent_queue(const concurrent_queue& src, const allocator_type& a) : concurrent_queue(a) { my_queue_representation->assign(*src.my_queue_representation, my_allocator, copy_construct_item); } concurrent_queue(const concurrent_queue& 
src) : concurrent_queue(queue_allocator_traits::select_on_container_copy_construction(src.get_allocator())) { my_queue_representation->assign(*src.my_queue_representation, my_allocator, copy_construct_item); } // Move constructors concurrent_queue(concurrent_queue&& src) : concurrent_queue(std::move(src.my_allocator)) { internal_swap(src); } concurrent_queue(concurrent_queue&& src, const allocator_type& a) : concurrent_queue(a) { // checking that memory allocated by one instance of allocator can be deallocated // with another if (my_allocator == src.my_allocator) { internal_swap(src); } else { // allocators are different => performing per-element move my_queue_representation->assign(*src.my_queue_representation, my_allocator, move_construct_item); src.clear(); } } // Destroy queue ~concurrent_queue() { clear(); my_queue_representation->clear(my_allocator); queue_allocator_traits::destroy(my_allocator, my_queue_representation); r1::cache_aligned_deallocate(my_queue_representation); } concurrent_queue& operator=( const concurrent_queue& other ) { //TODO: implement support for std::allocator_traits::propagate_on_container_copy_assignment if (my_queue_representation != other.my_queue_representation) { clear(); my_allocator = other.my_allocator; my_queue_representation->assign(*other.my_queue_representation, my_allocator, copy_construct_item); } return *this; } concurrent_queue& operator=( concurrent_queue&& other ) { //TODO: implement support for std::allocator_traits::propagate_on_container_move_assignment if (my_queue_representation != other.my_queue_representation) { clear(); if (my_allocator == other.my_allocator) { internal_swap(other); } else { my_queue_representation->assign(*other.my_queue_representation, other.my_allocator, move_construct_item); other.clear(); my_allocator = std::move(other.my_allocator); } } return *this; } concurrent_queue& operator=( std::initializer_list init ) { assign(init); return *this; } template void assign( InputIterator first, 
InputIterator last ) { concurrent_queue src(first, last); clear(); my_queue_representation->assign(*src.my_queue_representation, my_allocator, move_construct_item); } void assign( std::initializer_list init ) { assign(init.begin(), init.end()); } void swap ( concurrent_queue& other ) { //TODO: implement support for std::allocator_traits::propagate_on_container_swap __TBB_ASSERT(my_allocator == other.my_allocator, "unequal allocators"); internal_swap(other); } // Enqueue an item at tail of queue. void push(const T& value) { internal_push(value); } void push(T&& value) { internal_push(std::move(value)); } template void emplace( Args&&... args ) { internal_push(std::forward(args)...); } // Attempt to dequeue an item from head of queue. /** Does not wait for item to become available. Returns true if successful; false otherwise. */ bool try_pop( T& result ) { return internal_try_pop(&result); } // Return the number of items in the queue; thread unsafe size_type unsafe_size() const { std::ptrdiff_t size = my_queue_representation->size(); return size < 0 ? 0 : size_type(size); } // Equivalent to size()==0. __TBB_nodiscard bool empty() const { return my_queue_representation->empty(); } // Clear the queue. not thread-safe. void clear() { my_queue_representation->clear(my_allocator); } // Return allocator object allocator_type get_allocator() const { return my_allocator; } //------------------------------------------------------------------------ // The iterators are intended only for debugging. They are slow and not thread safe. 
//------------------------------------------------------------------------ iterator unsafe_begin() { return concurrent_queue_iterator_provider::get(*this); } iterator unsafe_end() { return iterator(); } const_iterator unsafe_begin() const { return concurrent_queue_iterator_provider::get(*this); } const_iterator unsafe_end() const { return const_iterator(); } const_iterator unsafe_cbegin() const { return concurrent_queue_iterator_provider::get(*this); } const_iterator unsafe_cend() const { return const_iterator(); } private: void internal_swap(concurrent_queue& src) { using std::swap; swap(my_queue_representation, src.my_queue_representation); } template void internal_push( Args&&... args ) { ticket_type k = my_queue_representation->tail_counter++; my_queue_representation->choose(k).push(k, *my_queue_representation, my_allocator, std::forward(args)...); } bool internal_try_pop( void* dst ) { return internal_try_pop_impl(dst, *my_queue_representation, my_allocator).first; } template friend class concurrent_queue_iterator; static void copy_construct_item(T* location, const void* src) { // TODO: use allocator_traits for copy construction new (location) value_type(*static_cast(src)); // queue_allocator_traits::construct(my_allocator, location, *static_cast(src)); } static void move_construct_item(T* location, const void* src) { // TODO: use allocator_traits for move construction new (location) value_type(std::move(*static_cast(const_cast(src)))); } queue_allocator_type my_allocator; queue_representation_type* my_queue_representation; friend void swap( concurrent_queue& lhs, concurrent_queue& rhs ) { lhs.swap(rhs); } friend bool operator==( const concurrent_queue& lhs, const concurrent_queue& rhs ) { return lhs.unsafe_size() == rhs.unsafe_size() && std::equal(lhs.unsafe_begin(), lhs.unsafe_end(), rhs.unsafe_begin()); } #if !__TBB_CPP20_COMPARISONS_PRESENT friend bool operator!=( const concurrent_queue& lhs, const concurrent_queue& rhs ) { return !(lhs == rhs); } #endif 
// __TBB_CPP20_COMPARISONS_PRESENT }; // class concurrent_queue #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT // Deduction guide for the constructor from two iterators template >, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_queue( It, It, Alloc = Alloc() ) -> concurrent_queue, Alloc>; #endif /* __TBB_CPP17_DEDUCTION_GUIDES_PRESENT */ class concurrent_monitor; // The concurrent monitor tags for concurrent_bounded_queue. static constexpr std::size_t cbq_slots_avail_tag = 0; static constexpr std::size_t cbq_items_avail_tag = 1; } // namespace d2 namespace r1 { class concurrent_monitor; TBB_EXPORT std::uint8_t* __TBB_EXPORTED_FUNC allocate_bounded_queue_rep( std::size_t queue_rep_size ); TBB_EXPORT void __TBB_EXPORTED_FUNC deallocate_bounded_queue_rep( std::uint8_t* mem, std::size_t queue_rep_size ); TBB_EXPORT void __TBB_EXPORTED_FUNC abort_bounded_queue_monitors( concurrent_monitor* monitors ); TBB_EXPORT void __TBB_EXPORTED_FUNC notify_bounded_queue_monitor( concurrent_monitor* monitors, std::size_t monitor_tag , std::size_t ticket ); TBB_EXPORT void __TBB_EXPORTED_FUNC wait_bounded_queue_monitor( concurrent_monitor* monitors, std::size_t monitor_tag, std::ptrdiff_t target, d1::delegate_base& predicate ); } // namespace r1 namespace d2 { // A high-performance thread-safe blocking concurrent bounded queue. // Supports boundedness and blocking semantics. // Multiple threads may each push and pop concurrently. 
template > class concurrent_bounded_queue { using allocator_traits_type = tbb::detail::allocator_traits; using queue_representation_type = concurrent_queue_rep; using queue_allocator_type = typename allocator_traits_type::template rebind_alloc; using queue_allocator_traits = tbb::detail::allocator_traits; template void internal_wait(r1::concurrent_monitor* monitors, std::size_t monitor_tag, std::ptrdiff_t target, FuncType pred) { d1::delegated_function func(pred); r1::wait_bounded_queue_monitor(monitors, monitor_tag, target, func); } public: using size_type = std::ptrdiff_t; using value_type = T; using reference = T&; using const_reference = const T&; using difference_type = std::ptrdiff_t; using allocator_type = Allocator; using pointer = typename allocator_traits_type::pointer; using const_pointer = typename allocator_traits_type::const_pointer; using iterator = concurrent_queue_iterator; using const_iterator = concurrent_queue_iterator ; concurrent_bounded_queue() : concurrent_bounded_queue(allocator_type()) {} explicit concurrent_bounded_queue( const allocator_type& a ) : my_allocator(a), my_capacity(0), my_abort_counter(0), my_queue_representation(nullptr) { my_queue_representation = reinterpret_cast( r1::allocate_bounded_queue_rep(sizeof(queue_representation_type))); my_monitors = reinterpret_cast(my_queue_representation + 1); queue_allocator_traits::construct(my_allocator, my_queue_representation); my_capacity = std::size_t(-1) / (queue_representation_type::item_size > 1 ? 
queue_representation_type::item_size : 2); __TBB_ASSERT(is_aligned(my_queue_representation, max_nfs_size), "alignment error" ); __TBB_ASSERT(is_aligned(&my_queue_representation->head_counter, max_nfs_size), "alignment error" ); __TBB_ASSERT(is_aligned(&my_queue_representation->tail_counter, max_nfs_size), "alignment error" ); __TBB_ASSERT(is_aligned(&my_queue_representation->array, max_nfs_size), "alignment error" ); } template concurrent_bounded_queue( InputIterator begin, InputIterator end, const allocator_type& a = allocator_type() ) : concurrent_bounded_queue(a) { for (; begin != end; ++begin) push(*begin); } concurrent_bounded_queue( std::initializer_list init, const allocator_type& alloc = allocator_type() ): concurrent_bounded_queue(init.begin(), init.end(), alloc) {} concurrent_bounded_queue( const concurrent_bounded_queue& src, const allocator_type& a ) : concurrent_bounded_queue(a) { my_capacity = src.my_capacity; my_queue_representation->assign(*src.my_queue_representation, my_allocator, copy_construct_item); } concurrent_bounded_queue( const concurrent_bounded_queue& src ) : concurrent_bounded_queue(queue_allocator_traits::select_on_container_copy_construction(src.get_allocator())) { my_capacity = src.my_capacity; my_queue_representation->assign(*src.my_queue_representation, my_allocator, copy_construct_item); } // Move constructors concurrent_bounded_queue( concurrent_bounded_queue&& src ) : concurrent_bounded_queue(std::move(src.my_allocator)) { internal_swap(src); } concurrent_bounded_queue( concurrent_bounded_queue&& src, const allocator_type& a ) : concurrent_bounded_queue(a) { // checking that memory allocated by one instance of allocator can be deallocated // with another if (my_allocator == src.my_allocator) { internal_swap(src); } else { // allocators are different => performing per-element move my_queue_representation->assign(*src.my_queue_representation, my_allocator, move_construct_item); src.clear(); } } // Destroy queue 
~concurrent_bounded_queue() { clear(); my_queue_representation->clear(my_allocator); queue_allocator_traits::destroy(my_allocator, my_queue_representation); r1::deallocate_bounded_queue_rep(reinterpret_cast(my_queue_representation), sizeof(queue_representation_type)); } concurrent_bounded_queue& operator=( const concurrent_bounded_queue& other ) { //TODO: implement support for std::allocator_traits::propagate_on_container_copy_assignment if (my_queue_representation != other.my_queue_representation) { clear(); my_allocator = other.my_allocator; my_capacity = other.my_capacity; my_queue_representation->assign(*other.my_queue_representation, my_allocator, copy_construct_item); } return *this; } concurrent_bounded_queue& operator=( concurrent_bounded_queue&& other ) { //TODO: implement support for std::allocator_traits::propagate_on_container_move_assignment if (my_queue_representation != other.my_queue_representation) { clear(); if (my_allocator == other.my_allocator) { internal_swap(other); } else { my_queue_representation->assign(*other.my_queue_representation, other.my_allocator, move_construct_item); other.clear(); my_allocator = std::move(other.my_allocator); my_capacity = other.my_capacity; } } return *this; } concurrent_bounded_queue& operator=( std::initializer_list init ) { assign(init); return *this; } template void assign( InputIterator first, InputIterator last ) { concurrent_bounded_queue src(first, last); clear(); my_queue_representation->assign(*src.my_queue_representation, my_allocator, move_construct_item); } void assign( std::initializer_list init ) { assign(init.begin(), init.end()); } void swap ( concurrent_bounded_queue& other ) { //TODO: implement support for std::allocator_traits::propagate_on_container_swap __TBB_ASSERT(my_allocator == other.my_allocator, "unequal allocators"); internal_swap(other); } // Enqueue an item at tail of queue. 
void push( const T& value ) { internal_push(value); } void push( T&& value ) { internal_push(std::move(value)); } // Enqueue an item at tail of queue if queue is not already full. // Does not wait for queue to become not full. // Returns true if item is pushed; false if queue was already full. bool try_push( const T& value ) { return internal_push_if_not_full(value); } bool try_push( T&& value ) { return internal_push_if_not_full(std::move(value)); } template void emplace( Args&&... args ) { internal_push(std::forward(args)...); } template bool try_emplace( Args&&... args ) { return internal_push_if_not_full(std::forward(args)...); } // Attempt to dequeue an item from head of queue. void pop( T& result ) { internal_pop(&result); } /** Does not wait for item to become available. Returns true if successful; false otherwise. */ bool try_pop( T& result ) { return internal_pop_if_present(&result); } void abort() { internal_abort(); } // Return the number of items in the queue; thread unsafe std::ptrdiff_t size() const { return my_queue_representation->size(); } void set_capacity( size_type new_capacity ) { std::ptrdiff_t c = new_capacity < 0 ? infinite_capacity : new_capacity; my_capacity = c; } size_type capacity() const { return my_capacity; } // Equivalent to size()==0. __TBB_nodiscard bool empty() const { return my_queue_representation->empty(); } // Clear the queue. not thread-safe. void clear() { my_queue_representation->clear(my_allocator); } // Return allocator object allocator_type get_allocator() const { return my_allocator; } //------------------------------------------------------------------------ // The iterators are intended only for debugging. They are slow and not thread safe. 
//------------------------------------------------------------------------ iterator unsafe_begin() { return concurrent_queue_iterator_provider::get(*this); } iterator unsafe_end() { return iterator(); } const_iterator unsafe_begin() const { return concurrent_queue_iterator_provider::get(*this); } const_iterator unsafe_end() const { return const_iterator(); } const_iterator unsafe_cbegin() const { return concurrent_queue_iterator_provider::get(*this); } const_iterator unsafe_cend() const { return const_iterator(); } private: void internal_swap( concurrent_bounded_queue& src ) { using std::swap; swap(my_queue_representation, src.my_queue_representation); swap(my_capacity, src.my_capacity); swap(my_monitors, src.my_monitors); } static constexpr std::ptrdiff_t infinite_capacity = std::ptrdiff_t(~size_type(0) / 2); template void internal_push( Args&&... args ) { unsigned old_abort_counter = my_abort_counter.load(std::memory_order_relaxed); ticket_type ticket = my_queue_representation->tail_counter++; std::ptrdiff_t target = ticket - my_capacity; if (static_cast(my_queue_representation->head_counter.load(std::memory_order_relaxed)) <= target) { // queue is full auto pred = [&] { if (my_abort_counter.load(std::memory_order_relaxed) != old_abort_counter) { throw_exception(exception_id::user_abort); } return static_cast(my_queue_representation->head_counter.load(std::memory_order_relaxed)) <= target; }; try_call( [&] { internal_wait(my_monitors, cbq_slots_avail_tag, target, pred); }).on_exception( [&] { my_queue_representation->choose(ticket).abort_push(ticket, *my_queue_representation, my_allocator); }); } __TBB_ASSERT((static_cast(my_queue_representation->head_counter.load(std::memory_order_relaxed)) > target), nullptr); my_queue_representation->choose(ticket).push(ticket, *my_queue_representation, my_allocator, std::forward(args)...); r1::notify_bounded_queue_monitor(my_monitors, cbq_items_avail_tag, ticket); } template bool internal_push_if_not_full( Args&&... 
args ) { ticket_type ticket = my_queue_representation->tail_counter.load(std::memory_order_relaxed); do { if (static_cast(ticket - my_queue_representation->head_counter.load(std::memory_order_relaxed)) >= my_capacity) { // Queue is full return false; } // Queue had empty slot with ticket k when we looked. Attempt to claim that slot. // Another thread claimed the slot, so retry. } while (!my_queue_representation->tail_counter.compare_exchange_strong(ticket, ticket + 1)); my_queue_representation->choose(ticket).push(ticket, *my_queue_representation, my_allocator, std::forward(args)...); r1::notify_bounded_queue_monitor(my_monitors, cbq_items_avail_tag, ticket); return true; } void internal_pop( void* dst ) { std::ptrdiff_t target; // This loop is a single pop operation; abort_counter should not be re-read inside unsigned old_abort_counter = my_abort_counter.load(std::memory_order_relaxed); do { target = my_queue_representation->head_counter++; if (static_cast(my_queue_representation->tail_counter.load(std::memory_order_relaxed)) <= target) { auto pred = [&] { if (my_abort_counter.load(std::memory_order_relaxed) != old_abort_counter) { throw_exception(exception_id::user_abort); } return static_cast(my_queue_representation->tail_counter.load(std::memory_order_relaxed)) <= target; }; try_call( [&] { internal_wait(my_monitors, cbq_items_avail_tag, target, pred); }).on_exception( [&] { my_queue_representation->head_counter--; }); } __TBB_ASSERT(static_cast(my_queue_representation->tail_counter.load(std::memory_order_relaxed)) > target, nullptr); } while (!my_queue_representation->choose(target).pop(dst, target, *my_queue_representation, my_allocator)); r1::notify_bounded_queue_monitor(my_monitors, cbq_slots_avail_tag, target); } bool internal_pop_if_present( void* dst ) { bool present{}; ticket_type ticket{}; std::tie(present, ticket) = internal_try_pop_impl(dst, *my_queue_representation, my_allocator); if (present) { r1::notify_bounded_queue_monitor(my_monitors, 
cbq_slots_avail_tag, ticket); } return present; } void internal_abort() { ++my_abort_counter; r1::abort_bounded_queue_monitors(my_monitors); } static void copy_construct_item(T* location, const void* src) { // TODO: use allocator_traits for copy construction new (location) value_type(*static_cast(src)); } static void move_construct_item(T* location, const void* src) { // TODO: use allocator_traits for move construction new (location) value_type(std::move(*static_cast(const_cast(src)))); } template friend class concurrent_queue_iterator; queue_allocator_type my_allocator; std::ptrdiff_t my_capacity; std::atomic my_abort_counter; queue_representation_type* my_queue_representation; r1::concurrent_monitor* my_monitors; friend void swap( concurrent_bounded_queue& lhs, concurrent_bounded_queue& rhs ) { lhs.swap(rhs); } friend bool operator==( const concurrent_bounded_queue& lhs, const concurrent_bounded_queue& rhs ) { return lhs.size() == rhs.size() && std::equal(lhs.unsafe_begin(), lhs.unsafe_end(), rhs.unsafe_begin()); } #if !__TBB_CPP20_COMPARISONS_PRESENT friend bool operator!=( const concurrent_bounded_queue& lhs, const concurrent_bounded_queue& rhs ) { return !(lhs == rhs); } #endif // __TBB_CPP20_COMPARISONS_PRESENT }; // class concurrent_bounded_queue #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT // Deduction guide for the constructor from two iterators template >> concurrent_bounded_queue( It, It, Alloc = Alloc() ) -> concurrent_bounded_queue, Alloc>; #endif /* __TBB_CPP17_DEDUCTION_GUIDES_PRESENT */ } //namespace d2 } // namespace detail inline namespace v1 { using detail::d2::concurrent_queue; using detail::d2::concurrent_bounded_queue; using detail::r1::user_abort; using detail::r1::bad_last_alloc; } // inline namespace v1 } // namespace tbb #endif // __TBB_concurrent_queue_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/concurrent_set.h ================================================ /* Copyright (c) 2019-2021 Intel 
Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_concurrent_set_H #define __TBB_concurrent_set_H #include "detail/_namespace_injection.h" #include "detail/_concurrent_skip_list.h" #include "tbb_allocator.h" #include #include namespace tbb { namespace detail { namespace d2 { template struct set_traits { static constexpr std::size_t max_level = RandomGenerator::max_level; using random_level_generator_type = RandomGenerator; using key_type = Key; using value_type = key_type; using compare_type = KeyCompare; using value_compare = compare_type; using reference = value_type&; using const_reference = const value_type&; using allocator_type = Allocator; static constexpr bool allow_multimapping = AllowMultimapping; static const key_type& get_key(const_reference val) { return val; } static value_compare value_comp(compare_type comp) { return comp; } }; // struct set_traits template class concurrent_multiset; template , typename Allocator = tbb::tbb_allocator> class concurrent_set : public concurrent_skip_list, Allocator, false>> { using base_type = concurrent_skip_list, Allocator, false>>; public: using key_type = Key; using value_type = typename base_type::value_type; using size_type = typename base_type::size_type; using difference_type = typename base_type::difference_type; using key_compare = Compare; using value_compare = typename base_type::value_compare; using allocator_type = Allocator; using reference = typename base_type::reference; using const_reference = typename 
base_type::const_reference; using pointer = typename base_type::pointer; using const_pointer = typename base_type::const_pointer; using iterator = typename base_type::iterator; using const_iterator = typename base_type::const_iterator; using node_type = typename base_type::node_type; // Include constructors of base_type using base_type::base_type; // Required for implicit deduction guides concurrent_set() = default; concurrent_set( const concurrent_set& ) = default; concurrent_set( const concurrent_set& other, const allocator_type& alloc ) : base_type(other, alloc) {} concurrent_set( concurrent_set&& ) = default; concurrent_set( concurrent_set&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} // Required to respect the rule of 5 concurrent_set& operator=( const concurrent_set& ) = default; concurrent_set& operator=( concurrent_set&& ) = default; concurrent_set& operator=( std::initializer_list il ) { base_type::operator= (il); return *this; } template void merge(concurrent_set& source) { this->internal_merge(source); } template void merge(concurrent_set&& source) { this->internal_merge(std::move(source)); } template void merge(concurrent_multiset& source) { this->internal_merge(source); } template void merge(concurrent_multiset&& source) { this->internal_merge(std::move(source)); } }; // class concurrent_set #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT template >, typename Alloc = tbb::tbb_allocator>, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_set( It, It, Comp = Comp(), Alloc = Alloc() ) -> concurrent_set, Comp, Alloc>; template , typename Alloc = tbb::tbb_allocator, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_set( std::initializer_list, Comp = Comp(), Alloc = Alloc() ) -> concurrent_set; template >, typename = std::enable_if_t>> concurrent_set( It, It, Alloc ) -> concurrent_set, std::less>, Alloc>; template >> concurrent_set( std::initializer_list, Alloc 
) -> concurrent_set, Alloc>; #endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT template void swap( concurrent_set& lhs, concurrent_set& rhs ) { lhs.swap(rhs); } template , typename Allocator = tbb::tbb_allocator> class concurrent_multiset : public concurrent_skip_list, Allocator, true>> { using base_type = concurrent_skip_list, Allocator, true>>; public: using key_type = Key; using value_type = typename base_type::value_type; using size_type = typename base_type::size_type; using difference_type = typename base_type::difference_type; using key_compare = Compare; using value_compare = typename base_type::value_compare; using allocator_type = Allocator; using reference = typename base_type::reference; using const_reference = typename base_type::const_reference; using pointer = typename base_type::pointer; using const_pointer = typename base_type::const_pointer; using iterator = typename base_type::iterator; using const_iterator = typename base_type::const_iterator; using node_type = typename base_type::node_type; // Include constructors of base_type; using base_type::base_type; // Required for implicit deduction guides concurrent_multiset() = default; concurrent_multiset( const concurrent_multiset& ) = default; concurrent_multiset( const concurrent_multiset& other, const allocator_type& alloc ) : base_type(other, alloc) {} concurrent_multiset( concurrent_multiset&& ) = default; concurrent_multiset( concurrent_multiset&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} // Required to respect the rule of 5 concurrent_multiset& operator=( const concurrent_multiset& ) = default; concurrent_multiset& operator=( concurrent_multiset&& ) = default; concurrent_multiset& operator=( std::initializer_list il ) { base_type::operator= (il); return *this; } template void merge(concurrent_set& source) { this->internal_merge(source); } template void merge(concurrent_set&& source) { this->internal_merge(std::move(source)); } template void 
merge(concurrent_multiset& source) { this->internal_merge(source); } template void merge(concurrent_multiset&& source) { this->internal_merge(std::move(source)); } }; // class concurrent_multiset #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT template >, typename Alloc = tbb::tbb_allocator>, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_multiset( It, It, Comp = Comp(), Alloc = Alloc() ) -> concurrent_multiset, Comp, Alloc>; template , typename Alloc = tbb::tbb_allocator, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_multiset( std::initializer_list, Comp = Comp(), Alloc = Alloc() ) -> concurrent_multiset; template >, typename = std::enable_if_t>> concurrent_multiset( It, It, Alloc ) -> concurrent_multiset, std::less>, Alloc>; template >> concurrent_multiset( std::initializer_list, Alloc ) -> concurrent_multiset, Alloc>; #endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT template void swap( concurrent_multiset& lhs, concurrent_multiset& rhs ) { lhs.swap(rhs); } } // namespace d2 } // namespace detail inline namespace v1 { using detail::d2::concurrent_set; using detail::d2::concurrent_multiset; using detail::split; } // inline namespace v1 } // namespace tbb #endif // __TBB_concurrent_set_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/concurrent_unordered_map.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_concurrent_unordered_map_H #define __TBB_concurrent_unordered_map_H #include "detail/_namespace_injection.h" #include "detail/_concurrent_unordered_base.h" #include "tbb_allocator.h" #include namespace tbb { namespace detail { namespace d2 { template struct concurrent_unordered_map_traits { using value_type = std::pair; using key_type = Key; using allocator_type = Allocator; using hash_compare_type = d1::hash_compare; static constexpr bool allow_multimapping = AllowMultimapping; static constexpr const key_type& get_key( const value_type& value ) { return value.first; } }; // struct concurrent_unordered_map_traits template class concurrent_unordered_multimap; template , typename KeyEqual = std::equal_to, typename Allocator = tbb::tbb_allocator> > class concurrent_unordered_map : public concurrent_unordered_base> { using traits_type = concurrent_unordered_map_traits; using base_type = concurrent_unordered_base; public: using key_type = typename base_type::key_type; using mapped_type = T; using value_type = typename base_type::value_type; using size_type = typename base_type::size_type; using difference_type = typename base_type::difference_type; using hasher = typename base_type::hasher; using key_equal = typename base_type::key_equal; using allocator_type = typename base_type::allocator_type; using reference = typename base_type::reference; using const_reference = typename base_type::const_reference; using pointer = typename base_type::pointer; using const_pointer = typename base_type::const_pointer; using iterator = typename base_type::iterator; using const_iterator = typename base_type::const_iterator; using local_iterator = typename base_type::local_iterator; using const_local_iterator = typename base_type::const_local_iterator; using node_type = typename base_type::node_type; // Include constructors of base type using base_type::base_type; // 
Required for implicit deduction guides concurrent_unordered_map() = default; concurrent_unordered_map( const concurrent_unordered_map& ) = default; concurrent_unordered_map( const concurrent_unordered_map& other, const allocator_type& alloc ) : base_type(other, alloc) {} concurrent_unordered_map( concurrent_unordered_map&& ) = default; concurrent_unordered_map( concurrent_unordered_map&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} // Required to respect the rule of 5 concurrent_unordered_map& operator=( const concurrent_unordered_map& ) = default; concurrent_unordered_map& operator=( concurrent_unordered_map&& ) = default; concurrent_unordered_map& operator=( std::initializer_list il ) { base_type::operator= (il); return *this; } // Observers mapped_type& operator[]( const key_type& key ) { iterator where = this->find(key); if (where == this->end()) { where = this->emplace(std::piecewise_construct, std::forward_as_tuple(key), std::tuple<>()).first; } return where->second; } mapped_type& operator[]( key_type&& key ) { iterator where = this->find(key); if (where == this->end()) { where = this->emplace(std::piecewise_construct, std::forward_as_tuple(std::move(key)), std::tuple<>()).first; } return where->second; } mapped_type& at( const key_type& key ) { iterator where = this->find(key); if (where == this->end()) { throw_exception(exception_id::invalid_key); } return where->second; } const mapped_type& at( const key_type& key ) const { const_iterator where = this->find(key); if (where == this->end()) { /* A missing key is reported with the same exception id as the non-const overload above, so both overloads fail identically. */ throw_exception(exception_id::invalid_key); } return where->second; } using base_type::insert; template typename std::enable_if::value, std::pair>::type insert( P&& value ) { return this->emplace(std::forward

(value)); } template typename std::enable_if::value, iterator>::type insert( const_iterator hint, P&& value ) { return this->emplace_hint(hint, std::forward

(value)); } template void merge( concurrent_unordered_map& source ) { this->internal_merge(source); } template void merge( concurrent_unordered_map&& source ) { this->internal_merge(std::move(source)); } template void merge( concurrent_unordered_multimap& source ) { this->internal_merge(source); } template void merge( concurrent_unordered_multimap&& source ) { this->internal_merge(std::move(source)); } }; // class concurrent_unordered_map #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT template >, typename KeyEq = std::equal_to>, typename Alloc = tbb::tbb_allocator>, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_unordered_map( It, It, std::size_t = {}, Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) -> concurrent_unordered_map, iterator_mapped_t, Hash, KeyEq, Alloc>; template >, typename KeyEq = std::equal_to>, typename Alloc = tbb::tbb_allocator>, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_unordered_map( std::initializer_list>, std::size_t = {}, Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) -> concurrent_unordered_map, T, Hash, KeyEq, Alloc>; template >, typename = std::enable_if_t>> concurrent_unordered_map( It, It, std::size_t, Alloc ) -> concurrent_unordered_map, iterator_mapped_t, std::hash>, std::equal_to>, Alloc>; // TODO: investigate if a deduction guide for concurrent_unordered_map(It, It, Alloc) is needed template >, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_unordered_map( It, It, std::size_t, Hash, Alloc ) -> concurrent_unordered_map, iterator_mapped_t, Hash, std::equal_to>, Alloc>; template >> concurrent_unordered_map( std::initializer_list>, std::size_t, Alloc ) -> concurrent_unordered_map, T, std::hash>, std::equal_to>, Alloc>; template >> concurrent_unordered_map( std::initializer_list>, Alloc 
) -> concurrent_unordered_map, T, std::hash>, std::equal_to>, Alloc>; template >, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_unordered_map( std::initializer_list>, std::size_t, Hash, Alloc ) -> concurrent_unordered_map, T, Hash, std::equal_to>, Alloc>; #if __APPLE__ && __TBB_CLANG_VERSION == 100000 // An explicit deduction guide is required for copy/move constructor with allocator for APPLE LLVM 10.0.0 // due to an issue with generating an implicit deduction guide for these constructors under several strange circumstances. // Currently the issue takes place because the last template parameter for Traits is boolean, it should not affect the deduction guides // The issue reproduces only on this version of the compiler template concurrent_unordered_map( concurrent_unordered_map, Alloc ) -> concurrent_unordered_map; #endif #endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT template void swap( concurrent_unordered_map& lhs, concurrent_unordered_map& rhs ) { lhs.swap(rhs); } template , typename KeyEqual = std::equal_to, typename Allocator = tbb::tbb_allocator> > class concurrent_unordered_multimap : public concurrent_unordered_base> { using traits_type = concurrent_unordered_map_traits; using base_type = concurrent_unordered_base; public: using key_type = typename base_type::key_type; using mapped_type = T; using value_type = typename base_type::value_type; using size_type = typename base_type::size_type; using difference_type = typename base_type::difference_type; using hasher = typename base_type::hasher; using key_equal = typename base_type::key_equal; using allocator_type = typename base_type::allocator_type; using reference = typename base_type::reference; using const_reference = typename base_type::const_reference; using pointer = typename base_type::pointer; using const_pointer = typename base_type::const_pointer; using iterator = typename base_type::iterator; using const_iterator = typename base_type::const_iterator; using local_iterator = 
typename base_type::local_iterator; using const_local_iterator = typename base_type::const_local_iterator; using node_type = typename base_type::node_type; // Include constructors of base type using base_type::base_type; using base_type::insert; // Required for implicit deduction guides concurrent_unordered_multimap() = default; concurrent_unordered_multimap( const concurrent_unordered_multimap& ) = default; concurrent_unordered_multimap( const concurrent_unordered_multimap& other, const allocator_type& alloc ) : base_type(other, alloc) {} concurrent_unordered_multimap( concurrent_unordered_multimap&& ) = default; concurrent_unordered_multimap( concurrent_unordered_multimap&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} // Required to respect the rule of 5 concurrent_unordered_multimap& operator=( const concurrent_unordered_multimap& ) = default; concurrent_unordered_multimap& operator=( concurrent_unordered_multimap&& ) = default; concurrent_unordered_multimap& operator=( std::initializer_list il ) { base_type::operator= (il); return *this; } template typename std::enable_if::value, std::pair>::type insert( P&& value ) { return this->emplace(std::forward

(value)); } template typename std::enable_if::value, iterator>::type insert( const_iterator hint, P&& value ) { return this->emplace_hint(hint, std::forward(value)); } template void merge( concurrent_unordered_map& source ) { this->internal_merge(source); } template void merge( concurrent_unordered_map&& source ) { this->internal_merge(std::move(source)); } template void merge( concurrent_unordered_multimap& source ) { this->internal_merge(source); } template void merge( concurrent_unordered_multimap&& source ) { this->internal_merge(std::move(source)); } }; // class concurrent_unordered_multimap #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT template >, typename KeyEq = std::equal_to>, typename Alloc = tbb::tbb_allocator>, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_unordered_multimap( It, It, std::size_t = {}, Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) -> concurrent_unordered_multimap, iterator_mapped_t, Hash, KeyEq, Alloc>; template >, typename KeyEq = std::equal_to>, typename Alloc = tbb::tbb_allocator>, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_unordered_multimap( std::initializer_list>, std::size_t = {}, Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) -> concurrent_unordered_multimap, T, Hash, KeyEq, Alloc>; template >, typename = std::enable_if_t>> concurrent_unordered_multimap( It, It, std::size_t, Alloc ) -> concurrent_unordered_multimap, iterator_mapped_t, std::hash>, std::equal_to>, Alloc>; template >, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_unordered_multimap( It, It, std::size_t, Hash, Alloc ) -> concurrent_unordered_multimap, iterator_mapped_t, Hash, std::equal_to>, Alloc>; template >> concurrent_unordered_multimap( std::initializer_list>, std::size_t, Alloc ) -> 
concurrent_unordered_multimap, T, std::hash>, std::equal_to>, Alloc>; template >> concurrent_unordered_multimap( std::initializer_list>, Alloc ) -> concurrent_unordered_multimap, T, std::hash>, std::equal_to>, Alloc>; template >, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_unordered_multimap( std::initializer_list>, std::size_t, Hash, Alloc ) -> concurrent_unordered_multimap, T, Hash, std::equal_to>, Alloc>; #if __APPLE__ && __TBB_CLANG_VERSION == 100000 // An explicit deduction guide is required for copy/move constructor with allocator for APPLE LLVM 10.0.0 // due to an issue with generating an implicit deduction guide for these constructors under several strange circumstances. // Currently the issue takes place because the last template parameter for Traits is boolean, it should not affect the deduction guides // The issue reproduces only on this version of the compiler template concurrent_unordered_multimap( concurrent_unordered_multimap, Alloc ) -> concurrent_unordered_multimap; #endif #endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT template void swap( concurrent_unordered_multimap& lhs, concurrent_unordered_multimap& rhs ) { lhs.swap(rhs); } } // namespace d2 } // namespace detail inline namespace v1 { using detail::d2::concurrent_unordered_map; using detail::d2::concurrent_unordered_multimap; using detail::split; } // inline namespace v1 } // namespace tbb #endif // __TBB_concurrent_unordered_map_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/concurrent_unordered_set.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_concurrent_unordered_set_H #define __TBB_concurrent_unordered_set_H #include "detail/_namespace_injection.h" #include "detail/_concurrent_unordered_base.h" #include "tbb_allocator.h" namespace tbb { namespace detail { namespace d2 { template struct concurrent_unordered_set_traits { using key_type = Key; using value_type = key_type; using allocator_type = Allocator; using hash_compare_type = d1::hash_compare; static constexpr bool allow_multimapping = AllowMultimapping; static constexpr const key_type& get_key( const value_type& value ) { return value; } }; // class concurrent_unordered_set_traits template class concurrent_unordered_multiset; template , typename KeyEqual = std::equal_to, typename Allocator = tbb::tbb_allocator> class concurrent_unordered_set : public concurrent_unordered_base> { using traits_type = concurrent_unordered_set_traits; using base_type = concurrent_unordered_base; public: using key_type = typename base_type::key_type; using value_type = typename base_type::value_type; using size_type = typename base_type::size_type; using difference_type = typename base_type::difference_type; using hasher = typename base_type::hasher; using key_equal = typename base_type::key_equal; using allocator_type = typename base_type::allocator_type; using reference = typename base_type::reference; using const_reference = typename base_type::const_reference; using pointer = typename base_type::pointer; using const_pointer = typename base_type::const_pointer; using iterator = typename base_type::iterator; using const_iterator = typename base_type::const_iterator; 
using local_iterator = typename base_type::local_iterator; using const_local_iterator = typename base_type::const_local_iterator; using node_type = typename base_type::node_type; // Include constructors of base_type; using base_type::base_type; // Required for implicit deduction guides concurrent_unordered_set() = default; concurrent_unordered_set( const concurrent_unordered_set& ) = default; concurrent_unordered_set( const concurrent_unordered_set& other, const allocator_type& alloc ) : base_type(other, alloc) {} concurrent_unordered_set( concurrent_unordered_set&& ) = default; concurrent_unordered_set( concurrent_unordered_set&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} // Required to respect the rule of 5 concurrent_unordered_set& operator=( const concurrent_unordered_set& ) = default; concurrent_unordered_set& operator=( concurrent_unordered_set&& ) = default; concurrent_unordered_set& operator=( std::initializer_list il ) { base_type::operator= (il); return *this; } template void merge( concurrent_unordered_set& source ) { this->internal_merge(source); } template void merge( concurrent_unordered_set&& source ) { this->internal_merge(std::move(source)); } template void merge( concurrent_unordered_multiset& source ) { this->internal_merge(source); } template void merge( concurrent_unordered_multiset&& source ) { this->internal_merge(std::move(source)); } }; // class concurrent_unordered_set #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT template >, typename KeyEq = std::equal_to>, typename Alloc = tbb::tbb_allocator>, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_unordered_set( It, It, std::size_t = {}, Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) -> concurrent_unordered_set, Hash, KeyEq, Alloc>; template , typename KeyEq = std::equal_to, typename Alloc = tbb::tbb_allocator, typename = std::enable_if_t>, typename = 
std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_unordered_set( std::initializer_list, std::size_t = {}, Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) -> concurrent_unordered_set; template >, typename = std::enable_if_t>> concurrent_unordered_set( It, It, std::size_t, Alloc ) -> concurrent_unordered_set, std::hash>, std::equal_to>, Alloc>; template >, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_unordered_set( It, It, std::size_t, Hash, Alloc ) -> concurrent_unordered_set, Hash, std::equal_to>, Alloc>; template >> concurrent_unordered_set( std::initializer_list, std::size_t, Alloc ) -> concurrent_unordered_set, std::equal_to, Alloc>; template >> concurrent_unordered_set( std::initializer_list, Alloc ) -> concurrent_unordered_set, std::equal_to, Alloc>; template >, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_unordered_set( std::initializer_list, std::size_t, Hash, Alloc ) -> concurrent_unordered_set, Alloc>; #if __APPLE__ && __TBB_CLANG_VERSION == 100000 // An explicit deduction guide is required for copy/move constructor with allocator for APPLE LLVM 10.0.0 // due to an issue with generating an implicit deduction guide for these constructors under several strange circumstances. 
// Currently the issue takes place because the last template parameter for Traits is boolean, it should not affect the deduction guides // The issue reproduces only on this version of the compiler template concurrent_unordered_set( concurrent_unordered_set, Alloc ) -> concurrent_unordered_set; #endif #endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT template void swap( concurrent_unordered_set& lhs, concurrent_unordered_set& rhs ) { lhs.swap(rhs); } template , typename KeyEqual = std::equal_to, typename Allocator = tbb::tbb_allocator> class concurrent_unordered_multiset : public concurrent_unordered_base> { using traits_type = concurrent_unordered_set_traits; using base_type = concurrent_unordered_base; public: using key_type = typename base_type::key_type; using value_type = typename base_type::value_type; using size_type = typename base_type::size_type; using difference_type = typename base_type::difference_type; using hasher = typename base_type::hasher; using key_equal = typename base_type::key_equal; using allocator_type = typename base_type::allocator_type; using reference = typename base_type::reference; using const_reference = typename base_type::const_reference; using pointer = typename base_type::pointer; using const_pointer = typename base_type::const_pointer; using iterator = typename base_type::iterator; using const_iterator = typename base_type::const_iterator; using local_iterator = typename base_type::local_iterator; using const_local_iterator = typename base_type::const_local_iterator; using node_type = typename base_type::node_type; // Include constructors of base_type; using base_type::base_type; // Required for implicit deduction guides concurrent_unordered_multiset() = default; concurrent_unordered_multiset( const concurrent_unordered_multiset& ) = default; concurrent_unordered_multiset( const concurrent_unordered_multiset& other, const allocator_type& alloc ) : base_type(other, alloc) {} concurrent_unordered_multiset( 
concurrent_unordered_multiset&& ) = default; concurrent_unordered_multiset( concurrent_unordered_multiset&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} // Required to respect the rule of 5 concurrent_unordered_multiset& operator=( const concurrent_unordered_multiset& ) = default; concurrent_unordered_multiset& operator=( concurrent_unordered_multiset&& ) = default; concurrent_unordered_multiset& operator=( std::initializer_list il ) { base_type::operator= (il); return *this; } template void merge( concurrent_unordered_set& source ) { this->internal_merge(source); } template void merge( concurrent_unordered_set&& source ) { this->internal_merge(std::move(source)); } template void merge( concurrent_unordered_multiset& source ) { this->internal_merge(source); } template void merge( concurrent_unordered_multiset&& source ) { this->internal_merge(std::move(source)); } }; // class concurrent_unordered_multiset #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT template >, typename KeyEq = std::equal_to>, typename Alloc = tbb::tbb_allocator>, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_unordered_multiset( It, It, std::size_t = {}, Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) -> concurrent_unordered_multiset, Hash, KeyEq, Alloc>; template , typename KeyEq = std::equal_to, typename Alloc = tbb::tbb_allocator, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_unordered_multiset( std::initializer_list, std::size_t = {}, Hash = Hash(), KeyEq = KeyEq(), Alloc = Alloc() ) -> concurrent_unordered_multiset; template >, typename = std::enable_if_t>> concurrent_unordered_multiset( It, It, std::size_t, Alloc ) -> concurrent_unordered_multiset, std::hash>, std::equal_to>, Alloc>; template >, typename = std::enable_if_t>, typename = std::enable_if_t>, typename = 
std::enable_if_t>> concurrent_unordered_multiset( It, It, std::size_t, Hash, Alloc ) -> concurrent_unordered_multiset, Hash, std::equal_to>, Alloc>; template >> concurrent_unordered_multiset( std::initializer_list, std::size_t, Alloc ) -> concurrent_unordered_multiset, std::equal_to, Alloc>; template >> concurrent_unordered_multiset( std::initializer_list, Alloc ) -> concurrent_unordered_multiset, std::equal_to, Alloc>; template >, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_unordered_multiset( std::initializer_list, std::size_t, Hash, Alloc ) -> concurrent_unordered_multiset, Alloc>; #if __APPLE__ && __TBB_CLANG_VERSION == 100000 // An explicit deduction guide is required for copy/move constructor with allocator for APPLE LLVM 10.0.0 // due to an issue with generating an implicit deduction guide for these constructors under several strange circumstances. // Currently the issue takes place because the last template parameter for Traits is boolean, it should not affect the deduction guides // The issue reproduces only on this version of the compiler template concurrent_unordered_multiset( concurrent_unordered_multiset, Alloc ) -> concurrent_unordered_multiset; #endif #endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT template void swap( concurrent_unordered_multiset& lhs, concurrent_unordered_multiset& rhs ) { lhs.swap(rhs); } } // namespace d2 } // namespace detail inline namespace v1 { using detail::d2::concurrent_unordered_set; using detail::d2::concurrent_unordered_multiset; using detail::split; } // inline namespace v1 } // namespace tbb #endif // __TBB_concurrent_unordered_set_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/concurrent_vector.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_concurrent_vector_H #define __TBB_concurrent_vector_H #include "detail/_namespace_injection.h" #include "detail/_utils.h" #include "detail/_assert.h" #include "detail/_allocator_traits.h" #include "detail/_segment_table.h" #include "detail/_containers_helpers.h" #include "blocked_range.h" #include "cache_aligned_allocator.h" #include #include // std::move_if_noexcept #include #if __TBB_CPP20_COMPARISONS_PRESENT #include #endif namespace tbb { namespace detail { namespace d1 { template class vector_iterator { using vector_type = Vector; public: using value_type = Value; using size_type = typename vector_type::size_type; using difference_type = typename vector_type::difference_type; using pointer = value_type*; using reference = value_type&; using iterator_category = std::random_access_iterator_tag; template friend vector_iterator operator+( typename vector_iterator::difference_type, const vector_iterator& ); template friend typename vector_iterator::difference_type operator-( const vector_iterator&, const vector_iterator& ); template friend bool operator==( const vector_iterator&, const vector_iterator& ); template friend bool operator<( const vector_iterator&, const vector_iterator& ); template friend class vector_iterator; template friend class concurrent_vector; private: vector_iterator( const vector_type& vector, size_type index, value_type* item = nullptr ) : my_vector(const_cast(&vector)), my_index(index), my_item(item) {} public: vector_iterator() : my_vector(nullptr), my_index(~size_type(0)), my_item(nullptr) {} vector_iterator( const vector_iterator& other ) 
: my_vector(other.my_vector), my_index(other.my_index), my_item(other.my_item) {} vector_iterator& operator=( const vector_iterator& other ) { my_vector = other.my_vector; my_index = other.my_index; my_item = other.my_item; return *this; } vector_iterator operator+( difference_type offset ) const { return vector_iterator(*my_vector, my_index + offset); } vector_iterator& operator+=( difference_type offset ) { my_index += offset; my_item = nullptr; return *this; } vector_iterator operator-( difference_type offset ) const { return vector_iterator(*my_vector, my_index - offset); } vector_iterator& operator-=( difference_type offset ) { my_index -= offset; my_item = nullptr; return *this; } reference operator*() const { value_type *item = my_item; if (item == nullptr) { item = &my_vector->internal_subscript(my_index); } else { __TBB_ASSERT(item == &my_vector->internal_subscript(my_index), "corrupt cache"); } return *item; } pointer operator->() const { return &(operator*()); } reference operator[]( difference_type k ) const { return my_vector->internal_subscript(my_index + k); } vector_iterator& operator++() { ++my_index; if (my_item != nullptr) { if (vector_type::is_first_element_in_segment(my_index)) { // If the iterator crosses a segment boundary, the pointer become invalid // as possibly next segment is in another memory location my_item = nullptr; } else { ++my_item; } } return *this; } vector_iterator operator++(int) { vector_iterator result = *this; ++(*this); return result; } vector_iterator& operator--() { __TBB_ASSERT(my_index > 0, "operator--() applied to iterator already at beginning of concurrent_vector"); /* Mirror of operator++: the boundary test must use the index being left. Stepping back from the first element of a segment lands in the preceding segment, so the cached pointer cannot simply be decremented. */ const size_type departed_index = my_index--; if (my_item != nullptr) { if (vector_type::is_first_element_in_segment(departed_index)) { /* The iterator crosses a segment boundary, so the cached pointer becomes invalid: the preceding segment is possibly in another memory location. */ my_item = nullptr; } else { --my_item; } } return *this; } vector_iterator operator--(int) { vector_iterator result = *this; 
--(*this); return result; } private: // concurrent_vector over which we are iterating. vector_type* my_vector; // Index into the vector size_type my_index; // Caches my_vector *it; // If my_item == nullptr cached value is not available use internal_subscript(my_index) mutable value_type* my_item; }; // class vector_iterator template vector_iterator operator+( typename vector_iterator::difference_type offset, const vector_iterator& v ) { return vector_iterator(*v.my_vector, v.my_index + offset); } template typename vector_iterator::difference_type operator-( const vector_iterator& i, const vector_iterator& j ) { using difference_type = typename vector_iterator::difference_type; return static_cast(i.my_index) - static_cast(j.my_index); } template bool operator==( const vector_iterator& i, const vector_iterator& j ) { return i.my_vector == j.my_vector && i.my_index == j.my_index; } template bool operator!=( const vector_iterator& i, const vector_iterator& j ) { return !(i == j); } template bool operator<( const vector_iterator& i, const vector_iterator& j ) { return i.my_index < j.my_index; } template bool operator>( const vector_iterator& i, const vector_iterator& j ) { return j < i; } template bool operator>=( const vector_iterator& i, const vector_iterator& j ) { return !(i < j); } template bool operator<=( const vector_iterator& i, const vector_iterator& j ) { return !(j < i); } static constexpr std::size_t embedded_table_num_segments = 3; template > class concurrent_vector : private segment_table, embedded_table_num_segments> { using self_type = concurrent_vector; using base_type = segment_table; friend class segment_table; template class generic_range_type : public tbb::blocked_range { using base_type = tbb::blocked_range; public: using value_type = T; using reference = T&; using const_reference = const T&; using iterator = Iterator; using difference_type = std::ptrdiff_t; using base_type::base_type; template generic_range_type( const generic_range_type& r) : 
blocked_range(r.begin(), r.end(), r.grainsize()) {} generic_range_type( generic_range_type& r, split ) : blocked_range(r, split()) {} }; // class generic_range_type static_assert(std::is_same::value, "value_type of the container must be the same as its allocator's"); using allocator_traits_type = tbb::detail::allocator_traits; // Segment table for concurrent_vector can be extended static constexpr bool allow_table_extending = true; static constexpr bool is_noexcept_assignment = allocator_traits_type::propagate_on_container_move_assignment::value || allocator_traits_type::is_always_equal::value; static constexpr bool is_noexcept_swap = allocator_traits_type::propagate_on_container_swap::value || allocator_traits_type::is_always_equal::value; public: using value_type = T; using allocator_type = Allocator; using size_type = std::size_t; using difference_type = std::ptrdiff_t; using reference = value_type&; using const_reference = const value_type&; using pointer = typename allocator_traits_type::pointer; using const_pointer = typename allocator_traits_type::const_pointer; using iterator = vector_iterator; using const_iterator = vector_iterator; using reverse_iterator = std::reverse_iterator; using const_reverse_iterator = std::reverse_iterator; using range_type = generic_range_type; using const_range_type = generic_range_type; concurrent_vector() : concurrent_vector(allocator_type()) {} explicit concurrent_vector( const allocator_type& alloc ) noexcept : base_type(alloc) {} explicit concurrent_vector( size_type count, const value_type& value, const allocator_type& alloc = allocator_type() ) : concurrent_vector(alloc) { try_call( [&] { grow_by(count, value); } ).on_exception( [&] { base_type::clear(); }); } explicit concurrent_vector( size_type count, const allocator_type& alloc = allocator_type() ) : concurrent_vector(alloc) { try_call( [&] { grow_by(count); } ).on_exception( [&] { base_type::clear(); }); } template concurrent_vector( InputIterator first, 
InputIterator last, const allocator_type& alloc = allocator_type() ) : concurrent_vector(alloc) { try_call( [&] { grow_by(first, last); } ).on_exception( [&] { base_type::clear(); }); } concurrent_vector( const concurrent_vector& other ) : base_type(segment_table_allocator_traits::select_on_container_copy_construction(other.get_allocator())) { try_call( [&] { grow_by(other.begin(), other.end()); } ).on_exception( [&] { base_type::clear(); }); } concurrent_vector( const concurrent_vector& other, const allocator_type& alloc ) : base_type(other, alloc) {} concurrent_vector(concurrent_vector&& other) noexcept : base_type(std::move(other)) {} concurrent_vector( concurrent_vector&& other, const allocator_type& alloc ) : base_type(std::move(other), alloc) {} concurrent_vector( std::initializer_list init, const allocator_type& alloc = allocator_type() ) : concurrent_vector(init.begin(), init.end(), alloc) {} ~concurrent_vector() {} // Assignment concurrent_vector& operator=( const concurrent_vector& other ) { base_type::operator=(other); return *this; } concurrent_vector& operator=( concurrent_vector&& other ) noexcept(is_noexcept_assignment) { base_type::operator=(std::move(other)); return *this; } concurrent_vector& operator=( std::initializer_list init ) { assign(init); return *this; } void assign( size_type count, const value_type& value ) { destroy_elements(); grow_by(count, value); } template typename std::enable_if::value, void>::type assign( InputIterator first, InputIterator last ) { destroy_elements(); grow_by(first, last); } void assign( std::initializer_list init ) { destroy_elements(); assign(init.begin(), init.end()); } // Concurrent growth iterator grow_by( size_type delta ) { return internal_grow_by_delta(delta); } iterator grow_by( size_type delta, const value_type& value ) { return internal_grow_by_delta(delta, value); } template typename std::enable_if::value, iterator>::type grow_by( ForwardIterator first, ForwardIterator last ) { auto delta = 
std::distance(first, last); return internal_grow_by_delta(delta, first, last); } iterator grow_by( std::initializer_list init ) { return grow_by(init.begin(), init.end()); } iterator grow_to_at_least( size_type n ) { return internal_grow_to_at_least(n); } iterator grow_to_at_least( size_type n, const value_type& value ) { return internal_grow_to_at_least(n, value); } iterator push_back( const value_type& item ) { return internal_emplace_back(item); } iterator push_back( value_type&& item ) { return internal_emplace_back(std::move(item)); } template iterator emplace_back( Args&&... args ) { return internal_emplace_back(std::forward(args)...); } // Items access reference operator[]( size_type index ) { return internal_subscript(index); } const_reference operator[]( size_type index ) const { return internal_subscript(index); } reference at( size_type index ) { return internal_subscript_with_exceptions(index); } const_reference at( size_type index ) const { return internal_subscript_with_exceptions(index); } // Get range for iterating with parallel algorithms range_type range( size_t grainsize = 1 ) { return range_type(begin(), end(), grainsize); } // Get const range for iterating with parallel algorithms const_range_type range( size_t grainsize = 1 ) const { return const_range_type(begin(), end(), grainsize); } reference front() { return internal_subscript(0); } const_reference front() const { return internal_subscript(0); } reference back() { return internal_subscript(size() - 1); } const_reference back() const { return internal_subscript(size() - 1); } // Iterators iterator begin() { return iterator(*this, 0); } const_iterator begin() const { return const_iterator(*this, 0); } const_iterator cbegin() const { return const_iterator(*this, 0); } iterator end() { return iterator(*this, size()); } const_iterator end() const { return const_iterator(*this, size()); } const_iterator cend() const { return const_iterator(*this, size()); } reverse_iterator rbegin() { return 
reverse_iterator(end()); } const_reverse_iterator rbegin() const { return const_reverse_iterator(end()); } const_reverse_iterator crbegin() const { return const_reverse_iterator(cend()); } reverse_iterator rend() { return reverse_iterator(begin()); } const_reverse_iterator rend() const { return const_reverse_iterator(begin()); } const_reverse_iterator crend() const { return const_reverse_iterator(cbegin()); } allocator_type get_allocator() const { return base_type::get_allocator(); } // Storage bool empty() const noexcept { return 0 == size(); } size_type size() const noexcept { return std::min(this->my_size.load(std::memory_order_acquire), capacity()); } size_type max_size() const noexcept { return allocator_traits_type::max_size(base_type::get_allocator()); } size_type capacity() const noexcept { return base_type::capacity(); } void reserve( size_type n ) { if (n == 0) return; if (n > max_size()) { tbb::detail::throw_exception(exception_id::reservation_length_error); } this->assign_first_block_if_necessary(this->segment_index_of(n - 1) + 1); base_type::reserve(n); } void resize( size_type n ) { internal_resize(n); } void resize( size_type n, const value_type& val ) { internal_resize(n, val); } void shrink_to_fit() { internal_compact(); } void swap(concurrent_vector& other) noexcept(is_noexcept_swap) { base_type::swap(other); } void clear() { destroy_elements(); } private: using segment_type = typename base_type::segment_type; using segment_table_type = typename base_type::segment_table_type; using segment_table_allocator_traits = typename base_type::segment_table_allocator_traits; using segment_index_type = typename base_type::segment_index_type; using segment_element_type = typename base_type::value_type; using segment_element_allocator_type = typename allocator_traits_type::template rebind_alloc; using segment_element_allocator_traits = tbb::detail::allocator_traits; segment_table_type allocate_long_table( const typename base_type::atomic_segment* 
embedded_table, size_type start_index ) { __TBB_ASSERT(start_index <= this->embedded_table_size, "Start index out of embedded table"); // If other threads are trying to set pointers in the short segment, wait for them to finish their // assignments before we copy the short segment to the long segment. Note: grow_to_at_least depends on it for (segment_index_type i = 0; this->segment_base(i) < start_index; ++i) { spin_wait_while_eq(embedded_table[i], segment_type(nullptr)); } // It is possible that the table was extend by a thread allocating first_block, need to check this. if (this->get_table() != embedded_table) { return nullptr; } // Allocate long segment table and fill with null pointers segment_table_type new_segment_table = segment_table_allocator_traits::allocate(base_type::get_allocator(), this->pointers_per_long_table); // Copy segment pointers from the embedded table for (size_type segment_index = 0; segment_index < this->pointers_per_embedded_table; ++segment_index) { segment_table_allocator_traits::construct(base_type::get_allocator(), &new_segment_table[segment_index], embedded_table[segment_index].load(std::memory_order_relaxed)); } for (size_type segment_index = this->pointers_per_embedded_table; segment_index < this->pointers_per_long_table; ++segment_index) { segment_table_allocator_traits::construct(base_type::get_allocator(), &new_segment_table[segment_index], nullptr); } return new_segment_table; } // create_segment function is required by the segment_table base class segment_type create_segment( segment_table_type table, segment_index_type seg_index, size_type index ) { size_type first_block = this->my_first_block.load(std::memory_order_relaxed); // First block allocation if (seg_index < first_block) { // If 0 segment is already allocated, then it remains to wait until the segments are filled to requested if (table[0].load(std::memory_order_acquire) != nullptr) { spin_wait_while_eq(table[seg_index], segment_type(nullptr)); return nullptr; } 
segment_element_allocator_type segment_allocator(base_type::get_allocator()); segment_type new_segment = nullptr; size_type first_block_size = this->segment_size(first_block); try_call( [&] { new_segment = segment_element_allocator_traits::allocate(segment_allocator, first_block_size); } ).on_exception( [&] { segment_type disabled_segment = nullptr; if (table[0].compare_exchange_strong(disabled_segment, this->segment_allocation_failure_tag)) { size_type end_segment = table == this->my_embedded_table ? this->pointers_per_embedded_table : first_block; for (size_type i = 1; i < end_segment; ++i) { table[i].store(this->segment_allocation_failure_tag, std::memory_order_release); } } }); segment_type disabled_segment = nullptr; if (table[0].compare_exchange_strong(disabled_segment, new_segment)) { this->extend_table_if_necessary(table, /*start_index*/0, /*end_index*/first_block_size); for (size_type i = 1; i < first_block; ++i) { table[i].store(new_segment, std::memory_order_release); } // Other threads can wait on a snapshot of an embedded table, need to fill it. 
for (size_type i = 1; i < first_block && i < this->pointers_per_embedded_table; ++i) { this->my_embedded_table[i].store(new_segment, std::memory_order_release); } } else if (new_segment != this->segment_allocation_failure_tag) { // Deallocate the memory segment_element_allocator_traits::deallocate(segment_allocator, new_segment, first_block_size); // 0 segment is already allocated, then it remains to wait until the segments are filled to requested spin_wait_while_eq(table[seg_index], segment_type(nullptr)); } } else { size_type offset = this->segment_base(seg_index); if (index == offset) { __TBB_ASSERT(table[seg_index].load(std::memory_order_relaxed) == nullptr, "Only this thread can enable this segment"); segment_element_allocator_type segment_allocator(base_type::get_allocator()); segment_type new_segment = this->segment_allocation_failure_tag; try_call( [&] { new_segment = segment_element_allocator_traits::allocate(segment_allocator,this->segment_size(seg_index)); // Shift base address to simplify access by index new_segment -= this->segment_base(seg_index); } ).on_completion( [&] { table[seg_index].store(new_segment, std::memory_order_release); }); } else { spin_wait_while_eq(table[seg_index], segment_type(nullptr)); } } return nullptr; } // Returns the number of elements in the segment to be destroy size_type number_of_elements_in_segment( segment_index_type seg_index ) { size_type curr_vector_size = this->my_size.load(std::memory_order_relaxed); size_type curr_segment_base = this->segment_base(seg_index); if (seg_index == 0) { return std::min(curr_vector_size, this->segment_size(seg_index)); } else { // Perhaps the segment is allocated, but there are no elements in it. if (curr_vector_size < curr_segment_base) { return 0; } return curr_segment_base * 2 > curr_vector_size ? 
curr_vector_size - curr_segment_base : curr_segment_base; } } segment_type nullify_segment( segment_table_type table, size_type segment_index ) { segment_type target_segment = table[segment_index].load(std::memory_order_relaxed); if (segment_index >= this->my_first_block) { table[segment_index].store(nullptr, std::memory_order_relaxed); } else { if (segment_index == 0) { for (size_type i = 0; i < this->my_first_block; ++i) { table[i].store(nullptr, std::memory_order_relaxed); } } } return target_segment; } void deallocate_segment( segment_type address, segment_index_type seg_index ) { segment_element_allocator_type segment_allocator(base_type::get_allocator()); size_type first_block = this->my_first_block.load(std::memory_order_relaxed); if (seg_index >= first_block) { segment_element_allocator_traits::deallocate(segment_allocator, address, this->segment_size(seg_index)); } else if (seg_index == 0) { size_type elements_to_deallocate = first_block > 0 ? this->segment_size(first_block) : this->segment_size(0); segment_element_allocator_traits::deallocate(segment_allocator, address, elements_to_deallocate); } } // destroy_segment function is required by the segment_table base class void destroy_segment( segment_type address, segment_index_type seg_index ) { size_type elements_to_destroy = number_of_elements_in_segment(seg_index); segment_element_allocator_type segment_allocator(base_type::get_allocator()); for (size_type i = 0; i < elements_to_destroy; ++i) { segment_element_allocator_traits::destroy(segment_allocator, address + i); } deallocate_segment(address, seg_index); } // copy_segment function is required by the segment_table base class void copy_segment( segment_index_type seg_index, segment_type from, segment_type to ) { size_type i = 0; try_call( [&] { for (; i != number_of_elements_in_segment(seg_index); ++i) { segment_table_allocator_traits::construct(base_type::get_allocator(), to + i, from[i]); } } ).on_exception( [&] { // Zero-initialize items left not 
constructed after the exception zero_unconstructed_elements(this->get_segment(seg_index) + i, this->segment_size(seg_index) - i); segment_index_type last_segment = this->segment_index_of(this->my_size.load(std::memory_order_relaxed)); auto table = this->get_table(); for (segment_index_type j = seg_index + 1; j != last_segment; ++j) { auto curr_segment = table[j].load(std::memory_order_relaxed); if (curr_segment) { zero_unconstructed_elements(curr_segment + this->segment_base(j), this->segment_size(j)); } } this->my_size.store(this->segment_size(seg_index) + i, std::memory_order_relaxed); }); } // move_segment function is required by the segment_table base class void move_segment( segment_index_type seg_index, segment_type from, segment_type to ) { size_type i = 0; try_call( [&] { for (; i != number_of_elements_in_segment(seg_index); ++i) { segment_table_allocator_traits::construct(base_type::get_allocator(), to + i, std::move(from[i])); } } ).on_exception( [&] { // Zero-initialize items left not constructed after the exception zero_unconstructed_elements(this->get_segment(seg_index) + i, this->segment_size(seg_index) - i); segment_index_type last_segment = this->segment_index_of(this->my_size.load(std::memory_order_relaxed)); auto table = this->get_table(); for (segment_index_type j = seg_index + 1; j != last_segment; ++j) { auto curr_segment = table[j].load(std::memory_order_relaxed); if (curr_segment) { zero_unconstructed_elements(curr_segment + this->segment_base(j), this->segment_size(j)); } } this->my_size.store(this->segment_size(seg_index) + i, std::memory_order_relaxed); }); } static constexpr bool is_first_element_in_segment( size_type index ) { // An element is the first in a segment if its index is equal to a power of two return is_power_of_two_at_least(index, 2); } const_reference internal_subscript( size_type index ) const { return const_cast(this)->internal_subscript(index); } reference internal_subscript( size_type index ) { __TBB_ASSERT(index < 
this->my_size.load(std::memory_order_relaxed), "Invalid subscript index"); return base_type::template internal_subscript(index); } const_reference internal_subscript_with_exceptions( size_type index ) const { return const_cast(this)->internal_subscript_with_exceptions(index); } reference internal_subscript_with_exceptions( size_type index ) { if (index >= this->my_size.load(std::memory_order_acquire)) { tbb::detail::throw_exception(exception_id::out_of_range); } segment_table_type table = this->my_segment_table.load(std::memory_order_acquire); size_type seg_index = this->segment_index_of(index); if (base_type::number_of_segments(table) < seg_index) { tbb::detail::throw_exception(exception_id::out_of_range); } if (table[seg_index] <= this->segment_allocation_failure_tag) { tbb::detail::throw_exception(exception_id::out_of_range); } return base_type::template internal_subscript(index); } static void zero_unconstructed_elements( pointer start, size_type count ) { std::memset(static_cast(start), 0, count * sizeof(value_type)); } template iterator internal_emplace_back( Args&&... args ) { size_type old_size = this->my_size++; this->assign_first_block_if_necessary(default_first_block_size); auto element_address = &base_type::template internal_subscript(old_size); // try_call API is not convenient here due to broken // variadic capture on GCC 4.8.5 auto value_guard = make_raii_guard([&] { zero_unconstructed_elements(element_address, /*count =*/1); }); segment_table_allocator_traits::construct(base_type::get_allocator(), element_address, std::forward(args)...); value_guard.dismiss(); return iterator(*this, old_size, element_address); } template void internal_loop_construct( segment_table_type table, size_type start_idx, size_type end_idx, const Args&... 
args ) { static_assert(sizeof...(Args) < 2, "Too many parameters"); for (size_type idx = start_idx; idx < end_idx; ++idx) { auto element_address = &base_type::template internal_subscript(idx); // try_call API is not convenient here due to broken // variadic capture on GCC 4.8.5 auto value_guard = make_raii_guard( [&] { segment_index_type last_allocated_segment = this->find_last_allocated_segment(table); size_type segment_size = this->segment_size(last_allocated_segment); end_idx = end_idx < segment_size ? end_idx : segment_size; for (size_type i = idx; i < end_idx; ++i) { zero_unconstructed_elements(&this->internal_subscript(i), /*count =*/1); } }); segment_table_allocator_traits::construct(base_type::get_allocator(), element_address, args...); value_guard.dismiss(); } } template void internal_loop_construct( segment_table_type table, size_type start_idx, size_type end_idx, ForwardIterator first, ForwardIterator ) { for (size_type idx = start_idx; idx < end_idx; ++idx) { auto element_address = &base_type::template internal_subscript(idx); try_call( [&] { segment_table_allocator_traits::construct(base_type::get_allocator(), element_address, *first++); } ).on_exception( [&] { segment_index_type last_allocated_segment = this->find_last_allocated_segment(table); size_type segment_size = this->segment_size(last_allocated_segment); end_idx = end_idx < segment_size ? end_idx : segment_size; for (size_type i = idx; i < end_idx; ++i) { zero_unconstructed_elements(&this->internal_subscript(i), /*count =*/1); } }); } } template iterator internal_grow( size_type start_idx, size_type end_idx, const Args&... 
args ) { size_type seg_index = this->segment_index_of(end_idx - 1); this->assign_first_block_if_necessary(seg_index + 1); segment_table_type table = this->get_table(); this->extend_table_if_necessary(table, start_idx, end_idx); if (seg_index > this->my_first_block.load(std::memory_order_relaxed)) { // So that other threads be able to work with the last segment of grow_by, allocate it immediately. // If the last segment is not less than the first block if (table[seg_index].load(std::memory_order_relaxed) == nullptr) { size_type first_element = this->segment_base(seg_index); if (first_element >= start_idx && first_element < end_idx) { segment_type segment = table[seg_index].load(std::memory_order_relaxed); base_type::enable_segment(segment, table, seg_index, first_element); } } } internal_loop_construct(table, start_idx, end_idx, args...); return iterator(*this, start_idx, &base_type::template internal_subscript(start_idx)); } template iterator internal_grow_by_delta( size_type delta, const Args&... args ) { if (delta == size_type(0)) { return end(); } size_type start_idx = this->my_size.fetch_add(delta); size_type end_idx = start_idx + delta; return internal_grow(start_idx, end_idx, args...); } template iterator internal_grow_to_at_least( size_type new_size, const Args&... 
args ) { size_type old_size = this->my_size.load(std::memory_order_relaxed); if (new_size == size_type(0)) return iterator(*this, 0); while (old_size < new_size && !this->my_size.compare_exchange_weak(old_size, new_size)) {} int delta = static_cast(new_size) - static_cast(old_size); if (delta > 0) { return internal_grow(old_size, new_size, args...); } size_type end_segment = this->segment_index_of(new_size - 1); // Check/wait for segments allocation completes if (end_segment >= this->pointers_per_embedded_table && this->get_table() == this->my_embedded_table) { spin_wait_while_eq(this->my_segment_table, this->my_embedded_table); } for (segment_index_type seg_idx = 0; seg_idx <= end_segment; ++seg_idx) { if (this->get_table()[seg_idx].load(std::memory_order_relaxed) == nullptr) { atomic_backoff backoff(true); while (this->get_table()[seg_idx].load(std::memory_order_relaxed) == nullptr) { backoff.pause(); } } } #if TBB_USE_DEBUG size_type cap = capacity(); __TBB_ASSERT( cap >= new_size, nullptr); #endif return iterator(*this, size()); } template void internal_resize( size_type n, const Args&... 
args ) { if (n == 0) { clear(); return; } size_type old_size = this->my_size.load(std::memory_order_acquire); if (n > old_size) { reserve(n); grow_to_at_least(n, args...); } else { if (old_size == n) { return; } size_type last_segment = this->segment_index_of(old_size - 1); // Delete segments for (size_type seg_idx = this->segment_index_of(n - 1) + 1; seg_idx <= last_segment; ++seg_idx) { this->delete_segment(seg_idx); } // If n > segment_size(n) => we need to destroy all of the items in the first segment // Otherwise, we need to destroy only items with the index < n size_type n_segment = this->segment_index_of(n - 1); size_type last_index_to_destroy = std::min(this->segment_base(n_segment) + this->segment_size(n_segment), old_size); // Destroy elements in curr segment for (size_type idx = n; idx < last_index_to_destroy; ++idx) { segment_table_allocator_traits::destroy(base_type::get_allocator(), &base_type::template internal_subscript(idx)); } this->my_size.store(n, std::memory_order_release); } } void destroy_elements() { allocator_type alloc(base_type::get_allocator()); for (size_type i = 0; i < this->my_size.load(std::memory_order_relaxed); ++i) { allocator_traits_type::destroy(alloc, &base_type::template internal_subscript(i)); } this->my_size.store(0, std::memory_order_relaxed); } static bool incompact_predicate( size_type size ) { // memory page size const size_type page_size = 4096; return size < page_size || ((size - 1) % page_size < page_size / 2 && size < page_size * 128); } void internal_compact() { const size_type curr_size = this->my_size.load(std::memory_order_relaxed); segment_table_type table = this->get_table(); const segment_index_type k_end = this->find_last_allocated_segment(table); // allocated segments const segment_index_type k_stop = curr_size ? this->segment_index_of(curr_size - 1) + 1 : 0; // number of segments to store existing items: 0=>0; 1,2=>1; 3,4=>2; [5-8]=>3;.. 
const segment_index_type first_block = this->my_first_block; // number of merged segments, getting values from atomics segment_index_type k = first_block; if (k_stop < first_block) { k = k_stop; } else { while (k < k_stop && incompact_predicate(this->segment_size(k) * sizeof(value_type))) k++; } if (k_stop == k_end && k == first_block) { return; } // First segment optimization if (k != first_block && k) { size_type max_block = std::max(first_block, k); auto buffer_table = segment_table_allocator_traits::allocate(base_type::get_allocator(), max_block); for (size_type seg_idx = 0; seg_idx < max_block; ++seg_idx) { segment_table_allocator_traits::construct(base_type::get_allocator(), &buffer_table[seg_idx], table[seg_idx].load(std::memory_order_relaxed)); table[seg_idx].store(nullptr, std::memory_order_relaxed); } this->my_first_block.store(k, std::memory_order_relaxed); size_type index = 0; try_call( [&] { for (; index < std::min(this->segment_size(max_block), curr_size); ++index) { auto element_address = &static_cast(this)->operator[](index); segment_index_type seg_idx = this->segment_index_of(index); segment_table_allocator_traits::construct(base_type::get_allocator(), element_address, std::move_if_noexcept(buffer_table[seg_idx].load(std::memory_order_relaxed)[index])); } } ).on_exception( [&] { segment_element_allocator_type allocator(base_type::get_allocator()); for (size_type i = 0; i < index; ++i) { auto element_adress = &this->operator[](i); segment_element_allocator_traits::destroy(allocator, element_adress); } segment_element_allocator_traits::deallocate(allocator, table[0].load(std::memory_order_relaxed), this->segment_size(max_block)); for (size_type seg_idx = 0; seg_idx < max_block; ++seg_idx) { table[seg_idx].store(buffer_table[seg_idx].load(std::memory_order_relaxed), std::memory_order_relaxed); buffer_table[seg_idx].store(nullptr, std::memory_order_relaxed); } segment_table_allocator_traits::deallocate(base_type::get_allocator(), buffer_table, 
max_block); this->my_first_block.store(first_block, std::memory_order_relaxed); }); // Need to correct deallocate old segments // Method destroy_segment respect active first_block, therefore, // in order for the segment deletion to work correctly, set the first_block size that was earlier, // destroy the unnecessary segments. this->my_first_block.store(first_block, std::memory_order_relaxed); for (size_type seg_idx = max_block; seg_idx > 0 ; --seg_idx) { auto curr_segment = buffer_table[seg_idx - 1].load(std::memory_order_relaxed); if (curr_segment != nullptr) { destroy_segment(buffer_table[seg_idx - 1].load(std::memory_order_relaxed) + this->segment_base(seg_idx - 1), seg_idx - 1); } } this->my_first_block.store(k, std::memory_order_relaxed); for (size_type seg_idx = 0; seg_idx < max_block; ++seg_idx) { segment_table_allocator_traits::destroy(base_type::get_allocator(), &buffer_table[seg_idx]); } segment_table_allocator_traits::deallocate(base_type::get_allocator(), buffer_table, max_block); } // free unnecessary segments allocated by reserve() call if (k_stop < k_end) { for (size_type seg_idx = k_end; seg_idx != k_stop; --seg_idx) { if (table[seg_idx - 1].load(std::memory_order_relaxed) != nullptr) { this->delete_segment(seg_idx - 1); } } if (!k) this->my_first_block.store(0, std::memory_order_relaxed); } } // Lever for adjusting the size of first_block at the very first insertion. 
// TODO: consider >1 value, check performance static constexpr size_type default_first_block_size = 1; template friend class vector_iterator; }; // class concurrent_vector #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT // Deduction guide for the constructor from two iterators template >, typename = std::enable_if_t>, typename = std::enable_if_t>> concurrent_vector( It, It, Alloc = Alloc() ) -> concurrent_vector, Alloc>; #endif template void swap(concurrent_vector &lhs, concurrent_vector &rhs) { lhs.swap(rhs); } template bool operator==(const concurrent_vector &lhs, const concurrent_vector &rhs) { return lhs.size() == rhs.size() && std::equal(lhs.begin(), lhs.end(), rhs.begin()); } #if !__TBB_CPP20_COMPARISONS_PRESENT template bool operator!=(const concurrent_vector &lhs, const concurrent_vector &rhs) { return !(lhs == rhs); } #endif // !__TBB_CPP20_COMPARISONS_PRESENT #if __TBB_CPP20_COMPARISONS_PRESENT && __TBB_CPP20_CONCEPTS_PRESENT template tbb::detail::synthesized_three_way_result::value_type> operator<=>(const concurrent_vector &lhs, const concurrent_vector &rhs) { return std::lexicographical_compare_three_way(lhs.begin(), lhs.end(), rhs.begin(), rhs.end(), tbb::detail::synthesized_three_way_comparator{}); } #else template bool operator<(const concurrent_vector &lhs, const concurrent_vector &rhs) { return std::lexicographical_compare(lhs.begin(), lhs.end(), rhs.begin(), rhs.end()); } template bool operator<=(const concurrent_vector &lhs, const concurrent_vector &rhs) { return !(rhs < lhs); } template bool operator>(const concurrent_vector &lhs, const concurrent_vector &rhs) { return rhs < lhs; } template bool operator>=(const concurrent_vector &lhs, const concurrent_vector &rhs) { return !(lhs < rhs); } #endif // __TBB_CPP20_COMPARISONS_PRESENT && __TBB_CPP20_CONCEPTS_PRESENT } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::concurrent_vector; } // namespace v1 } // namespace tbb #endif // __TBB_concurrent_vector_H 
================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_aggregator.h ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_detail__aggregator_H #define __TBB_detail__aggregator_H #include "_assert.h" #include "_utils.h" #include #if !__TBBMALLOC_BUILD // TODO: check this macro with TBB Malloc #include "../profiling.h" #endif namespace tbb { namespace detail { namespace d1 { // Base class for aggregated operation template class aggregated_operation { public: // Zero value means "wait" status, all other values are "user" specified values and // are defined into the scope of a class which uses "status" std::atomic status; std::atomic next; aggregated_operation() : status{}, next(nullptr) {} }; // class aggregated_operation // Aggregator base class /* An aggregator for collecting operations coming from multiple sources and executing them serially on a single thread. OperationType must be derived from aggregated_operation. The parameter HandlerType is a functor that will be passed the list of operations and is expected to handle each operation appropriately, setting the status of each operation to non-zero. 
*/ template class aggregator_generic { public: aggregator_generic() : pending_operations(nullptr), handler_busy(false) {} // Execute an operation /* Places an operation into the waitlist (pending_operations), and either handles the list, or waits for the operation to complete, or returns. The long_life_time parameter specifies the life time of the given operation object. Operations with long_life_time == true may be accessed after execution. A "short" life time operation (long_life_time == false) can be destroyed during execution, and so any access to it after it was put into the waitlist, including status check, is invalid. As a consequence, waiting for completion of such operation causes undefined behavior. */ template void execute( OperationType* op, HandlerType& handle_operations, bool long_life_time = true ) { // op->status should be read before inserting the operation into the // aggregator waitlist since it can become invalid after executing a // handler (if the operation has 'short' life time.) const uintptr_t status = op->status.load(std::memory_order_relaxed); // ITT note: &(op->status) tag is used to cover accesses to this op node. This // thread has created the operation, and now releases it so that the handler // thread may handle the associated operation w/o triggering a race condition; // thus this tag will be acquired just before the operation is handled in the // handle_operations functor. call_itt_notify(releasing, &(op->status)); // insert the operation in the queue. OperationType* res = pending_operations.load(std::memory_order_relaxed); do { op->next.store(res, std::memory_order_relaxed); } while (!pending_operations.compare_exchange_strong(res, op)); if (!res) { // first in the list; handle the operations // ITT note: &pending_operations tag covers access to the handler_busy flag, // which this waiting handler thread will try to set before entering // handle_operations. 
call_itt_notify(acquired, &pending_operations); start_handle_operations(handle_operations); // The operation with 'short' life time can already be destroyed if (long_life_time) __TBB_ASSERT(op->status.load(std::memory_order_relaxed), nullptr); } // Not first; wait for op to be ready else if (!status) { // operation is blocking here. __TBB_ASSERT(long_life_time, "Waiting for an operation object that might be destroyed during processing"); call_itt_notify(prepare, &(op->status)); spin_wait_while_eq(op->status, uintptr_t(0)); } } private: // Trigger the handling of operations when the handler is free template void start_handle_operations( HandlerType& handle_operations ) { OperationType* op_list; // ITT note: &handler_busy tag covers access to pending_operations as it is passed // between active and waiting handlers. Below, the waiting handler waits until // the active handler releases, and the waiting handler acquires &handler_busy as // it becomes the active_handler. The release point is at the end of this // function, when all operations in pending_operations have been handled by the // owner of this aggregator. call_itt_notify(prepare, &handler_busy); // get the handler_busy: // only one thread can possibly spin here at a time spin_wait_until_eq(handler_busy, uintptr_t(0)); call_itt_notify(acquired, &handler_busy); // acquire fence not necessary here due to causality rule and surrounding atomics handler_busy.store(1, std::memory_order_relaxed); // ITT note: &pending_operations tag covers access to the handler_busy flag // itself. Capturing the state of the pending_operations signifies that // handler_busy has been set and a new active handler will now process that list's // operations. 
call_itt_notify(releasing, &pending_operations); // grab pending_operations op_list = pending_operations.exchange(nullptr); // handle all the operations handle_operations(op_list); // release the handler handler_busy.store(0, std::memory_order_release); } // An atomically updated list (aka mailbox) of pending operations std::atomic pending_operations; // Controls threads access to handle_operations std::atomic handler_busy; }; // class aggregator_generic template class aggregator : public aggregator_generic { HandlerType handle_operations; public: aggregator() = default; void initialize_handler( HandlerType h ) { handle_operations = h; } void execute(OperationType* op) { aggregator_generic::execute(op, handle_operations); } }; // class aggregator // the most-compatible friend declaration (vs, gcc, icc) is // template friend class aggregating_functor; template class aggregating_functor { AggregatingClass* my_object{nullptr}; public: aggregating_functor() = default; aggregating_functor( AggregatingClass* object ) : my_object(object) { __TBB_ASSERT(my_object, nullptr); } void operator()( OperationList* op_list ) { __TBB_ASSERT(my_object, nullptr); my_object->handle_operations(op_list); } }; // class aggregating_functor } // namespace d1 } // namespace detail } // namespace tbb #endif // __TBB_detail__aggregator_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_aligned_space.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_aligned_space_H #define __TBB_aligned_space_H #include #include "_template_helpers.h" namespace tbb { namespace detail { inline namespace d0 { //! Block of space aligned sufficiently to construct an array T with N elements. /** The elements are not constructed or destroyed by this class. @ingroup memory_allocation */ template class aligned_space { alignas(alignof(T)) std::uint8_t aligned_array[N * sizeof(T)]; public: //! Pointer to beginning of array T* begin() const { return punned_cast(&aligned_array); } //! Pointer to one past last element in array. T* end() const { return begin() + N; } }; } // namespace d0 } // namespace detail } // namespace tbb #endif /* __TBB_aligned_space_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_allocator_traits.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_detail__allocator_traits_H #define __TBB_detail__allocator_traits_H #include "_config.h" #include "_template_helpers.h" #include #include namespace tbb { namespace detail { inline namespace d0 { #if !__TBB_CPP17_ALLOCATOR_IS_ALWAYS_EQUAL_PRESENT // Struct is_always_equal_detector provides the member type "type" which is // Allocator::is_always_equal if it is present, std::false_type otherwise template struct is_always_equal_detector { using type = std::false_type; }; template struct is_always_equal_detector> { using type = typename Allocator::is_always_equal; }; #endif // !__TBB_CPP17_ALLOCATOR_IS_ALWAYS_EQUAL_PRESENT template class allocator_traits : public std::allocator_traits { using base_type = std::allocator_traits; public: #if !__TBB_CPP17_ALLOCATOR_IS_ALWAYS_EQUAL_PRESENT using is_always_equal = typename is_always_equal_detector::type; #endif template using rebind_traits = typename tbb::detail::allocator_traits>; }; // struct allocator_traits template void copy_assign_allocators_impl( Allocator& lhs, const Allocator& rhs, /*pocca = */std::true_type ) { lhs = rhs; } template void copy_assign_allocators_impl( Allocator&, const Allocator&, /*pocca = */ std::false_type ) {} // Copy assigns allocators only if propagate_on_container_copy_assignment is true template void copy_assign_allocators( Allocator& lhs, const Allocator& rhs ) { using pocca_type = typename allocator_traits::propagate_on_container_copy_assignment; copy_assign_allocators_impl(lhs, rhs, pocca_type()); } template void move_assign_allocators_impl( Allocator& lhs, Allocator& rhs, /*pocma = */ std::true_type ) { lhs = std::move(rhs); } template void move_assign_allocators_impl( Allocator&, Allocator&, /*pocma = */ std::false_type ) {} // Move assigns allocators only if propagate_on_container_move_assignment is true template void move_assign_allocators( Allocator& lhs, Allocator& rhs ) { using pocma_type = typename allocator_traits::propagate_on_container_move_assignment; 
move_assign_allocators_impl(lhs, rhs, pocma_type()); } template void swap_allocators_impl( Allocator& lhs, Allocator& rhs, /*pocs = */ std::true_type ) { using std::swap; swap(lhs, rhs); } template void swap_allocators_impl( Allocator&, Allocator&, /*pocs = */ std::false_type ) {} // Swaps allocators only if propagate_on_container_swap is true template void swap_allocators( Allocator& lhs, Allocator& rhs ) { using pocs_type = typename allocator_traits::propagate_on_container_swap; swap_allocators_impl(lhs, rhs, pocs_type()); } } // inline namespace d0 } // namespace detail } // namespace tbb #endif // __TBB_detail__allocator_traits_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_assert.h ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_detail__assert_H #define __TBB_detail__assert_H #include "_config.h" #if __TBBMALLOC_BUILD namespace rml { namespace internal { #else namespace tbb { namespace detail { namespace r1 { #endif //! Process an assertion failure. /** Normally called from __TBB_ASSERT macro. If assertion handler is null, print message for assertion failure and abort. Otherwise call the assertion handler. 
*/ TBB_EXPORT void __TBB_EXPORTED_FUNC assertion_failure(const char* location, int line, const char* expression, const char* comment); #if __TBBMALLOC_BUILD }} // namespaces rml::internal #else } // namespace r1 } // namespace detail } // namespace tbb #endif #if __TBBMALLOC_BUILD //! Release version of assertions #define __TBB_ASSERT_RELEASE(predicate,message) ((predicate)?((void)0) : rml::internal::assertion_failure(__func__,__LINE__,#predicate,message)) #else #define __TBB_ASSERT_RELEASE(predicate,message) ((predicate)?((void)0) : tbb::detail::r1::assertion_failure(__func__,__LINE__,#predicate,message)) #endif #if TBB_USE_ASSERT //! Assert that predicate is true. /** If predicate is false, print assertion failure message. If the comment argument is not nullptr, it is printed as part of the failure message. The comment argument has no other effect. */ #define __TBB_ASSERT(predicate,message) __TBB_ASSERT_RELEASE(predicate,message) //! "Extended" version #define __TBB_ASSERT_EX __TBB_ASSERT #else //! No-op version of __TBB_ASSERT. #define __TBB_ASSERT(predicate,comment) ((void)0) //! "Extended" version is useful to suppress warnings if a variable is only used with an assert #define __TBB_ASSERT_EX(predicate,comment) ((void)(1 && (predicate))) #endif // TBB_USE_ASSERT #endif // __TBB_detail__assert_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_attach.h ================================================ /* Copyright (c) 2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_detail__attach_H #define __TBB_detail__attach_H #include "_config.h" namespace tbb { namespace detail { namespace d1 { struct attach {}; } // namespace d1 } // namespace detail } // namespace tbb #endif // __TBB_detail__attach_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_concurrent_queue_base.h ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_detail__concurrent_queue_base_H #define __TBB_detail__concurrent_queue_base_H #include "_utils.h" #include "_exception.h" #include "_machine.h" #include "_allocator_traits.h" #include "../profiling.h" #include "../spin_mutex.h" #include "../cache_aligned_allocator.h" #include namespace tbb { namespace detail { namespace d2 { using ticket_type = std::size_t; template inline bool is_valid_page(const Page p) { return reinterpret_cast(p) > 1; } template struct concurrent_queue_rep; template class micro_queue_pop_finalizer; #if _MSC_VER && !defined(__INTEL_COMPILER) // unary minus operator applied to unsigned type, result still unsigned #pragma warning( push ) #pragma warning( disable: 4146 ) #endif // A queue using simple locking. // For efficiency, this class has no constructor. // The caller is expected to zero-initialize it. 
template class micro_queue { private: using queue_rep_type = concurrent_queue_rep; using self_type = micro_queue; public: using size_type = std::size_t; using value_type = T; using reference = value_type&; using const_reference = const value_type&; using allocator_type = Allocator; using allocator_traits_type = tbb::detail::allocator_traits; using queue_allocator_type = typename allocator_traits_type::template rebind_alloc; static constexpr size_type item_size = sizeof(T); static constexpr size_type items_per_page = item_size <= 8 ? 32 : item_size <= 16 ? 16 : item_size <= 32 ? 8 : item_size <= 64 ? 4 : item_size <= 128 ? 2 : 1; struct padded_page { padded_page() {} ~padded_page() {} reference operator[] (std::size_t index) { __TBB_ASSERT(index < items_per_page, "Index out of range"); return items[index]; } const_reference operator[] (std::size_t index) const { __TBB_ASSERT(index < items_per_page, "Index out of range"); return items[index]; } padded_page* next{ nullptr }; std::atomic mask{}; union { value_type items[items_per_page]; }; }; // struct padded_page using page_allocator_type = typename allocator_traits_type::template rebind_alloc; protected: using page_allocator_traits = tbb::detail::allocator_traits; public: using item_constructor_type = void (*)(value_type* location, const void* src); micro_queue() = default; micro_queue( const micro_queue& ) = delete; micro_queue& operator=( const micro_queue& ) = delete; size_type prepare_page( ticket_type k, queue_rep_type& base, page_allocator_type page_allocator, padded_page*& p ) { __TBB_ASSERT(p == nullptr, "Invalid page argument for prepare_page"); k &= -queue_rep_type::n_queue; size_type index = modulo_power_of_two(k / queue_rep_type::n_queue, items_per_page); if (!index) { try_call( [&] { p = page_allocator_traits::allocate(page_allocator, 1); }).on_exception( [&] { ++base.n_invalid_entries; invalidate_page( k ); }); page_allocator_traits::construct(page_allocator, p); } spin_wait_until_my_turn(tail_counter, 
k, base); d1::call_itt_notify(d1::acquired, &tail_counter); if (p) { spin_mutex::scoped_lock lock( page_mutex ); padded_page* q = tail_page.load(std::memory_order_relaxed); if (is_valid_page(q)) { q->next = p; } else { head_page.store(p, std::memory_order_relaxed); } tail_page.store(p, std::memory_order_relaxed); } else { p = tail_page.load(std::memory_order_relaxed); } return index; } template void push( ticket_type k, queue_rep_type& base, queue_allocator_type& allocator, Args&&... args ) { padded_page* p = nullptr; page_allocator_type page_allocator(allocator); size_type index = prepare_page(k, base, page_allocator, p); __TBB_ASSERT(p != nullptr, "Page was not prepared"); // try_call API is not convenient here due to broken // variadic capture on GCC 4.8.5 auto value_guard = make_raii_guard([&] { ++base.n_invalid_entries; d1::call_itt_notify(d1::releasing, &tail_counter); tail_counter.fetch_add(queue_rep_type::n_queue); }); page_allocator_traits::construct(page_allocator, &(*p)[index], std::forward(args)...); // If no exception was thrown, mark item as present. 
p->mask.store(p->mask.load(std::memory_order_relaxed) | uintptr_t(1) << index, std::memory_order_relaxed); d1::call_itt_notify(d1::releasing, &tail_counter); value_guard.dismiss(); tail_counter.fetch_add(queue_rep_type::n_queue); } void abort_push( ticket_type k, queue_rep_type& base, queue_allocator_type& allocator ) { padded_page* p = nullptr; prepare_page(k, base, allocator, p); ++base.n_invalid_entries; tail_counter.fetch_add(queue_rep_type::n_queue); } bool pop( void* dst, ticket_type k, queue_rep_type& base, queue_allocator_type& allocator ) { k &= -queue_rep_type::n_queue; spin_wait_until_eq(head_counter, k); d1::call_itt_notify(d1::acquired, &head_counter); spin_wait_while_eq(tail_counter, k); d1::call_itt_notify(d1::acquired, &tail_counter); padded_page *p = head_page.load(std::memory_order_relaxed); __TBB_ASSERT( p, nullptr ); size_type index = modulo_power_of_two( k/queue_rep_type::n_queue, items_per_page ); bool success = false; { page_allocator_type page_allocator(allocator); micro_queue_pop_finalizer finalizer(*this, page_allocator, k + queue_rep_type::n_queue, index == items_per_page - 1 ? 
p : nullptr ); if (p->mask.load(std::memory_order_relaxed) & (std::uintptr_t(1) << index)) { success = true; assign_and_destroy_item(dst, *p, index); } else { --base.n_invalid_entries; } } return success; } micro_queue& assign( const micro_queue& src, queue_allocator_type& allocator, item_constructor_type construct_item ) { head_counter.store(src.head_counter.load(std::memory_order_relaxed), std::memory_order_relaxed); tail_counter.store(src.tail_counter.load(std::memory_order_relaxed), std::memory_order_relaxed); const padded_page* srcp = src.head_page.load(std::memory_order_relaxed); if( is_valid_page(srcp) ) { ticket_type g_index = head_counter.load(std::memory_order_relaxed); size_type n_items = (tail_counter.load(std::memory_order_relaxed) - head_counter.load(std::memory_order_relaxed)) / queue_rep_type::n_queue; size_type index = modulo_power_of_two(head_counter.load(std::memory_order_relaxed) / queue_rep_type::n_queue, items_per_page); size_type end_in_first_page = (index+n_items < items_per_page) ? 
(index + n_items) : items_per_page; try_call( [&] { head_page.store(make_copy(allocator, srcp, index, end_in_first_page, g_index, construct_item), std::memory_order_relaxed); }).on_exception( [&] { head_counter.store(0, std::memory_order_relaxed); tail_counter.store(0, std::memory_order_relaxed); }); padded_page* cur_page = head_page.load(std::memory_order_relaxed); try_call( [&] { if (srcp != src.tail_page.load(std::memory_order_relaxed)) { for (srcp = srcp->next; srcp != src.tail_page.load(std::memory_order_relaxed); srcp=srcp->next ) { cur_page->next = make_copy( allocator, srcp, 0, items_per_page, g_index, construct_item ); cur_page = cur_page->next; } __TBB_ASSERT(srcp == src.tail_page.load(std::memory_order_relaxed), nullptr ); size_type last_index = modulo_power_of_two(tail_counter.load(std::memory_order_relaxed) / queue_rep_type::n_queue, items_per_page); if( last_index==0 ) last_index = items_per_page; cur_page->next = make_copy( allocator, srcp, 0, last_index, g_index, construct_item ); cur_page = cur_page->next; } tail_page.store(cur_page, std::memory_order_relaxed); }).on_exception( [&] { padded_page* invalid_page = reinterpret_cast(std::uintptr_t(1)); tail_page.store(invalid_page, std::memory_order_relaxed); }); } else { head_page.store(nullptr, std::memory_order_relaxed); tail_page.store(nullptr, std::memory_order_relaxed); } return *this; } padded_page* make_copy( queue_allocator_type& allocator, const padded_page* src_page, size_type begin_in_page, size_type end_in_page, ticket_type& g_index, item_constructor_type construct_item ) { page_allocator_type page_allocator(allocator); padded_page* new_page = page_allocator_traits::allocate(page_allocator, 1); new_page->next = nullptr; new_page->mask.store(src_page->mask.load(std::memory_order_relaxed), std::memory_order_relaxed); for (; begin_in_page!=end_in_page; ++begin_in_page, ++g_index) { if (new_page->mask.load(std::memory_order_relaxed) & uintptr_t(1) << begin_in_page) { copy_item(*new_page, 
begin_in_page, *src_page, begin_in_page, construct_item); } } return new_page; } void invalidate_page( ticket_type k ) { // Append an invalid page at address 1 so that no more pushes are allowed. padded_page* invalid_page = reinterpret_cast(std::uintptr_t(1)); { spin_mutex::scoped_lock lock( page_mutex ); tail_counter.store(k + queue_rep_type::n_queue + 1, std::memory_order_relaxed); padded_page* q = tail_page.load(std::memory_order_relaxed); if (is_valid_page(q)) { q->next = invalid_page; } else { head_page.store(invalid_page, std::memory_order_relaxed); } tail_page.store(invalid_page, std::memory_order_relaxed); } } padded_page* get_head_page() { return head_page.load(std::memory_order_relaxed); } void clear(queue_allocator_type& allocator, padded_page* new_head = nullptr, padded_page* new_tail = nullptr) { padded_page* curr_page = get_head_page(); size_type index = (head_counter.load(std::memory_order_relaxed) / queue_rep_type::n_queue) % items_per_page; page_allocator_type page_allocator(allocator); while (curr_page && is_valid_page(curr_page)) { while (index != items_per_page) { if (curr_page->mask.load(std::memory_order_relaxed) & (std::uintptr_t(1) << index)) { page_allocator_traits::destroy(page_allocator, &curr_page->operator[](index)); } ++index; } index = 0; padded_page* next_page = curr_page->next; page_allocator_traits::destroy(page_allocator, curr_page); page_allocator_traits::deallocate(page_allocator, curr_page, 1); curr_page = next_page; } head_counter.store(0, std::memory_order_relaxed); tail_counter.store(0, std::memory_order_relaxed); head_page.store(new_head, std::memory_order_relaxed); tail_page.store(new_tail, std::memory_order_relaxed); } void clear_and_invalidate(queue_allocator_type& allocator) { padded_page* invalid_page = reinterpret_cast(std::uintptr_t(1)); clear(allocator, invalid_page, invalid_page); } private: // template friend class micro_queue_pop_finalizer; // Class used to ensure exception-safety of method "pop" class destroyer 
{ value_type& my_value; public: destroyer( reference value ) : my_value(value) {} destroyer( const destroyer& ) = delete; destroyer& operator=( const destroyer& ) = delete; ~destroyer() {my_value.~T();} }; // class destroyer void copy_item( padded_page& dst, size_type dindex, const padded_page& src, size_type sindex, item_constructor_type construct_item ) { auto& src_item = src[sindex]; construct_item( &dst[dindex], static_cast(&src_item) ); } void assign_and_destroy_item( void* dst, padded_page& src, size_type index ) { auto& from = src[index]; destroyer d(from); *static_cast(dst) = std::move(from); } void spin_wait_until_my_turn( std::atomic& counter, ticket_type k, queue_rep_type& rb ) const { for (atomic_backoff b{};; b.pause()) { ticket_type c = counter.load(std::memory_order_acquire); if (c == k) return; else if (c & 1) { ++rb.n_invalid_entries; throw_exception( exception_id::bad_last_alloc); } } } std::atomic head_page{}; std::atomic head_counter{}; std::atomic tail_page{}; std::atomic tail_counter{}; spin_mutex page_mutex{}; }; // class micro_queue #if _MSC_VER && !defined(__INTEL_COMPILER) #pragma warning( pop ) #endif // warning 4146 is back template class micro_queue_pop_finalizer { public: using padded_page = typename Container::padded_page; using allocator_type = Allocator; using allocator_traits_type = tbb::detail::allocator_traits; micro_queue_pop_finalizer( Container& queue, Allocator& alloc, ticket_type k, padded_page* p ) : my_ticket_type(k), my_queue(queue), my_page(p), allocator(alloc) {} micro_queue_pop_finalizer( const micro_queue_pop_finalizer& ) = delete; micro_queue_pop_finalizer& operator=( const micro_queue_pop_finalizer& ) = delete; ~micro_queue_pop_finalizer() { padded_page* p = my_page; if( is_valid_page(p) ) { spin_mutex::scoped_lock lock( my_queue.page_mutex ); padded_page* q = p->next; my_queue.head_page.store(q, std::memory_order_relaxed); if( !is_valid_page(q) ) { my_queue.tail_page.store(nullptr, std::memory_order_relaxed); } } 
my_queue.head_counter.store(my_ticket_type, std::memory_order_release); if ( is_valid_page(p) ) { allocator_traits_type::destroy(allocator, static_cast(p)); allocator_traits_type::deallocate(allocator, static_cast(p), 1); } } private: ticket_type my_ticket_type; Container& my_queue; padded_page* my_page; Allocator& allocator; }; // class micro_queue_pop_finalizer #if _MSC_VER && !defined(__INTEL_COMPILER) // structure was padded due to alignment specifier #pragma warning( push ) #pragma warning( disable: 4324 ) #endif template struct concurrent_queue_rep { using self_type = concurrent_queue_rep; using size_type = std::size_t; using micro_queue_type = micro_queue; using allocator_type = Allocator; using allocator_traits_type = tbb::detail::allocator_traits; using padded_page = typename micro_queue_type::padded_page; using page_allocator_type = typename micro_queue_type::page_allocator_type; using item_constructor_type = typename micro_queue_type::item_constructor_type; private: using page_allocator_traits = tbb::detail::allocator_traits; using queue_allocator_type = typename allocator_traits_type::template rebind_alloc; public: // must be power of 2 static constexpr size_type n_queue = 8; // Approximately n_queue/golden ratio static constexpr size_type phi = 3; static constexpr size_type item_size = micro_queue_type::item_size; static constexpr size_type items_per_page = micro_queue_type::items_per_page; concurrent_queue_rep() {} concurrent_queue_rep( const concurrent_queue_rep& ) = delete; concurrent_queue_rep& operator=( const concurrent_queue_rep& ) = delete; void clear( queue_allocator_type& alloc ) { for (size_type index = 0; index < n_queue; ++index) { array[index].clear(alloc); } head_counter.store(0, std::memory_order_relaxed); tail_counter.store(0, std::memory_order_relaxed); n_invalid_entries.store(0, std::memory_order_relaxed); } void assign( const concurrent_queue_rep& src, queue_allocator_type& alloc, item_constructor_type construct_item ) { 
head_counter.store(src.head_counter.load(std::memory_order_relaxed), std::memory_order_relaxed); tail_counter.store(src.tail_counter.load(std::memory_order_relaxed), std::memory_order_relaxed); n_invalid_entries.store(src.n_invalid_entries.load(std::memory_order_relaxed), std::memory_order_relaxed); // copy or move micro_queues size_type queue_idx = 0; try_call( [&] { for (; queue_idx < n_queue; ++queue_idx) { array[queue_idx].assign(src.array[queue_idx], alloc, construct_item); } }).on_exception( [&] { for (size_type i = 0; i < queue_idx + 1; ++i) { array[i].clear_and_invalidate(alloc); } head_counter.store(0, std::memory_order_relaxed); tail_counter.store(0, std::memory_order_relaxed); n_invalid_entries.store(0, std::memory_order_relaxed); }); __TBB_ASSERT(head_counter.load(std::memory_order_relaxed) == src.head_counter.load(std::memory_order_relaxed) && tail_counter.load(std::memory_order_relaxed) == src.tail_counter.load(std::memory_order_relaxed), "the source concurrent queue should not be concurrently modified." ); } bool empty() const { ticket_type tc = tail_counter.load(std::memory_order_acquire); ticket_type hc = head_counter.load(std::memory_order_relaxed); // if tc!=r.tail_counter, the queue was not empty at some point between the two reads. return tc == tail_counter.load(std::memory_order_relaxed) && std::ptrdiff_t(tc - hc - n_invalid_entries.load(std::memory_order_relaxed)) <= 0; } std::ptrdiff_t size() const { __TBB_ASSERT(sizeof(std::ptrdiff_t) <= sizeof(size_type), nullptr); std::ptrdiff_t hc = head_counter.load(std::memory_order_acquire); std::ptrdiff_t tc = tail_counter.load(std::memory_order_relaxed); std::ptrdiff_t nie = n_invalid_entries.load(std::memory_order_relaxed); return tc - hc - nie; } friend class micro_queue; // Map ticket_type to an array index static size_type index( ticket_type k ) { return k * phi % n_queue; } micro_queue_type& choose( ticket_type k ) { // The formula here approximates LRU in a cache-oblivious way. 
return array[index(k)]; } alignas(max_nfs_size) micro_queue_type array[n_queue]; alignas(max_nfs_size) std::atomic head_counter{}; alignas(max_nfs_size) std::atomic tail_counter{}; alignas(max_nfs_size) std::atomic n_invalid_entries{}; }; // class concurrent_queue_rep #if _MSC_VER && !defined(__INTEL_COMPILER) #pragma warning( pop ) #endif /* Iterator state for walking a concurrent_queue_rep: the current item, the queue representation, the current head ticket and one page pointer per micro-queue. */ template class concurrent_queue_iterator_base { using queue_rep_type = concurrent_queue_rep; using padded_page = typename queue_rep_type::padded_page; protected: concurrent_queue_iterator_base() = default; concurrent_queue_iterator_base( const concurrent_queue_iterator_base& other ) { assign(other); } /* Snapshots the head counter and the head page of every micro-queue, then positions on the first slot for which get_item() returns true. */ concurrent_queue_iterator_base( queue_rep_type* queue_rep ) : my_queue_rep(queue_rep), my_head_counter(my_queue_rep->head_counter.load(std::memory_order_relaxed)) { for (std::size_t i = 0; i < queue_rep_type::n_queue; ++i) { my_array[i] = my_queue_rep->array[i].get_head_page(); } if (!get_item(my_item, my_head_counter)) advance(); } /* Copies item and queue pointers; the ticket and page pointers are copied only when other refers to a queue. */ void assign( const concurrent_queue_iterator_base& other ) { my_item = other.my_item; my_queue_rep = other.my_queue_rep; if (my_queue_rep != nullptr) { my_head_counter = other.my_head_counter; for (std::size_t i = 0; i < queue_rep_type::n_queue; ++i) { my_array[i] = other.my_array[i]; } } } /* Moves to the next ticket. When the current slot is the last one of its page, the page pointer of that micro-queue is advanced first. Recurses while get_item() returns false for the new ticket. */ void advance() { __TBB_ASSERT(my_item, "Attempt to increment iterator past end of the queue"); std::size_t k = my_head_counter; #if TBB_USE_ASSERT Value* tmp; get_item(tmp, k); __TBB_ASSERT(my_item == tmp, nullptr); #endif std::size_t i = modulo_power_of_two(k / queue_rep_type::n_queue, my_queue_rep->items_per_page); if (i == my_queue_rep->items_per_page - 1) { padded_page*& root = my_array[queue_rep_type::index(k)]; root = root->next; } // Advance k my_head_counter = ++k; if (!get_item(my_item, k)) advance(); } concurrent_queue_iterator_base& operator=( const concurrent_queue_iterator_base& other ) { this->assign(other); return *this; } /* Sets item to the slot for ticket k, or to nullptr (returning true) when k equals the tail counter. Otherwise returns whether the slot's bit is set in the page mask. */ bool get_item( Value*& item, std::size_t k ) { if (k == 
my_queue_rep->tail_counter.load(std::memory_order_relaxed)) { item = nullptr; return true; } else { padded_page* p = my_array[queue_rep_type::index(k)]; __TBB_ASSERT(p, nullptr); std::size_t i = modulo_power_of_two(k / queue_rep_type::n_queue, my_queue_rep->items_per_page); item = &(*p)[i]; return (p->mask & uintptr_t(1) << i) != 0; } } Value* my_item{ nullptr }; queue_rep_type* my_queue_rep{ nullptr }; ticket_type my_head_counter{}; padded_page* my_array[queue_rep_type::n_queue]{}; }; // class concurrent_queue_iterator_base /* Helper that builds an Iterator from a container by calling Iterator(container); concurrent_queue_iterator declares it a friend so that it can reach the private container constructor. */ struct concurrent_queue_iterator_provider { template static Iterator get( const Container& container ) { return Iterator(container); } }; // struct concurrent_queue_iterator_provider /* Forward iterator over a concurrent queue, layered on concurrent_queue_iterator_base; it is tagged std::forward_iterator_tag and dereferences to the current item. */ template class concurrent_queue_iterator : public concurrent_queue_iterator_base::type, Allocator> { using base_type = concurrent_queue_iterator_base::type, Allocator>; public: using value_type = Value; using pointer = value_type*; using reference = value_type&; using difference_type = std::ptrdiff_t; using iterator_category = std::forward_iterator_tag; concurrent_queue_iterator() = default; /** If Value==Container::value_type, then this routine is the copy constructor. If Value==const Container::value_type, then this routine is a conversion constructor. 
*/ concurrent_queue_iterator( const concurrent_queue_iterator& other ) : base_type(other) {} private: /* Private: builds the iterator from the container's my_queue_representation. */ concurrent_queue_iterator( const Container& container ) : base_type(container.my_queue_representation) {} public: concurrent_queue_iterator& operator=( const concurrent_queue_iterator& other ) { this->assign(other); return *this; } reference operator*() const { return *static_cast(this->my_item); } pointer operator->() const { return &operator*(); } /* Pre-increment: delegates to advance(). */ concurrent_queue_iterator& operator++() { this->advance(); return *this; } /* Post-increment: returns a copy taken before advancing. */ concurrent_queue_iterator operator++(int) { concurrent_queue_iterator tmp = *this; ++*this; return tmp; } /* Iterators compare equal when they point at the same item. */ friend bool operator==( const concurrent_queue_iterator& lhs, const concurrent_queue_iterator& rhs ) { return lhs.my_item == rhs.my_item; } friend bool operator!=( const concurrent_queue_iterator& lhs, const concurrent_queue_iterator& rhs ) { return lhs.my_item != rhs.my_item; } private: friend struct concurrent_queue_iterator_provider; }; // class concurrent_queue_iterator } // namespace d2 } // namespace detail } // tbb #endif // __TBB_detail__concurrent_queue_base_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_concurrent_skip_list.h ================================================ /* Copyright (c) 2019-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_detail__concurrent_skip_list_H #define __TBB_detail__concurrent_skip_list_H #if !defined(__TBB_concurrent_map_H) && !defined(__TBB_concurrent_set_H) #error Do not #include this internal file directly; use public TBB headers instead. #endif #include "_config.h" #include "_range_common.h" #include "_allocator_traits.h" #include "_template_helpers.h" #include "_node_handle.h" #include "_containers_helpers.h" #include "_assert.h" #include "_exception.h" #include "../enumerable_thread_specific.h" #include #include #include #include #include #include // Need std::geometric_distribution #include // Need std::equal and std::lexicographical_compare #include #if __TBB_CPP20_COMPARISONS_PRESENT #include #endif #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) #pragma warning(push) #pragma warning(disable: 4127) // warning C4127: conditional expression is constant #endif namespace tbb { namespace detail { namespace d2 { /* Skip-list node: holds storage for one value (in an anonymous union), the node height and an index number. The per-level atomic next pointers are placed in memory directly after the node object (see get_atomic_next and calc_node_size). Copy and move are deleted. */ template class skip_list_node { using node_ptr = skip_list_node*; public: using value_type = Value; using atomic_node_ptr = std::atomic; using size_type = std::size_t; using container_allocator_type = Allocator; using reference = value_type&; using const_reference = const value_type&; private: using allocator_traits = tbb::detail::allocator_traits; // Allocator is the same as the container allocator => it allocates raw bytes (create() static_asserts this) // It is required to rebind it to value_type to get the correct pointer and const_pointer using value_allocator_traits = typename allocator_traits::template rebind_traits; public: using pointer = typename value_allocator_traits::pointer; using const_pointer = typename value_allocator_traits::const_pointer; //In a perfect world this constructor and destructor would have been private; //however, this seems technically impractical due to the use of allocator_traits. 
//Should not be called directly, instead use create method skip_list_node( size_type levels ) : my_height(levels), my_index_number(0) {} //Should not be called directly, instead use destroy method ~skip_list_node() {} skip_list_node( const skip_list_node& ) = delete; skip_list_node( skip_list_node&& ) = delete; skip_list_node& operator=( const skip_list_node& ) = delete; skip_list_node& operator=( skip_list_node&& ) = delete; /* Allocates calc_node_size(height) units from alloc, constructs the node in that storage and then constructs one null next pointer per level. The value itself is not constructed here. */ static skip_list_node* create( container_allocator_type& alloc, size_type height ) { size_type sz = calc_node_size(height); static_assert(std::is_same::value, "skip_list_node assumes that passed in allocator operates on bytes"); auto* node = reinterpret_cast(allocator_traits::allocate(alloc, sz)); //Construct the node itself allocator_traits::construct(alloc, node, height); //Construct the level pointers for (size_type l = 0; l < height; ++l) { allocator_traits::construct(alloc, &node->get_atomic_next(l), nullptr); } return node; } /* Counterpart of create: destroys the level pointers, destroys the node and deallocates its storage. The size is computed before the node is destroyed because it depends on node->height(). */ static void destroy( container_allocator_type& alloc, skip_list_node* node ) { //Destroy the level pointers for (size_type l = 0; l < node->height(); ++l) { allocator_traits::destroy(alloc, &node->atomic_next(l)); } size_type sz = calc_node_size(node->height()); // Destroy the node itself allocator_traits::destroy(alloc, node); // Deallocate the node allocator_traits::deallocate(alloc, reinterpret_cast(node), sz); } pointer storage() { return &my_value; } reference value() { return *storage(); } /* Loads the next pointer on the given level with acquire ordering. */ node_ptr next( size_type level ) const { node_ptr res = get_atomic_next(level).load(std::memory_order_acquire); __TBB_ASSERT(res == nullptr || res->height() > level, "Broken internal structure"); return res; } /* Returns a reference to the atomic next pointer on the given level (checked in debug builds). */ atomic_node_ptr& atomic_next( size_type level ) { atomic_node_ptr& res = get_atomic_next(level); #if TBB_USE_DEBUG node_ptr node = res.load(std::memory_order_acquire); __TBB_ASSERT(node == nullptr || node->height() > level, "Broken internal structure"); #endif return res; } /* Stores n as the next pointer on the given level with relaxed ordering. */ void set_next( size_type level, node_ptr n ) 
{ __TBB_ASSERT(n == nullptr || n->height() > level, "Broken internal structure"); get_atomic_next(level).store(n, std::memory_order_relaxed); } size_type height() const { return my_height; } void set_index_number( size_type index_num ) { my_index_number = index_num; } size_type index_number() const { return my_index_number; } private: /* Size of a node with the given height: the node object plus one atomic pointer per level. */ static size_type calc_node_size( size_type height ) { static_assert(alignof(skip_list_node) >= alignof(atomic_node_ptr), "Incorrect alignment"); return sizeof(skip_list_node) + height * sizeof(atomic_node_ptr); } /* The level pointers are addressed as an array that begins immediately after this object (this + 1). */ atomic_node_ptr& get_atomic_next( size_type level ) { atomic_node_ptr* arr = reinterpret_cast(this + 1); return arr[level]; } const atomic_node_ptr& get_atomic_next( size_type level ) const { const atomic_node_ptr* arr = reinterpret_cast(this + 1); return arr[level]; } union { value_type my_value; }; size_type my_height; size_type my_index_number; }; // class skip_list_node /* Forward iterator over skip-list nodes: wraps a node pointer and advances along level 0 via next(0). The end iterator holds nullptr. */ template class skip_list_iterator { using node_type = NodeType; using node_ptr = node_type*; public: using iterator_category = std::forward_iterator_tag; using value_type = ValueType; using difference_type = std::ptrdiff_t; using pointer = value_type*; using reference = value_type&; skip_list_iterator() : skip_list_iterator(nullptr) {} skip_list_iterator( const skip_list_iterator& other ) : my_node_ptr(other.my_node_ptr) {} skip_list_iterator& operator=( const skip_list_iterator& other ) { my_node_ptr = other.my_node_ptr; return *this; } reference operator*() const { return my_node_ptr->value(); } pointer operator->() const { return my_node_ptr->storage(); } skip_list_iterator& operator++() { __TBB_ASSERT(my_node_ptr != nullptr, nullptr); my_node_ptr = my_node_ptr->next(0); return *this; } skip_list_iterator operator++(int) { skip_list_iterator tmp = *this; ++*this; return tmp; } private: skip_list_iterator(node_type* n) : my_node_ptr(n) {} node_ptr my_node_ptr; template friend class concurrent_skip_list; template friend class skip_list_iterator; friend 
class const_range; friend class range; /* Iterators compare equal when they wrap the same node pointer. */ friend bool operator==( const skip_list_iterator& lhs, const skip_list_iterator& rhs ) { return lhs.my_node_ptr == rhs.my_node_ptr; } friend bool operator!=( const skip_list_iterator& lhs, const skip_list_iterator& rhs ) { return lhs.my_node_ptr != rhs.my_node_ptr; } }; // class skip_list_iterator /* Skip-list container configured through Traits, which supplies the key and value types, the comparator, the allocator, max_level, the random level generator and the allow_multimapping flag. */ template class concurrent_skip_list { protected: using container_traits = Traits; using self_type = concurrent_skip_list; using allocator_type = typename container_traits::allocator_type; using allocator_traits_type = tbb::detail::allocator_traits; using key_compare = typename container_traits::compare_type; using value_compare = typename container_traits::value_compare; using key_type = typename container_traits::key_type; using value_type = typename container_traits::value_type; static_assert(std::is_same::value, "value_type of the container should be the same as its allocator"); using size_type = std::size_t; using difference_type = std::ptrdiff_t; static constexpr size_type max_level = container_traits::max_level; using node_allocator_type = typename allocator_traits_type::template rebind_alloc; using node_allocator_traits = tbb::detail::allocator_traits; using list_node_type = skip_list_node; using node_type = d1::node_handle; using iterator = skip_list_iterator; using const_iterator = skip_list_iterator; using reference = value_type&; using const_reference = const value_type&; using pointer = typename allocator_traits_type::pointer; using const_pointer = typename allocator_traits_type::const_pointer; using random_level_generator_type = typename container_traits::random_level_generator_type; using node_ptr = list_node_type*; using array_type = std::array; private: template using is_transparent = dependent_bool, T>; public: static constexpr bool allow_multimapping = container_traits::allow_multimapping; /* Default constructor: no head node is allocated; size and max height start at zero. */ concurrent_skip_list() : my_head_ptr(nullptr), my_size(0), my_max_height(0) {} explicit concurrent_skip_list( const key_compare& 
comp, const allocator_type& alloc = allocator_type() ) : my_node_allocator(alloc), my_compare(comp), my_head_ptr(nullptr), my_size(0), my_max_height(0) {} explicit concurrent_skip_list( const allocator_type& alloc ) : concurrent_skip_list(key_compare(), alloc) {} /* Range constructor: delegates to the (comp, alloc) constructor, then inserts [first, last) through internal_copy. */ template concurrent_skip_list( InputIterator first, InputIterator last, const key_compare& comp = key_compare(), const allocator_type& alloc = allocator_type() ) : concurrent_skip_list(comp, alloc) { internal_copy(first, last); } template concurrent_skip_list( InputIterator first, InputIterator last, const allocator_type& alloc ) : concurrent_skip_list(first, last, key_compare(), alloc) {} concurrent_skip_list( std::initializer_list init, const key_compare& comp = key_compare(), const allocator_type& alloc = allocator_type() ) : concurrent_skip_list(init.begin(), init.end(), comp, alloc) {} concurrent_skip_list( std::initializer_list init, const allocator_type& alloc ) : concurrent_skip_list(init, key_compare(), alloc) {} /* Copy constructor: obtains the allocator via select_on_container_copy_construction, copies comparator and level generator, then inserts copies of other's elements. */ concurrent_skip_list( const concurrent_skip_list& other ) : my_node_allocator(node_allocator_traits::select_on_container_copy_construction(other.get_allocator())), my_compare(other.my_compare), my_rng(other.my_rng), my_head_ptr(nullptr), my_size(0), my_max_height(0) { internal_copy(other); __TBB_ASSERT(my_size == other.my_size, "Wrong size of copy-constructed container"); } /* Same as the copy constructor, but uses the supplied allocator. */ concurrent_skip_list( const concurrent_skip_list& other, const allocator_type& alloc ) : my_node_allocator(alloc), my_compare(other.my_compare), my_rng(other.my_rng), my_head_ptr(nullptr), my_size(0), my_max_height(0) { internal_copy(other); __TBB_ASSERT(my_size == other.my_size, "Wrong size of copy-constructed container"); } /* Move constructor: moves the allocator and level generator, copies the comparator, and takes over other's head, size and max height via internal_move. */ concurrent_skip_list( concurrent_skip_list&& other ) : my_node_allocator(std::move(other.my_node_allocator)), my_compare(other.my_compare), my_rng(std::move(other.my_rng)), my_head_ptr(nullptr) // my_head_ptr would be stored in internal_move { internal_move(std::move(other)); } 
concurrent_skip_list( concurrent_skip_list&& other, const allocator_type& alloc ) : my_node_allocator(alloc), my_compare(other.my_compare), my_rng(std::move(other.my_rng)), my_head_ptr(nullptr) { using is_always_equal = typename allocator_traits_type::is_always_equal; internal_move_construct_with_allocator(std::move(other), is_always_equal()); } /* Clears the container, then calls delete_head(). */ ~concurrent_skip_list() { clear(); delete_head(); } /* Copy assignment (no-op on self-assignment): clears this container, assigns allocator, comparator and level generator from other, then inserts copies of other's elements. */ concurrent_skip_list& operator=( const concurrent_skip_list& other ) { if (this != &other) { clear(); copy_assign_allocators(my_node_allocator, other.my_node_allocator); my_compare = other.my_compare; my_rng = other.my_rng; internal_copy(other); } return *this; } /* Move assignment (no-op on self-assignment): clears this container and deletes its head, moves comparator and level generator, then dispatches to internal_move_assign based on the allocator's propagate_on_container_move_assignment / is_always_equal traits. */ concurrent_skip_list& operator=( concurrent_skip_list&& other ) { if (this != &other) { clear(); delete_head(); my_compare = std::move(other.my_compare); my_rng = std::move(other.my_rng); move_assign_allocators(my_node_allocator, other.my_node_allocator); using pocma_type = typename node_allocator_traits::propagate_on_container_move_assignment; using is_always_equal = typename node_allocator_traits::is_always_equal; internal_move_assign(std::move(other), tbb::detail::disjunction()); } return *this; } /* Replaces the contents with the elements of il. */ concurrent_skip_list& operator=( std::initializer_list il ) { clear(); insert(il.begin(),il.end()); return *this; } std::pair insert( const value_type& value ) { return internal_insert(value); } std::pair insert( value_type&& value ) { return internal_insert(std::move(value)); } iterator insert( const_iterator, const_reference value ) { // Ignore hint return insert(value).first; } iterator insert( const_iterator, value_type&& value ) { // Ignore hint return insert(std::move(value)).first; } /* Inserts the elements of [first, last) one at a time. */ template void insert( InputIterator first, InputIterator last ) { while (first != last) { insert(*first); ++first; } } void insert( std::initializer_list init ) { insert(init.begin(), init.end()); } /* Inserts the node owned by nh. The handle is deactivated only when the insertion succeeded; an empty handle yields {end(), false}. */ std::pair insert( node_type&& nh ) { if (!nh.empty()) { auto insert_node = d1::node_handle_accessor::get_node_ptr(nh); std::pair 
insert_result = internal_insert_node(insert_node); if (insert_result.second) { d1::node_handle_accessor::deactivate(nh); } return insert_result; } return std::pair(end(), false); } iterator insert( const_iterator, node_type&& nh ) { // Ignore hint return insert(std::move(nh)).first; } /* Forwards args to internal_insert. */ template std::pair emplace( Args&&... args ) { return internal_insert(std::forward(args)...); } template iterator emplace_hint( const_iterator, Args&&... args ) { // Ignore hint return emplace(std::forward(args)...).first; } /* Extracts the node at pos and deletes it; returns the iterator that followed it, or end() when nothing was extracted. */ iterator unsafe_erase( iterator pos ) { std::pair extract_result = internal_extract(pos); if (extract_result.first) { // node was extracted delete_value_node(extract_result.first); return extract_result.second; } return end(); } iterator unsafe_erase( const_iterator pos ) { return unsafe_erase(get_iterator(pos)); } /* Erases [first, last) one element at a time, continuing from the iterator returned by each single-element erase. */ iterator unsafe_erase( const_iterator first, const_iterator last ) { while (first != last) { // Unsafe erase returns the iterator which follows the erased one first = unsafe_erase(first); } return get_iterator(first); } size_type unsafe_erase( const key_type& key ) { return internal_erase(key); } template typename std::enable_if::value && !std::is_convertible::value && !std::is_convertible::value, size_type>::type unsafe_erase( const K& key ) { return internal_erase(key); } /* Extracts the node at pos into a node handle; returns an empty handle when nothing was extracted. */ node_type unsafe_extract( const_iterator pos ) { std::pair extract_result = internal_extract(pos); return extract_result.first ? 
d1::node_handle_accessor::construct(extract_result.first) : node_type(); } node_type unsafe_extract( iterator pos ) { return unsafe_extract(const_iterator(pos)); } /* Key-based extraction: looks the key up with find() and extracts the node found. */ node_type unsafe_extract( const key_type& key ) { return unsafe_extract(find(key)); } template typename std::enable_if::value && !std::is_convertible::value && !std::is_convertible::value, node_type>::type unsafe_extract( const K& key ) { return unsafe_extract(find(key)); } /* The lower_bound overloads search via internal_get_bound with my_compare. */ iterator lower_bound( const key_type& key ) { return iterator(internal_get_bound(key, my_compare)); } const_iterator lower_bound( const key_type& key ) const { return const_iterator(internal_get_bound(key, my_compare)); } template typename std::enable_if::value, iterator>::type lower_bound( const K& key ) { return iterator(internal_get_bound(key, my_compare)); } template typename std::enable_if::value, const_iterator>::type lower_bound( const K& key ) const { return const_iterator(internal_get_bound(key, my_compare)); } /* The upper_bound overloads use the same search with not_greater_compare(my_compare). */ iterator upper_bound( const key_type& key ) { return iterator(internal_get_bound(key, not_greater_compare(my_compare))); } const_iterator upper_bound( const key_type& key ) const { return const_iterator(internal_get_bound(key, not_greater_compare(my_compare))); } template typename std::enable_if::value, iterator>::type upper_bound( const K& key ) { return iterator(internal_get_bound(key, not_greater_compare(my_compare))); } template typename std::enable_if::value, const_iterator>::type upper_bound( const K& key ) const { return const_iterator(internal_get_bound(key, not_greater_compare(my_compare))); } /* The find overloads wrap the node returned by internal_find; a null node gives an iterator equal to end(). */ iterator find( const key_type& key ) { return iterator(internal_find(key)); } const_iterator find( const key_type& key ) const { return const_iterator(internal_find(key)); } template typename std::enable_if::value, iterator>::type find( const K& key ) { return iterator(internal_find(key)); } template typename std::enable_if::value, const_iterator>::type find( const K& key ) const { return 
const_iterator(internal_find(key)); } size_type count( const key_type& key ) const { return internal_count(key); } template typename std::enable_if::value, size_type>::type count( const K& key ) const { return internal_count(key); } bool contains( const key_type& key ) const { return find(key) != end(); } template typename std::enable_if::value, bool>::type contains( const K& key ) const { return find(key) != end(); } /* Deletes every value node reachable along level 0, resets the head's next pointer on each level and zeroes size and max height. The head node itself is kept; nothing happens when no head is allocated. */ void clear() noexcept { // clear is not thread safe - load can be relaxed node_ptr head = my_head_ptr.load(std::memory_order_relaxed); if (head == nullptr) return; // Head is not allocated => container is empty node_ptr current = head->next(0); // Delete all value nodes in the container while (current) { node_ptr next = current->next(0); delete_value_node(current); current = next; } for (size_type level = 0; level < head->height(); ++level) { head->set_next(level, nullptr); } my_size.store(0, std::memory_order_relaxed); my_max_height.store(0, std::memory_order_relaxed); } iterator begin() { return iterator(internal_begin()); } const_iterator begin() const { return const_iterator(internal_begin()); } const_iterator cbegin() const { return const_iterator(internal_begin()); } /* The end iterator wraps a null node pointer. */ iterator end() { return iterator(nullptr); } const_iterator end() const { return const_iterator(nullptr); } const_iterator cend() const { return const_iterator(nullptr); } /* Relaxed load of the element counter. */ size_type size() const { return my_size.load(std::memory_order_relaxed); } size_type max_size() const { return node_allocator_traits::max_size(my_node_allocator); } __TBB_nodiscard bool empty() const { return 0 == size(); } allocator_type get_allocator() const { return my_node_allocator; } /* Swaps with other unless it is the same object; the internal_swap overload is selected from the allocator's propagate_on_container_swap / is_always_equal traits. */ void swap(concurrent_skip_list& other) { if (this != &other) { using pocs_type = typename node_allocator_traits::propagate_on_container_swap; using is_always_equal = typename node_allocator_traits::is_always_equal; internal_swap(other, tbb::detail::disjunction()); } } std::pair equal_range(const key_type& key) { return 
internal_equal_range(key); } std::pair equal_range(const key_type& key) const { return internal_equal_range(key); } template typename std::enable_if::value, std::pair>::type equal_range( const K& key ) { return internal_equal_range(key); } template typename std::enable_if::value, std::pair>::type equal_range( const K& key ) const { return internal_equal_range(key); } key_compare key_comp() const { return my_compare; } value_compare value_comp() const { return container_traits::value_comp(my_compare); } /* Range over [my_begin, my_end) with a splitting constructor; my_level is the skip-list level used to pick the split point. */ class const_range_type { public: using size_type = typename concurrent_skip_list::size_type; using difference_type = typename concurrent_skip_list::difference_type; using iterator = typename concurrent_skip_list::const_iterator; using value_type = typename iterator::value_type; using reference = typename iterator::reference; /* True when my_begin is null or when the node after my_begin on level 0 is my_end. */ bool empty() const { return my_begin.my_node_ptr ? (my_begin.my_node_ptr->next(0) == my_end.my_node_ptr) : true; } /* True when my_begin is non-null, my_level is non-zero and the next node on level my_level - 1 is not my_end. */ bool is_divisible() const { return my_begin.my_node_ptr && my_level != 0 ? my_begin.my_node_ptr->next(my_level - 1) != my_end.my_node_ptr : false; } size_type size() const { return std::distance(my_begin, my_end); } /* Splitting constructor: the new range starts at the node following r.my_begin on level r.my_level - 1 and keeps r's old end; r is truncated to end where the new range begins. If r is empty, the new range is empty as well. */ const_range_type( const_range_type& r, split) : my_end(r.my_end) { if (r.empty()) { __TBB_ASSERT(my_end.my_node_ptr == nullptr, nullptr); my_begin = my_end; my_level = 0; } else { my_begin = iterator(r.my_begin.my_node_ptr->next(r.my_level - 1)); my_level = my_begin.my_node_ptr->height(); } r.my_end = my_begin; } /* Whole-container range; my_level is the height of the first node, or 0 for an empty container. */ const_range_type( const concurrent_skip_list& l) : my_end(l.end()), my_begin(l.begin()), my_level(my_begin.my_node_ptr ? 
my_begin.my_node_ptr->height() : 0) {} iterator begin() const { return my_begin; } iterator end() const { return my_end; } size_type grainsize() const { return 1; } private: const_iterator my_end; const_iterator my_begin; size_type my_level; }; // class const_range_type /* Mutable counterpart of const_range_type: begin() and end() return non-const iterators built from the same node pointers. */ class range_type : public const_range_type { public: using iterator = typename concurrent_skip_list::iterator; using value_type = typename iterator::value_type; using reference = typename iterator::reference; range_type(range_type& r, split) : const_range_type(r, split()) {} range_type(const concurrent_skip_list& l) : const_range_type(l) {} iterator begin() const { node_ptr node = const_range_type::begin().my_node_ptr; return iterator(node); } iterator end() const { node_ptr node = const_range_type::end().my_node_ptr; return iterator(node); } }; // class range_type range_type range() { return range_type(*this); } const_range_type range() const { return const_range_type(*this); } private: /* Returns the first value node (head->next(0)), or nullptr when no head node is allocated. */ node_ptr internal_begin() const { node_ptr head = get_head(); return head == nullptr ? 
head : head->next(0); } /* Takes over other's head pointer, size and max height, leaving other with a null head and zero size/height. All accesses are relaxed. */ void internal_move(concurrent_skip_list&& other) { my_head_ptr.store(other.my_head_ptr.load(std::memory_order_relaxed), std::memory_order_relaxed); other.my_head_ptr.store(nullptr, std::memory_order_relaxed); my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); other.my_size.store(0, std::memory_order_relaxed); my_max_height.store(other.my_max_height.load(std::memory_order_relaxed), std::memory_order_relaxed); other.my_max_height.store(0, std::memory_order_relaxed); } void internal_move_construct_with_allocator(concurrent_skip_list&& other, /*is_always_equal = */std::true_type) { internal_move(std::move(other)); } /* Allocators may differ: take over the whole structure when they compare equal, otherwise insert the elements one by one through move iterators. */ void internal_move_construct_with_allocator(concurrent_skip_list&& other, /*is_always_equal = */std::false_type) { if (my_node_allocator == other.get_allocator()) { internal_move(std::move(other)); } else { my_size.store(0, std::memory_order_relaxed); my_max_height.store(other.my_max_height.load(std::memory_order_relaxed), std::memory_order_relaxed); internal_copy(std::make_move_iterator(other.begin()), std::make_move_iterator(other.end())); } } /* Key of the value stored in node n (n must be non-null). */ static const key_type& get_key( node_ptr n ) { __TBB_ASSERT(n, nullptr); return container_traits::get_key(static_cast(n)->value()); } /* True when node is non-null and key does not compare less than the node's key. */ template bool found( node_ptr node, const K& key ) const { return node != nullptr && !my_compare(key, get_key(node)); } /* Dispatches to the multi or unique lookup depending on allow_multimapping. */ template node_ptr internal_find(const K& key) const { return allow_multimapping ? 
internal_find_multi(key) : internal_find_unique(key); } /* Descends from the current max height; on each level it finds the position for key and returns the node as soon as a newly reached node matches the key. Returns nullptr when the head is not allocated or no match is found. */ template node_ptr internal_find_multi( const K& key ) const { node_ptr prev = get_head(); if (prev == nullptr) return nullptr; // If the head node is not allocated - exit node_ptr curr = nullptr; node_ptr old_curr = curr; for (size_type h = my_max_height.load(std::memory_order_acquire); h > 0; --h) { curr = internal_find_position(h - 1, prev, key, my_compare); if (curr != old_curr && found(curr, key)) { return curr; } old_curr = curr; } return nullptr; } /* Uses lower_bound and rejects the result when it is end() or when key compares less than the found key. */ template node_ptr internal_find_unique( const K& key ) const { const_iterator it = lower_bound(key); return (it == end() || my_compare(key, container_traits::get_key(*it))) ? nullptr : it.my_node_ptr; } /* Multi containers count via equal_range and std::distance; unique containers return 1 or 0 depending on contains(key). */ template size_type internal_count( const K& key ) const { if (allow_multimapping) { // TODO: reimplement without double traversal std::pair r = equal_range(key); return std::distance(r.first, r.second); } return size_type(contains(key) ? 1 : 0); } /* Returns {lower bound, end of the matching run}. When the lower bound does not match the key, both iterators equal the lower bound. */ template std::pair internal_equal_range(const K& key) const { iterator lb = get_iterator(lower_bound(key)); auto result = std::make_pair(lb, lb); // If the lower bound points to the node with the requested key if (found(lb.my_node_ptr, key)) { if (!allow_multimapping) { // For unique containers - move the second iterator forward and exit ++result.second; } else { // For multi containers - find the upper bound starting from the lower bound node_ptr prev = lb.my_node_ptr; node_ptr curr = nullptr; not_greater_compare cmp(my_compare); // Start from the lower bound of the range for (size_type h = prev->height(); h > 0; --h) { curr = prev->next(h - 1); while (curr && cmp(get_key(curr), key)) { prev = curr; // If the height of the next node is greater than the current one - jump to its height if (h < curr->height()) { h = curr->height(); } curr = prev->next(h - 1); } } result.second = iterator(curr); } } return result; } // Finds position on the level using comparator cmp starting from the node prev template node_ptr 
internal_find_position( size_type level, node_ptr& prev, const K& key, const Comparator& cmp ) const { __TBB_ASSERT(level < prev->height(), "Wrong level to find position"); node_ptr curr = prev->next(level); while (curr && cmp(get_key(curr), key)) { prev = curr; __TBB_ASSERT(level < prev->height(), nullptr); curr = prev->next(level); } return curr; } // The same as previous overload, but allows index_number comparison template node_ptr internal_find_position( size_type level, node_ptr& prev, node_ptr node, const Comparator& cmp ) const { __TBB_ASSERT(level < prev->height(), "Wrong level to find position"); node_ptr curr = prev->next(level); while (curr && cmp(get_key(curr), get_key(node))) { if (allow_multimapping && cmp(get_key(node), get_key(curr)) && curr->index_number() > node->index_number()) { break; } prev = curr; __TBB_ASSERT(level < prev->height(), nullptr); curr = prev->next(level); } return curr; } /* Fills prev_nodes/curr_nodes with the predecessor and successor of key on every level below the current max height. Levels between the current max height and the node's height get head as predecessor and nullptr as successor. */ template void fill_prev_curr_arrays(array_type& prev_nodes, array_type& curr_nodes, node_ptr node, const key_type& key, const Comparator& cmp, node_ptr head ) { size_type curr_max_height = my_max_height.load(std::memory_order_acquire); size_type node_height = node->height(); if (curr_max_height < node_height) { std::fill(prev_nodes.begin() + curr_max_height, prev_nodes.begin() + node_height, head); std::fill(curr_nodes.begin() + curr_max_height, curr_nodes.begin() + node_height, nullptr); } node_ptr prev = head; for (size_type level = curr_max_height; level > 0; --level) { node_ptr curr = internal_find_position(level - 1, prev, key, cmp); prev_nodes[level - 1] = prev; curr_nodes[level - 1] = curr; } } /* For a node already linked into the list, records its direct predecessor on each of its levels by walking next pointers from the head; the walk compares node pointers, not keys. */ void fill_prev_array_for_existing_node( array_type& prev_nodes, node_ptr node ) { node_ptr head = create_head_if_necessary(); prev_nodes.fill(head); node_ptr prev = head; for (size_type level = node->height(); level > 0; --level) { while (prev->next(level - 1) != node) { prev = prev->next(level - 1); } prev_nodes[level - 1] = prev; } } /* Comparator adaptor: operator() returns !less(second, first). */ struct not_greater_compare 
{ const key_compare& my_less_compare; not_greater_compare( const key_compare& less_compare ) : my_less_compare(less_compare) {} template bool operator()( const K1& first, const K2& second ) const { return !my_less_compare(second, first); } }; /* Multi containers search with not_greater_compare; unique containers use my_compare directly. */ not_greater_compare select_comparator( /*allow_multimapping = */ std::true_type ) { return not_greater_compare(my_compare); } key_compare select_comparator( /*allow_multimapping = */ std::false_type ) { return my_compare; } /* Creates a value node from args and tries to insert it; the node is deleted again when the insertion fails. */ template std::pair internal_insert( Args&&... args ) { node_ptr new_node = create_value_node(std::forward(args)...); std::pair insert_result = internal_insert_node(new_node); if (!insert_result.second) { delete_value_node(new_node); } return insert_result; } /* Links new_node into the list. Level 0 is linked with compare_exchange_strong, and the search is repeated when the exchange fails. For unique containers an existing equal key ends the insertion with {existing, false}. After level 0 succeeds, my_max_height is raised if needed and the upper levels are linked. */ std::pair internal_insert_node( node_ptr new_node ) { array_type prev_nodes; array_type curr_nodes; size_type new_height = new_node->height(); auto compare = select_comparator(std::integral_constant{}); node_ptr head_node = create_head_if_necessary(); for (;;) { fill_prev_curr_arrays(prev_nodes, curr_nodes, new_node, get_key(new_node), compare, head_node); node_ptr prev = prev_nodes[0]; node_ptr next = curr_nodes[0]; if (allow_multimapping) { new_node->set_index_number(prev->index_number() + 1); } else { if (found(next, get_key(new_node))) { return std::pair(iterator(next), false); } } new_node->set_next(0, next); if (!prev->atomic_next(0).compare_exchange_strong(next, new_node)) { continue; } // If the node was successfully linked on the first level - it will be linked on other levels // Insertion cannot fail starting from this point // If the height of inserted node is greater than maximum - increase maximum size_type max_height = my_max_height.load(std::memory_order_acquire); for (;;) { if (new_height <= max_height || my_max_height.compare_exchange_strong(max_height, new_height)) { // If the maximum was successfully updated by current thread // or by an other thread for the value, greater or equal to new_height break; } } for (std::size_t level = 1; 
level < new_height; ++level) { // Link the node on upper levels for (;;) { prev = prev_nodes[level]; next = static_cast(curr_nodes[level]); new_node->set_next(level, next); __TBB_ASSERT(new_node->height() > level, "Internal structure break"); if (prev->atomic_next(level).compare_exchange_strong(next, new_node)) { break; } for (size_type lev = level; lev != new_height; ++lev ) { curr_nodes[lev] = internal_find_position(lev, prev_nodes[lev], new_node, compare); } } } ++my_size; return std::pair(iterator(new_node), true); } } template node_ptr internal_get_bound( const K& key, const Comparator& cmp ) const { node_ptr prev = get_head(); if (prev == nullptr) return nullptr; // If the head node is not allocated - exit node_ptr curr = nullptr; for (size_type h = my_max_height.load(std::memory_order_acquire); h > 0; --h) { curr = internal_find_position(h - 1, prev, key, cmp); } return curr; } template size_type internal_erase( const K& key ) { auto eq = equal_range(key); size_type old_size = size(); unsafe_erase(eq.first, eq.second); return old_size - size(); } // Returns node_ptr to the extracted node and node_ptr to the next node after the extracted std::pair internal_extract( const_iterator it ) { std::pair result(nullptr, nullptr); if ( it != end() ) { array_type prev_nodes; node_ptr erase_node = it.my_node_ptr; node_ptr next_node = erase_node->next(0); fill_prev_array_for_existing_node(prev_nodes, erase_node); for (size_type level = 0; level < erase_node->height(); ++level) { prev_nodes[level]->set_next(level, erase_node->next(level)); erase_node->set_next(level, nullptr); } my_size.fetch_sub(1, std::memory_order_relaxed); result.first = erase_node; result.second = next_node; } return result; } protected: template void internal_merge( SourceType&& source ) { using source_type = typename std::decay::type; using source_iterator = typename source_type::iterator; static_assert((std::is_same::value), "Incompatible containers cannot be merged"); for (source_iterator it = 
source.begin(); it != source.end();) { source_iterator where = it++; if (allow_multimapping || !contains(container_traits::get_key(*where))) { node_type handle = source.unsafe_extract(where); __TBB_ASSERT(!handle.empty(), "Extracted handle in merge is empty"); if (!insert(std::move(handle)).second) { __TBB_ASSERT(!handle.empty(), "Handle should not be empty if insert fails"); //If the insertion fails - return the node into source source.insert(std::move(handle)); } __TBB_ASSERT(handle.empty(), "Node handle should be empty after the insertion"); } } } private: void internal_copy( const concurrent_skip_list& other ) { internal_copy(other.begin(), other.end()); } template void internal_copy( Iterator first, Iterator last ) { try_call([&] { for (auto it = first; it != last; ++it) { insert(*it); } }).on_exception([&] { clear(); delete_head(); }); } node_ptr create_node( size_type height ) { return list_node_type::create(my_node_allocator, height); } template node_ptr create_value_node( Args&&... 
args ) { node_ptr node = create_node(my_rng()); // try_call API is not convenient here due to broken // variadic capture on GCC 4.8.5 auto value_guard = make_raii_guard([&] { delete_node(node); }); // Construct the value inside the node node_allocator_traits::construct(my_node_allocator, node->storage(), std::forward(args)...); value_guard.dismiss(); return node; } node_ptr create_head_node() { return create_node(max_level); } void delete_head() { node_ptr head = my_head_ptr.load(std::memory_order_relaxed); if (head != nullptr) { delete_node(head); my_head_ptr.store(nullptr, std::memory_order_relaxed); } } void delete_node( node_ptr node ) { list_node_type::destroy(my_node_allocator, node); } void delete_value_node( node_ptr node ) { // Destroy the value inside the node node_allocator_traits::destroy(my_node_allocator, node->storage()); delete_node(node); } node_ptr get_head() const { return my_head_ptr.load(std::memory_order_acquire); } node_ptr create_head_if_necessary() { node_ptr current_head = get_head(); if (current_head == nullptr) { // Head node was not created - create it node_ptr new_head = create_head_node(); if (my_head_ptr.compare_exchange_strong(current_head, new_head)) { current_head = new_head; } else { // If an other thread has already created the head node - destroy new_head // current_head now points to the actual head node delete_node(new_head); } } __TBB_ASSERT(my_head_ptr.load(std::memory_order_relaxed) != nullptr, nullptr); __TBB_ASSERT(current_head != nullptr, nullptr); return current_head; } static iterator get_iterator( const_iterator it ) { return iterator(it.my_node_ptr); } void internal_move_assign( concurrent_skip_list&& other, /*POCMA || is_always_equal =*/std::true_type ) { internal_move(std::move(other)); } void internal_move_assign( concurrent_skip_list&& other, /*POCMA || is_always_equal =*/std::false_type ) { if (my_node_allocator == other.my_node_allocator) { internal_move(std::move(other)); } else { 
internal_copy(std::make_move_iterator(other.begin()), std::make_move_iterator(other.end())); } } void internal_swap_fields( concurrent_skip_list& other ) { using std::swap; swap_allocators(my_node_allocator, other.my_node_allocator); swap(my_compare, other.my_compare); swap(my_rng, other.my_rng); swap_atomics_relaxed(my_head_ptr, other.my_head_ptr); swap_atomics_relaxed(my_size, other.my_size); swap_atomics_relaxed(my_max_height, other.my_max_height); } void internal_swap( concurrent_skip_list& other, /*POCMA || is_always_equal =*/std::true_type ) { internal_swap_fields(other); } void internal_swap( concurrent_skip_list& other, /*POCMA || is_always_equal =*/std::false_type ) { __TBB_ASSERT(my_node_allocator == other.my_node_allocator, "Swapping with unequal allocators is not allowed"); internal_swap_fields(other); } node_allocator_type my_node_allocator; key_compare my_compare; random_level_generator_type my_rng; std::atomic my_head_ptr; std::atomic my_size; std::atomic my_max_height; template friend class concurrent_skip_list; }; // class concurrent_skip_list template bool operator==( const concurrent_skip_list& lhs, const concurrent_skip_list& rhs ) { if (lhs.size() != rhs.size()) return false; #if _MSC_VER // Passing "unchecked" iterators to std::equal with 3 parameters // causes compiler warnings. 
// The workaround is to use overload with 4 parameters, which is // available since C++14 - minimally supported version on MSVC return std::equal(lhs.begin(), lhs.end(), rhs.begin(), rhs.end()); #else return std::equal(lhs.begin(), lhs.end(), rhs.begin()); #endif } #if !__TBB_CPP20_COMPARISONS_PRESENT template bool operator!=( const concurrent_skip_list& lhs, const concurrent_skip_list& rhs ) { return !(lhs == rhs); } #endif #if __TBB_CPP20_COMPARISONS_PRESENT && __TBB_CPP20_CONCEPTS_PRESENT template tbb::detail::synthesized_three_way_result operator<=>( const concurrent_skip_list& lhs, const concurrent_skip_list& rhs ) { return std::lexicographical_compare_three_way(lhs.begin(), lhs.end(), rhs.begin(), rhs.end(), tbb::detail::synthesized_three_way_comparator{}); } #else template bool operator<( const concurrent_skip_list& lhs, const concurrent_skip_list& rhs ) { return std::lexicographical_compare(lhs.begin(), lhs.end(), rhs.begin(), rhs.end()); } template bool operator>( const concurrent_skip_list& lhs, const concurrent_skip_list& rhs ) { return rhs < lhs; } template bool operator<=( const concurrent_skip_list& lhs, const concurrent_skip_list& rhs ) { return !(rhs < lhs); } template bool operator>=( const concurrent_skip_list& lhs, const concurrent_skip_list& rhs ) { return !(lhs < rhs); } #endif // __TBB_CPP20_COMPARISONS_PRESENT && __TBB_CPP20_CONCEPTS_PRESENT // Generates a number from the interval [0, MaxLevel). 
template class concurrent_geometric_level_generator { public: static constexpr std::size_t max_level = MaxLevel; // TODO: modify the algorithm to accept other values of max_level static_assert(max_level == 32, "Incompatible max_level for rng"); concurrent_geometric_level_generator() : engines(std::minstd_rand::result_type(time(nullptr))) {} std::size_t operator()() { // +1 is required to pass at least 1 into log2 (log2(0) is undefined) // -1 is required to have an ability to return 0 from the generator (max_level - log2(2^31) - 1) std::size_t result = max_level - std::size_t(tbb::detail::log2(engines.local()() + 1)) - 1; __TBB_ASSERT(result <= max_level, nullptr); return result; } private: tbb::enumerable_thread_specific engines; }; } // namespace d2 } // namespace detail } // namespace tbb #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) #pragma warning(pop) // warning 4127 is back #endif #endif // __TBB_detail__concurrent_skip_list_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_concurrent_unordered_base.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_detail__concurrent_unordered_base_H #define __TBB_detail__concurrent_unordered_base_H #if !defined(__TBB_concurrent_unordered_map_H) && !defined(__TBB_concurrent_unordered_set_H) #error Do not #include this internal file directly; use public TBB headers instead. 
#endif #include "_range_common.h" #include "_containers_helpers.h" #include "_segment_table.h" #include "_hash_compare.h" #include "_allocator_traits.h" #include "_node_handle.h" #include "_assert.h" #include "_utils.h" #include "_exception.h" #include #include #include #include #include #include #include #include #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) #pragma warning(push) #pragma warning(disable: 4127) // warning C4127: conditional expression is constant #endif namespace tbb { namespace detail { namespace d2 { template class concurrent_unordered_base; template class solist_iterator { private: using node_ptr = typename Container::value_node_ptr; template friend class split_ordered_list; template friend class solist_iterator; template friend class concurrent_unordered_base; template friend bool operator==( const solist_iterator& i, const solist_iterator& j ); template friend bool operator!=( const solist_iterator& i, const solist_iterator& j ); public: using value_type = Value; using difference_type = typename Container::difference_type; using pointer = value_type*; using reference = value_type&; using iterator_category = std::forward_iterator_tag; solist_iterator() : my_node_ptr(nullptr) {} solist_iterator( const solist_iterator& other ) : my_node_ptr(other.my_node_ptr) {} solist_iterator& operator=( const solist_iterator& other ) { my_node_ptr = other.my_node_ptr; return *this; } reference operator*() const { return my_node_ptr->value(); } pointer operator->() const { return my_node_ptr->storage(); } solist_iterator& operator++() { auto next_node = my_node_ptr->next(); while(next_node && next_node->is_dummy()) { next_node = next_node->next(); } my_node_ptr = static_cast(next_node); return *this; } solist_iterator operator++(int) { solist_iterator tmp = *this; ++*this; return tmp; } private: solist_iterator( node_ptr pnode ) : my_node_ptr(pnode) {} node_ptr get_node_ptr() const { return my_node_ptr; } node_ptr my_node_ptr; }; template bool operator==( 
const solist_iterator& i, const solist_iterator& j ) { return i.my_node_ptr == j.my_node_ptr; } template bool operator!=( const solist_iterator& i, const solist_iterator& j ) { return i.my_node_ptr != j.my_node_ptr; } template class list_node { public: using node_ptr = list_node*; using sokey_type = SokeyType; list_node(sokey_type key) : my_next(nullptr), my_order_key(key) {} void init( sokey_type key ) { my_order_key = key; } sokey_type order_key() const { return my_order_key; } bool is_dummy() { // The last bit of order key is unset for dummy nodes return (my_order_key & 0x1) == 0; } node_ptr next() const { return my_next.load(std::memory_order_acquire); } void set_next( node_ptr next_node ) { my_next.store(next_node, std::memory_order_release); } bool try_set_next( node_ptr expected_next, node_ptr new_next ) { return my_next.compare_exchange_strong(expected_next, new_next); } private: std::atomic my_next; sokey_type my_order_key; }; // class list_node template class value_node : public list_node { public: using base_type = list_node; using sokey_type = typename base_type::sokey_type; using value_type = ValueType; value_node( sokey_type ord_key ) : base_type(ord_key) {} ~value_node() {} value_type* storage() { return &my_value; } value_type& value() { return *storage(); } private: union { value_type my_value; }; }; // class value_node template class concurrent_unordered_base { using self_type = concurrent_unordered_base; using traits_type = Traits; using hash_compare_type = typename traits_type::hash_compare_type; class unordered_segment_table; public: using value_type = typename traits_type::value_type; using key_type = typename traits_type::key_type; using allocator_type = typename traits_type::allocator_type; private: using allocator_traits_type = tbb::detail::allocator_traits; // TODO: check assert conditions for different C++ standards static_assert(std::is_same::value, "value_type of the container must be the same as its allocator"); using sokey_type = 
std::size_t; public: using size_type = std::size_t; using difference_type = std::ptrdiff_t; using iterator = solist_iterator; using const_iterator = solist_iterator; using local_iterator = iterator; using const_local_iterator = const_iterator; using reference = value_type&; using const_reference = const value_type&; using pointer = typename allocator_traits_type::pointer; using const_pointer = typename allocator_traits_type::const_pointer; using hasher = typename hash_compare_type::hasher; using key_equal = typename hash_compare_type::key_equal; private: using list_node_type = list_node; using value_node_type = value_node; using node_ptr = list_node_type*; using value_node_ptr = value_node_type*; using value_node_allocator_type = typename allocator_traits_type::template rebind_alloc; using node_allocator_type = typename allocator_traits_type::template rebind_alloc; using node_allocator_traits = tbb::detail::allocator_traits; using value_node_allocator_traits = tbb::detail::allocator_traits; static constexpr size_type round_up_to_power_of_two( size_type bucket_count ) { return size_type(1) << size_type(tbb::detail::log2(uintptr_t(bucket_count == 0 ? 
1 : bucket_count) * 2 - 1)); } template using is_transparent = dependent_bool, T>; public: using node_type = d1::node_handle; explicit concurrent_unordered_base( size_type bucket_count, const hasher& hash = hasher(), const key_equal& equal = key_equal(), const allocator_type& alloc = allocator_type() ) : my_size(0), my_bucket_count(round_up_to_power_of_two(bucket_count)), my_max_load_factor(float(initial_max_load_factor)), my_hash_compare(hash, equal), my_head(sokey_type(0)), my_segments(alloc) {} concurrent_unordered_base() : concurrent_unordered_base(initial_bucket_count) {} concurrent_unordered_base( size_type bucket_count, const allocator_type& alloc ) : concurrent_unordered_base(bucket_count, hasher(), key_equal(), alloc) {} concurrent_unordered_base( size_type bucket_count, const hasher& hash, const allocator_type& alloc ) : concurrent_unordered_base(bucket_count, hash, key_equal(), alloc) {} explicit concurrent_unordered_base( const allocator_type& alloc ) : concurrent_unordered_base(initial_bucket_count, hasher(), key_equal(), alloc) {} template concurrent_unordered_base( InputIterator first, InputIterator last, size_type bucket_count = initial_bucket_count, const hasher& hash = hasher(), const key_equal& equal = key_equal(), const allocator_type& alloc = allocator_type() ) : concurrent_unordered_base(bucket_count, hash, equal, alloc) { insert(first, last); } template concurrent_unordered_base( InputIterator first, InputIterator last, size_type bucket_count, const allocator_type& alloc ) : concurrent_unordered_base(first, last, bucket_count, hasher(), key_equal(), alloc) {} template concurrent_unordered_base( InputIterator first, InputIterator last, size_type bucket_count, const hasher& hash, const allocator_type& alloc ) : concurrent_unordered_base(first, last, bucket_count, hash, key_equal(), alloc) {} concurrent_unordered_base( const concurrent_unordered_base& other ) : my_size(other.my_size.load(std::memory_order_relaxed)), 
my_bucket_count(other.my_bucket_count.load(std::memory_order_relaxed)), my_max_load_factor(other.my_max_load_factor), my_hash_compare(other.my_hash_compare), my_head(other.my_head.order_key()), my_segments(other.my_segments) { try_call( [&] { internal_copy(other); } ).on_exception( [&] { clear(); }); } concurrent_unordered_base( const concurrent_unordered_base& other, const allocator_type& alloc ) : my_size(other.my_size.load(std::memory_order_relaxed)), my_bucket_count(other.my_bucket_count.load(std::memory_order_relaxed)), my_max_load_factor(other.my_max_load_factor), my_hash_compare(other.my_hash_compare), my_head(other.my_head.order_key()), my_segments(other.my_segments, alloc) { try_call( [&] { internal_copy(other); } ).on_exception( [&] { clear(); }); } concurrent_unordered_base( concurrent_unordered_base&& other ) : my_size(other.my_size.load(std::memory_order_relaxed)), my_bucket_count(other.my_bucket_count.load(std::memory_order_relaxed)), my_max_load_factor(std::move(other.my_max_load_factor)), my_hash_compare(std::move(other.my_hash_compare)), my_head(other.my_head.order_key()), my_segments(std::move(other.my_segments)) { move_content(std::move(other)); } concurrent_unordered_base( concurrent_unordered_base&& other, const allocator_type& alloc ) : my_size(other.my_size.load(std::memory_order_relaxed)), my_bucket_count(other.my_bucket_count.load(std::memory_order_relaxed)), my_max_load_factor(std::move(other.my_max_load_factor)), my_hash_compare(std::move(other.my_hash_compare)), my_head(other.my_head.order_key()), my_segments(std::move(other.my_segments), alloc) { using is_always_equal = typename allocator_traits_type::is_always_equal; internal_move_construct_with_allocator(std::move(other), alloc, is_always_equal()); } concurrent_unordered_base( std::initializer_list init, size_type bucket_count = initial_bucket_count, const hasher& hash = hasher(), const key_equal& equal = key_equal(), const allocator_type& alloc = allocator_type() ) : 
concurrent_unordered_base(init.begin(), init.end(), bucket_count, hash, equal, alloc) {} concurrent_unordered_base( std::initializer_list init, size_type bucket_count, const allocator_type& alloc ) : concurrent_unordered_base(init, bucket_count, hasher(), key_equal(), alloc) {} concurrent_unordered_base( std::initializer_list init, size_type bucket_count, const hasher& hash, const allocator_type& alloc ) : concurrent_unordered_base(init, bucket_count, hash, key_equal(), alloc) {} ~concurrent_unordered_base() { internal_clear(); } concurrent_unordered_base& operator=( const concurrent_unordered_base& other ) { if (this != &other) { clear(); my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); my_bucket_count.store(other.my_bucket_count.load(std::memory_order_relaxed), std::memory_order_relaxed); my_max_load_factor = other.my_max_load_factor; my_hash_compare = other.my_hash_compare; my_segments = other.my_segments; internal_copy(other); // TODO: guards for exceptions? 
} return *this; } concurrent_unordered_base& operator=( concurrent_unordered_base&& other ) noexcept(unordered_segment_table::is_noexcept_assignment) { if (this != &other) { clear(); my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); my_bucket_count.store(other.my_bucket_count.load(std::memory_order_relaxed), std::memory_order_relaxed); my_max_load_factor = std::move(other.my_max_load_factor); my_hash_compare = std::move(other.my_hash_compare); my_segments = std::move(other.my_segments); using pocma_type = typename allocator_traits_type::propagate_on_container_move_assignment; using is_always_equal = typename allocator_traits_type::is_always_equal; internal_move_assign(std::move(other), tbb::detail::disjunction()); } return *this; } concurrent_unordered_base& operator=( std::initializer_list init ) { clear(); insert(init); return *this; } void swap( concurrent_unordered_base& other ) noexcept(unordered_segment_table::is_noexcept_swap) { if (this != &other) { using pocs_type = typename allocator_traits_type::propagate_on_container_swap; using is_always_equal = typename allocator_traits_type::is_always_equal; internal_swap(other, tbb::detail::disjunction()); } } allocator_type get_allocator() const noexcept { return my_segments.get_allocator(); } iterator begin() noexcept { return iterator(first_value_node(&my_head)); } const_iterator begin() const noexcept { return const_iterator(first_value_node(const_cast(&my_head))); } const_iterator cbegin() const noexcept { return const_iterator(first_value_node(const_cast(&my_head))); } iterator end() noexcept { return iterator(nullptr); } const_iterator end() const noexcept { return const_iterator(nullptr); } const_iterator cend() const noexcept { return const_iterator(nullptr); } __TBB_nodiscard bool empty() const noexcept { return size() == 0; } size_type size() const noexcept { return my_size.load(std::memory_order_relaxed); } size_type max_size() const noexcept { return 
allocator_traits_type::max_size(get_allocator()); } void clear() noexcept { internal_clear(); } std::pair insert( const value_type& value ) { return internal_insert_value(value); } std::pair insert( value_type&& value ) { return internal_insert_value(std::move(value)); } iterator insert( const_iterator, const value_type& value ) { // Ignore hint return insert(value).first; } iterator insert( const_iterator, value_type&& value ) { // Ignore hint return insert(std::move(value)).first; } template void insert( InputIterator first, InputIterator last ) { for (; first != last; ++first) { insert(*first); } } void insert( std::initializer_list init ) { insert(init.begin(), init.end()); } std::pair insert( node_type&& nh ) { if (!nh.empty()) { value_node_ptr insert_node = d1::node_handle_accessor::get_node_ptr(nh); auto init_node = [&insert_node]( sokey_type order_key )->value_node_ptr { insert_node->init(order_key); return insert_node; }; auto insert_result = internal_insert(insert_node->value(), init_node); if (insert_result.inserted) { // If the insertion succeeded - set node handle to the empty state __TBB_ASSERT(insert_result.remaining_node == nullptr, "internal_insert_node should not return the remaining node if the insertion succeeded"); d1::node_handle_accessor::deactivate(nh); } return { iterator(insert_result.node_with_equal_key), insert_result.inserted }; } return {end(), false}; } iterator insert( const_iterator, node_type&& nh ) { // Ignore hint return insert(std::move(nh)).first; } template std::pair emplace( Args&&... 
args ) { // Create a node with temporary order_key 0, which will be reinitialize // in internal_insert after the hash calculation value_node_ptr insert_node = create_node(0, std::forward(args)...); auto init_node = [&insert_node]( sokey_type order_key )->value_node_ptr { insert_node->init(order_key); return insert_node; }; auto insert_result = internal_insert(insert_node->value(), init_node); if (!insert_result.inserted) { // If the insertion failed - destroy the node which was created insert_node->init(split_order_key_regular(1)); destroy_node(insert_node); } return { iterator(insert_result.node_with_equal_key), insert_result.inserted }; } template iterator emplace_hint( const_iterator, Args&&... args ) { // Ignore hint return emplace(std::forward(args)...).first; } iterator unsafe_erase( const_iterator pos ) { return iterator(first_value_node(internal_erase(pos.get_node_ptr()))); } iterator unsafe_erase( iterator pos ) { return iterator(first_value_node(internal_erase(pos.get_node_ptr()))); } iterator unsafe_erase( const_iterator first, const_iterator last ) { while(first != last) { first = unsafe_erase(first); } return iterator(first.get_node_ptr()); } size_type unsafe_erase( const key_type& key ) { return internal_erase_by_key(key); } template typename std::enable_if::value && !std::is_convertible::value && !std::is_convertible::value, size_type>::type unsafe_erase( const K& key ) { return internal_erase_by_key(key); } node_type unsafe_extract( const_iterator pos ) { internal_extract(pos.get_node_ptr()); return d1::node_handle_accessor::construct(pos.get_node_ptr()); } node_type unsafe_extract( iterator pos ) { internal_extract(pos.get_node_ptr()); return d1::node_handle_accessor::construct(pos.get_node_ptr()); } node_type unsafe_extract( const key_type& key ) { iterator item = find(key); return item == end() ? 
node_type() : unsafe_extract(item); } template typename std::enable_if::value && !std::is_convertible::value && !std::is_convertible::value, node_type>::type unsafe_extract( const K& key ) { iterator item = find(key); return item == end() ? node_type() : unsafe_extract(item); } // Lookup functions iterator find( const key_type& key ) { value_node_ptr result = internal_find(key); return result == nullptr ? end() : iterator(result); } const_iterator find( const key_type& key ) const { value_node_ptr result = const_cast(this)->internal_find(key); return result == nullptr ? end() : const_iterator(result); } template typename std::enable_if::value, iterator>::type find( const K& key ) { value_node_ptr result = internal_find(key); return result == nullptr ? end() : iterator(result); } template typename std::enable_if::value, const_iterator>::type find( const K& key ) const { value_node_ptr result = const_cast(this)->internal_find(key); return result == nullptr ? end() : const_iterator(result); } std::pair equal_range( const key_type& key ) { auto result = internal_equal_range(key); return std::make_pair(iterator(result.first), iterator(result.second)); } std::pair equal_range( const key_type& key ) const { auto result = const_cast(this)->internal_equal_range(key); return std::make_pair(const_iterator(result.first), const_iterator(result.second)); } template typename std::enable_if::value, std::pair>::type equal_range( const K& key ) { auto result = internal_equal_range(key); return std::make_pair(iterator(result.first), iterator(result.second)); } template typename std::enable_if::value, std::pair>::type equal_range( const K& key ) const { auto result = const_cast(this)->internal_equal_range(key); return std::make_pair(iterator(result.first), iterator(result.second)); } size_type count( const key_type& key ) const { return internal_count(key); } template typename std::enable_if::value, size_type>::type count( const K& key ) const { return internal_count(key); } bool 
contains( const key_type& key ) const { return find(key) != end(); } template typename std::enable_if::value, bool>::type contains( const K& key ) const { return find(key) != end(); } // Bucket interface local_iterator unsafe_begin( size_type n ) { return local_iterator(first_value_node(get_bucket(n))); } const_local_iterator unsafe_begin( size_type n ) const { auto bucket_begin = first_value_node(const_cast(this)->get_bucket(n)); return const_local_iterator(bucket_begin); } const_local_iterator unsafe_cbegin( size_type n ) const { auto bucket_begin = first_value_node(const_cast(this)->get_bucket(n)); return const_local_iterator(bucket_begin); } local_iterator unsafe_end( size_type n ) { size_type bucket_count = my_bucket_count.load(std::memory_order_relaxed); return n != bucket_count - 1 ? unsafe_begin(get_next_bucket_index(n)) : local_iterator(nullptr); } const_local_iterator unsafe_end( size_type n ) const { size_type bucket_count = my_bucket_count.load(std::memory_order_relaxed); return n != bucket_count - 1 ? unsafe_begin(get_next_bucket_index(n)) : const_local_iterator(nullptr); } const_local_iterator unsafe_cend( size_type n ) const { size_type bucket_count = my_bucket_count.load(std::memory_order_relaxed); return n != bucket_count - 1 ? 
unsafe_begin(get_next_bucket_index(n)) : const_local_iterator(nullptr); } size_type unsafe_bucket_count() const { return my_bucket_count.load(std::memory_order_relaxed); } size_type unsafe_max_bucket_count() const { return max_size(); } size_type unsafe_bucket_size( size_type n ) const { return size_type(std::distance(unsafe_begin(n), unsafe_end(n))); } size_type unsafe_bucket( const key_type& key ) const { return my_hash_compare(key) % my_bucket_count.load(std::memory_order_relaxed); } // Hash policy float load_factor() const { return float(size() / float(my_bucket_count.load(std::memory_order_acquire))); } float max_load_factor() const { return my_max_load_factor; } void max_load_factor( float mlf ) { if (mlf != mlf || mlf < 0) { tbb::detail::throw_exception(exception_id::invalid_load_factor); } my_max_load_factor = mlf; } // TODO: unsafe? void rehash( size_type bucket_count ) { size_type current_bucket_count = my_bucket_count.load(std::memory_order_acquire); if (current_bucket_count < bucket_count) { // TODO: do we need do-while here? 
my_bucket_count.compare_exchange_strong(current_bucket_count, round_up_to_power_of_two(bucket_count)); } } void reserve( size_type elements_count ) { size_type current_bucket_count = my_bucket_count.load(std::memory_order_acquire); size_type necessary_bucket_count = current_bucket_count; // max_load_factor() is currently unsafe, so we can assume that my_max_load_factor // would not be changed during the calculation // TODO: Log2 seems useful here while (necessary_bucket_count * max_load_factor() < elements_count) { necessary_bucket_count <<= 1; } while (!my_bucket_count.compare_exchange_strong(current_bucket_count, necessary_bucket_count)) { if (current_bucket_count >= necessary_bucket_count) break; } } // Observers hasher hash_function() const { return my_hash_compare.hash_function(); } key_equal key_eq() const { return my_hash_compare.key_eq(); } class const_range_type { private: const concurrent_unordered_base& my_instance; node_ptr my_begin_node; // may be node* const node_ptr my_end_node; mutable node_ptr my_midpoint_node; public: using size_type = typename concurrent_unordered_base::size_type; using value_type = typename concurrent_unordered_base::value_type; using reference = typename concurrent_unordered_base::reference; using difference_type = typename concurrent_unordered_base::difference_type; using iterator = typename concurrent_unordered_base::const_iterator; bool empty() const { return my_begin_node == my_end_node; } bool is_divisible() const { return my_midpoint_node != my_end_node; } size_type grainsize() const { return 1; } const_range_type( const_range_type& range, split ) : my_instance(range.my_instance), my_begin_node(range.my_midpoint_node), my_end_node(range.my_end_node) { range.my_end_node = my_begin_node; __TBB_ASSERT(!empty(), "Splitting despite the range is not divisible"); __TBB_ASSERT(!range.empty(), "Splitting despite the range is not divisible"); set_midpoint(); range.set_midpoint(); } iterator begin() const { return 
iterator(my_instance.first_value_node(my_begin_node)); } iterator end() const { return iterator(my_instance.first_value_node(my_end_node)); } const_range_type( const concurrent_unordered_base& table ) : my_instance(table), my_begin_node(my_instance.first_value_node(const_cast(&table.my_head))), my_end_node(nullptr) { set_midpoint(); } private: void set_midpoint() const { if (empty()) { my_midpoint_node = my_end_node; } else { sokey_type invalid_key = ~sokey_type(0); sokey_type begin_key = my_begin_node != nullptr ? my_begin_node->order_key() : invalid_key; sokey_type end_key = my_end_node != nullptr ? my_end_node->order_key() : invalid_key; size_type mid_bucket = reverse_bits(begin_key + (end_key - begin_key) / 2) % my_instance.my_bucket_count.load(std::memory_order_relaxed); while( my_instance.my_segments[mid_bucket].load(std::memory_order_relaxed) == nullptr) { mid_bucket = my_instance.get_parent(mid_bucket); } if (reverse_bits(mid_bucket) > begin_key) { // Found a dummy node between begin and end my_midpoint_node = my_instance.first_value_node( my_instance.my_segments[mid_bucket].load(std::memory_order_relaxed)); } else { // Didn't find a dummy node between begin and end my_midpoint_node = my_end_node; } } } }; // class const_range_type class range_type : public const_range_type { public: using iterator = typename concurrent_unordered_base::iterator; using const_range_type::const_range_type; iterator begin() const { return iterator(const_range_type::begin().get_node_ptr()); } iterator end() const { return iterator(const_range_type::end().get_node_ptr()); } }; // class range_type // Parallel iteration range_type range() { return range_type(*this); } const_range_type range() const { return const_range_type(*this); } protected: static constexpr bool allow_multimapping = traits_type::allow_multimapping; private: static constexpr size_type initial_bucket_count = 8; static constexpr float initial_max_load_factor = 4; // TODO: consider 1? 
static constexpr size_type pointers_per_embedded_table = sizeof(size_type) * 8 - 1; class unordered_segment_table : public d1::segment_table, allocator_type, unordered_segment_table, pointers_per_embedded_table> { using self_type = unordered_segment_table; using atomic_node_ptr = std::atomic; using base_type = d1::segment_table, allocator_type, unordered_segment_table, pointers_per_embedded_table>; using segment_type = typename base_type::segment_type; using base_allocator_type = typename base_type::allocator_type; using segment_allocator_type = typename allocator_traits_type::template rebind_alloc; using segment_allocator_traits = tbb::detail::allocator_traits; public: // Segment table for unordered containers should not be extended in the wait- free implementation static constexpr bool allow_table_extending = false; static constexpr bool is_noexcept_assignment = std::is_nothrow_move_assignable::value && std::is_nothrow_move_assignable::value && segment_allocator_traits::is_always_equal::value; static constexpr bool is_noexcept_swap = tbb::detail::is_nothrow_swappable::value && tbb::detail::is_nothrow_swappable::value && segment_allocator_traits::is_always_equal::value; // TODO: using base_type::base_type is not compiling on Windows and Intel Compiler - investigate unordered_segment_table( const base_allocator_type& alloc = base_allocator_type() ) : base_type(alloc) {} unordered_segment_table( const unordered_segment_table& ) = default; unordered_segment_table( const unordered_segment_table& other, const base_allocator_type& alloc ) : base_type(other, alloc) {} unordered_segment_table( unordered_segment_table&& ) = default; unordered_segment_table( unordered_segment_table&& other, const base_allocator_type& alloc ) : base_type(std::move(other), alloc) {} unordered_segment_table& operator=( const unordered_segment_table& ) = default; unordered_segment_table& operator=( unordered_segment_table&& ) = default; segment_type create_segment( typename 
base_type::segment_table_type, typename base_type::segment_index_type segment_index, size_type ) { segment_allocator_type alloc(this->get_allocator()); size_type seg_size = this->segment_size(segment_index); segment_type new_segment = segment_allocator_traits::allocate(alloc, seg_size); for (size_type i = 0; i != seg_size; ++i) { segment_allocator_traits::construct(alloc, new_segment + i, nullptr); } return new_segment; } segment_type nullify_segment( typename base_type::segment_table_type table, size_type segment_index ) { segment_type target_segment = table[segment_index].load(std::memory_order_relaxed); table[segment_index].store(nullptr, std::memory_order_relaxed); return target_segment; } // deallocate_segment is required by the segment_table base class, but // in unordered, it is also necessary to call the destructor during deallocation void deallocate_segment( segment_type address, size_type index ) { destroy_segment(address, index); } void destroy_segment( segment_type address, size_type index ) { segment_allocator_type alloc(this->get_allocator()); for (size_type i = 0; i != this->segment_size(index); ++i) { segment_allocator_traits::destroy(alloc, address + i); } segment_allocator_traits::deallocate(alloc, address, this->segment_size(index)); } void copy_segment( size_type index, segment_type, segment_type to ) { if (index == 0) { // The first element in the first segment is embedded into the table (my_head) // so the first pointer should not be stored here // It would be stored during move ctor/assignment operation to[1].store(nullptr, std::memory_order_relaxed); } else { for (size_type i = 0; i != this->segment_size(index); ++i) { to[i].store(nullptr, std::memory_order_relaxed); } } } void move_segment( size_type index, segment_type from, segment_type to ) { if (index == 0) { // The first element in the first segment is embedded into the table (my_head) // so the first pointer should not be stored here // It would be stored during move ctor/assignment 
operation to[1].store(from[1].load(std::memory_order_relaxed), std::memory_order_relaxed); } else { for (size_type i = 0; i != this->segment_size(index); ++i) { to[i].store(from[i].load(std::memory_order_relaxed), std::memory_order_relaxed); from[i].store(nullptr, std::memory_order_relaxed); } } } // allocate_long_table is required by the segment_table base class, but unused for unordered containers typename base_type::segment_table_type allocate_long_table( const typename base_type::atomic_segment*, size_type ) { __TBB_ASSERT(false, "This method should never been called"); // TableType is a pointer return nullptr; } // destroy_elements is required by the segment_table base class, but unused for unordered containers // this function call but do nothing void destroy_elements() {} }; // struct unordered_segment_table void internal_clear() { // TODO: consider usefulness of two versions of clear() - with dummy nodes deallocation and without it node_ptr next = my_head.next(); node_ptr curr = next; my_head.set_next(nullptr); while (curr != nullptr) { next = curr->next(); destroy_node(curr); curr = next; } my_size.store(0, std::memory_order_relaxed); my_segments.clear(); } void destroy_node( node_ptr node ) { if (node->is_dummy()) { node_allocator_type dummy_node_allocator(my_segments.get_allocator()); // Destroy the node node_allocator_traits::destroy(dummy_node_allocator, node); // Deallocate the memory node_allocator_traits::deallocate(dummy_node_allocator, node, 1); } else { // GCC 11.1 issues a warning here that incorrect destructor might be called for dummy_nodes #if (__TBB_GCC_VERSION >= 110100 && __TBB_GCC_VERSION < 150000 ) && !__clang__ && !__INTEL_COMPILER volatile #endif value_node_ptr val_node = static_cast(node); value_node_allocator_type value_node_allocator(my_segments.get_allocator()); // Destroy the value value_node_allocator_traits::destroy(value_node_allocator, val_node->storage()); // Destroy the node 
value_node_allocator_traits::destroy(value_node_allocator, val_node); // Deallocate the memory value_node_allocator_traits::deallocate(value_node_allocator, val_node, 1); } } struct internal_insert_return_type { // If the insertion failed - the remaining_node points to the node, which was failed to insert // This node can be allocated in process of insertion value_node_ptr remaining_node; // If the insertion failed - node_with_equal_key points to the node in the list with the // key, equivalent to the inserted, otherwise it points to the node, which was inserted. value_node_ptr node_with_equal_key; // Insertion status // NOTE: if it is true - remaining_node should be nullptr bool inserted; }; // struct internal_insert_return_type // Inserts the value into the split ordered list template std::pair internal_insert_value( ValueType&& value ) { auto create_value_node = [&value, this]( sokey_type order_key )->value_node_ptr { return create_node(order_key, std::forward(value)); }; auto insert_result = internal_insert(value, create_value_node); if (insert_result.remaining_node != nullptr) { // If the insertion fails - destroy the node which was failed to insert if it exist __TBB_ASSERT(!insert_result.inserted, "remaining_node should be nullptr if the node was successfully inserted"); destroy_node(insert_result.remaining_node); } return { iterator(insert_result.node_with_equal_key), insert_result.inserted }; } // Inserts the node into the split ordered list // Creates a node using the specified callback after the place for insertion was found // Returns internal_insert_return_type object, where: // - If the insertion succeeded: // - remaining_node is nullptr // - node_with_equal_key point to the inserted node // - inserted is true // - If the insertion failed: // - remaining_node points to the node, that was failed to insert if it was created. 
// nullptr if the node was not created, because the requested key was already // presented in the list // - node_with_equal_key point to the element in the list with the key, equivalent to // to the requested key // - inserted is false template internal_insert_return_type internal_insert( ValueType&& value, CreateInsertNode create_insert_node ) { static_assert(std::is_same::type, value_type>::value, "Incorrect type in internal_insert"); const key_type& key = traits_type::get_key(value); sokey_type hash_key = sokey_type(my_hash_compare(key)); sokey_type order_key = split_order_key_regular(hash_key); node_ptr prev = prepare_bucket(hash_key); __TBB_ASSERT(prev != nullptr, "Invalid head node"); auto search_result = search_after(prev, order_key, key); if (search_result.second) { return internal_insert_return_type{ nullptr, search_result.first, false }; } value_node_ptr new_node = create_insert_node(order_key); node_ptr curr = search_result.first; while (!try_insert(prev, new_node, curr)) { search_result = search_after(prev, order_key, key); if (search_result.second) { return internal_insert_return_type{ new_node, search_result.first, false }; } curr = search_result.first; } auto sz = my_size.fetch_add(1); adjust_table_size(sz + 1, my_bucket_count.load(std::memory_order_acquire)); return internal_insert_return_type{ nullptr, static_cast(new_node), true }; } // Searches the node with the key, equivalent to key with requested order key after the node prev // Returns the existing node and true if the node is already in the list // Returns the first node with the order key, greater than requested and false if the node is not presented in the list std::pair search_after( node_ptr& prev, sokey_type order_key, const key_type& key ) { // NOTE: static_cast(curr) should be done only after we would ensure // that the node is not a dummy node node_ptr curr = prev->next(); while (curr != nullptr && (curr->order_key() < order_key || (curr->order_key() == order_key && 
!my_hash_compare(traits_type::get_key(static_cast(curr)->value()), key)))) { prev = curr; curr = curr->next(); } if (curr != nullptr && curr->order_key() == order_key && !allow_multimapping) { return { static_cast(curr), true }; } return { static_cast(curr), false }; } void adjust_table_size( size_type total_elements, size_type current_size ) { // Grow the table by a factor of 2 if possible and needed if ( (float(total_elements) / float(current_size)) > my_max_load_factor ) { // Double the size of the hash only if size hash not changed in between loads my_bucket_count.compare_exchange_strong(current_size, 2u * current_size); } } node_ptr insert_dummy_node( node_ptr parent_dummy_node, sokey_type order_key ) { node_ptr prev_node = parent_dummy_node; node_ptr dummy_node = create_dummy_node(order_key); node_ptr next_node; do { next_node = prev_node->next(); // Move forward through the list while the order key is less than requested while (next_node != nullptr && next_node->order_key() < order_key) { prev_node = next_node; next_node = next_node->next(); } if (next_node != nullptr && next_node->order_key() == order_key) { // Another dummy node with the same order key was inserted by another thread // Destroy the node and exit destroy_node(dummy_node); return next_node; } } while (!try_insert(prev_node, dummy_node, next_node)); return dummy_node; } // Try to insert a node between prev_node and expected next // If the next is not equal to expected next - return false static bool try_insert( node_ptr prev_node, node_ptr new_node, node_ptr current_next_node ) { new_node->set_next(current_next_node); return prev_node->try_set_next(current_next_node, new_node); } // Returns the bucket, associated with the hash_key node_ptr prepare_bucket( sokey_type hash_key ) { size_type bucket = hash_key % my_bucket_count.load(std::memory_order_acquire); return get_bucket(bucket); } // Initialize the corresponding bucket if it is not initialized node_ptr get_bucket( size_type bucket_index ) 
{ if (my_segments[bucket_index].load(std::memory_order_acquire) == nullptr) { init_bucket(bucket_index); } return my_segments[bucket_index].load(std::memory_order_acquire); } void init_bucket( size_type bucket ) { if (bucket == 0) { // Atomicaly store the first bucket into my_head node_ptr disabled = nullptr; my_segments[0].compare_exchange_strong(disabled, &my_head); return; } size_type parent_bucket = get_parent(bucket); while (my_segments[parent_bucket].load(std::memory_order_acquire) == nullptr) { // Initialize all of the parent buckets init_bucket(parent_bucket); } __TBB_ASSERT(my_segments[parent_bucket].load(std::memory_order_acquire) != nullptr, "Parent bucket should be initialized"); node_ptr parent = my_segments[parent_bucket].load(std::memory_order_acquire); // Insert dummy node into the list node_ptr dummy_node = insert_dummy_node(parent, split_order_key_dummy(bucket)); // TODO: consider returning pair to avoid store operation if the bucket was stored by an other thread // or move store to insert_dummy_node // Add dummy_node into the segment table my_segments[bucket].store(dummy_node, std::memory_order_release); } node_ptr create_dummy_node( sokey_type order_key ) { node_allocator_type dummy_node_allocator(my_segments.get_allocator()); node_ptr dummy_node = node_allocator_traits::allocate(dummy_node_allocator, 1); node_allocator_traits::construct(dummy_node_allocator, dummy_node, order_key); return dummy_node; } template value_node_ptr create_node( sokey_type order_key, Args&&... 
args ) { value_node_allocator_type value_node_allocator(my_segments.get_allocator()); // Allocate memory for the value_node value_node_ptr new_node = value_node_allocator_traits::allocate(value_node_allocator, 1); // Construct the node value_node_allocator_traits::construct(value_node_allocator, new_node, order_key); // try_call API is not convenient here due to broken // variadic capture on GCC 4.8.5 auto value_guard = make_raii_guard([&] { value_node_allocator_traits::destroy(value_node_allocator, new_node); value_node_allocator_traits::deallocate(value_node_allocator, new_node, 1); }); // Construct the value in the node value_node_allocator_traits::construct(value_node_allocator, new_node->storage(), std::forward(args)...); value_guard.dismiss(); return new_node; } value_node_ptr first_value_node( node_ptr first_node ) const { while (first_node != nullptr && first_node->is_dummy()) { first_node = first_node->next(); } return static_cast(first_node); } // Unsafe method, which removes the node from the list and returns the next node node_ptr internal_erase( value_node_ptr node_to_erase ) { __TBB_ASSERT(node_to_erase != nullptr, "Invalid iterator for erase"); node_ptr next_node = node_to_erase->next(); internal_extract(node_to_erase); destroy_node(node_to_erase); return next_node; } template size_type internal_erase_by_key( const K& key ) { // TODO: consider reimplementation without equal_range - it is not effective to perform lookup over a bucket // for each unsafe_erase call auto eq_range = equal_range(key); size_type erased_count = 0; for (auto it = eq_range.first; it != eq_range.second;) { it = unsafe_erase(it); ++erased_count; } return erased_count; } // Unsafe method, which extracts the node from the list void internal_extract( value_node_ptr node_to_extract ) { const key_type& key = traits_type::get_key(node_to_extract->value()); sokey_type hash_key = sokey_type(my_hash_compare(key)); node_ptr prev_node = prepare_bucket(hash_key); for (node_ptr node = 
prev_node->next(); node != nullptr; prev_node = node, node = node->next()) { if (node == node_to_extract) { unlink_node(prev_node, node, node_to_extract->next()); my_size.store(my_size.load(std::memory_order_relaxed) - 1, std::memory_order_relaxed); return; } __TBB_ASSERT(node->order_key() <= node_to_extract->order_key(), "node, which is going to be extracted should be presented in the list"); } } protected: template void internal_merge( SourceType&& source ) { static_assert(std::is_same::type::node_type>::value, "Incompatible containers cannot be merged"); for (node_ptr source_prev = &source.my_head; source_prev->next() != nullptr;) { if (!source_prev->next()->is_dummy()) { value_node_ptr curr = static_cast(source_prev->next()); // If the multimapping is allowed, or the key is not presented // in the *this container - extract the node from the list if (allow_multimapping || !contains(traits_type::get_key(curr->value()))) { node_ptr next_node = curr->next(); source.unlink_node(source_prev, curr, next_node); // Remember the old order key sokey_type old_order_key = curr->order_key(); // Node handle with curr cannot be used directly in insert call, because // the destructor of node_type will destroy curr node_type curr_node = d1::node_handle_accessor::construct(curr); // If the insertion fails - return ownership of the node to the source if (!insert(std::move(curr_node)).second) { __TBB_ASSERT(!allow_multimapping, "Insertion should succeed for multicontainer"); __TBB_ASSERT(source_prev->next() == next_node, "Concurrent operations with the source container in merge are prohibited"); // Initialize the node with the old order key, because the order key // can change during the insertion curr->init(old_order_key); __TBB_ASSERT(old_order_key >= source_prev->order_key() && (next_node == nullptr || old_order_key <= next_node->order_key()), "Wrong nodes order in the source container"); // Merge is unsafe for source container, so the insertion back can be done without 
compare_exchange curr->set_next(next_node); source_prev->set_next(curr); source_prev = curr; d1::node_handle_accessor::deactivate(curr_node); } else { source.my_size.fetch_sub(1, std::memory_order_relaxed); } } else { source_prev = curr; } } else { source_prev = source_prev->next(); } } } private: // Unsafe method, which unlinks the node between prev and next void unlink_node( node_ptr prev_node, node_ptr node_to_unlink, node_ptr next_node ) { __TBB_ASSERT(prev_node->next() == node_to_unlink && node_to_unlink->next() == next_node, "erasing and extracting nodes from the containers are unsafe in concurrent mode"); prev_node->set_next(next_node); node_to_unlink->set_next(nullptr); } template value_node_ptr internal_find( const K& key ) { sokey_type hash_key = sokey_type(my_hash_compare(key)); sokey_type order_key = split_order_key_regular(hash_key); node_ptr curr = prepare_bucket(hash_key); while (curr != nullptr) { if (curr->order_key() > order_key) { // If the order key is greater than the requested order key, // the element is not in the hash table return nullptr; } else if (curr->order_key() == order_key && my_hash_compare(traits_type::get_key(static_cast(curr)->value()), key)) { // The fact that order keys match does not mean that the element is found. // Key function comparison has to be performed to check whether this is the // right element. If not, keep searching while order key is the same. 
return static_cast(curr); } curr = curr->next(); } return nullptr; } template std::pair internal_equal_range( const K& key ) { sokey_type hash_key = sokey_type(my_hash_compare(key)); sokey_type order_key = split_order_key_regular(hash_key); node_ptr curr = prepare_bucket(hash_key); while (curr != nullptr) { if (curr->order_key() > order_key) { // If the order key is greater than the requested order key, // the element is not in the hash table return std::make_pair(nullptr, nullptr); } else if (curr->order_key() == order_key && my_hash_compare(traits_type::get_key(static_cast(curr)->value()), key)) { value_node_ptr first = static_cast(curr); node_ptr last = first; do { last = last->next(); } while (allow_multimapping && last != nullptr && !last->is_dummy() && my_hash_compare(traits_type::get_key(static_cast(last)->value()), key)); return std::make_pair(first, first_value_node(last)); } curr = curr->next(); } return {nullptr, nullptr}; } template size_type internal_count( const K& key ) const { if (allow_multimapping) { // TODO: consider reimplementing the internal_equal_range with elements counting to avoid std::distance auto eq_range = equal_range(key); return std::distance(eq_range.first, eq_range.second); } else { return contains(key) ? 
1 : 0; } } void internal_copy( const concurrent_unordered_base& other ) { node_ptr last_node = &my_head; my_segments[0].store(&my_head, std::memory_order_relaxed); for (node_ptr node = other.my_head.next(); node != nullptr; node = node->next()) { node_ptr new_node; if (!node->is_dummy()) { // The node in the right table contains a value new_node = create_node(node->order_key(), static_cast(node)->value()); } else { // The node in the right table is a dummy node new_node = create_dummy_node(node->order_key()); my_segments[reverse_bits(node->order_key())].store(new_node, std::memory_order_relaxed); } last_node->set_next(new_node); last_node = new_node; } } void internal_move( concurrent_unordered_base&& other ) { node_ptr last_node = &my_head; my_segments[0].store(&my_head, std::memory_order_relaxed); for (node_ptr node = other.my_head.next(); node != nullptr; node = node->next()) { node_ptr new_node; if (!node->is_dummy()) { // The node in the right table contains a value new_node = create_node(node->order_key(), std::move(static_cast(node)->value())); } else { // TODO: do we need to destroy a dummy node in the right container? 
// The node in the right table is a dummy_node new_node = create_dummy_node(node->order_key()); my_segments[reverse_bits(node->order_key())].store(new_node, std::memory_order_relaxed); } last_node->set_next(new_node); last_node = new_node; } } void move_content( concurrent_unordered_base&& other ) { // NOTE: allocators should be equal my_head.set_next(other.my_head.next()); other.my_head.set_next(nullptr); my_segments[0].store(&my_head, std::memory_order_relaxed); other.my_bucket_count.store(initial_bucket_count, std::memory_order_relaxed); other.my_max_load_factor = initial_max_load_factor; other.my_size.store(0, std::memory_order_relaxed); } void internal_move_construct_with_allocator( concurrent_unordered_base&& other, const allocator_type&, /*is_always_equal = */std::true_type ) { // Allocators are always equal - no need to compare for equality move_content(std::move(other)); } void internal_move_construct_with_allocator( concurrent_unordered_base&& other, const allocator_type& alloc, /*is_always_equal = */std::false_type ) { // Allocators are not always equal if (alloc == other.my_segments.get_allocator()) { move_content(std::move(other)); } else { try_call( [&] { internal_move(std::move(other)); } ).on_exception( [&] { clear(); }); } } // Move assigns the hash table to other is any instances of allocator_type are always equal // or propagate_on_container_move_assignment is true void internal_move_assign( concurrent_unordered_base&& other, /*is_always_equal || POCMA = */std::true_type ) { move_content(std::move(other)); } // Move assigns the hash table to other is any instances of allocator_type are not always equal // and propagate_on_container_move_assignment is false void internal_move_assign( concurrent_unordered_base&& other, /*is_always_equal || POCMA = */std::false_type ) { if (my_segments.get_allocator() == other.my_segments.get_allocator()) { move_content(std::move(other)); } else { // TODO: guards for exceptions internal_move(std::move(other)); } } 
void internal_swap( concurrent_unordered_base& other, /*is_always_equal || POCS = */std::true_type ) { internal_swap_fields(other); } void internal_swap( concurrent_unordered_base& other, /*is_always_equal || POCS = */std::false_type ) { __TBB_ASSERT(my_segments.get_allocator() == other.my_segments.get_allocator(), "Swapping with unequal allocators is not allowed"); internal_swap_fields(other); } void internal_swap_fields( concurrent_unordered_base& other ) { node_ptr first_node = my_head.next(); my_head.set_next(other.my_head.next()); other.my_head.set_next(first_node); size_type current_size = my_size.load(std::memory_order_relaxed); my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); other.my_size.store(current_size, std::memory_order_relaxed); size_type bucket_count = my_bucket_count.load(std::memory_order_relaxed); my_bucket_count.store(other.my_bucket_count.load(std::memory_order_relaxed), std::memory_order_relaxed); other.my_bucket_count.store(bucket_count, std::memory_order_relaxed); using std::swap; swap(my_max_load_factor, other.my_max_load_factor); swap(my_hash_compare, other.my_hash_compare); my_segments.swap(other.my_segments); // swap() method from segment table swaps all of the segments including the first segment // We should restore it to my_head. Without it the first segment of the container will point // to other.my_head. 
my_segments[0].store(&my_head, std::memory_order_relaxed); other.my_segments[0].store(&other.my_head, std::memory_order_relaxed); } // A regular order key has its original hash value reversed and the last bit set static constexpr sokey_type split_order_key_regular( sokey_type hash ) { return reverse_bits(hash) | 0x1; } // A dummy order key has its original hash value reversed and the last bit unset static constexpr sokey_type split_order_key_dummy( sokey_type hash ) { return reverse_bits(hash) & ~sokey_type(0x1); } size_type get_parent( size_type bucket ) const { // Unset bucket's most significant turned-on bit __TBB_ASSERT(bucket != 0, "Unable to get_parent of the bucket 0"); size_type msb = tbb::detail::log2(bucket); return bucket & ~(size_type(1) << msb); } size_type get_next_bucket_index( size_type bucket ) const { size_type bits = tbb::detail::log2(my_bucket_count.load(std::memory_order_relaxed)); size_type reversed_next = reverse_n_bits(bucket, bits) + 1; return reverse_n_bits(reversed_next, bits); } std::atomic my_size; std::atomic my_bucket_count; float my_max_load_factor; hash_compare_type my_hash_compare; list_node_type my_head; // Head node for split ordered list unordered_segment_table my_segments; // Segment table of pointers to nodes template friend class solist_iterator; template friend class concurrent_unordered_base; }; // class concurrent_unordered_base template bool operator==( const concurrent_unordered_base& lhs, const concurrent_unordered_base& rhs ) { if (&lhs == &rhs) { return true; } if (lhs.size() != rhs.size()) { return false; } #if _MSC_VER // Passing "unchecked" iterators to std::permutation with 3 parameters // causes compiler warnings. 
// The workaround is to use overload with 4 parameters, which is // available since C++14 - minimally supported version on MSVC return std::is_permutation(lhs.begin(), lhs.end(), rhs.begin(), rhs.end()); #else return std::is_permutation(lhs.begin(), lhs.end(), rhs.begin()); #endif } #if !__TBB_CPP20_COMPARISONS_PRESENT template bool operator!=( const concurrent_unordered_base& lhs, const concurrent_unordered_base& rhs ) { return !(lhs == rhs); } #endif #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) #pragma warning(pop) // warning 4127 is back #endif } // namespace d2 } // namespace detail } // namespace tbb #endif // __TBB_detail__concurrent_unordered_base_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_config.h ================================================ /* Copyright (c) 2005-2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_detail__config_H #define __TBB_detail__config_H /** This header is supposed to contain macro definitions only. The macros defined here are intended to control such aspects of TBB build as - presence of compiler features - compilation modes - feature sets - known compiler/platform issues **/ /* Check which standard library we use. 
*/ #include #ifdef __has_include #if __has_include() #include #endif #endif #include "_export.h" #if _MSC_VER #define __TBB_EXPORTED_FUNC __cdecl #define __TBB_EXPORTED_METHOD __thiscall #else #define __TBB_EXPORTED_FUNC #define __TBB_EXPORTED_METHOD #endif #if defined(_MSVC_LANG) #define __TBB_LANG _MSVC_LANG #else #define __TBB_LANG __cplusplus #endif // _MSVC_LANG #define __TBB_CPP14_PRESENT (__TBB_LANG >= 201402L) #define __TBB_CPP17_PRESENT (__TBB_LANG >= 201703L) #define __TBB_CPP20_PRESENT (__TBB_LANG >= 202002L) #if __INTEL_COMPILER || _MSC_VER #define __TBB_NOINLINE(decl) __declspec(noinline) decl #elif __GNUC__ #define __TBB_NOINLINE(decl) decl __attribute__ ((noinline)) #else #define __TBB_NOINLINE(decl) decl #endif #define __TBB_STRING_AUX(x) #x #define __TBB_STRING(x) __TBB_STRING_AUX(x) // Note that when ICC or Clang is in use, __TBB_GCC_VERSION might not fully match // the actual GCC version on the system. #define __TBB_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) /* Check which standard library we use. */ // Prior to GCC 7, GNU libstdc++ did not have a convenient version macro. // Therefore we use different ways to detect its version. #ifdef TBB_USE_GLIBCXX_VERSION // The version is explicitly specified in our public TBB_USE_GLIBCXX_VERSION macro. // Its format should match the __TBB_GCC_VERSION above, e.g. 70301 for libstdc++ coming with GCC 7.3.1. 
#define __TBB_GLIBCXX_VERSION TBB_USE_GLIBCXX_VERSION #elif _GLIBCXX_RELEASE && _GLIBCXX_RELEASE != __GNUC__ // Reported versions of GCC and libstdc++ do not match; trust the latter #define __TBB_GLIBCXX_VERSION (_GLIBCXX_RELEASE*10000) #elif __GLIBCPP__ || __GLIBCXX__ // The version macro is not defined or matches the GCC version; use __TBB_GCC_VERSION #define __TBB_GLIBCXX_VERSION __TBB_GCC_VERSION #endif #if __clang__ // according to clang documentation, version can be vendor specific #define __TBB_CLANG_VERSION (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) #endif /** Macro helpers **/ #define __TBB_CONCAT_AUX(A,B) A##B // The additional level of indirection is needed to expand macros A and B (not to get the AB macro). // See [cpp.subst] and [cpp.concat] for more details. #define __TBB_CONCAT(A,B) __TBB_CONCAT_AUX(A,B) // The IGNORED argument and comma are needed to always have 2 arguments (even when A is empty). #define __TBB_IS_MACRO_EMPTY(A,IGNORED) __TBB_CONCAT_AUX(__TBB_MACRO_EMPTY,A) #define __TBB_MACRO_EMPTY 1 #if _M_X64 || _M_ARM64 #define __TBB_W(name) name##64 #else #define __TBB_W(name) name #endif /** User controlled TBB features & modes **/ #ifndef TBB_USE_DEBUG /* There are four cases that are supported: 1. "_DEBUG is undefined" means "no debug"; 2. "_DEBUG defined to something that is evaluated to 0" (including "garbage", as per [cpp.cond]) means "no debug"; 3. "_DEBUG defined to something that is evaluated to a non-zero value" means "debug"; 4. "_DEBUG defined to nothing (empty)" means "debug". */ #ifdef _DEBUG // Check if _DEBUG is empty. 
#define __TBB_IS__DEBUG_EMPTY (__TBB_IS_MACRO_EMPTY(_DEBUG,IGNORED)==__TBB_MACRO_EMPTY) #if __TBB_IS__DEBUG_EMPTY #define TBB_USE_DEBUG 1 #else #define TBB_USE_DEBUG _DEBUG #endif // __TBB_IS__DEBUG_EMPTY #else #define TBB_USE_DEBUG 0 #endif // _DEBUG #endif // TBB_USE_DEBUG #ifndef TBB_USE_ASSERT #define TBB_USE_ASSERT TBB_USE_DEBUG #endif // TBB_USE_ASSERT #ifndef TBB_USE_PROFILING_TOOLS #if TBB_USE_DEBUG #define TBB_USE_PROFILING_TOOLS 2 #else // TBB_USE_DEBUG #define TBB_USE_PROFILING_TOOLS 0 #endif // TBB_USE_DEBUG #endif // TBB_USE_PROFILING_TOOLS // Exceptions support cases #if !(__EXCEPTIONS || defined(_CPPUNWIND) || __SUNPRO_CC) #if TBB_USE_EXCEPTIONS #error Compilation settings do not support exception handling. Please do not set TBB_USE_EXCEPTIONS macro or set it to 0. #elif !defined(TBB_USE_EXCEPTIONS) #define TBB_USE_EXCEPTIONS 0 #endif #elif !defined(TBB_USE_EXCEPTIONS) #define TBB_USE_EXCEPTIONS 1 #endif /** Preprocessor symbols to determine HW architecture **/ #if _WIN32 || _WIN64 #if defined(_M_X64) || defined(__x86_64__) // the latter for MinGW support #define __TBB_x86_64 1 #elif defined(_M_IA64) #define __TBB_ipf 1 #elif defined(_M_IX86) || defined(__i386__) // the latter for MinGW support #define __TBB_x86_32 1 #else #define __TBB_generic_arch 1 #endif #else /* Assume generic Unix */ #if __x86_64__ #define __TBB_x86_64 1 #elif __ia64__ #define __TBB_ipf 1 #elif __i386__||__i386 // __i386 is for Sun OS #define __TBB_x86_32 1 #else #define __TBB_generic_arch 1 #endif #endif /** Windows API or POSIX API **/ #if _WIN32 || _WIN64 #define __TBB_USE_WINAPI 1 #else #define __TBB_USE_POSIX 1 #endif /** Internal TBB features & modes **/ /** __TBB_DYNAMIC_LOAD_ENABLED describes the system possibility to load shared libraries at run time **/ #ifndef __TBB_DYNAMIC_LOAD_ENABLED #define __TBB_DYNAMIC_LOAD_ENABLED (!__EMSCRIPTEN__) #endif /** __TBB_WIN8UI_SUPPORT enables support of Windows* Store Apps and limit a possibility to load shared libraries at run 
time only from application container **/ #if defined(WINAPI_FAMILY) && WINAPI_FAMILY == WINAPI_FAMILY_APP #define __TBB_WIN8UI_SUPPORT 1 #else #define __TBB_WIN8UI_SUPPORT 0 #endif /** __TBB_WEAK_SYMBOLS_PRESENT denotes that the system supports the weak symbol mechanism **/ #ifndef __TBB_WEAK_SYMBOLS_PRESENT #define __TBB_WEAK_SYMBOLS_PRESENT ( !__EMSCRIPTEN__ && !_WIN32 && !__APPLE__ && !__sun && (__TBB_GCC_VERSION >= 40000 || __INTEL_COMPILER ) ) #endif /** Presence of compiler features **/ #if __clang__ && !__INTEL_COMPILER #define __TBB_USE_OPTIONAL_RTTI __has_feature(cxx_rtti) #elif defined(_CPPRTTI) #define __TBB_USE_OPTIONAL_RTTI 1 #else #define __TBB_USE_OPTIONAL_RTTI (__GXX_RTTI || __RTTI || __INTEL_RTTI__) #endif /** Address sanitizer detection **/ #ifdef __SANITIZE_ADDRESS__ #define __TBB_USE_ADDRESS_SANITIZER 1 #elif defined(__has_feature) #if __has_feature(address_sanitizer) #define __TBB_USE_ADDRESS_SANITIZER 1 #endif #endif /** Library features presence macros **/ #define __TBB_CPP14_INTEGER_SEQUENCE_PRESENT (__TBB_LANG >= 201402L) #define __TBB_CPP17_INVOKE_PRESENT (__TBB_LANG >= 201703L) // TODO: Remove the condition(__INTEL_COMPILER > 2021) from the __TBB_CPP17_DEDUCTION_GUIDES_PRESENT // macro when this feature start working correctly on this compiler. 
#if __INTEL_COMPILER && (!_MSC_VER || __INTEL_CXX11_MOVE__) #define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (__TBB_LANG >= 201402L) #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (__INTEL_COMPILER > 2021 && __TBB_LANG >= 201703L) #elif __clang__ #define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (__has_feature(cxx_variable_templates)) #ifdef __cpp_deduction_guides #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (__cpp_deduction_guides >= 201611L) #else #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT 0 #endif #elif __GNUC__ #define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (__TBB_LANG >= 201402L && __TBB_GCC_VERSION >= 50000) #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (__cpp_deduction_guides >= 201606L) #elif _MSC_VER #define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (_MSC_FULL_VER >= 190023918 && (!__INTEL_COMPILER || __INTEL_COMPILER >= 1700)) #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (_MSC_VER >= 1914 && __TBB_LANG >= 201703L && (!__INTEL_COMPILER || __INTEL_COMPILER > 2021)) #else #define __TBB_CPP14_VARIABLE_TEMPLATES_PRESENT (__TBB_LANG >= 201402L) #define __TBB_CPP17_DEDUCTION_GUIDES_PRESENT (__TBB_LANG >= 201703L) #endif // GCC4.8 on RHEL7 does not support std::get_new_handler #define __TBB_CPP11_GET_NEW_HANDLER_PRESENT (_MSC_VER >= 1900 || __TBB_GLIBCXX_VERSION >= 40900 && __GXX_EXPERIMENTAL_CXX0X__ || _LIBCPP_VERSION) // GCC4.8 on RHEL7 does not support std::is_trivially_copyable #define __TBB_CPP11_TYPE_PROPERTIES_PRESENT (_LIBCPP_VERSION || _MSC_VER >= 1700 || (__TBB_GLIBCXX_VERSION >= 50000 && __GXX_EXPERIMENTAL_CXX0X__)) #define __TBB_CPP17_MEMORY_RESOURCE_PRESENT (_MSC_VER >= 1913 && (__TBB_LANG > 201402L) || \ __TBB_GLIBCXX_VERSION >= 90000 && __TBB_LANG >= 201703L) #define __TBB_CPP17_HW_INTERFERENCE_SIZE_PRESENT (_MSC_VER >= 1911) #define __TBB_CPP17_LOGICAL_OPERATIONS_PRESENT (__TBB_LANG >= 201703L) #define __TBB_CPP17_ALLOCATOR_IS_ALWAYS_EQUAL_PRESENT (__TBB_LANG >= 201703L) #define __TBB_CPP17_IS_SWAPPABLE_PRESENT (__TBB_LANG >= 201703L) // TODO: fix concepts 
on Clang or define the broken versions #if !(__clang__) && defined(__cpp_concepts) && defined(__cpp_lib_concepts) #define __TBB_CPP20_CONCEPTS_PRESENT ((__cpp_concepts >= 201907L) && (__cpp_lib_concepts >= 202002L)) #else #define __TBB_CPP20_CONCEPTS_PRESENT 0 #endif #if defined(__cpp_impl_three_way_comparison) && defined(__cpp_lib_three_way_comparison) #define __TBB_CPP20_COMPARISONS_PRESENT ((__cpp_impl_three_way_comparison >= 201907L) && (__cpp_lib_three_way_comparison >= 201907L)) #else #define __TBB_CPP20_COMPARISONS_PRESENT 0 #endif #define __TBB_RESUMABLE_TASKS (!__TBB_WIN8UI_SUPPORT && !__ANDROID__ && !__QNXNTO__ && (!__linux__ || __GLIBC__)) /* This macro marks incomplete code or comments describing ideas which are considered for the future. * See also for plain comment with TODO and FIXME marks for small improvement opportunities. */ #define __TBB_TODO 0 /* Check which standard library we use. */ /* __TBB_SYMBOL is defined only while processing exported symbols list where C++ is not allowed. 
*/ #if !defined(__TBB_SYMBOL) && !__TBB_CONFIG_PREPROC_ONLY #include #endif /** Target OS is either iOS* or iOS* simulator **/ #if __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ #define __TBB_IOS 1 #endif #if __APPLE__ #if __INTEL_COMPILER && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ > 1099 \ && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101000 // ICC does not correctly set the macro if -mmacosx-min-version is not specified #define __TBB_MACOS_TARGET_VERSION (100000 + 10*(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ - 1000)) #else #define __TBB_MACOS_TARGET_VERSION __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ #endif #endif #if defined(__GNUC__) && !defined(__INTEL_COMPILER) #define __TBB_GCC_WARNING_IGNORED_ATTRIBUTES_PRESENT (__TBB_GCC_VERSION >= 60100) #endif #if __GNUC__ && !__INTEL_COMPILER && !__clang__ #define __TBB_GCC_PARAMETER_PACK_IN_LAMBDAS_BROKEN (__TBB_GCC_VERSION <= 40805) #endif #define __TBB_CPP17_FALLTHROUGH_PRESENT (__TBB_LANG >= 201703L) #define __TBB_CPP17_NODISCARD_PRESENT (__TBB_LANG >= 201703L) #define __TBB_FALLTHROUGH_PRESENT (__TBB_GCC_VERSION >= 70000 && !__INTEL_COMPILER) #if __TBB_CPP17_FALLTHROUGH_PRESENT #define __TBB_fallthrough [[fallthrough]] #elif __TBB_FALLTHROUGH_PRESENT #define __TBB_fallthrough __attribute__ ((fallthrough)) #else #define __TBB_fallthrough #endif #if __TBB_CPP17_NODISCARD_PRESENT #define __TBB_nodiscard [[nodiscard]] #elif __clang__ || __GNUC__ #define __TBB_nodiscard __attribute__((warn_unused_result)) #else #define __TBB_nodiscard #endif #define __TBB_CPP17_UNCAUGHT_EXCEPTIONS_PRESENT (_MSC_VER >= 1900 || __GLIBCXX__ && __cpp_lib_uncaught_exceptions \ || _LIBCPP_VERSION >= 3700 && (!__TBB_MACOS_TARGET_VERSION || __TBB_MACOS_TARGET_VERSION >= 101200)) #define __TBB_TSX_INTRINSICS_PRESENT (__RTM__ || __INTEL_COMPILER || (_MSC_VER>=1700 && (__TBB_x86_64 || __TBB_x86_32))) #define __TBB_WAITPKG_INTRINSICS_PRESENT ((__INTEL_COMPILER >= 1900 || (__TBB_GCC_VERSION >= 110000 && __TBB_GNU_ASM_VERSION >= 
2032) || __TBB_CLANG_VERSION >= 120000) \ && (_WIN32 || _WIN64 || __unix__ || __APPLE__) && (__TBB_x86_32 || __TBB_x86_64) && !__ANDROID__) /** Internal TBB features & modes **/ /** __TBB_SOURCE_DIRECTLY_INCLUDED is a mode used in whitebox testing when it's necessary to test internal functions not exported from TBB DLLs **/ #if (_WIN32||_WIN64) && (__TBB_SOURCE_DIRECTLY_INCLUDED || TBB_USE_PREVIEW_BINARY) #define __TBB_NO_IMPLICIT_LINKAGE 1 #define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1 #endif #if (__TBB_BUILD || __TBBMALLOC_BUILD || __TBBMALLOCPROXY_BUILD || __TBBBIND_BUILD) && !defined(__TBB_NO_IMPLICIT_LINKAGE) #define __TBB_NO_IMPLICIT_LINKAGE 1 #endif #if _MSC_VER #if !__TBB_NO_IMPLICIT_LINKAGE #ifdef _DEBUG #pragma comment(lib, "tbb12_debug.lib") #else #pragma comment(lib, "tbb12.lib") #endif #endif #endif #ifndef __TBB_SCHEDULER_OBSERVER #define __TBB_SCHEDULER_OBSERVER 1 #endif /* __TBB_SCHEDULER_OBSERVER */ #ifndef __TBB_FP_CONTEXT #define __TBB_FP_CONTEXT 1 #endif /* __TBB_FP_CONTEXT */ #define __TBB_RECYCLE_TO_ENQUEUE __TBB_BUILD // keep non-official #ifndef __TBB_ARENA_OBSERVER #define __TBB_ARENA_OBSERVER __TBB_SCHEDULER_OBSERVER #endif /* __TBB_ARENA_OBSERVER */ #ifndef __TBB_ARENA_BINDING #define __TBB_ARENA_BINDING 1 #endif // Thread pinning is not available on macOS* #define __TBB_CPUBIND_PRESENT (__TBB_ARENA_BINDING && !__APPLE__) #ifndef __TBB_ENQUEUE_ENFORCED_CONCURRENCY #define __TBB_ENQUEUE_ENFORCED_CONCURRENCY 1 #endif #if !defined(__TBB_SURVIVE_THREAD_SWITCH) && \ (_WIN32 || _WIN64 || __APPLE__ || (defined(__unix__) && !__ANDROID__)) #define __TBB_SURVIVE_THREAD_SWITCH 1 #endif /* __TBB_SURVIVE_THREAD_SWITCH */ #ifndef TBB_PREVIEW_FLOW_GRAPH_FEATURES #define TBB_PREVIEW_FLOW_GRAPH_FEATURES __TBB_CPF_BUILD #endif #ifndef __TBB_DEFAULT_PARTITIONER #define __TBB_DEFAULT_PARTITIONER tbb::auto_partitioner #endif #ifndef __TBB_FLOW_TRACE_CODEPTR #define __TBB_FLOW_TRACE_CODEPTR __TBB_CPF_BUILD #endif // Intel(R) C++ Compiler starts analyzing usages of 
the deprecated content at the template // instantiation site, which is too late for suppression of the corresponding messages for internal // stuff. #if !defined(__INTEL_COMPILER) && (!defined(TBB_SUPPRESS_DEPRECATED_MESSAGES) || (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0)) #if (__TBB_LANG >= 201402L && (!defined(_MSC_VER) || _MSC_VER >= 1920)) #define __TBB_DEPRECATED [[deprecated]] #define __TBB_DEPRECATED_MSG(msg) [[deprecated(msg)]] #elif _MSC_VER #define __TBB_DEPRECATED __declspec(deprecated) #define __TBB_DEPRECATED_MSG(msg) __declspec(deprecated(msg)) #elif (__GNUC__ && __TBB_GCC_VERSION >= 40805) || __clang__ #define __TBB_DEPRECATED __attribute__((deprecated)) #define __TBB_DEPRECATED_MSG(msg) __attribute__((deprecated(msg))) #endif #endif // !defined(TBB_SUPPRESS_DEPRECATED_MESSAGES) || (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0) #if !defined(__TBB_DEPRECATED) #define __TBB_DEPRECATED #define __TBB_DEPRECATED_MSG(msg) #elif !defined(__TBB_SUPPRESS_INTERNAL_DEPRECATED_MESSAGES) // Suppress deprecated messages from self #define __TBB_SUPPRESS_INTERNAL_DEPRECATED_MESSAGES 1 #endif #if defined(TBB_SUPPRESS_DEPRECATED_MESSAGES) && (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0) #define __TBB_DEPRECATED_VERBOSE __TBB_DEPRECATED #define __TBB_DEPRECATED_VERBOSE_MSG(msg) __TBB_DEPRECATED_MSG(msg) #else #define __TBB_DEPRECATED_VERBOSE #define __TBB_DEPRECATED_VERBOSE_MSG(msg) #endif // (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0) #if (!defined(TBB_SUPPRESS_DEPRECATED_MESSAGES) || (TBB_SUPPRESS_DEPRECATED_MESSAGES == 0)) && !(__TBB_LANG >= 201103L || _MSC_VER >= 1900) #pragma message("TBB Warning: Support for C++98/03 is deprecated. 
Please use the compiler that supports C++11 features at least.") #endif #ifdef _VARIADIC_MAX #define __TBB_VARIADIC_MAX _VARIADIC_MAX #else #if _MSC_VER == 1700 #define __TBB_VARIADIC_MAX 5 // VS11 setting, issue resolved in VS12 #elif _MSC_VER == 1600 #define __TBB_VARIADIC_MAX 10 // VS10 setting #else #define __TBB_VARIADIC_MAX 15 #endif #endif #if __SANITIZE_THREAD__ #define __TBB_USE_THREAD_SANITIZER 1 #elif defined(__has_feature) #if __has_feature(thread_sanitizer) #define __TBB_USE_THREAD_SANITIZER 1 #endif #endif #ifndef __TBB_USE_SANITIZERS #define __TBB_USE_SANITIZERS (__TBB_USE_THREAD_SANITIZER || __TBB_USE_ADDRESS_SANITIZER) #endif #ifndef __TBB_RESUMABLE_TASKS_USE_THREADS #define __TBB_RESUMABLE_TASKS_USE_THREADS __TBB_USE_SANITIZERS #endif #ifndef __TBB_USE_CONSTRAINTS #define __TBB_USE_CONSTRAINTS 1 #endif #ifndef __TBB_STRICT_CONSTRAINTS #define __TBB_STRICT_CONSTRAINTS 1 #endif #if __TBB_CPP20_CONCEPTS_PRESENT && __TBB_USE_CONSTRAINTS #define __TBB_requires(...) requires __VA_ARGS__ #else // __TBB_CPP20_CONCEPTS_PRESENT #define __TBB_requires(...) #endif // __TBB_CPP20_CONCEPTS_PRESENT /** Macros of the form __TBB_XXX_BROKEN denote known issues that are caused by the bugs in compilers, standard or OS specific libraries. They should be removed as soon as the corresponding bugs are fixed or the buggy OS/compiler versions go out of the support list. **/ // Some STL containers not support allocator traits in old GCC versions #if __GXX_EXPERIMENTAL_CXX0X__ && __TBB_GLIBCXX_VERSION <= 50301 #define TBB_ALLOCATOR_TRAITS_BROKEN 1 #endif // GCC 4.8 C++ standard library implements std::this_thread::yield as no-op. 
#if __TBB_GLIBCXX_VERSION >= 40800 && __TBB_GLIBCXX_VERSION < 40900 #define __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN 1 #endif /** End of __TBB_XXX_BROKEN macro section **/ #if defined(_MSC_VER) && _MSC_VER>=1500 && !defined(__INTEL_COMPILER) // A macro to suppress erroneous or benign "unreachable code" MSVC warning (4702) #define __TBB_MSVC_UNREACHABLE_CODE_IGNORED 1 #endif // Many OS versions (Android 4.0.[0-3] for example) need workaround for dlopen to avoid non-recursive loader lock hang // Setting the workaround for all compile targets ($APP_PLATFORM) below Android 4.4 (android-19) #if __ANDROID__ #include #endif #define __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING (TBB_PREVIEW_FLOW_GRAPH_FEATURES) #ifndef __TBB_PREVIEW_CRITICAL_TASKS #define __TBB_PREVIEW_CRITICAL_TASKS 1 #endif #ifndef __TBB_PREVIEW_FLOW_GRAPH_NODE_SET #define __TBB_PREVIEW_FLOW_GRAPH_NODE_SET (TBB_PREVIEW_FLOW_GRAPH_FEATURES) #endif #ifndef __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT #define __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT (TBB_PREVIEW_FLOW_GRAPH_FEATURES \ || TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT) #endif #if TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS #define __TBB_PREVIEW_CONCURRENT_HASH_MAP_EXTENSIONS 1 #endif #if TBB_PREVIEW_TASK_GROUP_EXTENSIONS || __TBB_BUILD #define __TBB_PREVIEW_TASK_GROUP_EXTENSIONS 1 #endif #if TBB_PREVIEW_PARALLEL_PHASE || __TBB_BUILD #define __TBB_PREVIEW_PARALLEL_PHASE 1 #endif #if TBB_PREVIEW_BLOCKED_ND_RANGE_DEDUCTION_GUIDES #define __TBB_PREVIEW_BLOCKED_ND_RANGE_DEDUCTION_GUIDES 1 #endif #endif // __TBB_detail__config_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_containers_helpers.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_detail__containers_helpers_H #define __TBB_detail__containers_helpers_H #include "_template_helpers.h" #include "_allocator_traits.h" #include #include #include namespace tbb { namespace detail { inline namespace d0 { template struct comp_is_transparent : std::false_type {}; template struct comp_is_transparent> : std::true_type {}; template struct has_transparent_key_equal : std::false_type { using type = KeyEqual; }; template struct has_transparent_key_equal> : std::true_type { using type = typename Hasher::transparent_key_equal; static_assert(comp_is_transparent::value, "Hash::transparent_key_equal::is_transparent is not valid or does not denote a type."); static_assert((std::is_same>::value || std::is_same::value), "KeyEqual is a different type than equal_to or Hash::transparent_key_equal."); }; struct is_iterator_impl { template using iter_traits_category = typename std::iterator_traits::iterator_category; template using input_iter_category = typename std::enable_if>::value>::type; }; // struct is_iterator_impl template using is_input_iterator = supports; #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT template inline constexpr bool is_input_iterator_v = is_input_iterator::value; #endif } // inline namespace d0 } // namespace detail } // namespace tbb #endif // __TBB_detail__containers_helpers_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_exception.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not 
use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB__exception_H #define __TBB__exception_H #include "_config.h" #include // std::bad_alloc #include // std::exception #include // std::runtime_error namespace tbb { namespace detail { inline namespace d0 { enum class exception_id { bad_alloc = 1, bad_last_alloc, user_abort, nonpositive_step, out_of_range, reservation_length_error, missing_wait, invalid_load_factor, invalid_key, bad_tagged_msg_cast, unsafe_wait, last_entry }; } // namespace d0 #if _MSC_VER #pragma warning(disable: 4275) #endif namespace r1 { //! Exception for concurrent containers class TBB_EXPORT bad_last_alloc : public std::bad_alloc { public: const char* __TBB_EXPORTED_METHOD what() const noexcept(true) override; }; //! Exception for user-initiated abort class TBB_EXPORT user_abort : public std::exception { public: const char* __TBB_EXPORTED_METHOD what() const noexcept(true) override; }; //! Exception for missing wait on structured_task_group class TBB_EXPORT missing_wait : public std::exception { public: const char* __TBB_EXPORTED_METHOD what() const noexcept(true) override; }; //! Exception for impossible finalization of task_sheduler_handle #if __APPLE__ #pragma GCC visibility push(default) #endif class TBB_EXPORT unsafe_wait : public std::runtime_error { public: unsafe_wait(const char* msg) : std::runtime_error(msg) {} }; #if __APPLE__ #pragma GCC visibility pop #endif //! Gathers all throw operators in one place. 
/** Its purpose is to minimize code bloat that can be caused by throw operators scattered in multiple places, especially in templates. **/ TBB_EXPORT void __TBB_EXPORTED_FUNC throw_exception ( exception_id ); } // namespace r1 inline namespace d0 { using r1::throw_exception; } // namespace d0 } // namespace detail } // namespace tbb #endif // __TBB__exception_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_export.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_detail__export_H #define __TBB_detail__export_H #if defined(__MINGW32__) #define __TBB_EXPORT __declspec(dllexport) #elif defined(_WIN32) // Use .def files for these #define __TBB_EXPORT #elif defined(__unix__) || defined(__APPLE__) // Use .def files for these #define __TBB_EXPORT __attribute__ ((visibility ("default"))) #else #error "Unknown platform/compiler" #endif #if __TBB_BUILD #define TBB_EXPORT __TBB_EXPORT #else #define TBB_EXPORT #endif #if __TBBMALLOC_BUILD #define TBBMALLOC_EXPORT __TBB_EXPORT #else #define TBBMALLOC_EXPORT #endif #if __TBBMALLOCPROXY_BUILD #define TBBMALLOCPROXY_EXPORT __TBB_EXPORT #else #define TBBMALLOCPROXY_EXPORT #endif #if __TBBBIND_BUILD #define TBBBIND_EXPORT __TBB_EXPORT #else #define TBBBIND_EXPORT #endif #endif ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_flow_graph_body_impl.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB__flow_graph_body_impl_H #define __TBB__flow_graph_body_impl_H #ifndef __TBB_flow_graph_H #error Do not #include this internal file directly; use public TBB headers instead. #endif // included in namespace tbb::detail::d2 (in flow_graph.h) typedef std::uint64_t tag_value; // TODO revamp: find out if there is already helper for has_policy. 
template struct Policy {}; template struct has_policy; template struct has_policy : std::integral_constant::value || has_policy::value> {}; template struct has_policy : std::integral_constant::value> {}; template struct has_policy > : has_policy {}; namespace graph_policy_namespace { struct rejecting { }; struct reserving { }; struct queueing { }; struct lightweight { }; // K == type of field used for key-matching. Each tag-matching port will be provided // functor that, given an object accepted by the port, will return the /// field of type K being used for matching. template::type > > __TBB_requires(tbb::detail::hash_compare) struct key_matching { typedef K key_type; typedef typename std::decay::type base_key_type; typedef KHash hash_compare_type; }; // old tag_matching join's new specifier typedef key_matching tag_matching; // Aliases for Policy combinations typedef Policy queueing_lightweight; typedef Policy rejecting_lightweight; } // namespace graph_policy_namespace // -------------- function_body containers ---------------------- //! A functor that takes no input and generates a value of type Output template< typename Output > class input_body : no_assign { public: virtual ~input_body() {} virtual Output operator()(d1::flow_control& fc) = 0; virtual input_body* clone() = 0; }; //! The leaf for input_body template< typename Output, typename Body> class input_body_leaf : public input_body { public: input_body_leaf( const Body &_body ) : body(_body) { } Output operator()(d1::flow_control& fc) override { return body(fc); } input_body_leaf* clone() override { return new input_body_leaf< Output, Body >(body); } Body get_body() { return body; } private: Body body; }; //! A functor that takes an Input and generates an Output template< typename Input, typename Output > class function_body : no_assign { public: virtual ~function_body() {} virtual Output operator()(const Input &input) = 0; virtual function_body* clone() = 0; }; //! 
the leaf for function_body template class function_body_leaf : public function_body< Input, Output > { public: function_body_leaf( const B &_body ) : body(_body) { } Output operator()(const Input &i) override { return tbb::detail::invoke(body,i); } B get_body() { return body; } function_body_leaf* clone() override { return new function_body_leaf< Input, Output, B >(body); } private: B body; }; //! the leaf for function_body specialized for Input and output of continue_msg template class function_body_leaf< continue_msg, continue_msg, B> : public function_body< continue_msg, continue_msg > { public: function_body_leaf( const B &_body ) : body(_body) { } continue_msg operator()( const continue_msg &i ) override { body(i); return i; } B get_body() { return body; } function_body_leaf* clone() override { return new function_body_leaf< continue_msg, continue_msg, B >(body); } private: B body; }; //! the leaf for function_body specialized for Output of continue_msg template class function_body_leaf< Input, continue_msg, B> : public function_body< Input, continue_msg > { public: function_body_leaf( const B &_body ) : body(_body) { } continue_msg operator()(const Input &i) override { body(i); return continue_msg(); } B get_body() { return body; } function_body_leaf* clone() override { return new function_body_leaf< Input, continue_msg, B >(body); } private: B body; }; //! the leaf for function_body specialized for Input of continue_msg template class function_body_leaf< continue_msg, Output, B > : public function_body< continue_msg, Output > { public: function_body_leaf( const B &_body ) : body(_body) { } Output operator()(const continue_msg &i) override { return body(i); } B get_body() { return body; } function_body_leaf* clone() override { return new function_body_leaf< continue_msg, Output, B >(body); } private: B body; }; //! 
function_body that takes an Input and a set of output ports template class multifunction_body : no_assign { public: virtual ~multifunction_body () {} virtual void operator()(const Input &/* input*/, OutputSet &/*oset*/) = 0; virtual multifunction_body* clone() = 0; virtual void* get_body_ptr() = 0; }; //! leaf for multifunction. OutputSet can be a std::tuple or a vector. template class multifunction_body_leaf : public multifunction_body { public: multifunction_body_leaf(const B &_body) : body(_body) { } void operator()(const Input &input, OutputSet &oset) override { tbb::detail::invoke(body, input, oset); // body may explicitly put() to one or more of oset. } void* get_body_ptr() override { return &body; } multifunction_body_leaf* clone() override { return new multifunction_body_leaf(body); } private: B body; }; // ------ function bodies for hash_buffers and key-matching joins. template class type_to_key_function_body : no_assign { public: virtual ~type_to_key_function_body() {} virtual Output operator()(const Input &input) = 0; // returns an Output virtual type_to_key_function_body* clone() = 0; }; // specialization for ref output template class type_to_key_function_body : no_assign { public: virtual ~type_to_key_function_body() {} virtual const Output & operator()(const Input &input) = 0; // returns a const Output& virtual type_to_key_function_body* clone() = 0; }; template class type_to_key_function_body_leaf : public type_to_key_function_body { public: type_to_key_function_body_leaf( const B &_body ) : body(_body) { } Output operator()(const Input &i) override { return tbb::detail::invoke(body, i); } type_to_key_function_body_leaf* clone() override { return new type_to_key_function_body_leaf< Input, Output, B>(body); } private: B body; }; template class type_to_key_function_body_leaf : public type_to_key_function_body< Input, Output&> { public: type_to_key_function_body_leaf( const B &_body ) : body(_body) { } const Output& operator()(const Input &i) override { 
return tbb::detail::invoke(body, i); } type_to_key_function_body_leaf* clone() override { return new type_to_key_function_body_leaf< Input, Output&, B>(body); } private: B body; }; // --------------------------- end of function_body containers ------------------------ // --------------------------- node task bodies --------------------------------------- //! A task that calls a node's forward_task function template< typename NodeType > class forward_task_bypass : public graph_task { NodeType &my_node; public: forward_task_bypass( graph& g, d1::small_object_allocator& allocator, NodeType &n , node_priority_t node_priority = no_priority ) : graph_task(g, allocator, node_priority), my_node(n) {} d1::task* execute(d1::execution_data& ed) override { graph_task* next_task = my_node.forward_task(); if (SUCCESSFULLY_ENQUEUED == next_task) next_task = nullptr; else if (next_task) next_task = prioritize_task(my_node.graph_reference(), *next_task); finalize(ed); return next_task; } d1::task* cancel(d1::execution_data& ed) override { finalize(ed); return nullptr; } }; //! 
A task that calls a node's apply_body_bypass function, passing in an input of type Input // return the task* unless it is SUCCESSFULLY_ENQUEUED, in which case return nullptr template< typename NodeType, typename Input, typename BaseTaskType = graph_task> class apply_body_task_bypass : public BaseTaskType { NodeType &my_node; Input my_input; using check_metainfo = std::is_same; using without_metainfo = std::true_type; using with_metainfo = std::false_type; graph_task* call_apply_body_bypass_impl(without_metainfo) { return my_node.apply_body_bypass(my_input __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT graph_task* call_apply_body_bypass_impl(with_metainfo) { return my_node.apply_body_bypass(my_input, message_metainfo{this->get_msg_wait_context_vertices()}); } #endif graph_task* call_apply_body_bypass() { return call_apply_body_bypass_impl(check_metainfo{}); } public: #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT template apply_body_task_bypass( graph& g, d1::small_object_allocator& allocator, NodeType &n, const Input &i, node_priority_t node_priority, Metainfo&& metainfo ) : BaseTaskType(g, allocator, node_priority, std::forward(metainfo).waiters()) , my_node(n), my_input(i) {} #endif apply_body_task_bypass( graph& g, d1::small_object_allocator& allocator, NodeType& n, const Input& i, node_priority_t node_priority = no_priority ) : BaseTaskType(g, allocator, node_priority), my_node(n), my_input(i) {} d1::task* execute(d1::execution_data& ed) override { graph_task* next_task = call_apply_body_bypass(); if (SUCCESSFULLY_ENQUEUED == next_task) next_task = nullptr; else if (next_task) next_task = prioritize_task(my_node.graph_reference(), *next_task); BaseTaskType::template finalize(ed); return next_task; } d1::task* cancel(d1::execution_data& ed) override { BaseTaskType::template finalize(ed); return nullptr; } }; //! 
A task that calls a node's apply_body_bypass function with no input template< typename NodeType > class input_node_task_bypass : public graph_task { NodeType &my_node; public: input_node_task_bypass( graph& g, d1::small_object_allocator& allocator, NodeType &n ) : graph_task(g, allocator), my_node(n) {} d1::task* execute(d1::execution_data& ed) override { graph_task* next_task = my_node.apply_body_bypass( ); if (SUCCESSFULLY_ENQUEUED == next_task) next_task = nullptr; else if (next_task) next_task = prioritize_task(my_node.graph_reference(), *next_task); finalize(ed); return next_task; } d1::task* cancel(d1::execution_data& ed) override { finalize(ed); return nullptr; } }; // ------------------------ end of node task bodies ----------------------------------- template class threshold_regulator; template class threshold_regulator::value>::type> : public receiver, no_copy { T* my_node; protected: graph_task* try_put_task( const DecrementType& value ) override { graph_task* result = my_node->decrement_counter( value ); if( !result ) result = SUCCESSFULLY_ENQUEUED; return result; } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT // Intentionally ignore the metainformation // If there are more items associated with passed metainfo to be processed // They should be stored in the buffer before the limiter_node graph_task* try_put_task(const DecrementType& value, const message_metainfo&) override { return try_put_task(value); } #endif graph& graph_reference() const override { return my_node->my_graph; } template friend class limiter_node; void reset_receiver( reset_flags ) {} public: threshold_regulator(T* owner) : my_node(owner) { // Do not work with the passed pointer here as it may not be fully initialized yet } }; template class threshold_regulator : public continue_receiver, no_copy { T *my_node; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT // Intentionally ignore the metainformation // If there are more items associated with passed metainfo to be processed // They 
should be stored in the buffer before the limiter_node graph_task* execute(const message_metainfo&) override { #else graph_task* execute() override { #endif return my_node->decrement_counter( 1 ); } protected: graph& graph_reference() const override { return my_node->my_graph; } public: typedef continue_msg input_type; typedef continue_msg output_type; threshold_regulator(T* owner) : continue_receiver( /*number_of_predecessors=*/0, no_priority ), my_node(owner) { // Do not work with the passed pointer here as it may not be fully initialized yet } }; #endif // __TBB__flow_graph_body_impl_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_flow_graph_cache_impl.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB__flow_graph_cache_impl_H #define __TBB__flow_graph_cache_impl_H #ifndef __TBB_flow_graph_H #error Do not #include this internal file directly; use public TBB headers instead. #endif // included in namespace tbb::detail::d2 (in flow_graph.h) //! A node_cache maintains a std::queue of elements of type T. Each operation is protected by a lock. 
template< typename T, typename M=spin_mutex > class node_cache { public: typedef size_t size_type; bool empty() { typename mutex_type::scoped_lock lock( my_mutex ); return internal_empty(); } void add( T &n ) { typename mutex_type::scoped_lock lock( my_mutex ); internal_push(n); } void remove( T &n ) { typename mutex_type::scoped_lock lock( my_mutex ); for ( size_t i = internal_size(); i != 0; --i ) { T &s = internal_pop(); if ( &s == &n ) break; // only remove one predecessor per request internal_push(s); } } void clear() { while( !my_q.empty()) (void)my_q.pop(); } protected: typedef M mutex_type; mutex_type my_mutex; std::queue< T * > my_q; // Assumes lock is held inline bool internal_empty( ) { return my_q.empty(); } // Assumes lock is held inline size_type internal_size( ) { return my_q.size(); } // Assumes lock is held inline void internal_push( T &n ) { my_q.push(&n); } // Assumes lock is held inline T &internal_pop() { T *v = my_q.front(); my_q.pop(); return *v; } }; //! A cache of predecessors that only supports try_get template< typename T, typename M=spin_mutex > class predecessor_cache : public node_cache< sender, M > { public: typedef M mutex_type; typedef T output_type; typedef sender predecessor_type; typedef receiver successor_type; predecessor_cache( successor_type* owner ) : my_owner( owner ) { __TBB_ASSERT( my_owner, "predecessor_cache should have an owner." 
); // Do not work with the passed pointer here as it may not be fully initialized yet } private: bool get_item_impl( output_type& v __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo* metainfo_ptr = nullptr) ) { bool successful_get = false; do { predecessor_type *src; { typename mutex_type::scoped_lock lock(this->my_mutex); if ( this->internal_empty() ) { break; } src = &this->internal_pop(); } // Try to get from this sender #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT if (metainfo_ptr) { successful_get = src->try_get( v, *metainfo_ptr ); } else #endif { successful_get = src->try_get( v ); } if (successful_get == false) { // Relinquish ownership of the edge register_successor(*src, *my_owner); } else { // Retain ownership of the edge this->add(*src); } } while ( successful_get == false ); return successful_get; } public: bool get_item( output_type& v ) { return get_item_impl(v); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT bool get_item( output_type& v, message_metainfo& metainfo ) { return get_item_impl(v, &metainfo); } #endif // If we are removing arcs (rf_clear_edges), call clear() rather than reset(). void reset() { for(;;) { predecessor_type *src; { if (this->internal_empty()) break; src = &this->internal_pop(); } register_successor(*src, *my_owner); } } protected: successor_type* my_owner; }; //! 
An cache of predecessors that supports requests and reservations template< typename T, typename M=spin_mutex > class reservable_predecessor_cache : public predecessor_cache< T, M > { public: typedef M mutex_type; typedef T output_type; typedef sender predecessor_type; typedef receiver successor_type; reservable_predecessor_cache( successor_type* owner ) : predecessor_cache(owner), reserved_src(nullptr) { // Do not work with the passed pointer here as it may not be fully initialized yet } private: bool try_reserve_impl( output_type &v __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo* metainfo) ) { bool successful_reserve = false; do { predecessor_type* pred = nullptr; { typename mutex_type::scoped_lock lock(this->my_mutex); if ( reserved_src.load(std::memory_order_relaxed) || this->internal_empty() ) return false; pred = &this->internal_pop(); reserved_src.store(pred, std::memory_order_relaxed); } // Try to get from this sender #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT if (metainfo) { successful_reserve = pred->try_reserve( v, *metainfo ); } else #endif { successful_reserve = pred->try_reserve( v ); } if (successful_reserve == false) { typename mutex_type::scoped_lock lock(this->my_mutex); // Relinquish ownership of the edge register_successor( *pred, *this->my_owner ); reserved_src.store(nullptr, std::memory_order_relaxed); } else { // Retain ownership of the edge this->add( *pred); } } while ( successful_reserve == false ); return successful_reserve; } public: bool try_reserve( output_type& v ) { return try_reserve_impl(v __TBB_FLOW_GRAPH_METAINFO_ARG(nullptr)); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT bool try_reserve( output_type& v, message_metainfo& metainfo ) { return try_reserve_impl(v, &metainfo); } #endif bool try_release() { reserved_src.load(std::memory_order_relaxed)->try_release(); reserved_src.store(nullptr, std::memory_order_relaxed); return true; } bool try_consume() { reserved_src.load(std::memory_order_relaxed)->try_consume(); 
reserved_src.store(nullptr, std::memory_order_relaxed); return true; } void reset() { reserved_src.store(nullptr, std::memory_order_relaxed); predecessor_cache::reset(); } void clear() { reserved_src.store(nullptr, std::memory_order_relaxed); predecessor_cache::clear(); } private: std::atomic reserved_src; }; //! An abstract cache of successors template class successor_cache : no_copy { protected: typedef M mutex_type; mutex_type my_mutex; typedef receiver successor_type; typedef receiver* pointer_type; typedef sender owner_type; // TODO revamp: introduce heapified collection of successors for strict priorities typedef std::list< pointer_type > successors_type; successors_type my_successors; owner_type* my_owner; public: successor_cache( owner_type* owner ) : my_owner(owner) { // Do not work with the passed pointer here as it may not be fully initialized yet } virtual ~successor_cache() {} void register_successor( successor_type& r ) { typename mutex_type::scoped_lock l(my_mutex, true); if( r.priority() != no_priority ) my_successors.push_front( &r ); else my_successors.push_back( &r ); } void remove_successor( successor_type& r ) { typename mutex_type::scoped_lock l(my_mutex, true); for ( typename successors_type::iterator i = my_successors.begin(); i != my_successors.end(); ++i ) { if ( *i == & r ) { my_successors.erase(i); break; } } } bool empty() { typename mutex_type::scoped_lock l(my_mutex, false); return my_successors.empty(); } void clear() { my_successors.clear(); } virtual graph_task* try_put_task( const T& t ) = 0; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT virtual graph_task* try_put_task( const T& t, const message_metainfo& metainfo ) = 0; #endif }; // successor_cache //! 
An abstract cache of successors, specialized to continue_msg template class successor_cache< continue_msg, M > : no_copy { protected: typedef M mutex_type; mutex_type my_mutex; typedef receiver successor_type; typedef receiver* pointer_type; typedef sender owner_type; typedef std::list< pointer_type > successors_type; successors_type my_successors; owner_type* my_owner; public: successor_cache( sender* owner ) : my_owner(owner) { // Do not work with the passed pointer here as it may not be fully initialized yet } virtual ~successor_cache() {} void register_successor( successor_type& r ) { typename mutex_type::scoped_lock l(my_mutex, true); if( r.priority() != no_priority ) my_successors.push_front( &r ); else my_successors.push_back( &r ); __TBB_ASSERT( my_owner, "Cache of successors must have an owner." ); if ( r.is_continue_receiver() ) { r.register_predecessor( *my_owner ); } } void remove_successor( successor_type& r ) { typename mutex_type::scoped_lock l(my_mutex, true); for ( successors_type::iterator i = my_successors.begin(); i != my_successors.end(); ++i ) { if ( *i == &r ) { __TBB_ASSERT(my_owner, "Cache of successors must have an owner."); // TODO: check if we need to test for continue_receiver before removing from r. r.remove_predecessor( *my_owner ); my_successors.erase(i); break; } } } bool empty() { typename mutex_type::scoped_lock l(my_mutex, false); return my_successors.empty(); } void clear() { my_successors.clear(); } virtual graph_task* try_put_task( const continue_msg& t ) = 0; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT virtual graph_task* try_put_task( const continue_msg& t, const message_metainfo& metainfo ) = 0; #endif }; // successor_cache< continue_msg > //! 
A cache of successors that are broadcast to template class broadcast_cache : public successor_cache { typedef successor_cache base_type; typedef M mutex_type; typedef typename successor_cache::successors_type successors_type; graph_task* try_put_task_impl( const T& t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo) ) { graph_task * last_task = nullptr; typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true); typename successors_type::iterator i = this->my_successors.begin(); while ( i != this->my_successors.end() ) { graph_task *new_task = (*i)->try_put_task(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); // workaround for icc bug graph& graph_ref = (*i)->graph_reference(); last_task = combine_tasks(graph_ref, last_task, new_task); // enqueue if necessary if(new_task) { ++i; } else { // failed if ( (*i)->register_predecessor(*this->my_owner) ) { i = this->my_successors.erase(i); } else { ++i; } } } return last_task; } public: broadcast_cache( typename base_type::owner_type* owner ): base_type(owner) { // Do not work with the passed pointer here as it may not be fully initialized yet } graph_task* try_put_task( const T &t ) override { return try_put_task_impl(t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT graph_task* try_put_task( const T &t, const message_metainfo& metainfo ) override { return try_put_task_impl(t, metainfo); } #endif // call try_put_task and return list of received tasks bool gather_successful_try_puts( const T &t, graph_task_list& tasks ) { bool is_at_least_one_put_successful = false; typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true); typename successors_type::iterator i = this->my_successors.begin(); while ( i != this->my_successors.end() ) { graph_task * new_task = (*i)->try_put_task(t); if(new_task) { ++i; if(new_task != SUCCESSFULLY_ENQUEUED) { tasks.push_back(*new_task); } is_at_least_one_put_successful = true; } else { // failed if ( 
(*i)->register_predecessor(*this->my_owner) ) { i = this->my_successors.erase(i); } else { ++i; } } } return is_at_least_one_put_successful; } }; //! A cache of successors that are put in a round-robin fashion template class round_robin_cache : public successor_cache { typedef successor_cache base_type; typedef size_t size_type; typedef M mutex_type; typedef typename successor_cache::successors_type successors_type; public: round_robin_cache( typename base_type::owner_type* owner ): base_type(owner) { // Do not work with the passed pointer here as it may not be fully initialized yet } size_type size() { typename mutex_type::scoped_lock l(this->my_mutex, false); return this->my_successors.size(); } private: graph_task* try_put_task_impl( const T &t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo) ) { typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true); typename successors_type::iterator i = this->my_successors.begin(); while ( i != this->my_successors.end() ) { graph_task* new_task = (*i)->try_put_task(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); if ( new_task ) { return new_task; } else { if ( (*i)->register_predecessor(*this->my_owner) ) { i = this->my_successors.erase(i); } else { ++i; } } } return nullptr; } public: graph_task* try_put_task(const T& t) override { return try_put_task_impl(t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT graph_task* try_put_task( const T& t, const message_metainfo& metainfo ) override { return try_put_task_impl(t, metainfo); } #endif }; #endif // __TBB__flow_graph_cache_impl_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_flow_graph_impl.h ================================================ /* Copyright (c) 2005-2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_flow_graph_impl_H #define __TBB_flow_graph_impl_H // #include "../config.h" #include "_task.h" #include "../task_group.h" #include "../task_arena.h" #include "../flow_graph_abstractions.h" #include "../concurrent_priority_queue.h" #include namespace tbb { namespace detail { namespace d2 { class graph_task; static graph_task* const SUCCESSFULLY_ENQUEUED = (graph_task*)-1; typedef unsigned int node_priority_t; static const node_priority_t no_priority = node_priority_t(0); class graph; class graph_node; template class graph_iterator { friend class graph; friend class graph_node; public: typedef size_t size_type; typedef GraphNodeType value_type; typedef GraphNodeType* pointer; typedef GraphNodeType& reference; typedef const GraphNodeType& const_reference; typedef std::forward_iterator_tag iterator_category; //! Copy constructor graph_iterator(const graph_iterator& other) : my_graph(other.my_graph), current_node(other.current_node) {} //! Assignment graph_iterator& operator=(const graph_iterator& other) { if (this != &other) { my_graph = other.my_graph; current_node = other.current_node; } return *this; } //! Dereference reference operator*() const; //! Dereference pointer operator->() const; //! Equality bool operator==(const graph_iterator& other) const { return ((my_graph == other.my_graph) && (current_node == other.current_node)); } #if !__TBB_CPP20_COMPARISONS_PRESENT //! Inequality bool operator!=(const graph_iterator& other) const { return !(operator==(other)); } #endif //! Pre-increment graph_iterator& operator++() { internal_forward(); return *this; } //! 
Post-increment graph_iterator operator++(int) { graph_iterator result = *this; operator++(); return result; } private: // the graph over which we are iterating GraphContainerType *my_graph; // pointer into my_graph's my_nodes list pointer current_node; //! Private initializing constructor for begin() and end() iterators graph_iterator(GraphContainerType *g, bool begin); void internal_forward(); }; // class graph_iterator // flags to modify the behavior of the graph reset(). Can be combined. enum reset_flags { rf_reset_protocol = 0, rf_reset_bodies = 1 << 0, // delete the current node body, reset to a copy of the initial node body. rf_clear_edges = 1 << 1 // delete edges }; void activate_graph(graph& g); void deactivate_graph(graph& g); bool is_graph_active(graph& g); graph_task* prioritize_task(graph& g, graph_task& arena_task); void spawn_in_graph_arena(graph& g, graph_task& arena_task); void enqueue_in_graph_arena(graph &g, graph_task& arena_task); class graph; //! Base class for tasks generated by graph nodes. 
class graph_task : public d1::task { public: graph_task(graph& g, d1::small_object_allocator& allocator, node_priority_t node_priority = no_priority); graph& my_graph; // graph instance the task belongs to // TODO revamp: rename to my_priority node_priority_t priority; template void destruct_and_deallocate(const d1::execution_data& ed); protected: template void finalize(const d1::execution_data& ed); private: // To organize task_list graph_task* my_next{ nullptr }; d1::small_object_allocator my_allocator; d1::wait_tree_vertex_interface* my_reference_vertex; // TODO revamp: elaborate internal interfaces to avoid friends declarations friend class graph_task_list; friend graph_task* prioritize_task(graph& g, graph_task& gt); }; inline bool is_this_thread_in_graph_arena(graph& g); #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT class trackable_messages_graph_task : public graph_task { public: trackable_messages_graph_task(graph& g, d1::small_object_allocator& allocator, node_priority_t node_priority, const std::forward_list& msg_waiters) : graph_task(g, allocator, node_priority) , my_msg_wait_context_vertices(msg_waiters) { auto last_iterator = my_msg_reference_vertices.cbefore_begin(); for (auto& msg_waiter : my_msg_wait_context_vertices) { // If the task is created by the thread outside the graph arena, the lifetime of the thread reference vertex // may be shorter that the lifetime of the task, so thread reference vertex approach cannot be used // and the task should be associated with the msg wait context itself d1::wait_tree_vertex_interface* ref_vertex = is_this_thread_in_graph_arena(g) ? 
r1::get_thread_reference_vertex(msg_waiter) : msg_waiter; last_iterator = my_msg_reference_vertices.emplace_after(last_iterator, ref_vertex); ref_vertex->reserve(1); } } trackable_messages_graph_task(graph& g, d1::small_object_allocator& allocator, node_priority_t node_priority, std::forward_list&& msg_waiters) : graph_task(g, allocator, node_priority) , my_msg_wait_context_vertices(std::move(msg_waiters)) { } const std::forward_list get_msg_wait_context_vertices() const { return my_msg_wait_context_vertices; } protected: template void finalize(const d1::execution_data& ed) { auto wait_context_vertices = std::move(my_msg_wait_context_vertices); auto msg_reference_vertices = std::move(my_msg_reference_vertices); graph_task::finalize(ed); // If there is no thread reference vertices associated with the task // then this task was created by transferring the ownership from other metainfo // instance (e.g. while taking from the buffer) if (msg_reference_vertices.empty()) { for (auto& msg_waiter : wait_context_vertices) { msg_waiter->release(1); } } else { for (auto& msg_waiter : msg_reference_vertices) { msg_waiter->release(1); } } } private: // Each task that holds information about single message wait_contexts should hold two lists // The first one is wait_contexts associated with the message itself. They are needed // to be able to broadcast the list of wait_contexts to the node successors while executing the task. 
// The second list is a list of reference vertices for each wait_context_vertex in the first list // to support the distributed reference counting schema std::forward_list my_msg_wait_context_vertices; std::forward_list my_msg_reference_vertices; }; // class trackable_messages_graph_task #endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT struct graph_task_comparator { bool operator()(const graph_task* left, const graph_task* right) { return left->priority < right->priority; } }; typedef tbb::concurrent_priority_queue graph_task_priority_queue_t; class priority_task_selector : public d1::task { public: priority_task_selector(graph_task_priority_queue_t& priority_queue, d1::small_object_allocator& allocator) : my_priority_queue(priority_queue), my_allocator(allocator), my_task() {} task* execute(d1::execution_data& ed) override { next_task(); __TBB_ASSERT(my_task, nullptr); task* t_next = my_task->execute(ed); my_allocator.delete_object(this, ed); return t_next; } task* cancel(d1::execution_data& ed) override { if (!my_task) { next_task(); } __TBB_ASSERT(my_task, nullptr); task* t_next = my_task->cancel(ed); my_allocator.delete_object(this, ed); return t_next; } private: void next_task() { // TODO revamp: hold functors in priority queue instead of real tasks bool result = my_priority_queue.try_pop(my_task); __TBB_ASSERT_EX(result, "Number of critical tasks for scheduler and tasks" " in graph's priority queue mismatched"); __TBB_ASSERT(my_task && my_task != SUCCESSFULLY_ENQUEUED, "Incorrect task submitted to graph priority queue"); __TBB_ASSERT(my_task->priority != no_priority, "Tasks from graph's priority queue must have priority"); } graph_task_priority_queue_t& my_priority_queue; d1::small_object_allocator my_allocator; graph_task* my_task; }; template class run_and_put_task; template class run_task; //******************************************************************************** // graph tasks helpers 
//******************************************************************************** //! The list of graph tasks class graph_task_list : no_copy { private: graph_task* my_first; graph_task** my_next_ptr; public: //! Construct empty list graph_task_list() : my_first(nullptr), my_next_ptr(&my_first) {} //! True if list is empty; false otherwise. bool empty() const { return !my_first; } //! Push task onto back of list. void push_back(graph_task& task) { task.my_next = nullptr; *my_next_ptr = &task; my_next_ptr = &task.my_next; } //! Pop the front task from the list. graph_task& pop_front() { __TBB_ASSERT(!empty(), "attempt to pop item from empty task_list"); graph_task* result = my_first; my_first = result->my_next; if (!my_first) { my_next_ptr = &my_first; } return *result; } }; //! The graph class /** This class serves as a handle to the graph */ class graph : no_copy, public graph_proxy { friend class graph_node; void prepare_task_arena(bool reinit = false) { if (reinit) { __TBB_ASSERT(my_task_arena, "task arena is nullptr"); my_task_arena->terminate(); my_task_arena->initialize(task_arena::attach()); } else { __TBB_ASSERT(my_task_arena == nullptr, "task arena is not nullptr"); my_task_arena = new task_arena(task_arena::attach()); } if (!my_task_arena->is_active()) // failed to attach my_task_arena->initialize(); // create a new, default-initialized arena __TBB_ASSERT(my_task_arena->is_active(), "task arena is not active"); } public: //! Constructs a graph with isolated task_group_context graph(); //! Constructs a graph with use_this_context as context explicit graph(task_group_context& use_this_context); //! Destroys the graph. /** Calls wait_for_all, then destroys the root task and context. */ ~graph(); //! Used to register that an external entity may still interact with the graph. /** The graph will not return from wait_for_all until a matching number of release_wait calls is made. */ void reserve_wait() override; //! 
Deregisters an external entity that may have interacted with the graph. /** The graph will not return from wait_for_all until all the number of reserve_wait calls matches the number of release_wait calls. */ void release_wait() override; //! Wait until graph is idle and the number of release_wait calls equals to the number of //! reserve_wait calls. /** The waiting thread will go off and steal work while it is blocked in the wait_for_all. */ void wait_for_all() { cancelled = false; caught_exception = false; try_call([this] { my_task_arena->execute([this] { d1::wait(my_wait_context_vertex.get_context(), *my_context); }); cancelled = my_context->is_group_execution_cancelled(); }).on_exception([this] { my_context->reset(); caught_exception = true; cancelled = true; }); // TODO: the "if" condition below is just a work-around to support the concurrent wait // mode. The cancellation and exception mechanisms are still broken in this mode. // Consider using task group not to re-implement the same functionality. if (!(my_context->traits() & task_group_context::concurrent_wait)) { my_context->reset(); // consistent with behavior in catch() } } // TODO revamp: consider adding getter for task_group_context. // ITERATORS template friend class graph_iterator; // Graph iterator typedefs typedef graph_iterator iterator; typedef graph_iterator const_iterator; // Graph iterator constructors //! start iterator iterator begin(); //! end iterator iterator end(); //! start const iterator const_iterator begin() const; //! end const iterator const_iterator end() const; //! start const iterator const_iterator cbegin() const; //! end const iterator const_iterator cend() const; // thread-unsafe state reset. void reset(reset_flags f = rf_reset_protocol); //! cancels execution of the associated task_group_context void cancel(); //! 
return status of graph execution bool is_cancelled() { return cancelled; } bool exception_thrown() { return caught_exception; } private: d1::wait_context_vertex my_wait_context_vertex; task_group_context *my_context; bool own_context; bool cancelled; bool caught_exception; bool my_is_active; graph_node *my_nodes, *my_nodes_last; tbb::spin_mutex nodelist_mutex; void register_node(graph_node *n); void remove_node(graph_node *n); task_arena* my_task_arena; graph_task_priority_queue_t my_priority_queue; d1::wait_context_vertex& get_wait_context_vertex() { return my_wait_context_vertex; } friend void activate_graph(graph& g); friend void deactivate_graph(graph& g); friend bool is_graph_active(graph& g); friend bool is_this_thread_in_graph_arena(graph& g); friend graph_task* prioritize_task(graph& g, graph_task& arena_task); friend void spawn_in_graph_arena(graph& g, graph_task& arena_task); friend void enqueue_in_graph_arena(graph &g, graph_task& arena_task); friend class d1::task_arena_base; friend class graph_task; template friend class receiver; }; // class graph template inline void graph_task::destruct_and_deallocate(const d1::execution_data& ed) { auto allocator = my_allocator; // TODO: investigate if direct call of derived destructor gives any benefits. 
this->~graph_task(); allocator.deallocate(static_cast(this), ed); } template inline void graph_task::finalize(const d1::execution_data& ed) { d1::wait_tree_vertex_interface* reference_vertex = my_reference_vertex; destruct_and_deallocate(ed); reference_vertex->release(); } inline graph_task::graph_task(graph& g, d1::small_object_allocator& allocator, node_priority_t node_priority) : my_graph(g) , priority(node_priority) , my_allocator(allocator) { // If the task is created by the thread outside the graph arena, the lifetime of the thread reference vertex // may be shorter that the lifetime of the task, so thread reference vertex approach cannot be used // and the task should be associated with the graph wait context itself // TODO: consider how reference counting can be improved for such a use case. Most common example is the async_node d1::wait_context_vertex* graph_wait_context_vertex = &my_graph.get_wait_context_vertex(); my_reference_vertex = is_this_thread_in_graph_arena(g) ? r1::get_thread_reference_vertex(graph_wait_context_vertex) : graph_wait_context_vertex; __TBB_ASSERT(my_reference_vertex, nullptr); my_reference_vertex->reserve(); } //******************************************************************************** // end of graph tasks helpers //******************************************************************************** #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET class get_graph_helper; #endif //! The base of all graph nodes. class graph_node : no_copy { friend class graph; template friend class graph_iterator; #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET friend class get_graph_helper; #endif protected: graph& my_graph; graph& graph_reference() const { // TODO revamp: propagate graph_reference() method to all the reference places. return my_graph; } graph_node* next = nullptr; graph_node* prev = nullptr; public: explicit graph_node(graph& g); virtual ~graph_node(); protected: // performs the reset on an individual node. 
virtual void reset_node(reset_flags f = rf_reset_protocol) = 0; }; // class graph_node inline void activate_graph(graph& g) { g.my_is_active = true; } inline void deactivate_graph(graph& g) { g.my_is_active = false; } inline bool is_graph_active(graph& g) { return g.my_is_active; } inline bool is_this_thread_in_graph_arena(graph& g) { __TBB_ASSERT(g.my_task_arena && g.my_task_arena->is_active(), nullptr); return r1::execution_slot(*g.my_task_arena) != d1::slot_id(-1); } inline graph_task* prioritize_task(graph& g, graph_task& gt) { if( no_priority == gt.priority ) return > //! Non-preemptive priority pattern. The original task is submitted as a work item to the //! priority queue, and a new critical task is created to take and execute a work item with //! the highest known priority. The reference counting responsibility is transferred to //! the new task. // A newly created small_object_allocator should be used to allocate the priority_task_selector // instead of the allocator, associated with gt since gt can be allocated by another thread d1::small_object_allocator allocator; d1::task* critical_task = allocator.new_object(g.my_priority_queue, allocator); __TBB_ASSERT( critical_task, "bad_alloc?" ); g.my_priority_queue.push(>); using tbb::detail::d1::submit; submit( *critical_task, *g.my_task_arena, *g.my_context, /*as_critical=*/true ); return nullptr; } //! Spawns a task inside graph arena inline void spawn_in_graph_arena(graph& g, graph_task& arena_task) { if (is_graph_active(g)) { d1::task* gt = prioritize_task(g, arena_task); if( !gt ) return; __TBB_ASSERT(g.my_task_arena && g.my_task_arena->is_active(), nullptr); submit( *gt, *g.my_task_arena, *g.my_context #if __TBB_PREVIEW_CRITICAL_TASKS , /*as_critical=*/false #endif ); } } // TODO revamp: unify *_in_graph_arena functions //! 
Enqueues a task inside graph arena inline void enqueue_in_graph_arena(graph &g, graph_task& arena_task) { if (is_graph_active(g)) { __TBB_ASSERT( g.my_task_arena && g.my_task_arena->is_active(), "Is graph's arena initialized and active?" ); // TODO revamp: decide on the approach that does not postpone critical task if( d1::task* gt = prioritize_task(g, arena_task) ) submit( *gt, *g.my_task_arena, *g.my_context, /*as_critical=*/false); } } } // namespace d2 } // namespace detail } // namespace tbb #endif // __TBB_flow_graph_impl_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_flow_graph_indexer_impl.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB__flow_graph_indexer_impl_H #define __TBB__flow_graph_indexer_impl_H #ifndef __TBB_flow_graph_H #error Do not #include this internal file directly; use public TBB headers instead. #endif // included in namespace tbb::detail::d2 #include "_flow_graph_types_impl.h" // Output of the indexer_node is a tbb::flow::tagged_msg, and will be of // the form tagged_msg // where the value of tag will indicate which result was put to the // successor. 
template graph_task* do_try_put(const T &v, void *p __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { typename IndexerNodeBaseType::output_type o(K, v); return reinterpret_cast(p)->try_put_task(&o __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } template struct indexer_helper { template static inline void set_indexer_node_pointer(PortTuple &my_input, IndexerNodeBaseType *p, graph& g) { typedef typename std::tuple_element::type T; auto indexer_node_put_task = do_try_put; std::get(my_input).set_up(p, indexer_node_put_task, g); indexer_helper::template set_indexer_node_pointer(my_input, p, g); } }; template struct indexer_helper { template static inline void set_indexer_node_pointer(PortTuple &my_input, IndexerNodeBaseType *p, graph& g) { typedef typename std::tuple_element<0, TupleTypes>::type T; auto indexer_node_put_task = do_try_put; std::get<0>(my_input).set_up(p, indexer_node_put_task, g); } }; template class indexer_input_port : public receiver { private: void* my_indexer_ptr; typedef graph_task* (* forward_function_ptr)(T const &, void* __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo&)); forward_function_ptr my_try_put_task; graph* my_graph; public: void set_up(void* p, forward_function_ptr f, graph& g) { my_indexer_ptr = p; my_try_put_task = f; my_graph = &g; } protected: template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; graph_task* try_put_task(const T &v) override { return my_try_put_task(v, my_indexer_ptr __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT graph_task* try_put_task(const T& v, const message_metainfo& metainfo) override { return my_try_put_task(v, my_indexer_ptr, metainfo); } #endif graph& graph_reference() const override { return *my_graph; } }; template class indexer_node_FE { public: static const int N = std::tuple_size::value; typedef OutputType output_type; typedef InputTuple 
input_type; // Some versions of Intel(R) C++ Compiler fail to generate an implicit constructor for the class which has std::tuple as a member. indexer_node_FE() : my_inputs() {} input_type &input_ports() { return my_inputs; } protected: input_type my_inputs; }; //! indexer_node_base template class indexer_node_base : public graph_node, public indexer_node_FE, public sender { protected: using graph_node::my_graph; public: static const size_t N = std::tuple_size::value; typedef OutputType output_type; typedef StructTypes tuple_types; typedef typename sender::successor_type successor_type; typedef indexer_node_FE input_ports_type; private: // ----------- Aggregator ------------ enum op_type { reg_succ, rem_succ, try__put_task }; typedef indexer_node_base class_type; class indexer_node_base_operation : public d1::aggregated_operation { public: char type; union { output_type const *my_arg; successor_type *my_succ; graph_task* bypass_t; }; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT message_metainfo const* metainfo; #endif indexer_node_base_operation(const output_type* e, op_type t) : type(char(t)), my_arg(e) __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(nullptr)) {} #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT indexer_node_base_operation(const output_type* e, op_type t, const message_metainfo& info) : type(char(t)), my_arg(e), metainfo(&info) {} #endif indexer_node_base_operation(const successor_type &s, op_type t) : type(char(t)), my_succ(const_cast(&s)) {} }; typedef d1::aggregating_functor handler_type; friend class d1::aggregating_functor; d1::aggregator my_aggregator; void handle_operations(indexer_node_base_operation* op_list) { indexer_node_base_operation *current; while(op_list) { current = op_list; op_list = op_list->next; switch(current->type) { case reg_succ: my_successors.register_successor(*(current->my_succ)); current->status.store( SUCCEEDED, std::memory_order_release); break; case rem_succ: my_successors.remove_successor(*(current->my_succ)); 
current->status.store( SUCCEEDED, std::memory_order_release); break; case try__put_task: { current->bypass_t = my_successors.try_put_task(*(current->my_arg) __TBB_FLOW_GRAPH_METAINFO_ARG(*(current->metainfo))); current->status.store( SUCCEEDED, std::memory_order_release); // return of try_put_task actual return value } break; } } } // ---------- end aggregator ----------- public: indexer_node_base(graph& g) : graph_node(g), input_ports_type(), my_successors(this) { indexer_helper::set_indexer_node_pointer(this->my_inputs, this, g); my_aggregator.initialize_handler(handler_type(this)); } indexer_node_base(const indexer_node_base& other) : graph_node(other.my_graph), input_ports_type(), sender(), my_successors(this) { indexer_helper::set_indexer_node_pointer(this->my_inputs, this, other.my_graph); my_aggregator.initialize_handler(handler_type(this)); } bool register_successor(successor_type &r) override { indexer_node_base_operation op_data(r, reg_succ); my_aggregator.execute(&op_data); return op_data.status == SUCCEEDED; } bool remove_successor( successor_type &r) override { indexer_node_base_operation op_data(r, rem_succ); my_aggregator.execute(&op_data); return op_data.status == SUCCEEDED; } // not a virtual method in this class graph_task* try_put_task(output_type const *v __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { indexer_node_base_operation op_data(v, try__put_task __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); my_aggregator.execute(&op_data); return op_data.bypass_t; } protected: void reset_node(reset_flags f) override { if(f & rf_clear_edges) { my_successors.clear(); } } private: broadcast_cache my_successors; }; //indexer_node_base template struct input_types; template struct input_types<1, InputTuple> { typedef typename std::tuple_element<0, InputTuple>::type first_type; typedef tagged_msg type; }; template struct input_types<2, InputTuple> { typedef typename std::tuple_element<0, InputTuple>::type first_type; typedef typename 
std::tuple_element<1, InputTuple>::type second_type; typedef tagged_msg type; }; template struct input_types<3, InputTuple> { typedef typename std::tuple_element<0, InputTuple>::type first_type; typedef typename std::tuple_element<1, InputTuple>::type second_type; typedef typename std::tuple_element<2, InputTuple>::type third_type; typedef tagged_msg type; }; template struct input_types<4, InputTuple> { typedef typename std::tuple_element<0, InputTuple>::type first_type; typedef typename std::tuple_element<1, InputTuple>::type second_type; typedef typename std::tuple_element<2, InputTuple>::type third_type; typedef typename std::tuple_element<3, InputTuple>::type fourth_type; typedef tagged_msg type; }; template struct input_types<5, InputTuple> { typedef typename std::tuple_element<0, InputTuple>::type first_type; typedef typename std::tuple_element<1, InputTuple>::type second_type; typedef typename std::tuple_element<2, InputTuple>::type third_type; typedef typename std::tuple_element<3, InputTuple>::type fourth_type; typedef typename std::tuple_element<4, InputTuple>::type fifth_type; typedef tagged_msg type; }; template struct input_types<6, InputTuple> { typedef typename std::tuple_element<0, InputTuple>::type first_type; typedef typename std::tuple_element<1, InputTuple>::type second_type; typedef typename std::tuple_element<2, InputTuple>::type third_type; typedef typename std::tuple_element<3, InputTuple>::type fourth_type; typedef typename std::tuple_element<4, InputTuple>::type fifth_type; typedef typename std::tuple_element<5, InputTuple>::type sixth_type; typedef tagged_msg type; }; template struct input_types<7, InputTuple> { typedef typename std::tuple_element<0, InputTuple>::type first_type; typedef typename std::tuple_element<1, InputTuple>::type second_type; typedef typename std::tuple_element<2, InputTuple>::type third_type; typedef typename std::tuple_element<3, InputTuple>::type fourth_type; typedef typename std::tuple_element<4, 
InputTuple>::type fifth_type; typedef typename std::tuple_element<5, InputTuple>::type sixth_type; typedef typename std::tuple_element<6, InputTuple>::type seventh_type; typedef tagged_msg type; }; template struct input_types<8, InputTuple> { typedef typename std::tuple_element<0, InputTuple>::type first_type; typedef typename std::tuple_element<1, InputTuple>::type second_type; typedef typename std::tuple_element<2, InputTuple>::type third_type; typedef typename std::tuple_element<3, InputTuple>::type fourth_type; typedef typename std::tuple_element<4, InputTuple>::type fifth_type; typedef typename std::tuple_element<5, InputTuple>::type sixth_type; typedef typename std::tuple_element<6, InputTuple>::type seventh_type; typedef typename std::tuple_element<7, InputTuple>::type eighth_type; typedef tagged_msg type; }; template struct input_types<9, InputTuple> { typedef typename std::tuple_element<0, InputTuple>::type first_type; typedef typename std::tuple_element<1, InputTuple>::type second_type; typedef typename std::tuple_element<2, InputTuple>::type third_type; typedef typename std::tuple_element<3, InputTuple>::type fourth_type; typedef typename std::tuple_element<4, InputTuple>::type fifth_type; typedef typename std::tuple_element<5, InputTuple>::type sixth_type; typedef typename std::tuple_element<6, InputTuple>::type seventh_type; typedef typename std::tuple_element<7, InputTuple>::type eighth_type; typedef typename std::tuple_element<8, InputTuple>::type nineth_type; typedef tagged_msg type; }; template struct input_types<10, InputTuple> { typedef typename std::tuple_element<0, InputTuple>::type first_type; typedef typename std::tuple_element<1, InputTuple>::type second_type; typedef typename std::tuple_element<2, InputTuple>::type third_type; typedef typename std::tuple_element<3, InputTuple>::type fourth_type; typedef typename std::tuple_element<4, InputTuple>::type fifth_type; typedef typename std::tuple_element<5, InputTuple>::type sixth_type; typedef 
typename std::tuple_element<6, InputTuple>::type seventh_type; typedef typename std::tuple_element<7, InputTuple>::type eighth_type; typedef typename std::tuple_element<8, InputTuple>::type nineth_type; typedef typename std::tuple_element<9, InputTuple>::type tenth_type; typedef tagged_msg type; }; // type generators template struct indexer_types : public input_types::value, OutputTuple> { static const int N = std::tuple_size::value; typedef typename input_types::type output_type; typedef typename wrap_tuple_elements::type input_ports_type; typedef indexer_node_FE indexer_FE_type; typedef indexer_node_base indexer_base_type; }; template class unfolded_indexer_node : public indexer_types::indexer_base_type { public: typedef typename indexer_types::input_ports_type input_ports_type; typedef OutputTuple tuple_types; typedef typename indexer_types::output_type output_type; private: typedef typename indexer_types::indexer_base_type base_type; public: unfolded_indexer_node(graph& g) : base_type(g) {} unfolded_indexer_node(const unfolded_indexer_node &other) : base_type(other) {} }; #endif /* __TBB__flow_graph_indexer_impl_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_flow_graph_item_buffer_impl.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB__flow_graph_item_buffer_impl_H #define __TBB__flow_graph_item_buffer_impl_H #ifndef __TBB_flow_graph_H #error Do not #include this internal file directly; use public TBB headers instead. #endif #include "_aligned_space.h" // in namespace tbb::flow::interfaceX (included in _flow_graph_node_impl.h) //! Expandable buffer of items. The possible operations are push, pop, //* tests for empty and so forth. No mutual exclusion is built in. //* objects are constructed into and explicitly-destroyed. get_my_item gives // a read-only reference to the item in the buffer. set_my_item may be called // with either an empty or occupied slot. template > class item_buffer { public: typedef T item_type; enum buffer_item_state { no_item=0, has_item=1, reserved_item=2 }; protected: struct aligned_space_item { item_type item; buffer_item_state state; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT message_metainfo metainfo; #endif }; typedef size_t size_type; typedef aligned_space buffer_item_type; typedef typename allocator_traits::template rebind_alloc allocator_type; buffer_item_type *my_array; size_type my_array_size; static const size_type initial_buffer_size = 4; size_type my_head; size_type my_tail; bool buffer_empty() const { return my_head == my_tail; } aligned_space_item &element(size_type i) { __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->state))%alignment_of::value), nullptr); __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->item))%alignment_of::value), nullptr); #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->metainfo))%alignment_of::value), nullptr); #endif return *my_array[i & (my_array_size - 1) ].begin(); } const aligned_space_item &element(size_type i) const { __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->state))%alignment_of::value), nullptr); 
__TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->item))%alignment_of::value), nullptr); #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT __TBB_ASSERT(!(size_type(&(my_array[i&(my_array_size-1)].begin()->metainfo))%alignment_of::value), nullptr); #endif return *my_array[i & (my_array_size-1)].begin(); } bool my_item_valid(size_type i) const { return (i < my_tail) && (i >= my_head) && (element(i).state != no_item); } #if TBB_USE_ASSERT bool my_item_reserved(size_type i) const { return element(i).state == reserved_item; } #endif // object management in buffer const item_type &get_my_item(size_t i) const { __TBB_ASSERT(my_item_valid(i),"attempt to get invalid item"); return element(i).item; } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT message_metainfo& get_my_metainfo(size_t i) { __TBB_ASSERT(my_item_valid(i), "attempt to get invalid item"); return element(i).metainfo; } #endif // may be called with an empty slot or a slot that has already been constructed into. void set_my_item(size_t i, const item_type &o __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { if(element(i).state != no_item) { destroy_item(i); } new(&(element(i).item)) item_type(o); element(i).state = has_item; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT new(&element(i).metainfo) message_metainfo(metainfo); for (auto& waiter : metainfo.waiters()) { waiter->reserve(1); } #endif } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT void set_my_item(size_t i, const item_type& o, message_metainfo&& metainfo) { if(element(i).state != no_item) { destroy_item(i); } new(&(element(i).item)) item_type(o); new(&element(i).metainfo) message_metainfo(std::move(metainfo)); // Skipping the reservation on metainfo.waiters since the ownership // is moving from metainfo to the cache element(i).state = has_item; } #endif // destructively-fetch an object from the buffer #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT void fetch_item(size_t i, item_type& o, message_metainfo& metainfo) { 
__TBB_ASSERT(my_item_valid(i), "Trying to fetch an empty slot"); o = get_my_item(i); // could have std::move assign semantics metainfo = std::move(get_my_metainfo(i)); destroy_item(i); } #else void fetch_item(size_t i, item_type &o) { __TBB_ASSERT(my_item_valid(i), "Trying to fetch an empty slot"); o = get_my_item(i); // could have std::move assign semantics destroy_item(i); } #endif // move an existing item from one slot to another. The moved-to slot must be unoccupied, // the moved-from slot must exist and not be reserved. The after, from will be empty, // to will be occupied but not reserved void move_item(size_t to, size_t from) { __TBB_ASSERT(!my_item_valid(to), "Trying to move to a non-empty slot"); __TBB_ASSERT(my_item_valid(from), "Trying to move from an empty slot"); // could have std::move semantics set_my_item(to, get_my_item(from) __TBB_FLOW_GRAPH_METAINFO_ARG(get_my_metainfo(from))); destroy_item(from); } // put an item in an empty slot. Return true if successful, else false #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT template bool place_item(size_t here, const item_type &me, Metainfo&& metainfo) { #if !TBB_DEPRECATED_SEQUENCER_DUPLICATES if(my_item_valid(here)) return false; #endif set_my_item(here, me, std::forward(metainfo)); return true; } #else bool place_item(size_t here, const item_type &me) { #if !TBB_DEPRECATED_SEQUENCER_DUPLICATES if(my_item_valid(here)) return false; #endif set_my_item(here, me); return true; } #endif // could be implemented with std::move semantics void swap_items(size_t i, size_t j) { __TBB_ASSERT(my_item_valid(i) && my_item_valid(j), "attempt to swap invalid item(s)"); item_type temp = get_my_item(i); #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT message_metainfo temp_metainfo = get_my_metainfo(i); set_my_item(i, get_my_item(j), get_my_metainfo(j)); set_my_item(j, temp, temp_metainfo); #else set_my_item(i, get_my_item(j)); set_my_item(j, temp); #endif } void destroy_item(size_type i) { __TBB_ASSERT(my_item_valid(i), 
"destruction of invalid item"); auto& e = element(i); e.item.~item_type(); e.state = no_item; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT for (auto& msg_waiter : e.metainfo.waiters()) { msg_waiter->release(1); } e.metainfo.~message_metainfo(); #endif } // returns the front element const item_type& front() const { __TBB_ASSERT(my_item_valid(my_head), "attempt to fetch head non-item"); return get_my_item(my_head); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT const message_metainfo& front_metainfo() const { __TBB_ASSERT(my_item_valid(my_head), "attempt to fetch head non-item"); return element(my_head).metainfo; } #endif // returns the back element const item_type& back() const { __TBB_ASSERT(my_item_valid(my_tail - 1), "attempt to fetch head non-item"); return get_my_item(my_tail - 1); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT const message_metainfo& back_metainfo() const { __TBB_ASSERT(my_item_valid(my_tail - 1), "attempt to fetch head non-item"); return element(my_tail - 1).metainfo; } #endif // following methods are for reservation of the front of a buffer. void reserve_item(size_type i) { __TBB_ASSERT(my_item_valid(i) && !my_item_reserved(i), "item cannot be reserved"); element(i).state = reserved_item; } void release_item(size_type i) { __TBB_ASSERT(my_item_reserved(i), "item is not reserved"); element(i).state = has_item; } void destroy_front() { destroy_item(my_head); ++my_head; } void destroy_back() { destroy_item(my_tail-1); --my_tail; } // we have to be able to test against a new tail value without changing my_tail // grow_array doesn't work if we change my_tail when the old array is too small size_type size(size_t new_tail = 0) { return (new_tail ? new_tail : my_tail) - my_head; } size_type capacity() { return my_array_size; } // sequencer_node does not use this method, so we don't // need a version that passes in the new_tail value. bool buffer_full() { return size() >= capacity(); } //! Grows the internal array. 
void grow_my_array( size_t minimum_size ) { // test that we haven't made the structure inconsistent. __TBB_ASSERT(capacity() >= my_tail - my_head, "total items exceed capacity"); size_type new_size = my_array_size ? 2*my_array_size : initial_buffer_size; while( new_sizestate = no_item; } for( size_type i=my_head; iitem); (void)new(new_space) item_type(get_my_item(i)); new_array[i&(new_size-1)].begin()->state = element(i).state; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT char* meta_space = (char *)&(new_array[i&(new_size-1)].begin()->metainfo); ::new(meta_space) message_metainfo(std::move(element(i).metainfo)); #endif } } clean_up_buffer(/*reset_pointers*/false); my_array = new_array; my_array_size = new_size; } bool push_back(item_type& v __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { if (buffer_full()) { grow_my_array(size() + 1); } set_my_item(my_tail, v __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); ++my_tail; return true; } bool pop_back(item_type& v __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& metainfo)) { if (!my_item_valid(my_tail - 1)) { return false; } auto& e = element(my_tail - 1); v = e.item; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT metainfo = std::move(e.metainfo); #endif destroy_back(); return true; } bool pop_front(item_type& v __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& metainfo)) { if (!my_item_valid(my_head)) { return false; } auto& e = element(my_head); v = e.item; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT metainfo = std::move(e.metainfo); #endif destroy_front(); return true; } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT bool pop_back(item_type& v) { message_metainfo metainfo; return pop_back(v, metainfo); } bool pop_front(item_type& v) { message_metainfo metainfo; return pop_front(v, metainfo); } #endif // This is used both for reset and for grow_my_array. In the case of grow_my_array // we want to retain the values of the head and tail. 
void clean_up_buffer(bool reset_pointers) { if (my_array) { for( size_type i=my_head; i > class reservable_item_buffer : public item_buffer { protected: using item_buffer::my_item_valid; using item_buffer::my_head; public: reservable_item_buffer() : item_buffer(), my_reserved(false) {} void reset() {my_reserved = false; item_buffer::reset(); } protected: bool reserve_front(T &v) { if(my_reserved || !my_item_valid(this->my_head)) return false; my_reserved = true; // reserving the head v = this->front(); this->reserve_item(this->my_head); return true; } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT bool reserve_front(T& v, message_metainfo& metainfo) { if (my_reserved || !my_item_valid(this->my_head)) return false; my_reserved = true; // reserving the head v = this->front(); metainfo = this->front_metainfo(); this->reserve_item(this->my_head); return true; } #endif void consume_front() { __TBB_ASSERT(my_reserved, "Attempt to consume a non-reserved item"); this->destroy_front(); my_reserved = false; } void release_front() { __TBB_ASSERT(my_reserved, "Attempt to release a non-reserved item"); this->release_item(this->my_head); my_reserved = false; } bool my_reserved; }; #endif // __TBB__flow_graph_item_buffer_impl_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_flow_graph_join_impl.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/

#ifndef __TBB__flow_graph_join_impl_H
#define __TBB__flow_graph_join_impl_H

#ifndef __TBB_flow_graph_H
#error Do not #include this internal file directly; use public TBB headers instead.
#endif

// included into namespace tbb::detail::d2

// Common base of the join-node forwarding interfaces: holds a reference to the owning graph.
struct forwarding_base : no_assign {
    forwarding_base(graph &g) : graph_ref(g) {}
    virtual ~forwarding_base() {}
    graph& graph_ref;
};

// Forwarding interface whose decrement_port_count takes a handle_task flag.
struct queueing_forwarding_base : forwarding_base {
    using forwarding_base::forwarding_base;
    // decrement_port_count may create a forwarding task.  If we cannot handle the task
    // ourselves, ask decrement_port_count to deal with it.
    virtual graph_task* decrement_port_count(bool handle_task) = 0;
};

// Forwarding interface with both decrement_port_count and increment_port_count.
struct reserving_forwarding_base : forwarding_base {
    using forwarding_base::forwarding_base;
    // decrement_port_count may create a forwarding task.  If we cannot handle the task
    // ourselves, ask decrement_port_count to deal with it.
    virtual graph_task* decrement_port_count() = 0;
    virtual void increment_port_count() = 0;
};

// specialization that lets us keep a copy of the current_key for building results.
// KeyType can be a reference type.
template struct matching_forwarding_base : public forwarding_base { typedef typename std::decay::type current_key_type; matching_forwarding_base(graph &g) : forwarding_base(g) { } virtual graph_task* increment_key_count(current_key_type const & /*t*/) = 0; current_key_type current_key; // so ports can refer to FE's desired items }; template< int N > struct join_helper { template< typename TupleType, typename PortType > static inline void set_join_node_pointer(TupleType &my_input, PortType *port) { std::get( my_input ).set_join_node_pointer(port); join_helper::set_join_node_pointer( my_input, port ); } template< typename TupleType > static inline void consume_reservations( TupleType &my_input ) { std::get( my_input ).consume(); join_helper::consume_reservations( my_input ); } template< typename TupleType > static inline void release_my_reservation( TupleType &my_input ) { std::get( my_input ).release(); } template static inline void release_reservations( TupleType &my_input) { join_helper::release_reservations(my_input); release_my_reservation(my_input); } template< typename InputTuple, typename OutputTuple > static inline bool reserve( InputTuple &my_input, OutputTuple &out) { if ( !std::get( my_input ).reserve( std::get( out ) ) ) return false; if ( !join_helper::reserve( my_input, out ) ) { release_my_reservation( my_input ); return false; } return true; } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT template static inline bool reserve(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { message_metainfo element_metainfo; if (!std::get(my_input).reserve(std::get(out), element_metainfo)) return false; if (!join_helper::reserve(my_input, out, metainfo)) { release_my_reservation(my_input); return false; } metainfo.merge(element_metainfo); return true; } #endif template static inline bool get_my_item( InputTuple &my_input, OutputTuple &out) { bool res = std::get(my_input).get_item(std::get(out) ); // may fail return 
join_helper::get_my_item(my_input, out) && res; // do get on other inputs before returning } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT template static inline bool get_my_item(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { message_metainfo element_metainfo; bool res = std::get(my_input).get_item(std::get(out), element_metainfo); metainfo.merge(element_metainfo); return join_helper::get_my_item(my_input, out, metainfo) && res; } #endif template static inline bool get_items(InputTuple &my_input, OutputTuple &out) { return get_my_item(my_input, out); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT template static inline bool get_items(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { return get_my_item(my_input, out, metainfo); } #endif template static inline void reset_my_port(InputTuple &my_input) { join_helper::reset_my_port(my_input); std::get(my_input).reset_port(); } template static inline void reset_ports(InputTuple& my_input) { reset_my_port(my_input); } template static inline void set_key_functors(InputTuple &my_input, KeyFuncTuple &my_key_funcs) { std::get(my_input).set_my_key_func(std::get(my_key_funcs)); std::get(my_key_funcs) = nullptr; join_helper::set_key_functors(my_input, my_key_funcs); } template< typename KeyFuncTuple> static inline void copy_key_functors(KeyFuncTuple &my_inputs, KeyFuncTuple &other_inputs) { __TBB_ASSERT( std::get(other_inputs).get_my_key_func(), "key matching join node should not be instantiated without functors." 
); std::get(my_inputs).set_my_key_func(std::get(other_inputs).get_my_key_func()->clone()); join_helper::copy_key_functors(my_inputs, other_inputs); } template static inline void reset_inputs(InputTuple &my_input, reset_flags f) { join_helper::reset_inputs(my_input, f); std::get(my_input).reset_receiver(f); } }; // join_helper template< > struct join_helper<1> { template< typename TupleType, typename PortType > static inline void set_join_node_pointer(TupleType &my_input, PortType *port) { std::get<0>( my_input ).set_join_node_pointer(port); } template< typename TupleType > static inline void consume_reservations( TupleType &my_input ) { std::get<0>( my_input ).consume(); } template< typename TupleType > static inline void release_my_reservation( TupleType &my_input ) { std::get<0>( my_input ).release(); } template static inline void release_reservations( TupleType &my_input) { release_my_reservation(my_input); } template< typename InputTuple, typename OutputTuple > static inline bool reserve( InputTuple &my_input, OutputTuple &out) { return std::get<0>( my_input ).reserve( std::get<0>( out ) ); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT template static inline bool reserve(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { message_metainfo element_metainfo; bool result = std::get<0>(my_input).reserve(std::get<0>(out), element_metainfo); metainfo.merge(element_metainfo); return result; } #endif template static inline bool get_my_item( InputTuple &my_input, OutputTuple &out) { return std::get<0>(my_input).get_item(std::get<0>(out)); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT template static inline bool get_my_item(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { message_metainfo element_metainfo; bool res = std::get<0>(my_input).get_item(std::get<0>(out), element_metainfo); metainfo.merge(element_metainfo); return res; } #endif template static inline bool get_items(InputTuple &my_input, OutputTuple &out) { return 
get_my_item(my_input, out); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT template static inline bool get_items(InputTuple& my_input, OutputTuple& out, message_metainfo& metainfo) { return get_my_item(my_input, out, metainfo); } #endif template static inline void reset_my_port(InputTuple &my_input) { std::get<0>(my_input).reset_port(); } template static inline void reset_ports(InputTuple& my_input) { reset_my_port(my_input); } template static inline void set_key_functors(InputTuple &my_input, KeyFuncTuple &my_key_funcs) { std::get<0>(my_input).set_my_key_func(std::get<0>(my_key_funcs)); std::get<0>(my_key_funcs) = nullptr; } template< typename KeyFuncTuple> static inline void copy_key_functors(KeyFuncTuple &my_inputs, KeyFuncTuple &other_inputs) { __TBB_ASSERT( std::get<0>(other_inputs).get_my_key_func(), "key matching join node should not be instantiated without functors." ); std::get<0>(my_inputs).set_my_key_func(std::get<0>(other_inputs).get_my_key_func()->clone()); } template static inline void reset_inputs(InputTuple &my_input, reset_flags f) { std::get<0>(my_input).reset_receiver(f); } }; // join_helper<1> //! 
The two-phase join port template< typename T > class reserving_port : public receiver { public: typedef T input_type; typedef typename receiver::predecessor_type predecessor_type; private: // ----------- Aggregator ------------ enum op_type { reg_pred, rem_pred, res_item, rel_res, con_res }; typedef reserving_port class_type; class reserving_port_operation : public d1::aggregated_operation { public: char type; union { T *my_arg; predecessor_type *my_pred; }; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT message_metainfo* metainfo; #endif reserving_port_operation(const T& e, op_type t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& info)) : type(char(t)), my_arg(const_cast(&e)) __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(&info)) {} #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT reserving_port_operation(const T& e, op_type t) : type(char(t)), my_arg(const_cast(&e)), metainfo(nullptr) {} #endif reserving_port_operation(const predecessor_type &s, op_type t) : type(char(t)), my_pred(const_cast(&s)) {} reserving_port_operation(op_type t) : type(char(t)) {} }; typedef d1::aggregating_functor handler_type; friend class d1::aggregating_functor; d1::aggregator my_aggregator; void handle_operations(reserving_port_operation* op_list) { reserving_port_operation *current; bool was_missing_predecessors = false; while(op_list) { current = op_list; op_list = op_list->next; switch(current->type) { case reg_pred: was_missing_predecessors = my_predecessors.empty(); my_predecessors.add(*(current->my_pred)); if ( was_missing_predecessors ) { (void) my_join->decrement_port_count(); // may try to forward } current->status.store( SUCCEEDED, std::memory_order_release); break; case rem_pred: if ( !my_predecessors.empty() ) { my_predecessors.remove(*(current->my_pred)); if ( my_predecessors.empty() ) // was the last predecessor my_join->increment_port_count(); } // TODO: consider returning failure if there were no predecessors to remove current->status.store( SUCCEEDED, std::memory_order_release ); 
break; case res_item: if ( reserved ) { current->status.store( FAILED, std::memory_order_release); } else { bool reserve_result = false; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT if (current->metainfo) { reserve_result = my_predecessors.try_reserve(*(current->my_arg), *(current->metainfo)); } else #endif { reserve_result = my_predecessors.try_reserve(*(current->my_arg)); } if (reserve_result) { reserved = true; current->status.store( SUCCEEDED, std::memory_order_release); } else { if ( my_predecessors.empty() ) { my_join->increment_port_count(); } current->status.store( FAILED, std::memory_order_release); } } break; case rel_res: reserved = false; my_predecessors.try_release( ); current->status.store( SUCCEEDED, std::memory_order_release); break; case con_res: reserved = false; my_predecessors.try_consume( ); current->status.store( SUCCEEDED, std::memory_order_release); break; } } } protected: template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; graph_task* try_put_task( const T & ) override { return nullptr; } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT graph_task* try_put_task(const T&, const message_metainfo&) override { return nullptr; } #endif graph& graph_reference() const override { return my_join->graph_ref; } public: //! Constructor reserving_port() : my_join(nullptr), my_predecessors(this), reserved(false) { my_aggregator.initialize_handler(handler_type(this)); } // copy constructor reserving_port(const reserving_port& /* other */) = delete; void set_join_node_pointer(reserving_forwarding_base *join) { my_join = join; } //! Add a predecessor bool register_predecessor( predecessor_type &src ) override { reserving_port_operation op_data(src, reg_pred); my_aggregator.execute(&op_data); return op_data.status == SUCCEEDED; } //! 
Remove a predecessor bool remove_predecessor( predecessor_type &src ) override { reserving_port_operation op_data(src, rem_pred); my_aggregator.execute(&op_data); return op_data.status == SUCCEEDED; } //! Reserve an item from the port bool reserve( T &v ) { reserving_port_operation op_data(v, res_item); my_aggregator.execute(&op_data); return op_data.status == SUCCEEDED; } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT bool reserve( T& v, message_metainfo& metainfo ) { reserving_port_operation op_data(v, res_item, metainfo); my_aggregator.execute(&op_data); return op_data.status == SUCCEEDED; } #endif //! Release the port void release( ) { reserving_port_operation op_data(rel_res); my_aggregator.execute(&op_data); } //! Complete use of the port void consume( ) { reserving_port_operation op_data(con_res); my_aggregator.execute(&op_data); } void reset_receiver( reset_flags f) { if(f & rf_clear_edges) my_predecessors.clear(); else my_predecessors.reset(); reserved = false; __TBB_ASSERT(!(f&rf_clear_edges) || my_predecessors.empty(), "port edges not removed"); } private: #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET friend class get_graph_helper; #endif reserving_forwarding_base *my_join; reservable_predecessor_cache< T, null_mutex > my_predecessors; bool reserved; }; // reserving_port //! 
queueing join_port template class queueing_port : public receiver, public item_buffer { public: typedef T input_type; typedef typename receiver::predecessor_type predecessor_type; typedef queueing_port class_type; // ----------- Aggregator ------------ private: enum op_type { get__item, res_port, try__put_task }; class queueing_port_operation : public d1::aggregated_operation { public: char type; T my_val; T* my_arg; graph_task* bypass_t; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT message_metainfo* metainfo; #endif // constructor for value parameter queueing_port_operation(const T& e, op_type t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& info)) : type(char(t)), my_val(e), my_arg(nullptr) , bypass_t(nullptr) __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(const_cast(&info))) {} // constructor for pointer parameter queueing_port_operation(const T* p, op_type t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& info)) : type(char(t)), my_arg(const_cast(p)) , bypass_t(nullptr) __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(&info)) {} #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT queueing_port_operation(const T* p, op_type t) : type(char(t)), my_arg(const_cast(p)), bypass_t(nullptr), metainfo(nullptr) {} #endif // constructor with no parameter queueing_port_operation(op_type t) : type(char(t)), my_arg(nullptr) , bypass_t(nullptr) __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(nullptr)) {} }; typedef d1::aggregating_functor handler_type; friend class d1::aggregating_functor; d1::aggregator my_aggregator; void handle_operations(queueing_port_operation* op_list) { queueing_port_operation *current; bool was_empty; while(op_list) { current = op_list; op_list = op_list->next; switch(current->type) { case try__put_task: { graph_task* rtask = nullptr; was_empty = this->buffer_empty(); #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT __TBB_ASSERT(current->metainfo, nullptr); this->push_back(current->my_val, *(current->metainfo)); #else this->push_back(current->my_val); #endif if (was_empty) rtask 
= my_join->decrement_port_count(false); else rtask = SUCCESSFULLY_ENQUEUED; current->bypass_t = rtask; current->status.store( SUCCEEDED, std::memory_order_release); } break; case get__item: if(!this->buffer_empty()) { __TBB_ASSERT(current->my_arg, nullptr); *(current->my_arg) = this->front(); #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT if (current->metainfo) { *(current->metainfo) = this->front_metainfo(); } #endif current->status.store( SUCCEEDED, std::memory_order_release); } else { current->status.store( FAILED, std::memory_order_release); } break; case res_port: __TBB_ASSERT(this->my_item_valid(this->my_head), "No item to reset"); this->destroy_front(); if(this->my_item_valid(this->my_head)) { (void)my_join->decrement_port_count(true); } current->status.store( SUCCEEDED, std::memory_order_release); break; } } } // ------------ End Aggregator --------------- protected: template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; private: graph_task* try_put_task_impl(const T& v __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { queueing_port_operation op_data(v, try__put_task __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); my_aggregator.execute(&op_data); __TBB_ASSERT(op_data.status == SUCCEEDED || !op_data.bypass_t, "inconsistent return from aggregator"); if(!op_data.bypass_t) return SUCCESSFULLY_ENQUEUED; return op_data.bypass_t; } protected: graph_task* try_put_task(const T &v) override { return try_put_task_impl(v __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT graph_task* try_put_task(const T& v, const message_metainfo& metainfo) override { return try_put_task_impl(v, metainfo); } #endif graph& graph_reference() const override { return my_join->graph_ref; } public: //! Constructor queueing_port() : item_buffer() { my_join = nullptr; my_aggregator.initialize_handler(handler_type(this)); } //! 
copy constructor queueing_port(const queueing_port& /* other */) = delete; //! record parent for tallying available items void set_join_node_pointer(queueing_forwarding_base *join) { my_join = join; } bool get_item( T &v ) { queueing_port_operation op_data(&v, get__item); my_aggregator.execute(&op_data); return op_data.status == SUCCEEDED; } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT bool get_item( T& v, message_metainfo& metainfo ) { queueing_port_operation op_data(&v, get__item, metainfo); my_aggregator.execute(&op_data); return op_data.status == SUCCEEDED; } #endif // reset_port is called when item is accepted by successor, but // is initiated by join_node. void reset_port() { queueing_port_operation op_data(res_port); my_aggregator.execute(&op_data); return; } void reset_receiver(reset_flags) { item_buffer::reset(); } private: #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET friend class get_graph_helper; #endif queueing_forwarding_base *my_join; }; // queueing_port #include "_flow_graph_tagged_buffer_impl.h" template struct count_element { K my_key; size_t my_value; }; // method to access the key in the counting table // the ref has already been removed from K template< typename K > struct key_to_count_functor { typedef count_element table_item_type; const K& operator()(const table_item_type& v) { return v.my_key; } }; template struct key_matching_port_base { #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT using type = metainfo_hash_buffer; #else using type = hash_buffer; #endif }; // the ports can have only one template parameter. 
We wrap the types needed in // a traits type template< class TraitsType > class key_matching_port : public receiver, public key_matching_port_base< typename TraitsType::K, typename TraitsType::T, typename TraitsType::TtoK, typename TraitsType::KHash >::type { public: typedef TraitsType traits; typedef key_matching_port class_type; typedef typename TraitsType::T input_type; typedef typename TraitsType::K key_type; typedef typename std::decay::type noref_key_type; typedef typename receiver::predecessor_type predecessor_type; typedef typename TraitsType::TtoK type_to_key_func_type; typedef typename TraitsType::KHash hash_compare_type; typedef typename key_matching_port_base::type buffer_type; private: // ----------- Aggregator ------------ private: enum op_type { try__put, get__item, res_port }; class key_matching_port_operation : public d1::aggregated_operation { public: char type; input_type my_val; input_type *my_arg; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT message_metainfo* metainfo = nullptr; #endif // constructor for value parameter key_matching_port_operation(const input_type& e, op_type t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& info)) : type(char(t)), my_val(e), my_arg(nullptr) __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(const_cast(&info))) {} // constructor for pointer parameter key_matching_port_operation(const input_type* p, op_type t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& info)) : type(char(t)), my_arg(const_cast(p)) __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(&info)) {} // constructor with no parameter key_matching_port_operation(op_type t) : type(char(t)), my_arg(nullptr) {} }; typedef d1::aggregating_functor handler_type; friend class d1::aggregating_functor; d1::aggregator my_aggregator; void handle_operations(key_matching_port_operation* op_list) { key_matching_port_operation *current; while(op_list) { current = op_list; op_list = op_list->next; switch(current->type) { case try__put: { #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT 
__TBB_ASSERT(current->metainfo, nullptr); bool was_inserted = this->insert_with_key(current->my_val, *(current->metainfo)); #else bool was_inserted = this->insert_with_key(current->my_val); #endif // return failure if a duplicate insertion occurs current->status.store( was_inserted ? SUCCEEDED : FAILED, std::memory_order_release); } break; case get__item: { // use current_key from FE for item __TBB_ASSERT(current->my_arg, nullptr); #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT __TBB_ASSERT(current->metainfo, nullptr); bool find_result = this->find_with_key(my_join->current_key, *(current->my_arg), *(current->metainfo)); #else bool find_result = this->find_with_key(my_join->current_key, *(current->my_arg)); #endif #if TBB_USE_DEBUG if (!find_result) { __TBB_ASSERT(false, "Failed to find item corresponding to current_key."); } #else tbb::detail::suppress_unused_warning(find_result); #endif current->status.store( SUCCEEDED, std::memory_order_release); } break; case res_port: // use current_key from FE for item this->delete_with_key(my_join->current_key); current->status.store( SUCCEEDED, std::memory_order_release); break; } } } // ------------ End Aggregator --------------- protected: template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; private: graph_task* try_put_task_impl(const input_type& v __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { key_matching_port_operation op_data(v, try__put __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); graph_task* rtask = nullptr; my_aggregator.execute(&op_data); if(op_data.status == SUCCEEDED) { rtask = my_join->increment_key_count((*(this->get_key_func()))(v)); // may spawn // rtask has to reflect the return status of the try_put if(!rtask) rtask = SUCCESSFULLY_ENQUEUED; } return rtask; } protected: graph_task* try_put_task(const input_type& v) override { return try_put_task_impl(v 
__TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT graph_task* try_put_task(const input_type& v, const message_metainfo& metainfo) override { return try_put_task_impl(v, metainfo); } #endif graph& graph_reference() const override { return my_join->graph_ref; } public: key_matching_port() : receiver(), buffer_type() { my_join = nullptr; my_aggregator.initialize_handler(handler_type(this)); } // copy constructor key_matching_port(const key_matching_port& /*other*/) = delete; #if __INTEL_COMPILER <= 2021 // Suppress superfluous diagnostic about virtual keyword absence in a destructor of an inherited // class while the parent class has the virtual keyword for the destrocutor. virtual #endif ~key_matching_port() { } void set_join_node_pointer(forwarding_base *join) { my_join = dynamic_cast*>(join); } void set_my_key_func(type_to_key_func_type *f) { this->set_key_func(f); } type_to_key_func_type* get_my_key_func() { return this->get_key_func(); } bool get_item( input_type &v ) { // aggregator uses current_key from FE for Key key_matching_port_operation op_data(&v, get__item); my_aggregator.execute(&op_data); return op_data.status == SUCCEEDED; } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT bool get_item( input_type& v, message_metainfo& metainfo ) { // aggregator uses current_key from FE for Key key_matching_port_operation op_data(&v, get__item, metainfo); my_aggregator.execute(&op_data); return op_data.status == SUCCEEDED; } #endif // reset_port is called when item is accepted by successor, but // is initiated by join_node. void reset_port() { key_matching_port_operation op_data(res_port); my_aggregator.execute(&op_data); return; } void reset_receiver(reset_flags ) { buffer_type::reset(); } private: // my_join forwarding base used to count number of inputs that // received key. matching_forwarding_base *my_join; }; // key_matching_port using namespace graph_policy_namespace; template class join_node_base; //! 
join_node_FE : implements input port policy template class join_node_FE; template class join_node_FE : public reserving_forwarding_base { private: static const int N = std::tuple_size::value; typedef OutputTuple output_type; typedef InputTuple input_type; typedef join_node_base base_node_type; // for forwarding public: join_node_FE(graph &g) : reserving_forwarding_base(g), my_node(nullptr) { ports_with_no_inputs = N; join_helper::set_join_node_pointer(my_inputs, this); } join_node_FE(const join_node_FE& other) : reserving_forwarding_base((other.reserving_forwarding_base::graph_ref)), my_node(nullptr) { ports_with_no_inputs = N; join_helper::set_join_node_pointer(my_inputs, this); } void set_my_node(base_node_type *new_my_node) { my_node = new_my_node; } void increment_port_count() override { ++ports_with_no_inputs; } // if all input_ports have predecessors, spawn forward to try and consume tuples graph_task* decrement_port_count() override { if(ports_with_no_inputs.fetch_sub(1) == 1) { if(is_graph_active(this->graph_ref)) { d1::small_object_allocator allocator{}; typedef forward_task_bypass task_type; graph_task* t = allocator.new_object(graph_ref, allocator, *my_node); spawn_in_graph_arena(this->graph_ref, *t); } } return nullptr; } input_type &input_ports() { return my_inputs; } protected: void reset( reset_flags f) { // called outside of parallel contexts ports_with_no_inputs = N; join_helper::reset_inputs(my_inputs, f); } // all methods on input ports should be called under mutual exclusion from join_node_base. 
bool tuple_build_may_succeed() { return !ports_with_no_inputs; } bool try_to_make_tuple(output_type &out) { if(ports_with_no_inputs) return false; return join_helper::reserve(my_inputs, out); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT bool try_to_make_tuple(output_type &out, message_metainfo& metainfo) { if (ports_with_no_inputs) return false; return join_helper::reserve(my_inputs, out, metainfo); } #endif void tuple_accepted() { join_helper::consume_reservations(my_inputs); } void tuple_rejected() { join_helper::release_reservations(my_inputs); } input_type my_inputs; base_node_type *my_node; std::atomic ports_with_no_inputs; }; // join_node_FE template class join_node_FE : public queueing_forwarding_base { public: static const int N = std::tuple_size::value; typedef OutputTuple output_type; typedef InputTuple input_type; typedef join_node_base base_node_type; // for forwarding join_node_FE(graph &g) : queueing_forwarding_base(g), my_node(nullptr) { ports_with_no_items = N; join_helper::set_join_node_pointer(my_inputs, this); } join_node_FE(const join_node_FE& other) : queueing_forwarding_base((other.queueing_forwarding_base::graph_ref)), my_node(nullptr) { ports_with_no_items = N; join_helper::set_join_node_pointer(my_inputs, this); } // needed for forwarding void set_my_node(base_node_type *new_my_node) { my_node = new_my_node; } void reset_port_count() { ports_with_no_items = N; } // if all input_ports have items, spawn forward to try and consume tuples graph_task* decrement_port_count(bool handle_task) override { if(ports_with_no_items.fetch_sub(1) == 1) { if(is_graph_active(this->graph_ref)) { d1::small_object_allocator allocator{}; typedef forward_task_bypass task_type; graph_task* t = allocator.new_object(graph_ref, allocator, *my_node); if( !handle_task ) return t; spawn_in_graph_arena(this->graph_ref, *t); } } return nullptr; } input_type &input_ports() { return my_inputs; } protected: void reset( reset_flags f) { reset_port_count(); 
join_helper::reset_inputs(my_inputs, f ); } // all methods on input ports should be called under mutual exclusion from join_node_base. bool tuple_build_may_succeed() { return !ports_with_no_items; } bool try_to_make_tuple(output_type &out) { if(ports_with_no_items) return false; return join_helper::get_items(my_inputs, out); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT bool try_to_make_tuple(output_type &out, message_metainfo& metainfo) { if(ports_with_no_items) return false; return join_helper::get_items(my_inputs, out, metainfo); } #endif void tuple_accepted() { reset_port_count(); join_helper::reset_ports(my_inputs); } void tuple_rejected() { // nothing to do. } input_type my_inputs; base_node_type *my_node; std::atomic ports_with_no_items; }; // join_node_FE // key_matching join front-end. template class join_node_FE, InputTuple, OutputTuple> : public matching_forwarding_base, // buffer of key value counts public hash_buffer< // typedefed below to key_to_count_buffer_type typename std::decay::type&, // force ref type on K count_element::type>, type_to_key_function_body< count_element::type>, typename std::decay::type& >, KHash >, // buffer of output items public item_buffer { public: static const int N = std::tuple_size::value; typedef OutputTuple output_type; typedef InputTuple input_type; typedef K key_type; typedef typename std::decay::type unref_key_type; typedef KHash key_hash_compare; // must use K without ref. typedef count_element count_element_type; // method that lets us refer to the key of this type. typedef key_to_count_functor key_to_count_func; typedef type_to_key_function_body< count_element_type, unref_key_type&> TtoK_function_body_type; typedef type_to_key_function_body_leaf TtoK_function_body_leaf_type; // this is the type of the special table that keeps track of the number of discrete // elements corresponding to each key that we've seen. 
typedef hash_buffer< unref_key_type&, count_element_type, TtoK_function_body_type, key_hash_compare > key_to_count_buffer_type; typedef item_buffer output_buffer_type; typedef join_node_base, InputTuple, OutputTuple> base_node_type; // for forwarding typedef matching_forwarding_base forwarding_base_type; // ----------- Aggregator ------------ // the aggregator is only needed to serialize the access to the hash table. // and the output_buffer_type base class private: enum op_type { res_count, inc_count, may_succeed, try_make }; typedef join_node_FE, InputTuple, OutputTuple> class_type; class key_matching_FE_operation : public d1::aggregated_operation { public: char type; unref_key_type my_val; output_type* my_output; graph_task* bypass_t; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT message_metainfo* metainfo = nullptr; #endif // constructor for value parameter key_matching_FE_operation(const unref_key_type& e , op_type t) : type(char(t)), my_val(e), my_output(nullptr), bypass_t(nullptr) {} key_matching_FE_operation(output_type *p, op_type t) : type(char(t)), my_output(p), bypass_t(nullptr) {} #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT key_matching_FE_operation(output_type *p, op_type t, message_metainfo& info) : type(char(t)), my_output(p), bypass_t(nullptr), metainfo(&info) {} #endif // constructor with no parameter key_matching_FE_operation(op_type t) : type(char(t)), my_output(nullptr), bypass_t(nullptr) {} }; typedef d1::aggregating_functor handler_type; friend class d1::aggregating_functor; d1::aggregator my_aggregator; // called from aggregator, so serialized // returns a task pointer if the a task would have been enqueued but we asked that // it be returned. Otherwise returns nullptr. 
graph_task* fill_output_buffer(unref_key_type &t) { output_type l_out; graph_task* rtask = nullptr; bool do_fwd = this->buffer_empty() && is_graph_active(this->graph_ref); this->current_key = t; this->delete_with_key(this->current_key); // remove the key #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT message_metainfo metainfo; #endif if(join_helper::get_items(my_inputs, l_out __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo))) { // <== call back this->push_back(l_out __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); if(do_fwd) { // we enqueue if receiving an item from predecessor, not if successor asks for item d1::small_object_allocator allocator{}; typedef forward_task_bypass task_type; rtask = allocator.new_object(this->graph_ref, allocator, *my_node); do_fwd = false; } // retire the input values join_helper::reset_ports(my_inputs); // <== call back } else { __TBB_ASSERT(false, "should have had something to push"); } return rtask; } void handle_operations(key_matching_FE_operation* op_list) { key_matching_FE_operation *current; while(op_list) { current = op_list; op_list = op_list->next; switch(current->type) { case res_count: // called from BE { this->destroy_front(); current->status.store( SUCCEEDED, std::memory_order_release); } break; case inc_count: { // called from input ports count_element_type *p = nullptr; unref_key_type &t = current->my_val; if(!(this->find_ref_with_key(t,p))) { count_element_type ev; ev.my_key = t; ev.my_value = 0; this->insert_with_key(ev); bool found = this->find_ref_with_key(t, p); __TBB_ASSERT_EX(found, "should find key after inserting it"); } if(++(p->my_value) == size_t(N)) { current->bypass_t = fill_output_buffer(t); } } current->status.store( SUCCEEDED, std::memory_order_release); break; case may_succeed: // called from BE current->status.store( this->buffer_empty() ? 
FAILED : SUCCEEDED, std::memory_order_release); break; case try_make: // called from BE if(this->buffer_empty()) { current->status.store( FAILED, std::memory_order_release); } else { *(current->my_output) = this->front(); #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT if (current->metainfo) { *(current->metainfo) = this->front_metainfo(); } #endif current->status.store( SUCCEEDED, std::memory_order_release); } break; } } } // ------------ End Aggregator --------------- public: template join_node_FE(graph &g, FunctionTuple &TtoK_funcs) : forwarding_base_type(g), my_node(nullptr) { join_helper::set_join_node_pointer(my_inputs, this); join_helper::set_key_functors(my_inputs, TtoK_funcs); my_aggregator.initialize_handler(handler_type(this)); TtoK_function_body_type *cfb = new TtoK_function_body_leaf_type(key_to_count_func()); this->set_key_func(cfb); } join_node_FE(const join_node_FE& other) : forwarding_base_type((other.forwarding_base_type::graph_ref)), key_to_count_buffer_type(), output_buffer_type() { my_node = nullptr; join_helper::set_join_node_pointer(my_inputs, this); join_helper::copy_key_functors(my_inputs, const_cast(other.my_inputs)); my_aggregator.initialize_handler(handler_type(this)); TtoK_function_body_type *cfb = new TtoK_function_body_leaf_type(key_to_count_func()); this->set_key_func(cfb); } // needed for forwarding void set_my_node(base_node_type *new_my_node) { my_node = new_my_node; } void reset_port_count() { // called from BE key_matching_FE_operation op_data(res_count); my_aggregator.execute(&op_data); return; } // if all input_ports have items, spawn forward to try and consume tuples // return a task if we are asked and did create one. 
graph_task *increment_key_count(unref_key_type const & t) override { // called from input_ports key_matching_FE_operation op_data(t, inc_count); my_aggregator.execute(&op_data); return op_data.bypass_t; } input_type &input_ports() { return my_inputs; } protected: void reset( reset_flags f ) { // called outside of parallel contexts join_helper::reset_inputs(my_inputs, f); key_to_count_buffer_type::reset(); output_buffer_type::reset(); } // all methods on input ports should be called under mutual exclusion from join_node_base. bool tuple_build_may_succeed() { // called from back-end key_matching_FE_operation op_data(may_succeed); my_aggregator.execute(&op_data); return op_data.status == SUCCEEDED; } // cannot lock while calling back to input_ports. current_key will only be set // and reset under the aggregator, so it will remain consistent. bool try_to_make_tuple(output_type &out) { key_matching_FE_operation op_data(&out,try_make); my_aggregator.execute(&op_data); return op_data.status == SUCCEEDED; } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT bool try_to_make_tuple(output_type &out, message_metainfo& metainfo) { key_matching_FE_operation op_data(&out, try_make, metainfo); my_aggregator.execute(&op_data); return op_data.status == SUCCEEDED; } #endif void tuple_accepted() { reset_port_count(); // reset current_key after ports reset. } void tuple_rejected() { // nothing to do. } input_type my_inputs; // input ports base_node_type *my_node; }; // join_node_FE, InputTuple, OutputTuple> //! 
join_node_base template class join_node_base : public graph_node, public join_node_FE, public sender { protected: using graph_node::my_graph; public: typedef OutputTuple output_type; typedef typename sender::successor_type successor_type; typedef join_node_FE input_ports_type; using input_ports_type::tuple_build_may_succeed; using input_ports_type::try_to_make_tuple; using input_ports_type::tuple_accepted; using input_ports_type::tuple_rejected; private: // ----------- Aggregator ------------ enum op_type { reg_succ, rem_succ, try__get, do_fwrd, do_fwrd_bypass }; typedef join_node_base class_type; class join_node_base_operation : public d1::aggregated_operation { public: char type; union { output_type *my_arg; successor_type *my_succ; }; graph_task* bypass_t; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT message_metainfo* metainfo; #endif join_node_base_operation(const output_type& e, op_type t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo& info)) : type(char(t)), my_arg(const_cast(&e)), bypass_t(nullptr) __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo(&info)) {} #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT join_node_base_operation(const output_type& e, op_type t) : type(char(t)), my_arg(const_cast(&e)), bypass_t(nullptr), metainfo(nullptr) {} #endif join_node_base_operation(const successor_type &s, op_type t) : type(char(t)), my_succ(const_cast(&s)), bypass_t(nullptr) {} join_node_base_operation(op_type t) : type(char(t)), bypass_t(nullptr) {} }; typedef d1::aggregating_functor handler_type; friend class d1::aggregating_functor; bool forwarder_busy; d1::aggregator my_aggregator; void handle_operations(join_node_base_operation* op_list) { join_node_base_operation *current; while(op_list) { current = op_list; op_list = op_list->next; switch(current->type) { case reg_succ: { my_successors.register_successor(*(current->my_succ)); if(tuple_build_may_succeed() && !forwarder_busy && is_graph_active(my_graph)) { d1::small_object_allocator allocator{}; typedef 
forward_task_bypass< join_node_base > task_type; graph_task* t = allocator.new_object(my_graph, allocator, *this); spawn_in_graph_arena(my_graph, *t); forwarder_busy = true; } current->status.store( SUCCEEDED, std::memory_order_release); } break; case rem_succ: my_successors.remove_successor(*(current->my_succ)); current->status.store( SUCCEEDED, std::memory_order_release); break; case try__get: if(tuple_build_may_succeed()) { bool make_tuple_result = false; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT if (current->metainfo) { make_tuple_result = try_to_make_tuple(*(current->my_arg), *(current->metainfo)); } else #endif { make_tuple_result = try_to_make_tuple(*(current->my_arg)); } if(make_tuple_result) { #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT if (current->metainfo) { // Since elements would be removed from queues while calling to tuple_accepted // together with corresponding message_metainfo objects // we need to prolong the wait until the successor would create a task for removed elements for (auto waiter : current->metainfo->waiters()) { waiter->reserve(1); } } #endif tuple_accepted(); current->status.store( SUCCEEDED, std::memory_order_release); } else current->status.store( FAILED, std::memory_order_release); } else current->status.store( FAILED, std::memory_order_release); break; case do_fwrd_bypass: { bool build_succeeded; graph_task *last_task = nullptr; output_type out; // forwarding must be exclusive, because try_to_make_tuple and tuple_accepted // are separate locked methods in the FE. We could conceivably fetch the front // of the FE queue, then be swapped out, have someone else consume the FE's // object, then come back, forward, and then try to remove it from the queue // again. Without reservation of the FE, the methods accessing it must be locked. // We could remember the keys of the objects we forwarded, and then remove // them from the input ports after forwarding is complete? 
if(tuple_build_may_succeed()) { // checks output queue of FE do { #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT message_metainfo metainfo; #endif // fetch front_end of queue build_succeeded = try_to_make_tuple(out __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); if(build_succeeded) { graph_task *new_task = my_successors.try_put_task(out __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); last_task = combine_tasks(my_graph, last_task, new_task); if(new_task) { tuple_accepted(); } else { tuple_rejected(); build_succeeded = false; } } } while(build_succeeded); } current->bypass_t = last_task; current->status.store( SUCCEEDED, std::memory_order_release); forwarder_busy = false; } break; } } } // ---------- end aggregator ----------- public: join_node_base(graph &g) : graph_node(g), input_ports_type(g), forwarder_busy(false), my_successors(this) { input_ports_type::set_my_node(this); my_aggregator.initialize_handler(handler_type(this)); } join_node_base(const join_node_base& other) : graph_node(other.graph_node::my_graph), input_ports_type(other), sender(), forwarder_busy(false), my_successors(this) { input_ports_type::set_my_node(this); my_aggregator.initialize_handler(handler_type(this)); } template join_node_base(graph &g, FunctionTuple f) : graph_node(g), input_ports_type(g, f), forwarder_busy(false), my_successors(this) { input_ports_type::set_my_node(this); my_aggregator.initialize_handler(handler_type(this)); } bool register_successor(successor_type &r) override { join_node_base_operation op_data(r, reg_succ); my_aggregator.execute(&op_data); return op_data.status == SUCCEEDED; } bool remove_successor( successor_type &r) override { join_node_base_operation op_data(r, rem_succ); my_aggregator.execute(&op_data); return op_data.status == SUCCEEDED; } bool try_get( output_type &v) override { join_node_base_operation op_data(v, try__get); my_aggregator.execute(&op_data); return op_data.status == SUCCEEDED; } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT bool try_get( output_type &v, 
message_metainfo& metainfo) override { join_node_base_operation op_data(v, try__get, metainfo); my_aggregator.execute(&op_data); return op_data.status == SUCCEEDED; } #endif protected: void reset_node(reset_flags f) override { input_ports_type::reset(f); if(f & rf_clear_edges) my_successors.clear(); } private: broadcast_cache my_successors; friend class forward_task_bypass< join_node_base >; graph_task *forward_task() { join_node_base_operation op_data(do_fwrd_bypass); my_aggregator.execute(&op_data); return op_data.bypass_t; } }; // join_node_base // join base class type generator template class PT, typename OutputTuple, typename JP> struct join_base { typedef join_node_base::type, OutputTuple> type; }; template struct join_base > { typedef key_matching key_traits_type; typedef K key_type; typedef KHash key_hash_compare; typedef join_node_base< key_traits_type, // ports type typename wrap_key_tuple_elements::type, OutputTuple > type; }; //! unfolded_join_node : passes input_ports_type to join_node_base. We build the input port type // using tuple_element. The class PT is the port type (reserving_port, queueing_port, key_matching_port) // and should match the typename. template class PT, typename OutputTuple, typename JP> class unfolded_join_node : public join_base::type { public: typedef typename wrap_tuple_elements::type input_ports_type; typedef OutputTuple output_type; private: typedef join_node_base base_type; public: unfolded_join_node(graph &g) : base_type(g) {} unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} }; #if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING template struct key_from_message_body { K operator()(const T& t) const { return key_from_message(t); } }; // Adds const to reference type template struct key_from_message_body { const K& operator()(const T& t) const { return key_from_message(t); } }; #endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ // key_matching unfolded_join_node. 
This must be a separate specialization because the constructors // differ. template class unfolded_join_node<2,key_matching_port,OutputTuple,key_matching > : public join_base<2,key_matching_port,OutputTuple,key_matching >::type { typedef typename std::tuple_element<0, OutputTuple>::type T0; typedef typename std::tuple_element<1, OutputTuple>::type T1; public: typedef typename wrap_key_tuple_elements<2,key_matching_port,key_matching,OutputTuple>::type input_ports_type; typedef OutputTuple output_type; private: typedef join_node_base, input_ports_type, output_type > base_type; typedef type_to_key_function_body *f0_p; typedef type_to_key_function_body *f1_p; typedef std::tuple< f0_p, f1_p > func_initializer_type; public: #if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING unfolded_join_node(graph &g) : base_type(g, func_initializer_type( new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()) ) ) { } #endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ template unfolded_join_node(graph &g, Body0 body0, Body1 body1) : base_type(g, func_initializer_type( new type_to_key_function_body_leaf(body0), new type_to_key_function_body_leaf(body1) ) ) { static_assert(std::tuple_size::value == 2, "wrong number of body initializers"); } unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} }; template class unfolded_join_node<3,key_matching_port,OutputTuple,key_matching > : public join_base<3,key_matching_port,OutputTuple,key_matching >::type { typedef typename std::tuple_element<0, OutputTuple>::type T0; typedef typename std::tuple_element<1, OutputTuple>::type T1; typedef typename std::tuple_element<2, OutputTuple>::type T2; public: typedef typename wrap_key_tuple_elements<3,key_matching_port,key_matching,OutputTuple>::type input_ports_type; typedef OutputTuple output_type; private: typedef join_node_base, input_ports_type, output_type > base_type; typedef type_to_key_function_body *f0_p; typedef 
type_to_key_function_body *f1_p; typedef type_to_key_function_body *f2_p; typedef std::tuple< f0_p, f1_p, f2_p > func_initializer_type; public: #if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING unfolded_join_node(graph &g) : base_type(g, func_initializer_type( new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()) ) ) { } #endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ template unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2) : base_type(g, func_initializer_type( new type_to_key_function_body_leaf(body0), new type_to_key_function_body_leaf(body1), new type_to_key_function_body_leaf(body2) ) ) { static_assert(std::tuple_size::value == 3, "wrong number of body initializers"); } unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} }; template class unfolded_join_node<4,key_matching_port,OutputTuple,key_matching > : public join_base<4,key_matching_port,OutputTuple,key_matching >::type { typedef typename std::tuple_element<0, OutputTuple>::type T0; typedef typename std::tuple_element<1, OutputTuple>::type T1; typedef typename std::tuple_element<2, OutputTuple>::type T2; typedef typename std::tuple_element<3, OutputTuple>::type T3; public: typedef typename wrap_key_tuple_elements<4,key_matching_port,key_matching,OutputTuple>::type input_ports_type; typedef OutputTuple output_type; private: typedef join_node_base, input_ports_type, output_type > base_type; typedef type_to_key_function_body *f0_p; typedef type_to_key_function_body *f1_p; typedef type_to_key_function_body *f2_p; typedef type_to_key_function_body *f3_p; typedef std::tuple< f0_p, f1_p, f2_p, f3_p > func_initializer_type; public: #if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING unfolded_join_node(graph &g) : base_type(g, func_initializer_type( new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf 
>(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()) ) ) { } #endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ template unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3) : base_type(g, func_initializer_type( new type_to_key_function_body_leaf(body0), new type_to_key_function_body_leaf(body1), new type_to_key_function_body_leaf(body2), new type_to_key_function_body_leaf(body3) ) ) { static_assert(std::tuple_size::value == 4, "wrong number of body initializers"); } unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} }; template class unfolded_join_node<5,key_matching_port,OutputTuple,key_matching > : public join_base<5,key_matching_port,OutputTuple,key_matching >::type { typedef typename std::tuple_element<0, OutputTuple>::type T0; typedef typename std::tuple_element<1, OutputTuple>::type T1; typedef typename std::tuple_element<2, OutputTuple>::type T2; typedef typename std::tuple_element<3, OutputTuple>::type T3; typedef typename std::tuple_element<4, OutputTuple>::type T4; public: typedef typename wrap_key_tuple_elements<5,key_matching_port,key_matching,OutputTuple>::type input_ports_type; typedef OutputTuple output_type; private: typedef join_node_base , input_ports_type, output_type > base_type; typedef type_to_key_function_body *f0_p; typedef type_to_key_function_body *f1_p; typedef type_to_key_function_body *f2_p; typedef type_to_key_function_body *f3_p; typedef type_to_key_function_body *f4_p; typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p > func_initializer_type; public: #if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING unfolded_join_node(graph &g) : base_type(g, func_initializer_type( new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf 
>(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()) ) ) { } #endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ template unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4) : base_type(g, func_initializer_type( new type_to_key_function_body_leaf(body0), new type_to_key_function_body_leaf(body1), new type_to_key_function_body_leaf(body2), new type_to_key_function_body_leaf(body3), new type_to_key_function_body_leaf(body4) ) ) { static_assert(std::tuple_size::value == 5, "wrong number of body initializers"); } unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} }; #if __TBB_VARIADIC_MAX >= 6 template class unfolded_join_node<6,key_matching_port,OutputTuple,key_matching > : public join_base<6,key_matching_port,OutputTuple,key_matching >::type { typedef typename std::tuple_element<0, OutputTuple>::type T0; typedef typename std::tuple_element<1, OutputTuple>::type T1; typedef typename std::tuple_element<2, OutputTuple>::type T2; typedef typename std::tuple_element<3, OutputTuple>::type T3; typedef typename std::tuple_element<4, OutputTuple>::type T4; typedef typename std::tuple_element<5, OutputTuple>::type T5; public: typedef typename wrap_key_tuple_elements<6,key_matching_port,key_matching,OutputTuple>::type input_ports_type; typedef OutputTuple output_type; private: typedef join_node_base , input_ports_type, output_type > base_type; typedef type_to_key_function_body *f0_p; typedef type_to_key_function_body *f1_p; typedef type_to_key_function_body *f2_p; typedef type_to_key_function_body *f3_p; typedef type_to_key_function_body *f4_p; typedef type_to_key_function_body *f5_p; typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p > func_initializer_type; public: #if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING unfolded_join_node(graph &g) : base_type(g, func_initializer_type( new type_to_key_function_body_leaf >(key_from_message_body()), new 
type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()) ) ) { } #endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ template unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4, Body5 body5) : base_type(g, func_initializer_type( new type_to_key_function_body_leaf(body0), new type_to_key_function_body_leaf(body1), new type_to_key_function_body_leaf(body2), new type_to_key_function_body_leaf(body3), new type_to_key_function_body_leaf(body4), new type_to_key_function_body_leaf(body5) ) ) { static_assert(std::tuple_size::value == 6, "wrong number of body initializers"); } unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} }; #endif #if __TBB_VARIADIC_MAX >= 7 template class unfolded_join_node<7,key_matching_port,OutputTuple,key_matching > : public join_base<7,key_matching_port,OutputTuple,key_matching >::type { typedef typename std::tuple_element<0, OutputTuple>::type T0; typedef typename std::tuple_element<1, OutputTuple>::type T1; typedef typename std::tuple_element<2, OutputTuple>::type T2; typedef typename std::tuple_element<3, OutputTuple>::type T3; typedef typename std::tuple_element<4, OutputTuple>::type T4; typedef typename std::tuple_element<5, OutputTuple>::type T5; typedef typename std::tuple_element<6, OutputTuple>::type T6; public: typedef typename wrap_key_tuple_elements<7,key_matching_port,key_matching,OutputTuple>::type input_ports_type; typedef OutputTuple output_type; private: typedef join_node_base , input_ports_type, output_type > base_type; typedef type_to_key_function_body *f0_p; typedef type_to_key_function_body *f1_p; typedef type_to_key_function_body *f2_p; typedef type_to_key_function_body *f3_p; typedef type_to_key_function_body 
*f4_p; typedef type_to_key_function_body *f5_p; typedef type_to_key_function_body *f6_p; typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p, f6_p > func_initializer_type; public: #if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING unfolded_join_node(graph &g) : base_type(g, func_initializer_type( new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()) ) ) { } #endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ template unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4, Body5 body5, Body6 body6) : base_type(g, func_initializer_type( new type_to_key_function_body_leaf(body0), new type_to_key_function_body_leaf(body1), new type_to_key_function_body_leaf(body2), new type_to_key_function_body_leaf(body3), new type_to_key_function_body_leaf(body4), new type_to_key_function_body_leaf(body5), new type_to_key_function_body_leaf(body6) ) ) { static_assert(std::tuple_size::value == 7, "wrong number of body initializers"); } unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} }; #endif #if __TBB_VARIADIC_MAX >= 8 template class unfolded_join_node<8,key_matching_port,OutputTuple,key_matching > : public join_base<8,key_matching_port,OutputTuple,key_matching >::type { typedef typename std::tuple_element<0, OutputTuple>::type T0; typedef typename std::tuple_element<1, OutputTuple>::type T1; typedef typename std::tuple_element<2, OutputTuple>::type T2; typedef typename std::tuple_element<3, OutputTuple>::type T3; typedef typename std::tuple_element<4, OutputTuple>::type T4; typedef typename std::tuple_element<5, OutputTuple>::type T5; typedef 
typename std::tuple_element<6, OutputTuple>::type T6; typedef typename std::tuple_element<7, OutputTuple>::type T7; public: typedef typename wrap_key_tuple_elements<8,key_matching_port,key_matching,OutputTuple>::type input_ports_type; typedef OutputTuple output_type; private: typedef join_node_base , input_ports_type, output_type > base_type; typedef type_to_key_function_body *f0_p; typedef type_to_key_function_body *f1_p; typedef type_to_key_function_body *f2_p; typedef type_to_key_function_body *f3_p; typedef type_to_key_function_body *f4_p; typedef type_to_key_function_body *f5_p; typedef type_to_key_function_body *f6_p; typedef type_to_key_function_body *f7_p; typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p, f6_p, f7_p > func_initializer_type; public: #if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING unfolded_join_node(graph &g) : base_type(g, func_initializer_type( new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()) ) ) { } #endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ template unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4, Body5 body5, Body6 body6, Body7 body7) : base_type(g, func_initializer_type( new type_to_key_function_body_leaf(body0), new type_to_key_function_body_leaf(body1), new type_to_key_function_body_leaf(body2), new type_to_key_function_body_leaf(body3), new type_to_key_function_body_leaf(body4), new type_to_key_function_body_leaf(body5), new type_to_key_function_body_leaf(body6), new type_to_key_function_body_leaf(body7) ) ) { 
static_assert(std::tuple_size::value == 8, "wrong number of body initializers"); } unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} }; #endif #if __TBB_VARIADIC_MAX >= 9 template class unfolded_join_node<9,key_matching_port,OutputTuple,key_matching > : public join_base<9,key_matching_port,OutputTuple,key_matching >::type { typedef typename std::tuple_element<0, OutputTuple>::type T0; typedef typename std::tuple_element<1, OutputTuple>::type T1; typedef typename std::tuple_element<2, OutputTuple>::type T2; typedef typename std::tuple_element<3, OutputTuple>::type T3; typedef typename std::tuple_element<4, OutputTuple>::type T4; typedef typename std::tuple_element<5, OutputTuple>::type T5; typedef typename std::tuple_element<6, OutputTuple>::type T6; typedef typename std::tuple_element<7, OutputTuple>::type T7; typedef typename std::tuple_element<8, OutputTuple>::type T8; public: typedef typename wrap_key_tuple_elements<9,key_matching_port,key_matching,OutputTuple>::type input_ports_type; typedef OutputTuple output_type; private: typedef join_node_base , input_ports_type, output_type > base_type; typedef type_to_key_function_body *f0_p; typedef type_to_key_function_body *f1_p; typedef type_to_key_function_body *f2_p; typedef type_to_key_function_body *f3_p; typedef type_to_key_function_body *f4_p; typedef type_to_key_function_body *f5_p; typedef type_to_key_function_body *f6_p; typedef type_to_key_function_body *f7_p; typedef type_to_key_function_body *f8_p; typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p, f6_p, f7_p, f8_p > func_initializer_type; public: #if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING unfolded_join_node(graph &g) : base_type(g, func_initializer_type( new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new 
type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()) ) ) { } #endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ template unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4, Body5 body5, Body6 body6, Body7 body7, Body8 body8) : base_type(g, func_initializer_type( new type_to_key_function_body_leaf(body0), new type_to_key_function_body_leaf(body1), new type_to_key_function_body_leaf(body2), new type_to_key_function_body_leaf(body3), new type_to_key_function_body_leaf(body4), new type_to_key_function_body_leaf(body5), new type_to_key_function_body_leaf(body6), new type_to_key_function_body_leaf(body7), new type_to_key_function_body_leaf(body8) ) ) { static_assert(std::tuple_size::value == 9, "wrong number of body initializers"); } unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} }; #endif #if __TBB_VARIADIC_MAX >= 10 template class unfolded_join_node<10,key_matching_port,OutputTuple,key_matching > : public join_base<10,key_matching_port,OutputTuple,key_matching >::type { typedef typename std::tuple_element<0, OutputTuple>::type T0; typedef typename std::tuple_element<1, OutputTuple>::type T1; typedef typename std::tuple_element<2, OutputTuple>::type T2; typedef typename std::tuple_element<3, OutputTuple>::type T3; typedef typename std::tuple_element<4, OutputTuple>::type T4; typedef typename std::tuple_element<5, OutputTuple>::type T5; typedef typename std::tuple_element<6, OutputTuple>::type T6; typedef typename std::tuple_element<7, OutputTuple>::type T7; typedef typename std::tuple_element<8, OutputTuple>::type T8; typedef typename std::tuple_element<9, OutputTuple>::type T9; public: typedef typename 
wrap_key_tuple_elements<10,key_matching_port,key_matching,OutputTuple>::type input_ports_type; typedef OutputTuple output_type; private: typedef join_node_base , input_ports_type, output_type > base_type; typedef type_to_key_function_body *f0_p; typedef type_to_key_function_body *f1_p; typedef type_to_key_function_body *f2_p; typedef type_to_key_function_body *f3_p; typedef type_to_key_function_body *f4_p; typedef type_to_key_function_body *f5_p; typedef type_to_key_function_body *f6_p; typedef type_to_key_function_body *f7_p; typedef type_to_key_function_body *f8_p; typedef type_to_key_function_body *f9_p; typedef std::tuple< f0_p, f1_p, f2_p, f3_p, f4_p, f5_p, f6_p, f7_p, f8_p, f9_p > func_initializer_type; public: #if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING unfolded_join_node(graph &g) : base_type(g, func_initializer_type( new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()), new type_to_key_function_body_leaf >(key_from_message_body()) ) ) { } #endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ template unfolded_join_node(graph &g, Body0 body0, Body1 body1, Body2 body2, Body3 body3, Body4 body4, Body5 body5, Body6 body6, Body7 body7, Body8 body8, Body9 body9) : base_type(g, func_initializer_type( new type_to_key_function_body_leaf(body0), new type_to_key_function_body_leaf(body1), new type_to_key_function_body_leaf(body2), new type_to_key_function_body_leaf(body3), new type_to_key_function_body_leaf(body4), new type_to_key_function_body_leaf(body5), new 
type_to_key_function_body_leaf(body6), new type_to_key_function_body_leaf(body7), new type_to_key_function_body_leaf(body8), new type_to_key_function_body_leaf(body9) ) ) { static_assert(std::tuple_size::value == 10, "wrong number of body initializers"); } unfolded_join_node(const unfolded_join_node &other) : base_type(other) {} }; #endif //! templated function to refer to input ports of the join node template typename std::tuple_element::type &input_port(JNT &jn) { return std::get(jn.input_ports()); } #endif // __TBB__flow_graph_join_impl_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_flow_graph_node_impl.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB__flow_graph_node_impl_H #define __TBB__flow_graph_node_impl_H #ifndef __TBB_flow_graph_H #error Do not #include this internal file directly; use public TBB headers instead. 
#endif #include "_flow_graph_item_buffer_impl.h" template< typename T, typename A > class function_input_queue : public item_buffer { public: bool empty() const { return this->buffer_empty(); } const T& front() const { return this->item_buffer::front(); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT const message_metainfo& front_metainfo() const { return this->item_buffer::front_metainfo(); } #endif void pop() { this->destroy_front(); } bool push( T& t ) { return this->push_back( t ); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT bool push( T& t, const message_metainfo& metainfo ) { return this->push_back(t, metainfo); } #endif }; //! Input and scheduling for a function node that takes a type Input as input // The only up-ref is apply_body_impl, which should implement the function // call and any handling of the result. template< typename Input, typename Policy, typename A, typename ImplType > class function_input_base : public receiver, no_assign { enum op_type {reg_pred, rem_pred, try_fwd, tryput_bypass, app_body_bypass, occupy_concurrency }; typedef function_input_base class_type; public: //! The input type of this receiver typedef Input input_type; typedef typename receiver::predecessor_type predecessor_type; typedef predecessor_cache predecessor_cache_type; typedef function_input_queue input_queue_type; typedef typename allocator_traits::template rebind_alloc allocator_type; static_assert(!has_policy::value || !has_policy::value, ""); //! Constructor for function_input_base function_input_base( graph &g, size_t max_concurrency, node_priority_t a_priority, bool is_no_throw ) : my_graph_ref(g), my_max_concurrency(max_concurrency) , my_concurrency(0), my_priority(a_priority), my_is_no_throw(is_no_throw) , my_queue(!has_policy::value ? new input_queue_type() : nullptr) , my_predecessors(this) , forwarder_busy(false) { my_aggregator.initialize_handler(handler_type(this)); } //! 
Copy constructor function_input_base( const function_input_base& src ) : function_input_base(src.my_graph_ref, src.my_max_concurrency, src.my_priority, src.my_is_no_throw) {} //! Destructor // The queue is allocated by the constructor for {multi}function_node. // TODO: pass the graph_buffer_policy to the base so it can allocate the queue instead. // This would be an interface-breaking change. virtual ~function_input_base() { delete my_queue; my_queue = nullptr; } graph_task* try_put_task( const input_type& t) override { return try_put_task_base(t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT graph_task* try_put_task( const input_type& t, const message_metainfo& metainfo ) override { return try_put_task_base(t, metainfo); } #endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT //! Adds src to the list of cached predecessors. bool register_predecessor( predecessor_type &src ) override { operation_type op_data(reg_pred); op_data.r = &src; my_aggregator.execute(&op_data); return true; } //! Removes src from the list of cached predecessors. 
bool remove_predecessor( predecessor_type &src ) override { operation_type op_data(rem_pred); op_data.r = &src; my_aggregator.execute(&op_data); return true; } protected: void reset_function_input_base( reset_flags f) { my_concurrency = 0; if(my_queue) { my_queue->reset(); } reset_receiver(f); forwarder_busy = false; } graph& my_graph_ref; const size_t my_max_concurrency; size_t my_concurrency; node_priority_t my_priority; const bool my_is_no_throw; input_queue_type *my_queue; predecessor_cache my_predecessors; void reset_receiver( reset_flags f) { if( f & rf_clear_edges) my_predecessors.clear(); else my_predecessors.reset(); __TBB_ASSERT(!(f & rf_clear_edges) || my_predecessors.empty(), "function_input_base reset failed"); } graph& graph_reference() const override { return my_graph_ref; } graph_task* try_get_postponed_task(const input_type& i) { operation_type op_data(i, app_body_bypass); // tries to pop an item or get_item my_aggregator.execute(&op_data); return op_data.bypass_t; } private: friend class apply_body_task_bypass< class_type, input_type >; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT friend class apply_body_task_bypass< class_type, input_type, trackable_messages_graph_task >; #endif friend class forward_task_bypass< class_type >; class operation_type : public d1::aggregated_operation< operation_type > { public: char type; union { input_type *elem; predecessor_type *r; }; graph_task* bypass_t; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT message_metainfo* metainfo; #endif operation_type(const input_type& e, op_type t) : type(char(t)), elem(const_cast(&e)), bypass_t(nullptr) #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT , metainfo(nullptr) #endif {} #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT operation_type(const input_type& e, op_type t, const message_metainfo& info) : type(char(t)), elem(const_cast(&e)), bypass_t(nullptr), metainfo(const_cast(&info)) {} #endif operation_type(op_type t) : type(char(t)), r(nullptr), bypass_t(nullptr) {} }; bool 
forwarder_busy; typedef d1::aggregating_functor handler_type; friend class d1::aggregating_functor; d1::aggregator< handler_type, operation_type > my_aggregator; graph_task* perform_queued_requests() { graph_task* new_task = nullptr; if(my_queue) { if(!my_queue->empty()) { ++my_concurrency; // TODO: consider removing metainfo from the queue using move semantics to avoid // ref counter increase new_task = create_body_task(my_queue->front() __TBB_FLOW_GRAPH_METAINFO_ARG(my_queue->front_metainfo())); my_queue->pop(); } } else { input_type i; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT message_metainfo metainfo; #endif if(my_predecessors.get_item(i __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo))) { ++my_concurrency; new_task = create_body_task(i __TBB_FLOW_GRAPH_METAINFO_ARG(std::move(metainfo))); } } return new_task; } void handle_operations(operation_type *op_list) { operation_type* tmp; while (op_list) { tmp = op_list; op_list = op_list->next; switch (tmp->type) { case reg_pred: my_predecessors.add(*(tmp->r)); tmp->status.store(SUCCEEDED, std::memory_order_release); if (!forwarder_busy) { forwarder_busy = true; spawn_forward_task(); } break; case rem_pred: my_predecessors.remove(*(tmp->r)); tmp->status.store(SUCCEEDED, std::memory_order_release); break; case app_body_bypass: { tmp->bypass_t = nullptr; __TBB_ASSERT(my_max_concurrency != 0, nullptr); --my_concurrency; if(my_concurrencybypass_t = perform_queued_requests(); tmp->status.store(SUCCEEDED, std::memory_order_release); } break; case tryput_bypass: internal_try_put_task(tmp); break; case try_fwd: internal_forward(tmp); break; case occupy_concurrency: if (my_concurrency < my_max_concurrency) { ++my_concurrency; tmp->status.store(SUCCEEDED, std::memory_order_release); } else { tmp->status.store(FAILED, std::memory_order_release); } break; } } } //! 
Put to the node, but return the task instead of enqueueing it void internal_try_put_task(operation_type *op) { __TBB_ASSERT(my_max_concurrency != 0, nullptr); if (my_concurrency < my_max_concurrency) { ++my_concurrency; graph_task* new_task = create_body_task(*(op->elem) __TBB_FLOW_GRAPH_METAINFO_ARG(*(op->metainfo))); op->bypass_t = new_task; op->status.store(SUCCEEDED, std::memory_order_release); } else if ( my_queue && my_queue->push(*(op->elem) __TBB_FLOW_GRAPH_METAINFO_ARG(*(op->metainfo))) ) { op->bypass_t = SUCCESSFULLY_ENQUEUED; op->status.store(SUCCEEDED, std::memory_order_release); } else { op->bypass_t = nullptr; op->status.store(FAILED, std::memory_order_release); } } //! Creates tasks for postponed messages if available and if concurrency allows void internal_forward(operation_type *op) { op->bypass_t = nullptr; if (my_concurrency < my_max_concurrency) op->bypass_t = perform_queued_requests(); if(op->bypass_t) op->status.store(SUCCEEDED, std::memory_order_release); else { forwarder_busy = false; op->status.store(FAILED, std::memory_order_release); } } graph_task* internal_try_put_bypass( const input_type& t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { operation_type op_data(t, tryput_bypass __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); my_aggregator.execute(&op_data); if( op_data.status == SUCCEEDED ) { return op_data.bypass_t; } return nullptr; } graph_task* try_put_task_base(const input_type& t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { if ( my_is_no_throw ) return try_put_task_impl(t, has_policy() __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); else return try_put_task_impl(t, std::false_type() __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } graph_task* try_put_task_impl( const input_type& t, /*lightweight=*/std::true_type __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { if( my_max_concurrency == 0 ) { return apply_body_bypass(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } else { operation_type 
check_op(t, occupy_concurrency); my_aggregator.execute(&check_op); if( check_op.status == SUCCEEDED ) { return apply_body_bypass(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } return internal_try_put_bypass(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } } graph_task* try_put_task_impl( const input_type& t, /*lightweight=*/std::false_type __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { if( my_max_concurrency == 0 ) { return create_body_task(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } else { return internal_try_put_bypass(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } } //! Applies the body to the provided input // then decides if more work is available graph_task* apply_body_bypass( const input_type &i __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { return static_cast(this)->apply_body_impl_bypass(i __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); } //! allocates a task to apply a body #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT template graph_task* create_body_task( const input_type &input, Metainfo&& metainfo ) #else graph_task* create_body_task( const input_type &input ) #endif { if (!is_graph_active(my_graph_ref)) { return nullptr; } // TODO revamp: extract helper for common graph task allocation part d1::small_object_allocator allocator{}; graph_task* t = nullptr; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT if (!metainfo.empty()) { using task_type = apply_body_task_bypass; t = allocator.new_object(my_graph_ref, allocator, *this, input, my_priority, std::forward(metainfo)); } else #endif { using task_type = apply_body_task_bypass; t = allocator.new_object(my_graph_ref, allocator, *this, input, my_priority); } return t; } //! 
This is executed by an enqueued task, the "forwarder" graph_task* forward_task() { operation_type op_data(try_fwd); graph_task* rval = nullptr; do { op_data.status = WAIT; my_aggregator.execute(&op_data); if(op_data.status == SUCCEEDED) { graph_task* ttask = op_data.bypass_t; __TBB_ASSERT( ttask && ttask != SUCCESSFULLY_ENQUEUED, nullptr); rval = combine_tasks(my_graph_ref, rval, ttask); } } while (op_data.status == SUCCEEDED); return rval; } inline graph_task* create_forward_task() { if (!is_graph_active(my_graph_ref)) { return nullptr; } d1::small_object_allocator allocator{}; typedef forward_task_bypass task_type; graph_task* t = allocator.new_object( graph_reference(), allocator, *this, my_priority ); return t; } //! Spawns a task that calls forward() inline void spawn_forward_task() { graph_task* tp = create_forward_task(); if(tp) { spawn_in_graph_arena(graph_reference(), *tp); } } node_priority_t priority() const override { return my_priority; } }; // function_input_base //! Implements methods for a function node that takes a type Input as input and sends // a type Output to its successors. template< typename Input, typename Output, typename Policy, typename A> class function_input : public function_input_base > { public: typedef Input input_type; typedef Output output_type; typedef function_body function_body_type; typedef function_input my_class; typedef function_input_base base_type; typedef function_input_queue input_queue_type; // constructor template function_input( graph &g, size_t max_concurrency, Body& body, node_priority_t a_priority ) : base_type(g, max_concurrency, a_priority, noexcept(tbb::detail::invoke(body, input_type()))) , my_body( new function_body_leaf< input_type, output_type, Body>(body) ) , my_init_body( new function_body_leaf< input_type, output_type, Body>(body) ) { } //! 
Copy constructor function_input( const function_input& src ) : base_type(src), my_body( src.my_init_body->clone() ), my_init_body(src.my_init_body->clone() ) { } #if __INTEL_COMPILER <= 2021 // Suppress superfluous diagnostic about virtual keyword absence in a destructor of an inherited // class while the parent class has the virtual keyword for the destrocutor. virtual #endif ~function_input() { delete my_body; delete my_init_body; } template< typename Body > Body copy_function_object() { function_body_type &body_ref = *this->my_body; return dynamic_cast< function_body_leaf & >(body_ref).get_body(); } output_type apply_body_impl( const input_type& i) { // There is an extra copied needed to capture the // body execution without the try_put fgt_begin_body( my_body ); output_type v = tbb::detail::invoke(*my_body, i); fgt_end_body( my_body ); return v; } //TODO: consider moving into the base class graph_task* apply_body_impl_bypass( const input_type &i __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { output_type v = apply_body_impl(i); graph_task* postponed_task = nullptr; if( base_type::my_max_concurrency != 0 ) { postponed_task = base_type::try_get_postponed_task(i); __TBB_ASSERT( !postponed_task || postponed_task != SUCCESSFULLY_ENQUEUED, nullptr); } if( postponed_task ) { // make the task available for other workers since we do not know successors' // execution policy spawn_in_graph_arena(base_type::graph_reference(), *postponed_task); } graph_task* successor_task = successors().try_put_task(v __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); #if _MSC_VER && !__INTEL_COMPILER #pragma warning (push) #pragma warning (disable: 4127) /* suppress conditional expression is constant */ #endif if(has_policy::value) { #if _MSC_VER && !__INTEL_COMPILER #pragma warning (pop) #endif if(!successor_task) { // Return confirmative status since current // node's body has been executed anyway successor_task = SUCCESSFULLY_ENQUEUED; } } return successor_task; } protected: 
void reset_function_input(reset_flags f) { base_type::reset_function_input_base(f); if(f & rf_reset_bodies) { function_body_type *tmp = my_init_body->clone(); delete my_body; my_body = tmp; } } function_body_type *my_body; function_body_type *my_init_body; virtual broadcast_cache &successors() = 0; }; // function_input // helper templates to clear the successor edges of the output ports of an multifunction_node template struct clear_element { template static void clear_this(P &p) { (void)std::get(p).successors().clear(); clear_element::clear_this(p); } #if TBB_USE_ASSERT template static bool this_empty(P &p) { if(std::get(p).successors().empty()) return clear_element::this_empty(p); return false; } #endif }; template<> struct clear_element<1> { template static void clear_this(P &p) { (void)std::get<0>(p).successors().clear(); } #if TBB_USE_ASSERT template static bool this_empty(P &p) { return std::get<0>(p).successors().empty(); } #endif }; template struct init_output_ports { template static OutputTuple call(graph& g, const std::tuple&) { return OutputTuple(Args(g)...); } }; // struct init_output_ports //! Implements methods for a function node that takes a type Input as input // and has a tuple of output ports specified. 
template< typename Input, typename OutputPortSet, typename Policy, typename A> class multifunction_input : public function_input_base > { public: static const int N = std::tuple_size::value; typedef Input input_type; typedef OutputPortSet output_ports_type; typedef multifunction_body multifunction_body_type; typedef multifunction_input my_class; typedef function_input_base base_type; typedef function_input_queue input_queue_type; // constructor template multifunction_input(graph &g, size_t max_concurrency,Body& body, node_priority_t a_priority ) : base_type(g, max_concurrency, a_priority, noexcept(tbb::detail::invoke(body, input_type(), my_output_ports))) , my_body( new multifunction_body_leaf(body) ) , my_init_body( new multifunction_body_leaf(body) ) , my_output_ports(init_output_ports::call(g, my_output_ports)){ } //! Copy constructor multifunction_input( const multifunction_input& src ) : base_type(src), my_body( src.my_init_body->clone() ), my_init_body(src.my_init_body->clone() ), my_output_ports( init_output_ports::call(src.my_graph_ref, my_output_ports) ) { } ~multifunction_input() { delete my_body; delete my_init_body; } template< typename Body > Body copy_function_object() { multifunction_body_type &body_ref = *this->my_body; return *static_cast(dynamic_cast< multifunction_body_leaf & >(body_ref).get_body_ptr()); } // for multifunction nodes we do not have a single successor as such. So we just tell // the task we were successful. //TODO: consider moving common parts with implementation in function_input into separate function graph_task* apply_body_impl_bypass( const input_type &i __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo&) ) { fgt_begin_body( my_body ); (*my_body)(i, my_output_ports); fgt_end_body( my_body ); graph_task* ttask = nullptr; if(base_type::my_max_concurrency != 0) { ttask = base_type::try_get_postponed_task(i); } return ttask ? 
ttask : SUCCESSFULLY_ENQUEUED; } output_ports_type &output_ports(){ return my_output_ports; } protected: void reset(reset_flags f) { base_type::reset_function_input_base(f); if(f & rf_clear_edges)clear_element::clear_this(my_output_ports); if(f & rf_reset_bodies) { multifunction_body_type* tmp = my_init_body->clone(); delete my_body; my_body = tmp; } __TBB_ASSERT(!(f & rf_clear_edges) || clear_element::this_empty(my_output_ports), "multifunction_node reset failed"); } multifunction_body_type *my_body; multifunction_body_type *my_init_body; output_ports_type my_output_ports; }; // multifunction_input // template to refer to an output port of a multifunction_node template typename std::tuple_element::type &output_port(MOP &op) { return std::get(op.output_ports()); } inline void check_task_and_spawn(graph& g, graph_task* t) { if (t && t != SUCCESSFULLY_ENQUEUED) { spawn_in_graph_arena(g, *t); } } // helper structs for split_node template struct emit_element { template static graph_task* emit_this(graph& g, const T &t, P &p) { // TODO: consider to collect all the tasks in task_list and spawn them all at once graph_task* last_task = std::get(p).try_put_task(std::get(t)); check_task_and_spawn(g, last_task); return emit_element::emit_this(g,t,p); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT template static graph_task* emit_this(graph& g, const TupleType& t, PortsType& p, const message_metainfo& metainfo) { // TODO: consider to collect all the tasks in task_list and spawn them all at once graph_task* last_task = std::get(p).try_put_task(std::get(t), metainfo); check_task_and_spawn(g, last_task); return emit_element::emit_this(g, t, p, metainfo); } #endif }; template<> struct emit_element<1> { template static graph_task* emit_this(graph& g, const T &t, P &p) { graph_task* last_task = std::get<0>(p).try_put_task(std::get<0>(t)); check_task_and_spawn(g, last_task); return SUCCESSFULLY_ENQUEUED; } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT template static graph_task* 
emit_this(graph& g, const TupleType& t, PortsType& ports, const message_metainfo& metainfo) { graph_task* last_task = std::get<0>(ports).try_put_task(std::get<0>(t), metainfo); check_task_and_spawn(g, last_task); return SUCCESSFULLY_ENQUEUED; } #endif }; //! Implements methods for an executable node that takes continue_msg as input template< typename Output, typename Policy> class continue_input : public continue_receiver { public: //! The input type of this receiver typedef continue_msg input_type; //! The output type of this receiver typedef Output output_type; typedef function_body function_body_type; typedef continue_input class_type; template< typename Body > continue_input( graph &g, Body& body, node_priority_t a_priority ) : continue_receiver(/*number_of_predecessors=*/0, a_priority) , my_graph_ref(g) , my_body( new function_body_leaf< input_type, output_type, Body>(body) ) , my_init_body( new function_body_leaf< input_type, output_type, Body>(body) ) { } template< typename Body > continue_input( graph &g, int number_of_predecessors, Body& body, node_priority_t a_priority ) : continue_receiver( number_of_predecessors, a_priority ) , my_graph_ref(g) , my_body( new function_body_leaf< input_type, output_type, Body>(body) ) , my_init_body( new function_body_leaf< input_type, output_type, Body>(body) ) { } continue_input( const continue_input& src ) : continue_receiver(src), my_graph_ref(src.my_graph_ref), my_body( src.my_init_body->clone() ), my_init_body( src.my_init_body->clone() ) {} ~continue_input() { delete my_body; delete my_init_body; } template< typename Body > Body copy_function_object() { function_body_type &body_ref = *my_body; return dynamic_cast< function_body_leaf & >(body_ref).get_body(); } void reset_receiver( reset_flags f) override { continue_receiver::reset_receiver(f); if(f & rf_reset_bodies) { function_body_type *tmp = my_init_body->clone(); delete my_body; my_body = tmp; } } protected: graph& my_graph_ref; function_body_type *my_body; 
function_body_type *my_init_body; virtual broadcast_cache &successors() = 0; friend class apply_body_task_bypass< class_type, continue_msg >; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT friend class apply_body_task_bypass< class_type, continue_msg, trackable_messages_graph_task >; #endif //! Applies the body to the provided input graph_task* apply_body_bypass( input_type __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo) ) { // There is an extra copied needed to capture the // body execution without the try_put fgt_begin_body( my_body ); output_type v = (*my_body)( continue_msg() ); fgt_end_body( my_body ); return successors().try_put_task( v __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo) ); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT graph_task* execute(const message_metainfo& metainfo) override { #else graph_task* execute() override { #endif if(!is_graph_active(my_graph_ref)) { return nullptr; } #if _MSC_VER && !__INTEL_COMPILER #pragma warning (push) #pragma warning (disable: 4127) /* suppress conditional expression is constant */ #endif if(has_policy::value) { #if _MSC_VER && !__INTEL_COMPILER #pragma warning (pop) #endif return apply_body_bypass( continue_msg() __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo) ); } else { d1::small_object_allocator allocator{}; graph_task* t = nullptr; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT if (!metainfo.empty()) { using task_type = apply_body_task_bypass; t = allocator.new_object( graph_reference(), allocator, *this, continue_msg(), my_priority, metainfo ); } else #endif { using task_type = apply_body_task_bypass; t = allocator.new_object( graph_reference(), allocator, *this, continue_msg(), my_priority ); } return t; } } graph& graph_reference() const override { return my_graph_ref; } }; // continue_input //! 
Implements methods for both executable and function nodes that puts Output to its successors template< typename Output > class function_output : public sender { public: template friend struct clear_element; typedef Output output_type; typedef typename sender::successor_type successor_type; typedef broadcast_cache broadcast_cache_type; function_output(graph& g) : my_successors(this), my_graph_ref(g) {} function_output(const function_output& other) = delete; //! Adds a new successor to this node bool register_successor( successor_type &r ) override { successors().register_successor( r ); return true; } //! Removes a successor from this node bool remove_successor( successor_type &r ) override { successors().remove_successor( r ); return true; } broadcast_cache_type &successors() { return my_successors; } graph& graph_reference() const { return my_graph_ref; } protected: broadcast_cache_type my_successors; graph& my_graph_ref; }; // function_output template< typename Output > class multifunction_output : public function_output { public: typedef Output output_type; typedef function_output base_type; using base_type::my_successors; multifunction_output(graph& g) : base_type(g) {} multifunction_output(const multifunction_output& other) : base_type(other.my_graph_ref) {} bool try_put(const output_type &i) { graph_task *res = try_put_task(i); if( !res ) return false; if( res != SUCCESSFULLY_ENQUEUED ) { // wrapping in task_arena::execute() is not needed since the method is called from // inside task::execute() spawn_in_graph_arena(graph_reference(), *res); } return true; } using base_type::graph_reference; protected: graph_task* try_put_task(const output_type &i) { return my_successors.try_put_task(i); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT graph_task* try_put_task(const output_type& i, const message_metainfo& metainfo) { return my_successors.try_put_task(i, metainfo); } #endif template friend struct emit_element; }; // multifunction_output //composite_node 
template void add_nodes_impl(CompositeType*, bool) {} template< typename CompositeType, typename NodeType1, typename... NodeTypes > void add_nodes_impl(CompositeType *c_node, bool visible, const NodeType1& n1, const NodeTypes&... n) { void *addr = const_cast(&n1); fgt_alias_port(c_node, addr, visible); add_nodes_impl(c_node, visible, n...); } #endif // __TBB__flow_graph_node_impl_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_flow_graph_node_set_impl.h ================================================ /* Copyright (c) 2020-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_flow_graph_node_set_impl_H #define __TBB_flow_graph_node_set_impl_H #ifndef __TBB_flow_graph_H #error Do not #include this internal file directly; use public TBB headers instead. 
#endif // Included in namespace tbb::detail::d2 (in flow_graph.h) #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET // Visual Studio 2019 reports an error while calling predecessor_selector::get and successor_selector::get // Seems like the well-formed expression in trailing decltype is treated as ill-formed // TODO: investigate problems with decltype in trailing return types or find the cross-platform solution #define __TBB_MSVC_DISABLE_TRAILING_DECLTYPE (_MSC_VER >= 1900) namespace order { struct undefined {}; struct following {}; struct preceding {}; } class get_graph_helper { public: // TODO: consider making graph_reference() public and consistent interface to get a reference to the graph // and remove get_graph_helper template static graph& get(const T& object) { return get_impl(object, std::is_base_of()); } private: // Get graph from the object of type derived from graph_node template static graph& get_impl(const T& object, std::true_type) { return static_cast(&object)->my_graph; } template static graph& get_impl(const T& object, std::false_type) { return object.graph_reference(); } }; template struct node_set { typedef Order order_type; std::tuple nodes; node_set(Nodes&... ns) : nodes(ns...) 
{} template node_set(const node_set& set) : nodes(set.nodes) {} graph& graph_reference() const { return get_graph_helper::get(std::get<0>(nodes)); } }; namespace alias_helpers { template using output_type = typename T::output_type; template using output_ports_type = typename T::output_ports_type; template using input_type = typename T::input_type; template using input_ports_type = typename T::input_ports_type; } // namespace alias_helpers template using has_output_type = supports; template using has_input_type = supports; template using has_input_ports_type = supports; template using has_output_ports_type = supports; template struct is_sender : std::is_base_of, T> {}; template struct is_receiver : std::is_base_of, T> {}; template struct is_async_node : std::false_type {}; template struct is_async_node> : std::true_type {}; template node_set follows(FirstPredecessor& first_predecessor, Predecessors&... predecessors) { static_assert((conjunction, has_output_type...>::value), "Not all node's predecessors has output_type typedef"); static_assert((conjunction, is_sender...>::value), "Not all node's predecessors are senders"); return node_set(first_predecessor, predecessors...); } template node_set follows(node_set& predecessors_set) { static_assert((conjunction...>::value), "Not all nodes in the set has output_type typedef"); static_assert((conjunction...>::value), "Not all nodes in the set are senders"); return node_set(predecessors_set); } template node_set precedes(FirstSuccessor& first_successor, Successors&... 
successors) { static_assert((conjunction, has_input_type...>::value), "Not all node's successors has input_type typedef"); static_assert((conjunction, is_receiver...>::value), "Not all node's successors are receivers"); return node_set(first_successor, successors...); } template node_set precedes(node_set& successors_set) { static_assert((conjunction...>::value), "Not all nodes in the set has input_type typedef"); static_assert((conjunction...>::value), "Not all nodes in the set are receivers"); return node_set(successors_set); } template node_set make_node_set(Node& first_node, Nodes&... nodes) { return node_set(first_node, nodes...); } template class successor_selector { template static auto get_impl(NodeType& node, std::true_type) -> decltype(input_port(node)) { return input_port(node); } template static NodeType& get_impl(NodeType& node, std::false_type) { return node; } public: template #if __TBB_MSVC_DISABLE_TRAILING_DECLTYPE static auto& get(NodeType& node) #else static auto get(NodeType& node) -> decltype(get_impl(node, has_input_ports_type())) #endif { return get_impl(node, has_input_ports_type()); } }; template class predecessor_selector { template static auto internal_get(NodeType& node, std::true_type) -> decltype(output_port(node)) { return output_port(node); } template static NodeType& internal_get(NodeType& node, std::false_type) { return node;} template #if __TBB_MSVC_DISABLE_TRAILING_DECLTYPE static auto& get_impl(NodeType& node, std::false_type) #else static auto get_impl(NodeType& node, std::false_type) -> decltype(internal_get(node, has_output_ports_type())) #endif { return internal_get(node, has_output_ports_type()); } template static AsyncNode& get_impl(AsyncNode& node, std::true_type) { return node; } public: template #if __TBB_MSVC_DISABLE_TRAILING_DECLTYPE static auto& get(NodeType& node) #else static auto get(NodeType& node) -> decltype(get_impl(node, is_async_node())) #endif { return get_impl(node, is_async_node()); } }; template class 
make_edges_helper { public: template static void connect_predecessors(PredecessorsTuple& predecessors, NodeType& node) { make_edge(std::get(predecessors), successor_selector::get(node)); make_edges_helper::connect_predecessors(predecessors, node); } template static void connect_successors(NodeType& node, SuccessorsTuple& successors) { make_edge(predecessor_selector::get(node), std::get(successors)); make_edges_helper::connect_successors(node, successors); } }; template<> struct make_edges_helper<0> { template static void connect_predecessors(PredecessorsTuple& predecessors, NodeType& node) { make_edge(std::get<0>(predecessors), successor_selector<0>::get(node)); } template static void connect_successors(NodeType& node, SuccessorsTuple& successors) { make_edge(predecessor_selector<0>::get(node), std::get<0>(successors)); } }; // TODO: consider adding an overload for making edges between node sets template void make_edges(const node_set& s, NodeType& node) { const std::size_t SetSize = std::tuple_size::value; make_edges_helper::connect_predecessors(s.nodes, node); } template void make_edges(NodeType& node, const node_set& s) { const std::size_t SetSize = std::tuple_size::value; make_edges_helper::connect_successors(node, s.nodes); } template void make_edges_in_order(const node_set& ns, NodeType& node) { make_edges(ns, node); } template void make_edges_in_order(const node_set& ns, NodeType& node) { make_edges(node, ns); } #endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET #endif // __TBB_flow_graph_node_set_impl_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_flow_graph_nodes_deduction.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_flow_graph_nodes_deduction_H #define __TBB_flow_graph_nodes_deduction_H #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT namespace tbb { namespace detail { namespace d2 { template struct declare_body_types { using input_type = Input; using output_type = Output; }; struct NoInputBody {}; template struct declare_body_types { using output_type = Output; }; template struct body_types; template struct body_types : declare_body_types {}; template struct body_types : declare_body_types {}; template struct body_types : declare_body_types {}; template struct body_types : declare_body_types {}; template struct body_types : declare_body_types {}; template struct body_types : declare_body_types {}; template struct body_types : declare_body_types {}; template struct body_types : declare_body_types {}; template struct body_types : declare_body_types {}; template using input_t = typename body_types::input_type; template using output_t = typename body_types::output_type; template auto decide_on_operator_overload(Output (T::*name)(const Input&) const)->decltype(name); template auto decide_on_operator_overload(Output (T::*name)(const Input&))->decltype(name); template auto decide_on_operator_overload(Output (T::*name)(Input&) const)->decltype(name); template auto decide_on_operator_overload(Output (T::*name)(Input&))->decltype(name); template auto decide_on_operator_overload(Output (*name)(const Input&))->decltype(name); template auto decide_on_operator_overload(Output (*name)(Input&))->decltype(name); template decltype(decide_on_operator_overload(&Body::operator())) 
decide_on_callable_type(int); template decltype(decide_on_operator_overload(std::declval())) decide_on_callable_type(...); // Deduction guides for Flow Graph nodes template input_node(GraphOrSet&&, Body) ->input_node(0))>>; #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template struct decide_on_set; template struct decide_on_set> { using type = typename Node::output_type; }; template struct decide_on_set> { using type = typename Node::input_type; }; template using decide_on_set_t = typename decide_on_set>::type; template broadcast_node(const NodeSet&) ->broadcast_node>; template buffer_node(const NodeSet&) ->buffer_node>; template queue_node(const NodeSet&) ->queue_node>; #endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template sequencer_node(GraphOrProxy&&, Sequencer) ->sequencer_node(0))>>; #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template priority_queue_node(const NodeSet&, const Compare&) ->priority_queue_node, Compare>; template priority_queue_node(const NodeSet&) ->priority_queue_node, std::less>>; #endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template struct join_key { using type = Key; }; template struct join_key { using type = T&; }; template using join_key_t = typename join_key::type; #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template join_node(const node_set&, Policy) ->join_node, Policy>; template join_node(const node_set&, Policy) ->join_node; template join_node(const node_set) ->join_node, queueing>; template join_node(const node_set) ->join_node; #endif template join_node(GraphOrProxy&&, Body, Bodies...) 
->join_node(0))>, input_t(0))>...>, key_matching(0))>>>>; #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template indexer_node(const node_set&) ->indexer_node; #endif #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template limiter_node(const NodeSet&, size_t) ->limiter_node>; template split_node(const node_set&) ->split_node; template split_node(const node_set&) ->split_node>; #endif template function_node(GraphOrSet&&, size_t, Body, Policy, node_priority_t = no_priority) ->function_node(0))>, output_t(0))>, Policy>; template function_node(GraphOrSet&&, size_t, Body, node_priority_t = no_priority) ->function_node(0))>, output_t(0))>, queueing>; template struct continue_output { using type = Output; }; template <> struct continue_output { using type = continue_msg; }; template using continue_output_t = typename continue_output::type; template continue_node(GraphOrSet&&, Body, Policy, node_priority_t = no_priority) ->continue_node>, Policy>; template continue_node(GraphOrSet&&, int, Body, Policy, node_priority_t = no_priority) ->continue_node>, Policy>; template continue_node(GraphOrSet&&, Body, node_priority_t = no_priority) ->continue_node>, Policy>; template continue_node(GraphOrSet&&, int, Body, node_priority_t = no_priority) ->continue_node>, Policy>; #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template overwrite_node(const NodeSet&) ->overwrite_node>; template write_once_node(const NodeSet&) ->write_once_node>; #endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET } // namespace d2 } // namespace detail } // namespace tbb #endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT #endif // __TBB_flow_graph_nodes_deduction_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_flow_graph_tagged_buffer_impl.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ // a hash table buffer that can expand, and can support as many deletions as // additions, list-based, with elements of list held in array (for destruction // management), multiplicative hashing (like ets). No synchronization built-in. // #ifndef __TBB__flow_graph_hash_buffer_impl_H #define __TBB__flow_graph_hash_buffer_impl_H #ifndef __TBB_flow_graph_H #error Do not #include this internal file directly; use public TBB headers instead. #endif // included in namespace tbb::flow::interfaceX::internal // elements in the table are a simple list; we need pointer to next element to // traverse the chain template struct hash_buffer_element : public aligned_pair { using key_type = Key; using value_type = ValueType; value_type* get_value_ptr() { return reinterpret_cast(this->first); } hash_buffer_element* get_next() { return reinterpret_cast(this->second); } void set_next(hash_buffer_element* new_next) { this->second = reinterpret_cast(new_next); } void create_element(const value_type& v) { ::new(this->first) value_type(v); } void create_element(hash_buffer_element&& other) { ::new(this->first) value_type(std::move(*other.get_value_ptr())); } void destroy_element() { get_value_ptr()->~value_type(); } }; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT template struct metainfo_hash_buffer_element : public aligned_triple { using key_type = Key; using value_type = ValueType; value_type* get_value_ptr() { return reinterpret_cast(this->first); } metainfo_hash_buffer_element* get_next() { return reinterpret_cast(this->second); } void set_next(metainfo_hash_buffer_element* new_next) { this->second = 
reinterpret_cast(new_next); } message_metainfo& get_metainfo() { return this->third; } void create_element(const value_type& v, const message_metainfo& metainfo) { __TBB_ASSERT(this->third.empty(), nullptr); ::new(this->first) value_type(v); this->third = metainfo; for (auto waiter : metainfo.waiters()) { waiter->reserve(1); } } void create_element(metainfo_hash_buffer_element&& other) { __TBB_ASSERT(this->third.empty(), nullptr); ::new(this->first) value_type(std::move(*other.get_value_ptr())); this->third = std::move(other.get_metainfo()); } void destroy_element() { get_value_ptr()->~value_type(); for (auto waiter : get_metainfo().waiters()) { waiter->release(1); } get_metainfo() = message_metainfo{}; } }; #endif template < typename ElementType, typename ValueToKey, // abstract method that returns "const Key" or "const Key&" given ValueType typename HashCompare, // has hash and equal typename Allocator=tbb::cache_aligned_allocator > class hash_buffer_impl : public HashCompare { public: static const size_t INITIAL_SIZE = 8; // initial size of the hash pointer table typedef typename ElementType::key_type key_type; typedef typename ElementType::value_type value_type; typedef ElementType element_type; typedef value_type *pointer_type; typedef element_type *list_array_type; // array we manage manually typedef list_array_type *pointer_array_type; typedef typename std::allocator_traits::template rebind_alloc pointer_array_allocator_type; typedef typename std::allocator_traits::template rebind_alloc elements_array_allocator; typedef typename std::decay::type Knoref; private: ValueToKey *my_key; size_t my_size; size_t nelements; pointer_array_type pointer_array; // pointer_array[my_size] list_array_type elements_array; // elements_array[my_size / 2] element_type* free_list; size_t mask() { return my_size - 1; } void set_up_free_list( element_type **p_free_list, list_array_type la, size_t sz) { for(size_t i=0; i < sz - 1; ++i ) { // construct free list 
la[i].set_next(&(la[i + 1])); } la[sz - 1].set_next(nullptr); *p_free_list = (element_type *)&(la[0]); } // cleanup for exceptions struct DoCleanup { pointer_array_type *my_pa; list_array_type *my_elements; size_t my_size; DoCleanup(pointer_array_type &pa, list_array_type &my_els, size_t sz) : my_pa(&pa), my_elements(&my_els), my_size(sz) { } ~DoCleanup() { if(my_pa) { size_t dont_care = 0; internal_free_buffer(*my_pa, *my_elements, my_size, dont_care); } } }; // exception-safety requires we do all the potentially-throwing operations first void grow_array() { size_t new_size = my_size*2; size_t new_nelements = nelements; // internal_free_buffer zeroes this list_array_type new_elements_array = nullptr; pointer_array_type new_pointer_array = nullptr; list_array_type new_free_list = nullptr; { DoCleanup my_cleanup(new_pointer_array, new_elements_array, new_size); new_elements_array = elements_array_allocator().allocate(my_size); #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT for (std::size_t i = 0; i < my_size; ++i) { ::new(new_elements_array + i) element_type(); } #endif new_pointer_array = pointer_array_allocator_type().allocate(new_size); for(size_t i=0; i < new_size; ++i) new_pointer_array[i] = nullptr; set_up_free_list(&new_free_list, new_elements_array, my_size ); for(size_t i=0; i < my_size; ++i) { for( element_type* op = pointer_array[i]; op; op = (element_type *)(op->get_next())) { internal_insert_with_key(new_pointer_array, new_size, new_free_list, std::move(*op)); } } my_cleanup.my_pa = nullptr; my_cleanup.my_elements = nullptr; } internal_free_buffer(pointer_array, elements_array, my_size, nelements); free_list = new_free_list; pointer_array = new_pointer_array; elements_array = new_elements_array; my_size = new_size; nelements = new_nelements; } // v should have perfect forwarding if std::move implemented. 
// we use this method to move elements in grow_array, so can't use class fields template const value_type& get_value_from_pack(const Value& value, const Args&...) { return value; } template const value_type& get_value_from_pack(Element&& element) { return *(element.get_value_ptr()); } template void internal_insert_with_key( element_type **p_pointer_array, size_t p_sz, list_array_type &p_free_list, Args&&... args) { size_t l_mask = p_sz-1; __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); size_t h = this->hash(tbb::detail::invoke(*my_key, get_value_from_pack(args...))) & l_mask; __TBB_ASSERT(p_free_list, "Error: free list not set up."); element_type* my_elem = p_free_list; p_free_list = (element_type *)(p_free_list->get_next()); my_elem->create_element(std::forward(args)...); my_elem->set_next(p_pointer_array[h]); p_pointer_array[h] = my_elem; } void internal_initialize_buffer() { pointer_array = pointer_array_allocator_type().allocate(my_size); for(size_t i = 0; i < my_size; ++i) pointer_array[i] = nullptr; elements_array = elements_array_allocator().allocate(my_size / 2); #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT for (std::size_t i = 0; i < my_size / 2; ++i) { ::new(elements_array + i) element_type(); } #endif set_up_free_list(&free_list, elements_array, my_size / 2); } // made static so an enclosed class can use to properly dispose of the internals static void internal_free_buffer( pointer_array_type &pa, list_array_type &el, size_t &sz, size_t &ne ) { if(pa) { for(size_t i = 0; i < sz; ++i ) { element_type *p_next; for( element_type *p = pa[i]; p; p = p_next) { p_next = p->get_next(); p->destroy_element(); } } pointer_array_allocator_type().deallocate(pa, sz); pa = nullptr; } // Separate test (if allocation of pa throws, el may be allocated. // but no elements will be constructed.) 
if(el) { #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT for (std::size_t i = 0; i < sz / 2; ++i) { (el + i)->~element_type(); } #endif elements_array_allocator().deallocate(el, sz / 2); el = nullptr; } sz = INITIAL_SIZE; ne = 0; } public: hash_buffer_impl() : my_key(nullptr), my_size(INITIAL_SIZE), nelements(0) { internal_initialize_buffer(); } ~hash_buffer_impl() { internal_free_buffer(pointer_array, elements_array, my_size, nelements); delete my_key; my_key = nullptr; } hash_buffer_impl(const hash_buffer_impl&) = delete; hash_buffer_impl& operator=(const hash_buffer_impl&) = delete; void reset() { internal_free_buffer(pointer_array, elements_array, my_size, nelements); internal_initialize_buffer(); } // Take ownership of func object allocated with new. // This method is only used internally, so can't be misused by user. void set_key_func(ValueToKey *vtk) { my_key = vtk; } // pointer is used to clone() ValueToKey* get_key_func() { return my_key; } template bool insert_with_key(const value_type &v, Args&&... args) { element_type* p = nullptr; __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); if(find_element_ref_with_key(tbb::detail::invoke(*my_key, v), p)) { p->destroy_element(); p->create_element(v, std::forward(args)...); return false; } ++nelements; if(nelements*2 > my_size) grow_array(); internal_insert_with_key(pointer_array, my_size, free_list, v, std::forward(args)...); return true; } bool find_element_ref_with_key(const Knoref& k, element_type*& v) { size_t i = this->hash(k) & mask(); for(element_type* p = pointer_array[i]; p; p = (element_type *)(p->get_next())) { __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); if(this->equal(tbb::detail::invoke(*my_key, *p->get_value_ptr()), k)) { v = p; return true; } } return false; } // returns true and sets v to array element if found, else returns false. 
bool find_ref_with_key(const Knoref& k, pointer_type &v) { element_type* element_ptr = nullptr; bool res = find_element_ref_with_key(k, element_ptr); v = element_ptr->get_value_ptr(); return res; } bool find_with_key( const Knoref& k, value_type &v) { value_type *p; if(find_ref_with_key(k, p)) { v = *p; return true; } else return false; } void delete_with_key(const Knoref& k) { size_t h = this->hash(k) & mask(); element_type* prev = nullptr; for(element_type* p = pointer_array[h]; p; prev = p, p = (element_type *)(p->get_next())) { value_type *vp = p->get_value_ptr(); __TBB_ASSERT(my_key, "Error: value-to-key functor not provided"); if(this->equal(tbb::detail::invoke(*my_key, *vp), k)) { p->destroy_element(); if(prev) prev->set_next(p->get_next()); else pointer_array[h] = (element_type *)(p->get_next()); p->set_next(free_list); free_list = p; --nelements; return; } } __TBB_ASSERT(false, "key not found for delete"); } }; template < typename Key, // type of key within ValueType typename ValueType, typename ValueToKey, // abstract method that returns "const Key" or "const Key&" given ValueType typename HashCompare, // has hash and equal typename Allocator=tbb::cache_aligned_allocator> > using hash_buffer = hash_buffer_impl, ValueToKey, HashCompare, Allocator>; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT template < typename Key, // type of key within ValueType typename ValueType, typename ValueToKey, // abstract method that returns "const Key" or "const Key&" given ValueType typename HashCompare, // has hash and equal typename Allocator=tbb::cache_aligned_allocator> > struct metainfo_hash_buffer : public hash_buffer_impl, ValueToKey, HashCompare, Allocator> { private: using base_type = hash_buffer_impl, ValueToKey, HashCompare, Allocator>; public: bool find_with_key(const typename base_type::Knoref& k, typename base_type::value_type& v, message_metainfo& metainfo) { typename base_type::element_type* p = nullptr; bool result = this->find_element_ref_with_key(k, p); if 
(result) { v = *(p->get_value_ptr()); metainfo = p->get_metainfo(); } return result; } }; #endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT #endif // __TBB__flow_graph_hash_buffer_impl_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_flow_graph_trace_impl.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef _FGT_GRAPH_TRACE_IMPL_H #define _FGT_GRAPH_TRACE_IMPL_H #include "../profiling.h" #if (_MSC_VER >= 1900) #include #endif namespace tbb { namespace detail { namespace d2 { template< typename T > class sender; template< typename T > class receiver; #if TBB_USE_PROFILING_TOOLS #if __TBB_FLOW_TRACE_CODEPTR #if (_MSC_VER >= 1900) #define CODEPTR() (_ReturnAddress()) #elif __TBB_GCC_VERSION >= 40800 #define CODEPTR() ( __builtin_return_address(0)) #else #define CODEPTR() nullptr #endif #else #define CODEPTR() nullptr #endif /* __TBB_FLOW_TRACE_CODEPTR */ static inline void fgt_alias_port(void *node, void *p, bool visible) { if(visible) itt_relation_add( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_NODE ); else itt_relation_add( d1::ITT_DOMAIN_FLOW, p, FLOW_NODE, __itt_relation_is_child_of, node, FLOW_NODE ); } static inline void fgt_composite ( void* codeptr, void *node, void *graph ) { itt_make_task_group( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_COMPOSITE_NODE ); suppress_unused_warning( codeptr ); #if 
__TBB_FLOW_TRACE_CODEPTR if (codeptr != nullptr) { register_node_addr(d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, CODE_ADDRESS, &codeptr); } #endif } static inline void fgt_internal_alias_input_port( void *node, void *p, string_resource_index name_index ) { itt_make_task_group( d1::ITT_DOMAIN_FLOW, p, FLOW_INPUT_PORT, node, FLOW_NODE, name_index ); itt_relation_add( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_INPUT_PORT ); } static inline void fgt_internal_alias_output_port( void *node, void *p, string_resource_index name_index ) { itt_make_task_group( d1::ITT_DOMAIN_FLOW, p, FLOW_OUTPUT_PORT, node, FLOW_NODE, name_index ); itt_relation_add( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, __itt_relation_is_parent_of, p, FLOW_OUTPUT_PORT ); } template void alias_input_port(void *node, receiver* port, string_resource_index name_index) { // TODO: Make fgt_internal_alias_input_port a function template? fgt_internal_alias_input_port( node, port, name_index); } template < typename PortsTuple, int N > struct fgt_internal_input_alias_helper { static void alias_port( void *node, PortsTuple &ports ) { alias_input_port( node, &(std::get(ports)), static_cast(FLOW_INPUT_PORT_0 + N - 1) ); fgt_internal_input_alias_helper::alias_port( node, ports ); } }; template < typename PortsTuple > struct fgt_internal_input_alias_helper { static void alias_port( void * /* node */, PortsTuple & /* ports */ ) { } }; template void alias_output_port(void *node, sender* port, string_resource_index name_index) { // TODO: Make fgt_internal_alias_output_port a function template? 
fgt_internal_alias_output_port( node, static_cast(port), name_index); } template < typename PortsTuple, int N > struct fgt_internal_output_alias_helper { static void alias_port( void *node, PortsTuple &ports ) { alias_output_port( node, &(std::get(ports)), static_cast(FLOW_OUTPUT_PORT_0 + N - 1) ); fgt_internal_output_alias_helper::alias_port( node, ports ); } }; template < typename PortsTuple > struct fgt_internal_output_alias_helper { static void alias_port( void * /*node*/, PortsTuple &/*ports*/ ) { } }; static inline void fgt_internal_create_input_port( void *node, void *p, string_resource_index name_index ) { itt_make_task_group( d1::ITT_DOMAIN_FLOW, p, FLOW_INPUT_PORT, node, FLOW_NODE, name_index ); } static inline void fgt_internal_create_output_port( void* codeptr, void *node, void *p, string_resource_index name_index ) { itt_make_task_group(d1::ITT_DOMAIN_FLOW, p, FLOW_OUTPUT_PORT, node, FLOW_NODE, name_index); suppress_unused_warning( codeptr ); #if __TBB_FLOW_TRACE_CODEPTR if (codeptr != nullptr) { register_node_addr(d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, CODE_ADDRESS, &codeptr); } #endif } template void register_input_port(void *node, receiver* port, string_resource_index name_index) { // TODO: Make fgt_internal_create_input_port a function template? 
fgt_internal_create_input_port(node, static_cast(port), name_index); } template < typename PortsTuple, int N > struct fgt_internal_input_helper { static void register_port( void *node, PortsTuple &ports ) { register_input_port( node, &(std::get(ports)), static_cast(FLOW_INPUT_PORT_0 + N - 1) ); fgt_internal_input_helper::register_port( node, ports ); } }; template < typename PortsTuple > struct fgt_internal_input_helper { static void register_port( void *node, PortsTuple &ports ) { register_input_port( node, &(std::get<0>(ports)), FLOW_INPUT_PORT_0 ); } }; template void register_output_port(void* codeptr, void *node, sender* port, string_resource_index name_index) { // TODO: Make fgt_internal_create_output_port a function template? fgt_internal_create_output_port( codeptr, node, static_cast(port), name_index); } template < typename PortsTuple, int N > struct fgt_internal_output_helper { static void register_port( void* codeptr, void *node, PortsTuple &ports ) { register_output_port( codeptr, node, &(std::get(ports)), static_cast(FLOW_OUTPUT_PORT_0 + N - 1) ); fgt_internal_output_helper::register_port( codeptr, node, ports ); } }; template < typename PortsTuple > struct fgt_internal_output_helper { static void register_port( void* codeptr, void *node, PortsTuple &ports ) { register_output_port( codeptr, node, &(std::get<0>(ports)), FLOW_OUTPUT_PORT_0 ); } }; template< typename NodeType > void fgt_multioutput_node_desc( const NodeType *node, const char *desc ) { void *addr = (void *)( static_cast< receiver< typename NodeType::input_type > * >(const_cast< NodeType *>(node)) ); itt_metadata_str_add( d1::ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); } template< typename NodeType > void fgt_multiinput_multioutput_node_desc( const NodeType *node, const char *desc ) { void *addr = const_cast(node); itt_metadata_str_add( d1::ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); } template< typename NodeType > static inline void fgt_node_desc( const 
NodeType *node, const char *desc ) { void *addr = (void *)( static_cast< sender< typename NodeType::output_type > * >(const_cast< NodeType *>(node)) ); itt_metadata_str_add( d1::ITT_DOMAIN_FLOW, addr, FLOW_NODE, FLOW_OBJECT_NAME, desc ); } static inline void fgt_graph_desc( const void *g, const char *desc ) { void *addr = const_cast< void *>(g); itt_metadata_str_add( d1::ITT_DOMAIN_FLOW, addr, FLOW_GRAPH, FLOW_OBJECT_NAME, desc ); } static inline void fgt_body( void *node, void *body ) { itt_relation_add( d1::ITT_DOMAIN_FLOW, body, FLOW_BODY, __itt_relation_is_child_of, node, FLOW_NODE ); } template< int N, typename PortsTuple > static inline void fgt_multioutput_node(void* codeptr, string_resource_index t, void *g, void *input_port, PortsTuple &ports ) { itt_make_task_group( d1::ITT_DOMAIN_FLOW, input_port, FLOW_NODE, g, FLOW_GRAPH, t ); fgt_internal_create_input_port( input_port, input_port, FLOW_INPUT_PORT_0 ); fgt_internal_output_helper::register_port(codeptr, input_port, ports ); } template< int N, typename PortsTuple > static inline void fgt_multioutput_node_with_body( void* codeptr, string_resource_index t, void *g, void *input_port, PortsTuple &ports, void *body ) { itt_make_task_group( d1::ITT_DOMAIN_FLOW, input_port, FLOW_NODE, g, FLOW_GRAPH, t ); fgt_internal_create_input_port( input_port, input_port, FLOW_INPUT_PORT_0 ); fgt_internal_output_helper::register_port( codeptr, input_port, ports ); fgt_body( input_port, body ); } template< int N, typename PortsTuple > static inline void fgt_multiinput_node( void* codeptr, string_resource_index t, void *g, PortsTuple &ports, void *output_port) { itt_make_task_group( d1::ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); fgt_internal_create_output_port( codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 ); fgt_internal_input_helper::register_port( output_port, ports ); } static inline void fgt_multiinput_multioutput_node( void* codeptr, string_resource_index t, void *n, void *g ) { 
itt_make_task_group( d1::ITT_DOMAIN_FLOW, n, FLOW_NODE, g, FLOW_GRAPH, t ); suppress_unused_warning( codeptr ); #if __TBB_FLOW_TRACE_CODEPTR if (codeptr != nullptr) { register_node_addr(d1::ITT_DOMAIN_FLOW, n, FLOW_NODE, CODE_ADDRESS, &codeptr); } #endif } static inline void fgt_node( void* codeptr, string_resource_index t, void *g, void *output_port ) { itt_make_task_group( d1::ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); fgt_internal_create_output_port( codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 ); } static void fgt_node_with_body( void* codeptr, string_resource_index t, void *g, void *output_port, void *body ) { itt_make_task_group( d1::ITT_DOMAIN_FLOW, output_port, FLOW_NODE, g, FLOW_GRAPH, t ); fgt_internal_create_output_port(codeptr, output_port, output_port, FLOW_OUTPUT_PORT_0 ); fgt_body( output_port, body ); } static inline void fgt_node( void* codeptr, string_resource_index t, void *g, void *input_port, void *output_port ) { fgt_node( codeptr, t, g, output_port ); fgt_internal_create_input_port( output_port, input_port, FLOW_INPUT_PORT_0 ); } static inline void fgt_node_with_body( void* codeptr, string_resource_index t, void *g, void *input_port, void *output_port, void *body ) { fgt_node_with_body( codeptr, t, g, output_port, body ); fgt_internal_create_input_port( output_port, input_port, FLOW_INPUT_PORT_0 ); } static inline void fgt_node( void* codeptr, string_resource_index t, void *g, void *input_port, void *decrement_port, void *output_port ) { fgt_node( codeptr, t, g, input_port, output_port ); fgt_internal_create_input_port( output_port, decrement_port, FLOW_INPUT_PORT_1 ); } static inline void fgt_make_edge( void *output_port, void *input_port ) { itt_relation_add( d1::ITT_DOMAIN_FLOW, output_port, FLOW_OUTPUT_PORT, __itt_relation_is_predecessor_to, input_port, FLOW_INPUT_PORT); } static inline void fgt_remove_edge( void *output_port, void *input_port ) { itt_relation_add( d1::ITT_DOMAIN_FLOW, output_port, 
FLOW_OUTPUT_PORT, __itt_relation_is_sibling_of, input_port, FLOW_INPUT_PORT); } static inline void fgt_graph( void *g ) { itt_make_task_group( d1::ITT_DOMAIN_FLOW, g, FLOW_GRAPH, nullptr, FLOW_NULL, FLOW_GRAPH ); } static inline void fgt_begin_body( void *body ) { itt_task_begin( d1::ITT_DOMAIN_FLOW, body, FLOW_BODY, nullptr, FLOW_NULL, FLOW_BODY ); } static inline void fgt_end_body( void * ) { itt_task_end( d1::ITT_DOMAIN_FLOW ); } static inline void fgt_async_try_put_begin( void *node, void *port ) { itt_task_begin( d1::ITT_DOMAIN_FLOW, port, FLOW_OUTPUT_PORT, node, FLOW_NODE, FLOW_OUTPUT_PORT ); } static inline void fgt_async_try_put_end( void *, void * ) { itt_task_end( d1::ITT_DOMAIN_FLOW ); } static inline void fgt_async_reserve( void *node, void *graph ) { itt_region_begin( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE, graph, FLOW_GRAPH, FLOW_NULL ); } static inline void fgt_async_commit( void *node, void * /*graph*/) { itt_region_end( d1::ITT_DOMAIN_FLOW, node, FLOW_NODE ); } static inline void fgt_reserve_wait( void *graph ) { itt_region_begin( d1::ITT_DOMAIN_FLOW, graph, FLOW_GRAPH, nullptr, FLOW_NULL, FLOW_NULL ); } static inline void fgt_release_wait( void *graph ) { itt_region_end( d1::ITT_DOMAIN_FLOW, graph, FLOW_GRAPH ); } #else // TBB_USE_PROFILING_TOOLS #define CODEPTR() nullptr static inline void fgt_alias_port(void * /*node*/, void * /*p*/, bool /*visible*/ ) { } static inline void fgt_composite ( void* /*codeptr*/, void * /*node*/, void * /*graph*/ ) { } static inline void fgt_graph( void * /*g*/ ) { } template< typename NodeType > static inline void fgt_multioutput_node_desc( const NodeType * /*node*/, const char * /*desc*/ ) { } template< typename NodeType > static inline void fgt_node_desc( const NodeType * /*node*/, const char * /*desc*/ ) { } static inline void fgt_graph_desc( const void * /*g*/, const char * /*desc*/ ) { } template< int N, typename PortsTuple > static inline void fgt_multioutput_node( void* /*codeptr*/, string_resource_index 
/*t*/, void * /*g*/, void * /*input_port*/, PortsTuple & /*ports*/ ) { } template< int N, typename PortsTuple > static inline void fgt_multioutput_node_with_body( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, PortsTuple & /*ports*/, void * /*body*/ ) { } template< int N, typename PortsTuple > static inline void fgt_multiinput_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, PortsTuple & /*ports*/, void * /*output_port*/ ) { } static inline void fgt_multiinput_multioutput_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*node*/, void * /*graph*/ ) { } static inline void fgt_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, void * /*output_port*/ ) { } static inline void fgt_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, void * /*decrement_port*/, void * /*output_port*/ ) { } static inline void fgt_node_with_body( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*output_port*/, void * /*body*/ ) { } static inline void fgt_node_with_body( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, void * /*output_port*/, void * /*body*/ ) { } static inline void fgt_make_edge( void * /*output_port*/, void * /*input_port*/ ) { } static inline void fgt_remove_edge( void * /*output_port*/, void * /*input_port*/ ) { } static inline void fgt_begin_body( void * /*body*/ ) { } static inline void fgt_end_body( void * /*body*/) { } static inline void fgt_async_try_put_begin( void * /*node*/, void * /*port*/ ) { } static inline void fgt_async_try_put_end( void * /*node*/ , void * /*port*/ ) { } static inline void fgt_async_reserve( void * /*node*/, void * /*graph*/ ) { } static inline void fgt_async_commit( void * /*node*/, void * /*graph*/ ) { } static inline void fgt_reserve_wait( void * /*graph*/ ) { } static inline void fgt_release_wait( void * /*graph*/ ) { } template< 
typename NodeType > void fgt_multiinput_multioutput_node_desc( const NodeType * /*node*/, const char * /*desc*/ ) { } template < typename PortsTuple, int N > struct fgt_internal_input_alias_helper { static void alias_port( void * /*node*/, PortsTuple & /*ports*/ ) { } }; template < typename PortsTuple, int N > struct fgt_internal_output_alias_helper { static void alias_port( void * /*node*/, PortsTuple & /*ports*/ ) { } }; #endif // TBB_USE_PROFILING_TOOLS } // d2 } // namespace detail } // namespace tbb #endif // _FGT_GRAPH_TRACE_IMPL_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_flow_graph_types_impl.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB__flow_graph_types_impl_H #define __TBB__flow_graph_types_impl_H #ifndef __TBB_flow_graph_H #error Do not #include this internal file directly; use public TBB headers instead. #endif // included in namespace tbb::detail::d2 // the change to key_matching (adding a K and KHash template parameter, making it a class) // means we have to pass this data to the key_matching_port. All the ports have only one // template parameter, so we have to wrap the following types in a trait: // // . K == key_type // . KHash == hash and compare for Key // . TtoK == function_body that given an object of T, returns its K // . 
T == type accepted by port, and stored in the hash table // // The port will have an additional parameter on node construction, which is a function_body // that accepts a const T& and returns a K which is the field in T which is its K. template struct KeyTrait { typedef Kp K; typedef Tp T; typedef type_to_key_function_body TtoK; typedef KHashp KHash; }; // wrap each element of a tuple in a template, and make a tuple of the result. template class PT, typename TypeTuple> struct wrap_tuple_elements; // A wrapper that generates the traits needed for each port of a key-matching join, // and the type of the tuple of input ports. template class PT, typename KeyTraits, typename TypeTuple> struct wrap_key_tuple_elements; template class PT, typename... Args> struct wrap_tuple_elements >{ typedef typename std::tuple... > type; }; template class PT, typename KeyTraits, typename... Args> struct wrap_key_tuple_elements > { typedef typename KeyTraits::key_type K; typedef typename KeyTraits::hash_compare_type KHash; typedef typename std::tuple >... > type; }; template< int... S > class sequence {}; template< int N, int... S > struct make_sequence : make_sequence < N - 1, N - 1, S... > {}; template< int... S > struct make_sequence < 0, S... > { typedef sequence type; }; template struct alignment_of { typedef struct { char t; U padded; } test_alignment; static const size_t value = sizeof(test_alignment) - sizeof(U); }; template struct max_alignment_helper; template struct max_alignment_helper { using type = typename max_alignment_helper::type>::type; }; template struct max_alignment_helper { using type = typename std::conditional::type; }; template using max_alignment_helper_t = typename max_alignment_helper::type; #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) #pragma warning(push) #pragma warning(disable: 4324) // warning C4324: structure was padded due to alignment specifier #endif // T1, T2 are actual types stored. 
The space defined for T1 in the type returned // is a char array of the correct size. Type T2 should be trivially-constructible, // T1 must be explicitly managed. template struct alignas(alignof(max_alignment_helper_t)) aligned_pair { char first[sizeof(T1)]; T2 second; }; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT template struct alignas(alignof(max_alignment_helper_t)) aligned_triple { char first[sizeof(T1)]; T2 second; T3 third; }; #endif #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) #pragma warning(pop) // warning 4324 is back #endif // support for variant type // type we use when we're not storing a value struct default_constructed { }; // type which contains another type, tests for what type is contained, and references to it. // Wrapper // void CopyTo( void *newSpace) : builds a Wrapper copy of itself in newSpace // struct to allow us to copy and test the type of objects struct WrapperBase { virtual ~WrapperBase() {} virtual void CopyTo(void* /*newSpace*/) const = 0; }; // Wrapper contains a T, with the ability to test what T is. The Wrapper can be // constructed from a T, can be copy-constructed from another Wrapper, and can be // examined via value(), but not modified. 
template struct Wrapper: public WrapperBase { typedef T value_type; typedef T* pointer_type; private: T value_space; public: const value_type &value() const { return value_space; } private: Wrapper(); // on exception will ensure the Wrapper will contain only a trivially-constructed object struct _unwind_space { pointer_type space; _unwind_space(pointer_type p) : space(p) {} ~_unwind_space() { if(space) (void) new (space) Wrapper(default_constructed()); } }; public: explicit Wrapper( const T& other ) : value_space(other) { } explicit Wrapper(const Wrapper& other) = delete; void CopyTo(void* newSpace) const override { _unwind_space guard((pointer_type)newSpace); (void) new(newSpace) Wrapper(value_space); guard.space = nullptr; } ~Wrapper() { } }; // specialization for array objects template struct Wrapper : public WrapperBase { typedef T value_type; typedef T* pointer_type; // space must be untyped. typedef T ArrayType[N]; private: // The space is not of type T[N] because when copy-constructing, it would be // default-initialized and then copied to in some fashion, resulting in two // constructions and one destruction per element. If the type is char[ ], we // placement new into each element, resulting in one construction per element. static const size_t space_size = sizeof(ArrayType); char value_space[space_size]; // on exception will ensure the already-built objects will be destructed // (the value_space is a char array, so it is already trivially-destructible.) 
struct _unwind_class { pointer_type space; int already_built; _unwind_class(pointer_type p) : space(p), already_built(0) {} ~_unwind_class() { if(space) { for(size_t i = already_built; i > 0 ; --i ) space[i-1].~value_type(); (void) new(space) Wrapper(default_constructed()); } } }; public: const ArrayType &value() const { char *vp = const_cast(value_space); return reinterpret_cast(*vp); } private: Wrapper(); public: // have to explicitly construct because other decays to a const value_type* explicit Wrapper(const ArrayType& other) { _unwind_class guard((pointer_type)value_space); pointer_type vp = reinterpret_cast(&value_space); for(size_t i = 0; i < N; ++i ) { (void) new(vp++) value_type(other[i]); ++(guard.already_built); } guard.space = nullptr; } explicit Wrapper(const Wrapper& other) : WrapperBase() { // we have to do the heavy lifting to copy contents _unwind_class guard((pointer_type)value_space); pointer_type dp = reinterpret_cast(value_space); pointer_type sp = reinterpret_cast(const_cast(other.value_space)); for(size_t i = 0; i < N; ++i, ++dp, ++sp) { (void) new(dp) value_type(*sp); ++(guard.already_built); } guard.space = nullptr; } void CopyTo(void* newSpace) const override { (void) new(newSpace) Wrapper(*this); // exceptions handled in copy constructor } ~Wrapper() { // have to destroy explicitly in reverse order pointer_type vp = reinterpret_cast(&value_space); for(size_t i = N; i > 0 ; --i ) vp[i-1].~value_type(); } }; // given a tuple, return the type of the element that has the maximum alignment requirement. // Given a tuple and that type, return the number of elements of the object with the max // alignment requirement that is at least as big as the largest object in the tuple. 
template struct pick_one; template struct pick_one { typedef T1 type; }; template struct pick_one { typedef T2 type; }; template< template class Selector, typename T1, typename T2 > struct pick_max { typedef typename pick_one< (Selector::value > Selector::value), T1, T2 >::type type; }; template struct size_of { static const int value = sizeof(T); }; template< size_t N, class Tuple, template class Selector > struct pick_tuple_max { typedef typename pick_tuple_max::type LeftMaxType; typedef typename std::tuple_element::type ThisType; typedef typename pick_max::type type; }; template< class Tuple, template class Selector > struct pick_tuple_max<0, Tuple, Selector> { typedef typename std::tuple_element<0, Tuple>::type type; }; // is the specified type included in a tuple? template struct is_element_of { typedef typename std::tuple_element::type T_i; static const bool value = std::is_same::value || is_element_of::value; }; template struct is_element_of { typedef typename std::tuple_element<0, Tuple>::type T_i; static const bool value = std::is_same::value; }; // allow the construction of types that are listed tuple. If a disallowed type // construction is written, a method involving this type is created. The // type has no definition, so a syntax error is generated. template struct ERROR_Type_Not_allowed_In_Tagged_Msg_Not_Member_Of_Tuple; template struct do_if; template struct do_if { static void construct(void *mySpace, const T& x) { (void) new(mySpace) Wrapper(x); } }; template struct do_if { static void construct(void * /*mySpace*/, const T& x) { // This method is instantiated when the type T does not match any of the // element types in the Tuple in variant. ERROR_Type_Not_allowed_In_Tagged_Msg_Not_Member_Of_Tuple::bad_type(x); } }; // Tuple tells us the allowed types that variant can hold. It determines the alignment of the space in // Wrapper, and how big Wrapper is. 
// // the object can only be tested for type, and a read-only reference can be fetched by cast_to(). using tbb::detail::punned_cast; struct tagged_null_type {}; template class tagged_msg { typedef std::tuple= 6 , T5 #endif #if __TBB_VARIADIC_MAX >= 7 , T6 #endif #if __TBB_VARIADIC_MAX >= 8 , T7 #endif #if __TBB_VARIADIC_MAX >= 9 , T8 #endif #if __TBB_VARIADIC_MAX >= 10 , T9 #endif > Tuple; private: class variant { static const size_t N = std::tuple_size::value; typedef typename pick_tuple_max::type AlignType; typedef typename pick_tuple_max::type MaxSizeType; static const size_t MaxNBytes = (sizeof(Wrapper)+sizeof(AlignType)-1); static const size_t MaxNElements = MaxNBytes/sizeof(AlignType); typedef aligned_space SpaceType; SpaceType my_space; static const size_t MaxSize = sizeof(SpaceType); public: variant() { (void) new(&my_space) Wrapper(default_constructed()); } template variant( const T& x ) { do_if::value>::construct(&my_space,x); } variant(const variant& other) { const WrapperBase * h = punned_cast(&(other.my_space)); h->CopyTo(&my_space); } // assignment must destroy and re-create the Wrapper type, as there is no way // to create a Wrapper-to-Wrapper assign even if we find they agree in type. 
void operator=( const variant& rhs ) { if(&rhs != this) { WrapperBase *h = punned_cast(&my_space); h->~WrapperBase(); const WrapperBase *ch = punned_cast(&(rhs.my_space)); ch->CopyTo(&my_space); } } template const U& variant_cast_to() const { const Wrapper *h = dynamic_cast*>(punned_cast(&my_space)); if(!h) { throw_exception(exception_id::bad_tagged_msg_cast); } return h->value(); } template bool variant_is_a() const { return dynamic_cast*>(punned_cast(&my_space)) != nullptr; } bool variant_is_default_constructed() const {return variant_is_a();} ~variant() { WrapperBase *h = punned_cast(&my_space); h->~WrapperBase(); } }; //class variant TagType my_tag; variant my_msg; public: tagged_msg(): my_tag(TagType(~0)), my_msg(){} template tagged_msg(T const &index, R const &value) : my_tag(index), my_msg(value) {} template tagged_msg(T const &index, R (&value)[N]) : my_tag(index), my_msg(value) {} void set_tag(TagType const &index) {my_tag = index;} TagType tag() const {return my_tag;} template const V& cast_to() const {return my_msg.template variant_cast_to();} template bool is_a() const {return my_msg.template variant_is_a();} bool is_default_constructed() const {return my_msg.variant_is_default_constructed();} }; //class tagged_msg // template to simplify cast and test for tagged_msg in template contexts template const V& cast_to(T const &t) { return t.template cast_to(); } template bool is_a(T const &t) { return t.template is_a(); } enum op_stat { WAIT = 0, SUCCEEDED, FAILED }; #endif /* __TBB__flow_graph_types_impl_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_hash_compare.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_detail__hash_compare_H #define __TBB_detail__hash_compare_H #include #include "_containers_helpers.h" namespace tbb { namespace detail { namespace d1 { template class hash_compare { using is_transparent_hash = has_transparent_key_equal; public: using hasher = Hash; using key_equal = typename is_transparent_hash::type; hash_compare() = default; hash_compare( hasher hash, key_equal equal ) : my_hasher(hash), my_equal(equal) {} std::size_t operator()( const Key& key ) const { return std::size_t(my_hasher(key)); } bool operator()( const Key& key1, const Key& key2 ) const { return my_equal(key1, key2); } template ::type> std::size_t operator()( const K& key ) const { return std::size_t(my_hasher(key)); } template ::type> bool operator()( const K1& key1, const K2& key2 ) const { return my_equal(key1, key2); } hasher hash_function() const { return my_hasher; } key_equal key_eq() const { return my_equal; } private: hasher my_hasher; key_equal my_equal; }; // class hash_compare //! 
hash_compare that is default argument for concurrent_hash_map template class tbb_hash_compare { public: std::size_t hash( const Key& a ) const { return my_hash_func(a); } #if defined(_MSC_VER) && _MSC_VER <= 1900 #pragma warning (push) // MSVC 2015 throws a strange warning: 'std::size_t': forcing value to bool 'true' or 'false' #pragma warning (disable: 4800) #endif bool equal( const Key& a, const Key& b ) const { return my_key_equal(a, b); } #if defined(_MSC_VER) && _MSC_VER <= 1900 #pragma warning (pop) #endif private: std::hash my_hash_func; std::equal_to my_key_equal; }; } // namespace d1 #if __TBB_CPP20_CONCEPTS_PRESENT inline namespace d0 { template concept hash_compare = std::copy_constructible && requires( const std::remove_reference_t& hc, const Key& key1, const Key& key2 ) { { hc.hash(key1) } -> std::same_as; { hc.equal(key1, key2) } -> std::convertible_to; }; } // namespace d0 #endif // __TBB_CPP20_CONCEPTS_PRESENT } // namespace detail } // namespace tbb #if TBB_DEFINE_STD_HASH_SPECIALIZATIONS namespace std { template struct hash> { public: std::size_t operator()( const std::pair& p ) const { return first_hash(p.first) ^ second_hash(p.second); } private: std::hash first_hash; std::hash second_hash; }; // struct hash // Apple clang and MSVC defines their own specializations for std::hash> #if !(_LIBCPP_VERSION) && !(_CPPLIB_VER) template struct hash> { public: std::size_t operator()( const std::basic_string& s ) const { std::size_t h = 0; for ( const CharT* c = s.c_str(); *c; ++c ) { h = h * hash_multiplier ^ char_hash(*c); } return h; } private: static constexpr std::size_t hash_multiplier = tbb::detail::select_size_t_constant<2654435769U, 11400714819323198485ULL>::value; std::hash char_hash; }; // struct hash #endif // !(_LIBCPP_VERSION || _CPPLIB_VER) } // namespace std #endif // TBB_DEFINE_STD_HASH_SPECIALIZATIONS #endif // __TBB_detail__hash_compare_H ================================================ FILE: 
third-party/tbb/include/oneapi/tbb/detail/_intrusive_list_node.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef _TBB_detail__intrusive_list_node_H #define _TBB_detail__intrusive_list_node_H namespace tbb { namespace detail { namespace d1 { //! Data structure to be inherited by the types that can form intrusive lists. /** Intrusive list is formed by means of the member_intrusive_list template class. Note that type T must derive from intrusive_list_node either publicly or declare instantiation member_intrusive_list as a friend. This class implements a limited subset of std::list interface. **/ struct intrusive_list_node { intrusive_list_node* my_prev_node{}; intrusive_list_node* my_next_node{}; #if TBB_USE_ASSERT intrusive_list_node() { my_prev_node = my_next_node = this; } #endif /* TBB_USE_ASSERT */ }; } // namespace d1 } // namespace detail } // namespace tbb #endif // _TBB_detail__intrusive_list_node_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_machine.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_detail__machine_H #define __TBB_detail__machine_H #include "_config.h" #include "_assert.h" #include #include #include #include #ifdef _WIN32 #include #ifdef __TBBMALLOC_BUILD #define WIN32_LEAN_AND_MEAN #ifndef NOMINMAX #define NOMINMAX #endif #include // SwitchToThread() #endif #ifdef _MSC_VER #if __TBB_x86_64 || __TBB_x86_32 #pragma intrinsic(__rdtsc) #endif #endif #endif #if __TBB_x86_64 || __TBB_x86_32 #include // _mm_pause #endif #if (_WIN32) #include // _control87 #endif #if __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN #include // sched_yield #else #include // std::this_thread::yield() #endif namespace tbb { namespace detail { inline namespace d0 { //-------------------------------------------------------------------------------------------------- // Yield implementation //-------------------------------------------------------------------------------------------------- #if __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN static inline void yield() { int err = sched_yield(); __TBB_ASSERT_EX(err == 0, "sched_yield has failed"); } #elif __TBBMALLOC_BUILD && _WIN32 // Use Windows API for yield in tbbmalloc to avoid dependency on C++ runtime with some implementations. 
static inline void yield() { SwitchToThread(); } #else using std::this_thread::yield; #endif //-------------------------------------------------------------------------------------------------- // atomic_fence_seq_cst implementation //-------------------------------------------------------------------------------------------------- static inline void atomic_fence_seq_cst() { #if (__TBB_x86_64 || __TBB_x86_32) && defined(__GNUC__) && __GNUC__ < 11 unsigned char dummy = 0u; __asm__ __volatile__ ("lock; notb %0" : "+m" (dummy) :: "memory"); #else std::atomic_thread_fence(std::memory_order_seq_cst); #endif } //-------------------------------------------------------------------------------------------------- // Pause implementation //-------------------------------------------------------------------------------------------------- static inline void machine_pause(int32_t delay) { #if __TBB_x86_64 || __TBB_x86_32 while (delay-- > 0) { _mm_pause(); } #elif __ARM_ARCH_7A__ || __aarch64__ while (delay-- > 0) { __asm__ __volatile__("isb sy" ::: "memory"); } #else /* Generic */ (void)delay; // suppress without including _template_helpers.h yield(); #endif } //////////////////////////////////////////////////////////////////////////////////////////////////// // tbb::detail::log2() implementation //////////////////////////////////////////////////////////////////////////////////////////////////// // TODO: Use log2p1() function that will be available in C++20 standard #if defined(__GNUC__) || defined(__clang__) namespace gnu_builtins { inline uintptr_t clz(unsigned int x) { return static_cast(__builtin_clz(x)); } inline uintptr_t clz(unsigned long int x) { return static_cast(__builtin_clzl(x)); } inline uintptr_t clz(unsigned long long int x) { return static_cast(__builtin_clzll(x)); } } #elif defined(_MSC_VER) #pragma intrinsic(__TBB_W(_BitScanReverse)) namespace msvc_intrinsics { static inline uintptr_t bit_scan_reverse(uintptr_t i) { unsigned long j; __TBB_W(_BitScanReverse)( 
&j, i ); return j; } } #endif template constexpr std::uintptr_t number_of_bits() { return sizeof(T) * CHAR_BIT; } // logarithm is the index of the most significant non-zero bit static inline uintptr_t machine_log2(uintptr_t x) { #if defined(__GNUC__) || defined(__clang__) // If P is a power of 2 and x() - 1) ^ gnu_builtins::clz(x); #elif defined(_MSC_VER) return msvc_intrinsics::bit_scan_reverse(x); #elif __i386__ || __i386 /*for Sun OS*/ || __MINGW32__ uintptr_t j, i = x; __asm__("bsr %1,%0" : "=r"(j) : "r"(i)); return j; #elif __powerpc__ || __POWERPC__ #if __TBB_WORDSIZE==8 __asm__ __volatile__ ("cntlzd %0,%0" : "+r"(x)); return 63 - static_cast(x); #else __asm__ __volatile__ ("cntlzw %0,%0" : "+r"(x)); return 31 - static_cast(x); #endif /*__TBB_WORDSIZE*/ #elif __sparc uint64_t count; // one hot encode x |= (x >> 1); x |= (x >> 2); x |= (x >> 4); x |= (x >> 8); x |= (x >> 16); x |= (x >> 32); // count 1's __asm__ ("popc %1, %0" : "=r"(count) : "r"(x) ); return count - 1; #else intptr_t result = 0; if( sizeof(x) > 4 && (uintptr_t tmp = x >> 32) ) { x = tmp; result += 32; } if( uintptr_t tmp = x >> 16 ) { x = tmp; result += 16; } if( uintptr_t tmp = x >> 8 ) { x = tmp; result += 8; } if( uintptr_t tmp = x >> 4 ) { x = tmp; result += 4; } if( uintptr_t tmp = x >> 2 ) { x = tmp; result += 2; } return (x & 2) ? 
result + 1 : result; #endif } //////////////////////////////////////////////////////////////////////////////////////////////////// // tbb::detail::reverse_bits() implementation //////////////////////////////////////////////////////////////////////////////////////////////////// #if TBB_USE_CLANG_BITREVERSE_BUILTINS namespace llvm_builtins { inline uint8_t builtin_bitreverse(uint8_t x) { return __builtin_bitreverse8 (x); } inline uint16_t builtin_bitreverse(uint16_t x) { return __builtin_bitreverse16(x); } inline uint32_t builtin_bitreverse(uint32_t x) { return __builtin_bitreverse32(x); } inline uint64_t builtin_bitreverse(uint64_t x) { return __builtin_bitreverse64(x); } } #else // generic template struct reverse { static const T byte_table[256]; }; template const T reverse::byte_table[256] = { 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0, 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, 0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, 0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, 0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1, 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, 0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 
0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, 0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB, 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF }; inline unsigned char reverse_byte(unsigned char src) { return reverse::byte_table[src]; } #endif // TBB_USE_CLANG_BITREVERSE_BUILTINS template T machine_reverse_bits(T src) { #if TBB_USE_CLANG_BITREVERSE_BUILTINS return builtin_bitreverse(fixed_width_cast(src)); #else /* Generic */ T dst; unsigned char *original = reinterpret_cast(&src); unsigned char *reversed = reinterpret_cast(&dst); for ( int i = sizeof(T) - 1; i >= 0; i-- ) { reversed[i] = reverse_byte( original[sizeof(T) - i - 1] ); } return dst; #endif // TBB_USE_CLANG_BITREVERSE_BUILTINS } } // inline namespace d0 namespace d1 { #if (_WIN32) // API to retrieve/update FPU control setting #define __TBB_CPU_CTL_ENV_PRESENT 1 struct cpu_ctl_env { unsigned int x87cw{}; #if (__TBB_x86_64) // Changing the infinity mode or the floating-point precision is not supported on x64. // The attempt causes an assertion. 
See // https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/control87-controlfp-control87-2 static constexpr unsigned int X87CW_CONTROL_MASK = _MCW_DN | _MCW_EM | _MCW_RC; #else static constexpr unsigned int X87CW_CONTROL_MASK = ~0U; #endif #if (__TBB_x86_32 || __TBB_x86_64) unsigned int mxcsr{}; static constexpr unsigned int MXCSR_CONTROL_MASK = ~0x3fu; /* all except last six status bits */ #endif bool operator!=( const cpu_ctl_env& ctl ) const { return #if (__TBB_x86_32 || __TBB_x86_64) mxcsr != ctl.mxcsr || #endif x87cw != ctl.x87cw; } void get_env() { x87cw = _control87(0, 0); #if (__TBB_x86_32 || __TBB_x86_64) mxcsr = _mm_getcsr(); #endif } void set_env() const { _control87(x87cw, X87CW_CONTROL_MASK); #if (__TBB_x86_32 || __TBB_x86_64) _mm_setcsr(mxcsr & MXCSR_CONTROL_MASK); #endif } }; #elif (__TBB_x86_32 || __TBB_x86_64) // API to retrieve/update FPU control setting #define __TBB_CPU_CTL_ENV_PRESENT 1 struct cpu_ctl_env { int mxcsr{}; short x87cw{}; static const int MXCSR_CONTROL_MASK = ~0x3f; /* all except last six status bits */ bool operator!=(const cpu_ctl_env& ctl) const { return mxcsr != ctl.mxcsr || x87cw != ctl.x87cw; } void get_env() { __asm__ __volatile__( "stmxcsr %0\n\t" "fstcw %1" : "=m"(mxcsr), "=m"(x87cw) ); mxcsr &= MXCSR_CONTROL_MASK; } void set_env() const { __asm__ __volatile__( "ldmxcsr %0\n\t" "fldcw %1" : : "m"(mxcsr), "m"(x87cw) ); } }; #endif } // namespace d1 } // namespace detail } // namespace tbb #if !__TBB_CPU_CTL_ENV_PRESENT #include #include namespace tbb { namespace detail { namespace r1 { void* __TBB_EXPORTED_FUNC cache_aligned_allocate(std::size_t size); void __TBB_EXPORTED_FUNC cache_aligned_deallocate(void* p); } // namespace r1 namespace d1 { class cpu_ctl_env { fenv_t *my_fenv_ptr; public: cpu_ctl_env() : my_fenv_ptr(nullptr) {} ~cpu_ctl_env() { if ( my_fenv_ptr ) r1::cache_aligned_deallocate( (void*)my_fenv_ptr ); } // It is possible not to copy memory but just to copy pointers but the following issues 
should be addressed: // 1. The arena lifetime and the context lifetime are independent; // 2. The user is allowed to recapture different FPU settings to context so 'current FPU settings' inside // dispatch loop may become invalid. // But do we really want to improve the fenv implementation? It seems to be better to replace the fenv implementation // with a platform specific implementation. cpu_ctl_env( const cpu_ctl_env &src ) : my_fenv_ptr(nullptr) { *this = src; } cpu_ctl_env& operator=( const cpu_ctl_env &src ) { __TBB_ASSERT( src.my_fenv_ptr, nullptr); if ( !my_fenv_ptr ) my_fenv_ptr = (fenv_t*)r1::cache_aligned_allocate(sizeof(fenv_t)); *my_fenv_ptr = *src.my_fenv_ptr; return *this; } bool operator!=( const cpu_ctl_env &ctl ) const { __TBB_ASSERT( my_fenv_ptr, "cpu_ctl_env is not initialized." ); __TBB_ASSERT( ctl.my_fenv_ptr, "cpu_ctl_env is not initialized." ); return std::memcmp( (void*)my_fenv_ptr, (void*)ctl.my_fenv_ptr, sizeof(fenv_t) ); } void get_env () { if ( !my_fenv_ptr ) my_fenv_ptr = (fenv_t*)r1::cache_aligned_allocate(sizeof(fenv_t)); fegetenv( my_fenv_ptr ); } const cpu_ctl_env& set_env () const { __TBB_ASSERT( my_fenv_ptr, "cpu_ctl_env is not initialized." ); fesetenv( my_fenv_ptr ); return *this; } }; } // namespace d1 } // namespace detail } // namespace tbb #endif /* !__TBB_CPU_CTL_ENV_PRESENT */ #endif // __TBB_detail__machine_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_mutex_common.h ================================================ /* Copyright (c) 2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_detail__mutex_common_H #define __TBB_detail__mutex_common_H #include "_config.h" #include "_utils.h" #if __TBB_CPP20_CONCEPTS_PRESENT #include namespace tbb { namespace detail { inline namespace d0 { template concept mutex_scoped_lock = std::default_initializable && std::constructible_from && requires( Lock& lock, Mutex& mutex ) { lock.acquire(mutex); { lock.try_acquire(mutex) } -> adaptive_same_as; lock.release(); }; template concept rw_mutex_scoped_lock = mutex_scoped_lock && std::constructible_from && requires( Lock& lock, Mutex& mutex ) { lock.acquire(mutex, false); { lock.try_acquire(mutex, false) } -> adaptive_same_as; { lock.upgrade_to_writer() } -> adaptive_same_as; { lock.downgrade_to_reader() } -> adaptive_same_as; }; template concept scoped_lockable = mutex_scoped_lock; template concept rw_scoped_lockable = scoped_lockable && rw_mutex_scoped_lock; } // namespace d0 } // namespace detail } // namespace tbb #endif // __TBB_CPP20_CONCEPTS_PRESENT #endif // __TBB_detail__mutex_common_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_namespace_injection.h ================================================ /* Copyright (c) 2020-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ // All public entities of the OneAPI Spec are available under oneapi namespace // Define tbb namespace first as it might not be known yet namespace tbb {} namespace oneapi { namespace tbb = ::tbb; } ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_node_handle.h ================================================ /* Copyright (c) 2019-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_detail__node_handle_H #define __TBB_detail__node_handle_H #include "_allocator_traits.h" #include "_assert.h" namespace tbb { namespace detail { namespace d1 { // A structure to access private node handle methods in internal TBB classes // Regular friend declaration is not convenient because classes which use node handle // can be placed in the different versioning namespaces. 
struct node_handle_accessor { template static typename NodeHandleType::node* get_node_ptr( NodeHandleType& nh ) { return nh.get_node_ptr(); } template static NodeHandleType construct( typename NodeHandleType::node* node_ptr ) { return NodeHandleType{node_ptr}; } template static void deactivate( NodeHandleType& nh ) { nh.deactivate(); } }; // struct node_handle_accessor template class node_handle_base { public: using allocator_type = Allocator; protected: using node = Node; using allocator_traits_type = tbb::detail::allocator_traits; public: node_handle_base() : my_node(nullptr), my_allocator() {} node_handle_base(node_handle_base&& nh) : my_node(nh.my_node), my_allocator(std::move(nh.my_allocator)) { nh.my_node = nullptr; } __TBB_nodiscard bool empty() const { return my_node == nullptr; } explicit operator bool() const { return my_node != nullptr; } ~node_handle_base() { internal_destroy(); } node_handle_base& operator=( node_handle_base&& nh ) { internal_destroy(); my_node = nh.my_node; move_assign_allocators(my_allocator, nh.my_allocator); nh.deactivate(); return *this; } void swap( node_handle_base& nh ) { using std::swap; swap(my_node, nh.my_node); swap_allocators(my_allocator, nh.my_allocator); } allocator_type get_allocator() const { return my_allocator; } protected: node_handle_base( node* n ) : my_node(n) {} void internal_destroy() { if(my_node != nullptr) { allocator_traits_type::destroy(my_allocator, my_node->storage()); typename allocator_traits_type::template rebind_alloc node_allocator(my_allocator); node_allocator.deallocate(my_node, 1); } } node* get_node_ptr() { return my_node; } void deactivate() { my_node = nullptr; } node* my_node; allocator_type my_allocator; }; // node handle for maps template class node_handle : public node_handle_base { using base_type = node_handle_base; public: using key_type = Key; using mapped_type = typename Value::second_type; using allocator_type = typename base_type::allocator_type; node_handle() = default; key_type& 
key() const { __TBB_ASSERT(!this->empty(), "Cannot get key from the empty node_type object"); return *const_cast(&(this->my_node->value().first)); } mapped_type& mapped() const { __TBB_ASSERT(!this->empty(), "Cannot get mapped value from the empty node_type object"); return this->my_node->value().second; } private: friend struct node_handle_accessor; node_handle( typename base_type::node* n ) : base_type(n) {} }; // class node_handle // node handle for sets template class node_handle : public node_handle_base { using base_type = node_handle_base; public: using value_type = Key; using allocator_type = typename base_type::allocator_type; node_handle() = default; value_type& value() const { __TBB_ASSERT(!this->empty(), "Cannot get value from the empty node_type object"); return *const_cast(&(this->my_node->value())); } private: friend struct node_handle_accessor; node_handle( typename base_type::node* n ) : base_type(n) {} }; // class node_handle template void swap( node_handle& lhs, node_handle& rhs ) { return lhs.swap(rhs); } } // namespace d1 } // namespace detail } // namespace tbb #endif // __TBB_detail__node_handle_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_pipeline_filters.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_parallel_filters_H #define __TBB_parallel_filters_H #include "_config.h" #include "_task.h" #include "_pipeline_filters_deduction.h" #include "../tbb_allocator.h" #include #include namespace tbb { namespace detail { namespace d1 { class base_filter; } namespace d2 { template __TBB_requires(std::copyable) class input_node; } namespace r1 { TBB_EXPORT void __TBB_EXPORTED_FUNC set_end_of_input(d1::base_filter&); class pipeline; class stage_task; class input_buffer; } namespace d1 { class filter_node; //! A stage in a pipeline. /** @ingroup algorithms */ class base_filter{ private: //! Value used to mark "not in pipeline" static base_filter* not_in_pipeline() { return reinterpret_cast(std::intptr_t(-1)); } public: //! The lowest bit 0 is for parallel vs serial static constexpr unsigned int filter_is_serial = 0x1; //! 2nd bit distinguishes ordered vs unordered filters. static constexpr unsigned int filter_is_out_of_order = 0x1<<1; //! 3rd bit marks input filters emitting small objects static constexpr unsigned int filter_may_emit_null = 0x1<<2; base_filter(const base_filter&) = delete; base_filter& operator=(const base_filter&) = delete; protected: explicit base_filter( unsigned int m ) : next_filter_in_pipeline(not_in_pipeline()), my_input_buffer(nullptr), my_filter_mode(m), my_pipeline(nullptr) {} // signal end-of-input for concrete_filters void set_end_of_input() { r1::set_end_of_input(*this); } public: //! True if filter is serial. bool is_serial() const { return bool( my_filter_mode & filter_is_serial ); } //! True if filter must receive stream in order. bool is_ordered() const { return (my_filter_mode & filter_is_serial) && !(my_filter_mode & filter_is_out_of_order); } //! true if an input filter can emit null bool object_may_be_null() { return ( my_filter_mode & filter_may_emit_null ) == filter_may_emit_null; } //! Operate on an item from the input stream, and return item for output stream. /** Returns nullptr if filter is a sink. 
*/ virtual void* operator()( void* item ) = 0; //! Destroy filter. virtual ~base_filter() {}; //! Destroys item if pipeline was cancelled. /** Required to prevent memory leaks. Note it can be called concurrently even for serial filters.*/ virtual void finalize( void* /*item*/ ) {} private: //! Pointer to next filter in the pipeline. base_filter* next_filter_in_pipeline; //! Buffer for incoming tokens, or nullptr if not required. /** The buffer is required if the filter is serial. */ r1::input_buffer* my_input_buffer; friend class r1::stage_task; friend class r1::pipeline; friend void r1::set_end_of_input(d1::base_filter&); //! Storage for filter mode and dynamically checked implementation version. const unsigned int my_filter_mode; //! Pointer to the pipeline. r1::pipeline* my_pipeline; }; template class concrete_filter; //! input_filter control to signal end-of-input for parallel_pipeline class flow_control { bool is_pipeline_stopped = false; flow_control() = default; template friend class concrete_filter; template __TBB_requires(std::copyable) friend class d2::input_node; public: void stop() { is_pipeline_stopped = true; } }; // Emulate std::is_trivially_copyable (false positives not allowed, false negatives suboptimal but safe). 
#if __TBB_CPP11_TYPE_PROPERTIES_PRESENT
// The standard trait is available: forward to it.
template<typename T> using tbb_trivially_copyable = std::is_trivially_copyable<T>;
#else
// Fallback without the standard trait. Only the types explicitly listed below
// report `value == true`; every other type reports `false` through the primary template.
template<typename T> struct tbb_trivially_copyable                      { enum { value = false }; };
template<typename T> struct tbb_trivially_copyable <         T*       > { enum { value = true }; };
template<>           struct tbb_trivially_copyable <         bool     > { enum { value = true }; };
template<>           struct tbb_trivially_copyable <         char     > { enum { value = true }; };
template<>           struct tbb_trivially_copyable <  signed char     > { enum { value = true }; };
template<>           struct tbb_trivially_copyable <unsigned char     > { enum { value = true }; };
template<>           struct tbb_trivially_copyable <         short    > { enum { value = true }; };
template<>           struct tbb_trivially_copyable <unsigned short    > { enum { value = true }; };
template<>           struct tbb_trivially_copyable <         int      > { enum { value = true }; };
template<>           struct tbb_trivially_copyable <unsigned int      > { enum { value = true }; };
template<>           struct tbb_trivially_copyable <         long     > { enum { value = true }; };
template<>           struct tbb_trivially_copyable <unsigned long     > { enum { value = true }; };
template<>           struct tbb_trivially_copyable <         long long> { enum { value = true }; };
template<>           struct tbb_trivially_copyable <unsigned long long> { enum { value = true }; };
template<>           struct tbb_trivially_copyable <         float    > { enum { value = true }; };
template<>           struct tbb_trivially_copyable <         double   > { enum { value = true }; };
template<>           struct tbb_trivially_copyable <    long double   > { enum { value = true }; };
#endif // __TBB_CPP11_TYPE_PROPERTIES_PRESENT

// Compile-time predicate over T.
// `value` is true when T is larger than a `void*` or when tbb_trivially_copyable<T>
// does not report T as trivially copyable; it is false otherwise.
template<typename T>
struct use_allocator {
    static constexpr bool value = sizeof(T) > sizeof(void *) || !tbb_trivially_copyable<T>::value;
};

// A helper class to customize how a type is passed between filters.
// Usage: token_helper::value> template struct token_helper; // using tbb_allocator template struct token_helper { using pointer = T*; using value_type = T; static pointer create_token(value_type && source) { return new (r1::allocate_memory(sizeof(T))) T(std::move(source)); } static value_type & token(pointer & t) { return *t; } static void * cast_to_void_ptr(pointer ref) { return reinterpret_cast(ref); } static pointer cast_from_void_ptr(void * ref) { return reinterpret_cast(ref); } static void destroy_token(pointer token) { token->~value_type(); r1::deallocate_memory(token); } }; // pointer specialization template struct token_helper { using pointer = T*; using value_type = T*; static pointer create_token(const value_type & source) { return source; } static value_type & token(pointer & t) { return t; } static void * cast_to_void_ptr(pointer ref) { return reinterpret_cast(ref); } static pointer cast_from_void_ptr(void * ref) { return reinterpret_cast(ref); } static void destroy_token( pointer /*token*/) {} }; // converting type to and from void*, passing objects directly template struct token_helper { typedef union { T actual_value; void * void_overlay; } type_to_void_ptr_map; using pointer = T; // not really a pointer in this case. 
using value_type = T; static pointer create_token(const value_type & source) { return source; } static value_type & token(pointer & t) { return t; } static void * cast_to_void_ptr(pointer ref) { type_to_void_ptr_map mymap; mymap.void_overlay = nullptr; mymap.actual_value = ref; return mymap.void_overlay; } static pointer cast_from_void_ptr(void * ref) { type_to_void_ptr_map mymap; mymap.void_overlay = ref; return mymap.actual_value; } static void destroy_token( pointer /*token*/) {} }; // intermediate template class concrete_filter: public base_filter { const Body& my_body; using input_helper = token_helper::value>; using input_pointer = typename input_helper::pointer; using output_helper = token_helper::value>; using output_pointer = typename output_helper::pointer; void* operator()(void* input) override { input_pointer temp_input = input_helper::cast_from_void_ptr(input); output_pointer temp_output = output_helper::create_token(tbb::detail::invoke(my_body, std::move(input_helper::token(temp_input)))); input_helper::destroy_token(temp_input); return output_helper::cast_to_void_ptr(temp_output); } void finalize(void * input) override { input_pointer temp_input = input_helper::cast_from_void_ptr(input); input_helper::destroy_token(temp_input); } public: concrete_filter(unsigned int m, const Body& body) : base_filter(m), my_body(body) {} }; // input template class concrete_filter: public base_filter { const Body& my_body; using output_helper = token_helper::value>; using output_pointer = typename output_helper::pointer; void* operator()(void*) override { flow_control control; output_pointer temp_output = output_helper::create_token(my_body(control)); if(control.is_pipeline_stopped) { output_helper::destroy_token(temp_output); set_end_of_input(); return nullptr; } return output_helper::cast_to_void_ptr(temp_output); } public: concrete_filter(unsigned int m, const Body& body) : base_filter(m | filter_may_emit_null), my_body(body) {} }; // output template class 
concrete_filter: public base_filter { const Body& my_body; using input_helper = token_helper::value>; using input_pointer = typename input_helper::pointer; void* operator()(void* input) override { input_pointer temp_input = input_helper::cast_from_void_ptr(input); tbb::detail::invoke(my_body, std::move(input_helper::token(temp_input))); input_helper::destroy_token(temp_input); return nullptr; } void finalize(void* input) override { input_pointer temp_input = input_helper::cast_from_void_ptr(input); input_helper::destroy_token(temp_input); } public: concrete_filter(unsigned int m, const Body& body) : base_filter(m), my_body(body) {} }; template class concrete_filter: public base_filter { const Body& my_body; void* operator()(void*) override { flow_control control; my_body(control); void* output = control.is_pipeline_stopped ? nullptr : (void*)(std::intptr_t)-1; return output; } public: concrete_filter(unsigned int m, const Body& body) : base_filter(m), my_body(body) {} }; class filter_node_ptr { filter_node * my_node; public: filter_node_ptr() : my_node(nullptr) {} filter_node_ptr(filter_node *); ~filter_node_ptr(); filter_node_ptr(const filter_node_ptr &); filter_node_ptr(filter_node_ptr &&); void operator=(filter_node *); void operator=(const filter_node_ptr &); void operator=(filter_node_ptr &&); filter_node& operator*() const; operator bool() const; }; //! Abstract base class that represents a node in a parse tree underlying a filter class. /** These nodes are always heap-allocated and can be shared by filter objects. */ class filter_node { /** Count must be atomic because it is hidden state for user, but might be shared by threads. 
*/ std::atomic ref_count; public: filter_node_ptr left; filter_node_ptr right; protected: filter_node() : ref_count(0), left(nullptr), right(nullptr) { #ifdef __TBB_TEST_FILTER_NODE_COUNT ++(__TBB_TEST_FILTER_NODE_COUNT); #endif } public: filter_node(const filter_node_ptr& x, const filter_node_ptr& y) : filter_node(){ left = x; right = y; } filter_node(const filter_node&) = delete; filter_node& operator=(const filter_node&) = delete; //! Add concrete_filter to pipeline virtual base_filter* create_filter() const { __TBB_ASSERT(false, "method of non-leaf was called"); return nullptr; } //! Increment reference count void add_ref() { ref_count.fetch_add(1, std::memory_order_relaxed); } //! Decrement reference count and delete if it becomes zero. void remove_ref() { __TBB_ASSERT(ref_count>0,"ref_count underflow"); if( ref_count.fetch_sub(1, std::memory_order_relaxed) == 1 ) { this->~filter_node(); r1::deallocate_memory(this); } } virtual ~filter_node() { #ifdef __TBB_TEST_FILTER_NODE_COUNT --(__TBB_TEST_FILTER_NODE_COUNT); #endif } }; inline filter_node_ptr::filter_node_ptr(filter_node * nd) : my_node(nd) { if (my_node) { my_node->add_ref(); } } inline filter_node_ptr::~filter_node_ptr() { if (my_node) { my_node->remove_ref(); } } inline filter_node_ptr::filter_node_ptr(const filter_node_ptr & rhs) : my_node(rhs.my_node) { if (my_node) { my_node->add_ref(); } } inline filter_node_ptr::filter_node_ptr(filter_node_ptr && rhs) : my_node(rhs.my_node) { rhs.my_node = nullptr; } inline void filter_node_ptr::operator=(filter_node * rhs) { // Order of operations below carefully chosen so that reference counts remain correct // in unlikely event that remove_ref throws exception. 
filter_node* old = my_node; my_node = rhs; if (my_node) { my_node->add_ref(); } if (old) { old->remove_ref(); } } inline void filter_node_ptr::operator=(const filter_node_ptr & rhs) { *this = rhs.my_node; } inline void filter_node_ptr::operator=(filter_node_ptr && rhs) { filter_node* old = my_node; my_node = rhs.my_node; rhs.my_node = nullptr; if (old) { old->remove_ref(); } } inline filter_node& filter_node_ptr::operator*() const{ __TBB_ASSERT(my_node,"nullptr node is used"); return *my_node; } inline filter_node_ptr::operator bool() const { return my_node != nullptr; } //! Node in parse tree representing result of make_filter. template class filter_node_leaf: public filter_node { const unsigned int my_mode; const Body my_body; base_filter* create_filter() const override { return new(r1::allocate_memory(sizeof(concrete_filter))) concrete_filter(my_mode,my_body); } public: filter_node_leaf( unsigned int m, const Body& b ) : my_mode(m), my_body(b) {} }; template ::input_type> using filter_input = typename std::conditional::value, void, Input>::type; template using filter_output = typename filter_body_types::output_type; } // namespace d1 } // namespace detail } // namespace tbb #endif /* __TBB_parallel_filters_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_pipeline_filters_deduction.h ================================================ /* Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB__pipeline_filters_deduction_H #define __TBB__pipeline_filters_deduction_H #include "_config.h" #include #include namespace tbb { namespace detail { namespace d1 { template struct declare_filter_types { using input_type = typename std::remove_const::type>::type; using output_type = typename std::remove_const::type>::type; }; template struct filter_body_types; template struct filter_body_types : declare_filter_types {}; template struct filter_body_types : declare_filter_types {}; } // namespace d1 } // namespace detail } // namespace tbb #endif // __TBB__pipeline_filters_deduction_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_range_common.h ================================================ /* Copyright (c) 2005-2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_detail__range_common_H #define __TBB_detail__range_common_H #include "_config.h" #include "_utils.h" #if __TBB_CPP20_CONCEPTS_PRESENT #include #endif #include namespace tbb { namespace detail { inline namespace d0 { //! Dummy type that distinguishes splitting constructor from copy constructor. /** * See description of parallel_for and parallel_reduce for example usages. * @ingroup algorithms */ class split {}; //! Type enables transmission of splitting proportion from partitioners to range objects /** * In order to make use of such facility Range objects must implement * splitting constructor with this type passed. 
*/ class proportional_split : no_assign { public: proportional_split(size_t _left = 1, size_t _right = 1) : my_left(_left), my_right(_right) { } size_t left() const { return my_left; } size_t right() const { return my_right; } // used when range does not support proportional split explicit operator split() const { return split(); } private: size_t my_left, my_right; }; template struct range_split_object_provider { template static split get( PartitionerSplitType& ) { return split(); } }; template struct range_split_object_provider::value>::type> { template static PartitionerSplitType& get( PartitionerSplitType& split_obj ) { return split_obj; } }; template auto get_range_split_object( PartitionerSplitType& split_obj ) -> decltype(range_split_object_provider::get(split_obj)) { return range_split_object_provider::get(split_obj); } template using range_iterator_type = decltype(std::begin(std::declval())); #if __TBB_CPP20_CONCEPTS_PRESENT template using iterator_reference_type = typename std::iterator_traits::reference; template using range_reference_type = iterator_reference_type>; template concept blocked_range_value = std::copyable && requires( const std::remove_reference_t& lhs, const std::remove_reference_t& rhs ) { { lhs < rhs } -> relaxed_convertible_to; { lhs - rhs } -> std::convertible_to; { lhs + (rhs - lhs) } -> std::convertible_to; }; template concept splittable = std::constructible_from; template concept tbb_range = std::copy_constructible && splittable && requires( const std::remove_reference_t& range ) { { range.empty() } -> relaxed_convertible_to; { range.is_divisible() } -> relaxed_convertible_to; }; template struct iterator_concept_helper; // New specializations should be added in case of using container_based_sequence with // the new iterator tag types template struct iterator_concept_helper { static constexpr bool value = std::input_iterator; }; template struct iterator_concept_helper { static constexpr bool value = std::random_access_iterator; }; 
template concept iterator_satisfies = iterator_concept_helper::value; template concept container_based_sequence = requires( Sequence& seq ) { { std::begin(seq) } -> iterator_satisfies; { std::end(seq) } -> iterator_satisfies; }; #endif // __TBB_CPP20_CONCEPTS_PRESENT } // namespace d0 } // namespace detail } // namespace tbb #endif // __TBB_detail__range_common_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_rtm_mutex.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB__rtm_mutex_impl_H #define __TBB__rtm_mutex_impl_H #include "_assert.h" #include "_utils.h" #include "../spin_mutex.h" #include "../profiling.h" namespace tbb { namespace detail { namespace r1 { struct rtm_mutex_impl; } namespace d1 { #if _MSC_VER && !defined(__INTEL_COMPILER) // Suppress warning: structure was padded due to alignment specifier #pragma warning (push) #pragma warning (disable: 4324) #endif /** A rtm_mutex is an speculation-enabled spin mutex. It should be used for locking short critical sections where the lock is contended but the data it protects are not. If zero-initialized, the mutex is considered unheld. @ingroup synchronization */ class alignas(max_nfs_size) rtm_mutex : private spin_mutex { private: enum class rtm_state { rtm_none, rtm_transacting, rtm_real }; public: //! 
Constructors rtm_mutex() noexcept { create_itt_sync(this, "tbb::speculative_spin_mutex", ""); } //! Destructor ~rtm_mutex() = default; //! Represents acquisition of a mutex. class scoped_lock { public: friend class rtm_mutex; //! Construct lock that has not acquired a mutex. /** Equivalent to zero-initialization of *this. */ constexpr scoped_lock() : m_mutex(nullptr), m_transaction_state(rtm_state::rtm_none) {} //! Acquire lock on given mutex. scoped_lock(rtm_mutex& m) : m_mutex(nullptr), m_transaction_state(rtm_state::rtm_none) { acquire(m); } //! Release lock (if lock is held). ~scoped_lock() { if(m_transaction_state != rtm_state::rtm_none) { release(); } } //! No Copy scoped_lock(const scoped_lock&) = delete; scoped_lock& operator=(const scoped_lock&) = delete; //! Acquire lock on given mutex. void acquire(rtm_mutex& m); //! Try acquire lock on given mutex. bool try_acquire(rtm_mutex& m); //! Release lock void release(); private: rtm_mutex* m_mutex; rtm_state m_transaction_state; friend r1::rtm_mutex_impl; }; //! Mutex traits static constexpr bool is_rw_mutex = false; static constexpr bool is_recursive_mutex = false; static constexpr bool is_fair_mutex = false; private: friend r1::rtm_mutex_impl; }; // end of rtm_mutex } // namespace d1 namespace r1 { //! Internal acquire lock. // only_speculate == true if we're doing a try_lock, else false. TBB_EXPORT void __TBB_EXPORTED_FUNC acquire(d1::rtm_mutex&, d1::rtm_mutex::scoped_lock&, bool only_speculate = false); //! Internal try_acquire lock. TBB_EXPORT bool __TBB_EXPORTED_FUNC try_acquire(d1::rtm_mutex&, d1::rtm_mutex::scoped_lock&); //! Internal release lock. TBB_EXPORT void __TBB_EXPORTED_FUNC release(d1::rtm_mutex::scoped_lock&); } // namespace r1 namespace d1 { //! Acquire lock on given mutex. inline void rtm_mutex::scoped_lock::acquire(rtm_mutex& m) { __TBB_ASSERT(!m_mutex, "lock is already acquired"); r1::acquire(m, *this); } //! Try acquire lock on given mutex. 
inline bool rtm_mutex::scoped_lock::try_acquire(rtm_mutex& m) { __TBB_ASSERT(!m_mutex, "lock is already acquired"); return r1::try_acquire(m, *this); } //! Release lock inline void rtm_mutex::scoped_lock::release() { __TBB_ASSERT(m_mutex, "lock is not acquired"); __TBB_ASSERT(m_transaction_state != rtm_state::rtm_none, "lock is not acquired"); return r1::release(*this); } #if _MSC_VER && !defined(__INTEL_COMPILER) #pragma warning (pop) // 4324 warning #endif #if TBB_USE_PROFILING_TOOLS inline void set_name(rtm_mutex& obj, const char* name) { itt_set_sync_name(&obj, name); } #if (_WIN32||_WIN64) inline void set_name(rtm_mutex& obj, const wchar_t* name) { itt_set_sync_name(&obj, name); } #endif // WIN #else inline void set_name(rtm_mutex&, const char*) {} #if (_WIN32||_WIN64) inline void set_name(rtm_mutex&, const wchar_t*) {} #endif // WIN #endif } // namespace d1 } // namespace detail } // namespace tbb #endif /* __TBB__rtm_mutex_impl_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_rtm_rw_mutex.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_detail__rtm_rw_mutex_H #define __TBB_detail__rtm_rw_mutex_H #include "_assert.h" #include "_utils.h" #include "../spin_rw_mutex.h" #include namespace tbb { namespace detail { namespace r1 { struct rtm_rw_mutex_impl; } namespace d1 { constexpr std::size_t speculation_granularity = 64; #if _MSC_VER && !defined(__INTEL_COMPILER) // Suppress warning: structure was padded due to alignment specifier #pragma warning (push) #pragma warning (disable: 4324) #endif //! Fast, unfair, spinning speculation-enabled reader-writer lock with backoff and writer-preference /** @ingroup synchronization */ class alignas(max_nfs_size) rtm_rw_mutex : private spin_rw_mutex { friend struct r1::rtm_rw_mutex_impl; private: enum class rtm_type { rtm_not_in_mutex, rtm_transacting_reader, rtm_transacting_writer, rtm_real_reader, rtm_real_writer }; public: //! Constructors rtm_rw_mutex() noexcept : write_flag(false) { create_itt_sync(this, "tbb::speculative_spin_rw_mutex", ""); } //! Destructor ~rtm_rw_mutex() = default; //! Represents acquisition of a mutex. class scoped_lock { friend struct r1::rtm_rw_mutex_impl; public: //! Construct lock that has not acquired a mutex. /** Equivalent to zero-initialization of *this. */ constexpr scoped_lock() : m_mutex(nullptr), m_transaction_state(rtm_type::rtm_not_in_mutex) {} //! Acquire lock on given mutex. scoped_lock(rtm_rw_mutex& m, bool write = true) : m_mutex(nullptr), m_transaction_state(rtm_type::rtm_not_in_mutex) { acquire(m, write); } //! Release lock (if lock is held). ~scoped_lock() { if(m_transaction_state != rtm_type::rtm_not_in_mutex) { release(); } } //! No Copy scoped_lock(const scoped_lock&) = delete; scoped_lock& operator=(const scoped_lock&) = delete; //! Acquire lock on given mutex. inline void acquire(rtm_rw_mutex& m, bool write = true); //! Try acquire lock on given mutex. inline bool try_acquire(rtm_rw_mutex& m, bool write = true); //! Release lock inline void release(); //! Upgrade reader to become a writer. 
/** Returns whether the upgrade happened without releasing and re-acquiring the lock */ inline bool upgrade_to_writer(); //! Downgrade writer to become a reader. inline bool downgrade_to_reader(); inline bool is_writer() const; private: rtm_rw_mutex* m_mutex; rtm_type m_transaction_state; }; //! Mutex traits static constexpr bool is_rw_mutex = true; static constexpr bool is_recursive_mutex = false; static constexpr bool is_fair_mutex = false; private: alignas(speculation_granularity) std::atomic write_flag; }; #if _MSC_VER && !defined(__INTEL_COMPILER) #pragma warning (pop) // 4324 warning #endif } // namespace d1 namespace r1 { //! Internal acquire write lock. // only_speculate == true if we're doing a try_lock, else false. TBB_EXPORT void __TBB_EXPORTED_FUNC acquire_writer(d1::rtm_rw_mutex&, d1::rtm_rw_mutex::scoped_lock&, bool only_speculate = false); //! Internal acquire read lock. // only_speculate == true if we're doing a try_lock, else false. TBB_EXPORT void __TBB_EXPORTED_FUNC acquire_reader(d1::rtm_rw_mutex&, d1::rtm_rw_mutex::scoped_lock&, bool only_speculate = false); //! Internal upgrade reader to become a writer. TBB_EXPORT bool __TBB_EXPORTED_FUNC upgrade(d1::rtm_rw_mutex::scoped_lock&); //! Internal downgrade writer to become a reader. TBB_EXPORT bool __TBB_EXPORTED_FUNC downgrade(d1::rtm_rw_mutex::scoped_lock&); //! Internal try_acquire write lock. TBB_EXPORT bool __TBB_EXPORTED_FUNC try_acquire_writer(d1::rtm_rw_mutex&, d1::rtm_rw_mutex::scoped_lock&); //! Internal try_acquire read lock. TBB_EXPORT bool __TBB_EXPORTED_FUNC try_acquire_reader(d1::rtm_rw_mutex&, d1::rtm_rw_mutex::scoped_lock&); //! Internal release lock. TBB_EXPORT void __TBB_EXPORTED_FUNC release(d1::rtm_rw_mutex::scoped_lock&); } namespace d1 { //! Acquire lock on given mutex. 
void rtm_rw_mutex::scoped_lock::acquire(rtm_rw_mutex& m, bool write) { __TBB_ASSERT(!m_mutex, "lock is already acquired"); if (write) { r1::acquire_writer(m, *this); } else { r1::acquire_reader(m, *this); } } //! Try acquire lock on given mutex. bool rtm_rw_mutex::scoped_lock::try_acquire(rtm_rw_mutex& m, bool write) { __TBB_ASSERT(!m_mutex, "lock is already acquired"); if (write) { return r1::try_acquire_writer(m, *this); } else { return r1::try_acquire_reader(m, *this); } } //! Release lock void rtm_rw_mutex::scoped_lock::release() { __TBB_ASSERT(m_mutex, "lock is not acquired"); __TBB_ASSERT(m_transaction_state != rtm_type::rtm_not_in_mutex, "lock is not acquired"); return r1::release(*this); } //! Upgrade reader to become a writer. /** Returns whether the upgrade happened without releasing and re-acquiring the lock */ bool rtm_rw_mutex::scoped_lock::upgrade_to_writer() { __TBB_ASSERT(m_mutex, "lock is not acquired"); if (m_transaction_state == rtm_type::rtm_transacting_writer || m_transaction_state == rtm_type::rtm_real_writer) { return true; // Already a writer } return r1::upgrade(*this); } //! Downgrade writer to become a reader. 
bool rtm_rw_mutex::scoped_lock::downgrade_to_reader() { __TBB_ASSERT(m_mutex, "lock is not acquired"); if (m_transaction_state == rtm_type::rtm_transacting_reader || m_transaction_state == rtm_type::rtm_real_reader) { return true; // Already a reader } return r1::downgrade(*this); } bool rtm_rw_mutex::scoped_lock::is_writer() const { __TBB_ASSERT(m_mutex, "lock is not acquired"); return m_transaction_state == rtm_type::rtm_transacting_writer || m_transaction_state == rtm_type::rtm_real_writer; } #if TBB_USE_PROFILING_TOOLS inline void set_name(rtm_rw_mutex& obj, const char* name) { itt_set_sync_name(&obj, name); } #if (_WIN32||_WIN64) inline void set_name(rtm_rw_mutex& obj, const wchar_t* name) { itt_set_sync_name(&obj, name); } #endif // WIN #else inline void set_name(rtm_rw_mutex&, const char*) {} #if (_WIN32||_WIN64) inline void set_name(rtm_rw_mutex&, const wchar_t*) {} #endif // WIN #endif } // namespace d1 } // namespace detail } // namespace tbb #endif // __TBB_detail__rtm_rw_mutex_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_scoped_lock.h ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_detail_scoped_lock_H #define __TBB_detail_scoped_lock_H namespace tbb { namespace detail { namespace d1 { // unique_scoped_lock supposes that Mutex operations never throw template class unique_scoped_lock { //! 
Points to currently held Mutex, or nullptr if no lock is held. Mutex* m_mutex{}; public: //! Construct without acquiring a Mutex. constexpr unique_scoped_lock() noexcept : m_mutex(nullptr) {} //! Construct and acquire lock on a Mutex. unique_scoped_lock(Mutex& m) { acquire(m); } //! No Copy unique_scoped_lock(const unique_scoped_lock&) = delete; unique_scoped_lock& operator=(const unique_scoped_lock&) = delete; //! Acquire lock. void acquire(Mutex& m) { __TBB_ASSERT(m_mutex == nullptr, "The mutex is already acquired"); m_mutex = &m; m.lock(); } //! Try acquiring lock (non-blocking) /** Return true if lock acquired; false otherwise. */ bool try_acquire(Mutex& m) { __TBB_ASSERT(m_mutex == nullptr, "The mutex is already acquired"); bool succeed = m.try_lock(); if (succeed) { m_mutex = &m; } return succeed; } //! Release lock void release() { __TBB_ASSERT(m_mutex, "release on Mutex::unique_scoped_lock that is not holding a lock"); m_mutex->unlock(); m_mutex = nullptr; } //! Destroy lock. If holding a lock, releases the lock first. ~unique_scoped_lock() { if (m_mutex) { release(); } } }; // rw_scoped_lock supposes that Mutex operations never throw template class rw_scoped_lock { public: //! Construct lock that has not acquired a mutex. /** Equivalent to zero-initialization of *this. */ constexpr rw_scoped_lock() noexcept {} //! Acquire lock on given mutex. rw_scoped_lock(Mutex& m, bool write = true) { acquire(m, write); } //! Release lock (if lock is held). ~rw_scoped_lock() { if (m_mutex) { release(); } } //! No Copy rw_scoped_lock(const rw_scoped_lock&) = delete; rw_scoped_lock& operator=(const rw_scoped_lock&) = delete; //! Acquire lock on given mutex. void acquire(Mutex& m, bool write = true) { __TBB_ASSERT(m_mutex == nullptr, "The mutex is already acquired"); m_is_writer = write; m_mutex = &m; if (write) { m_mutex->lock(); } else { m_mutex->lock_shared(); } } //! Try acquire lock on given mutex. bool try_acquire(Mutex& m, bool write = true) { bool succeed = write ? 
m.try_lock() : m.try_lock_shared(); if (succeed) { m_mutex = &m; m_is_writer = write; } return succeed; } //! Release lock. void release() { __TBB_ASSERT(m_mutex != nullptr, "The mutex is not acquired"); Mutex* m = m_mutex; m_mutex = nullptr; if (m_is_writer) { m->unlock(); } else { m->unlock_shared(); } } //! Upgrade reader to become a writer. /** Returns whether the upgrade happened without releasing and re-acquiring the lock */ bool upgrade_to_writer() { __TBB_ASSERT(m_mutex != nullptr, "The mutex is not acquired"); if (m_is_writer) { return true; // Already a writer } m_is_writer = true; return m_mutex->upgrade(); } //! Downgrade writer to become a reader. bool downgrade_to_reader() { __TBB_ASSERT(m_mutex != nullptr, "The mutex is not acquired"); if (m_is_writer) { m_mutex->downgrade(); m_is_writer = false; } return true; } bool is_writer() const { __TBB_ASSERT(m_mutex != nullptr, "The mutex is not acquired"); return m_is_writer; } protected: //! The pointer to the current mutex that is held, or nullptr if no mutex is held. Mutex* m_mutex {nullptr}; //! If mutex != nullptr, then is_writer is true if holding a writer lock, false if holding a reader lock. /** Not defined if not holding a lock. */ bool m_is_writer {false}; }; } // namespace d1 } // namespace detail } // namespace tbb #endif // __TBB_detail_scoped_lock_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_segment_table.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_detail__segment_table_H #define __TBB_detail__segment_table_H #include "_config.h" #include "_allocator_traits.h" #include "_template_helpers.h" #include "_utils.h" #include "_assert.h" #include "_exception.h" #include #include #include #include #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) #pragma warning(push) #pragma warning(disable: 4127) // warning C4127: conditional expression is constant #endif namespace tbb { namespace detail { namespace d1 { template class segment_table { public: using value_type = T; using segment_type = T*; using atomic_segment = std::atomic; using segment_table_type = atomic_segment*; using size_type = std::size_t; using segment_index_type = std::size_t; using allocator_type = Allocator; using allocator_traits_type = tbb::detail::allocator_traits; using segment_table_allocator_type = typename allocator_traits_type::template rebind_alloc; protected: using segment_table_allocator_traits = tbb::detail::allocator_traits; using derived_type = DerivedType; static constexpr size_type pointers_per_embedded_table = PointersPerEmbeddedTable; static constexpr size_type pointers_per_long_table = sizeof(size_type) * 8; public: segment_table( const allocator_type& alloc = allocator_type() ) : my_segment_table_allocator(alloc), my_segment_table(nullptr) , my_first_block{}, my_size{}, my_segment_table_allocation_failed{} { my_segment_table.store(my_embedded_table, std::memory_order_relaxed); zero_table(my_embedded_table, pointers_per_embedded_table); } segment_table( const segment_table& other ) : my_segment_table_allocator(segment_table_allocator_traits:: select_on_container_copy_construction(other.my_segment_table_allocator)) , my_segment_table(nullptr), my_first_block{}, my_size{}, my_segment_table_allocation_failed{} { my_segment_table.store(my_embedded_table, std::memory_order_relaxed); zero_table(my_embedded_table, 
pointers_per_embedded_table); try_call( [&] { internal_transfer(other, copy_segment_body_type{*this}); } ).on_exception( [&] { clear(); }); } segment_table( const segment_table& other, const allocator_type& alloc ) : my_segment_table_allocator(alloc), my_segment_table(nullptr) , my_first_block{}, my_size{}, my_segment_table_allocation_failed{} { my_segment_table.store(my_embedded_table, std::memory_order_relaxed); zero_table(my_embedded_table, pointers_per_embedded_table); try_call( [&] { internal_transfer(other, copy_segment_body_type{*this}); } ).on_exception( [&] { clear(); }); } segment_table( segment_table&& other ) : my_segment_table_allocator(std::move(other.my_segment_table_allocator)), my_segment_table(nullptr) , my_first_block{}, my_size{}, my_segment_table_allocation_failed{} { my_segment_table.store(my_embedded_table, std::memory_order_relaxed); zero_table(my_embedded_table, pointers_per_embedded_table); internal_move(std::move(other)); } segment_table( segment_table&& other, const allocator_type& alloc ) : my_segment_table_allocator(alloc), my_segment_table(nullptr), my_first_block{} , my_size{}, my_segment_table_allocation_failed{} { my_segment_table.store(my_embedded_table, std::memory_order_relaxed); zero_table(my_embedded_table, pointers_per_embedded_table); using is_equal_type = typename segment_table_allocator_traits::is_always_equal; internal_move_construct_with_allocator(std::move(other), alloc, is_equal_type()); } ~segment_table() { clear(); } segment_table& operator=( const segment_table& other ) { if (this != &other) { copy_assign_allocators(my_segment_table_allocator, other.my_segment_table_allocator); internal_transfer(other, copy_segment_body_type{*this}); } return *this; } segment_table& operator=( segment_table&& other ) noexcept(derived_type::is_noexcept_assignment) { using pocma_type = typename segment_table_allocator_traits::propagate_on_container_move_assignment; using is_equal_type = typename 
segment_table_allocator_traits::is_always_equal; if (this != &other) { move_assign_allocators(my_segment_table_allocator, other.my_segment_table_allocator); internal_move_assign(std::move(other), tbb::detail::disjunction()); } return *this; } void swap( segment_table& other ) noexcept(derived_type::is_noexcept_swap) { using is_equal_type = typename segment_table_allocator_traits::is_always_equal; using pocs_type = typename segment_table_allocator_traits::propagate_on_container_swap; if (this != &other) { swap_allocators(my_segment_table_allocator, other.my_segment_table_allocator); internal_swap(other, tbb::detail::disjunction()); } } segment_type get_segment( segment_index_type index ) const { return get_table()[index] + segment_base(index); } value_type& operator[]( size_type index ) { return internal_subscript(index); } const value_type& operator[]( size_type index ) const { return const_cast(this)->internal_subscript(index); } const segment_table_allocator_type& get_allocator() const { return my_segment_table_allocator; } segment_table_allocator_type& get_allocator() { return my_segment_table_allocator; } void enable_segment( segment_type& segment, segment_table_type table, segment_index_type seg_index, size_type index ) { // Allocate new segment segment_type new_segment = self()->create_segment(table, seg_index, index); if (new_segment != nullptr) { // Store (new_segment - segment_base) into the segment table to allow access to the table by index via // my_segment_table[segment_index_of(index)][index] segment_type disabled_segment = nullptr; if (!table[seg_index].compare_exchange_strong(disabled_segment, new_segment - segment_base(seg_index))) { // compare_exchange failed => some other thread has already enabled this segment // Deallocate the memory self()->deallocate_segment(new_segment, seg_index); } } segment = table[seg_index].load(std::memory_order_acquire); __TBB_ASSERT(segment != nullptr, "If create_segment returned nullptr, the element should be stored 
in the table"); } void delete_segment( segment_index_type seg_index ) { segment_type segment_to_delete = self()->nullify_segment(get_table(), seg_index); if (segment_to_delete == segment_allocation_failure_tag) { return; } segment_to_delete += segment_base(seg_index); // Deallocate the segment self()->destroy_segment(segment_to_delete, seg_index); } size_type number_of_segments( segment_table_type table ) const { // Check for an active table, if it is embedded table - return the number of embedded segments // Otherwise - return the maximum number of segments return table == my_embedded_table ? pointers_per_embedded_table : pointers_per_long_table; } size_type capacity() const noexcept { segment_table_type table = get_table(); size_type num_segments = number_of_segments(table); for (size_type seg_index = 0; seg_index < num_segments; ++seg_index) { // Check if the pointer is valid (allocated) if (table[seg_index].load(std::memory_order_relaxed) <= segment_allocation_failure_tag) { return segment_base(seg_index); } } return segment_base(num_segments); } size_type find_last_allocated_segment( segment_table_type table ) const noexcept { size_type end = 0; size_type num_segments = number_of_segments(table); for (size_type seg_index = 0; seg_index < num_segments; ++seg_index) { // Check if the pointer is valid (allocated) if (table[seg_index].load(std::memory_order_relaxed) > segment_allocation_failure_tag) { end = seg_index + 1; } } return end; } void reserve( size_type n ) { if (n > allocator_traits_type::max_size(my_segment_table_allocator)) { throw_exception(exception_id::reservation_length_error); } size_type size = my_size.load(std::memory_order_relaxed); segment_index_type start_seg_idx = size == 0 ? 
0 : segment_index_of(size - 1) + 1; for (segment_index_type seg_idx = start_seg_idx; segment_base(seg_idx) < n; ++seg_idx) { size_type first_index = segment_base(seg_idx); internal_subscript(first_index); } } void clear() { clear_segments(); clear_table(); my_size.store(0, std::memory_order_relaxed); my_first_block.store(0, std::memory_order_relaxed); } void clear_segments() { segment_table_type current_segment_table = get_table(); for (size_type i = number_of_segments(current_segment_table); i != 0; --i) { if (current_segment_table[i - 1].load(std::memory_order_relaxed) != nullptr) { // If the segment was enabled - disable and deallocate it delete_segment(i - 1); } } } void destroy_and_deallocate_table(segment_table_type table, size_type num_segments) { auto& alloc = get_allocator(); for (size_type seg_idx = 0; seg_idx < num_segments; ++seg_idx) { segment_table_allocator_traits::destroy(alloc, &table[seg_idx]); } segment_table_allocator_traits::deallocate(alloc, table, num_segments); } void clear_table() { segment_table_type current_segment_table = get_table(); if (current_segment_table != my_embedded_table) { // If the active table is not the embedded one - deallocate the active table destroy_and_deallocate_table(current_segment_table, pointers_per_long_table); my_segment_table.store(my_embedded_table, std::memory_order_relaxed); zero_table(my_embedded_table, pointers_per_embedded_table); } } void extend_table_if_necessary(segment_table_type& table, size_type start_index, size_type end_index) { // Extend segment table if an active table is an embedded one and the requested index is // outside it if (table == my_embedded_table && end_index > embedded_table_size) { if (start_index <= embedded_table_size) { // More than one thread can get here: the one that has assigned the first block and // is in the process of allocating it now, and the one that saw the first block has // been assigned already, but not yet allocated. 
This latter thread decides not to // wait for the first one and extend the table itself. try_call([&] { segment_table_type new_table = self()->allocate_long_table(my_embedded_table, start_index); // It is possible that the table was extended by the thread that allocated first // block. In this case, the below CAS fails and re-reads the new table pointer. if (my_segment_table.compare_exchange_strong( table, new_table, /*memory order in case of a success*/std::memory_order_release, /*memory order in case of a failure*/std::memory_order_acquire)) { // CAS was successful, update the local table pointer with now actual table = new_table; } else if (new_table) { // Other thread was the first to replace the segment table. Current thread's // table is not needed anymore, so destroying it. destroy_and_deallocate_table(new_table, pointers_per_long_table); } }).on_exception([&] { my_segment_table_allocation_failed.store(true, std::memory_order_relaxed); }); } else { atomic_backoff backoff; do { if (my_segment_table_allocation_failed.load(std::memory_order_relaxed)) { throw_exception(exception_id::bad_alloc); } backoff.pause(); table = my_segment_table.load(std::memory_order_acquire); } while (table == my_embedded_table); } } } // Return the segment where index is stored static constexpr segment_index_type segment_index_of( size_type index ) { return size_type(tbb::detail::log2(uintptr_t(index|1))); } // Needed to calculate the offset in segment static constexpr size_type segment_base( size_type index ) { return size_type(1) << index & ~size_type(1); } // Return size of the segment static constexpr size_type segment_size( size_type index ) { return index == 0 ? 
2 : size_type(1) << index; } private: derived_type* self() { return static_cast(this); } struct copy_segment_body_type { void operator()( segment_index_type index, segment_type from, segment_type to ) const { my_instance.self()->copy_segment(index, from, to); } segment_table& my_instance; }; struct move_segment_body_type { void operator()( segment_index_type index, segment_type from, segment_type to ) const { my_instance.self()->move_segment(index, from, to); } segment_table& my_instance; }; // Transgers all segments from the other table template void internal_transfer( const segment_table& other, TransferBody transfer_segment ) { static_cast(this)->destroy_elements(); assign_first_block_if_necessary(other.my_first_block.load(std::memory_order_relaxed)); my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); segment_table_type other_table = other.get_table(); size_type end_segment_size = segment_size(other.find_last_allocated_segment(other_table)); // If an exception occurred in other, then the size may be greater than the size of the end segment. size_type other_size = end_segment_size < other.my_size.load(std::memory_order_relaxed) ? other.my_size.load(std::memory_order_relaxed) : end_segment_size; other_size = my_segment_table_allocation_failed ? 
embedded_table_size : other_size; for (segment_index_type i = 0; segment_base(i) < other_size; ++i) { // If the segment in other table is enabled - transfer it if (other_table[i].load(std::memory_order_relaxed) == segment_allocation_failure_tag) { my_size = segment_base(i); break; } else if (other_table[i].load(std::memory_order_relaxed) != nullptr) { internal_subscript(segment_base(i)); transfer_segment(i, other.get_table()[i].load(std::memory_order_relaxed) + segment_base(i), get_table()[i].load(std::memory_order_relaxed) + segment_base(i)); } } } // Moves the other segment table // Only equal allocators are allowed void internal_move( segment_table&& other ) { // NOTE: allocators should be equal clear(); my_first_block.store(other.my_first_block.load(std::memory_order_relaxed), std::memory_order_relaxed); my_size.store(other.my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); // If an active table in other is embedded - restore all of the embedded segments if (other.get_table() == other.my_embedded_table) { for ( size_type i = 0; i != pointers_per_embedded_table; ++i ) { segment_type other_segment = other.my_embedded_table[i].load(std::memory_order_relaxed); my_embedded_table[i].store(other_segment, std::memory_order_relaxed); other.my_embedded_table[i].store(nullptr, std::memory_order_relaxed); } my_segment_table.store(my_embedded_table, std::memory_order_relaxed); } else { my_segment_table.store(other.my_segment_table, std::memory_order_relaxed); other.my_segment_table.store(other.my_embedded_table, std::memory_order_relaxed); zero_table(other.my_embedded_table, pointers_per_embedded_table); } other.my_size.store(0, std::memory_order_relaxed); } // Move construct the segment table with the allocator object // if any instances of allocator_type are always equal void internal_move_construct_with_allocator( segment_table&& other, const allocator_type&, /*is_always_equal = */ std::true_type ) { internal_move(std::move(other)); } // Move construct 
the segment table with the allocator object // if any instances of allocator_type are always equal void internal_move_construct_with_allocator( segment_table&& other, const allocator_type& alloc, /*is_always_equal = */ std::false_type ) { if (other.my_segment_table_allocator == alloc) { // If allocators are equal - restore pointers internal_move(std::move(other)); } else { // If allocators are not equal - perform per element move with reallocation try_call( [&] { internal_transfer(other, move_segment_body_type{*this}); } ).on_exception( [&] { clear(); }); } } // Move assigns the segment table to other is any instances of allocator_type are always equal // or propagate_on_container_move_assignment is true void internal_move_assign( segment_table&& other, /*is_always_equal || POCMA = */ std::true_type ) { internal_move(std::move(other)); } // Move assigns the segment table to other is any instances of allocator_type are not always equal // and propagate_on_container_move_assignment is false void internal_move_assign( segment_table&& other, /*is_always_equal || POCMA = */ std::false_type ) { if (my_segment_table_allocator == other.my_segment_table_allocator) { // If allocators are equal - restore pointers internal_move(std::move(other)); } else { // If allocators are not equal - perform per element move with reallocation internal_transfer(other, move_segment_body_type{*this}); } } // Swaps two segment tables if any instances of allocator_type are always equal // or propagate_on_container_swap is true void internal_swap( segment_table& other, /*is_always_equal || POCS = */ std::true_type ) { internal_swap_fields(other); } // Swaps two segment tables if any instances of allocator_type are not always equal // and propagate_on_container_swap is false // According to the C++ standard, swapping of two containers with unequal allocators // is an undefined behavior scenario void internal_swap( segment_table& other, /*is_always_equal || POCS = */ std::false_type ) { 
__TBB_ASSERT(my_segment_table_allocator == other.my_segment_table_allocator, "Swapping with unequal allocators is not allowed"); internal_swap_fields(other); } void internal_swap_fields( segment_table& other ) { // If an active table in either *this segment table or other is an embedded one - swaps the embedded tables if (get_table() == my_embedded_table || other.get_table() == other.my_embedded_table) { for (size_type i = 0; i != pointers_per_embedded_table; ++i) { segment_type current_segment = my_embedded_table[i].load(std::memory_order_relaxed); segment_type other_segment = other.my_embedded_table[i].load(std::memory_order_relaxed); my_embedded_table[i].store(other_segment, std::memory_order_relaxed); other.my_embedded_table[i].store(current_segment, std::memory_order_relaxed); } } segment_table_type current_segment_table = get_table(); segment_table_type other_segment_table = other.get_table(); // If an active table is an embedded one - // store an active table in other to the embedded one from other if (current_segment_table == my_embedded_table) { other.my_segment_table.store(other.my_embedded_table, std::memory_order_relaxed); } else { // Otherwise - store it to the active segment table other.my_segment_table.store(current_segment_table, std::memory_order_relaxed); } // If an active table in other segment table is an embedded one - // store an active table in other to the embedded one from *this if (other_segment_table == other.my_embedded_table) { my_segment_table.store(my_embedded_table, std::memory_order_relaxed); } else { // Otherwise - store it to the active segment table in other my_segment_table.store(other_segment_table, std::memory_order_relaxed); } auto first_block = other.my_first_block.load(std::memory_order_relaxed); other.my_first_block.store(my_first_block.load(std::memory_order_relaxed), std::memory_order_relaxed); my_first_block.store(first_block, std::memory_order_relaxed); auto size = other.my_size.load(std::memory_order_relaxed); 
other.my_size.store(my_size.load(std::memory_order_relaxed), std::memory_order_relaxed); my_size.store(size, std::memory_order_relaxed); } protected: // A flag indicates that an exception was throws during segment allocations const segment_type segment_allocation_failure_tag = reinterpret_cast(1); static constexpr size_type embedded_table_size = segment_size(pointers_per_embedded_table); template value_type& internal_subscript( size_type index ) { segment_index_type seg_index = segment_index_of(index); segment_table_type table = my_segment_table.load(std::memory_order_acquire); segment_type segment = nullptr; if (allow_out_of_range_access) { if (derived_type::allow_table_extending) { extend_table_if_necessary(table, index, index + 1); } segment = table[seg_index].load(std::memory_order_acquire); // If the required segment is disabled - enable it if (segment == nullptr) { enable_segment(segment, table, seg_index, index); } // Check if an exception was thrown during segment allocation if (segment == segment_allocation_failure_tag) { throw_exception(exception_id::bad_alloc); } } else { segment = table[seg_index].load(std::memory_order_acquire); } __TBB_ASSERT(segment != nullptr, nullptr); return segment[index]; } void assign_first_block_if_necessary(segment_index_type index) { size_type zero = 0; if (this->my_first_block.load(std::memory_order_relaxed) == zero) { this->my_first_block.compare_exchange_strong(zero, index); } } void zero_table( segment_table_type table, size_type count ) { for (size_type i = 0; i != count; ++i) { table[i].store(nullptr, std::memory_order_relaxed); } } segment_table_type get_table() const { return my_segment_table.load(std::memory_order_acquire); } segment_table_allocator_type my_segment_table_allocator; std::atomic my_segment_table; atomic_segment my_embedded_table[pointers_per_embedded_table]; // Number of segments in first block std::atomic my_first_block; // Number of elements in table std::atomic my_size; // Flag to indicate failed 
extend table std::atomic my_segment_table_allocation_failed; }; // class segment_table } // namespace d1 } // namespace detail } // namespace tbb #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) #pragma warning(pop) // warning 4127 is back #endif #endif // __TBB_detail__segment_table_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_small_object_pool.h ================================================ /* Copyright (c) 2020-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB__small_object_pool_H #define __TBB__small_object_pool_H #include "_config.h" #include "_assert.h" #include "../profiling.h" #include #include #include namespace tbb { namespace detail { namespace d1 { class small_object_pool { protected: small_object_pool() = default; }; struct execution_data; } namespace r1 { TBB_EXPORT void* __TBB_EXPORTED_FUNC allocate(d1::small_object_pool*& pool, std::size_t number_of_bytes, const d1::execution_data& ed); TBB_EXPORT void* __TBB_EXPORTED_FUNC allocate(d1::small_object_pool*& pool, std::size_t number_of_bytes); TBB_EXPORT void __TBB_EXPORTED_FUNC deallocate(d1::small_object_pool& pool, void* ptr, std::size_t number_of_bytes, const d1::execution_data& ed); TBB_EXPORT void __TBB_EXPORTED_FUNC deallocate(d1::small_object_pool& pool, void* ptr, std::size_t number_of_bytes); } namespace d1 { class small_object_allocator { public: template Type* new_object(execution_data& ed, Args&&... 
args) { void* allocated_object = r1::allocate(m_pool, sizeof(Type), ed); auto constructed_object = new(allocated_object) Type(std::forward(args)...); return constructed_object; } template Type* new_object(Args&&... args) { void* allocated_object = r1::allocate(m_pool, sizeof(Type)); auto constructed_object = new(allocated_object) Type(std::forward(args)...); return constructed_object; } template void delete_object(Type* object, const execution_data& ed) { // Copy this since it can be a member of the passed object and // unintentionally destroyed when Type destructor is called below small_object_allocator alloc = *this; object->~Type(); alloc.deallocate(object, ed); } template void delete_object(Type* object) { // Copy this since it can be a member of the passed object and // unintentionally destroyed when Type destructor is called below small_object_allocator alloc = *this; object->~Type(); alloc.deallocate(object); } template void deallocate(Type* ptr, const execution_data& ed) { call_itt_task_notify(destroy, ptr); __TBB_ASSERT(m_pool != nullptr, "Pool must be valid for deallocate call"); r1::deallocate(*m_pool, ptr, sizeof(Type), ed); } template void deallocate(Type* ptr) { call_itt_task_notify(destroy, ptr); __TBB_ASSERT(m_pool != nullptr, "Pool must be valid for deallocate call"); r1::deallocate(*m_pool, ptr, sizeof(Type)); } private: small_object_pool* m_pool{}; }; } // namespace d1 } // namespace detail } // namespace tbb #endif /* __TBB__small_object_pool_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_string_resource.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ TBB_STRING_RESOURCE(ALGORITHM, "tbb_algorithm") TBB_STRING_RESOURCE(PARALLEL_FOR, "tbb_parallel_for") TBB_STRING_RESOURCE(PARALLEL_FOR_EACH, "tbb_parallel_for_each") TBB_STRING_RESOURCE(PARALLEL_INVOKE, "tbb_parallel_invoke") TBB_STRING_RESOURCE(PARALLEL_REDUCE, "tbb_parallel_reduce") TBB_STRING_RESOURCE(PARALLEL_SCAN, "tbb_parallel_scan") TBB_STRING_RESOURCE(PARALLEL_SORT, "tbb_parallel_sort") TBB_STRING_RESOURCE(PARALLEL_PIPELINE, "tbb_parallel_pipeline") TBB_STRING_RESOURCE(CUSTOM_CTX, "tbb_custom") TBB_STRING_RESOURCE(FLOW_NULL, "null") TBB_STRING_RESOURCE(FLOW_BROADCAST_NODE, "broadcast_node") TBB_STRING_RESOURCE(FLOW_BUFFER_NODE, "buffer_node") TBB_STRING_RESOURCE(FLOW_CONTINUE_NODE, "continue_node") TBB_STRING_RESOURCE(FLOW_FUNCTION_NODE, "function_node") TBB_STRING_RESOURCE(FLOW_JOIN_NODE_QUEUEING, "join_node (queueing)") TBB_STRING_RESOURCE(FLOW_JOIN_NODE_RESERVING, "join_node (reserving)") TBB_STRING_RESOURCE(FLOW_JOIN_NODE_TAG_MATCHING, "join_node (tag_matching)") TBB_STRING_RESOURCE(FLOW_LIMITER_NODE, "limiter_node") TBB_STRING_RESOURCE(FLOW_MULTIFUNCTION_NODE, "multifunction_node") TBB_STRING_RESOURCE(FLOW_OVERWRITE_NODE, "overwrite_node") TBB_STRING_RESOURCE(FLOW_PRIORITY_QUEUE_NODE, "priority_queue_node") TBB_STRING_RESOURCE(FLOW_QUEUE_NODE, "queue_node") TBB_STRING_RESOURCE(FLOW_SEQUENCER_NODE, "sequencer_node") TBB_STRING_RESOURCE(FLOW_INPUT_NODE, "input_node") TBB_STRING_RESOURCE(FLOW_SPLIT_NODE, "split_node") TBB_STRING_RESOURCE(FLOW_WRITE_ONCE_NODE, "write_once_node") TBB_STRING_RESOURCE(FLOW_INDEXER_NODE, "indexer_node") 
TBB_STRING_RESOURCE(FLOW_COMPOSITE_NODE, "composite_node") TBB_STRING_RESOURCE(FLOW_ASYNC_NODE, "async_node") TBB_STRING_RESOURCE(FLOW_INPUT_PORT, "input_port") TBB_STRING_RESOURCE(FLOW_INPUT_PORT_0, "input_port_0") TBB_STRING_RESOURCE(FLOW_INPUT_PORT_1, "input_port_1") TBB_STRING_RESOURCE(FLOW_INPUT_PORT_2, "input_port_2") TBB_STRING_RESOURCE(FLOW_INPUT_PORT_3, "input_port_3") TBB_STRING_RESOURCE(FLOW_INPUT_PORT_4, "input_port_4") TBB_STRING_RESOURCE(FLOW_INPUT_PORT_5, "input_port_5") TBB_STRING_RESOURCE(FLOW_INPUT_PORT_6, "input_port_6") TBB_STRING_RESOURCE(FLOW_INPUT_PORT_7, "input_port_7") TBB_STRING_RESOURCE(FLOW_INPUT_PORT_8, "input_port_8") TBB_STRING_RESOURCE(FLOW_INPUT_PORT_9, "input_port_9") TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT, "output_port") TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_0, "output_port_0") TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_1, "output_port_1") TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_2, "output_port_2") TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_3, "output_port_3") TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_4, "output_port_4") TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_5, "output_port_5") TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_6, "output_port_6") TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_7, "output_port_7") TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_8, "output_port_8") TBB_STRING_RESOURCE(FLOW_OUTPUT_PORT_9, "output_port_9") TBB_STRING_RESOURCE(FLOW_OBJECT_NAME, "object_name") TBB_STRING_RESOURCE(FLOW_BODY, "body") TBB_STRING_RESOURCE(FLOW_GRAPH, "graph") TBB_STRING_RESOURCE(FLOW_NODE, "node") TBB_STRING_RESOURCE(FLOW_TASKS, "tbb_flow_graph") TBB_STRING_RESOURCE(USER_EVENT, "user_event") #if __TBB_FLOW_TRACE_CODEPTR TBB_STRING_RESOURCE(CODE_ADDRESS, "code_address") #endif ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_task.h ================================================ /* Copyright (c) 2020-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance 
with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB__task_H #define __TBB__task_H #include "_config.h" #include "_assert.h" #include "_template_helpers.h" #include "_small_object_pool.h" #include "../profiling.h" #include #include #include #include #include #include namespace tbb { namespace detail { namespace d1 { using slot_id = unsigned short; constexpr slot_id no_slot = slot_id(~0); constexpr slot_id any_slot = slot_id(~1); class task; class wait_context; class task_group_context; struct execution_data; class wait_tree_vertex_interface; class task_arena_base; } namespace d2 { class task_group; class task_group_base; } namespace r1 { //! Task spawn/wait entry points TBB_EXPORT void __TBB_EXPORTED_FUNC spawn(d1::task& t, d1::task_group_context& ctx); TBB_EXPORT void __TBB_EXPORTED_FUNC spawn(d1::task& t, d1::task_group_context& ctx, d1::slot_id id); TBB_EXPORT void __TBB_EXPORTED_FUNC execute_and_wait(d1::task& t, d1::task_group_context& t_ctx, d1::wait_context&, d1::task_group_context& w_ctx); TBB_EXPORT void __TBB_EXPORTED_FUNC wait(d1::wait_context&, d1::task_group_context& ctx); TBB_EXPORT d1::slot_id __TBB_EXPORTED_FUNC execution_slot(const d1::execution_data*); TBB_EXPORT d1::slot_id __TBB_EXPORTED_FUNC execution_slot(const d1::task_arena_base&); TBB_EXPORT d1::task_group_context* __TBB_EXPORTED_FUNC current_context(); TBB_EXPORT d1::wait_tree_vertex_interface* get_thread_reference_vertex(d1::wait_tree_vertex_interface* wc); // Do not place under __TBB_RESUMABLE_TASKS. It is a stub for unsupported platforms. 
struct suspend_point_type; using suspend_callback_type = void(*)(void*, suspend_point_type*); //! The resumable tasks entry points TBB_EXPORT void __TBB_EXPORTED_FUNC suspend(suspend_callback_type suspend_callback, void* user_callback); TBB_EXPORT void __TBB_EXPORTED_FUNC resume(suspend_point_type* tag); TBB_EXPORT suspend_point_type* __TBB_EXPORTED_FUNC current_suspend_point(); TBB_EXPORT void __TBB_EXPORTED_FUNC notify_waiters(std::uintptr_t wait_ctx_addr); class thread_data; class task_dispatcher; class external_waiter; struct task_accessor; struct task_arena_impl; } // namespace r1 namespace d1 { class task_arena; using suspend_point = r1::suspend_point_type*; #if __TBB_RESUMABLE_TASKS template static void suspend_callback(void* user_callback, suspend_point sp) { // Copy user function to a new stack after the context switch to avoid a race when the previous // suspend point is resumed while the user_callback is being called. F user_callback_copy = *static_cast(user_callback); user_callback_copy(sp); } template void suspend(F f) { r1::suspend(&suspend_callback, &f); } inline void resume(suspend_point tag) { r1::resume(tag); } #endif /* __TBB_RESUMABLE_TASKS */ // TODO align wait_context on cache lane class wait_context { static constexpr std::uint64_t overflow_mask = ~((1LLU << 32) - 1); std::uint64_t m_version_and_traits{1}; std::atomic m_ref_count{}; void add_reference(std::int64_t delta) { call_itt_task_notify(releasing, this); std::uint64_t r = m_ref_count.fetch_add(static_cast(delta)) + static_cast(delta); __TBB_ASSERT_EX((r & overflow_mask) == 0, "Overflow is detected"); if (!r) { // Some external waiters or coroutine waiters sleep in wait list // Should to notify them that work is done std::uintptr_t wait_ctx_addr = std::uintptr_t(this); r1::notify_waiters(wait_ctx_addr); } } bool continue_execution() const { std::uint64_t r = m_ref_count.load(std::memory_order_acquire); __TBB_ASSERT_EX((r & overflow_mask) == 0, "Overflow is detected"); return r > 0; } 
friend class r1::thread_data; friend class r1::task_dispatcher; friend class r1::external_waiter; friend class wait_context_vertex; friend struct r1::task_arena_impl; friend struct r1::suspend_point_type; public: // Despite the internal reference count is uin64_t we limit the user interface with uint32_t // to preserve a part of the internal reference count for special needs. wait_context(std::uint32_t ref_count) : m_ref_count{ref_count} { suppress_unused_warning(m_version_and_traits); } wait_context(const wait_context&) = delete; ~wait_context() { __TBB_ASSERT(!continue_execution(), nullptr); } void reserve(std::uint32_t delta = 1) { add_reference(delta); } void release(std::uint32_t delta = 1) { add_reference(-std::int64_t(delta)); } }; class wait_tree_vertex_interface { public: virtual void reserve(std::uint32_t delta = 1) = 0; virtual void release(std::uint32_t delta = 1) = 0; protected: virtual ~wait_tree_vertex_interface() = default; }; class wait_context_vertex : public wait_tree_vertex_interface { public: wait_context_vertex(std::uint32_t ref = 0) : m_wait(ref) {} void reserve(std::uint32_t delta = 1) override { m_wait.reserve(delta); } void release(std::uint32_t delta = 1) override { m_wait.release(delta); } wait_context& get_context() { return m_wait; } private: friend class d2::task_group; friend class d2::task_group_base; bool continue_execution() const { return m_wait.continue_execution(); } wait_context m_wait; }; class reference_vertex : public wait_tree_vertex_interface { public: reference_vertex(wait_tree_vertex_interface* parent, std::uint32_t ref_count) : my_parent{parent}, m_ref_count{ref_count} {} void reserve(std::uint32_t delta = 1) override { if (m_ref_count.fetch_add(static_cast(delta)) == 0) { my_parent->reserve(); } } void release(std::uint32_t delta = 1) override { auto parent = my_parent; std::uint64_t ref = m_ref_count.fetch_sub(static_cast(delta)) - static_cast(delta); if (ref == 0) { parent->release(); } } std::uint32_t 
get_num_child() { return static_cast(m_ref_count.load(std::memory_order_acquire)); } private: wait_tree_vertex_interface* my_parent; std::atomic m_ref_count; }; struct execution_data { task_group_context* context{}; slot_id original_slot{}; slot_id affinity_slot{}; }; inline task_group_context* context(const execution_data& ed) { return ed.context; } inline slot_id original_slot(const execution_data& ed) { return ed.original_slot; } inline slot_id affinity_slot(const execution_data& ed) { return ed.affinity_slot; } inline slot_id execution_slot(const execution_data& ed) { return r1::execution_slot(&ed); } inline bool is_same_affinity(const execution_data& ed) { return affinity_slot(ed) == no_slot || affinity_slot(ed) == execution_slot(ed); } inline bool is_stolen(const execution_data& ed) { return original_slot(ed) != execution_slot(ed); } inline void spawn(task& t, task_group_context& ctx) { call_itt_task_notify(releasing, &t); r1::spawn(t, ctx); } inline void spawn(task& t, task_group_context& ctx, slot_id id) { call_itt_task_notify(releasing, &t); r1::spawn(t, ctx, id); } inline void execute_and_wait(task& t, task_group_context& t_ctx, wait_context& wait_ctx, task_group_context& w_ctx) { r1::execute_and_wait(t, t_ctx, wait_ctx, w_ctx); call_itt_task_notify(acquired, &wait_ctx); call_itt_task_notify(destroy, &wait_ctx); } inline void wait(wait_context& wait_ctx, task_group_context& ctx) { r1::wait(wait_ctx, ctx); call_itt_task_notify(acquired, &wait_ctx); call_itt_task_notify(destroy, &wait_ctx); } using r1::current_context; class task_traits { std::uint64_t m_version_and_traits{}; friend struct r1::task_accessor; }; //! Alignment for a task object static constexpr std::size_t task_alignment = 64; //! Base class for user-defined tasks. 
/** @ingroup task_scheduling */ class alignas(task_alignment) task : public task_traits { protected: virtual ~task() = default; public: virtual task* execute(execution_data&) = 0; virtual task* cancel(execution_data&) = 0; private: std::uint64_t m_reserved[6]{}; friend struct r1::task_accessor; }; static_assert(sizeof(task) == task_alignment, "task size is broken"); } // namespace d1 } // namespace detail } // namespace tbb #endif /* __TBB__task_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_task_handle.h ================================================ /* Copyright (c) 2020-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_task_handle_H #define __TBB_task_handle_H #include "_config.h" #include "_task.h" #include "_small_object_pool.h" #include "_utils.h" #include namespace tbb { namespace detail { namespace d1 { class task_group_context; class wait_context; struct execution_data; } namespace d2 { class task_handle; class task_handle_task : public d1::task { std::uint64_t m_version_and_traits{}; d1::wait_tree_vertex_interface* m_wait_tree_vertex; d1::task_group_context& m_ctx; d1::small_object_allocator m_allocator; public: void finalize(const d1::execution_data* ed = nullptr) { if (ed) { m_allocator.delete_object(this, *ed); } else { m_allocator.delete_object(this); } } task_handle_task(d1::wait_tree_vertex_interface* vertex, d1::task_group_context& ctx, d1::small_object_allocator& alloc) : m_wait_tree_vertex(vertex) , m_ctx(ctx) , m_allocator(alloc) { suppress_unused_warning(m_version_and_traits); m_wait_tree_vertex->reserve(); } ~task_handle_task() override { m_wait_tree_vertex->release(); } d1::task_group_context& ctx() const { return m_ctx; } }; class task_handle { struct task_handle_task_finalizer_t{ void operator()(task_handle_task* p){ p->finalize(); } }; using handle_impl_t = std::unique_ptr; handle_impl_t m_handle = {nullptr}; public: task_handle() = default; task_handle(task_handle&&) = default; task_handle& operator=(task_handle&&) = default; explicit operator bool() const noexcept { return static_cast(m_handle); } friend bool operator==(task_handle const& th, std::nullptr_t) noexcept; friend bool operator==(std::nullptr_t, task_handle const& th) noexcept; friend bool operator!=(task_handle const& th, std::nullptr_t) noexcept; friend bool operator!=(std::nullptr_t, task_handle const& th) noexcept; private: friend struct task_handle_accessor; task_handle(task_handle_task* t) : m_handle {t}{}; d1::task* release() { return m_handle.release(); } }; struct task_handle_accessor { static task_handle construct(task_handle_task* t) { return {t}; } static d1::task* 
release(task_handle& th) { return th.release(); } static d1::task_group_context& ctx_of(task_handle& th) { __TBB_ASSERT(th.m_handle, "ctx_of does not expect empty task_handle."); return th.m_handle->ctx(); } }; inline bool operator==(task_handle const& th, std::nullptr_t) noexcept { return th.m_handle == nullptr; } inline bool operator==(std::nullptr_t, task_handle const& th) noexcept { return th.m_handle == nullptr; } inline bool operator!=(task_handle const& th, std::nullptr_t) noexcept { return th.m_handle != nullptr; } inline bool operator!=(std::nullptr_t, task_handle const& th) noexcept { return th.m_handle != nullptr; } } // namespace d2 } // namespace detail } // namespace tbb #endif /* __TBB_task_handle_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_template_helpers.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_detail__template_helpers_H #define __TBB_detail__template_helpers_H #include "_utils.h" #include "_config.h" #include #include #include #include #include #include namespace tbb { namespace detail { inline namespace d0 { // An internal implementation of void_t, which can be used in SFINAE contexts template struct void_impl { using type = void; }; // struct void_impl template using void_t = typename void_impl::type; // Generic SFINAE helper for expression checks, based on the idea demonstrated in ISO C++ paper n4502 template class... Checks> struct supports_impl { using type = std::false_type; }; template class... Checks> struct supports_impl...>, Checks...> { using type = std::true_type; }; template class... Checks> using supports = typename supports_impl::type; //! A template to select either 32-bit or 64-bit constant as compile time, depending on machine word size. template struct select_size_t_constant { // Explicit cast is needed to avoid compiler warnings about possible truncation. // The value of the right size, which is selected by ?:, is anyway not truncated or promoted. static const std::size_t value = static_cast((sizeof(std::size_t)==sizeof(u)) ? u : ull); }; // TODO: do we really need it? //! Cast between unrelated pointer types. /** This method should be used sparingly as a last resort for dealing with situations that inherently break strict ISO C++ aliasing rules. */ // T is a pointer type because it will be explicitly provided by the programmer as a template argument; // U is a referent type to enable the compiler to check that "ptr" is a pointer, deducing U in the process. template inline T punned_cast( U* ptr ) { std::uintptr_t x = reinterpret_cast(ptr); return reinterpret_cast(x); } template struct padded_base : T { char pad[S - R]; }; template struct padded_base : T {}; //! Pads type T to fill out to a multiple of cache line size. 
template struct padded : padded_base {}; #if __TBB_CPP14_INTEGER_SEQUENCE_PRESENT using std::index_sequence; using std::make_index_sequence; #else template class index_sequence {}; template struct make_index_sequence_impl : make_index_sequence_impl < N - 1, N - 1, S... > {}; template struct make_index_sequence_impl <0, S...> { using type = index_sequence; }; template using make_index_sequence = typename make_index_sequence_impl::type; #endif /* __TBB_CPP14_INTEGER_SEQUENCE_PRESENT */ //! Attach an index to a type to use it with an index sequence template using indexed_t = T; #if __TBB_CPP17_LOGICAL_OPERATIONS_PRESENT using std::conjunction; using std::disjunction; #else // __TBB_CPP17_LOGICAL_OPERATIONS_PRESENT template struct conjunction : std::true_type {}; template struct conjunction : std::conditional, First>::type {}; template struct conjunction : T {}; template struct disjunction : std::false_type {}; template struct disjunction : std::conditional>::type {}; template struct disjunction : T {}; #endif // __TBB_CPP17_LOGICAL_OPERATIONS_PRESENT template using iterator_value_t = typename std::iterator_traits::value_type; template using iterator_key_t = typename std::remove_const::first_type>::type; template using iterator_mapped_t = typename iterator_value_t::second_type; template using iterator_alloc_pair_t = std::pair>::type, iterator_mapped_t>; template using alloc_value_type = typename A::value_type; template using alloc_ptr_t = typename std::allocator_traits::pointer; template using has_allocate = decltype(std::declval&>() = std::declval().allocate(0)); template using has_deallocate = decltype(std::declval().deallocate(std::declval>(), 0)); // alloc_value_type should be checked first, because it can be used in other checks template using is_allocator = supports; #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT template inline constexpr bool is_allocator_v = is_allocator::value; #endif // Template class in which the "type" determines the type of the element number N 
in pack Args template struct pack_element { using type = void; }; template struct pack_element { using type = typename pack_element::type; }; template struct pack_element<0, T, Args...> { using type = T; }; template using pack_element_t = typename pack_element::type; template class raii_guard { public: static_assert( std::is_nothrow_copy_constructible::value && std::is_nothrow_move_constructible::value, "Throwing an exception during the Func copy or move construction cause an unexpected behavior." ); raii_guard( Func f ) noexcept : my_func(f), is_active(true) {} raii_guard( raii_guard&& g ) noexcept : my_func(std::move(g.my_func)), is_active(g.is_active) { g.is_active = false; } ~raii_guard() { if (is_active) { my_func(); } } void dismiss() { is_active = false; } private: Func my_func; bool is_active; }; // class raii_guard template raii_guard make_raii_guard( Func f ) { return raii_guard(f); } template struct try_call_proxy { try_call_proxy( Body b ) : body(b) {} template void on_exception( OnExceptionBody on_exception_body ) { auto guard = make_raii_guard(on_exception_body); body(); guard.dismiss(); } template void on_completion(OnCompletionBody on_completion_body) { auto guard = make_raii_guard(on_completion_body); body(); } Body body; }; // struct try_call_proxy // Template helper function for API // try_call(lambda1).on_exception(lambda2) // Executes lambda1 and if it throws an exception - executes lambda2 template try_call_proxy try_call( Body b ) { return try_call_proxy(b); } #if __TBB_CPP17_IS_SWAPPABLE_PRESENT using std::is_nothrow_swappable; using std::is_swappable; #else // __TBB_CPP17_IS_SWAPPABLE_PRESENT namespace is_swappable_detail { using std::swap; template using has_swap = decltype(swap(std::declval(), std::declval())); #if _MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER // Workaround for VS2015: it fails to instantiate noexcept(...) inside std::integral_constant. 
template struct noexcept_wrapper { static const bool value = noexcept(swap(std::declval(), std::declval())); }; template struct is_nothrow_swappable_impl : std::integral_constant::value> {}; #else template struct is_nothrow_swappable_impl : std::integral_constant(), std::declval()))> {}; #endif } template struct is_swappable : supports {}; template struct is_nothrow_swappable : conjunction, is_swappable_detail::is_nothrow_swappable_impl> {}; #endif // __TBB_CPP17_IS_SWAPPABLE_PRESENT //! Allows to store a function parameter pack as a variable and later pass it to another function template< typename... Types > struct stored_pack; template<> struct stored_pack<> { using pack_type = stored_pack<>; stored_pack() {} // Friend front-end functions template< typename F, typename Pack > friend void call(F&& f, Pack&& p); template< typename Ret, typename F, typename Pack > friend Ret call_and_return(F&& f, Pack&& p); protected: // Ideally, ref-qualified non-static methods would be used, // but that would greatly reduce the set of compilers where it works. template< typename Ret, typename F, typename... Preceding > static Ret call(F&& f, const pack_type& /*pack*/, Preceding&&... params) { return std::forward(f)(std::forward(params)...); } template< typename Ret, typename F, typename... Preceding > static Ret call(F&& f, pack_type&& /*pack*/, Preceding&&... params) { return std::forward(f)(std::forward(params)...); } }; template< typename T, typename... Types > struct stored_pack : stored_pack { using pack_type = stored_pack; using pack_remainder = stored_pack; // Since lifetime of original values is out of control, copies should be made. // Thus references should be stripped away from the deduced type. typename std::decay::type leftmost_value; // Here rvalue references act in the same way as forwarding references, // as long as class template parameters were deduced via forwarding references. stored_pack(T&& t, Types&&... 
types) : pack_remainder(std::forward(types)...), leftmost_value(std::forward(t)) {} // Friend front-end functions template< typename F, typename Pack > friend void call(F&& f, Pack&& p); template< typename Ret, typename F, typename Pack > friend Ret call_and_return(F&& f, Pack&& p); protected: template< typename Ret, typename F, typename... Preceding > static Ret call(F&& f, pack_type& pack, Preceding&&... params) { return pack_remainder::template call( std::forward(f), static_cast(pack), std::forward(params)... , pack.leftmost_value ); } template< typename Ret, typename F, typename... Preceding > static Ret call(F&& f, pack_type&& pack, Preceding&&... params) { return pack_remainder::template call( std::forward(f), static_cast(pack), std::forward(params)... , std::move(pack.leftmost_value) ); } }; //! Calls the given function with arguments taken from a stored_pack template< typename F, typename Pack > void call(F&& f, Pack&& p) { std::decay::type::template call(std::forward(f), std::forward(p)); } template< typename Ret, typename F, typename Pack > Ret call_and_return(F&& f, Pack&& p) { return std::decay::type::template call(std::forward(f), std::forward(p)); } template< typename... Types > stored_pack save_pack(Types&&... types) { return stored_pack(std::forward(types)...); } // A structure with the value which is equal to Trait::value // but can be used in the immediate context due to parameter T template struct dependent_bool : std::integral_constant {}; template struct body_arg_detector; template struct body_arg_detector { using arg_type = Arg; }; template struct body_arg_detector { using arg_type = Arg; }; template struct argument_detector; template struct argument_detector { using type = typename body_arg_detector::arg_type; }; template struct argument_detector { using type = Arg; }; // Detects the argument type of callable, works for callable with one argument. 
template using argument_type_of = typename argument_detector::type>::type; template struct type_identity { using type = T; }; template using type_identity_t = typename type_identity::type; } // inline namespace d0 } // namespace detail } // namespace tbb #endif // __TBB_detail__template_helpers_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_utils.h ================================================ /* Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_detail__utils_H #define __TBB_detail__utils_H #include #include #include #include #include "_config.h" #include "_assert.h" #include "_machine.h" namespace tbb { namespace detail { inline namespace d0 { //! Utility template function to prevent "unused" warnings by various compilers. template void suppress_unused_warning(T&&...) {} //! Compile-time constant that is upper bound on cache line/sector size. /** It should be used only in situations where having a compile-time upper bound is more useful than a run-time exact answer. @ingroup memory_allocation */ constexpr size_t max_nfs_size = 128; constexpr std::size_t max_nfs_size_exp = 7; static_assert(1 << max_nfs_size_exp == max_nfs_size, "max_nfs_size_exp must be a log2(max_nfs_size)"); //! Class that implements exponential backoff. class atomic_backoff { //! Time delay, in units of "pause" instructions. 
/** Should be equal to approximately the number of "pause" instructions that take the same time as an context switch. Must be a power of two.*/ static constexpr std::int32_t LOOPS_BEFORE_YIELD = 16; std::int32_t count; public: // In many cases, an object of this type is initialized eagerly on hot path, // as in for(atomic_backoff b; ; b.pause()) { /*loop body*/ } // For this reason, the construction cost must be very small! atomic_backoff() : count(1) {} // This constructor pauses immediately; do not use on hot paths! atomic_backoff(bool) : count(1) { pause(); } //! No Copy atomic_backoff(const atomic_backoff&) = delete; atomic_backoff& operator=(const atomic_backoff&) = delete; //! Pause for a while. void pause() { if (count <= LOOPS_BEFORE_YIELD) { machine_pause(count); // Pause twice as long the next time. count *= 2; } else { // Pause is so long that we might as well yield CPU to scheduler. yield(); } } //! Pause for a few times and return false if saturated. bool bounded_pause() { machine_pause(count); if (count < LOOPS_BEFORE_YIELD) { // Pause twice as long the next time. count *= 2; return true; } else { return false; } } void reset() { count = 1; } }; //! Spin WHILE the condition is true. /** T and U should be comparable types. */ template T spin_wait_while(const std::atomic& location, C comp, std::memory_order order) { atomic_backoff backoff; T snapshot = location.load(order); while (comp(snapshot)) { backoff.pause(); snapshot = location.load(order); } return snapshot; } //! Spin WHILE the value of the variable is equal to a given value /** T and U should be comparable types. */ template T spin_wait_while_eq(const std::atomic& location, const U value, std::memory_order order = std::memory_order_acquire) { return spin_wait_while(location, [&value](T t) { return t == value; }, order); } //! Spin UNTIL the value of the variable is equal to a given value /** T and U should be comparable types. 
*/ template T spin_wait_until_eq(const std::atomic& location, const U value, std::memory_order order = std::memory_order_acquire) { return spin_wait_while(location, [&value](T t) { return t != value; }, order); } //! Spin UNTIL the condition returns true or spinning time is up. /** Returns what the passed functor returned last time it was invoked. */ template bool timed_spin_wait_until(Condition condition) { // 32 pauses + 32 yields are meausered as balanced spin time before sleep. bool finish = condition(); for (int i = 1; !finish && i < 32; finish = condition(), i *= 2) { machine_pause(i); } for (int i = 32; !finish && i < 64; finish = condition(), ++i) { yield(); } return finish; } template T clamp(T value, T lower_bound, T upper_bound) { __TBB_ASSERT(lower_bound <= upper_bound, "Incorrect bounds"); return value > lower_bound ? (value > upper_bound ? upper_bound : value) : lower_bound; } template std::uintptr_t log2(T in) { __TBB_ASSERT(in > 0, "The logarithm of a non-positive value is undefined."); return machine_log2(in); } template T reverse_bits(T src) { return machine_reverse_bits(src); } template T reverse_n_bits(T src, std::size_t n) { __TBB_ASSERT(n != 0, "Reverse for 0 bits is undefined behavior."); return reverse_bits(src) >> (number_of_bits() - n); } // A function to check if passed integer is a power of two template constexpr bool is_power_of_two( IntegerType arg ) { static_assert(std::is_integral::value, "An argument for is_power_of_two should be integral type"); return arg && (0 == (arg & (arg - 1))); } // A function to determine if passed integer is a power of two // at least as big as another power of two, i.e. 
for strictly positive i and j, // with j being a power of two, determines whether i==j< constexpr bool is_power_of_two_at_least(ArgIntegerType arg, DivisorIntegerType divisor) { // Divisor should be a power of two static_assert(std::is_integral::value, "An argument for is_power_of_two_at_least should be integral type"); return 0 == (arg & (arg - divisor)); } // A function to compute arg modulo divisor where divisor is a power of 2. template inline ArgIntegerType modulo_power_of_two(ArgIntegerType arg, DivisorIntegerType divisor) { __TBB_ASSERT( is_power_of_two(divisor), "Divisor should be a power of two" ); return arg & (divisor - 1); } //! A function to check if passed in pointer is aligned on a specific border template constexpr bool is_aligned(T* pointer, std::uintptr_t alignment) { return 0 == (reinterpret_cast(pointer) & (alignment - 1)); } #if TBB_USE_ASSERT static void* const poisoned_ptr = reinterpret_cast(-1); //! Set p to invalid pointer value. template inline void poison_pointer( T* &p ) { p = reinterpret_cast(poisoned_ptr); } template inline void poison_pointer(std::atomic& p) { p.store(reinterpret_cast(poisoned_ptr), std::memory_order_relaxed); } /** Expected to be used in assertions only, thus no empty form is defined. **/ template inline bool is_poisoned( T* p ) { return p == reinterpret_cast(poisoned_ptr); } template inline bool is_poisoned(const std::atomic& p) { return is_poisoned(p.load(std::memory_order_relaxed)); } #else template inline void poison_pointer(T&) {/*do nothing*/} #endif /* !TBB_USE_ASSERT */ template bool assert_pointer_valid(T* p, const char* comment = nullptr) { suppress_unused_warning(p, comment); __TBB_ASSERT(p != nullptr, comment); __TBB_ASSERT(!is_poisoned(p), comment); #if !(_MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER) __TBB_ASSERT(is_aligned(p, alignment == 0 ? alignof(T) : alignment), comment); #endif // Returns something to simplify assert_pointers_valid implementation. 
return true; } template void assert_pointers_valid(Args*... p) { // suppress_unused_warning is used as an evaluation context for the variadic pack. suppress_unused_warning(assert_pointer_valid(p)...); } //! Base class for types that should not be assigned. class no_assign { public: void operator=(const no_assign&) = delete; no_assign(const no_assign&) = default; no_assign() = default; }; //! Base class for types that should not be copied or assigned. class no_copy: no_assign { public: no_copy(const no_copy&) = delete; no_copy() = default; }; template void swap_atomics_relaxed(std::atomic& lhs, std::atomic& rhs){ T tmp = lhs.load(std::memory_order_relaxed); lhs.store(rhs.load(std::memory_order_relaxed), std::memory_order_relaxed); rhs.store(tmp, std::memory_order_relaxed); } //! One-time initialization states enum class do_once_state { uninitialized = 0, ///< No execution attempts have been undertaken yet pending, ///< A thread is executing associated do-once routine executed, ///< Do-once routine has been executed initialized = executed ///< Convenience alias }; //! One-time initialization function /** /param initializer Pointer to function without arguments The variant that returns bool is used for cases when initialization can fail and it is OK to continue execution, but the state should be reset so that the initialization attempt was repeated the next time. /param state Shared state associated with initializer that specifies its initialization state. Must be initially set to #uninitialized value (e.g. by means of default static zero initialization). **/ template void atomic_do_once( const F& initializer, std::atomic& state ) { // The loop in the implementation is necessary to avoid race when thread T2 // that arrived in the middle of initialization attempt by another thread T1 // has just made initialization possible. // In such a case T2 has to rely on T1 to initialize, but T1 may already be past // the point where it can recognize the changed conditions. 
do_once_state expected_state; while ( state.load( std::memory_order_acquire ) != do_once_state::executed ) { if( state.load( std::memory_order_relaxed ) == do_once_state::uninitialized ) { expected_state = do_once_state::uninitialized; #if defined(__INTEL_COMPILER) && __INTEL_COMPILER <= 1910 using enum_type = typename std::underlying_type::type; if( ((std::atomic&)state).compare_exchange_strong( (enum_type&)expected_state, (enum_type)do_once_state::pending ) ) { #else if( state.compare_exchange_strong( expected_state, do_once_state::pending ) ) { #endif run_initializer( initializer, state ); break; } } spin_wait_while_eq( state, do_once_state::pending ); } } // Run the initializer which can not fail template void run_initializer(const Functor& f, std::atomic& state ) { f(); state.store(do_once_state::executed, std::memory_order_release); } #if __TBB_CPP20_CONCEPTS_PRESENT template concept boolean_testable_impl = std::convertible_to; template concept boolean_testable = boolean_testable_impl && requires( T&& t ) { { !std::forward(t) } -> boolean_testable_impl; }; #if __TBB_CPP20_COMPARISONS_PRESENT struct synthesized_three_way_comparator { template auto operator()( const T1& lhs, const T2& rhs ) const requires requires { { lhs < rhs } -> boolean_testable; { rhs < lhs } -> boolean_testable; } { if constexpr (std::three_way_comparable_with) { return lhs <=> rhs; } else { if (lhs < rhs) { return std::weak_ordering::less; } if (rhs < lhs) { return std::weak_ordering::greater; } return std::weak_ordering::equivalent; } } }; // struct synthesized_three_way_comparator template using synthesized_three_way_result = decltype(synthesized_three_way_comparator{}(std::declval(), std::declval())); #endif // __TBB_CPP20_COMPARISONS_PRESENT // Check if the type T is implicitly OR explicitly convertible to U template concept relaxed_convertible_to = std::constructible_from; template concept adaptive_same_as = #if __TBB_STRICT_CONSTRAINTS std::same_as; #else std::convertible_to; 
#endif #endif // __TBB_CPP20_CONCEPTS_PRESENT template auto invoke(F&& f, Args&&... args) #if __TBB_CPP17_INVOKE_PRESENT noexcept(std::is_nothrow_invocable_v) -> std::invoke_result_t { return std::invoke(std::forward(f), std::forward(args)...); } #else // __TBB_CPP17_INVOKE_PRESENT noexcept(noexcept(std::forward(f)(std::forward(args)...))) -> decltype(std::forward(f)(std::forward(args)...)) { return std::forward(f)(std::forward(args)...); } #endif // __TBB_CPP17_INVOKE_PRESENT } // namespace d0 namespace d1 { class delegate_base { public: virtual bool operator()() const = 0; virtual ~delegate_base() {} }; template class delegated_function : public delegate_base { public: delegated_function(FuncType& f) : my_func(f) {} bool operator()() const override { return my_func(); } private: FuncType &my_func; }; } // namespace d1 } // namespace detail } // namespace tbb #endif // __TBB_detail__utils_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/detail/_waitable_atomic.h ================================================ /* Copyright (c) 2021-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_detail__address_waiters_H #define __TBB_detail__address_waiters_H #include "_utils.h" namespace tbb { namespace detail { namespace r1 { TBB_EXPORT void __TBB_EXPORTED_FUNC wait_on_address(void* address, d1::delegate_base& wakeup_condition, std::uintptr_t context); TBB_EXPORT void __TBB_EXPORTED_FUNC notify_by_address(void* address, std::uintptr_t context); TBB_EXPORT void __TBB_EXPORTED_FUNC notify_by_address_one(void* address); TBB_EXPORT void __TBB_EXPORTED_FUNC notify_by_address_all(void* address); } // namespace r1 namespace d1 { template void adaptive_wait_on_address(void* address, Predicate wakeup_condition, std::uintptr_t context) { if (!timed_spin_wait_until(wakeup_condition)) { d1::delegated_function pred(wakeup_condition); r1::wait_on_address(address, pred, context); } } template class waitable_atomic { public: waitable_atomic() = default; explicit waitable_atomic(T value) : my_atomic(value) {} waitable_atomic(const waitable_atomic&) = delete; waitable_atomic& operator=(const waitable_atomic&) = delete; T load(std::memory_order order) const noexcept { return my_atomic.load(order); } T exchange(T desired) noexcept { return my_atomic.exchange(desired); } void wait(T old, std::uintptr_t context, std::memory_order order) { auto wakeup_condition = [&] { return my_atomic.load(order) != old; }; if (!timed_spin_wait_until(wakeup_condition)) { // We need to use while here, because notify_all() will wake up all threads // But predicate for them might be false d1::delegated_function pred(wakeup_condition); do { r1::wait_on_address(this, pred, context); } while (!wakeup_condition()); } } void notify_one_relaxed() { r1::notify_by_address_one(this); } // TODO: consider adding following interfaces: // store(desired, memory_order) // notify_all_relaxed() // wait_until(T, std::uintptr_t, std::memory_order) // notify_relaxed(std::uintptr_t context) private: std::atomic my_atomic{}; }; } // namespace d1 } // namespace detail } // namespace tbb #endif // 
__TBB_detail__address_waiters_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/enumerable_thread_specific.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_enumerable_thread_specific_H #define __TBB_enumerable_thread_specific_H #include "detail/_config.h" #include "detail/_namespace_injection.h" #include "detail/_assert.h" #include "detail/_template_helpers.h" #include "detail/_aligned_space.h" #include "concurrent_vector.h" #include "tbb_allocator.h" #include "cache_aligned_allocator.h" #include "profiling.h" #include #include #include // memcpy #include // std::ptrdiff_t #include "task.h" // for task::suspend_point #if _WIN32 || _WIN64 #ifndef NOMINMAX #define NOMINMAX #define __TBB_DEFINED_NOMINMAX 1 #endif #include #if __TBB_DEFINED_NOMINMAX #undef NOMINMAX #undef __TBB_DEFINED_NOMINMAX #endif #else #include #endif namespace tbb { namespace detail { namespace d1 { //! 
enum for selecting between single key and key-per-instance versions enum ets_key_usage_type { ets_key_per_instance , ets_no_key #if __TBB_RESUMABLE_TASKS , ets_suspend_aware #endif }; // Forward declaration to use in internal classes template class enumerable_thread_specific; template struct internal_ets_key_selector { using key_type = std::thread::id; static key_type current_key() { return std::this_thread::get_id(); } }; // Intel Compiler on OSX cannot create atomics objects that instantiated from non-fundamental types #if __INTEL_COMPILER && __APPLE__ template<> struct internal_ets_key_selector { using key_type = std::size_t; static key_type current_key() { auto id = std::this_thread::get_id(); return reinterpret_cast(id); } }; #endif template struct ets_key_selector : internal_ets_key_selector {}; #if __TBB_RESUMABLE_TASKS template <> struct ets_key_selector { using key_type = suspend_point; static key_type current_key() { return r1::current_suspend_point(); } }; #endif template class ets_base : detail::no_copy { protected: using key_type = typename ets_key_selector::key_type; public: struct slot; struct array { array* next; std::size_t lg_size; slot& at( std::size_t k ) { return (reinterpret_cast(reinterpret_cast(this+1)))[k]; } std::size_t size() const { return std::size_t(1) << lg_size; } std::size_t mask() const { return size() - 1; } std::size_t start( std::size_t h ) const { return h >> (8 * sizeof(std::size_t) - lg_size); } }; struct slot { std::atomic key; void* ptr; bool empty() const { return key.load(std::memory_order_relaxed) == key_type(); } bool match( key_type k ) const { return key.load(std::memory_order_relaxed) == k; } bool claim( key_type k ) { // TODO: maybe claim ptr, because key_type is not guaranteed to fit into word size key_type expected = key_type(); return key.compare_exchange_strong(expected, k); } }; protected: //! Root of linked list of arrays of decreasing size. /** nullptr if and only if my_count==0. 
Each array in the list is half the size of its predecessor. */ std::atomic my_root; std::atomic my_count; virtual void* create_local() = 0; virtual void* create_array(std::size_t _size) = 0; // _size in bytes virtual void free_array(void* ptr, std::size_t _size) = 0; // _size in bytes array* allocate( std::size_t lg_size ) { std::size_t n = std::size_t(1) << lg_size; array* a = static_cast(create_array(sizeof(array) + n * sizeof(slot))); a->lg_size = lg_size; std::memset( a + 1, 0, n * sizeof(slot) ); return a; } void deallocate(array* a) { std::size_t n = std::size_t(1) << (a->lg_size); free_array( static_cast(a), std::size_t(sizeof(array) + n * sizeof(slot)) ); } ets_base() : my_root{nullptr}, my_count{0} {} virtual ~ets_base(); // g++ complains if this is not virtual void* table_lookup( bool& exists ); void table_clear(); // The following functions are not used in concurrent context, // so we don't need synchronization and ITT annotations there. template void table_elementwise_copy( const ets_base& other, void*(*add_element)(ets_base&, void*) ) { __TBB_ASSERT(!my_root.load(std::memory_order_relaxed), nullptr); __TBB_ASSERT(!my_count.load(std::memory_order_relaxed), nullptr); if( !other.my_root.load(std::memory_order_relaxed) ) return; array* root = allocate(other.my_root.load(std::memory_order_relaxed)->lg_size); my_root.store(root, std::memory_order_relaxed); root->next = nullptr; my_count.store(other.my_count.load(std::memory_order_relaxed), std::memory_order_relaxed); std::size_t mask = root->mask(); for( array* r = other.my_root.load(std::memory_order_relaxed); r; r = r->next ) { for( std::size_t i = 0; i < r->size(); ++i ) { slot& s1 = r->at(i); if( !s1.empty() ) { for( std::size_t j = root->start(std::hash{}(s1.key.load(std::memory_order_relaxed))); ; j = (j+1)&mask ) { slot& s2 = root->at(j); if( s2.empty() ) { s2.ptr = add_element(static_cast&>(*this), s1.ptr); s2.key.store(s1.key.load(std::memory_order_relaxed), std::memory_order_relaxed); break; } else 
if( s2.match(s1.key.load(std::memory_order_relaxed)) ) break; } } } } } void table_swap( ets_base& other ) { __TBB_ASSERT(this!=&other, "Don't swap an instance with itself"); swap_atomics_relaxed(my_root, other.my_root); swap_atomics_relaxed(my_count, other.my_count); } }; template ets_base::~ets_base() { __TBB_ASSERT(!my_root.load(std::memory_order_relaxed), nullptr); } template void ets_base::table_clear() { while ( array* r = my_root.load(std::memory_order_relaxed) ) { my_root.store(r->next, std::memory_order_relaxed); deallocate(r); } my_count.store(0, std::memory_order_relaxed); } template void* ets_base::table_lookup( bool& exists ) { const key_type k = ets_key_selector::current_key(); __TBB_ASSERT(k != key_type(), nullptr); void* found; std::size_t h = std::hash{}(k); for( array* r = my_root.load(std::memory_order_acquire); r; r = r->next ) { call_itt_notify(acquired,r); std::size_t mask=r->mask(); for(std::size_t i = r->start(h); ;i=(i+1)&mask) { slot& s = r->at(i); if( s.empty() ) break; if( s.match(k) ) { if( r == my_root.load(std::memory_order_acquire) ) { // Success at top level exists = true; return s.ptr; } else { // Success at some other level. Need to insert at top level. exists = true; found = s.ptr; goto insert; } } } } // Key does not yet exist. The density of slots in the table does not exceed 0.5, // for if this will occur a new table is allocated with double the current table // size, which is swapped in as the new root table. So an empty slot is guaranteed. exists = false; found = create_local(); { std::size_t c = ++my_count; array* r = my_root.load(std::memory_order_acquire); call_itt_notify(acquired,r); if( !r || c > r->size()/2 ) { std::size_t s = r ? 
r->lg_size : 2; while( c > std::size_t(1)<<(s-1) ) ++s; array* a = allocate(s); for(;;) { a->next = r; call_itt_notify(releasing,a); array* new_r = r; if( my_root.compare_exchange_strong(new_r, a) ) break; call_itt_notify(acquired, new_r); __TBB_ASSERT(new_r != nullptr, nullptr); if( new_r->lg_size >= s ) { // Another thread inserted an equal or bigger array, so our array is superfluous. deallocate(a); break; } r = new_r; } } } insert: // Whether a slot has been found in an older table, or if it has been inserted at this level, // it has already been accounted for in the total. Guaranteed to be room for it, and it is // not present, so search for empty slot and use it. array* ir = my_root.load(std::memory_order_acquire); call_itt_notify(acquired, ir); std::size_t mask = ir->mask(); for(std::size_t i = ir->start(h);; i = (i+1)&mask) { slot& s = ir->at(i); if( s.empty() ) { if( s.claim(k) ) { s.ptr = found; return found; } } } } //! Specialization that exploits native TLS template <> class ets_base: public ets_base { using super = ets_base; #if _WIN32||_WIN64 #if __TBB_WIN8UI_SUPPORT using tls_key_t = DWORD; void create_key() { my_key = FlsAlloc(nullptr); } void destroy_key() { FlsFree(my_key); } void set_tls(void * value) { FlsSetValue(my_key, (LPVOID)value); } void* get_tls() { return (void *)FlsGetValue(my_key); } #else using tls_key_t = DWORD; void create_key() { my_key = TlsAlloc(); } void destroy_key() { TlsFree(my_key); } void set_tls(void * value) { TlsSetValue(my_key, (LPVOID)value); } void* get_tls() { return (void *)TlsGetValue(my_key); } #endif #else using tls_key_t = pthread_key_t; void create_key() { pthread_key_create(&my_key, nullptr); } void destroy_key() { pthread_key_delete(my_key); } void set_tls( void * value ) const { pthread_setspecific(my_key, value); } void* get_tls() const { return pthread_getspecific(my_key); } #endif tls_key_t my_key; virtual void* create_local() override = 0; virtual void* create_array(std::size_t _size) override = 0; // 
_size in bytes virtual void free_array(void* ptr, std::size_t _size) override = 0; // size in bytes protected: ets_base() {create_key();} ~ets_base() {destroy_key();} void* table_lookup( bool& exists ) { void* found = get_tls(); if( found ) { exists=true; } else { found = super::table_lookup(exists); set_tls(found); } return found; } void table_clear() { destroy_key(); create_key(); super::table_clear(); } void table_swap( ets_base& other ) { using std::swap; __TBB_ASSERT(this!=&other, "Don't swap an instance with itself"); swap(my_key, other.my_key); super::table_swap(other); } }; //! Random access iterator for traversing the thread local copies. template< typename Container, typename Value > class enumerable_thread_specific_iterator { //! current position in the concurrent_vector Container *my_container; typename Container::size_type my_index; mutable Value *my_value; template friend bool operator==( const enumerable_thread_specific_iterator& i, const enumerable_thread_specific_iterator& j ); template friend bool operator<( const enumerable_thread_specific_iterator& i, const enumerable_thread_specific_iterator& j ); template friend std::ptrdiff_t operator-( const enumerable_thread_specific_iterator& i, const enumerable_thread_specific_iterator& j ); template friend class enumerable_thread_specific_iterator; public: //! STL support using difference_type = std::ptrdiff_t; using value_type = Value; using pointer = Value*; using reference = Value&; using iterator_category = std::random_access_iterator_tag; enumerable_thread_specific_iterator( const Container &container, typename Container::size_type index ) : my_container(&const_cast(container)), my_index(index), my_value(nullptr) {} //! 
Default constructor enumerable_thread_specific_iterator() : my_container(nullptr), my_index(0), my_value(nullptr) {} template enumerable_thread_specific_iterator( const enumerable_thread_specific_iterator& other ) : my_container( other.my_container ), my_index( other.my_index), my_value( const_cast(other.my_value) ) {} enumerable_thread_specific_iterator operator+( std::ptrdiff_t offset ) const { return enumerable_thread_specific_iterator(*my_container, my_index + offset); } friend enumerable_thread_specific_iterator operator+( std::ptrdiff_t offset, enumerable_thread_specific_iterator v ) { return enumerable_thread_specific_iterator(*v.my_container, v.my_index + offset); } enumerable_thread_specific_iterator &operator+=( std::ptrdiff_t offset ) { my_index += offset; my_value = nullptr; return *this; } enumerable_thread_specific_iterator operator-( std::ptrdiff_t offset ) const { return enumerable_thread_specific_iterator( *my_container, my_index-offset ); } enumerable_thread_specific_iterator &operator-=( std::ptrdiff_t offset ) { my_index -= offset; my_value = nullptr; return *this; } Value& operator*() const { Value* value = my_value; if( !value ) { value = my_value = (*my_container)[my_index].value(); } __TBB_ASSERT( value==(*my_container)[my_index].value(), "corrupt cache" ); return *value; } Value& operator[]( std::ptrdiff_t k ) const { return *(*my_container)[my_index + k].value(); } Value* operator->() const {return &operator*();} enumerable_thread_specific_iterator& operator++() { ++my_index; my_value = nullptr; return *this; } enumerable_thread_specific_iterator& operator--() { --my_index; my_value = nullptr; return *this; } //! Post increment enumerable_thread_specific_iterator operator++(int) { enumerable_thread_specific_iterator result = *this; ++my_index; my_value = nullptr; return result; } //! 
Post decrement enumerable_thread_specific_iterator operator--(int) { enumerable_thread_specific_iterator result = *this; --my_index; my_value = nullptr; return result; } }; template bool operator==( const enumerable_thread_specific_iterator& i, const enumerable_thread_specific_iterator& j ) { return i.my_index == j.my_index && i.my_container == j.my_container; } template bool operator!=( const enumerable_thread_specific_iterator& i, const enumerable_thread_specific_iterator& j ) { return !(i==j); } template bool operator<( const enumerable_thread_specific_iterator& i, const enumerable_thread_specific_iterator& j ) { return i.my_index bool operator>( const enumerable_thread_specific_iterator& i, const enumerable_thread_specific_iterator& j ) { return j bool operator>=( const enumerable_thread_specific_iterator& i, const enumerable_thread_specific_iterator& j ) { return !(i bool operator<=( const enumerable_thread_specific_iterator& i, const enumerable_thread_specific_iterator& j ) { return !(j std::ptrdiff_t operator-( const enumerable_thread_specific_iterator& i, const enumerable_thread_specific_iterator& j ) { return i.my_index-j.my_index; } template class segmented_iterator { template friend bool operator==(const segmented_iterator& i, const segmented_iterator& j); template friend bool operator!=(const segmented_iterator& i, const segmented_iterator& j); template friend class segmented_iterator; public: segmented_iterator() {my_segcont = nullptr;} segmented_iterator( const SegmentedContainer& _segmented_container ) : my_segcont(const_cast(&_segmented_container)), outer_iter(my_segcont->end()) { } ~segmented_iterator() {} using InnerContainer = typename SegmentedContainer::value_type; using inner_iterator = typename InnerContainer::iterator; using outer_iterator = typename SegmentedContainer::iterator; // STL support // TODO: inherit all types from segmented container? 
using difference_type = std::ptrdiff_t; using value_type = Value; using size_type = typename SegmentedContainer::size_type; using pointer = Value*; using reference = Value&; using iterator_category = std::input_iterator_tag; // Copy Constructor template segmented_iterator(const segmented_iterator& other) : my_segcont(other.my_segcont), outer_iter(other.outer_iter), // can we assign a default-constructed iterator to inner if we're at the end? inner_iter(other.inner_iter) {} // assignment template segmented_iterator& operator=( const segmented_iterator& other) { my_segcont = other.my_segcont; outer_iter = other.outer_iter; if(outer_iter != my_segcont->end()) inner_iter = other.inner_iter; return *this; } // allow assignment of outer iterator to segmented iterator. Once it is // assigned, move forward until a non-empty inner container is found or // the end of the outer container is reached. segmented_iterator& operator=(const outer_iterator& new_outer_iter) { __TBB_ASSERT(my_segcont != nullptr, nullptr); // check that this iterator points to something inside the segmented container for(outer_iter = new_outer_iter ;outer_iter!=my_segcont->end(); ++outer_iter) { if( !outer_iter->empty() ) { inner_iter = outer_iter->begin(); break; } } return *this; } // pre-increment segmented_iterator& operator++() { advance_me(); return *this; } // post-increment segmented_iterator operator++(int) { segmented_iterator tmp = *this; operator++(); return tmp; } bool operator==(const outer_iterator& other_outer) const { __TBB_ASSERT(my_segcont != nullptr, nullptr); return (outer_iter == other_outer && (outer_iter == my_segcont->end() || inner_iter == outer_iter->begin())); } bool operator!=(const outer_iterator& other_outer) const { return !operator==(other_outer); } // (i)* RHS reference operator*() const { __TBB_ASSERT(my_segcont != nullptr, nullptr); __TBB_ASSERT(outer_iter != my_segcont->end(), "Dereferencing a pointer at end of container"); __TBB_ASSERT(inner_iter != 
outer_iter->end(), nullptr); // should never happen return *inner_iter; } // i-> pointer operator->() const { return &operator*();} private: SegmentedContainer* my_segcont; outer_iterator outer_iter; inner_iterator inner_iter; void advance_me() { __TBB_ASSERT(my_segcont != nullptr, nullptr); __TBB_ASSERT(outer_iter != my_segcont->end(), nullptr); // not true if there are no inner containers __TBB_ASSERT(inner_iter != outer_iter->end(), nullptr); // not true if the inner containers are all empty. ++inner_iter; while(inner_iter == outer_iter->end() && ++outer_iter != my_segcont->end()) { inner_iter = outer_iter->begin(); } } }; // segmented_iterator template bool operator==( const segmented_iterator& i, const segmented_iterator& j ) { if(i.my_segcont != j.my_segcont) return false; if(i.my_segcont == nullptr) return true; if(i.outer_iter != j.outer_iter) return false; if(i.outer_iter == i.my_segcont->end()) return true; return i.inner_iter == j.inner_iter; } // != template bool operator!=( const segmented_iterator& i, const segmented_iterator& j ) { return !(i==j); } template struct construct_by_default: no_assign { void construct(void*where) {new(where) T();} // C++ note: the () in T() ensure zero initialization. construct_by_default( int ) {} }; template struct construct_by_exemplar: no_assign { const T exemplar; void construct(void*where) {new(where) T(exemplar);} construct_by_exemplar( const T& t ) : exemplar(t) {} construct_by_exemplar( T&& t ) : exemplar(std::move(t)) {} }; template struct construct_by_finit: no_assign { Finit f; void construct(void* where) {new(where) T(f());} construct_by_finit( Finit&& f_ ) : f(std::move(f_)) {} }; template struct construct_by_args: no_assign { stored_pack pack; void construct(void* where) { call( [where](const typename std::decay

::type&... args ){ new(where) T(args...); }, pack ); } construct_by_args( P&& ... args ) : pack(std::forward

(args)...) {} }; // storage for initialization function pointer // TODO: consider removing the template parameter T here and in callback_leaf class callback_base { public: // Clone *this virtual callback_base* clone() const = 0; // Destruct and free *this virtual void destroy() = 0; // Need virtual destructor to satisfy GCC compiler warning virtual ~callback_base() { } // Construct T at where virtual void construct(void* where) = 0; }; template class callback_leaf: public callback_base, Constructor { template callback_leaf( P&& ... params ) : Constructor(std::forward

(params)...) {} // TODO: make the construction/destruction consistent (use allocator.construct/destroy) using my_allocator_type = typename tbb::tbb_allocator; callback_base* clone() const override { return make(*this); } void destroy() override { my_allocator_type alloc; tbb::detail::allocator_traits::destroy(alloc, this); tbb::detail::allocator_traits::deallocate(alloc, this, 1); } void construct(void* where) override { Constructor::construct(where); } public: template static callback_base* make( P&& ... params ) { void* where = my_allocator_type().allocate(1); return new(where) callback_leaf( std::forward

(params)... ); } }; //! Template for recording construction of objects in table /** All maintenance of the space will be done explicitly on push_back, and all thread local copies must be destroyed before the concurrent vector is deleted. The flag is_built is initialized to false. When the local is successfully-constructed, set the flag to true or call value_committed(). If the constructor throws, the flag will be false. */ template struct ets_element { detail::aligned_space my_space; bool is_built; ets_element() { is_built = false; } // not currently-built U* value() { return my_space.begin(); } U* value_committed() { is_built = true; return my_space.begin(); } ~ets_element() { if(is_built) { my_space.begin()->~U(); is_built = false; } } }; // A predicate that can be used for a compile-time compatibility check of ETS instances // Ideally, it should have been declared inside the ETS class, but unfortunately // in that case VS2013 does not enable the variadic constructor. template struct is_compatible_ets : std::false_type {}; template struct is_compatible_ets< T, enumerable_thread_specific > : std::is_same {}; // A predicate that checks whether, for a variable 'foo' of type T, foo() is a valid expression template using has_empty_braces_operator = decltype(std::declval()()); template using is_callable_no_args = supports; //! The enumerable_thread_specific container /** enumerable_thread_specific has the following properties: - thread-local copies are lazily created, with default, exemplar or function initialization. - thread-local copies do not move (during lifetime, and excepting clear()) so the address of a copy is invariant. - the contained objects need not have operator=() defined if combine is not used. - enumerable_thread_specific containers may be copy-constructed or assigned. - thread-local copies can be managed by hash-table, or can be accessed via TLS storage for speed. 
- outside of parallel contexts, the contents of all thread-local copies are accessible by iterator or using combine or combine_each methods @par Segmented iterator When the thread-local objects are containers with input_iterators defined, a segmented iterator may be used to iterate over all the elements of all thread-local copies. @par combine and combine_each - Both methods are defined for enumerable_thread_specific. - combine() requires the type T have operator=() defined. - neither method modifies the contents of the object (though there is no guarantee that the applied methods do not modify the object.) - Both are evaluated in serial context (the methods are assumed to be non-benign.) @ingroup containers */ template , ets_key_usage_type ETS_key_type=ets_no_key > class enumerable_thread_specific: ets_base { template friend class enumerable_thread_specific; using padded_element = padded>; //! A generic range, used to create range objects from the iterators template class generic_range_type: public blocked_range { public: using value_type = T; using reference = T&; using const_reference = const T&; using iterator = I; using difference_type = std::ptrdiff_t; generic_range_type( I begin_, I end_, std::size_t grainsize_ = 1) : blocked_range(begin_,end_,grainsize_) {} template generic_range_type( const generic_range_type& r) : blocked_range(r.begin(),r.end(),r.grainsize()) {} generic_range_type( generic_range_type& r, split ) : blocked_range(r,split()) {} }; using allocator_traits_type = tbb::detail::allocator_traits; using padded_allocator_type = typename allocator_traits_type::template rebind_alloc; using internal_collection_type = tbb::concurrent_vector< padded_element, padded_allocator_type >; callback_base *my_construct_callback; internal_collection_type my_locals; // TODO: consider unifying the callback mechanism for all create_local* methods below // (likely non-compatible and requires interface version increase) void* create_local() override { padded_element& 
lref = *my_locals.grow_by(1); my_construct_callback->construct(lref.value()); return lref.value_committed(); } static void* create_local_by_copy( ets_base& base, void* p ) { enumerable_thread_specific& ets = static_cast(base); padded_element& lref = *ets.my_locals.grow_by(1); new(lref.value()) T(*static_cast(p)); return lref.value_committed(); } static void* create_local_by_move( ets_base& base, void* p ) { enumerable_thread_specific& ets = static_cast(base); padded_element& lref = *ets.my_locals.grow_by(1); new(lref.value()) T(std::move(*static_cast(p))); return lref.value_committed(); } using array_allocator_type = typename allocator_traits_type::template rebind_alloc; // _size is in bytes void* create_array(std::size_t _size) override { std::size_t nelements = (_size + sizeof(uintptr_t) -1) / sizeof(uintptr_t); return array_allocator_type().allocate(nelements); } void free_array( void* _ptr, std::size_t _size) override { std::size_t nelements = (_size + sizeof(uintptr_t) -1) / sizeof(uintptr_t); array_allocator_type().deallocate( reinterpret_cast(_ptr),nelements); } public: //! Basic types using value_type = T; using allocator_type = Allocator; using size_type = typename internal_collection_type::size_type; using difference_type = typename internal_collection_type::difference_type; using reference = value_type&; using const_reference = const value_type&; using pointer = typename allocator_traits_type::pointer; using const_pointer = typename allocator_traits_type::const_pointer; // Iterator types using iterator = enumerable_thread_specific_iterator; using const_iterator = enumerable_thread_specific_iterator; // Parallel range types using range_type = generic_range_type; using const_range_type = generic_range_type; //! Default constructor. Each local instance of T is default constructed. enumerable_thread_specific() : my_construct_callback( callback_leaf >::make(/*dummy argument*/0) ){} //! Constructor with initializer functor. 
Each local instance of T is constructed by T(finit()). template ::type>::value>::type> explicit enumerable_thread_specific( Finit finit ) : my_construct_callback( callback_leaf >::make( std::move(finit) ) ){} //! Constructor with exemplar. Each local instance of T is copy-constructed from the exemplar. explicit enumerable_thread_specific( const T& exemplar ) : my_construct_callback( callback_leaf >::make( exemplar ) ){} explicit enumerable_thread_specific( T&& exemplar ) : my_construct_callback( callback_leaf >::make( std::move(exemplar) ) ){} //! Variadic constructor with initializer arguments. Each local instance of T is constructed by T(args...) template ::type>::value && !is_compatible_ets::type>::value && !std::is_same::type>::value >::type> enumerable_thread_specific( P1&& arg1, P&& ... args ) : my_construct_callback( callback_leaf >::make( std::forward(arg1), std::forward

(args)... ) ){} //! Destructor ~enumerable_thread_specific() { if(my_construct_callback) my_construct_callback->destroy(); // Deallocate the hash table before overridden free_array() becomes inaccessible this->ets_base::table_clear(); } //! returns reference to local, discarding exists reference local() { bool exists; return local(exists); } //! Returns reference to calling thread's local copy, creating one if necessary reference local(bool& exists) { void* ptr = this->table_lookup(exists); return *(T*)ptr; } //! Get the number of local copies size_type size() const { return my_locals.size(); } //! true if there have been no local copies created bool empty() const { return my_locals.empty(); } //! begin iterator iterator begin() { return iterator( my_locals, 0 ); } //! end iterator iterator end() { return iterator(my_locals, my_locals.size() ); } //! begin const iterator const_iterator begin() const { return const_iterator(my_locals, 0); } //! end const iterator const_iterator end() const { return const_iterator(my_locals, my_locals.size()); } //! Get range for parallel algorithms range_type range( std::size_t grainsize=1 ) { return range_type( begin(), end(), grainsize ); } //! Get const range for parallel algorithms const_range_type range( std::size_t grainsize=1 ) const { return const_range_type( begin(), end(), grainsize ); } //! Destroys local copies void clear() { my_locals.clear(); this->table_clear(); // callback is not destroyed } private: template void internal_copy(const enumerable_thread_specific& other) { // this tests is_compatible_ets static_assert( (is_compatible_ets::type>::value), "is_compatible_ets fails" ); // Initialize my_construct_callback first, so that it is valid even if rest of this routine throws an exception. 
my_construct_callback = other.my_construct_callback->clone(); __TBB_ASSERT(my_locals.size()==0, nullptr); my_locals.reserve(other.size()); this->table_elementwise_copy( other, create_local_by_copy ); } void internal_swap(enumerable_thread_specific& other) { using std::swap; __TBB_ASSERT( this!=&other, nullptr); swap(my_construct_callback, other.my_construct_callback); // concurrent_vector::swap() preserves storage space, // so addresses to the vector kept in ETS hash table remain valid. swap(my_locals, other.my_locals); this->ets_base::table_swap(other); } template void internal_move(enumerable_thread_specific&& other) { static_assert( (is_compatible_ets::type>::value), "is_compatible_ets fails" ); my_construct_callback = other.my_construct_callback; other.my_construct_callback = nullptr; __TBB_ASSERT(my_locals.size()==0, nullptr); my_locals.reserve(other.size()); this->table_elementwise_copy( other, create_local_by_move ); } public: enumerable_thread_specific( const enumerable_thread_specific& other ) : ets_base() /* prevents GCC warnings with -Wextra */ { internal_copy(other); } template enumerable_thread_specific( const enumerable_thread_specific& other ) { internal_copy(other); } enumerable_thread_specific( enumerable_thread_specific&& other ) : my_construct_callback() { // TODO: use internal_move correctly here internal_swap(other); } template enumerable_thread_specific( enumerable_thread_specific&& other ) : my_construct_callback() { internal_move(std::move(other)); } enumerable_thread_specific& operator=( const enumerable_thread_specific& other ) { if( this != &other ) { this->clear(); my_construct_callback->destroy(); internal_copy( other ); } return *this; } template enumerable_thread_specific& operator=( const enumerable_thread_specific& other ) { __TBB_ASSERT( static_cast(this)!=static_cast(&other), nullptr); // Objects of different types this->clear(); my_construct_callback->destroy(); internal_copy(other); return *this; } enumerable_thread_specific& 
operator=( enumerable_thread_specific&& other ) { if( this != &other ) { // TODO: use internal_move correctly here internal_swap(other); } return *this; } template enumerable_thread_specific& operator=( enumerable_thread_specific&& other ) { __TBB_ASSERT( static_cast(this)!=static_cast(&other), nullptr); // Objects of different types this->clear(); my_construct_callback->destroy(); internal_move(std::move(other)); return *this; } // CombineFunc has signature T(T,T) or T(const T&, const T&) template T combine(CombineFunc f_combine) { if(begin() == end()) { ets_element location; my_construct_callback->construct(location.value()); return *location.value_committed(); } const_iterator ci = begin(); T my_result = *ci; while(++ci != end()) my_result = f_combine( my_result, *ci ); return my_result; } // combine_func_t takes T by value or by [const] reference, and returns nothing template void combine_each(CombineFunc f_combine) { for(iterator ci = begin(); ci != end(); ++ci) { f_combine( *ci ); } } }; // enumerable_thread_specific template< typename Container > class flattened2d { // This intermediate typedef is to address issues with VC7.1 compilers using conval_type = typename Container::value_type; public: //! 
Basic types using size_type = typename conval_type::size_type; using difference_type = typename conval_type::difference_type; using allocator_type = typename conval_type::allocator_type; using value_type = typename conval_type::value_type; using reference = typename conval_type::reference; using const_reference = typename conval_type::const_reference; using pointer = typename conval_type::pointer; using const_pointer = typename conval_type::const_pointer; using iterator = segmented_iterator; using const_iterator = segmented_iterator; flattened2d( const Container &c, typename Container::const_iterator b, typename Container::const_iterator e ) : my_container(const_cast(&c)), my_begin(b), my_end(e) { } explicit flattened2d( const Container &c ) : my_container(const_cast(&c)), my_begin(c.begin()), my_end(c.end()) { } iterator begin() { return iterator(*my_container) = my_begin; } iterator end() { return iterator(*my_container) = my_end; } const_iterator begin() const { return const_iterator(*my_container) = my_begin; } const_iterator end() const { return const_iterator(*my_container) = my_end; } size_type size() const { size_type tot_size = 0; for(typename Container::const_iterator i = my_begin; i != my_end; ++i) { tot_size += i->size(); } return tot_size; } private: Container *my_container; typename Container::const_iterator my_begin; typename Container::const_iterator my_end; }; template flattened2d flatten2d(const Container &c, const typename Container::const_iterator b, const typename Container::const_iterator e) { return flattened2d(c, b, e); } template flattened2d flatten2d(const Container &c) { return flattened2d(c); } } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::enumerable_thread_specific; using detail::d1::flattened2d; using detail::d1::flatten2d; // ets enum keys using detail::d1::ets_key_usage_type; using detail::d1::ets_key_per_instance; using detail::d1::ets_no_key; #if __TBB_RESUMABLE_TASKS using 
detail::d1::ets_suspend_aware; #endif } // inline namespace v1 } // namespace tbb #endif // __TBB_enumerable_thread_specific_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/flow_graph.h ================================================ /* Copyright (c) 2005-2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_flow_graph_H #define __TBB_flow_graph_H #include #include #include #include "detail/_config.h" #include "detail/_namespace_injection.h" #include "spin_mutex.h" #include "null_mutex.h" #include "spin_rw_mutex.h" #include "null_rw_mutex.h" #include "detail/_pipeline_filters.h" #include "detail/_task.h" #include "detail/_small_object_pool.h" #include "cache_aligned_allocator.h" #include "detail/_exception.h" #include "detail/_template_helpers.h" #include "detail/_aggregator.h" #include "detail/_allocator_traits.h" #include "detail/_utils.h" #include "profiling.h" #include "task_arena.h" #if TBB_USE_PROFILING_TOOLS && ( __unix__ || __APPLE__ ) #if __INTEL_COMPILER // Disabled warning "routine is both inline and noinline" #pragma warning (push) #pragma warning( disable: 2196 ) #endif #define __TBB_NOINLINE_SYM __attribute__((noinline)) #else #define __TBB_NOINLINE_SYM #endif #include #include #include #include #if __TBB_CPP20_CONCEPTS_PRESENT #include #endif /** @file \brief The graph related classes and functions There are some applications that best express dependencies as messages passed between nodes in a graph. 
These messages may contain data or simply act as signals that a predecessors has completed. The graph class and its associated node classes can be used to express such applications. */ namespace tbb { namespace detail { namespace d2 { //! An enumeration the provides the two most common concurrency levels: unlimited and serial enum concurrency { unlimited = 0, serial = 1 }; //! A generic null type struct null_type {}; //! An empty class used for messages that mean "I'm done" class continue_msg {}; } // namespace d2 #if __TBB_CPP20_CONCEPTS_PRESENT inline namespace d0 { template concept node_body_return_type = std::same_as || std::convertible_to; // TODO: consider using std::invocable here template concept continue_node_body = std::copy_constructible && requires( Body& body, const tbb::detail::d2::continue_msg& v ) { { body(v) } -> node_body_return_type; }; template concept function_node_body = std::copy_constructible && std::invocable && node_body_return_type, Output>; template concept join_node_function_object = std::copy_constructible && std::invocable && std::convertible_to, Key>; template concept input_node_body = std::copy_constructible && requires( Body& body, tbb::detail::d1::flow_control& fc ) { { body(fc) } -> adaptive_same_as; }; template concept multifunction_node_body = std::copy_constructible && std::invocable; template concept sequencer = std::copy_constructible && std::invocable && std::convertible_to, std::size_t>; template concept async_node_body = std::copy_constructible && std::invocable; } // inline namespace d0 #endif // __TBB_CPP20_CONCEPTS_PRESENT namespace d2 { //! 
Forward declaration section template< typename T > class sender; template< typename T > class receiver; class continue_receiver; template< typename T, typename U > class limiter_node; // needed for resetting decrementer template class successor_cache; template class broadcast_cache; template class round_robin_cache; template class predecessor_cache; template class reservable_predecessor_cache; #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET namespace order { struct following; struct preceding; } template struct node_set; #endif } // namespace d2 } // namespace detail } // namespace tbb //! The graph class #include "detail/_flow_graph_impl.h" namespace tbb { namespace detail { namespace d2 { static inline std::pair order_tasks(graph_task* first, graph_task* second) { if (second->priority > first->priority) return std::make_pair(second, first); return std::make_pair(first, second); } // submit task if necessary. Returns the non-enqueued task if there is one. static inline graph_task* combine_tasks(graph& g, graph_task* left, graph_task* right) { // if no RHS task, don't change left. 
if (right == nullptr) return left; // right != nullptr if (left == nullptr) return right; if (left == SUCCESSFULLY_ENQUEUED) return right; // left contains a task if (right != SUCCESSFULLY_ENQUEUED) { // both are valid tasks auto tasks_pair = order_tasks(left, right); spawn_in_graph_arena(g, *tasks_pair.first); return tasks_pair.second; } return left; } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT class message_metainfo { public: using waiters_type = std::forward_list; message_metainfo() = default; message_metainfo(const waiters_type& waiters) : my_waiters(waiters) {} message_metainfo(waiters_type&& waiters) : my_waiters(std::move(waiters)) {} const waiters_type& waiters() const & { return my_waiters; } waiters_type&& waiters() && { return std::move(my_waiters); } bool empty() const { return my_waiters.empty(); } void merge(const message_metainfo& other) { // TODO: should we avoid duplications on merging my_waiters.insert_after(my_waiters.before_begin(), other.waiters().begin(), other.waiters().end()); } private: waiters_type my_waiters; }; // class message_metainfo #define __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo) , metainfo #else #define __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo) #endif // __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT //! Pure virtual template class that defines a sender of messages of type T template< typename T > class sender { public: virtual ~sender() {} //! Request an item from the sender virtual bool try_get( T & ) { return false; } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT virtual bool try_get( T &, message_metainfo& ) { return false; } #endif //! Reserves an item in the sender virtual bool try_reserve( T & ) { return false; } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT virtual bool try_reserve( T &, message_metainfo& ) { return false; } #endif //! Releases the reserved item virtual bool try_release( ) { return false; } //! Consumes the reserved item virtual bool try_consume( ) { return false; } protected: //! 
The output type of this sender typedef T output_type; //! The successor type for this node typedef receiver successor_type; //! Add a new successor to this node virtual bool register_successor( successor_type &r ) = 0; //! Removes a successor from this node virtual bool remove_successor( successor_type &r ) = 0; template friend bool register_successor(sender& s, receiver& r); template friend bool remove_successor (sender& s, receiver& r); }; // class sender template bool register_successor(sender& s, receiver& r) { return s.register_successor(r); } template bool remove_successor(sender& s, receiver& r) { return s.remove_successor(r); } //! Pure virtual template class that defines a receiver of messages of type T template< typename T > class receiver { private: template bool internal_try_put(const T& t, TryPutTaskArgs&&... args) { graph_task* res = try_put_task(t, std::forward(args)...); if (!res) return false; if (res != SUCCESSFULLY_ENQUEUED) spawn_in_graph_arena(graph_reference(), *res); return true; } public: //! Destructor virtual ~receiver() {} //! Put an item to the receiver bool try_put( const T& t ) { return internal_try_put(t); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT //! Put an item to the receiver and wait for completion bool try_put_and_wait( const T& t ) { // Since try_put_and_wait is a blocking call, it is safe to create wait_context on stack d1::wait_context_vertex msg_wait_vertex{}; bool res = internal_try_put(t, message_metainfo{message_metainfo::waiters_type{&msg_wait_vertex}}); if (res) { __TBB_ASSERT(graph_reference().my_context != nullptr, "No wait_context associated with the Flow Graph"); d1::wait(msg_wait_vertex.get_context(), *graph_reference().my_context); } return res; } #endif //! put item to successor; return task to run the successor if possible. protected: //! The input type of this receiver typedef T input_type; //! 
The predecessor type for this node typedef sender predecessor_type; template< typename R, typename B > friend class run_and_put_task; template< typename X, typename Y > friend class broadcast_cache; template< typename X, typename Y > friend class round_robin_cache; virtual graph_task *try_put_task(const T& t) = 0; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT virtual graph_task *try_put_task(const T& t, const message_metainfo&) = 0; #endif virtual graph& graph_reference() const = 0; template friend class successor_cache; virtual bool is_continue_receiver() { return false; } // TODO revamp: reconsider the inheritance and move node priority out of receiver virtual node_priority_t priority() const { return no_priority; } //! Add a predecessor to the node virtual bool register_predecessor( predecessor_type & ) { return false; } //! Remove a predecessor from the node virtual bool remove_predecessor( predecessor_type & ) { return false; } template friend bool register_predecessor(receiver& r, sender& s); template friend bool remove_predecessor (receiver& r, sender& s); }; // class receiver template bool register_predecessor(receiver& r, sender& s) { return r.register_predecessor(s); } template bool remove_predecessor(receiver& r, sender& s) { return r.remove_predecessor(s); } //! Base class for receivers of completion messages /** These receivers automatically reset, but cannot be explicitly waited on */ class continue_receiver : public receiver< continue_msg > { protected: //! Constructor explicit continue_receiver( int number_of_predecessors, node_priority_t a_priority ) { my_predecessor_count = my_initial_predecessor_count = number_of_predecessors; my_current_count = 0; my_priority = a_priority; } //! Copy constructor continue_receiver( const continue_receiver& src ) : receiver() { my_predecessor_count = my_initial_predecessor_count = src.my_initial_predecessor_count; my_current_count = 0; my_priority = src.my_priority; } //! 
// execute body is supposed to be too small to create a task for.
// Counts one incoming signal. Until the count reaches my_predecessor_count the call
// returns SUCCESSFULLY_ENQUEUED; on reaching it the counter is reset and execute() runs.
// Returns the task produced by execute(), or SUCCESSFULLY_ENQUEUED if it produced none.
graph_task* try_put_task_impl( const input_type& __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo) ) {
#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
    // Snapshot of the metainfo accumulated from all predecessors, used after the lock is dropped
    message_metainfo predecessor_metainfo;
#endif
    {
        // The counter and the accumulated metainfo are only touched under my_mutex
        spin_mutex::scoped_lock l(my_mutex);
#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
        // Prolong the wait and store the metainfo until receiving signals from all the predecessors
        for (auto waiter : metainfo.waiters()) {
            waiter->reserve(1);
        }
        my_current_metainfo.merge(metainfo);
#endif
        if ( ++my_current_count < my_predecessor_count )
            return SUCCESSFULLY_ENQUEUED;
        else {
            // Threshold reached: start counting again for the next round
            my_current_count = 0;
#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
            predecessor_metainfo = my_current_metainfo;
            my_current_metainfo = message_metainfo{};
#endif
        }
    }
    // execute() is called after the scoped_lock above has gone out of scope
#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
    graph_task* res = execute(predecessor_metainfo);
    // Balance the reserve(1) calls made while the signals were being collected
    for (auto waiter : predecessor_metainfo.waiters()) {
        waiter->release(1);
    }
#else
    graph_task* res = execute();
#endif
    return res? res : SUCCESSFULLY_ENQUEUED;
}
*/ #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT virtual graph_task* execute(const message_metainfo& metainfo) = 0; #else virtual graph_task* execute() = 0; #endif template friend class successor_cache; bool is_continue_receiver() override { return true; } node_priority_t priority() const override { return my_priority; } }; // class continue_receiver #if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING template K key_from_message( const T &t ) { return t.key(); } #endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ } // d1 } // detail } // tbb #include "detail/_flow_graph_trace_impl.h" #include "detail/_hash_compare.h" namespace tbb { namespace detail { namespace d2 { #include "detail/_flow_graph_body_impl.h" #include "detail/_flow_graph_cache_impl.h" #include "detail/_flow_graph_types_impl.h" using namespace graph_policy_namespace; template graph_iterator::graph_iterator(C *g, bool begin) : my_graph(g), current_node(nullptr) { if (begin) current_node = my_graph->my_nodes; //else it is an end iterator by default } template typename graph_iterator::reference graph_iterator::operator*() const { __TBB_ASSERT(current_node, "graph_iterator at end"); return *operator->(); } template typename graph_iterator::pointer graph_iterator::operator->() const { return current_node; } template void graph_iterator::internal_forward() { if (current_node) current_node = current_node->next; } //! 
Constructs a graph with isolated task_group_context inline graph::graph() : my_wait_context_vertex(0), my_nodes(nullptr), my_nodes_last(nullptr), my_task_arena(nullptr) { prepare_task_arena(); own_context = true; cancelled = false; caught_exception = false; my_context = new (r1::cache_aligned_allocate(sizeof(task_group_context))) task_group_context(FLOW_TASKS); fgt_graph(this); my_is_active = true; } inline graph::graph(task_group_context& use_this_context) : my_wait_context_vertex(0), my_context(&use_this_context), my_nodes(nullptr), my_nodes_last(nullptr), my_task_arena(nullptr) { prepare_task_arena(); own_context = false; cancelled = false; caught_exception = false; fgt_graph(this); my_is_active = true; } inline graph::~graph() { wait_for_all(); if (own_context) { my_context->~task_group_context(); r1::cache_aligned_deallocate(my_context); } delete my_task_arena; } inline void graph::reserve_wait() { my_wait_context_vertex.reserve(); fgt_reserve_wait(this); } inline void graph::release_wait() { fgt_release_wait(this); my_wait_context_vertex.release(); } inline void graph::register_node(graph_node *n) { n->next = nullptr; { spin_mutex::scoped_lock lock(nodelist_mutex); n->prev = my_nodes_last; if (my_nodes_last) my_nodes_last->next = n; my_nodes_last = n; if (!my_nodes) my_nodes = n; } } inline void graph::remove_node(graph_node *n) { { spin_mutex::scoped_lock lock(nodelist_mutex); __TBB_ASSERT(my_nodes && my_nodes_last, "graph::remove_node: Error: no registered nodes"); if (n->prev) n->prev->next = n->next; if (n->next) n->next->prev = n->prev; if (my_nodes_last == n) my_nodes_last = n->prev; if (my_nodes == n) my_nodes = n->next; } n->prev = n->next = nullptr; } inline void graph::reset( reset_flags f ) { // reset context deactivate_graph(*this); my_context->reset(); cancelled = false; caught_exception = false; // reset all the nodes comprising the graph for(iterator ii = begin(); ii != end(); ++ii) { graph_node *my_p = &(*ii); my_p->reset_node(f); } // 
Reattach the arena. Might be useful to run the graph in a particular task_arena // while not limiting graph lifetime to a single task_arena::execute() call. prepare_task_arena( /*reinit=*/true ); activate_graph(*this); } inline void graph::cancel() { my_context->cancel_group_execution(); } inline graph::iterator graph::begin() { return iterator(this, true); } inline graph::iterator graph::end() { return iterator(this, false); } inline graph::const_iterator graph::begin() const { return const_iterator(this, true); } inline graph::const_iterator graph::end() const { return const_iterator(this, false); } inline graph::const_iterator graph::cbegin() const { return const_iterator(this, true); } inline graph::const_iterator graph::cend() const { return const_iterator(this, false); } inline graph_node::graph_node(graph& g) : my_graph(g) { my_graph.register_node(this); } inline graph_node::~graph_node() { my_graph.remove_node(this); } #include "detail/_flow_graph_node_impl.h" //! An executable node that acts as a source, i.e. it has no predecessors template < typename Output > __TBB_requires(std::copyable) class input_node : public graph_node, public sender< Output > { public: //! The type of the output message, which is complete typedef Output output_type; //! The type of successors of this node typedef typename sender::successor_type successor_type; // Input node has no input type typedef null_type input_type; //! 
Constructor for a node with a successor template< typename Body > __TBB_requires(input_node_body) __TBB_NOINLINE_SYM input_node( graph &g, Body body ) : graph_node(g), my_active(false) , my_body( new input_body_leaf< output_type, Body>(body) ) , my_init_body( new input_body_leaf< output_type, Body>(body) ) , my_successors(this), my_reserved(false), my_has_cached_item(false) { fgt_node_with_body(CODEPTR(), FLOW_INPUT_NODE, &this->my_graph, static_cast *>(this), this->my_body); } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template __TBB_requires(input_node_body) input_node( const node_set& successors, Body body ) : input_node(successors.graph_reference(), body) { make_edges(*this, successors); } #endif //! Copy constructor __TBB_NOINLINE_SYM input_node( const input_node& src ) : graph_node(src.my_graph), sender() , my_active(false) , my_body(src.my_init_body->clone()), my_init_body(src.my_init_body->clone()) , my_successors(this), my_reserved(false), my_has_cached_item(false) { fgt_node_with_body(CODEPTR(), FLOW_INPUT_NODE, &this->my_graph, static_cast *>(this), this->my_body); } //! The destructor ~input_node() { delete my_body; delete my_init_body; } //! Add a new successor to this node bool register_successor( successor_type &r ) override { spin_mutex::scoped_lock lock(my_mutex); my_successors.register_successor(r); if ( my_active ) spawn_put(); return true; } //! Removes a successor from this node bool remove_successor( successor_type &r ) override { spin_mutex::scoped_lock lock(my_mutex); my_successors.remove_successor(r); return true; } //! Request an item from the node bool try_get( output_type &v ) override { spin_mutex::scoped_lock lock(my_mutex); if ( my_reserved ) return false; if ( my_has_cached_item ) { v = my_cached_item; my_has_cached_item = false; return true; } // we've been asked to provide an item, but we have none. enqueue a task to // provide one. if ( my_active ) spawn_put(); return false; } //! Reserves an item. 
/** Succeeds only when an item is cached and no reservation is outstanding.
    On success the item is copied to v and marked reserved; it stays cached. */
bool try_reserve( output_type &v ) override {
    spin_mutex::scoped_lock lock(my_mutex);
    if ( my_reserved || !my_has_cached_item ) {
        return false;
    }
    v = my_cached_item;
    my_reserved = true;
    return true;
}
resets the input_node to its initial state void reset_node( reset_flags f) override { my_active = false; my_reserved = false; my_has_cached_item = false; if(f & rf_clear_edges) my_successors.clear(); if(f & rf_reset_bodies) { input_body *tmp = my_init_body->clone(); delete my_body; my_body = tmp; } } private: spin_mutex my_mutex; bool my_active; input_body *my_body; input_body *my_init_body; broadcast_cache< output_type > my_successors; bool my_reserved; bool my_has_cached_item; output_type my_cached_item; // used by apply_body_bypass, can invoke body of node. bool try_reserve_apply_body(output_type &v) { spin_mutex::scoped_lock lock(my_mutex); if ( my_reserved ) { return false; } if ( !my_has_cached_item ) { d1::flow_control control; fgt_begin_body( my_body ); my_cached_item = (*my_body)(control); my_has_cached_item = !control.is_pipeline_stopped; fgt_end_body( my_body ); } if ( my_has_cached_item ) { v = my_cached_item; my_reserved = true; return true; } else { return false; } } graph_task* create_put_task() { d1::small_object_allocator allocator{}; typedef input_node_task_bypass< input_node > task_type; graph_task* t = allocator.new_object(my_graph, allocator, *this); return t; } //! Spawns a task that applies the body void spawn_put( ) { if(is_graph_active(this->my_graph)) { spawn_in_graph_arena(this->my_graph, *create_put_task()); } } friend class input_node_task_bypass< input_node >; //! Applies the body. Returning SUCCESSFULLY_ENQUEUED okay; forward_task_bypass will handle it. graph_task* apply_body_bypass( ) { output_type v; if ( !try_reserve_apply_body(v) ) return nullptr; graph_task *last_task = my_successors.try_put_task(v); if ( last_task ) try_consume(); else try_release(); return last_task; } }; // class input_node //! 
Implements a function node that supports Input -> Output template __TBB_requires(std::default_initializable && std::copy_constructible && std::copy_constructible) class function_node : public graph_node , public function_input< Input, Output, Policy, cache_aligned_allocator > , public function_output { typedef cache_aligned_allocator internals_allocator; public: typedef Input input_type; typedef Output output_type; typedef function_input input_impl_type; typedef function_input_queue input_queue_type; typedef function_output fOutput_type; typedef typename input_impl_type::predecessor_type predecessor_type; typedef typename fOutput_type::successor_type successor_type; using input_impl_type::my_predecessors; //! Constructor // input_queue_type is allocated here, but destroyed in the function_input_base. // TODO: pass the graph_buffer_policy to the function_input_base so it can all // be done in one place. This would be an interface-breaking change. template< typename Body > __TBB_requires(function_node_body) __TBB_NOINLINE_SYM function_node( graph &g, size_t concurrency, Body body, Policy = Policy(), node_priority_t a_priority = no_priority ) : graph_node(g), input_impl_type(g, concurrency, body, a_priority), fOutput_type(g) { fgt_node_with_body( CODEPTR(), FLOW_FUNCTION_NODE, &this->my_graph, static_cast *>(this), static_cast *>(this), this->my_body ); } template __TBB_requires(function_node_body) function_node( graph& g, size_t concurrency, Body body, node_priority_t a_priority ) : function_node(g, concurrency, body, Policy(), a_priority) {} #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template __TBB_requires(function_node_body) function_node( const node_set& nodes, size_t concurrency, Body body, Policy p = Policy(), node_priority_t a_priority = no_priority ) : function_node(nodes.graph_reference(), concurrency, body, p, a_priority) { make_edges_in_order(nodes, *this); } template __TBB_requires(function_node_body) function_node( const node_set& nodes, size_t concurrency, 
Body body, node_priority_t a_priority ) : function_node(nodes, concurrency, body, Policy(), a_priority) {} #endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET //! Copy constructor __TBB_NOINLINE_SYM function_node( const function_node& src ) : graph_node(src.my_graph), input_impl_type(src), fOutput_type(src.my_graph) { fgt_node_with_body( CODEPTR(), FLOW_FUNCTION_NODE, &this->my_graph, static_cast *>(this), static_cast *>(this), this->my_body ); } protected: template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; using input_impl_type::try_put_task; broadcast_cache &successors () override { return fOutput_type::my_successors; } void reset_node(reset_flags f) override { input_impl_type::reset_function_input(f); // TODO: use clear() instead. if(f & rf_clear_edges) { successors().clear(); my_predecessors.clear(); } __TBB_ASSERT(!(f & rf_clear_edges) || successors().empty(), "function_node successors not empty"); __TBB_ASSERT(this->my_predecessors.empty(), "function_node predecessors not empty"); } }; // class function_node //! implements a function node that supports Input -> (set of outputs) // Output is a tuple of output types. 
template __TBB_requires(std::default_initializable && std::copy_constructible) class multifunction_node : public graph_node, public multifunction_input < Input, typename wrap_tuple_elements< std::tuple_size::value, // #elements in tuple multifunction_output, // wrap this around each element Output // the tuple providing the types >::type, Policy, cache_aligned_allocator > { typedef cache_aligned_allocator internals_allocator; protected: static const int N = std::tuple_size::value; public: typedef Input input_type; typedef null_type output_type; typedef typename wrap_tuple_elements::type output_ports_type; typedef multifunction_input< input_type, output_ports_type, Policy, internals_allocator> input_impl_type; typedef function_input_queue input_queue_type; private: using input_impl_type::my_predecessors; public: template __TBB_requires(multifunction_node_body) __TBB_NOINLINE_SYM multifunction_node( graph &g, size_t concurrency, Body body, Policy = Policy(), node_priority_t a_priority = no_priority ) : graph_node(g), input_impl_type(g, concurrency, body, a_priority) { fgt_multioutput_node_with_body( CODEPTR(), FLOW_MULTIFUNCTION_NODE, &this->my_graph, static_cast *>(this), this->output_ports(), this->my_body ); } template __TBB_requires(multifunction_node_body) __TBB_NOINLINE_SYM multifunction_node(graph& g, size_t concurrency, Body body, node_priority_t a_priority) : multifunction_node(g, concurrency, body, Policy(), a_priority) {} #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template __TBB_requires(multifunction_node_body) __TBB_NOINLINE_SYM multifunction_node(const node_set& nodes, size_t concurrency, Body body, Policy p = Policy(), node_priority_t a_priority = no_priority) : multifunction_node(nodes.graph_reference(), concurrency, body, p, a_priority) { make_edges_in_order(nodes, *this); } template __TBB_requires(multifunction_node_body) __TBB_NOINLINE_SYM multifunction_node(const node_set& nodes, size_t concurrency, Body body, node_priority_t a_priority) : 
multifunction_node(nodes, concurrency, body, Policy(), a_priority) {} #endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET __TBB_NOINLINE_SYM multifunction_node( const multifunction_node &other) : graph_node(other.my_graph), input_impl_type(other) { fgt_multioutput_node_with_body( CODEPTR(), FLOW_MULTIFUNCTION_NODE, &this->my_graph, static_cast *>(this), this->output_ports(), this->my_body ); } // all the guts are in multifunction_input... protected: void reset_node(reset_flags f) override { input_impl_type::reset(f); } }; // multifunction_node //! split_node: accepts a tuple as input, forwards each element of the tuple to its // successors. The node has unlimited concurrency, so it does not reject inputs. template class split_node : public graph_node, public receiver { static const int N = std::tuple_size::value; typedef receiver base_type; public: typedef TupleType input_type; typedef typename wrap_tuple_elements< N, // #elements in tuple multifunction_output, // wrap this around each element TupleType // the tuple providing the types >::type output_ports_type; __TBB_NOINLINE_SYM explicit split_node(graph &g) : graph_node(g), my_output_ports(init_output_ports::call(g, my_output_ports)) { fgt_multioutput_node(CODEPTR(), FLOW_SPLIT_NODE, &this->my_graph, static_cast *>(this), this->output_ports()); } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template __TBB_NOINLINE_SYM split_node(const node_set& nodes) : split_node(nodes.graph_reference()) { make_edges_in_order(nodes, *this); } #endif __TBB_NOINLINE_SYM split_node(const split_node& other) : graph_node(other.my_graph), base_type(other), my_output_ports(init_output_ports::call(other.my_graph, my_output_ports)) { fgt_multioutput_node(CODEPTR(), FLOW_SPLIT_NODE, &this->my_graph, static_cast *>(this), this->output_ports()); } output_ports_type &output_ports() { return my_output_ports; } protected: graph_task *try_put_task(const TupleType& t) override { // Sending split messages in parallel is not justified, as overheads would 
prevail. // Also, we do not have successors here. So we just tell the task returned here is successful. return emit_element::emit_this(this->my_graph, t, output_ports()); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT graph_task* try_put_task(const TupleType& t, const message_metainfo& metainfo) override { // Sending split messages in parallel is not justified, as overheads would prevail. // Also, we do not have successors here. So we just tell the task returned here is successful. return emit_element::emit_this(this->my_graph, t, output_ports(), metainfo); } #endif void reset_node(reset_flags f) override { if (f & rf_clear_edges) clear_element::clear_this(my_output_ports); __TBB_ASSERT(!(f & rf_clear_edges) || clear_element::this_empty(my_output_ports), "split_node reset failed"); } graph& graph_reference() const override { return my_graph; } private: output_ports_type my_output_ports; }; //! Implements an executable node that supports continue_msg -> Output template > __TBB_requires(std::copy_constructible) class continue_node : public graph_node, public continue_input, public function_output { public: typedef continue_msg input_type; typedef Output output_type; typedef continue_input input_impl_type; typedef function_output fOutput_type; typedef typename input_impl_type::predecessor_type predecessor_type; typedef typename fOutput_type::successor_type successor_type; //! 
Constructor for executable node with continue_msg -> Output template __TBB_requires(continue_node_body) __TBB_NOINLINE_SYM continue_node( graph &g, Body body, Policy = Policy(), node_priority_t a_priority = no_priority ) : graph_node(g), input_impl_type( g, body, a_priority ), fOutput_type(g) { fgt_node_with_body( CODEPTR(), FLOW_CONTINUE_NODE, &this->my_graph, static_cast *>(this), static_cast *>(this), this->my_body ); } template __TBB_requires(continue_node_body) continue_node( graph& g, Body body, node_priority_t a_priority ) : continue_node(g, body, Policy(), a_priority) {} #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template __TBB_requires(continue_node_body) continue_node( const node_set& nodes, Body body, Policy p = Policy(), node_priority_t a_priority = no_priority ) : continue_node(nodes.graph_reference(), body, p, a_priority ) { make_edges_in_order(nodes, *this); } template __TBB_requires(continue_node_body) continue_node( const node_set& nodes, Body body, node_priority_t a_priority) : continue_node(nodes, body, Policy(), a_priority) {} #endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET //! 
Constructor for executable node with continue_msg -> Output template __TBB_requires(continue_node_body) __TBB_NOINLINE_SYM continue_node( graph &g, int number_of_predecessors, Body body, Policy = Policy(), node_priority_t a_priority = no_priority ) : graph_node(g) , input_impl_type(g, number_of_predecessors, body, a_priority), fOutput_type(g) { fgt_node_with_body( CODEPTR(), FLOW_CONTINUE_NODE, &this->my_graph, static_cast *>(this), static_cast *>(this), this->my_body ); } template __TBB_requires(continue_node_body) continue_node( graph& g, int number_of_predecessors, Body body, node_priority_t a_priority) : continue_node(g, number_of_predecessors, body, Policy(), a_priority) {} #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template __TBB_requires(continue_node_body) continue_node( const node_set& nodes, int number_of_predecessors, Body body, Policy p = Policy(), node_priority_t a_priority = no_priority ) : continue_node(nodes.graph_reference(), number_of_predecessors, body, p, a_priority) { make_edges_in_order(nodes, *this); } template __TBB_requires(continue_node_body) continue_node( const node_set& nodes, int number_of_predecessors, Body body, node_priority_t a_priority ) : continue_node(nodes, number_of_predecessors, body, Policy(), a_priority) {} #endif //! 
Copy constructor __TBB_NOINLINE_SYM continue_node( const continue_node& src ) : graph_node(src.my_graph), input_impl_type(src), function_output(src.my_graph) { fgt_node_with_body( CODEPTR(), FLOW_CONTINUE_NODE, &this->my_graph, static_cast *>(this), static_cast *>(this), this->my_body ); } protected: template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; using input_impl_type::try_put_task; broadcast_cache &successors () override { return fOutput_type::my_successors; } void reset_node(reset_flags f) override { input_impl_type::reset_receiver(f); if(f & rf_clear_edges)successors().clear(); __TBB_ASSERT(!(f & rf_clear_edges) || successors().empty(), "continue_node not reset"); } }; // continue_node //! Forwards messages of type T to all successors template class broadcast_node : public graph_node, public receiver, public sender { public: typedef T input_type; typedef T output_type; typedef typename receiver::predecessor_type predecessor_type; typedef typename sender::successor_type successor_type; private: broadcast_cache my_successors; public: __TBB_NOINLINE_SYM explicit broadcast_node(graph& g) : graph_node(g), my_successors(this) { fgt_node( CODEPTR(), FLOW_BROADCAST_NODE, &this->my_graph, static_cast *>(this), static_cast *>(this) ); } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template broadcast_node(const node_set& nodes) : broadcast_node(nodes.graph_reference()) { make_edges_in_order(nodes, *this); } #endif // Copy constructor __TBB_NOINLINE_SYM broadcast_node( const broadcast_node& src ) : broadcast_node(src.my_graph) {} //! Adds a successor bool register_successor( successor_type &r ) override { my_successors.register_successor( r ); return true; } //! 
Removes s as a successor bool remove_successor( successor_type &r ) override { my_successors.remove_successor( r ); return true; } private: graph_task* try_put_task_impl(const T& t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { graph_task* new_task = my_successors.try_put_task(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); if (!new_task) new_task = SUCCESSFULLY_ENQUEUED; return new_task; } protected: template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; //! build a task to run the successor if possible. Default is old behavior. graph_task* try_put_task(const T& t) override { return try_put_task_impl(t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT graph_task* try_put_task(const T& t, const message_metainfo& metainfo) override { return try_put_task_impl(t, metainfo); } #endif graph& graph_reference() const override { return my_graph; } void reset_node(reset_flags f) override { if (f&rf_clear_edges) { my_successors.clear(); } __TBB_ASSERT(!(f & rf_clear_edges) || my_successors.empty(), "Error resetting broadcast_node"); } }; // broadcast_node //! 
Forwards messages in arbitrary order template class buffer_node : public graph_node , public reservable_item_buffer< T, cache_aligned_allocator > , public receiver, public sender { typedef cache_aligned_allocator internals_allocator; public: typedef T input_type; typedef T output_type; typedef typename receiver::predecessor_type predecessor_type; typedef typename sender::successor_type successor_type; typedef buffer_node class_type; protected: typedef size_t size_type; round_robin_cache< T, null_rw_mutex > my_successors; friend class forward_task_bypass< class_type >; enum op_type {reg_succ, rem_succ, req_item, res_item, rel_res, con_res, put_item, try_fwd_task }; // implements the aggregator_operation concept class buffer_operation : public d1::aggregated_operation< buffer_operation > { public: char type; T* elem; graph_task* ltask; successor_type *r; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT message_metainfo* metainfo{ nullptr }; #endif buffer_operation(const T& e, op_type t) : type(char(t)) , elem(const_cast(&e)) , ltask(nullptr) , r(nullptr) {} #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT buffer_operation(const T& e, op_type t, const message_metainfo& info) : type(char(t)), elem(const_cast(&e)), ltask(nullptr), r(nullptr) , metainfo(const_cast(&info)) {} buffer_operation(op_type t, message_metainfo& info) : type(char(t)), elem(nullptr), ltask(nullptr), r(nullptr), metainfo(&info) {} #endif buffer_operation(op_type t) : type(char(t)), elem(nullptr), ltask(nullptr), r(nullptr) {} }; bool forwarder_busy; typedef d1::aggregating_functor handler_type; friend class d1::aggregating_functor; d1::aggregator< handler_type, buffer_operation> my_aggregator; virtual void handle_operations(buffer_operation *op_list) { handle_operations_impl(op_list, this); } template void handle_operations_impl(buffer_operation *op_list, derived_type* derived) { __TBB_ASSERT(static_cast(derived) == this, "'this' is not a base class for derived"); buffer_operation *tmp = nullptr; bool 
try_forwarding = false; while (op_list) { tmp = op_list; op_list = op_list->next; switch (tmp->type) { case reg_succ: internal_reg_succ(tmp); try_forwarding = true; break; case rem_succ: internal_rem_succ(tmp); break; case req_item: internal_pop(tmp); break; case res_item: internal_reserve(tmp); break; case rel_res: internal_release(tmp); try_forwarding = true; break; case con_res: internal_consume(tmp); try_forwarding = true; break; case put_item: try_forwarding = internal_push(tmp); break; case try_fwd_task: internal_forward_task(tmp); break; } } derived->order(); if (try_forwarding && !forwarder_busy) { if(is_graph_active(this->my_graph)) { forwarder_busy = true; typedef forward_task_bypass task_type; d1::small_object_allocator allocator{}; graph_task* new_task = allocator.new_object(graph_reference(), allocator, *this); // tmp should point to the last item handled by the aggregator. This is the operation // the handling thread enqueued. So modifying that record will be okay. // TODO revamp: check that the issue is still present // workaround for icc bug (at least 12.0 and 13.0) // error: function "tbb::flow::interfaceX::combine_tasks" cannot be called with the given argument list // argument types are: (graph, graph_task *, graph_task *) graph_task *z = tmp->ltask; graph &g = this->my_graph; tmp->ltask = combine_tasks(g, z, new_task); // in case the op generated a task } } } // handle_operations inline graph_task *grab_forwarding_task( buffer_operation &op_data) { return op_data.ltask; } inline bool enqueue_forwarding_task(buffer_operation &op_data) { graph_task *ft = grab_forwarding_task(op_data); if(ft) { spawn_in_graph_arena(graph_reference(), *ft); return true; } return false; } //! 
This is executed by an enqueued task, the "forwarder" virtual graph_task *forward_task() { buffer_operation op_data(try_fwd_task); graph_task *last_task = nullptr; do { op_data.status = WAIT; op_data.ltask = nullptr; my_aggregator.execute(&op_data); // workaround for icc bug graph_task *xtask = op_data.ltask; graph& g = this->my_graph; last_task = combine_tasks(g, last_task, xtask); } while (op_data.status ==SUCCEEDED); return last_task; } //! Register successor virtual void internal_reg_succ(buffer_operation *op) { __TBB_ASSERT(op->r, nullptr); my_successors.register_successor(*(op->r)); op->status.store(SUCCEEDED, std::memory_order_release); } //! Remove successor virtual void internal_rem_succ(buffer_operation *op) { __TBB_ASSERT(op->r, nullptr); my_successors.remove_successor(*(op->r)); op->status.store(SUCCEEDED, std::memory_order_release); } private: void order() {} bool is_item_valid() { return this->my_item_valid(this->my_tail - 1); } void try_put_and_add_task(graph_task*& last_task) { graph_task* new_task = my_successors.try_put_task(this->back() __TBB_FLOW_GRAPH_METAINFO_ARG(this->back_metainfo())); if (new_task) { // workaround for icc bug graph& g = this->my_graph; last_task = combine_tasks(g, last_task, new_task); this->destroy_back(); } } protected: //! 
Tries to forward valid items to successors virtual void internal_forward_task(buffer_operation *op) { internal_forward_task_impl(op, this); } template void internal_forward_task_impl(buffer_operation *op, derived_type* derived) { __TBB_ASSERT(static_cast(derived) == this, "'this' is not a base class for derived"); if (this->my_reserved || !derived->is_item_valid()) { op->status.store(FAILED, std::memory_order_release); this->forwarder_busy = false; return; } // Try forwarding, giving each successor a chance graph_task* last_task = nullptr; size_type counter = my_successors.size(); for (; counter > 0 && derived->is_item_valid(); --counter) derived->try_put_and_add_task(last_task); op->ltask = last_task; // return task if (last_task && !counter) { op->status.store(SUCCEEDED, std::memory_order_release); } else { op->status.store(FAILED, std::memory_order_release); forwarder_busy = false; } } virtual bool internal_push(buffer_operation *op) { __TBB_ASSERT(op->elem, nullptr); #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT __TBB_ASSERT(op->metainfo, nullptr); this->push_back(*(op->elem), (*op->metainfo)); #else this->push_back(*(op->elem)); #endif op->status.store(SUCCEEDED, std::memory_order_release); return true; } virtual void internal_pop(buffer_operation *op) { __TBB_ASSERT(op->elem, nullptr); #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT bool pop_result = op->metainfo ? this->pop_back(*(op->elem), *(op->metainfo)) : this->pop_back(*(op->elem)); #else bool pop_result = this->pop_back(*(op->elem)); #endif if (pop_result) { op->status.store(SUCCEEDED, std::memory_order_release); } else { op->status.store(FAILED, std::memory_order_release); } } virtual void internal_reserve(buffer_operation *op) { __TBB_ASSERT(op->elem, nullptr); #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT bool reserve_result = op->metainfo ? 
this->reserve_front(*(op->elem), *(op->metainfo)) : this->reserve_front(*(op->elem)); #else bool reserve_result = this->reserve_front(*(op->elem)); #endif if (reserve_result) { op->status.store(SUCCEEDED, std::memory_order_release); } else { op->status.store(FAILED, std::memory_order_release); } } virtual void internal_consume(buffer_operation *op) { this->consume_front(); op->status.store(SUCCEEDED, std::memory_order_release); } virtual void internal_release(buffer_operation *op) { this->release_front(); op->status.store(SUCCEEDED, std::memory_order_release); } public: //! Constructor __TBB_NOINLINE_SYM explicit buffer_node( graph &g ) : graph_node(g), reservable_item_buffer(), receiver(), sender(), my_successors(this), forwarder_busy(false) { my_aggregator.initialize_handler(handler_type(this)); fgt_node( CODEPTR(), FLOW_BUFFER_NODE, &this->my_graph, static_cast *>(this), static_cast *>(this) ); } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template buffer_node(const node_set& nodes) : buffer_node(nodes.graph_reference()) { make_edges_in_order(nodes, *this); } #endif //! Copy constructor __TBB_NOINLINE_SYM buffer_node( const buffer_node& src ) : buffer_node(src.my_graph) {} // // message sender implementation // //! Adds a new successor. /** Adds successor r to the list of successors; may forward tasks. */ bool register_successor( successor_type &r ) override { buffer_operation op_data(reg_succ); op_data.r = &r; my_aggregator.execute(&op_data); (void)enqueue_forwarding_task(op_data); return true; } //! Removes a successor. /** Removes successor r from the list of successors. It also calls r.remove_predecessor(*this) to remove this node as a predecessor. 
*/ bool remove_successor( successor_type &r ) override { // TODO revamp: investigate why full qualification is necessary here tbb::detail::d2::remove_predecessor(r, *this); buffer_operation op_data(rem_succ); op_data.r = &r; my_aggregator.execute(&op_data); // even though this operation does not cause a forward, if we are the handler, and // a forward is scheduled, we may be the first to reach this point after the aggregator, // and so should check for the task. (void)enqueue_forwarding_task(op_data); return true; } //! Request an item from the buffer_node /** true = v contains the returned item
false = no item has been returned */ bool try_get( T &v ) override { buffer_operation op_data(req_item); op_data.elem = &v; my_aggregator.execute(&op_data); (void)enqueue_forwarding_task(op_data); return (op_data.status==SUCCEEDED); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT bool try_get( T &v, message_metainfo& metainfo ) override { buffer_operation op_data(req_item, metainfo); op_data.elem = &v; my_aggregator.execute(&op_data); (void)enqueue_forwarding_task(op_data); return (op_data.status==SUCCEEDED); } #endif //! Reserves an item. /** false = no item can be reserved
true = an item is reserved */ bool try_reserve( T &v ) override { buffer_operation op_data(res_item); op_data.elem = &v; my_aggregator.execute(&op_data); (void)enqueue_forwarding_task(op_data); return (op_data.status==SUCCEEDED); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT bool try_reserve( output_type& v, message_metainfo& metainfo ) override { buffer_operation op_data(res_item, metainfo); op_data.elem = &v; my_aggregator.execute(&op_data); (void)enqueue_forwarding_task(op_data); return op_data.status==SUCCEEDED; } #endif //! Release a reserved item. /** true = item has been released and so remains in sender */ bool try_release() override { buffer_operation op_data(rel_res); my_aggregator.execute(&op_data); (void)enqueue_forwarding_task(op_data); return true; } //! Consumes a reserved item. /** true = item is removed from sender and reservation removed */ bool try_consume() override { buffer_operation op_data(con_res); my_aggregator.execute(&op_data); (void)enqueue_forwarding_task(op_data); return true; } private: graph_task* try_put_task_impl(const T& t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { buffer_operation op_data(t, put_item __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); my_aggregator.execute(&op_data); graph_task *ft = grab_forwarding_task(op_data); // sequencer_nodes can return failure (if an item has been previously inserted) // We have to spawn the returned task if our own operation fails. if(ft && op_data.status ==FAILED) { // we haven't succeeded queueing the item, but for some reason the // call returned a task (if another request resulted in a successful // forward this could happen.) Queue the task and reset the pointer. 
spawn_in_graph_arena(graph_reference(), *ft); ft = nullptr; } else if(!ft && op_data.status ==SUCCEEDED) { ft = SUCCESSFULLY_ENQUEUED; } return ft; } protected: template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; //! receive an item, return a task *if possible graph_task *try_put_task(const T &t) override { return try_put_task_impl(t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT graph_task* try_put_task(const T& t, const message_metainfo& metainfo) override { return try_put_task_impl(t, metainfo); } #endif graph& graph_reference() const override { return my_graph; } protected: void reset_node( reset_flags f) override { reservable_item_buffer::reset(); // TODO: just clear structures if (f&rf_clear_edges) { my_successors.clear(); } forwarder_busy = false; } }; // buffer_node //! Forwards messages in FIFO order template class queue_node : public buffer_node { protected: typedef buffer_node base_type; typedef typename base_type::size_type size_type; typedef typename base_type::buffer_operation queue_operation; typedef queue_node class_type; private: template friend class buffer_node; bool is_item_valid() { return this->my_item_valid(this->my_head); } void try_put_and_add_task(graph_task*& last_task) { graph_task* new_task = this->my_successors.try_put_task(this->front() __TBB_FLOW_GRAPH_METAINFO_ARG(this->front_metainfo())); if (new_task) { // workaround for icc bug graph& graph_ref = this->graph_reference(); last_task = combine_tasks(graph_ref, last_task, new_task); this->destroy_front(); } } protected: void internal_forward_task(queue_operation *op) override { this->internal_forward_task_impl(op, this); } void internal_pop(queue_operation *op) override { if ( this->my_reserved || !this->my_item_valid(this->my_head)){ op->status.store(FAILED, std::memory_order_release); } else { #if 
__TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT if (op->metainfo) { this->pop_front(*(op->elem), *(op->metainfo)); } else #endif { this->pop_front(*(op->elem)); } op->status.store(SUCCEEDED, std::memory_order_release); } } void internal_reserve(queue_operation *op) override { if (this->my_reserved || !this->my_item_valid(this->my_head)) { op->status.store(FAILED, std::memory_order_release); } else { #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT if (op->metainfo) { this->reserve_front(*(op->elem), *(op->metainfo)); } else #endif { this->reserve_front(*(op->elem)); } op->status.store(SUCCEEDED, std::memory_order_release); } } void internal_consume(queue_operation *op) override { this->consume_front(); op->status.store(SUCCEEDED, std::memory_order_release); } public: typedef T input_type; typedef T output_type; typedef typename receiver::predecessor_type predecessor_type; typedef typename sender::successor_type successor_type; //! Constructor __TBB_NOINLINE_SYM explicit queue_node( graph &g ) : base_type(g) { fgt_node( CODEPTR(), FLOW_QUEUE_NODE, &(this->my_graph), static_cast *>(this), static_cast *>(this) ); } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template queue_node( const node_set& nodes) : queue_node(nodes.graph_reference()) { make_edges_in_order(nodes, *this); } #endif //! Copy constructor __TBB_NOINLINE_SYM queue_node( const queue_node& src) : base_type(src) { fgt_node( CODEPTR(), FLOW_QUEUE_NODE, &(this->my_graph), static_cast *>(this), static_cast *>(this) ); } protected: void reset_node( reset_flags f) override { base_type::reset_node(f); } }; // queue_node //! Forwards messages in sequence order template __TBB_requires(std::copyable) class sequencer_node : public queue_node { function_body< T, size_t > *my_sequencer; // my_sequencer should be a benign function and must be callable // from a parallel context. Does this mean it needn't be reset? 
public: typedef T input_type; typedef T output_type; typedef typename receiver::predecessor_type predecessor_type; typedef typename sender::successor_type successor_type; //! Constructor template< typename Sequencer > __TBB_requires(sequencer) __TBB_NOINLINE_SYM sequencer_node( graph &g, const Sequencer& s ) : queue_node(g), my_sequencer(new function_body_leaf< T, size_t, Sequencer>(s) ) { fgt_node( CODEPTR(), FLOW_SEQUENCER_NODE, &(this->my_graph), static_cast *>(this), static_cast *>(this) ); } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template __TBB_requires(sequencer) sequencer_node( const node_set& nodes, const Sequencer& s) : sequencer_node(nodes.graph_reference(), s) { make_edges_in_order(nodes, *this); } #endif //! Copy constructor __TBB_NOINLINE_SYM sequencer_node( const sequencer_node& src ) : queue_node(src), my_sequencer( src.my_sequencer->clone() ) { fgt_node( CODEPTR(), FLOW_SEQUENCER_NODE, &(this->my_graph), static_cast *>(this), static_cast *>(this) ); } //! Destructor ~sequencer_node() { delete my_sequencer; } protected: typedef typename buffer_node::size_type size_type; typedef typename buffer_node::buffer_operation sequencer_operation; private: bool internal_push(sequencer_operation *op) override { size_type tag = (*my_sequencer)(*(op->elem)); #if !TBB_DEPRECATED_SEQUENCER_DUPLICATES if (tag < this->my_head) { // have already emitted a message with this tag op->status.store(FAILED, std::memory_order_release); return false; } #endif // cannot modify this->my_tail now; the buffer would be inconsistent. size_t new_tail = (tag+1 > this->my_tail) ? tag+1 : this->my_tail; if (this->size(new_tail) > this->capacity()) { this->grow_my_array(this->size(new_tail)); } this->my_tail = new_tail; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT __TBB_ASSERT(op->metainfo, nullptr); bool place_item_result = this->place_item(tag, *(op->elem), *(op->metainfo)); const op_stat res = place_item_result ? 
SUCCEEDED : FAILED; #else const op_stat res = this->place_item(tag, *(op->elem)) ? SUCCEEDED : FAILED; #endif op->status.store(res, std::memory_order_release); return res ==SUCCEEDED; } }; // sequencer_node //! Forwards messages in priority order template> class priority_queue_node : public buffer_node { public: typedef T input_type; typedef T output_type; typedef buffer_node base_type; typedef priority_queue_node class_type; typedef typename receiver::predecessor_type predecessor_type; typedef typename sender::successor_type successor_type; //! Constructor __TBB_NOINLINE_SYM explicit priority_queue_node( graph &g, const Compare& comp = Compare() ) : buffer_node(g), compare(comp), mark(0) { fgt_node( CODEPTR(), FLOW_PRIORITY_QUEUE_NODE, &(this->my_graph), static_cast *>(this), static_cast *>(this) ); } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template priority_queue_node(const node_set& nodes, const Compare& comp = Compare()) : priority_queue_node(nodes.graph_reference(), comp) { make_edges_in_order(nodes, *this); } #endif //! Copy constructor __TBB_NOINLINE_SYM priority_queue_node( const priority_queue_node &src ) : buffer_node(src), mark(0) { fgt_node( CODEPTR(), FLOW_PRIORITY_QUEUE_NODE, &(this->my_graph), static_cast *>(this), static_cast *>(this) ); } protected: void reset_node( reset_flags f) override { mark = 0; base_type::reset_node(f); } typedef typename buffer_node::size_type size_type; typedef typename buffer_node::item_type item_type; typedef typename buffer_node::buffer_operation prio_operation; //! 
Tries to forward valid items to successors void internal_forward_task(prio_operation *op) override { this->internal_forward_task_impl(op, this); } void handle_operations(prio_operation *op_list) override { this->handle_operations_impl(op_list, this); } bool internal_push(prio_operation *op) override { #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT __TBB_ASSERT(op->metainfo, nullptr); prio_push(*(op->elem), *(op->metainfo)); #else prio_push(*(op->elem)); #endif op->status.store(SUCCEEDED, std::memory_order_release); return true; } void internal_pop(prio_operation *op) override { // if empty or already reserved, don't pop if ( this->my_reserved == true || this->my_tail == 0 ) { op->status.store(FAILED, std::memory_order_release); return; } *(op->elem) = prio(); #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT if (op->metainfo) { *(op->metainfo) = std::move(prio_metainfo()); } #endif op->status.store(SUCCEEDED, std::memory_order_release); prio_pop(); } // pops the highest-priority item, saves copy void internal_reserve(prio_operation *op) override { if (this->my_reserved == true || this->my_tail == 0) { op->status.store(FAILED, std::memory_order_release); return; } this->my_reserved = true; *(op->elem) = prio(); #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT if (op->metainfo) { *(op->metainfo) = std::move(prio_metainfo()); reserved_metainfo = *(op->metainfo); } #endif reserved_item = *(op->elem); op->status.store(SUCCEEDED, std::memory_order_release); prio_pop(); } void internal_consume(prio_operation *op) override { op->status.store(SUCCEEDED, std::memory_order_release); this->my_reserved = false; reserved_item = input_type(); #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT for (auto waiter : reserved_metainfo.waiters()) { waiter->release(1); } reserved_metainfo = message_metainfo{}; #endif } void internal_release(prio_operation *op) override { op->status.store(SUCCEEDED, std::memory_order_release); prio_push(reserved_item __TBB_FLOW_GRAPH_METAINFO_ARG(reserved_metainfo)); 
this->my_reserved = false; reserved_item = input_type(); #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT for (auto waiter : reserved_metainfo.waiters()) { waiter->release(1); } reserved_metainfo = message_metainfo{}; #endif } private: template friend class buffer_node; void order() { if (mark < this->my_tail) heapify(); __TBB_ASSERT(mark == this->my_tail, "mark unequal after heapify"); } bool is_item_valid() { return this->my_tail > 0; } void try_put_and_add_task(graph_task*& last_task) { graph_task* new_task = this->my_successors.try_put_task(this->prio() __TBB_FLOW_GRAPH_METAINFO_ARG(this->prio_metainfo())); if (new_task) { // workaround for icc bug graph& graph_ref = this->graph_reference(); last_task = combine_tasks(graph_ref, last_task, new_task); prio_pop(); } } private: Compare compare; size_type mark; input_type reserved_item; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT message_metainfo reserved_metainfo; #endif // in case a reheap has not been done after a push, check if the mark item is higher than the 0'th item bool prio_use_tail() { __TBB_ASSERT(mark <= this->my_tail, "mark outside bounds before test"); return mark < this->my_tail && compare(this->get_my_item(0), this->get_my_item(this->my_tail - 1)); } // prio_push: checks that the item will fit, expand array if necessary, put at end void prio_push(const T &src __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) { if ( this->my_tail >= this->my_array_size ) this->grow_my_array( this->my_tail + 1 ); (void) this->place_item(this->my_tail, src __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); ++(this->my_tail); __TBB_ASSERT(mark < this->my_tail, "mark outside bounds after push"); } // prio_pop: deletes highest priority item from the array, and if it is item // 0, move last item to 0 and reheap. If end of array, just destroy and decrement tail // and mark. Assumes the array has already been tested for emptiness; no failure. 
void prio_pop() { if (prio_use_tail()) { // there are newly pushed elements; last one higher than top // copy the data this->destroy_item(this->my_tail-1); --(this->my_tail); __TBB_ASSERT(mark <= this->my_tail, "mark outside bounds after pop"); return; } this->destroy_item(0); if(this->my_tail > 1) { // push the last element down heap __TBB_ASSERT(this->my_item_valid(this->my_tail - 1), nullptr); this->move_item(0,this->my_tail - 1); } --(this->my_tail); if(mark > this->my_tail) --mark; if (this->my_tail > 1) // don't reheap for heap of size 1 reheap(); __TBB_ASSERT(mark <= this->my_tail, "mark outside bounds after pop"); } const T& prio() { return this->get_my_item(prio_use_tail() ? this->my_tail-1 : 0); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT message_metainfo& prio_metainfo() { return this->get_my_metainfo(prio_use_tail() ? this->my_tail-1 : 0); } #endif // turn array into heap void heapify() { if(this->my_tail == 0) { mark = 0; return; } if (!mark) mark = 1; for (; markmy_tail; ++mark) { // for each unheaped element size_type cur_pos = mark; input_type to_place; #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT message_metainfo metainfo; #endif this->fetch_item(mark, to_place __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); do { // push to_place up the heap size_type parent = (cur_pos-1)>>1; if (!compare(this->get_my_item(parent), to_place)) break; this->move_item(cur_pos, parent); cur_pos = parent; } while( cur_pos ); this->place_item(cur_pos, to_place __TBB_FLOW_GRAPH_METAINFO_ARG(std::move(metainfo))); } } // otherwise heapified array with new root element; rearrange to heap void reheap() { size_type cur_pos=0, child=1; while (child < mark) { size_type target = child; if (child+1get_my_item(child), this->get_my_item(child+1))) ++target; // target now has the higher priority child if (compare(this->get_my_item(target), this->get_my_item(cur_pos))) break; // swap this->swap_items(cur_pos, target); cur_pos = target; child = (cur_pos<<1)+1; } } }; // 
priority_queue_node //! Forwards messages only if the threshold has not been reached /** This node forwards items until its threshold is reached. It contains no buffering. If the downstream node rejects, the message is dropped. */ template< typename T, typename DecrementType=continue_msg > class limiter_node : public graph_node, public receiver< T >, public sender< T > { public: typedef T input_type; typedef T output_type; typedef typename receiver::predecessor_type predecessor_type; typedef typename sender::successor_type successor_type; //TODO: There is a lack of predefined types for its controlling "decrementer" port. It should be fixed later. private: size_t my_threshold; size_t my_count; // number of successful puts size_t my_tries; // number of active put attempts size_t my_future_decrement; // number of active decrement reservable_predecessor_cache< T, spin_mutex > my_predecessors; spin_mutex my_mutex; broadcast_cache< T > my_successors; //! The internal receiver< DecrementType > that adjusts the count threshold_regulator< limiter_node, DecrementType > decrement; graph_task* decrement_counter( long long delta ) { if ( delta > 0 && size_t(delta) > my_threshold ) { delta = my_threshold; } { spin_mutex::scoped_lock lock(my_mutex); if ( delta > 0 && size_t(delta) > my_count ) { if( my_tries > 0 ) { my_future_decrement += (size_t(delta) - my_count); } my_count = 0; } else if ( delta < 0 && size_t(-delta) > my_threshold - my_count ) { my_count = my_threshold; } else { my_count -= size_t(delta); // absolute value of delta is sufficiently small } __TBB_ASSERT(my_count <= my_threshold, "counter values are truncated to be inside the [0, threshold] interval"); } return forward_task(); } // Let threshold_regulator call decrement_counter() friend class threshold_regulator< limiter_node, DecrementType >; friend class forward_task_bypass< limiter_node >; bool check_conditions() { // always called under lock return ( my_count + my_tries < my_threshold && 
!my_predecessors.empty() && !my_successors.empty() ); } // only returns a valid task pointer or nullptr, never SUCCESSFULLY_ENQUEUED graph_task* forward_task() { input_type v; graph_task* rval = nullptr; bool reserved = false; { spin_mutex::scoped_lock lock(my_mutex); if ( check_conditions() ) ++my_tries; else return nullptr; } //SUCCESS // if we can reserve and can put, we consume the reservation // we increment the count and decrement the tries #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT message_metainfo metainfo; #endif if ( (my_predecessors.try_reserve(v __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo))) == true ) { reserved = true; if ( (rval = my_successors.try_put_task(v __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo))) != nullptr ) { { spin_mutex::scoped_lock lock(my_mutex); ++my_count; if ( my_future_decrement ) { if ( my_count > my_future_decrement ) { my_count -= my_future_decrement; my_future_decrement = 0; } else { my_future_decrement -= my_count; my_count = 0; } } --my_tries; my_predecessors.try_consume(); if ( check_conditions() ) { if ( is_graph_active(this->my_graph) ) { typedef forward_task_bypass> task_type; d1::small_object_allocator allocator{}; graph_task* rtask = allocator.new_object( my_graph, allocator, *this ); spawn_in_graph_arena(graph_reference(), *rtask); } } } return rval; } } //FAILURE //if we can't reserve, we decrement the tries //if we can reserve but can't put, we decrement the tries and release the reservation { spin_mutex::scoped_lock lock(my_mutex); --my_tries; if (reserved) my_predecessors.try_release(); if ( check_conditions() ) { if ( is_graph_active(this->my_graph) ) { d1::small_object_allocator allocator{}; typedef forward_task_bypass> task_type; graph_task* t = allocator.new_object(my_graph, allocator, *this); __TBB_ASSERT(!rval, "Have two tasks to handle"); return t; } } return rval; } } void initialize() { fgt_node( CODEPTR(), FLOW_LIMITER_NODE, &this->my_graph, static_cast *>(this), static_cast *>(&decrement), static_cast *>(this) 
); } public: //! Constructor limiter_node(graph &g, size_t threshold) : graph_node(g), my_threshold(threshold), my_count(0), my_tries(0), my_future_decrement(0), my_predecessors(this), my_successors(this), decrement(this) { initialize(); } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template limiter_node(const node_set& nodes, size_t threshold) : limiter_node(nodes.graph_reference(), threshold) { make_edges_in_order(nodes, *this); } #endif //! Copy constructor limiter_node( const limiter_node& src ) : limiter_node(src.my_graph, src.my_threshold) {} //! The interface for accessing internal receiver< DecrementType > that adjusts the count receiver& decrementer() { return decrement; } //! Replace the current successor with this new successor bool register_successor( successor_type &r ) override { spin_mutex::scoped_lock lock(my_mutex); bool was_empty = my_successors.empty(); my_successors.register_successor(r); //spawn a forward task if this is the only successor if ( was_empty && !my_predecessors.empty() && my_count + my_tries < my_threshold ) { if ( is_graph_active(this->my_graph) ) { d1::small_object_allocator allocator{}; typedef forward_task_bypass> task_type; graph_task* t = allocator.new_object(my_graph, allocator, *this); spawn_in_graph_arena(graph_reference(), *t); } } return true; } //! Removes a successor from this node /** r.remove_predecessor(*this) is also called. */ bool remove_successor( successor_type &r ) override { // TODO revamp: investigate why qualification is needed for remove_predecessor() call tbb::detail::d2::remove_predecessor(r, *this); my_successors.remove_successor(r); return true; } //! Adds src to the list of cached predecessors. 
bool register_predecessor( predecessor_type &src ) override { spin_mutex::scoped_lock lock(my_mutex); my_predecessors.add( src ); if ( my_count + my_tries < my_threshold && !my_successors.empty() && is_graph_active(this->my_graph) ) { d1::small_object_allocator allocator{}; typedef forward_task_bypass> task_type; graph_task* t = allocator.new_object(my_graph, allocator, *this); spawn_in_graph_arena(graph_reference(), *t); } return true; } //! Removes src from the list of cached predecessors. bool remove_predecessor( predecessor_type &src ) override { my_predecessors.remove( src ); return true; } protected: template< typename R, typename B > friend class run_and_put_task; template friend class broadcast_cache; template friend class round_robin_cache; private: //! Puts an item to this receiver graph_task* try_put_task_impl( const T &t __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo) ) { { spin_mutex::scoped_lock lock(my_mutex); if ( my_count + my_tries >= my_threshold ) return nullptr; else ++my_tries; } graph_task* rtask = my_successors.try_put_task(t __TBB_FLOW_GRAPH_METAINFO_ARG(metainfo)); if ( !rtask ) { // try_put_task failed. 
spin_mutex::scoped_lock lock(my_mutex); --my_tries; if (check_conditions() && is_graph_active(this->my_graph)) { d1::small_object_allocator allocator{}; typedef forward_task_bypass> task_type; rtask = allocator.new_object(my_graph, allocator, *this); } } else { spin_mutex::scoped_lock lock(my_mutex); ++my_count; if ( my_future_decrement ) { if ( my_count > my_future_decrement ) { my_count -= my_future_decrement; my_future_decrement = 0; } else { my_future_decrement -= my_count; my_count = 0; } } --my_tries; } return rtask; } protected: graph_task* try_put_task(const T& t) override { return try_put_task_impl(t __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{})); } #if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT graph_task* try_put_task(const T& t, const message_metainfo& metainfo) override { return try_put_task_impl(t, metainfo); } #endif graph& graph_reference() const override { return my_graph; } void reset_node( reset_flags f ) override { my_count = 0; if ( f & rf_clear_edges ) { my_predecessors.clear(); my_successors.clear(); } else { my_predecessors.reset(); } decrement.reset_receiver(f); } }; // limiter_node #include "detail/_flow_graph_join_impl.h" template class join_node; template class join_node: public unfolded_join_node::value, reserving_port, OutputTuple, reserving> { private: static const int N = std::tuple_size::value; typedef unfolded_join_node unfolded_type; public: typedef OutputTuple output_type; typedef typename unfolded_type::input_ports_type input_ports_type; __TBB_NOINLINE_SYM explicit join_node(graph &g) : unfolded_type(g) { fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_RESERVING, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template __TBB_NOINLINE_SYM join_node(const node_set& nodes, reserving = reserving()) : join_node(nodes.graph_reference()) { make_edges_in_order(nodes, *this); } #endif __TBB_NOINLINE_SYM join_node(const join_node &other) : unfolded_type(other) 
{ fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_RESERVING, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } }; template class join_node: public unfolded_join_node::value, queueing_port, OutputTuple, queueing> { private: static const int N = std::tuple_size::value; typedef unfolded_join_node unfolded_type; public: typedef OutputTuple output_type; typedef typename unfolded_type::input_ports_type input_ports_type; __TBB_NOINLINE_SYM explicit join_node(graph &g) : unfolded_type(g) { fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_QUEUEING, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template __TBB_NOINLINE_SYM join_node(const node_set& nodes, queueing = queueing()) : join_node(nodes.graph_reference()) { make_edges_in_order(nodes, *this); } #endif __TBB_NOINLINE_SYM join_node(const join_node &other) : unfolded_type(other) { fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_QUEUEING, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } }; #if __TBB_CPP20_CONCEPTS_PRESENT // Helper function which is well-formed only if all of the elements in OutputTuple // satisfies join_node_function_object template void join_node_function_objects_helper( std::index_sequence ) requires (std::tuple_size_v == sizeof...(Functions)) && (... && join_node_function_object, K>); template concept join_node_functions = requires { join_node_function_objects_helper(std::make_index_sequence{}); }; #endif // template for key_matching join_node // tag_matching join_node is a specialization of key_matching, and is source-compatible. 
template class join_node > : public unfolded_join_node::value, key_matching_port, OutputTuple, key_matching > { private: static const int N = std::tuple_size::value; typedef unfolded_join_node > unfolded_type; public: typedef OutputTuple output_type; typedef typename unfolded_type::input_ports_type input_ports_type; #if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING join_node(graph &g) : unfolded_type(g) {} #endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ template __TBB_requires(join_node_functions) __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1) : unfolded_type(g, b0, b1) { fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } template __TBB_requires(join_node_functions) __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2) : unfolded_type(g, b0, b1, b2) { fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } template __TBB_requires(join_node_functions) __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3) : unfolded_type(g, b0, b1, b2, b3) { fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } template __TBB_requires(join_node_functions) __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4) : unfolded_type(g, b0, b1, b2, b3, b4) { fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } #if __TBB_VARIADIC_MAX >= 6 template __TBB_requires(join_node_functions) __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4, __TBB_B5 b5) : unfolded_type(g, b0, b1, b2, b3, b4, b5) { fgt_multiinput_node( CODEPTR(), 
FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } #endif #if __TBB_VARIADIC_MAX >= 7 template __TBB_requires(join_node_functions) __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4, __TBB_B5 b5, __TBB_B6 b6) : unfolded_type(g, b0, b1, b2, b3, b4, b5, b6) { fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } #endif #if __TBB_VARIADIC_MAX >= 8 template __TBB_requires(join_node_functions) __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4, __TBB_B5 b5, __TBB_B6 b6, __TBB_B7 b7) : unfolded_type(g, b0, b1, b2, b3, b4, b5, b6, b7) { fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } #endif #if __TBB_VARIADIC_MAX >= 9 template __TBB_requires(join_node_functions) __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4, __TBB_B5 b5, __TBB_B6 b6, __TBB_B7 b7, __TBB_B8 b8) : unfolded_type(g, b0, b1, b2, b3, b4, b5, b6, b7, b8) { fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } #endif #if __TBB_VARIADIC_MAX >= 10 template __TBB_requires(join_node_functions) __TBB_NOINLINE_SYM join_node(graph &g, __TBB_B0 b0, __TBB_B1 b1, __TBB_B2 b2, __TBB_B3 b3, __TBB_B4 b4, __TBB_B5 b5, __TBB_B6 b6, __TBB_B7 b7, __TBB_B8 b8, __TBB_B9 b9) : unfolded_type(g, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9) { fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } #endif #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template < #if (__clang_major__ == 3 && __clang_minor__ == 4) // clang 3.4 misdeduces 'Args...' 
for 'node_set' while it can cope with template template parameter. template class node_set, #endif typename... Args, typename... Bodies > __TBB_requires((sizeof...(Bodies) == 0) || join_node_functions) __TBB_NOINLINE_SYM join_node(const node_set& nodes, Bodies... bodies) : join_node(nodes.graph_reference(), bodies...) { make_edges_in_order(nodes, *this); } #endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET __TBB_NOINLINE_SYM join_node(const join_node &other) : unfolded_type(other) { fgt_multiinput_node( CODEPTR(), FLOW_JOIN_NODE_TAG_MATCHING, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } }; // indexer node #include "detail/_flow_graph_indexer_impl.h" // TODO: Implement interface with variadic template or tuple template class indexer_node; //indexer node specializations template class indexer_node : public unfolded_indexer_node > { private: static const int N = 1; public: typedef std::tuple InputTuple; typedef tagged_msg output_type; typedef unfolded_indexer_node unfolded_type; __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { make_edges_in_order(nodes, *this); } #endif // Copy constructor __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } }; template class indexer_node : public unfolded_indexer_node > { private: static const int N = 2; public: typedef std::tuple InputTuple; typedef tagged_msg output_type; typedef unfolded_indexer_node unfolded_type; __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, 
this->input_ports(), static_cast< sender< output_type > *>(this) ); } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { make_edges_in_order(nodes, *this); } #endif // Copy constructor __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } }; template class indexer_node : public unfolded_indexer_node > { private: static const int N = 3; public: typedef std::tuple InputTuple; typedef tagged_msg output_type; typedef unfolded_indexer_node unfolded_type; __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { make_edges_in_order(nodes, *this); } #endif // Copy constructor __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } }; template class indexer_node : public unfolded_indexer_node > { private: static const int N = 4; public: typedef std::tuple InputTuple; typedef tagged_msg output_type; typedef unfolded_indexer_node unfolded_type; __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { make_edges_in_order(nodes, *this); } #endif // Copy constructor __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : 
unfolded_type(other) { fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } }; template class indexer_node : public unfolded_indexer_node > { private: static const int N = 5; public: typedef std::tuple InputTuple; typedef tagged_msg output_type; typedef unfolded_indexer_node unfolded_type; __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { make_edges_in_order(nodes, *this); } #endif // Copy constructor __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } }; #if __TBB_VARIADIC_MAX >= 6 template class indexer_node : public unfolded_indexer_node > { private: static const int N = 6; public: typedef std::tuple InputTuple; typedef tagged_msg output_type; typedef unfolded_indexer_node unfolded_type; __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { make_edges_in_order(nodes, *this); } #endif // Copy constructor __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } }; #endif //variadic max 6 #if __TBB_VARIADIC_MAX >= 7 template class indexer_node : public unfolded_indexer_node > { private: static const int N = 7; 
public: typedef std::tuple InputTuple; typedef tagged_msg output_type; typedef unfolded_indexer_node unfolded_type; __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { make_edges_in_order(nodes, *this); } #endif // Copy constructor __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } }; #endif //variadic max 7 #if __TBB_VARIADIC_MAX >= 8 template class indexer_node : public unfolded_indexer_node > { private: static const int N = 8; public: typedef std::tuple InputTuple; typedef tagged_msg output_type; typedef unfolded_indexer_node unfolded_type; indexer_node(graph& g) : unfolded_type(g) { fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { make_edges_in_order(nodes, *this); } #endif // Copy constructor indexer_node( const indexer_node& other ) : unfolded_type(other) { fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } }; #endif //variadic max 8 #if __TBB_VARIADIC_MAX >= 9 template class indexer_node : public unfolded_indexer_node > { private: static const int N = 9; public: typedef std::tuple InputTuple; typedef tagged_msg output_type; typedef unfolded_indexer_node unfolded_type; __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, this->input_ports(), 
static_cast< sender< output_type > *>(this) ); } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { make_edges_in_order(nodes, *this); } #endif // Copy constructor __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } }; #endif //variadic max 9 #if __TBB_VARIADIC_MAX >= 10 template class indexer_node/*default*/ : public unfolded_indexer_node > { private: static const int N = 10; public: typedef std::tuple InputTuple; typedef tagged_msg output_type; typedef unfolded_indexer_node unfolded_type; __TBB_NOINLINE_SYM indexer_node(graph& g) : unfolded_type(g) { fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template indexer_node(const node_set& nodes) : indexer_node(nodes.graph_reference()) { make_edges_in_order(nodes, *this); } #endif // Copy constructor __TBB_NOINLINE_SYM indexer_node( const indexer_node& other ) : unfolded_type(other) { fgt_multiinput_node( CODEPTR(), FLOW_INDEXER_NODE, &this->my_graph, this->input_ports(), static_cast< sender< output_type > *>(this) ); } }; #endif //variadic max 10 template< typename T > inline void internal_make_edge( sender &p, receiver &s ) { register_successor(p, s); fgt_make_edge( &p, &s ); } //! Makes an edge between a single predecessor and a single successor template< typename T > inline void make_edge( sender &p, receiver &s ) { internal_make_edge( p, s ); } //Makes an edge from port 0 of a multi-output predecessor to port 0 of a multi-input successor. 
template< typename T, typename V, typename = typename T::output_ports_type, typename = typename V::input_ports_type > inline void make_edge( T& output, V& input) { make_edge(std::get<0>(output.output_ports()), std::get<0>(input.input_ports())); } //Makes an edge from port 0 of a multi-output predecessor to a receiver. template< typename T, typename R, typename = typename T::output_ports_type > inline void make_edge( T& output, receiver& input) { make_edge(std::get<0>(output.output_ports()), input); } //Makes an edge from a sender to port 0 of a multi-input successor. template< typename S, typename V, typename = typename V::input_ports_type > inline void make_edge( sender& output, V& input) { make_edge(output, std::get<0>(input.input_ports())); } template< typename T > inline void internal_remove_edge( sender &p, receiver &s ) { remove_successor( p, s ); fgt_remove_edge( &p, &s ); } //! Removes an edge between a single predecessor and a single successor template< typename T > inline void remove_edge( sender &p, receiver &s ) { internal_remove_edge( p, s ); } //Removes an edge between port 0 of a multi-output predecessor and port 0 of a multi-input successor. template< typename T, typename V, typename = typename T::output_ports_type, typename = typename V::input_ports_type > inline void remove_edge( T& output, V& input) { remove_edge(std::get<0>(output.output_ports()), std::get<0>(input.input_ports())); } //Removes an edge between port 0 of a multi-output predecessor and a receiver. template< typename T, typename R, typename = typename T::output_ports_type > inline void remove_edge( T& output, receiver& input) { remove_edge(std::get<0>(output.output_ports()), input); } //Removes an edge between a sender and port 0 of a multi-input successor. template< typename S, typename V, typename = typename V::input_ports_type > inline void remove_edge( sender& output, V& input) { remove_edge(output, std::get<0>(input.input_ports())); } //! 
Returns a copy of the body from a function or continue node template< typename Body, typename Node > Body copy_body( Node &n ) { return n.template copy_function_object(); } //composite_node template< typename InputTuple, typename OutputTuple > class composite_node; template< typename... InputTypes, typename... OutputTypes> class composite_node , std::tuple > : public graph_node { public: typedef std::tuple< receiver&... > input_ports_type; typedef std::tuple< sender&... > output_ports_type; private: std::unique_ptr my_input_ports; std::unique_ptr my_output_ports; static const size_t NUM_INPUTS = sizeof...(InputTypes); static const size_t NUM_OUTPUTS = sizeof...(OutputTypes); protected: void reset_node(reset_flags) override {} public: composite_node( graph &g ) : graph_node(g) { fgt_multiinput_multioutput_node( CODEPTR(), FLOW_COMPOSITE_NODE, this, &this->my_graph ); } template void set_external_ports(T1&& input_ports_tuple, T2&& output_ports_tuple) { static_assert(NUM_INPUTS == std::tuple_size::value, "number of arguments does not match number of input ports"); static_assert(NUM_OUTPUTS == std::tuple_size::value, "number of arguments does not match number of output ports"); fgt_internal_input_alias_helper::alias_port( this, input_ports_tuple); fgt_internal_output_alias_helper::alias_port( this, output_ports_tuple); my_input_ports.reset( new input_ports_type(std::forward(input_ports_tuple)) ); my_output_ports.reset( new output_ports_type(std::forward(output_ports_tuple)) ); } template< typename... NodeTypes > void add_visible_nodes(const NodeTypes&... n) { add_nodes_impl(this, true, n...); } template< typename... NodeTypes > void add_nodes(const NodeTypes&... 
n) { add_nodes_impl(this, false, n...); } input_ports_type& input_ports() { __TBB_ASSERT(my_input_ports, "input ports not set, call set_external_ports to set input ports"); return *my_input_ports; } output_ports_type& output_ports() { __TBB_ASSERT(my_output_ports, "output ports not set, call set_external_ports to set output ports"); return *my_output_ports; } }; // class composite_node //composite_node with only input ports template< typename... InputTypes> class composite_node , std::tuple<> > : public graph_node { public: typedef std::tuple< receiver&... > input_ports_type; private: std::unique_ptr my_input_ports; static const size_t NUM_INPUTS = sizeof...(InputTypes); protected: void reset_node(reset_flags) override {} public: composite_node( graph &g ) : graph_node(g) { fgt_composite( CODEPTR(), this, &g ); } template void set_external_ports(T&& input_ports_tuple) { static_assert(NUM_INPUTS == std::tuple_size::value, "number of arguments does not match number of input ports"); fgt_internal_input_alias_helper::alias_port( this, input_ports_tuple); my_input_ports.reset( new input_ports_type(std::forward(input_ports_tuple)) ); } template< typename... NodeTypes > void add_visible_nodes(const NodeTypes&... n) { add_nodes_impl(this, true, n...); } template< typename... NodeTypes > void add_nodes( const NodeTypes&... n) { add_nodes_impl(this, false, n...); } input_ports_type& input_ports() { __TBB_ASSERT(my_input_ports, "input ports not set, call set_external_ports to set input ports"); return *my_input_ports; } }; // class composite_node //composite_nodes with only output_ports template class composite_node , std::tuple > : public graph_node { public: typedef std::tuple< sender&... 
> output_ports_type; private: std::unique_ptr my_output_ports; static const size_t NUM_OUTPUTS = sizeof...(OutputTypes); protected: void reset_node(reset_flags) override {} public: __TBB_NOINLINE_SYM composite_node( graph &g ) : graph_node(g) { fgt_composite( CODEPTR(), this, &g ); } template void set_external_ports(T&& output_ports_tuple) { static_assert(NUM_OUTPUTS == std::tuple_size::value, "number of arguments does not match number of output ports"); fgt_internal_output_alias_helper::alias_port( this, output_ports_tuple); my_output_ports.reset( new output_ports_type(std::forward(output_ports_tuple)) ); } template void add_visible_nodes(const NodeTypes&... n) { add_nodes_impl(this, true, n...); } template void add_nodes(const NodeTypes&... n) { add_nodes_impl(this, false, n...); } output_ports_type& output_ports() { __TBB_ASSERT(my_output_ports, "output ports not set, call set_external_ports to set output ports"); return *my_output_ports; } }; // class composite_node template class async_body_base: no_assign { public: typedef Gateway gateway_type; async_body_base(gateway_type *gateway): my_gateway(gateway) { } void set_gateway(gateway_type *gateway) { my_gateway = gateway; } protected: gateway_type *my_gateway; }; template class async_body: public async_body_base { private: Body my_body; public: typedef async_body_base base_type; typedef Gateway gateway_type; async_body(const Body &body, gateway_type *gateway) : base_type(gateway), my_body(body) { } void operator()( const Input &v, Ports & ) noexcept(noexcept(tbb::detail::invoke(my_body, v, std::declval()))) { tbb::detail::invoke(my_body, v, *this->my_gateway); } Body get_body() { return my_body; } }; //! 
//! Implements async node
// A multifunction_node with a single Output port whose user body is wrapped in an
// async_body. The wrapper is constructed with the address of this node's gateway
// (my_gateway), through which messages are later submitted to the output port.
template < typename Input, typename Output, typename Policy = queueing_lightweight >
    __TBB_requires(std::default_initializable && std::copy_constructible)
class async_node : public multifunction_node< Input, std::tuple< Output >, Policy >, public sender< Output > {
    typedef multifunction_node< Input, std::tuple< Output >, Policy > base_type;
    typedef multifunction_input<
        Input, typename base_type::output_ports_type, Policy, cache_aligned_allocator> mfn_input_type;

public:
    typedef Input input_type;
    typedef Output output_type;
    typedef receiver receiver_type;
    typedef receiver successor_type;
    typedef sender predecessor_type;
    typedef receiver_gateway gateway_type;
    typedef async_body_base async_body_base_type;
    typedef typename base_type::output_ports_type output_ports_type;

private:
    // Gateway implementation bound to one async_node; every operation is forwarded
    // to that node or to the graph the node belongs to.
    class receiver_gateway_impl: public receiver_gateway {
    public:
        receiver_gateway_impl(async_node* node): my_node(node) {}

        // Emits the profiling notification, then reserves a wait on the owning graph.
        void reserve_wait() override {
            fgt_async_reserve(static_cast(my_node), &my_node->my_graph);
            my_node->my_graph.reserve_wait();
        }

        // Releases the wait on the owning graph, then emits the profiling notification.
        // The node and graph pointers are copied into locals before release_wait() is
        // called. NOTE(review): presumably so that no member of the gateway is read
        // after the release - confirm.
        void release_wait() override {
            async_node* n = my_node;
            graph* g = &n->my_graph;
            g->release_wait();
            fgt_async_commit(static_cast(n), g);
        }

        //! Implements gateway_type::try_put for an external activity to submit a message to FG
        bool try_put(const Output &i) override {
            return my_node->try_put_impl(i);
        }

    private:
        async_node* my_node;
    } my_gateway;

    //The substitute of 'this' for member construction, to prevent compiler warnings
    async_node* self() { return this; }

    //! Implements gateway_type::try_put for an external activity to submit a message to FG
    // Offers the message to the successors of output port 0, enqueues every task that
    // was collected in the graph arena, and returns true when at least one successor
    // accepted the message.
    bool try_put_impl(const Output &i) {
        multifunction_output &port_0 = output_port<0>(*this);
        broadcast_cache& port_successors = port_0.successors();
        fgt_async_try_put_begin(this, &port_0);
        // TODO revamp: change to std::list
        graph_task_list tasks;
        bool is_at_least_one_put_successful = port_successors.gather_successful_try_puts(i, tasks);
        __TBB_ASSERT( is_at_least_one_put_successful || tasks.empty(),
                      "Return status is inconsistent with the method operation." );

        while( !tasks.empty() ) {
            enqueue_in_graph_arena(this->my_graph, tasks.pop_front());
        }
        fgt_async_try_put_end(this, &port_0);
        return is_at_least_one_put_successful;
    }

public:
    // Main constructor: the user body is wrapped together with &my_gateway before it
    // is handed to the base class. The Policy argument is an unnamed tag.
    template
        __TBB_requires(async_node_body)
    __TBB_NOINLINE_SYM async_node(
        graph &g, size_t concurrency,
        Body body, Policy = Policy(), node_priority_t a_priority = no_priority
    ) : base_type(
        g, concurrency,
        async_body
        (body, &my_gateway), a_priority ), my_gateway(self()) {
        fgt_multioutput_node_with_body<1>(
            CODEPTR(), FLOW_ASYNC_NODE,
            &this->my_graph, static_cast *>(this),
            this->output_ports(), this->my_body
        );
    }

    // Same as above without the Policy tag; delegates with a default-constructed Policy.
    template
        __TBB_requires(async_node_body)
    __TBB_NOINLINE_SYM async_node(graph& g, size_t concurrency, Body body, node_priority_t a_priority)
        : async_node(g, concurrency, body, Policy(), a_priority) {}

#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
    // Node-set constructors: build the node in the graph the set refers to, then
    // create the edges between the set and this node.
    template
        __TBB_requires(async_node_body)
    __TBB_NOINLINE_SYM async_node(
        const node_set& nodes, size_t concurrency, Body body,
        Policy = Policy(), node_priority_t a_priority = no_priority )
        : async_node(nodes.graph_reference(), concurrency, body, a_priority) {
        make_edges_in_order(nodes, *this);
    }

    template
        __TBB_requires(async_node_body)
    __TBB_NOINLINE_SYM async_node(const node_set& nodes, size_t concurrency, Body body, node_priority_t a_priority)
        : async_node(nodes, concurrency, body, Policy(), a_priority) {}
#endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET

    // Copy constructor: the copy gets its own gateway bound to itself, and both copied
    // body objects (my_body and my_init_body) are re-pointed to that gateway.
    __TBB_NOINLINE_SYM async_node( const async_node &other ) : base_type(other), sender(), my_gateway(self()) {
        static_cast(this->my_body->get_body_ptr())->set_gateway(&my_gateway);
        static_cast(this->my_init_body->get_body_ptr())->set_gateway(&my_gateway);

        fgt_multioutput_node_with_body<1>( CODEPTR(), FLOW_ASYNC_NODE,
                &this->my_graph, static_cast *>(this),
                this->output_ports(), this->my_body );
    }

    // Returns the gateway owned by this node.
    gateway_type& gateway() {
        return my_gateway;
    }

    // Define sender< Output >

    //! Add a new successor to this node
    // Always asserts and returns false: successors are attached through the output port.
    bool register_successor(successor_type&) override {
        __TBB_ASSERT(false, "Successors must be registered only via ports");
        return false;
    }

    //! Removes a successor from this node
    // Always asserts and returns false: successors are detached through the output port.
    bool remove_successor(successor_type&) override {
        __TBB_ASSERT(false, "Successors must be removed only via ports");
        return false;
    }

    // Returns a copy of the user body stored inside the async_body wrapper.
    template
    Body copy_function_object() {
        typedef multifunction_body mfn_body_type;
        typedef async_body async_body_type;
        mfn_body_type &body_ref = *this->my_body;
        async_body_type ab = *static_cast(dynamic_cast< multifunction_body_leaf & >(body_ref).get_body_ptr());
        return ab.get_body();
    }

protected:

    // Forwards the reset request to the multifunction_node base.
    void reset_node( reset_flags f) override {
       base_type::reset_node(f);
    }
};

#include "detail/_flow_graph_node_set_impl.h"

// Node that stores one value of type T in my_buffer, guarded by my_mutex, and acts as
// both receiver and sender of T. The buffer starts out invalid.
template< typename T >
class overwrite_node : public graph_node, public receiver, public sender {
public:
    typedef T input_type;
    typedef T output_type;
    typedef typename receiver::predecessor_type predecessor_type;
    typedef typename sender::successor_type successor_type;

    __TBB_NOINLINE_SYM explicit overwrite_node(graph &g)
        : graph_node(g), my_successors(this), my_buffer_is_valid(false)
    {
        fgt_node( CODEPTR(), FLOW_OVERWRITE_NODE, &this->my_graph,
                  static_cast *>(this), static_cast *>(this) );
    }

#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
    // Node-set constructor: builds the node in the set's graph, then creates the edges.
    template
    overwrite_node(const node_set& nodes) : overwrite_node(nodes.graph_reference()) {
        make_edges_in_order(nodes, *this);
    }
#endif

    //! 
//! Copy constructor; doesn't take anything from src; default won't work
    // Only the graph of src is reused; the new node starts with an invalid buffer.
    __TBB_NOINLINE_SYM overwrite_node( const overwrite_node& src ) : overwrite_node(src.my_graph) {}

    ~overwrite_node() {}

    // Registers s as a successor. When a valid value is buffered and the graph is
    // active, the value is offered to s first; always returns true.
    bool register_successor( successor_type &s ) override {
        spin_mutex::scoped_lock l( my_mutex );
        if (my_buffer_is_valid && is_graph_active( my_graph )) {
            // We have a valid value that must be forwarded immediately.
            bool ret = s.try_put( my_buffer );
            if ( ret ) {
                // We add the successor that accepted our put
                my_successors.register_successor( s );
            } else {
                // In case of reservation a race between the moment of reservation and register_successor can appear,
                // because failed reserve does not mean that register_successor is not ready to put a message immediately.
                // We have some sort of infinite loop: reserving node tries to set pull state for the edge,
                // but overwrite_node tries to return push state back. That is why we have to break this loop with task creation.
                d1::small_object_allocator allocator{};
                typedef register_predecessor_task task_type;
                graph_task* t = allocator.new_object(graph_reference(), allocator, *this, s);
                spawn_in_graph_arena( my_graph, *t );
            }
        } else {
            // No valid value yet, just add as successor
            my_successors.register_successor( s );
        }
        return true;
    }

    // Removes s from the successor cache under the node mutex; always returns true.
    bool remove_successor( successor_type &s ) override {
        spin_mutex::scoped_lock l( my_mutex );
        my_successors.remove_successor(s);
        return true;
    }

    // Copies the buffered value into v and returns true when the buffer is valid;
    // returns false and leaves v untouched otherwise. The buffer is not invalidated.
    bool try_get( input_type &v ) override {
        spin_mutex::scoped_lock l( my_mutex );
        if ( my_buffer_is_valid ) {
            v = my_buffer;
            return true;
        }
        return false;
    }

#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
    // Same as try_get(v), additionally copying the buffered metainfo to the caller.
    bool try_get( input_type &v, message_metainfo& metainfo ) override {
        spin_mutex::scoped_lock l( my_mutex );
        if (my_buffer_is_valid) {
            v = my_buffer;
            metainfo = my_buffered_metainfo;

            // Since the successor of the node will use move semantics while wrapping the metainfo
            // that is designed to transfer the ownership of the value from single-push buffer to the task
            // It is required to reserve one more reference here because the value keeps in the buffer
            // and the ownership is not transferred
            for (auto msg_waiter : metainfo.waiters()) {
                msg_waiter->reserve(1);
            }
            return true;
        }
        return false;
    }
#endif

    //! Reserves an item
    // Implemented as a plain try_get(); nothing is actually locked or removed.
    bool try_reserve( T &v ) override {
        return try_get(v);
    }

#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
private:
    // Metainfo-aware reservation: copies value and metainfo. Unlike the metainfo
    // overload of try_get(), no extra reference is reserved on the waiters here.
    bool try_reserve(T& v, message_metainfo& metainfo) override {
        spin_mutex::scoped_lock l( my_mutex );
        if (my_buffer_is_valid) {
            v = my_buffer;
            metainfo = my_buffered_metainfo;
            return true;
        }
        return false;
    }
public:
#endif

    //! Releases the reserved item
    // No-op that always reports success.
    bool try_release() override { return true; }

    //! Consumes the reserved item
    // No-op that always reports success; the buffered value is kept.
    bool try_consume() override { return true; }

    // Returns whether the buffer currently holds a valid value.
    bool is_valid() {
       spin_mutex::scoped_lock l( my_mutex );
       return my_buffer_is_valid;
    }

    // Marks the buffer invalid. With the try_put_and_wait preview enabled, also
    // releases one reference on every stored waiter and resets the stored metainfo.
    void clear() {
       spin_mutex::scoped_lock l( my_mutex );
       my_buffer_is_valid = false;
#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
       for (auto msg_waiter : my_buffered_metainfo.waiters()) {
           msg_waiter->release(1);
       }
       my_buffered_metainfo = message_metainfo{};
#endif
    }

protected:

    template< typename R, typename B > friend class run_and_put_task;
    template friend class broadcast_cache;
    template friend class round_robin_cache;

    // Takes the node mutex and stores v through try_put_task_impl().
    graph_task* try_put_task( const input_type &v ) override {
        spin_mutex::scoped_lock l( my_mutex );
        return try_put_task_impl(v __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{}));
    }

#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
    graph_task* try_put_task(const input_type& v, const message_metainfo& metainfo) override {
        spin_mutex::scoped_lock l( my_mutex );
        return try_put_task_impl(v, metainfo);
    }
#endif

    // Overwrites the buffer with v, marks it valid and forwards v to the successors.
    // Does not lock: both try_put_task() overloads above take my_mutex before calling.
    // Returns the successors' task, or SUCCESSFULLY_ENQUEUED when they produced none.
    graph_task * try_put_task_impl(const input_type &v __TBB_FLOW_GRAPH_METAINFO_ARG(const message_metainfo& metainfo)) {
        my_buffer = v;
        my_buffer_is_valid = true;
#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
        // Since the new item is pushed to the buffer - reserving the waiters
        for (auto msg_waiter : metainfo.waiters()) {
            msg_waiter->reserve(1);
        }

        // Since the item is taken out from the buffer - releasing the stored waiters
        for (auto msg_waiter : my_buffered_metainfo.waiters()) {
            msg_waiter->release(1);
        }

        my_buffered_metainfo = metainfo;
#endif
        graph_task* rtask = my_successors.try_put_task(v __TBB_FLOW_GRAPH_METAINFO_ARG(my_buffered_metainfo) );
        if (!rtask) rtask = SUCCESSFULLY_ENQUEUED;
        return rtask;
    }

    graph& graph_reference() const override {
        return my_graph;
    }

    //! Breaks an infinite loop between the node reservation and register_successor call
    // Task that first tries to register the owner as a predecessor of the successor and,
    // if that is refused, registers the successor on the owner instead.
    struct register_predecessor_task : public graph_task {
        register_predecessor_task(
            graph& g, d1::small_object_allocator& allocator, predecessor_type& owner, successor_type& succ)
            : graph_task(g, allocator), o(owner), s(succ) {};

        d1::task* execute(d1::execution_data& ed) override {
            // TODO revamp: investigate why qualification is needed for register_successor() call
            using tbb::detail::d2::register_predecessor;
            using tbb::detail::d2::register_successor;
            if ( !register_predecessor(s, o) ) {
                register_successor(o, s);
            }
            finalize(ed);
            return nullptr;
        }

        // Cancellation only finalizes the task; no registration is attempted.
        d1::task* cancel(d1::execution_data& ed) override {
            finalize(ed);
            return nullptr;
        }

        predecessor_type& o;
        successor_type& s;
    };

    spin_mutex my_mutex;                                       // guards the members below
    broadcast_cache< input_type, null_rw_mutex > my_successors;
    input_type my_buffer;                                      // last value written
#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
    message_metainfo my_buffered_metainfo;                     // metainfo stored with my_buffer
#endif
    bool my_buffer_is_valid;                                   // true once a value has been stored

    // Invalidates the buffer; additionally drops all successors when rf_clear_edges is set.
    // NOTE(review): unlike clear(), the stored metainfo waiters are not released here - confirm intended.
    void reset_node( reset_flags f) override {
        my_buffer_is_valid = false;
       if (f&rf_clear_edges) {
           my_successors.clear();
       }
    }
};  // overwrite_node

// overwrite_node variant whose try_put_task() rejects a new value while the buffer is
// already valid (see the overrides further below in the class).
template< typename T >
class write_once_node : public overwrite_node {
public:
    typedef T input_type;
    typedef T output_type;
    typedef overwrite_node base_type;
    typedef typename receiver::predecessor_type predecessor_type;
    typedef typename sender::successor_type successor_type;

    //! 
//! Constructor
    __TBB_NOINLINE_SYM explicit write_once_node(graph& g) : base_type(g) {
        fgt_node( CODEPTR(), FLOW_WRITE_ONCE_NODE, &(this->my_graph),
                                 static_cast *>(this),
                                 static_cast *>(this) );
    }

#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
    // Node-set constructor: builds the node in the set's graph, then creates the edges.
    template
    write_once_node(const node_set& nodes) : write_once_node(nodes.graph_reference()) {
        make_edges_in_order(nodes, *this);
    }
#endif

    //! Copy constructor: call base class copy constructor
    __TBB_NOINLINE_SYM write_once_node( const write_once_node& src ) : base_type(src) {
        fgt_node( CODEPTR(), FLOW_WRITE_ONCE_NODE, &(this->my_graph),
                                 static_cast *>(this),
                                 static_cast *>(this) );
    }

protected:
    template< typename R, typename B > friend class run_and_put_task;
    template friend class broadcast_cache;
    template friend class round_robin_cache;

    // Stores v only when no valid value is buffered yet; otherwise returns nullptr
    // without touching the buffer. The check and the store happen under my_mutex.
    graph_task *try_put_task( const T &v ) override {
        spin_mutex::scoped_lock l( this->my_mutex );
        return this->my_buffer_is_valid ? nullptr : this->try_put_task_impl(v __TBB_FLOW_GRAPH_METAINFO_ARG(message_metainfo{}));
    }

#if __TBB_PREVIEW_FLOW_GRAPH_TRY_PUT_AND_WAIT
    // Metainfo-aware variant of the write-once put above.
    graph_task* try_put_task(const T& v, const message_metainfo& metainfo) override {
        spin_mutex::scoped_lock l( this->my_mutex );
        return this->my_buffer_is_valid ? nullptr : this->try_put_task_impl(v, metainfo);
    }
#endif
}; // write_once_node

// set_name overloads: each one forwards the address of the graph or node and the given
// name to the matching fgt_*_desc profiling hook. Plain nodes use fgt_node_desc,
// multifunction_node / split_node / async_node use fgt_multioutput_node_desc, and
// composite_node uses fgt_multiinput_multioutput_node_desc.
inline void set_name(const graph& g, const char *name) {
    fgt_graph_desc(&g, name);
}

template
inline void set_name(const input_node& node, const char *name) {
    fgt_node_desc(&node, name);
}

template
inline void set_name(const function_node& node, const char *name) {
    fgt_node_desc(&node, name);
}

template
inline void set_name(const continue_node& node, const char *name) {
    fgt_node_desc(&node, name);
}

template
inline void set_name(const broadcast_node& node, const char *name) {
    fgt_node_desc(&node, name);
}

template
inline void set_name(const buffer_node& node, const char *name) {
    fgt_node_desc(&node, name);
}

template
inline void set_name(const queue_node& node, const char *name) {
    fgt_node_desc(&node, name);
}

template
inline void set_name(const sequencer_node& node, const char *name) {
    fgt_node_desc(&node, name);
}

template
inline void set_name(const priority_queue_node& node, const char *name) {
    fgt_node_desc(&node, name);
}

template
inline void set_name(const limiter_node& node, const char *name) {
    fgt_node_desc(&node, name);
}

template
inline void set_name(const join_node& node, const char *name) {
    fgt_node_desc(&node, name);
}

template
inline void set_name(const indexer_node& node, const char *name) {
    fgt_node_desc(&node, name);
}

template
inline void set_name(const overwrite_node& node, const char *name) {
    fgt_node_desc(&node, name);
}

template
inline void set_name(const write_once_node& node, const char *name) {
    fgt_node_desc(&node, name);
}

template
inline void set_name(const multifunction_node& node, const char *name) {
    fgt_multioutput_node_desc(&node, name);
}

template
inline void set_name(const split_node& node, const char *name) {
    fgt_multioutput_node_desc(&node, name);
}

template< typename InputTuple, typename OutputTuple >
inline void set_name(const composite_node& node, const char *name) {
    fgt_multiinput_multioutput_node_desc(&node, name);
}

template
inline void set_name(const async_node& node, const char *name)
{
    fgt_multioutput_node_desc(&node, name);
}
} // d2
} // detail
} // tbb

// Include deduction guides for node classes
#include "detail/_flow_graph_nodes_deduction.h"

// Public names: everything below re-exports detail::d2 entities into tbb::flow (v1).
namespace tbb {
namespace flow {
inline namespace v1 {
    using detail::d2::receiver;
    using detail::d2::sender;

    using detail::d2::serial;
    using detail::d2::unlimited;

    using detail::d2::reset_flags;
    using detail::d2::rf_reset_protocol;
    using detail::d2::rf_reset_bodies;
    using detail::d2::rf_clear_edges;

    using detail::d2::graph;
    using detail::d2::graph_node;
    using detail::d2::continue_msg;

    using detail::d2::input_node;
    using detail::d2::function_node;
    using detail::d2::multifunction_node;
    using detail::d2::split_node;
    using detail::d2::output_port;
    using detail::d2::indexer_node;
    using detail::d2::tagged_msg;
    using detail::d2::cast_to;
    using detail::d2::is_a;
    using detail::d2::continue_node;
    using detail::d2::overwrite_node;
    using detail::d2::write_once_node;
    using detail::d2::broadcast_node;
    using detail::d2::buffer_node;
    using detail::d2::queue_node;
    using detail::d2::sequencer_node;
    using detail::d2::priority_queue_node;
    using detail::d2::limiter_node;
    using namespace detail::d2::graph_policy_namespace;
    using detail::d2::join_node;
    using detail::d2::input_port;
    using detail::d2::copy_body;
    using detail::d2::make_edge;
    using detail::d2::remove_edge;
    using detail::d2::tag_value;
    using detail::d2::composite_node;
    using detail::d2::async_node;
    using detail::d2::node_priority_t;
    using detail::d2::no_priority;

#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
    using detail::d2::follows;
    using detail::d2::precedes;
    using detail::d2::make_node_set;
    using detail::d2::make_edges;
#endif

} // v1
} // flow

    using detail::d1::flow_control;

// set_name is exposed through tbb::profiling rather than tbb::flow.
namespace profiling {
    using detail::d2::set_name;
} // profiling

} // tbb


#if TBB_USE_PROFILING_TOOLS  && ( __unix__ || __APPLE__ )
   // We don't do pragma pop here, since it still gives warning on the USER side
   #undef __TBB_NOINLINE_SYM
#endif

#endif // __TBB_flow_graph_H

================================================
FILE: third-party/tbb/include/oneapi/tbb/flow_graph_abstractions.h
================================================
/*
    Copyright (c) 2005-2024 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef __TBB_flow_graph_abstractions_H
#define __TBB_flow_graph_abstractions_H

namespace tbb {
namespace detail {
namespace d2 {

//! Abstract interfaces for asynchronous communication with a flow graph.
//! graph_proxy is a non-template interface with two pure virtual operations that let
//! an external activity hold a graph open (reserve_wait) and let it go (release_wait).
class graph_proxy {
public:
    //! Inform a graph that messages may come from outside, to prevent premature graph completion
    virtual void reserve_wait() = 0;

    //! Inform a graph that a previous call to reserve_wait is no longer in effect
    virtual void release_wait() = 0;

    // Virtual so that implementations can be destroyed through a graph_proxy pointer.
    virtual ~graph_proxy() {}
};

//! Extends graph_proxy with a pure virtual try_put() for submitting a value of
//! input_type into the flow graph from an asynchronous activity.
template
class receiver_gateway : public graph_proxy {
public:
    //! Type of data submitted into the flow graph.
    typedef Input input_type;

    //! Submit signal from an asynchronous activity to FG.
    //! The bool result is defined by the implementation.
    virtual bool try_put(const input_type&) = 0;
};

} // d2


} // detail
} // tbb
#endif


================================================
FILE: third-party/tbb/include/oneapi/tbb/global_control.h
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef __TBB_global_control_H
#define __TBB_global_control_H

#include "detail/_config.h"

#include "detail/_assert.h"
#include "detail/_attach.h"
#include "detail/_exception.h"
#include "detail/_namespace_injection.h"
#include "detail/_template_helpers.h"

#include
#include // std::nothrow_t

namespace tbb {
namespace detail {

namespace d1 {
class global_control;
class task_scheduler_handle;
}

// Entry points exported by the library binary (TBB_EXPORT) plus the internal types and
// helpers that are befriended by the d1 classes below.
namespace r1 {
TBB_EXPORT void __TBB_EXPORTED_FUNC create(d1::global_control&);
TBB_EXPORT void __TBB_EXPORTED_FUNC destroy(d1::global_control&);
TBB_EXPORT std::size_t __TBB_EXPORTED_FUNC global_control_active_value(int);
struct global_control_impl;
struct control_storage_comparator;
void release_impl(d1::task_scheduler_handle& handle);
bool finalize_impl(d1::task_scheduler_handle& handle);
TBB_EXPORT void __TBB_EXPORTED_FUNC get(d1::task_scheduler_handle&);
TBB_EXPORT bool __TBB_EXPORTED_FUNC finalize(d1::task_scheduler_handle&, std::intptr_t mode);
}

namespace d1 {

// Scoped setting of one library-wide parameter: the constructor registers the value
// through r1::create() and the destructor withdraws it through r1::destroy().
class global_control {
public:
    enum parameter {
        max_allowed_parallelism,
        thread_stack_size,
        terminate_on_exception,
        scheduler_handle, // not a public parameter
        parameter_max // insert new parameters above this point
    };

    // Registers `value` for parameter `p`.
    // A max_allowed_parallelism of 0 fails a release-mode assertion.
    global_control(parameter p, std::size_t value) :
        my_value(value), my_reserved(), my_param(p) {
        suppress_unused_warning(my_reserved);
        __TBB_ASSERT(my_param < parameter_max, "Invalid parameter");
#if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00)
        // For Windows 8 Store* apps it's impossible to set stack size
        // (the object is then left unregistered; the destructor mirrors this early return)
        if (p==thread_stack_size)
            return;
#elif __TBB_x86_64 && (_WIN32 || _WIN64)
        // On 64-bit Windows the stack size must fit into an unsigned int.
        if (p==thread_stack_size)
            __TBB_ASSERT_RELEASE((unsigned)value == value, "Stack size is limited to unsigned int range");
#endif
        if (my_param==max_allowed_parallelism)
            __TBB_ASSERT_RELEASE(my_value>0, "max_allowed_parallelism cannot be 0.");
        r1::create(*this);
    }

    // Withdraws the setting, except in the Windows 8 Store case where the constructor
    // never registered it.
    ~global_control() {
        __TBB_ASSERT(my_param < parameter_max, "Invalid parameter");
#if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00)
        // For Windows 8 Store* apps it's impossible to set stack size
        if (my_param==thread_stack_size)
            return;
#endif
        r1::destroy(*this);
    }

    // Returns the value currently in effect for parameter `p`, as reported by the library.
    static std::size_t active_value(parameter p) {
        __TBB_ASSERT(p < parameter_max, "Invalid parameter");
        return r1::global_control_active_value((int)p);
    }

private:
    std::size_t   my_value;
    std::intptr_t my_reserved; // TODO: substitution of global_control* not to break backward compatibility
    parameter     my_param;

    friend struct r1::global_control_impl;
    friend struct r1::control_storage_comparator;
};

//! Finalization options.
//! Outside of the class to avoid extensive friendship.
//! These values are passed as the `mode` argument of r1::finalize().
static constexpr std::intptr_t release_nothrowing = 0;
static constexpr std::intptr_t finalize_nothrowing = 1;
static constexpr std::intptr_t finalize_throwing = 2;

//! User side wrapper for a task scheduler lifetime control object
class task_scheduler_handle {
public:
    //! Creates an empty task_scheduler_handle
    task_scheduler_handle() = default;

    //! Creates an attached instance of task_scheduler_handle
    task_scheduler_handle(attach) {
        r1::get(*this);
    }

    //! Release a reference if any
    ~task_scheduler_handle() {
        release();
    }

    //! No copy
    task_scheduler_handle(const task_scheduler_handle& other) = delete;
    task_scheduler_handle& operator=(const task_scheduler_handle& other) = delete;

    //! Move only
    // Both move operations swap the control pointers. After move construction `other`
    // is empty; after move assignment `other` holds whatever this handle held before.
    task_scheduler_handle(task_scheduler_handle&& other) noexcept {
        std::swap(m_ctl, other.m_ctl);
    }
    // NOTE(review): the ';' after the closing brace below is a redundant empty declaration.
    task_scheduler_handle& operator=(task_scheduler_handle&& other) noexcept {
        std::swap(m_ctl, other.m_ctl);
        return *this;
    };

    //! Returns true when the handle is attached (non-empty), false when it is empty
    explicit operator bool() const noexcept {
        return m_ctl != nullptr;
    }

    //! Release the reference and deactivate handle
    // No-op on an empty handle. Uses the release_nothrowing mode and ignores the result.
    void release() {
        if (m_ctl != nullptr) {
            r1::finalize(*this, release_nothrowing);
            m_ctl = nullptr;
        }
    }

private:
    friend void r1::release_impl(task_scheduler_handle& handle);
    friend bool r1::finalize_impl(task_scheduler_handle& handle);
    friend void __TBB_EXPORTED_FUNC r1::get(task_scheduler_handle&);

    friend void finalize(task_scheduler_handle&);
    friend bool finalize(task_scheduler_handle&, const std::nothrow_t&) noexcept;

    global_control* m_ctl{nullptr};   // nullptr means "empty handle"
};

#if TBB_USE_EXCEPTIONS
//! Waits for worker threads termination. Throws exception on error.
// Does nothing for an empty handle. The on_completion callback asserts that the handle
// is empty afterwards.
inline void finalize(task_scheduler_handle& handle) {
    try_call([&] {
        if (handle.m_ctl != nullptr) {
            bool finalized = r1::finalize(handle, finalize_throwing);
            __TBB_ASSERT_EX(finalized, "r1::finalize did not respect finalize_throwing ?");

        }
    }).on_completion([&] {
        __TBB_ASSERT(!handle, "The handle should be empty after finalize");
    });
}
#endif
//! Waits for worker threads termination. Returns false on error.
// Non-throwing overload selected by passing a std::nothrow_t tag.
// Returns true for an empty handle; otherwise returns the result of r1::finalize()
// called in finalize_nothrowing mode. The handle is asserted to be empty afterwards.
inline bool finalize(task_scheduler_handle& handle, const std::nothrow_t&) noexcept {
    bool finalized = true;
    if (handle.m_ctl != nullptr) {
        finalized = r1::finalize(handle, finalize_nothrowing);
    }
    __TBB_ASSERT(!handle, "The handle should be empty after finalize");
    return finalized;
}

} // namespace d1
} // namespace detail

// Public names exported into namespace tbb.
inline namespace v1 {
using detail::d1::global_control;
using detail::d1::attach;
using detail::d1::finalize;
using detail::d1::task_scheduler_handle;
using detail::r1::unsafe_wait;
} // namespace v1

} // namespace tbb

#endif // __TBB_global_control_H


================================================
FILE: third-party/tbb/include/oneapi/tbb/info.h
================================================
/*
    Copyright (c) 2019-2022 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef __TBB_info_H
#define __TBB_info_H

#include "detail/_config.h"
#include "detail/_namespace_injection.h"

// Everything in this header is compiled only when arena binding is enabled.
#if __TBB_ARENA_BINDING
#include
#include

namespace tbb {
namespace detail {

namespace d1{

// Both identifiers are plain ints; the members of `constraints` that hold them
// default to -1.
using numa_node_id = int;
using core_type_id = int;

// TODO: consider version approach to resolve backward compatibility potential issues.
// Bundle of four integer settings (NUMA node, maximal concurrency, core type, threads per
// core). Every member defaults to -1, and each set_* method returns *this so that calls
// can be chained.
struct constraints {
#if !__TBB_CPP20_PRESENT
    // Two-argument constructor, compiled only when __TBB_CPP20_PRESENT is not set.
    constraints(numa_node_id id = -1, int maximal_concurrency = -1)
        : numa_id(id)
        , max_concurrency(maximal_concurrency)
    {}
#endif /*!__TBB_CPP20_PRESENT*/

    constraints& set_numa_id(numa_node_id id) {
        numa_id = id;
        return *this;
    }
    constraints& set_max_concurrency(int maximal_concurrency) {
        max_concurrency = maximal_concurrency;
        return *this;
    }
    constraints& set_core_type(core_type_id id) {
        core_type = id;
        return *this;
    }
    constraints& set_max_threads_per_core(int threads_number) {
        max_threads_per_core = threads_number;
        return *this;
    }

    numa_node_id numa_id = -1;
    int max_concurrency = -1;
    core_type_id core_type = -1;
    int max_threads_per_core = -1;
};

} // namespace d1

// Entry points exported by the library binary.
namespace r1 {
TBB_EXPORT unsigned __TBB_EXPORTED_FUNC numa_node_count();
TBB_EXPORT void __TBB_EXPORTED_FUNC fill_numa_indices(int* index_array);
TBB_EXPORT int __TBB_EXPORTED_FUNC numa_default_concurrency(int numa_id);

// Reserved fields are required to save binary backward compatibility in case of future changes.
// They must be defined to 0 at this moment.
TBB_EXPORT unsigned __TBB_EXPORTED_FUNC core_type_count(intptr_t reserved = 0);
TBB_EXPORT void __TBB_EXPORTED_FUNC fill_core_type_indices(int* index_array, intptr_t reserved = 0);

TBB_EXPORT int __TBB_EXPORTED_FUNC constraints_default_concurrency(const d1::constraints& c, intptr_t reserved = 0);
TBB_EXPORT int __TBB_EXPORTED_FUNC constraints_threads_per_core(const d1::constraints& c, intptr_t reserved = 0);
} // namespace r1

namespace d1 {

// Returns a vector with one entry per NUMA node: it is sized with r1::numa_node_count()
// and then filled in place by r1::fill_numa_indices().
inline std::vector numa_nodes() {
    std::vector node_indices(r1::numa_node_count());
    r1::fill_numa_indices(node_indices.data());
    return node_indices;
}

// Returns the library's default concurrency for the given NUMA node id (-1 by default).
inline int default_concurrency(numa_node_id id = -1) {
    return r1::numa_default_concurrency(id);
}

// Returns a vector with one entry per core type: it is sized with r1::core_type_count()
// and then filled in place by r1::fill_core_type_indices().
inline std::vector core_types() {
    std::vector core_type_indexes(r1::core_type_count());
    r1::fill_core_type_indices(core_type_indexes.data());
    return core_type_indexes;
}

// Returns c.max_concurrency when it is positive; otherwise asks the library for the
// default concurrency that matches the constraints.
inline int default_concurrency(constraints c) {
    if (c.max_concurrency > 0) { return c.max_concurrency; }
    return r1::constraints_default_concurrency(c);
}

} // namespace d1
} // namespace detail

// Public names: the id types live in tbb, the query functions in tbb::info.
inline namespace v1 {
using detail::d1::numa_node_id;
using detail::d1::core_type_id;

namespace info {
using detail::d1::numa_nodes;
using detail::d1::core_types;

using detail::d1::default_concurrency;
} // namespace info
} // namespace v1

} // namespace tbb

#endif /*__TBB_ARENA_BINDING*/

#endif /*__TBB_info_H*/


================================================
FILE: third-party/tbb/include/oneapi/tbb/memory_pool.h
================================================
/*
    Copyright (c) 2005-2024 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef __TBB_memory_pool_H
#define __TBB_memory_pool_H

// This header is a preview feature: including it without the macro is a hard error.
#if !TBB_PREVIEW_MEMORY_POOL
#error Set TBB_PREVIEW_MEMORY_POOL to include memory_pool.h
#endif
/** @file */

#include "scalable_allocator.h"

#include // std::bad_alloc
#include // std::runtime_error, std::invalid_argument
#include // std::forward


// Assertions in this header are active only in extra-debug builds; otherwise the macro
// expands to a no-op and its arguments are not evaluated.
#if __TBB_EXTRA_DEBUG
#define __TBBMALLOC_ASSERT ASSERT
#else
#define __TBBMALLOC_ASSERT(a,b) ((void)0)
#endif

namespace tbb {
namespace detail {
namespace d1 {

//! Base of thread-safe pool allocator for variable-size requests
// Thin wrapper that forwards every operation to the rml::pool_* function of the same
// purpose, passing the stored my_pool handle. my_pool is not initialized here; the
// derived classes in this header set it through rml::pool_create_v1().
class pool_base : no_copy {
    // Pool interface is separate from standard allocator classes because it has
    // to maintain internal state, no copy or assignment. Move and swap are possible.
public:
    //! Reset pool to reuse its memory (free all objects at once)
    void recycle() { rml::pool_reset(my_pool); }

    //! The "malloc" analogue to allocate block of memory of size bytes
    void *malloc(size_t size) { return rml::pool_malloc(my_pool, size); }

    //! The "free" analogue to discard a previously allocated piece of memory.
    void free(void* ptr) { rml::pool_free(my_pool, ptr); }

    //! The "realloc" analogue complementing pool_malloc.
    // Enables some low-level optimization possibilities
    void *realloc(void* ptr, size_t size) {
        return rml::pool_realloc(my_pool, ptr, size);
    }

protected:
    //! destroy pool - must be called in a child class
    void destroy() { rml::pool_destroy(my_pool); }

    rml::MemoryPool *my_pool;
};

#if _MSC_VER && !defined(__INTEL_COMPILER)
    // Workaround for erroneous "unreferenced parameter" warning in method destroy.
    #pragma warning (push)
    #pragma warning (disable: 4100)
#endif

//! 
Meets "allocator" requirements of ISO C++ Standard, Section 20.1.5 /** @ingroup memory_allocation */ template class memory_pool_allocator { protected: typedef P pool_type; pool_type *my_pool; template friend class memory_pool_allocator; template friend bool operator==( const memory_pool_allocator& a, const memory_pool_allocator& b); template friend bool operator!=( const memory_pool_allocator& a, const memory_pool_allocator& b); public: typedef T value_type; typedef value_type* pointer; typedef const value_type* const_pointer; typedef value_type& reference; typedef const value_type& const_reference; typedef size_t size_type; typedef ptrdiff_t difference_type; template struct rebind { typedef memory_pool_allocator other; }; explicit memory_pool_allocator(pool_type &pool) noexcept : my_pool(&pool) {} memory_pool_allocator(const memory_pool_allocator& src) noexcept : my_pool(src.my_pool) {} template memory_pool_allocator(const memory_pool_allocator& src) noexcept : my_pool(src.my_pool) {} pointer address(reference x) const { return &x; } const_pointer address(const_reference x) const { return &x; } //! Allocate space for n objects. pointer allocate( size_type n, const void* /*hint*/ = nullptr) { pointer p = static_cast( my_pool->malloc( n*sizeof(value_type) ) ); if (!p) throw_exception(std::bad_alloc()); return p; } //! Free previously allocated block of memory. void deallocate( pointer p, size_type ) { my_pool->free(p); } //! Largest value for which method allocate might succeed. size_type max_size() const noexcept { size_type max = static_cast(-1) / sizeof (value_type); return (max > 0 ? max : 1); } //! Copy-construct value at location pointed to by p. template void construct(U *p, Args&&... args) { ::new((void *)p) U(std::forward(args)...); } //! Destroy value at location pointed to by p. void destroy( pointer p ) { p->~value_type(); } }; #if _MSC_VER && !defined(__INTEL_COMPILER) #pragma warning (pop) #endif // warning 4100 is back //! 
//! Analogous to std::allocator<void>, as defined in ISO C++ Standard, Section 20.4.1
/** @ingroup memory_allocation */
// Specialization with value_type void: it only carries the pool pointer and the rebind
// machinery, and provides no allocate/deallocate members.
template
class memory_pool_allocator {
public:
    typedef P pool_type;
    typedef void* pointer;
    typedef const void* const_pointer;
    typedef void value_type;
    template struct rebind {
        typedef memory_pool_allocator other;
    };

    explicit memory_pool_allocator( pool_type &pool) noexcept : my_pool(&pool) {}
    memory_pool_allocator( const memory_pool_allocator& src) noexcept : my_pool(src.my_pool) {}
    template
    memory_pool_allocator(const memory_pool_allocator& src) noexcept : my_pool(src.my_pool) {}

protected:
    pool_type *my_pool;
    template
    friend class memory_pool_allocator;
    template
    friend bool operator==( const memory_pool_allocator& a, const memory_pool_allocator& b);
    template
    friend bool operator!=( const memory_pool_allocator& a, const memory_pool_allocator& b);
};

// Two allocators are equal exactly when they refer to the same pool object.
template
inline bool operator==( const memory_pool_allocator& a, const memory_pool_allocator& b) {return a.my_pool==b.my_pool;}

template
inline bool operator!=( const memory_pool_allocator& a, const memory_pool_allocator& b) {return a.my_pool!=b.my_pool;}

//! Thread-safe growable pool allocator for variable-size requests
// Owns a copy of the underlying allocator (my_alloc). The two static callbacks are
// registered with the pool in the constructor and request/return raw memory through
// my_alloc.
template
class memory_pool : public pool_base {
    Alloc my_alloc; // TODO: base-class optimization
    static void *allocate_request(intptr_t pool_id, size_t & bytes);
    static int deallocate_request(intptr_t pool_id, void*, size_t raw_bytes);

public:
    //! construct pool with underlying allocator
    explicit memory_pool(const Alloc &src = Alloc());

    //! destroy pool
    ~memory_pool() { destroy(); } // call the callbacks first and destroy my_alloc later
};

// Pool that works on a single buffer supplied by the caller. The buffer is handed out
// once by allocate_request(), which then sets my_size to 0.
class fixed_pool : public pool_base {
    void *my_buffer;
    size_t my_size;
    inline static void *allocate_request(intptr_t pool_id, size_t & bytes);

public:
    //! construct pool over the caller-supplied buffer `buf` of `size` bytes
    inline fixed_pool(void *buf, size_t size);
    //! destroy pool
    ~fixed_pool() { destroy(); }
};

//////////////// Implementation ///////////////

// Creates the pool, registering the two callbacks and sizeof(Alloc::value_type) in the
// pool policy; the address of this object is passed as the pool id so that the callbacks
// can find it again. Throws std::runtime_error when pool creation fails.
template
memory_pool::memory_pool(const Alloc &src) : my_alloc(src) {
    rml::MemPoolPolicy args(allocate_request, deallocate_request,
                            sizeof(typename Alloc::value_type));
    rml::MemPoolError res = rml::pool_create_v1(intptr_t(this), &args, &my_pool);
    if (res!=rml::POOL_OK)
        throw_exception(std::runtime_error("Can't create pool"));
}

// Pool callback: allocates `bytes` bytes, expressed as bytes/unit_size elements, from
// my_alloc. With exceptions enabled, any exception thrown by the allocator is swallowed
// and reported to the pool as a nullptr result.
template
void *memory_pool::allocate_request(intptr_t pool_id, size_t & bytes) {
    memory_pool &self = *reinterpret_cast*>(pool_id);
    const size_t unit_size = sizeof(typename Alloc::value_type);
    __TBBMALLOC_ASSERT( 0 == bytes%unit_size, nullptr);
    void *ptr;
#if TBB_USE_EXCEPTIONS
    try {
#endif
        ptr = self.my_alloc.allocate( bytes/unit_size );
#if TBB_USE_EXCEPTIONS
    } catch(...) {
        return nullptr;
    }
#endif
    return ptr;
}

#if __TBB_MSVC_UNREACHABLE_CODE_IGNORED
    // Workaround for erroneous "unreachable code" warning in the template below.
    // Specific for VC++ 17-18 compiler
    #pragma warning (push)
    #pragma warning (disable: 4702)
#endif

// Pool callback: returns raw_bytes/unit_size elements starting at raw_ptr to my_alloc.
// Always returns 0.
template
int memory_pool::deallocate_request(intptr_t pool_id, void* raw_ptr, size_t raw_bytes) {
    memory_pool &self = *reinterpret_cast*>(pool_id);
    const size_t unit_size = sizeof(typename Alloc::value_type);
    __TBBMALLOC_ASSERT( 0 == raw_bytes%unit_size, nullptr);
    self.my_alloc.deallocate( static_cast(raw_ptr), raw_bytes/unit_size );
    return 0;
}

#if __TBB_MSVC_UNREACHABLE_CODE_IGNORED
    #pragma warning (pop)
#endif

// Throws std::invalid_argument when buf is null or size is zero, and std::runtime_error
// when pool creation fails. No deallocation callback is registered (nullptr), and the
// policy is created with fixedPool set to true.
inline fixed_pool::fixed_pool(void *buf, size_t size) : my_buffer(buf), my_size(size) {
    if (!buf || !size)
        // TODO: improve support for mode with exceptions disabled
        throw_exception(std::invalid_argument("Zero in parameter is invalid"));
    rml::MemPoolPolicy args(allocate_request, nullptr, size, /*fixedPool=*/true);
    rml::MemPoolError res = rml::pool_create_v1(intptr_t(this), &args, &my_pool);
    if (res!=rml::POOL_OK)
        throw_exception(std::runtime_error("Can't create pool"));
}

inline void 
*fixed_pool::allocate_request(intptr_t pool_id, size_t & bytes) { fixed_pool &self = *reinterpret_cast(pool_id); __TBBMALLOC_ASSERT(0 != self.my_size, "The buffer must not be used twice."); bytes = self.my_size; self.my_size = 0; // remember that buffer has been used return self.my_buffer; } } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::memory_pool_allocator; using detail::d1::memory_pool; using detail::d1::fixed_pool; } // inline namepspace v1 } // namespace tbb #undef __TBBMALLOC_ASSERT #endif// __TBB_memory_pool_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/mutex.h ================================================ /* Copyright (c) 2021-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_mutex_H #define __TBB_mutex_H #include "detail/_namespace_injection.h" #include "detail/_utils.h" #include "detail/_scoped_lock.h" #include "detail/_waitable_atomic.h" #include "detail/_mutex_common.h" #include "profiling.h" namespace tbb { namespace detail { namespace d1 { class mutex { public: //! Constructors mutex() { create_itt_sync(this, "tbb::mutex", ""); }; //! Destructor ~mutex() = default; //! No Copy mutex(const mutex&) = delete; mutex& operator=(const mutex&) = delete; using scoped_lock = unique_scoped_lock; //! Mutex traits static constexpr bool is_rw_mutex = false; static constexpr bool is_recursive_mutex = false; static constexpr bool is_fair_mutex = false; //! 
Acquire lock /** Spin if the lock is taken */ void lock() { call_itt_notify(prepare, this); while (!try_lock()) { my_flag.wait(true, /* context = */ 0, std::memory_order_relaxed); } } //! Try acquiring lock (non-blocking) /** Return true if lock acquired; false otherwise. */ bool try_lock() { bool result = !my_flag.load(std::memory_order_relaxed) && !my_flag.exchange(true); if (result) { call_itt_notify(acquired, this); } return result; } //! Release lock void unlock() { call_itt_notify(releasing, this); // We need Write Read memory barrier before notify that reads the waiter list. // In C++ only full fence covers this type of barrier. my_flag.exchange(false); my_flag.notify_one_relaxed(); } private: waitable_atomic my_flag{0}; }; // class mutex } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::mutex; } // namespace v1 } // namespace tbb #endif // __TBB_mutex_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/null_mutex.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_null_mutex_H #define __TBB_null_mutex_H #include "detail/_config.h" #include "detail/_namespace_injection.h" #include "detail/_mutex_common.h" namespace tbb { namespace detail { namespace d1 { //! A mutex which does nothing /** A null_mutex does no operation and simulates success. @ingroup synchronization */ class null_mutex { public: //! 
Constructors constexpr null_mutex() noexcept = default; //! Destructor ~null_mutex() = default; //! No Copy null_mutex(const null_mutex&) = delete; null_mutex& operator=(const null_mutex&) = delete; //! Represents acquisition of a mutex. class scoped_lock { public: //! Constructors constexpr scoped_lock() noexcept = default; scoped_lock(null_mutex&) {} //! Destructor ~scoped_lock() = default; //! No Copy scoped_lock(const scoped_lock&) = delete; scoped_lock& operator=(const scoped_lock&) = delete; void acquire(null_mutex&) {} bool try_acquire(null_mutex&) { return true; } void release() {} }; //! Mutex traits static constexpr bool is_rw_mutex = false; static constexpr bool is_recursive_mutex = true; static constexpr bool is_fair_mutex = true; void lock() {} bool try_lock() { return true; } void unlock() {} }; // class null_mutex } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::null_mutex; } // namespace v1 } // namespace tbb #endif /* __TBB_null_mutex_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/null_rw_mutex.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_null_rw_mutex_H #define __TBB_null_rw_mutex_H #include "detail/_config.h" #include "detail/_namespace_injection.h" #include "detail/_mutex_common.h" namespace tbb { namespace detail { namespace d1 { //! 
A rw mutex which does nothing /** A null_rw_mutex is a rw mutex that does nothing and simulates successful operation. @ingroup synchronization */ class null_rw_mutex { public: //! Constructors constexpr null_rw_mutex() noexcept = default; //! Destructor ~null_rw_mutex() = default; //! No Copy null_rw_mutex(const null_rw_mutex&) = delete; null_rw_mutex& operator=(const null_rw_mutex&) = delete; //! Represents acquisition of a mutex. class scoped_lock { public: //! Constructors constexpr scoped_lock() noexcept = default; scoped_lock(null_rw_mutex&, bool = true) {} //! Destructor ~scoped_lock() = default; //! No Copy scoped_lock(const scoped_lock&) = delete; scoped_lock& operator=(const scoped_lock&) = delete; void acquire(null_rw_mutex&, bool = true) {} bool try_acquire(null_rw_mutex&, bool = true) { return true; } void release() {} bool upgrade_to_writer() { return true; } bool downgrade_to_reader() { return true; } bool is_writer() const { return true; } }; //! Mutex traits static constexpr bool is_rw_mutex = true; static constexpr bool is_recursive_mutex = true; static constexpr bool is_fair_mutex = true; void lock() {} bool try_lock() { return true; } void unlock() {} void lock_shared() {} bool try_lock_shared() { return true; } void unlock_shared() {} }; // class null_rw_mutex } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::null_rw_mutex; } // namespace v1 } // namespace tbb #endif /* __TBB_null_rw_mutex_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/parallel_for.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_parallel_for_H #define __TBB_parallel_for_H #include "detail/_config.h" #include "detail/_namespace_injection.h" #include "detail/_exception.h" #include "detail/_task.h" #include "detail/_small_object_pool.h" #include "profiling.h" #include "partitioner.h" #include "blocked_range.h" #include "task_group.h" #include #include namespace tbb { namespace detail { #if __TBB_CPP20_CONCEPTS_PRESENT inline namespace d0 { template concept parallel_for_body = std::copy_constructible && std::invocable&, Range&>; template concept parallel_for_index = std::constructible_from && std::copyable && requires( const std::remove_reference_t& lhs, const std::remove_reference_t& rhs ) { { lhs < rhs } -> adaptive_same_as; { lhs - rhs } -> std::convertible_to; { lhs + (rhs - lhs) } -> std::convertible_to; }; template concept parallel_for_function = std::invocable&, Index>; } // namespace d0 #endif // __TBB_CPP20_CONCEPTS_PRESENT namespace d1 { //! Task type used in parallel_for /** @ingroup algorithms */ template struct start_for : public task { Range my_range; const Body my_body; node* my_parent; typename Partitioner::task_partition_type my_partition; small_object_allocator my_allocator; task* execute(execution_data&) override; task* cancel(execution_data&) override; void finalize(const execution_data&); //! Constructor for root task. start_for( const Range& range, const Body& body, Partitioner& partitioner, small_object_allocator& alloc ) : my_range(range), my_body(body), my_parent(nullptr), my_partition(partitioner), my_allocator(alloc) {} //! 
Splitting constructor used to generate children. /** parent_ becomes left child. Newly constructed object is right child. */ start_for( start_for& parent_, typename Partitioner::split_type& split_obj, small_object_allocator& alloc ) : my_range(parent_.my_range, get_range_split_object(split_obj)), my_body(parent_.my_body), my_parent(nullptr), my_partition(parent_.my_partition, split_obj), my_allocator(alloc) {} //! Construct right child from the given range as response to the demand. /** parent_ remains left child. Newly constructed object is right child. */ start_for( start_for& parent_, const Range& r, depth_t d, small_object_allocator& alloc ) : my_range(r), my_body(parent_.my_body), my_parent(nullptr), my_partition(parent_.my_partition, split()), my_allocator(alloc) { my_partition.align_depth( d ); } static void run(const Range& range, const Body& body, Partitioner& partitioner) { task_group_context context(PARALLEL_FOR); run(range, body, partitioner, context); } static void run(const Range& range, const Body& body, Partitioner& partitioner, task_group_context& context) { if ( !range.empty() ) { small_object_allocator alloc{}; start_for& for_task = *alloc.new_object(range, body, partitioner, alloc); // defer creation of the wait node until task allocation succeeds wait_node wn; for_task.my_parent = &wn; execute_and_wait(for_task, context, wn.m_wait, context); } } //! Run body for range, serves as callback for partitioner void run_body( Range &r ) { tbb::detail::invoke(my_body, r); } //! spawn right task, serves as callback for partitioner void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) { offer_work_impl(ed, *this, split_obj); } //! spawn right task, serves as callback for partitioner void offer_work(const Range& r, depth_t d, execution_data& ed) { offer_work_impl(ed, *this, r, d); } private: template void offer_work_impl(execution_data& ed, Args&&... 
constructor_args) { // New right child small_object_allocator alloc{}; start_for& right_child = *alloc.new_object(ed, std::forward(constructor_args)..., alloc); // New root node as a continuation and ref count. Left and right child attach to the new parent. right_child.my_parent = my_parent = alloc.new_object(ed, my_parent, 2, alloc); // Spawn the right sibling right_child.spawn_self(ed); } void spawn_self(execution_data& ed) { my_partition.spawn_task(*this, *context(ed)); } }; //! fold the tree and deallocate the task template void start_for::finalize(const execution_data& ed) { // Get the current parent and allocator an object destruction node* parent = my_parent; auto allocator = my_allocator; // Task execution finished - destroy it this->~start_for(); // Unwind the tree decrementing the parent`s reference count fold_tree(parent, ed); allocator.deallocate(this, ed); } //! execute task for parallel_for template task* start_for::execute(execution_data& ed) { if (!is_same_affinity(ed)) { my_partition.note_affinity(execution_slot(ed)); } my_partition.check_being_stolen(*this, ed); my_partition.execute(*this, my_range, ed); finalize(ed); return nullptr; } //! cancel task for parallel_for template task* start_for::cancel(execution_data& ed) { finalize(ed); return nullptr; } //! Calls the function with values from range [begin, end) with a step provided template class parallel_for_body_wrapper : detail::no_assign { const Function &my_func; const Index my_begin; const Index my_step; public: parallel_for_body_wrapper( const Function& _func, Index& _begin, Index& _step ) : my_func(_func), my_begin(_begin), my_step(_step) {} void operator()( const blocked_range& r ) const { // A set of local variables to help the compiler with vectorization of the following loop. 
Index b = r.begin(); Index e = r.end(); Index ms = my_step; Index k = my_begin + b*ms; #if __INTEL_COMPILER #pragma ivdep #if __TBB_ASSERT_ON_VECTORIZATION_FAILURE #pragma vector always assert #endif #endif for ( Index i = b; i < e; ++i, k += ms ) { tbb::detail::invoke(my_func, k); } } }; // Requirements on Range concept are documented in blocked_range.h /** \page parallel_for_body_req Requirements on parallel_for body Class \c Body implementing the concept of parallel_for body must define: - \code Body::Body( const Body& ); \endcode Copy constructor - \code Body::~Body(); \endcode Destructor - \code void Body::operator()( Range& r ) const; \endcode Function call operator applying the body to range \c r. **/ /** \name parallel_for See also requirements on \ref range_req "Range" and \ref parallel_for_body_req "parallel_for Body". **/ //@{ //! Parallel iteration over range with default partitioner. /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_for_body) void parallel_for( const Range& range, const Body& body ) { start_for::run(range,body,__TBB_DEFAULT_PARTITIONER()); } //! Parallel iteration over range with simple partitioner. /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_for_body) void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner ) { start_for::run(range,body,partitioner); } //! Parallel iteration over range with auto_partitioner. /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_for_body) void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner ) { start_for::run(range,body,partitioner); } //! Parallel iteration over range with static_partitioner. /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_for_body) void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner ) { start_for::run(range,body,partitioner); } //! 
Parallel iteration over range with affinity_partitioner. /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_for_body) void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner ) { start_for::run(range,body,partitioner); } //! Parallel iteration over range with default partitioner and user-supplied context. /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_for_body) void parallel_for( const Range& range, const Body& body, task_group_context& context ) { start_for::run(range, body, __TBB_DEFAULT_PARTITIONER(), context); } //! Parallel iteration over range with simple partitioner and user-supplied context. /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_for_body) void parallel_for( const Range& range, const Body& body, const simple_partitioner& partitioner, task_group_context& context ) { start_for::run(range, body, partitioner, context); } //! Parallel iteration over range with auto_partitioner and user-supplied context. /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_for_body) void parallel_for( const Range& range, const Body& body, const auto_partitioner& partitioner, task_group_context& context ) { start_for::run(range, body, partitioner, context); } //! Parallel iteration over range with static_partitioner and user-supplied context. /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_for_body) void parallel_for( const Range& range, const Body& body, const static_partitioner& partitioner, task_group_context& context ) { start_for::run(range, body, partitioner, context); } //! Parallel iteration over range with affinity_partitioner and user-supplied context. 
/** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_for_body) void parallel_for( const Range& range, const Body& body, affinity_partitioner& partitioner, task_group_context& context ) { start_for::run(range,body,partitioner, context); } //! Implementation of parallel iteration over stepped range of integers with explicit step and partitioner template void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner) { if (step <= 0 ) throw_exception(exception_id::nonpositive_step); // throws std::invalid_argument else if (first < last) { // Above "else" avoids "potential divide by zero" warning on some platforms Index end = Index(last - first - 1ul) / step + Index(1); blocked_range range(static_cast(0), end); parallel_for_body_wrapper body(f, first, step); parallel_for(range, body, partitioner); } } //! Parallel iteration over a range of integers with a step provided and default partitioner template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, Index step, const Function& f) { parallel_for_impl(first, last, step, f, __TBB_DEFAULT_PARTITIONER()); } //! Parallel iteration over a range of integers with a step provided and simple partitioner template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner) { parallel_for_impl(first, last, step, f, partitioner); } //! Parallel iteration over a range of integers with a step provided and auto partitioner template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner) { parallel_for_impl(first, last, step, f, partitioner); } //! 
Parallel iteration over a range of integers with a step provided and static partitioner template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner) { parallel_for_impl(first, last, step, f, partitioner); } //! Parallel iteration over a range of integers with a step provided and affinity partitioner template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner) { parallel_for_impl(first, last, step, f, partitioner); } //! Parallel iteration over a range of integers with a default step value and default partitioner template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, const Function& f) { parallel_for_impl(first, last, static_cast(1), f, __TBB_DEFAULT_PARTITIONER()); } //! Parallel iteration over a range of integers with a default step value and simple partitioner template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner) { parallel_for_impl(first, last, static_cast(1), f, partitioner); } //! Parallel iteration over a range of integers with a default step value and auto partitioner template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner) { parallel_for_impl(first, last, static_cast(1), f, partitioner); } //! Parallel iteration over a range of integers with a default step value and static partitioner template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner) { parallel_for_impl(first, last, static_cast(1), f, partitioner); } //! 
Parallel iteration over a range of integers with a default step value and affinity partitioner template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner) { parallel_for_impl(first, last, static_cast(1), f, partitioner); } //! Implementation of parallel iteration over stepped range of integers with explicit step, task group context, and partitioner template void parallel_for_impl(Index first, Index last, Index step, const Function& f, Partitioner& partitioner, task_group_context &context) { if (step <= 0 ) throw_exception(exception_id::nonpositive_step); // throws std::invalid_argument else if (first < last) { // Above "else" avoids "potential divide by zero" warning on some platforms Index end = (last - first - Index(1)) / step + Index(1); blocked_range range(static_cast(0), end); parallel_for_body_wrapper body(f, first, step); parallel_for(range, body, partitioner, context); } } //! Parallel iteration over a range of integers with explicit step, task group context, and default partitioner template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, Index step, const Function& f, task_group_context &context) { parallel_for_impl(first, last, step, f, __TBB_DEFAULT_PARTITIONER(), context); } //! Parallel iteration over a range of integers with explicit step, task group context, and simple partitioner template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, Index step, const Function& f, const simple_partitioner& partitioner, task_group_context &context) { parallel_for_impl(first, last, step, f, partitioner, context); } //! 
Parallel iteration over a range of integers with explicit step, task group context, and auto partitioner template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, Index step, const Function& f, const auto_partitioner& partitioner, task_group_context &context) { parallel_for_impl(first, last, step, f, partitioner, context); } //! Parallel iteration over a range of integers with explicit step, task group context, and static partitioner template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, Index step, const Function& f, const static_partitioner& partitioner, task_group_context &context) { parallel_for_impl(first, last, step, f, partitioner, context); } //! Parallel iteration over a range of integers with explicit step, task group context, and affinity partitioner template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, Index step, const Function& f, affinity_partitioner& partitioner, task_group_context &context) { parallel_for_impl(first, last, step, f, partitioner, context); } //! Parallel iteration over a range of integers with a default step value, explicit task group context, and default partitioner template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, const Function& f, task_group_context &context) { parallel_for_impl(first, last, static_cast(1), f, __TBB_DEFAULT_PARTITIONER(), context); } //! Parallel iteration over a range of integers with a default step value, explicit task group context, and simple partitioner template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, const Function& f, const simple_partitioner& partitioner, task_group_context &context) { parallel_for_impl(first, last, static_cast(1), f, partitioner, context); } //! 
Parallel iteration over a range of integers with a default step value, explicit task group context, and auto partitioner template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, const Function& f, const auto_partitioner& partitioner, task_group_context &context) { parallel_for_impl(first, last, static_cast(1), f, partitioner, context); } //! Parallel iteration over a range of integers with a default step value, explicit task group context, and static partitioner template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, const Function& f, const static_partitioner& partitioner, task_group_context &context) { parallel_for_impl(first, last, static_cast(1), f, partitioner, context); } //! Parallel iteration over a range of integers with a default step value, explicit task group context, and affinity_partitioner template __TBB_requires(parallel_for_index && parallel_for_function) void parallel_for(Index first, Index last, const Function& f, affinity_partitioner& partitioner, task_group_context &context) { parallel_for_impl(first, last, static_cast(1), f, partitioner, context); } // @} } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::parallel_for; // Split types using detail::split; using detail::proportional_split; } // namespace v1 } // namespace tbb #endif /* __TBB_parallel_for_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/parallel_for_each.h ================================================ /* Copyright (c) 2005-2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_parallel_for_each_H #define __TBB_parallel_for_each_H #include "detail/_config.h" #include "detail/_namespace_injection.h" #include "detail/_exception.h" #include "detail/_task.h" #include "detail/_aligned_space.h" #include "detail/_small_object_pool.h" #include "detail/_utils.h" #include "parallel_for.h" #include "task_group.h" // task_group_context #include #include namespace tbb { namespace detail { #if __TBB_CPP20_CONCEPTS_PRESENT namespace d1 { template class feeder; } // namespace d1 inline namespace d0 { template concept parallel_for_each_body = std::invocable&, ItemType&&> || std::invocable&, ItemType&&, tbb::detail::d1::feeder&>; } // namespace d0 #endif // __TBB_CPP20_CONCEPTS_PRESENT namespace d2 { template class feeder_impl; } // namespace d2 namespace d1 { //! Class the user supplied algorithm body uses to add new tasks template class feeder { feeder() {} feeder(const feeder&) = delete; void operator=( const feeder&) = delete; virtual ~feeder () {} virtual void internal_add_copy(const Item& item) = 0; virtual void internal_add_move(Item&& item) = 0; template friend class d2::feeder_impl; public: //! Add a work item to a running parallel_for_each. void add(const Item& item) {internal_add_copy(item);} void add(Item&& item) {internal_add_move(std::move(item));} }; } // namespace d1 namespace d2 { using namespace tbb::detail::d1; /** Selects one of the two possible forms of function call member operator. 
@ingroup algorithms **/ template struct parallel_for_each_operator_selector { public: template static auto call(const Body& body, ItemArg&& item, FeederArg*) -> decltype(tbb::detail::invoke(body, std::forward(item)), void()) { #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) // Suppression of Microsoft non-standard extension warnings #pragma warning (push) #pragma warning (disable: 4239) #endif tbb::detail::invoke(body, std::forward(item)); #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) #pragma warning (pop) #endif } template static auto call(const Body& body, ItemArg&& item, FeederArg* feeder) -> decltype(tbb::detail::invoke(body, std::forward(item), *feeder), void()) { #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) // Suppression of Microsoft non-standard extension warnings #pragma warning (push) #pragma warning (disable: 4239) #endif __TBB_ASSERT(feeder, "Feeder was not created but should be"); tbb::detail::invoke(body, std::forward(item), *feeder); #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) #pragma warning (pop) #endif } }; template struct feeder_item_task: public task { using feeder_type = feeder_impl; template feeder_item_task(ItemType&& input_item, feeder_type& feeder, small_object_allocator& alloc, wait_tree_vertex_interface& wait_vertex) : item(std::forward(input_item)), my_feeder(feeder), my_allocator(alloc), m_wait_tree_vertex(r1::get_thread_reference_vertex(&wait_vertex)) { m_wait_tree_vertex->reserve(); } void finalize(const execution_data& ed) { m_wait_tree_vertex->release(); my_allocator.delete_object(this, ed); } //! Hack for resolve ambiguity between calls to the body with and without moving the stored copy //! 
Executing body with moving the copy should have higher priority using first_priority = int; using second_priority = double; template static auto call(const BodyType& call_body, ItemType& call_item, FeederType& call_feeder, first_priority) -> decltype(parallel_for_each_operator_selector::call(call_body, std::move(call_item), &call_feeder), void()) { parallel_for_each_operator_selector::call(call_body, std::move(call_item), &call_feeder); } template static void call(const BodyType& call_body, ItemType& call_item, FeederType& call_feeder, second_priority) { parallel_for_each_operator_selector::call(call_body, call_item, &call_feeder); } task* execute(execution_data& ed) override { call(my_feeder.my_body, item, my_feeder, first_priority{}); finalize(ed); return nullptr; } task* cancel(execution_data& ed) override { finalize(ed); return nullptr; } Item item; feeder_type& my_feeder; small_object_allocator my_allocator; wait_tree_vertex_interface* m_wait_tree_vertex; }; // class feeder_item_task /** Implements new task adding procedure. 
@ingroup algorithms **/ template class feeder_impl : public feeder { // Avoiding use of copy constructor in a virtual method if the type does not support it void internal_add_copy_impl(std::true_type, const Item& item) { using feeder_task = feeder_item_task; small_object_allocator alloc; auto task = alloc.new_object(item, *this, alloc, my_wait_context); spawn(*task, my_execution_context); } void internal_add_copy_impl(std::false_type, const Item&) { __TBB_ASSERT(false, "Overloading for r-value reference doesn't work or it's not movable and not copyable object"); } void internal_add_copy(const Item& item) override { internal_add_copy_impl(typename std::is_copy_constructible::type(), item); } void internal_add_move(Item&& item) override { using feeder_task = feeder_item_task; small_object_allocator alloc{}; auto task = alloc.new_object(std::move(item), *this, alloc, my_wait_context); spawn(*task, my_execution_context); } public: feeder_impl(const Body& body, wait_context_vertex& w_context, task_group_context &context) : my_body(body), my_wait_context(w_context) , my_execution_context(context) {} const Body& my_body; wait_context_vertex& my_wait_context; task_group_context& my_execution_context; }; // class feeder_impl /** Execute computation under one element of the range @ingroup algorithms **/ template struct for_each_iteration_task: public task { using feeder_type = feeder_impl; for_each_iteration_task(Iterator input_item_ptr, const Body& body, feeder_impl* feeder_ptr, wait_context& wait_context) : item_ptr(input_item_ptr), my_body(body), my_feeder_ptr(feeder_ptr), parent_wait_context(wait_context) {} void finalize() { parent_wait_context.release(); } task* execute(execution_data&) override { parallel_for_each_operator_selector::call(my_body, *item_ptr, my_feeder_ptr); finalize(); return nullptr; } task* cancel(execution_data&) override { finalize(); return nullptr; } Iterator item_ptr; const Body& my_body; feeder_impl* my_feeder_ptr; wait_context& 
parent_wait_context; }; // class for_each_iteration_task // Helper to get the type of the iterator to the internal sequence of copies // If the element can be passed to the body as an rvalue - this iterator should be move_iterator template struct input_iteration_task_iterator_helper { // For input iterators we pass const lvalue reference to the body // It is prohibited to take non-constant lvalue references for input iterators using type = const Item*; }; template struct input_iteration_task_iterator_helper::call(std::declval(), std::declval(), std::declval*>()))>> { using type = std::move_iterator; }; /** Split one block task to several(max_block_size) iteration tasks for input iterators @ingroup algorithms **/ template struct input_block_handling_task : public task { static constexpr size_t max_block_size = 4; using feeder_type = feeder_impl; using iteration_task_iterator_type = typename input_iteration_task_iterator_helper::type; using iteration_task = for_each_iteration_task; input_block_handling_task(wait_context_vertex& root_wait_context, task_group_context& e_context, const Body& body, feeder_impl* feeder_ptr, small_object_allocator& alloc) :my_size(0), my_wait_context(0), my_root_wait_context(root_wait_context), my_execution_context(e_context), my_allocator(alloc) { auto item_it = block_iteration_space.begin(); for (auto* it = task_pool.begin(); it != task_pool.end(); ++it) { new (it) iteration_task(iteration_task_iterator_type(item_it++), body, feeder_ptr, my_wait_context); } } void finalize(const execution_data& ed) { my_root_wait_context.release(); my_allocator.delete_object(this, ed); } task* execute(execution_data& ed) override { __TBB_ASSERT( my_size > 0, "Negative size was passed to task"); for (std::size_t counter = 1; counter < my_size; ++counter) { my_wait_context.reserve(); spawn(*(task_pool.begin() + counter), my_execution_context); } my_wait_context.reserve(); execute_and_wait(*task_pool.begin(), my_execution_context, my_wait_context, 
my_execution_context); // deallocate current task after children execution finalize(ed); return nullptr; } task* cancel(execution_data& ed) override { finalize(ed); return nullptr; } ~input_block_handling_task() { for(std::size_t counter = 0; counter < max_block_size; ++counter) { (task_pool.begin() + counter)->~iteration_task(); if (counter < my_size) { (block_iteration_space.begin() + counter)->~Item(); } } } aligned_space block_iteration_space; aligned_space task_pool; std::size_t my_size; wait_context my_wait_context; wait_context_vertex& my_root_wait_context; task_group_context& my_execution_context; small_object_allocator my_allocator; }; // class input_block_handling_task /** Split one block task to several(max_block_size) iteration tasks for forward iterators @ingroup algorithms **/ template struct forward_block_handling_task : public task { static constexpr size_t max_block_size = 4; using iteration_task = for_each_iteration_task; forward_block_handling_task(Iterator first, std::size_t size, wait_context_vertex& w_context, task_group_context& e_context, const Body& body, feeder_impl* feeder_ptr, small_object_allocator& alloc) : my_size(size), my_wait_context(0), my_root_wait_context(w_context), my_execution_context(e_context), my_allocator(alloc) { auto* task_it = task_pool.begin(); for (std::size_t i = 0; i < size; i++) { new (task_it++) iteration_task(first, body, feeder_ptr, my_wait_context); ++first; } } void finalize(const execution_data& ed) { my_root_wait_context.release(); my_allocator.delete_object(this, ed); } task* execute(execution_data& ed) override { __TBB_ASSERT( my_size > 0, "Negative size was passed to task"); for(std::size_t counter = 1; counter < my_size; ++counter) { my_wait_context.reserve(); spawn(*(task_pool.begin() + counter), my_execution_context); } my_wait_context.reserve(); execute_and_wait(*task_pool.begin(), my_execution_context, my_wait_context, my_execution_context); // deallocate current task after children execution 
finalize(ed); return nullptr; } task* cancel(execution_data& ed) override { finalize(ed); return nullptr; } ~forward_block_handling_task() { for(std::size_t counter = 0; counter < my_size; ++counter) { (task_pool.begin() + counter)->~iteration_task(); } } aligned_space task_pool; std::size_t my_size; wait_context my_wait_context; wait_context_vertex& my_root_wait_context; task_group_context& my_execution_context; small_object_allocator my_allocator; }; // class forward_block_handling_task /** Body for parallel_for algorithm. * Allows to redirect operations under random access iterators range to the parallel_for algorithm. @ingroup algorithms **/ template class parallel_for_body_wrapper { Iterator my_first; const Body& my_body; feeder_impl* my_feeder_ptr; public: parallel_for_body_wrapper(Iterator first, const Body& body, feeder_impl* feeder_ptr) : my_first(first), my_body(body), my_feeder_ptr(feeder_ptr) {} void operator()(tbb::blocked_range range) const { #if __INTEL_COMPILER #pragma ivdep #endif for (std::size_t count = range.begin(); count != range.end(); count++) { parallel_for_each_operator_selector::call(my_body, *(my_first + count), my_feeder_ptr); } } }; // class parallel_for_body_wrapper /** Helper for getting iterators tag including inherited custom tags @ingroup algorithms */ template using tag = typename std::iterator_traits::iterator_category; #if __TBB_CPP20_CONCEPTS_PRESENT template struct move_iterator_dispatch_helper { using type = It; }; // Until C++23, std::move_iterator::iterator_concept always defines // to std::input_iterator_tag and hence std::forward_iterator concept // always evaluates to false, so std::move_iterator dispatch should be // made according to the base iterator type. 
template struct move_iterator_dispatch_helper> { using type = It; }; template using iterator_tag_dispatch_impl = std::conditional_t, std::random_access_iterator_tag, std::conditional_t, std::forward_iterator_tag, std::input_iterator_tag>>; template using iterator_tag_dispatch = iterator_tag_dispatch_impl::type>; #else template using iterator_tag_dispatch = typename std::conditional< std::is_base_of>::value, std::random_access_iterator_tag, typename std::conditional< std::is_base_of>::value, std::forward_iterator_tag, std::input_iterator_tag >::type >::type; #endif // __TBB_CPP20_CONCEPTS_PRESENT template using feeder_is_required = tbb::detail::void_t(), std::declval::reference>(), std::declval&>()))>; // Creates feeder object only if the body can accept it template struct feeder_holder { feeder_holder( wait_context_vertex&, task_group_context&, const Body& ) {} feeder_impl* feeder_ptr() { return nullptr; } }; // class feeder_holder template class feeder_holder> { public: feeder_holder( wait_context_vertex& w_context, task_group_context& context, const Body& body ) : my_feeder(body, w_context, context) {} feeder_impl* feeder_ptr() { return &my_feeder; } private: feeder_impl my_feeder; }; // class feeder_holder template class for_each_root_task_base : public task { public: for_each_root_task_base(Iterator first, Iterator last, const Body& body, wait_context_vertex& w_context, task_group_context& e_context) : my_first(first), my_last(last), my_wait_context(w_context), my_execution_context(e_context), my_body(body), my_feeder_holder(my_wait_context, my_execution_context, my_body) { my_wait_context.reserve(); } private: task* cancel(execution_data&) override { this->my_wait_context.release(); return nullptr; } protected: Iterator my_first; Iterator my_last; wait_context_vertex& my_wait_context; task_group_context& my_execution_context; const Body& my_body; feeder_holder my_feeder_holder; }; // class for_each_root_task_base /** parallel_for_each algorithm root task - 
most generic version * Splits input range to blocks @ingroup algorithms **/ template > class for_each_root_task : public for_each_root_task_base { using base_type = for_each_root_task_base; public: using base_type::base_type; private: task* execute(execution_data& ed) override { using block_handling_type = input_block_handling_task; if (this->my_first == this->my_last) { this->my_wait_context.release(); return nullptr; } this->my_wait_context.reserve(); small_object_allocator alloc{}; auto block_handling_task = alloc.new_object(ed, this->my_wait_context, this->my_execution_context, this->my_body, this->my_feeder_holder.feeder_ptr(), alloc); auto* block_iterator = block_handling_task->block_iteration_space.begin(); for (; !(this->my_first == this->my_last) && block_handling_task->my_size < block_handling_type::max_block_size; ++this->my_first) { // Move semantics are automatically used when supported by the iterator new (block_iterator++) Item(*this->my_first); ++block_handling_task->my_size; } // Do not access this after spawn to avoid races spawn(*this, this->my_execution_context); return block_handling_task; } }; // class for_each_root_task - most generic implementation /** parallel_for_each algorithm root task - forward iterator based specialization * Splits input range to blocks @ingroup algorithms **/ template class for_each_root_task : public for_each_root_task_base { using base_type = for_each_root_task_base; public: using base_type::base_type; private: task* execute(execution_data& ed) override { using block_handling_type = forward_block_handling_task; if (this->my_first == this->my_last) { this->my_wait_context.release(); return nullptr; } std::size_t block_size{0}; Iterator first_block_element = this->my_first; for (; !(this->my_first == this->my_last) && block_size < block_handling_type::max_block_size; ++this->my_first) { ++block_size; } this->my_wait_context.reserve(); small_object_allocator alloc{}; auto block_handling_task = alloc.new_object(ed, 
first_block_element, block_size, this->my_wait_context, this->my_execution_context, this->my_body, this->my_feeder_holder.feeder_ptr(), alloc); // Do not access this after spawn to avoid races spawn(*this, this->my_execution_context); return block_handling_task; } }; // class for_each_root_task - forward iterator based specialization /** parallel_for_each algorithm root task - random access iterator based specialization * Splits input range to blocks @ingroup algorithms **/ template class for_each_root_task : public for_each_root_task_base { using base_type = for_each_root_task_base; public: using base_type::base_type; private: task* execute(execution_data&) override { tbb::parallel_for( tbb::blocked_range(0, std::distance(this->my_first, this->my_last)), parallel_for_body_wrapper(this->my_first, this->my_body, this->my_feeder_holder.feeder_ptr()) , this->my_execution_context ); this->my_wait_context.release(); return nullptr; } }; // class for_each_root_task - random access iterator based specialization /** Helper for getting item type. If item type can be deduced from feeder - got it from feeder, if feeder is generic - got item type from range. @ingroup algorithms */ template auto feeder_argument_parser(void (Body::*)(Item, feeder&) const) -> FeederArg; template decltype(feeder_argument_parser(&Body::operator())) get_item_type_impl(int); // for (T, feeder) template Item get_item_type_impl(...); // stub template using get_item_type = decltype(get_item_type_impl(0)); #if __TBB_CPP20_CONCEPTS_PRESENT template using feeder_item_type = std::remove_cvref_t>; template concept parallel_for_each_iterator_body = parallel_for_each_body, feeder_item_type>>; template concept parallel_for_each_range_body = parallel_for_each_body, feeder_item_type>>; #endif /** Implements parallel iteration over a range. 
@ingroup algorithms */ template void run_parallel_for_each( Iterator first, Iterator last, const Body& body, task_group_context& context) { if (!(first == last)) { using ItemType = get_item_type::value_type>; wait_context_vertex w_context(0); for_each_root_task root_task(first, last, body, w_context, context); execute_and_wait(root_task, context, w_context.get_context(), context); } } /** \page parallel_for_each_body_req Requirements on parallel_for_each body Class \c Body implementing the concept of parallel_for_each body must define: - \code B::operator()( cv_item_type item, feeder& feeder ) const OR B::operator()( cv_item_type& item ) const \endcode Process item. May be invoked concurrently for the same \c this but different \c item. - \code item_type( const item_type& ) \endcode Copy a work item. - \code ~item_type() \endcode Destroy a work item **/ /** \name parallel_for_each See also requirements on \ref parallel_for_each_body_req "parallel_for_each Body". **/ //@{ //! Parallel iteration over a range, with optional addition of more work. /** @ingroup algorithms */ template __TBB_requires(std::input_iterator && parallel_for_each_iterator_body) void parallel_for_each(Iterator first, Iterator last, const Body& body) { task_group_context context(PARALLEL_FOR_EACH); run_parallel_for_each(first, last, body, context); } template __TBB_requires(container_based_sequence && parallel_for_each_range_body) void parallel_for_each(Range& rng, const Body& body) { parallel_for_each(std::begin(rng), std::end(rng), body); } template __TBB_requires(container_based_sequence && parallel_for_each_range_body) void parallel_for_each(const Range& rng, const Body& body) { parallel_for_each(std::begin(rng), std::end(rng), body); } //! 
Parallel iteration over a range, with optional addition of more work and user-supplied context /** @ingroup algorithms */ template __TBB_requires(std::input_iterator && parallel_for_each_iterator_body) void parallel_for_each(Iterator first, Iterator last, const Body& body, task_group_context& context) { run_parallel_for_each(first, last, body, context); } template __TBB_requires(container_based_sequence && parallel_for_each_range_body) void parallel_for_each(Range& rng, const Body& body, task_group_context& context) { parallel_for_each(std::begin(rng), std::end(rng), body, context); } template __TBB_requires(container_based_sequence && parallel_for_each_range_body) void parallel_for_each(const Range& rng, const Body& body, task_group_context& context) { parallel_for_each(std::begin(rng), std::end(rng), body, context); } } // namespace d2 } // namespace detail //! @endcond //@} inline namespace v1 { using detail::d2::parallel_for_each; using detail::d1::feeder; } // namespace v1 } // namespace tbb #endif /* __TBB_parallel_for_each_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/parallel_invoke.h ================================================ /* Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_parallel_invoke_H #define __TBB_parallel_invoke_H #include "detail/_config.h" #include "detail/_namespace_injection.h" #include "detail/_exception.h" #include "detail/_task.h" #include "detail/_template_helpers.h" #include "detail/_small_object_pool.h" #include "task_group.h" #include #include #include namespace tbb { namespace detail { namespace d1 { //! Simple task object, executing user method template struct function_invoker : public task { function_invoker(const Function& function, WaitObject& wait_ctx) : my_function(function), parent_wait_ctx(wait_ctx) {} task* execute(execution_data& ed) override { my_function(); parent_wait_ctx.release(ed); call_itt_task_notify(destroy, this); return nullptr; } task* cancel(execution_data& ed) override { parent_wait_ctx.release(ed); return nullptr; } const Function& my_function; WaitObject& parent_wait_ctx; }; // struct function_invoker //! Task object for managing subroots in trinary task trees. // Endowed with additional synchronization logic (compatible with wait object interfaces) to support // continuation passing execution. This task spawns 2 function_invoker tasks with first and second functors // and then executes first functor by itself. But only the last executed functor must destruct and deallocate // the subroot task. 
template struct invoke_subroot_task : public task { wait_context& root_wait_ctx; std::atomic ref_count{0}; bool child_spawned = false; const F1& self_invoked_functor; function_invoker> f2_invoker; function_invoker> f3_invoker; task_group_context& my_execution_context; small_object_allocator my_allocator; invoke_subroot_task(const F1& f1, const F2& f2, const F3& f3, wait_context& wait_ctx, task_group_context& context, small_object_allocator& alloc) : root_wait_ctx(wait_ctx), self_invoked_functor(f1), f2_invoker(f2, *this), f3_invoker(f3, *this), my_execution_context(context), my_allocator(alloc) { root_wait_ctx.reserve(); } void finalize(const execution_data& ed) { root_wait_ctx.release(); my_allocator.delete_object(this, ed); } void release(const execution_data& ed) { __TBB_ASSERT(ref_count > 0, nullptr); call_itt_task_notify(releasing, this); if( --ref_count == 0 ) { call_itt_task_notify(acquired, this); finalize(ed); } } task* execute(execution_data& ed) override { ref_count.fetch_add(3, std::memory_order_relaxed); spawn(f3_invoker, my_execution_context); spawn(f2_invoker, my_execution_context); self_invoked_functor(); release(ed); return nullptr; } task* cancel(execution_data& ed) override { if( ref_count > 0 ) { // detect children spawn release(ed); } else { finalize(ed); } return nullptr; } }; // struct subroot_task class invoke_root_task { public: invoke_root_task(wait_context& wc) : my_wait_context(wc) {} void release(const execution_data&) { my_wait_context.release(); } private: wait_context& my_wait_context; }; template void invoke_recursive_separation(wait_context& root_wait_ctx, task_group_context& context, const F1& f1) { root_wait_ctx.reserve(1); invoke_root_task root(root_wait_ctx); function_invoker invoker1(f1, root); execute_and_wait(invoker1, context, root_wait_ctx, context); } template void invoke_recursive_separation(wait_context& root_wait_ctx, task_group_context& context, const F1& f1, const F2& f2) { root_wait_ctx.reserve(2); invoke_root_task 
root(root_wait_ctx); function_invoker invoker1(f1, root); function_invoker invoker2(f2, root); spawn(invoker1, context); execute_and_wait(invoker2, context, root_wait_ctx, context); } template void invoke_recursive_separation(wait_context& root_wait_ctx, task_group_context& context, const F1& f1, const F2& f2, const F3& f3) { root_wait_ctx.reserve(3); invoke_root_task root(root_wait_ctx); function_invoker invoker1(f1, root); function_invoker invoker2(f2, root); function_invoker invoker3(f3, root); //TODO: implement sub root for two tasks (measure performance) spawn(invoker1, context); spawn(invoker2, context); execute_and_wait(invoker3, context, root_wait_ctx, context); } template void invoke_recursive_separation(wait_context& root_wait_ctx, task_group_context& context, const F1& f1, const F2& f2, const F3& f3, const Fs&... fs) { small_object_allocator alloc{}; auto sub_root = alloc.new_object>(f1, f2, f3, root_wait_ctx, context, alloc); spawn(*sub_root, context); invoke_recursive_separation(root_wait_ctx, context, fs...); } template void parallel_invoke_impl(task_group_context& context, const Fs&... fs) { static_assert(sizeof...(Fs) >= 2, "Parallel invoke may be called with at least two callable"); wait_context root_wait_ctx{0}; invoke_recursive_separation(root_wait_ctx, context, fs...); } template void parallel_invoke_impl(const F1& f1, const Fs&... fs) { static_assert(sizeof...(Fs) >= 1, "Parallel invoke may be called with at least two callable"); task_group_context context(PARALLEL_INVOKE); wait_context root_wait_ctx{0}; invoke_recursive_separation(root_wait_ctx, context, fs..., f1); } //! Passes last argument of variadic pack as first for handling user provided task_group_context template struct invoke_helper; template struct invoke_helper, T, Fs...> : invoke_helper, Fs...> {}; template struct invoke_helper, T> { void operator()(Fs&&... args, T&& t) { parallel_invoke_impl(std::forward(t), std::forward(args)...); } }; //! 
Parallel execution of several function objects // We need to pass parameter pack through forwarding reference, // since this pack may contain task_group_context that must be passed via lvalue non-const reference template void parallel_invoke(Fs&&... fs) { invoke_helper, Fs...>()(std::forward(fs)...); } } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::parallel_invoke; } // namespace v1 } // namespace tbb #endif /* __TBB_parallel_invoke_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/parallel_pipeline.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_parallel_pipeline_H #define __TBB_parallel_pipeline_H #include "detail/_pipeline_filters.h" #include "detail/_config.h" #include "detail/_namespace_injection.h" #include "task_group.h" #include #include #include namespace tbb { namespace detail { namespace r1 { TBB_EXPORT void __TBB_EXPORTED_FUNC parallel_pipeline(task_group_context&, std::size_t, const d1::filter_node&); } namespace d1 { enum class filter_mode : unsigned int { //! processes multiple items in parallel and in no particular order parallel = base_filter::filter_is_out_of_order, //! processes items one at a time; all such filters process items in the same order serial_in_order = base_filter::filter_is_serial, //! 
processes items one at a time and in no particular order serial_out_of_order = base_filter::filter_is_serial | base_filter::filter_is_out_of_order }; //! Class representing a chain of type-safe pipeline filters /** @ingroup algorithms */ template class filter { filter_node_ptr my_root; filter( filter_node_ptr root ) : my_root(root) {} friend void parallel_pipeline( size_t, const filter&, task_group_context& ); template friend filter make_filter( filter_mode, const Body& ); template friend filter operator&( const filter&, const filter& ); public: filter() = default; filter( const filter& rhs ) : my_root(rhs.my_root) {} filter( filter&& rhs ) : my_root(std::move(rhs.my_root)) {} void operator=(const filter& rhs) { my_root = rhs.my_root; } void operator=( filter&& rhs ) { my_root = std::move(rhs.my_root); } template filter( filter_mode mode, const Body& body ) : my_root( new(r1::allocate_memory(sizeof(filter_node_leaf))) filter_node_leaf(static_cast(mode), body) ) { } filter& operator&=( const filter& right ) { *this = *this & right; return *this; } void clear() { // Like operator= with filter() on right side. my_root = nullptr; } }; //! Create a filter to participate in parallel_pipeline /** @ingroup algorithms */ template filter make_filter( filter_mode mode, const Body& body ) { return filter_node_ptr( new(r1::allocate_memory(sizeof(filter_node_leaf))) filter_node_leaf(static_cast(mode), body) ); } //! Create a filter to participate in parallel_pipeline /** @ingroup algorithms */ template filter, filter_output> make_filter( filter_mode mode, const Body& body ) { return make_filter, filter_output>(mode, body); } //! Composition of filters left and right. 
/** @ingroup algorithms */ template filter operator&( const filter& left, const filter& right ) { __TBB_ASSERT(left.my_root,"cannot use default-constructed filter as left argument of '&'"); __TBB_ASSERT(right.my_root,"cannot use default-constructed filter as right argument of '&'"); return filter_node_ptr( new (r1::allocate_memory(sizeof(filter_node))) filter_node(left.my_root,right.my_root) ); } #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT template filter(filter_mode, Body) ->filter, filter_output>; #endif // __TBB_CPP17_DEDUCTION_GUIDES_PRESENT //! Parallel pipeline over chain of filters with user-supplied context. /** @ingroup algorithms **/ inline void parallel_pipeline(size_t max_number_of_live_tokens, const filter& filter_chain, task_group_context& context) { r1::parallel_pipeline(context, max_number_of_live_tokens, *filter_chain.my_root); } //! Parallel pipeline over chain of filters. /** @ingroup algorithms **/ inline void parallel_pipeline(size_t max_number_of_live_tokens, const filter& filter_chain) { task_group_context context; parallel_pipeline(max_number_of_live_tokens, filter_chain, context); } //! Parallel pipeline over sequence of filters. /** @ingroup algorithms **/ template void parallel_pipeline(size_t max_number_of_live_tokens, const F1& filter1, const F2& filter2, FiltersContext&&... 
filters) { parallel_pipeline(max_number_of_live_tokens, filter1 & filter2, std::forward(filters)...); } } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::parallel_pipeline; using detail::d1::filter; using detail::d1::make_filter; using detail::d1::filter_mode; using detail::d1::flow_control; } } // tbb #endif /* __TBB_parallel_pipeline_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/parallel_reduce.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_parallel_reduce_H #define __TBB_parallel_reduce_H #include #include "detail/_namespace_injection.h" #include "detail/_task.h" #include "detail/_aligned_space.h" #include "detail/_small_object_pool.h" #include "detail/_range_common.h" #include "task_group.h" // task_group_context #include "partitioner.h" #include "profiling.h" namespace tbb { namespace detail { #if __TBB_CPP20_CONCEPTS_PRESENT inline namespace d0 { template concept parallel_reduce_body = splittable && requires( Body& body, const Range& range, Body& rhs ) { body(range); body.join(rhs); }; template concept parallel_reduce_function = std::invocable&, const Range&, Value&&> && std::convertible_to&, const Range&, Value&&>, Value>; template concept parallel_reduce_combine = std::invocable&, Value&&, Value&&> && std::convertible_to&, Value&&, Value&&>, Value>; } // namespace d0 #endif // __TBB_CPP20_CONCEPTS_PRESENT namespace d1 { //! Tree node type for parallel_reduce. /** @ingroup algorithms */ //TODO: consider folding tree via bypass execution(instead of manual folding) // for better cancellation and critical tasks handling (performance measurements required). template struct reduction_tree_node : public tree_node { tbb::detail::aligned_space zombie_space; Body& left_body; bool has_right_zombie{false}; reduction_tree_node(node* parent, int ref_count, Body& input_left_body, small_object_allocator& alloc) : tree_node{parent, ref_count, alloc}, left_body(input_left_body) /* gcc4.8 bug - braced-initialization doesn't work for class members of reference type */ {} void join(task_group_context* context) { if (has_right_zombie && !context->is_group_execution_cancelled()) left_body.join(*zombie_space.begin()); } ~reduction_tree_node() { if( has_right_zombie ) zombie_space.begin()->~Body(); } }; //! Task type used to split the work of parallel_reduce. 
/** @ingroup algorithms */ template struct start_reduce : public task { Range my_range; Body* my_body; node* my_parent; typename Partitioner::task_partition_type my_partition; small_object_allocator my_allocator; bool is_right_child; task* execute(execution_data&) override; task* cancel(execution_data&) override; void finalize(const execution_data&); using tree_node_type = reduction_tree_node; //! Constructor reduce root task. start_reduce( const Range& range, Body& body, Partitioner& partitioner, small_object_allocator& alloc ) : my_range(range), my_body(&body), my_parent(nullptr), my_partition(partitioner), my_allocator(alloc), is_right_child(false) {} //! Splitting constructor used to generate children. /** parent_ becomes left child. Newly constructed object is right child. */ start_reduce( start_reduce& parent_, typename Partitioner::split_type& split_obj, small_object_allocator& alloc ) : my_range(parent_.my_range, get_range_split_object(split_obj)), my_body(parent_.my_body), my_parent(nullptr), my_partition(parent_.my_partition, split_obj), my_allocator(alloc), is_right_child(true) { parent_.is_right_child = false; } //! Construct right child from the given range as response to the demand. /** parent_ remains left child. Newly constructed object is right child. 
*/ start_reduce( start_reduce& parent_, const Range& r, depth_t d, small_object_allocator& alloc ) : my_range(r), my_body(parent_.my_body), my_parent(nullptr), my_partition(parent_.my_partition, split()), my_allocator(alloc), is_right_child(true) { my_partition.align_depth( d ); parent_.is_right_child = false; } static void run(const Range& range, Body& body, Partitioner& partitioner, task_group_context& context) { if ( !range.empty() ) { wait_node wn; small_object_allocator alloc{}; auto reduce_task = alloc.new_object(range, body, partitioner, alloc); reduce_task->my_parent = &wn; execute_and_wait(*reduce_task, context, wn.m_wait, context); } } static void run(const Range& range, Body& body, Partitioner& partitioner) { // Bound context prevents exceptions from body to affect nesting or sibling algorithms, // and allows users to handle exceptions safely by wrapping parallel_reduce in the try-block. task_group_context context(PARALLEL_REDUCE); run(range, body, partitioner, context); } //! Run body for range, serves as callback for partitioner void run_body( Range &r ) { tbb::detail::invoke(*my_body, r); } //! spawn right task, serves as callback for partitioner void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) { offer_work_impl(ed, *this, split_obj); } //! spawn right task, serves as callback for partitioner void offer_work(const Range& r, depth_t d, execution_data& ed) { offer_work_impl(ed, *this, r, d); } private: template void offer_work_impl(execution_data& ed, Args&&... args) { small_object_allocator alloc{}; // New right child auto right_child = alloc.new_object(ed, std::forward(args)..., alloc); // New root node as a continuation and ref count. Left and right child attach to the new parent. right_child->my_parent = my_parent = alloc.new_object(ed, my_parent, 2, *my_body, alloc); // Spawn the right sibling right_child->spawn_self(ed); } void spawn_self(execution_data& ed) { my_partition.spawn_task(*this, *context(ed)); } }; //! 
fold the tree and deallocate the task template void start_reduce::finalize(const execution_data& ed) { // Get the current parent and wait object before an object destruction node* parent = my_parent; auto allocator = my_allocator; // Task execution finished - destroy it this->~start_reduce(); // Unwind the tree decrementing the parent`s reference count fold_tree(parent, ed); allocator.deallocate(this, ed); } //! Execute parallel_reduce task template task* start_reduce::execute(execution_data& ed) { if (!is_same_affinity(ed)) { my_partition.note_affinity(execution_slot(ed)); } my_partition.check_being_stolen(*this, ed); // The acquire barrier synchronizes the data pointed with my_body if the left // task has already finished. __TBB_ASSERT(my_parent, nullptr); if( is_right_child && my_parent->m_ref_count.load(std::memory_order_acquire) == 2 ) { tree_node_type* parent_ptr = static_cast(my_parent); my_body = static_cast(new( parent_ptr->zombie_space.begin() ) Body(*my_body, split())); parent_ptr->has_right_zombie = true; } __TBB_ASSERT(my_body != nullptr, "Incorrect body value"); my_partition.execute(*this, my_range, ed); finalize(ed); return nullptr; } //! Cancel parallel_reduce task template task* start_reduce::cancel(execution_data& ed) { finalize(ed); return nullptr; } //! Tree node type for parallel_deterministic_reduce. /** @ingroup algorithms */ template struct deterministic_reduction_tree_node : public tree_node { Body right_body; Body& left_body; deterministic_reduction_tree_node(node* parent, int ref_count, Body& input_left_body, small_object_allocator& alloc) : tree_node{parent, ref_count, alloc}, right_body{input_left_body, detail::split()}, left_body(input_left_body) {} void join(task_group_context* context) { if (!context->is_group_execution_cancelled()) left_body.join(right_body); } }; //! Task type used to split the work of parallel_deterministic_reduce. 
/** @ingroup algorithms */ template struct start_deterministic_reduce : public task { Range my_range; Body& my_body; node* my_parent; typename Partitioner::task_partition_type my_partition; small_object_allocator my_allocator; task* execute(execution_data&) override; task* cancel(execution_data&) override; void finalize(const execution_data&); using tree_node_type = deterministic_reduction_tree_node; //! Constructor deterministic_reduce root task. start_deterministic_reduce( const Range& range, Partitioner& partitioner, Body& body, small_object_allocator& alloc ) : my_range(range), my_body(body), my_parent(nullptr), my_partition(partitioner), my_allocator(alloc) {} //! Splitting constructor used to generate children. /** parent_ becomes left child. Newly constructed object is right child. */ start_deterministic_reduce( start_deterministic_reduce& parent_, typename Partitioner::split_type& split_obj, Body& body, small_object_allocator& alloc ) : my_range(parent_.my_range, get_range_split_object(split_obj)), my_body(body), my_parent(nullptr), my_partition(parent_.my_partition, split_obj), my_allocator(alloc) {} static void run(const Range& range, Body& body, Partitioner& partitioner, task_group_context& context) { if ( !range.empty() ) { wait_node wn; small_object_allocator alloc{}; auto deterministic_reduce_task = alloc.new_object(range, partitioner, body, alloc); deterministic_reduce_task->my_parent = &wn; execute_and_wait(*deterministic_reduce_task, context, wn.m_wait, context); } } static void run(const Range& range, Body& body, Partitioner& partitioner) { // Bound context prevents exceptions from body to affect nesting or sibling algorithms, // and allows users to handle exceptions safely by wrapping parallel_deterministic_reduce // in the try-block. task_group_context context(PARALLEL_REDUCE); run(range, body, partitioner, context); } //! Run body for range, serves as callback for partitioner void run_body( Range &r ) { tbb::detail::invoke(my_body, r); } //! 
Spawn right task, serves as callback for partitioner void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) { offer_work_impl(ed, *this, split_obj); } private: template void offer_work_impl(execution_data& ed, Args&&... args) { small_object_allocator alloc{}; // New root node as a continuation and ref count. Left and right child attach to the new parent. Split the body. auto new_tree_node = alloc.new_object(ed, my_parent, 2, my_body, alloc); // New right child auto right_child = alloc.new_object(ed, std::forward(args)..., new_tree_node->right_body, alloc); right_child->my_parent = my_parent = new_tree_node; // Spawn the right sibling right_child->spawn_self(ed); } void spawn_self(execution_data& ed) { my_partition.spawn_task(*this, *context(ed)); } }; //! Fold the tree and deallocate the task template void start_deterministic_reduce::finalize(const execution_data& ed) { // Get the current parent and wait object before an object destruction node* parent = my_parent; auto allocator = my_allocator; // Task execution finished - destroy it this->~start_deterministic_reduce(); // Unwind the tree decrementing the parent`s reference count fold_tree(parent, ed); allocator.deallocate(this, ed); } //! Execute parallel_deterministic_reduce task template task* start_deterministic_reduce::execute(execution_data& ed) { if (!is_same_affinity(ed)) { my_partition.note_affinity(execution_slot(ed)); } my_partition.check_being_stolen(*this, ed); my_partition.execute(*this, my_range, ed); finalize(ed); return nullptr; } //! Cancel parallel_deterministic_reduce task template task* start_deterministic_reduce::cancel(execution_data& ed) { finalize(ed); return nullptr; } //! Auxiliary class for parallel_reduce; for internal use only. /** The adaptor class that implements \ref parallel_reduce_body_req "parallel_reduce Body" using given \ref parallel_reduce_lambda_req "anonymous function objects". 
**/ /** @ingroup algorithms */ template class lambda_reduce_body { //TODO: decide if my_real_body, my_reduction, and my_identity_element should be copied or referenced // (might require some performance measurements) const Value& my_identity_element; const RealBody& my_real_body; const Reduction& my_reduction; Value my_value; lambda_reduce_body& operator= ( const lambda_reduce_body& other ); public: lambda_reduce_body( const Value& identity, const RealBody& body, const Reduction& reduction ) : my_identity_element(identity) , my_real_body(body) , my_reduction(reduction) , my_value(identity) { } lambda_reduce_body( const lambda_reduce_body& other ) = default; lambda_reduce_body( lambda_reduce_body& other, tbb::split ) : my_identity_element(other.my_identity_element) , my_real_body(other.my_real_body) , my_reduction(other.my_reduction) , my_value(other.my_identity_element) { } void operator()(Range& range) { my_value = tbb::detail::invoke(my_real_body, range, std::move(my_value)); } void join( lambda_reduce_body& rhs ) { my_value = tbb::detail::invoke(my_reduction, std::move(my_value), std::move(rhs.my_value)); } __TBB_nodiscard Value&& result() && noexcept { return std::move(my_value); } }; // Requirements on Range concept are documented in blocked_range.h /** \page parallel_reduce_body_req Requirements on parallel_reduce body Class \c Body implementing the concept of parallel_reduce body must define: - \code Body::Body( Body&, split ); \endcode Splitting constructor. Must be able to run concurrently with operator() and method \c join - \code Body::~Body(); \endcode Destructor - \code void Body::operator()( Range& r ); \endcode Function call operator applying body to range \c r and accumulating the result - \code void Body::join( Body& b ); \endcode Join results. 
The result in \c b should be merged into the result of \c this **/ /** \page parallel_reduce_lambda_req Requirements on parallel_reduce anonymous function objects (lambda functions) TO BE DOCUMENTED **/ /** \name parallel_reduce See also requirements on \ref range_req "Range" and \ref parallel_reduce_body_req "parallel_reduce Body". **/ //@{ //! Parallel iteration with reduction and default partitioner. /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_body) void parallel_reduce( const Range& range, Body& body ) { start_reduce::run( range, body, __TBB_DEFAULT_PARTITIONER() ); } //! Parallel iteration with reduction and simple_partitioner /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_body) void parallel_reduce( const Range& range, Body& body, const simple_partitioner& partitioner ) { start_reduce::run( range, body, partitioner ); } //! Parallel iteration with reduction and auto_partitioner /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_body) void parallel_reduce( const Range& range, Body& body, const auto_partitioner& partitioner ) { start_reduce::run( range, body, partitioner ); } //! Parallel iteration with reduction and static_partitioner /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_body) void parallel_reduce( const Range& range, Body& body, const static_partitioner& partitioner ) { start_reduce::run( range, body, partitioner ); } //! Parallel iteration with reduction and affinity_partitioner /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_body) void parallel_reduce( const Range& range, Body& body, affinity_partitioner& partitioner ) { start_reduce::run( range, body, partitioner ); } //! Parallel iteration with reduction, default partitioner and user-supplied context. 
/** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_body) void parallel_reduce( const Range& range, Body& body, task_group_context& context ) { start_reduce::run( range, body, __TBB_DEFAULT_PARTITIONER(), context ); } //! Parallel iteration with reduction, simple partitioner and user-supplied context. /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_body) void parallel_reduce( const Range& range, Body& body, const simple_partitioner& partitioner, task_group_context& context ) { start_reduce::run( range, body, partitioner, context ); } //! Parallel iteration with reduction, auto_partitioner and user-supplied context /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_body) void parallel_reduce( const Range& range, Body& body, const auto_partitioner& partitioner, task_group_context& context ) { start_reduce::run( range, body, partitioner, context ); } //! Parallel iteration with reduction, static_partitioner and user-supplied context /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_body) void parallel_reduce( const Range& range, Body& body, const static_partitioner& partitioner, task_group_context& context ) { start_reduce::run( range, body, partitioner, context ); } //! Parallel iteration with reduction, affinity_partitioner and user-supplied context /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_body) void parallel_reduce( const Range& range, Body& body, affinity_partitioner& partitioner, task_group_context& context ) { start_reduce::run( range, body, partitioner, context ); } /** parallel_reduce overloads that work with anonymous function objects (see also \ref parallel_reduce_lambda_req "requirements on parallel_reduce anonymous function objects"). **/ //! Parallel iteration with reduction and default partitioner. 
/** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_function && parallel_reduce_combine) Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction ) { lambda_reduce_body body(identity, real_body, reduction); start_reduce,const __TBB_DEFAULT_PARTITIONER> ::run(range, body, __TBB_DEFAULT_PARTITIONER() ); return std::move(body).result(); } //! Parallel iteration with reduction and simple_partitioner. /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_function && parallel_reduce_combine) Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, const simple_partitioner& partitioner ) { lambda_reduce_body body(identity, real_body, reduction); start_reduce,const simple_partitioner> ::run(range, body, partitioner ); return std::move(body).result(); } //! Parallel iteration with reduction and auto_partitioner /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_function && parallel_reduce_combine) Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, const auto_partitioner& partitioner ) { lambda_reduce_body body(identity, real_body, reduction); start_reduce,const auto_partitioner> ::run( range, body, partitioner ); return std::move(body).result(); } //! Parallel iteration with reduction and static_partitioner /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_function && parallel_reduce_combine) Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, const static_partitioner& partitioner ) { lambda_reduce_body body(identity, real_body, reduction); start_reduce,const static_partitioner> ::run( range, body, partitioner ); return std::move(body).result(); } //! 
Parallel iteration with reduction and affinity_partitioner /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_function && parallel_reduce_combine) Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, affinity_partitioner& partitioner ) { lambda_reduce_body body(identity, real_body, reduction); start_reduce,affinity_partitioner> ::run( range, body, partitioner ); return std::move(body).result(); } //! Parallel iteration with reduction, default partitioner and user-supplied context. /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_function && parallel_reduce_combine) Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, task_group_context& context ) { lambda_reduce_body body(identity, real_body, reduction); start_reduce,const __TBB_DEFAULT_PARTITIONER> ::run( range, body, __TBB_DEFAULT_PARTITIONER(), context ); return std::move(body).result(); } //! Parallel iteration with reduction, simple partitioner and user-supplied context. /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_function && parallel_reduce_combine) Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, const simple_partitioner& partitioner, task_group_context& context ) { lambda_reduce_body body(identity, real_body, reduction); start_reduce,const simple_partitioner> ::run( range, body, partitioner, context ); return std::move(body).result(); } //! 
Parallel iteration with reduction, auto_partitioner and user-supplied context /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_function && parallel_reduce_combine) Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, const auto_partitioner& partitioner, task_group_context& context ) { lambda_reduce_body body(identity, real_body, reduction); start_reduce,const auto_partitioner> ::run( range, body, partitioner, context ); return std::move(body).result(); } //! Parallel iteration with reduction, static_partitioner and user-supplied context /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_function && parallel_reduce_combine) Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, const static_partitioner& partitioner, task_group_context& context ) { lambda_reduce_body body(identity, real_body, reduction); start_reduce,const static_partitioner> ::run( range, body, partitioner, context ); return std::move(body).result(); } //! Parallel iteration with reduction, affinity_partitioner and user-supplied context /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_function && parallel_reduce_combine) Value parallel_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, affinity_partitioner& partitioner, task_group_context& context ) { lambda_reduce_body body(identity, real_body, reduction); start_reduce,affinity_partitioner> ::run( range, body, partitioner, context ); return std::move(body).result(); } //! Parallel iteration with deterministic reduction and default simple partitioner. 
/** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_body) void parallel_deterministic_reduce( const Range& range, Body& body ) { start_deterministic_reduce::run(range, body, simple_partitioner()); } //! Parallel iteration with deterministic reduction and simple partitioner. /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_body) void parallel_deterministic_reduce( const Range& range, Body& body, const simple_partitioner& partitioner ) { start_deterministic_reduce::run(range, body, partitioner); } //! Parallel iteration with deterministic reduction and static partitioner. /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_body) void parallel_deterministic_reduce( const Range& range, Body& body, const static_partitioner& partitioner ) { start_deterministic_reduce::run(range, body, partitioner); } //! Parallel iteration with deterministic reduction, default simple partitioner and user-supplied context. /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_body) void parallel_deterministic_reduce( const Range& range, Body& body, task_group_context& context ) { start_deterministic_reduce::run( range, body, simple_partitioner(), context ); } //! Parallel iteration with deterministic reduction, simple partitioner and user-supplied context. /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_body) void parallel_deterministic_reduce( const Range& range, Body& body, const simple_partitioner& partitioner, task_group_context& context ) { start_deterministic_reduce::run(range, body, partitioner, context); } //! Parallel iteration with deterministic reduction, static partitioner and user-supplied context. 
/** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_body) void parallel_deterministic_reduce( const Range& range, Body& body, const static_partitioner& partitioner, task_group_context& context ) { start_deterministic_reduce::run(range, body, partitioner, context); } /** parallel_reduce overloads that work with anonymous function objects (see also \ref parallel_reduce_lambda_req "requirements on parallel_reduce anonymous function objects"). **/ //! Parallel iteration with deterministic reduction and default simple partitioner. // TODO: consider making static_partitioner the default /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_function && parallel_reduce_combine) Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction ) { return parallel_deterministic_reduce(range, identity, real_body, reduction, simple_partitioner()); } //! Parallel iteration with deterministic reduction and simple partitioner. /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_function && parallel_reduce_combine) Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, const simple_partitioner& partitioner ) { lambda_reduce_body body(identity, real_body, reduction); start_deterministic_reduce, const simple_partitioner> ::run(range, body, partitioner); return std::move(body).result(); } //! Parallel iteration with deterministic reduction and static partitioner. 
/** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_function && parallel_reduce_combine) Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, const static_partitioner& partitioner ) { lambda_reduce_body body(identity, real_body, reduction); start_deterministic_reduce, const static_partitioner> ::run(range, body, partitioner); return std::move(body).result(); } //! Parallel iteration with deterministic reduction, default simple partitioner and user-supplied context. /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_function && parallel_reduce_combine) Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, task_group_context& context ) { return parallel_deterministic_reduce(range, identity, real_body, reduction, simple_partitioner(), context); } //! Parallel iteration with deterministic reduction, simple partitioner and user-supplied context. /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_function && parallel_reduce_combine) Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, const simple_partitioner& partitioner, task_group_context& context ) { lambda_reduce_body body(identity, real_body, reduction); start_deterministic_reduce, const simple_partitioner> ::run(range, body, partitioner, context); return std::move(body).result(); } //! Parallel iteration with deterministic reduction, static partitioner and user-supplied context. 
/** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_reduce_function && parallel_reduce_combine) Value parallel_deterministic_reduce( const Range& range, const Value& identity, const RealBody& real_body, const Reduction& reduction, const static_partitioner& partitioner, task_group_context& context ) { lambda_reduce_body body(identity, real_body, reduction); start_deterministic_reduce, const static_partitioner> ::run(range, body, partitioner, context); return std::move(body).result(); } //@} } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::parallel_reduce; using detail::d1::parallel_deterministic_reduce; // Split types using detail::split; using detail::proportional_split; } // namespace v1 } // namespace tbb #endif /* __TBB_parallel_reduce_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/parallel_scan.h ================================================ /* Copyright (c) 2005-2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_parallel_scan_H #define __TBB_parallel_scan_H #include #include "detail/_config.h" #include "detail/_namespace_injection.h" #include "detail/_exception.h" #include "detail/_task.h" #include "profiling.h" #include "partitioner.h" #include "blocked_range.h" #include "task_group.h" namespace tbb { namespace detail { namespace d1 { //! Used to indicate that the initial scan is being performed. 
/** @ingroup algorithms */ struct pre_scan_tag { static bool is_final_scan() {return false;} operator bool() {return is_final_scan();} }; //! Used to indicate that the final scan is being performed. /** @ingroup algorithms */ struct final_scan_tag { static bool is_final_scan() {return true;} operator bool() {return is_final_scan();} }; template struct sum_node; #if __TBB_CPP20_CONCEPTS_PRESENT } // namespace d1 inline namespace d0 { template concept parallel_scan_body = splittable && requires( Body& body, const Range& range, Body& other ) { body(range, tbb::detail::d1::pre_scan_tag{}); body(range, tbb::detail::d1::final_scan_tag{}); body.reverse_join(other); body.assign(other); }; template concept parallel_scan_function = std::invocable&, const Range&, const Value&, bool> && std::convertible_to&, const Range&, const Value&, bool>, Value>; template concept parallel_scan_combine = std::invocable&, const Value&, const Value&> && std::convertible_to&, const Value&, const Value&>, Value>; } // namespace d0 namespace d1 { #endif // __TBB_CPP20_CONCEPTS_PRESENT //! Performs final scan for a leaf /** @ingroup algorithms */ template struct final_sum : public task { private: using sum_node_type = sum_node; Body m_body; aligned_space m_range; //! Where to put result of last subrange, or nullptr if not last subrange. 
Body* m_stuff_last; wait_context& m_wait_context; sum_node_type* m_parent = nullptr; public: small_object_allocator m_allocator; final_sum( Body& body, wait_context& w_o, small_object_allocator& alloc ) : m_body(body, split()), m_wait_context(w_o), m_allocator(alloc) { poison_pointer(m_stuff_last); } final_sum( final_sum& sum, small_object_allocator& alloc ) : m_body(sum.m_body, split()), m_wait_context(sum.m_wait_context), m_allocator(alloc) { poison_pointer(m_stuff_last); } ~final_sum() { m_range.begin()->~Range(); } void finish_construction( sum_node_type* parent, const Range& range, Body* stuff_last ) { __TBB_ASSERT( m_parent == nullptr, nullptr ); m_parent = parent; new( m_range.begin() ) Range(range); m_stuff_last = stuff_last; } private: sum_node_type* release_parent() { call_itt_task_notify(releasing, m_parent); if (m_parent) { auto parent = m_parent; m_parent = nullptr; if (parent->ref_count.fetch_sub(1) == 1) { return parent; } } else m_wait_context.release(); return nullptr; } sum_node_type* finalize(const execution_data& ed){ sum_node_type* next_task = release_parent(); m_allocator.delete_object(this, ed); return next_task; } public: task* execute(execution_data& ed) override { m_body( *m_range.begin(), final_scan_tag() ); if( m_stuff_last ) m_stuff_last->assign(m_body); return finalize(ed); } task* cancel(execution_data& ed) override { return finalize(ed); } template void operator()( const Range& r, Tag tag ) { m_body( r, tag ); } void reverse_join( final_sum& a ) { m_body.reverse_join(a.m_body); } void reverse_join( Body& body ) { m_body.reverse_join(body); } void assign_to( Body& body ) { body.assign(m_body); } void self_destroy(const execution_data& ed) { m_allocator.delete_object(this, ed); } }; //! Split work to be done in the scan. 
/** @ingroup algorithms */ template struct sum_node : public task { private: using final_sum_type = final_sum; public: final_sum_type *m_incoming; final_sum_type *m_body; Body *m_stuff_last; private: final_sum_type *m_left_sum; sum_node *m_left; sum_node *m_right; bool m_left_is_final; Range m_range; wait_context& m_wait_context; sum_node* m_parent; small_object_allocator m_allocator; public: std::atomic ref_count{0}; sum_node( const Range range, bool left_is_final_, sum_node* parent, wait_context& w_o, small_object_allocator& alloc ) : m_stuff_last(nullptr), m_left_sum(nullptr), m_left(nullptr), m_right(nullptr), m_left_is_final(left_is_final_), m_range(range), m_wait_context(w_o), m_parent(parent), m_allocator(alloc) { if( m_parent ) m_parent->ref_count.fetch_add(1); // Poison fields that will be set by second pass. poison_pointer(m_body); poison_pointer(m_incoming); } ~sum_node() { if (m_parent) m_parent->ref_count.fetch_sub(1); } private: sum_node* release_parent() { call_itt_task_notify(releasing, m_parent); if (m_parent) { auto parent = m_parent; m_parent = nullptr; if (parent->ref_count.fetch_sub(1) == 1) { return parent; } } else m_wait_context.release(); return nullptr; } task* create_child( const Range& range, final_sum_type& body, sum_node* child, final_sum_type* incoming, Body* stuff_last ) { if( child ) { __TBB_ASSERT( is_poisoned(child->m_body) && is_poisoned(child->m_incoming), nullptr ); child->prepare_for_execution(body, incoming, stuff_last); return child; } else { body.finish_construction(this, range, stuff_last); return &body; } } sum_node* finalize(const execution_data& ed) { sum_node* next_task = release_parent(); m_allocator.delete_object(this, ed); return next_task; } public: void prepare_for_execution(final_sum_type& body, final_sum_type* incoming, Body *stuff_last) { this->m_body = &body; this->m_incoming = incoming; this->m_stuff_last = stuff_last; } task* execute(execution_data& ed) override { if( m_body ) { if( m_incoming ) 
m_left_sum->reverse_join( *m_incoming ); task* right_child = this->create_child(Range(m_range,split()), *m_left_sum, m_right, m_left_sum, m_stuff_last); task* left_child = m_left_is_final ? nullptr : this->create_child(m_range, *m_body, m_left, m_incoming, nullptr); ref_count = (left_child != nullptr) + (right_child != nullptr); m_body = nullptr; if( left_child ) { spawn(*right_child, *ed.context); return left_child; } else { return right_child; } } else { return finalize(ed); } } task* cancel(execution_data& ed) override { return finalize(ed); } void self_destroy(const execution_data& ed) { m_allocator.delete_object(this, ed); } template friend struct start_scan; template friend struct finish_scan; }; //! Combine partial results /** @ingroup algorithms */ template struct finish_scan : public task { private: using sum_node_type = sum_node; using final_sum_type = final_sum; final_sum_type** const m_sum_slot; sum_node_type*& m_return_slot; small_object_allocator m_allocator; public: std::atomic m_right_zombie; sum_node_type& m_result; std::atomic ref_count{2}; finish_scan* m_parent; wait_context& m_wait_context; task* execute(execution_data& ed) override { __TBB_ASSERT( m_result.ref_count.load() == static_cast((m_result.m_left!=nullptr)+(m_result.m_right!=nullptr)), nullptr ); if( m_result.m_left ) m_result.m_left_is_final = false; final_sum_type* right_zombie = m_right_zombie.load(std::memory_order_acquire); if( right_zombie && m_sum_slot ) (*m_sum_slot)->reverse_join(*m_result.m_left_sum); __TBB_ASSERT( !m_return_slot, nullptr ); if( right_zombie || m_result.m_right ) { m_return_slot = &m_result; } else { m_result.self_destroy(ed); } if( right_zombie && !m_sum_slot && !m_result.m_right ) { right_zombie->self_destroy(ed); m_right_zombie.store(nullptr, std::memory_order_relaxed); } return finalize(ed); } task* cancel(execution_data& ed) override { return finalize(ed); } finish_scan(sum_node_type*& return_slot, final_sum_type** sum, sum_node_type& result_, 
finish_scan* parent, wait_context& w_o, small_object_allocator& alloc) : m_sum_slot(sum), m_return_slot(return_slot), m_allocator(alloc), m_right_zombie(nullptr), m_result(result_), m_parent(parent), m_wait_context(w_o) { __TBB_ASSERT( !m_return_slot, nullptr ); } private: finish_scan* release_parent() { call_itt_task_notify(releasing, m_parent); if (m_parent) { auto parent = m_parent; m_parent = nullptr; if (parent->ref_count.fetch_sub(1) == 1) { return parent; } } else m_wait_context.release(); return nullptr; } finish_scan* finalize(const execution_data& ed) { finish_scan* next_task = release_parent(); m_allocator.delete_object(this, ed); return next_task; } }; //! Initial task to split the work /** @ingroup algorithms */ template struct start_scan : public task { private: using sum_node_type = sum_node; using final_sum_type = final_sum; using finish_pass1_type = finish_scan; std::reference_wrapper m_return_slot; Range m_range; std::reference_wrapper m_body; typename Partitioner::partition_type m_partition; /** Non-null if caller is requesting total. 
*/ final_sum_type** m_sum_slot; bool m_is_final; bool m_is_right_child; finish_pass1_type* m_parent; small_object_allocator m_allocator; wait_context& m_wait_context; finish_pass1_type* release_parent() { call_itt_task_notify(releasing, m_parent); if (m_parent) { auto parent = m_parent; m_parent = nullptr; if (parent->ref_count.fetch_sub(1) == 1) { return parent; } } else m_wait_context.release(); return nullptr; } finish_pass1_type* finalize( const execution_data& ed ) { finish_pass1_type* next_task = release_parent(); m_allocator.delete_object(this, ed); return next_task; } public: task* execute( execution_data& ) override; task* cancel( execution_data& ed ) override { return finalize(ed); } start_scan( sum_node_type*& return_slot, start_scan& parent, small_object_allocator& alloc ) : m_return_slot(return_slot), m_range(parent.m_range,split()), m_body(parent.m_body), m_partition(parent.m_partition,split()), m_sum_slot(parent.m_sum_slot), m_is_final(parent.m_is_final), m_is_right_child(true), m_parent(parent.m_parent), m_allocator(alloc), m_wait_context(parent.m_wait_context) { __TBB_ASSERT( !m_return_slot, nullptr ); parent.m_is_right_child = false; } start_scan( sum_node_type*& return_slot, const Range& range, final_sum_type& body, const Partitioner& partitioner, wait_context& w_o, small_object_allocator& alloc ) : m_return_slot(return_slot), m_range(range), m_body(body), m_partition(partitioner), m_sum_slot(nullptr), m_is_final(true), m_is_right_child(false), m_parent(nullptr), m_allocator(alloc), m_wait_context(w_o) { __TBB_ASSERT( !m_return_slot, nullptr ); } static void run( const Range& range, Body& body, const Partitioner& partitioner ) { if( !range.empty() ) { task_group_context context(PARALLEL_SCAN); using start_pass1_type = start_scan; sum_node_type* root = nullptr; wait_context w_ctx{1}; small_object_allocator alloc{}; auto& temp_body = *alloc.new_object(body, w_ctx, alloc); temp_body.reverse_join(body); auto& pass1 = 
*alloc.new_object(/*m_return_slot=*/root, range, temp_body, partitioner, w_ctx, alloc); execute_and_wait(pass1, context, w_ctx, context); if( root ) { root->prepare_for_execution(temp_body, nullptr, &body); w_ctx.reserve(); execute_and_wait(*root, context, w_ctx, context); } else { temp_body.assign_to(body); temp_body.finish_construction(nullptr, range, nullptr); alloc.delete_object(&temp_body); } } } }; template task* start_scan::execute( execution_data& ed ) { // Inspecting m_parent->result.left_sum would ordinarily be a race condition. // But we inspect it only if we are not a stolen task, in which case we // know that task assigning to m_parent->result.left_sum has completed. __TBB_ASSERT(!m_is_right_child || m_parent, "right child is never an orphan"); bool treat_as_stolen = m_is_right_child && (is_stolen(ed) || &m_body.get()!=m_parent->m_result.m_left_sum); if( treat_as_stolen ) { // Invocation is for right child that has been really stolen or needs to be virtually stolen small_object_allocator alloc{}; final_sum_type* right_zombie = alloc.new_object(m_body, alloc); m_parent->m_right_zombie.store(right_zombie, std::memory_order_release); m_body = *right_zombie; m_is_final = false; } task* next_task = nullptr; if( (m_is_right_child && !treat_as_stolen) || !m_range.is_divisible() || m_partition.should_execute_range(ed) ) { if( m_is_final ) m_body(m_range, final_scan_tag()); else if( m_sum_slot ) m_body(m_range, pre_scan_tag()); if( m_sum_slot ) *m_sum_slot = &m_body.get(); __TBB_ASSERT( !m_return_slot, nullptr ); next_task = finalize(ed); } else { small_object_allocator alloc{}; auto result = alloc.new_object(m_range,/*m_left_is_final=*/m_is_final, m_parent? 
&m_parent->m_result: nullptr, m_wait_context, alloc); auto new_parent = alloc.new_object(m_return_slot, m_sum_slot, *result, m_parent, m_wait_context, alloc); m_parent = new_parent; // Split off right child auto& right_child = *alloc.new_object(/*m_return_slot=*/result->m_right, *this, alloc); spawn(right_child, *ed.context); m_sum_slot = &result->m_left_sum; m_return_slot = result->m_left; __TBB_ASSERT( !m_return_slot, nullptr ); next_task = this; } return next_task; } template class lambda_scan_body { Value m_sum_slot; const Value& identity_element; const Scan& m_scan; const ReverseJoin& m_reverse_join; public: void operator=(const lambda_scan_body&) = delete; lambda_scan_body(const lambda_scan_body&) = default; lambda_scan_body( const Value& identity, const Scan& scan, const ReverseJoin& rev_join ) : m_sum_slot(identity) , identity_element(identity) , m_scan(scan) , m_reverse_join(rev_join) {} lambda_scan_body( lambda_scan_body& b, split ) : m_sum_slot(b.identity_element) , identity_element(b.identity_element) , m_scan(b.m_scan) , m_reverse_join(b.m_reverse_join) {} template void operator()( const Range& r, Tag tag ) { m_sum_slot = tbb::detail::invoke(m_scan, r, m_sum_slot, tag); } void reverse_join( lambda_scan_body& a ) { m_sum_slot = tbb::detail::invoke(m_reverse_join, a.m_sum_slot, m_sum_slot); } void assign( lambda_scan_body& b ) { m_sum_slot = b.m_sum_slot; } Value result() const { return m_sum_slot; } }; // Requirements on Range concept are documented in blocked_range.h /** \page parallel_scan_body_req Requirements on parallel_scan body Class \c Body implementing the concept of parallel_scan body must define: - \code Body::Body( Body&, split ); \endcode Splitting constructor. 
Split \c b so that \c this and \c b can accumulate separately - \code Body::~Body(); \endcode Destructor - \code void Body::operator()( const Range& r, pre_scan_tag ); \endcode Preprocess iterations for range \c r - \code void Body::operator()( const Range& r, final_scan_tag ); \endcode Do final processing for iterations of range \c r - \code void Body::reverse_join( Body& a ); \endcode Merge preprocessing state of \c a into \c this, where \c a was created earlier from \c b by b's splitting constructor **/ /** \name parallel_scan See also requirements on \ref range_req "Range" and \ref parallel_scan_body_req "parallel_scan Body". **/ //@{ //! Parallel prefix with default partitioner /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_scan_body) void parallel_scan( const Range& range, Body& body ) { start_scan::run(range,body,__TBB_DEFAULT_PARTITIONER()); } //! Parallel prefix with simple_partitioner /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_scan_body) void parallel_scan( const Range& range, Body& body, const simple_partitioner& partitioner ) { start_scan::run(range, body, partitioner); } //! Parallel prefix with auto_partitioner /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_scan_body) void parallel_scan( const Range& range, Body& body, const auto_partitioner& partitioner ) { start_scan::run(range, body, partitioner); } //! Parallel prefix with default partitioner /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_scan_function && parallel_scan_combine) Value parallel_scan( const Range& range, const Value& identity, const Scan& scan, const ReverseJoin& reverse_join ) { lambda_scan_body body(identity, scan, reverse_join); parallel_scan(range, body, __TBB_DEFAULT_PARTITIONER()); return body.result(); } //! 
Parallel prefix with simple_partitioner /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_scan_function && parallel_scan_combine) Value parallel_scan( const Range& range, const Value& identity, const Scan& scan, const ReverseJoin& reverse_join, const simple_partitioner& partitioner ) { lambda_scan_body body(identity, scan, reverse_join); parallel_scan(range, body, partitioner); return body.result(); } //! Parallel prefix with auto_partitioner /** @ingroup algorithms **/ template __TBB_requires(tbb_range && parallel_scan_function && parallel_scan_combine) Value parallel_scan( const Range& range, const Value& identity, const Scan& scan, const ReverseJoin& reverse_join, const auto_partitioner& partitioner ) { lambda_scan_body body(identity, scan, reverse_join); parallel_scan(range, body, partitioner); return body.result(); } } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::parallel_scan; using detail::d1::pre_scan_tag; using detail::d1::final_scan_tag; } // namespace v1 } // namespace tbb #endif /* __TBB_parallel_scan_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/parallel_sort.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_parallel_sort_H #define __TBB_parallel_sort_H #include "detail/_namespace_injection.h" #include "parallel_for.h" #include "blocked_range.h" #include "profiling.h" #include #include #include #include namespace tbb { namespace detail { #if __TBB_CPP20_CONCEPTS_PRESENT inline namespace d0 { // TODO: consider using std::strict_weak_order concept template concept compare = requires( const std::remove_reference_t& comp, typename std::iterator_traits::reference value ) { // Forward via iterator_traits::reference { comp(typename std::iterator_traits::reference(value), typename std::iterator_traits::reference(value)) } -> std::convertible_to; }; // Inspired by std::__PartiallyOrderedWith exposition only concept template concept less_than_comparable = requires( const std::remove_reference_t& lhs, const std::remove_reference_t& rhs ) { { lhs < rhs } -> boolean_testable; }; } // namespace d0 #endif // __TBB_CPP20_CONCEPTS_PRESENT namespace d1 { //! Range used in quicksort to split elements into subranges based on a value. /** The split operation selects a splitter and places all elements less than or equal to the value in the first range and the remaining elements in the second range. @ingroup algorithms */ template class quick_sort_range { std::size_t median_of_three( const RandomAccessIterator& array, std::size_t l, std::size_t m, std::size_t r ) const { return comp(array[l], array[m]) ? ( comp(array[m], array[r]) ? m : ( comp(array[l], array[r]) ? r : l ) ) : ( comp(array[r], array[m]) ? m : ( comp(array[r], array[l]) ? 
r : l ) ); } std::size_t pseudo_median_of_nine( const RandomAccessIterator& array, const quick_sort_range& range ) const { std::size_t offset = range.size / 8u; return median_of_three(array, median_of_three(array, 0 , offset, offset * 2), median_of_three(array, offset * 3, offset * 4, offset * 5), median_of_three(array, offset * 6, offset * 7, range.size - 1)); } std::size_t split_range( quick_sort_range& range ) { RandomAccessIterator array = range.begin; RandomAccessIterator first_element = range.begin; std::size_t m = pseudo_median_of_nine(array, range); if( m != 0 ) std::iter_swap(array, array + m); std::size_t i = 0; std::size_t j = range.size; // Partition interval [i + 1,j - 1] with key *first_element. for(;;) { __TBB_ASSERT( i < j, nullptr ); // Loop must terminate since array[l] == *first_element. do { --j; __TBB_ASSERT( i <= j, "bad ordering relation?" ); } while( comp(*first_element, array[j]) ); do { __TBB_ASSERT( i <= j, nullptr ); if( i == j ) goto partition; ++i; } while( comp(array[i], *first_element) ); if( i == j ) goto partition; std::iter_swap(array + i, array + j); } partition: // Put the partition key were it belongs std::iter_swap(array + j, first_element); // array[l..j) is less or equal to key. // array(j..r) is greater or equal to key. 
// array[j] is equal to key i = j + 1; std::size_t new_range_size = range.size - i; range.size = j; return new_range_size; } public: quick_sort_range() = default; quick_sort_range( const quick_sort_range& ) = default; void operator=( const quick_sort_range& ) = delete; static constexpr std::size_t grainsize = 500; const Compare& comp; std::size_t size; RandomAccessIterator begin; quick_sort_range( RandomAccessIterator begin_, std::size_t size_, const Compare& comp_ ) : comp(comp_), size(size_), begin(begin_) {} bool empty() const { return size == 0; } bool is_divisible() const { return size >= grainsize; } quick_sort_range( quick_sort_range& range, split ) : comp(range.comp) , size(split_range(range)) // +1 accounts for the pivot element, which is at its correct place // already and, therefore, is not included into subranges. , begin(range.begin + range.size + 1) {} }; //! Body class used to test if elements in a range are presorted /** @ingroup algorithms */ template class quick_sort_pretest_body { const Compare& comp; task_group_context& context; public: quick_sort_pretest_body() = default; quick_sort_pretest_body( const quick_sort_pretest_body& ) = default; void operator=( const quick_sort_pretest_body& ) = delete; quick_sort_pretest_body( const Compare& _comp, task_group_context& _context ) : comp(_comp), context(_context) {} void operator()( const blocked_range& range ) const { RandomAccessIterator my_end = range.end(); int i = 0; //TODO: consider using std::is_sorted() for each 64 iterations (requires performance measurements) for( RandomAccessIterator k = range.begin(); k != my_end; ++k, ++i ) { if( i % 64 == 0 && context.is_group_execution_cancelled() ) break; // The k - 1 is never out-of-range because the first chunk starts at begin+serial_cutoff+1 if( comp(*(k), *(k - 1)) ) { context.cancel_group_execution(); break; } } } }; //! Body class used to sort elements in a range that is smaller than the grainsize. 
/** @ingroup algorithms */ template struct quick_sort_body { void operator()( const quick_sort_range& range ) const { std::sort(range.begin, range.begin + range.size, range.comp); } }; //! Method to perform parallel_for based quick sort. /** @ingroup algorithms */ template void do_parallel_quick_sort( RandomAccessIterator begin, RandomAccessIterator end, const Compare& comp ) { parallel_for(quick_sort_range(begin, end - begin, comp), quick_sort_body(), auto_partitioner()); } //! Wrapper method to initiate the sort by calling parallel_for. /** @ingroup algorithms */ template void parallel_quick_sort( RandomAccessIterator begin, RandomAccessIterator end, const Compare& comp ) { task_group_context my_context(PARALLEL_SORT); constexpr int serial_cutoff = 9; __TBB_ASSERT( begin + serial_cutoff < end, "min_parallel_size is smaller than serial cutoff?" ); RandomAccessIterator k = begin; for( ; k != begin + serial_cutoff; ++k ) { if( comp(*(k + 1), *k) ) { do_parallel_quick_sort(begin, end, comp); return; } } // Check is input range already sorted parallel_for(blocked_range(k + 1, end), quick_sort_pretest_body(comp, my_context), auto_partitioner(), my_context); if( my_context.is_group_execution_cancelled() ) do_parallel_quick_sort(begin, end, comp); } /** \page parallel_sort_iter_req Requirements on iterators for parallel_sort Requirements on the iterator type \c It and its value type \c T for \c parallel_sort: - \code void iter_swap( It a, It b ) \endcode Swaps the values of the elements the given iterators \c a and \c b are pointing to. \c It should be a random access iterator. - \code bool Compare::operator()( const T& x, const T& y ) \endcode True if x comes before y; **/ /** \name parallel_sort See also requirements on \ref parallel_sort_iter_req "iterators for parallel_sort". 
**/ //@{ #if __TBB_CPP20_CONCEPTS_PRESENT template using iter_value_type = typename std::iterator_traits::value_type; template using range_value_type = typename std::iterator_traits>::value_type; #endif //! Sorts the data in [begin,end) using the given comparator /** The compare function object is used for all comparisons between elements during sorting. The compare object must define a bool operator() function. @ingroup algorithms **/ template __TBB_requires(std::random_access_iterator && compare && std::movable>) void parallel_sort( RandomAccessIterator begin, RandomAccessIterator end, const Compare& comp ) { constexpr int min_parallel_size = 500; if( end > begin ) { if( end - begin < min_parallel_size ) { std::sort(begin, end, comp); } else { parallel_quick_sort(begin, end, comp); } } } //! Sorts the data in [begin,end) with a default comparator \c std::less /** @ingroup algorithms **/ template __TBB_requires(std::random_access_iterator && less_than_comparable> && std::movable>) void parallel_sort( RandomAccessIterator begin, RandomAccessIterator end ) { parallel_sort(begin, end, std::less::value_type>()); } //! Sorts the data in rng using the given comparator /** @ingroup algorithms **/ template __TBB_requires(container_based_sequence && compare> && std::movable>) void parallel_sort( Range&& rng, const Compare& comp ) { parallel_sort(std::begin(rng), std::end(rng), comp); } //! 
Sorts the data in rng with a default comparator \c std::less /** @ingroup algorithms **/ template __TBB_requires(container_based_sequence && less_than_comparable> && std::movable>) void parallel_sort( Range&& rng ) { parallel_sort(std::begin(rng), std::end(rng)); } //@} } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::parallel_sort; } // namespace v1 } // namespace tbb #endif /*__TBB_parallel_sort_H*/ ================================================ FILE: third-party/tbb/include/oneapi/tbb/partitioner.h ================================================ /* Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_partitioner_H #define __TBB_partitioner_H #ifndef __TBB_INITIAL_CHUNKS // initial task divisions per thread #define __TBB_INITIAL_CHUNKS 2 #endif #ifndef __TBB_RANGE_POOL_CAPACITY // maximum number of elements in range pool #define __TBB_RANGE_POOL_CAPACITY 8 #endif #ifndef __TBB_INIT_DEPTH // initial value for depth of range pool #define __TBB_INIT_DEPTH 5 #endif #ifndef __TBB_DEMAND_DEPTH_ADD // when imbalance is found range splits this value times more #define __TBB_DEMAND_DEPTH_ADD 1 #endif #include "detail/_config.h" #include "detail/_namespace_injection.h" #include "detail/_aligned_space.h" #include "detail/_utils.h" #include "detail/_template_helpers.h" #include "detail/_range_common.h" #include "detail/_task.h" #include "detail/_small_object_pool.h" #include "cache_aligned_allocator.h" #include "task_group.h" // task_group_context #include "task_arena.h" #include #include #include #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) // Workaround for overzealous compiler warnings #pragma warning (push) #pragma warning (disable: 4244) #endif namespace tbb { namespace detail { namespace d1 { class auto_partitioner; class simple_partitioner; class static_partitioner; class affinity_partitioner; class affinity_partition_type; class affinity_partitioner_base; inline std::size_t get_initial_auto_partitioner_divisor() { const std::size_t factor = 4; return factor * static_cast(max_concurrency()); } //! Defines entry point for affinity partitioner into oneTBB run-time library. class affinity_partitioner_base: no_copy { friend class affinity_partitioner; friend class affinity_partition_type; //! Array that remembers affinities of tree positions to affinity_id. /** nullptr if my_size==0. */ slot_id* my_array; //! Number of elements in my_array. std::size_t my_size; //! Zeros the fields. affinity_partitioner_base() : my_array(nullptr), my_size(0) {} //! Deallocates my_array. ~affinity_partitioner_base() { resize(0); } //! Resize my_array. 
/** Retains values if resulting size is the same. */ void resize(unsigned factor) { // Check factor to avoid asking for number of workers while there might be no arena. unsigned max_threads_in_arena = static_cast(max_concurrency()); std::size_t new_size = factor ? factor * max_threads_in_arena : 0; if (new_size != my_size) { if (my_array) { r1::cache_aligned_deallocate(my_array); // Following two assignments must be done here for sake of exception safety. my_array = nullptr; my_size = 0; } if (new_size) { my_array = static_cast(r1::cache_aligned_allocate(new_size * sizeof(slot_id))); std::fill_n(my_array, new_size, no_slot); my_size = new_size; } } } }; template struct start_for; template struct start_scan; template struct start_reduce; template struct start_deterministic_reduce; struct node { node* my_parent{}; std::atomic m_ref_count{}; node() = default; node(node* parent, int ref_count) : my_parent{parent}, m_ref_count{ref_count} { __TBB_ASSERT(ref_count > 0, "The ref count must be positive"); } }; struct wait_node : node { wait_node() : node{ nullptr, 1 } {} wait_context m_wait{1}; }; //! 
Join task node that contains shared flag for stealing feedback struct tree_node : public node { small_object_allocator m_allocator; std::atomic m_child_stolen{false}; tree_node(node* parent, int ref_count, small_object_allocator& alloc) : node{parent, ref_count} , m_allocator{alloc} {} void join(task_group_context*) {/*dummy, required only for reduction algorithms*/}; template static void mark_task_stolen(Task &t) { std::atomic &flag = static_cast(t.my_parent)->m_child_stolen; #if TBB_USE_PROFILING_TOOLS // Threading tools respect lock prefix but report false-positive data-race via plain store flag.exchange(true); #else flag.store(true, std::memory_order_relaxed); #endif // TBB_USE_PROFILING_TOOLS } template static bool is_peer_stolen(Task &t) { return static_cast(t.my_parent)->m_child_stolen.load(std::memory_order_relaxed); } }; // Context used to check cancellation state during reduction join process template void fold_tree(node* n, const execution_data& ed) { for (;;) { __TBB_ASSERT(n, nullptr); __TBB_ASSERT(n->m_ref_count.load(std::memory_order_relaxed) > 0, "The refcount must be positive."); call_itt_task_notify(releasing, n); if (--n->m_ref_count > 0) { return; } node* parent = n->my_parent; if (!parent) { break; }; call_itt_task_notify(acquired, n); TreeNodeType* self = static_cast(n); self->join(ed.context); self->m_allocator.delete_object(self, ed); n = parent; } // Finish parallel for execution when the root (last node) is reached static_cast(n)->m_wait.release(); } //! Depth is a relative depth of recursive division inside a range pool. Relative depth allows //! infinite absolute depth of the recursion for heavily unbalanced workloads with range represented //! by a number that cannot fit into machine word. typedef unsigned char depth_t; //! 
Range pool stores ranges of type T in a circular buffer with MaxCapacity template class range_vector { depth_t my_head; depth_t my_tail; depth_t my_size; depth_t my_depth[MaxCapacity]; // relative depths of stored ranges tbb::detail::aligned_space my_pool; public: //! initialize via first range in pool range_vector(const T& elem) : my_head(0), my_tail(0), my_size(1) { my_depth[0] = 0; new( static_cast(my_pool.begin()) ) T(elem);//TODO: std::move? } ~range_vector() { while( !empty() ) pop_back(); } bool empty() const { return my_size == 0; } depth_t size() const { return my_size; } //! Populates range pool via ranges up to max depth or while divisible //! max_depth starts from 0, e.g. value 2 makes 3 ranges in the pool up to two 1/4 pieces void split_to_fill(depth_t max_depth) { while( my_size < MaxCapacity && is_divisible(max_depth) ) { depth_t prev = my_head; my_head = (my_head + 1) % MaxCapacity; new(my_pool.begin()+my_head) T(my_pool.begin()[prev]); // copy TODO: std::move? my_pool.begin()[prev].~T(); // instead of assignment new(my_pool.begin()+prev) T(my_pool.begin()[my_head], detail::split()); // do 'inverse' split my_depth[my_head] = ++my_depth[prev]; my_size++; } } void pop_back() { __TBB_ASSERT(my_size > 0, "range_vector::pop_back() with empty size"); my_pool.begin()[my_head].~T(); my_size--; my_head = (my_head + MaxCapacity - 1) % MaxCapacity; } void pop_front() { __TBB_ASSERT(my_size > 0, "range_vector::pop_front() with empty size"); my_pool.begin()[my_tail].~T(); my_size--; my_tail = (my_tail + 1) % MaxCapacity; } T& back() { __TBB_ASSERT(my_size > 0, "range_vector::back() with empty size"); return my_pool.begin()[my_head]; } T& front() { __TBB_ASSERT(my_size > 0, "range_vector::front() with empty size"); return my_pool.begin()[my_tail]; } //! 
similarly to front(), returns depth of the first range in the pool depth_t front_depth() { __TBB_ASSERT(my_size > 0, "range_vector::front_depth() with empty size"); return my_depth[my_tail]; } depth_t back_depth() { __TBB_ASSERT(my_size > 0, "range_vector::back_depth() with empty size"); return my_depth[my_head]; } bool is_divisible(depth_t max_depth) { return back_depth() < max_depth && back().is_divisible(); } }; //! Provides default methods for partition objects and common algorithm blocks. template struct partition_type_base { typedef detail::split split_type; // decision makers void note_affinity( slot_id ) {} template bool check_being_stolen(Task&, const execution_data&) { return false; } // part of old should_execute_range() template split_type get_split() { return split(); } Partition& self() { return *static_cast(this); } // CRTP helper template void work_balance(StartType &start, Range &range, const execution_data&) { start.run_body( range ); // static partitioner goes here } template void execute(StartType &start, Range &range, execution_data& ed) { // The algorithm in a few words ([]-denotes calls to decision methods of partitioner): // [If this task is stolen, adjust depth and divisions if necessary, set flag]. // If range is divisible { // Spread the work while [initial divisions left]; // Create trap task [if necessary]; // } // If not divisible or [max depth is reached], execute, else do the range pool part if ( range.is_divisible() ) { if ( self().is_divisible() ) { do { // split until is divisible typename Partition::split_type split_obj = self().template get_split(); start.offer_work( split_obj, ed ); } while ( range.is_divisible() && self().is_divisible() ); } } self().work_balance(start, range, ed); } }; //! Provides default splitting strategy for partition objects. 
template struct adaptive_mode : partition_type_base { typedef Partition my_partition; std::size_t my_divisor; // For affinity_partitioner, my_divisor indicates the number of affinity array indices the task reserves. // A task which has only one index must produce the right split without reserved index in order to avoid // it to be overwritten in note_affinity() of the created (right) task. // I.e. a task created deeper than the affinity array can remember must not save its affinity (LIFO order) static const unsigned factor = 1; adaptive_mode() : my_divisor(get_initial_auto_partitioner_divisor() / 4 * my_partition::factor) {} adaptive_mode(adaptive_mode &src, split) : my_divisor(do_split(src, split())) {} adaptive_mode(adaptive_mode&, const proportional_split&) : my_divisor(0) { // left blank as my_divisor gets overridden in the successors' constructors } /*! Override do_split methods in order to specify splitting strategy */ std::size_t do_split(adaptive_mode &src, split) { return src.my_divisor /= 2u; } }; //! 
Provides proportional splitting strategy for partition objects template struct proportional_mode : adaptive_mode { typedef Partition my_partition; using partition_type_base::self; // CRTP helper to get access to derived classes proportional_mode() : adaptive_mode() {} proportional_mode(proportional_mode &src, split) : adaptive_mode(src, split()) {} proportional_mode(proportional_mode &src, const proportional_split& split_obj) : adaptive_mode(src, split_obj) { self().my_divisor = do_split(src, split_obj); } std::size_t do_split(proportional_mode &src, const proportional_split& split_obj) { std::size_t portion = split_obj.right() * my_partition::factor; portion = (portion + my_partition::factor/2) & (0ul - my_partition::factor); src.my_divisor -= portion; return portion; } bool is_divisible() { // part of old should_execute_range() return self().my_divisor > my_partition::factor; } template proportional_split get_split() { // Create the proportion from partitioner internal resources (threads) that would be used: // - into proportional_mode constructor to split the partitioner // - if Range supports the proportional_split constructor it would use proposed proportion, // otherwise, the tbb::proportional_split object will be implicitly (for Range implementer) // casted to tbb::split std::size_t n = self().my_divisor / my_partition::factor; std::size_t right = n / 2; std::size_t left = n - right; return proportional_split(left, right); } }; static std::size_t get_initial_partition_head() { int current_index = tbb::this_task_arena::current_thread_index(); if (current_index == tbb::task_arena::not_initialized) current_index = 0; return size_t(current_index); } //! 
Provides default linear indexing of partitioner's sequence template struct linear_affinity_mode : proportional_mode { std::size_t my_head; std::size_t my_max_affinity; using proportional_mode::self; linear_affinity_mode() : proportional_mode(), my_head(get_initial_partition_head()), my_max_affinity(self().my_divisor) {} linear_affinity_mode(linear_affinity_mode &src, split) : proportional_mode(src, split()) , my_head((src.my_head + src.my_divisor) % src.my_max_affinity), my_max_affinity(src.my_max_affinity) {} linear_affinity_mode(linear_affinity_mode &src, const proportional_split& split_obj) : proportional_mode(src, split_obj) , my_head((src.my_head + src.my_divisor) % src.my_max_affinity), my_max_affinity(src.my_max_affinity) {} void spawn_task(task& t, task_group_context& ctx) { if (self().my_divisor) { spawn(t, ctx, slot_id(my_head)); } else { spawn(t, ctx); } } }; static bool is_stolen_task(const execution_data& ed) { return execution_slot(ed) != original_slot(ed); } /*! Determine work-balance phase implementing splitting & stealing actions */ template struct dynamic_grainsize_mode : Mode { using Mode::self; enum { begin = 0, run, pass } my_delay; depth_t my_max_depth; static const unsigned range_pool_size = __TBB_RANGE_POOL_CAPACITY; dynamic_grainsize_mode(): Mode() , my_delay(begin) , my_max_depth(__TBB_INIT_DEPTH) {} dynamic_grainsize_mode(dynamic_grainsize_mode& p, split) : Mode(p, split()) , my_delay(pass) , my_max_depth(p.my_max_depth) {} dynamic_grainsize_mode(dynamic_grainsize_mode& p, const proportional_split& split_obj) : Mode(p, split_obj) , my_delay(begin) , my_max_depth(p.my_max_depth) {} template bool check_being_stolen(Task &t, const execution_data& ed) { // part of old should_execute_range() if( !(self().my_divisor / Mode::my_partition::factor) ) { // if not from the top P tasks of binary tree self().my_divisor = 1; // TODO: replace by on-stack flag (partition_state's member)? 
if( is_stolen_task(ed) && t.my_parent->m_ref_count >= 2 ) { // runs concurrently with the left task #if __TBB_USE_OPTIONAL_RTTI // RTTI is available, check whether the cast is valid // TODO: TBB_REVAMP_TODO __TBB_ASSERT(dynamic_cast(t.m_parent), 0); // correctness of the cast relies on avoiding the root task for which: // - initial value of my_divisor != 0 (protected by separate assertion) // - is_stolen_task() always returns false for the root task. #endif tree_node::mark_task_stolen(t); if( !my_max_depth ) my_max_depth++; my_max_depth += __TBB_DEMAND_DEPTH_ADD; return true; } } return false; } depth_t max_depth() { return my_max_depth; } void align_depth(depth_t base) { __TBB_ASSERT(base <= my_max_depth, nullptr); my_max_depth -= base; } template void work_balance(StartType &start, Range &range, execution_data& ed) { if( !range.is_divisible() || !self().max_depth() ) { start.run_body( range ); } else { // do range pool range_vector range_pool(range); do { range_pool.split_to_fill(self().max_depth()); // fill range pool if( self().check_for_demand( start ) ) { if( range_pool.size() > 1 ) { start.offer_work( range_pool.front(), range_pool.front_depth(), ed ); range_pool.pop_front(); continue; } if( range_pool.is_divisible(self().max_depth()) ) // was not enough depth to fork a task continue; // note: next split_to_fill() should split range at least once } start.run_body( range_pool.back() ); range_pool.pop_back(); } while( !range_pool.empty() && !ed.context->is_group_execution_cancelled() ); } } template bool check_for_demand(Task& t) { if ( pass == my_delay ) { if ( self().my_divisor > 1 ) // produce affinitized tasks while they have slot in array return true; // do not do my_max_depth++ here, but be sure range_pool is splittable once more else if ( self().my_divisor && my_max_depth ) { // make balancing task self().my_divisor = 0; // once for each task; depth will be decreased in align_depth() return true; } else if ( tree_node::is_peer_stolen(t) ) { my_max_depth 
+= __TBB_DEMAND_DEPTH_ADD; return true; } } else if( begin == my_delay ) { my_delay = pass; } return false; } }; class auto_partition_type: public dynamic_grainsize_mode > { public: auto_partition_type( const auto_partitioner& ) { my_divisor *= __TBB_INITIAL_CHUNKS; } auto_partition_type( auto_partition_type& src, split) : dynamic_grainsize_mode >(src, split()) {} bool is_divisible() { // part of old should_execute_range() if( my_divisor > 1 ) return true; if( my_divisor && my_max_depth ) { // can split the task. TODO: on-stack flag instead // keep same fragmentation while splitting for the local task pool my_max_depth--; my_divisor = 0; // decrease max_depth once per task return true; } else return false; } template bool check_for_demand(Task& t) { if (tree_node::is_peer_stolen(t)) { my_max_depth += __TBB_DEMAND_DEPTH_ADD; return true; } else return false; } void spawn_task(task& t, task_group_context& ctx) { spawn(t, ctx); } }; class simple_partition_type: public partition_type_base { public: simple_partition_type( const simple_partitioner& ) {} simple_partition_type( const simple_partition_type&, split ) {} //! 
simplified algorithm template void execute(StartType &start, Range &range, execution_data& ed) { split_type split_obj = split(); // start.offer_work accepts split_type as reference while( range.is_divisible() ) start.offer_work( split_obj, ed ); start.run_body( range ); } void spawn_task(task& t, task_group_context& ctx) { spawn(t, ctx); } }; class static_partition_type : public linear_affinity_mode { public: typedef detail::proportional_split split_type; static_partition_type( const static_partitioner& ) {} static_partition_type( static_partition_type& p, const proportional_split& split_obj ) : linear_affinity_mode(p, split_obj) {} }; class affinity_partition_type : public dynamic_grainsize_mode > { static const unsigned factor_power = 4; // TODO: get a unified formula based on number of computing units slot_id* my_array; public: static const unsigned factor = 1 << factor_power; // number of slots in affinity array per task typedef detail::proportional_split split_type; affinity_partition_type( affinity_partitioner_base& ap ) { __TBB_ASSERT( (factor&(factor-1))==0, "factor must be power of two" ); ap.resize(factor); my_array = ap.my_array; my_max_depth = factor_power + 1; __TBB_ASSERT( my_max_depth < __TBB_RANGE_POOL_CAPACITY, nullptr ); } affinity_partition_type(affinity_partition_type& p, split) : dynamic_grainsize_mode >(p, split()) , my_array(p.my_array) {} affinity_partition_type(affinity_partition_type& p, const proportional_split& split_obj) : dynamic_grainsize_mode >(p, split_obj) , my_array(p.my_array) {} void note_affinity(slot_id id) { if( my_divisor ) my_array[my_head] = id; } void spawn_task(task& t, task_group_context& ctx) { if (my_divisor) { if (!my_array[my_head]) { // TODO: consider new ideas with my_array for both affinity and static partitioner's, then code reuse spawn(t, ctx, slot_id(my_head / factor)); } else { spawn(t, ctx, my_array[my_head]); } } else { spawn(t, ctx); } } }; //! 
A simple partitioner /** Divides the range until the range is not divisible. @ingroup algorithms */ class simple_partitioner { public: simple_partitioner() {} private: template friend struct start_for; template friend struct start_reduce; template friend struct start_deterministic_reduce; template friend struct start_scan; // new implementation just extends existing interface typedef simple_partition_type task_partition_type; // TODO: consider to make split_type public typedef simple_partition_type::split_type split_type; // for parallel_scan only class partition_type { public: bool should_execute_range(const execution_data& ) {return false;} partition_type( const simple_partitioner& ) {} partition_type( const partition_type&, split ) {} }; }; //! An auto partitioner /** The range is initial divided into several large chunks. Chunks are further subdivided into smaller pieces if demand detected and they are divisible. @ingroup algorithms */ class auto_partitioner { public: auto_partitioner() {} private: template friend struct start_for; template friend struct start_reduce; template friend struct start_deterministic_reduce; template friend struct start_scan; // new implementation just extends existing interface typedef auto_partition_type task_partition_type; // TODO: consider to make split_type public typedef auto_partition_type::split_type split_type; //! Backward-compatible partition for auto and affinity partition objects. class partition_type { size_t num_chunks; static const size_t VICTIM_CHUNKS = 4; public: bool should_execute_range(const execution_data& ed) { if( num_chunks friend struct start_for; template friend struct start_reduce; template friend struct start_deterministic_reduce; template friend struct start_scan; // new implementation just extends existing interface typedef static_partition_type task_partition_type; // TODO: consider to make split_type public typedef static_partition_type::split_type split_type; }; //! 
An affinity partitioner class affinity_partitioner : affinity_partitioner_base { public: affinity_partitioner() {} private: template friend struct start_for; template friend struct start_reduce; template friend struct start_deterministic_reduce; template friend struct start_scan; // new implementation just extends existing interface typedef affinity_partition_type task_partition_type; // TODO: consider to make split_type public typedef affinity_partition_type::split_type split_type; }; } // namespace d1 } // namespace detail inline namespace v1 { // Partitioners using detail::d1::auto_partitioner; using detail::d1::simple_partitioner; using detail::d1::static_partitioner; using detail::d1::affinity_partitioner; // Split types using detail::split; using detail::proportional_split; } // namespace v1 } // namespace tbb #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) #pragma warning (pop) #endif // warning 4244 is back #undef __TBB_INITIAL_CHUNKS #undef __TBB_RANGE_POOL_CAPACITY #undef __TBB_INIT_DEPTH #endif /* __TBB_partitioner_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/profiling.h ================================================ /* Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_profiling_H #define __TBB_profiling_H #include "detail/_config.h" #include #include namespace tbb { namespace detail { inline namespace d0 { // include list of index names #define TBB_STRING_RESOURCE(index_name,str) index_name, enum string_resource_index : std::uintptr_t { #include "detail/_string_resource.h" NUM_STRINGS }; #undef TBB_STRING_RESOURCE enum itt_relation { __itt_relation_is_unknown = 0, __itt_relation_is_dependent_on, /**< "A is dependent on B" means that A cannot start until B completes */ __itt_relation_is_sibling_of, /**< "A is sibling of B" means that A and B were created as a group */ __itt_relation_is_parent_of, /**< "A is parent of B" means that A created B */ __itt_relation_is_continuation_of, /**< "A is continuation of B" means that A assumes the dependencies of B */ __itt_relation_is_child_of, /**< "A is child of B" means that A was created by B (inverse of is_parent_of) */ __itt_relation_is_continued_by, /**< "A is continued by B" means that B assumes the dependencies of A (inverse of is_continuation_of) */ __itt_relation_is_predecessor_to /**< "A is predecessor to B" means that B cannot start until A completes (inverse of is_dependent_on) */ }; //! Unicode support #if (_WIN32||_WIN64) //! Unicode character type. Always wchar_t on Windows. 
using tchar = wchar_t; #else /* !WIN */ using tchar = char; #endif /* !WIN */ } // namespace d0 } // namespace detail } // namespace tbb #include #if _WIN32||_WIN64 #include /* mbstowcs_s */ #endif // Need these to work regardless of tools support namespace tbb { namespace detail { namespace d1 { enum notify_type {prepare=0, cancel, acquired, releasing, destroy}; enum itt_domain_enum { ITT_DOMAIN_FLOW=0, ITT_DOMAIN_MAIN=1, ITT_DOMAIN_ALGO=2, ITT_NUM_DOMAINS }; } // namespace d1 namespace r1 { TBB_EXPORT void __TBB_EXPORTED_FUNC call_itt_notify(int t, void* ptr); TBB_EXPORT void __TBB_EXPORTED_FUNC create_itt_sync(void* ptr, const tchar* objtype, const tchar* objname); TBB_EXPORT void __TBB_EXPORTED_FUNC itt_make_task_group(d1::itt_domain_enum domain, void* group, unsigned long long group_extra, void* parent, unsigned long long parent_extra, string_resource_index name_index); TBB_EXPORT void __TBB_EXPORTED_FUNC itt_task_begin(d1::itt_domain_enum domain, void* task, unsigned long long task_extra, void* parent, unsigned long long parent_extra, string_resource_index name_index); TBB_EXPORT void __TBB_EXPORTED_FUNC itt_task_end(d1::itt_domain_enum domain); TBB_EXPORT void __TBB_EXPORTED_FUNC itt_set_sync_name(void* obj, const tchar* name); TBB_EXPORT void __TBB_EXPORTED_FUNC itt_metadata_str_add(d1::itt_domain_enum domain, void* addr, unsigned long long addr_extra, string_resource_index key, const char* value); TBB_EXPORT void __TBB_EXPORTED_FUNC itt_metadata_ptr_add(d1::itt_domain_enum domain, void* addr, unsigned long long addr_extra, string_resource_index key, void* value); TBB_EXPORT void __TBB_EXPORTED_FUNC itt_relation_add(d1::itt_domain_enum domain, void* addr0, unsigned long long addr0_extra, itt_relation relation, void* addr1, unsigned long long addr1_extra); TBB_EXPORT void __TBB_EXPORTED_FUNC itt_region_begin(d1::itt_domain_enum domain, void* region, unsigned long long region_extra, void* parent, unsigned long long parent_extra, string_resource_index /* 
name_index */); TBB_EXPORT void __TBB_EXPORTED_FUNC itt_region_end(d1::itt_domain_enum domain, void* region, unsigned long long region_extra); } // namespace r1 namespace d1 { #if TBB_USE_PROFILING_TOOLS && (_WIN32||_WIN64) inline std::size_t multibyte_to_widechar(wchar_t* wcs, const char* mbs, std::size_t bufsize) { std::size_t len; mbstowcs_s(&len, wcs, bufsize, mbs, _TRUNCATE); return len; // mbstowcs_s counts null terminator } #endif #if TBB_USE_PROFILING_TOOLS inline void create_itt_sync(void *ptr, const char *objtype, const char *objname) { #if (_WIN32||_WIN64) std::size_t len_type = multibyte_to_widechar(nullptr, objtype, 0); wchar_t *type = new wchar_t[len_type]; multibyte_to_widechar(type, objtype, len_type); std::size_t len_name = multibyte_to_widechar(nullptr, objname, 0); wchar_t *name = new wchar_t[len_name]; multibyte_to_widechar(name, objname, len_name); #else // WIN const char *type = objtype; const char *name = objname; #endif r1::create_itt_sync(ptr, type, name); #if (_WIN32||_WIN64) delete[] type; delete[] name; #endif // WIN } // Distinguish notifications on task for reducing overheads #if TBB_USE_PROFILING_TOOLS == 2 inline void call_itt_task_notify(d1::notify_type t, void *ptr) { r1::call_itt_notify(static_cast(t), ptr); } #else inline void call_itt_task_notify(d1::notify_type, void *) {} #endif // TBB_USE_PROFILING_TOOLS inline void call_itt_notify(d1::notify_type t, void *ptr) { r1::call_itt_notify(static_cast(t), ptr); } #if (_WIN32||_WIN64) && !__MINGW32__ inline void itt_set_sync_name(void* obj, const wchar_t* name) { r1::itt_set_sync_name(obj, name); } inline void itt_set_sync_name(void* obj, const char* name) { std::size_t len_name = multibyte_to_widechar(nullptr, name, 0); wchar_t *obj_name = new wchar_t[len_name]; multibyte_to_widechar(obj_name, name, len_name); r1::itt_set_sync_name(obj, obj_name); delete[] obj_name; } #else inline void itt_set_sync_name( void* obj, const char* name) { r1::itt_set_sync_name(obj, name); } #endif //WIN 
inline void itt_make_task_group(itt_domain_enum domain, void* group, unsigned long long group_extra, void* parent, unsigned long long parent_extra, string_resource_index name_index) { r1::itt_make_task_group(domain, group, group_extra, parent, parent_extra, name_index); } inline void itt_metadata_str_add( itt_domain_enum domain, void *addr, unsigned long long addr_extra, string_resource_index key, const char *value ) { r1::itt_metadata_str_add( domain, addr, addr_extra, key, value ); } inline void register_node_addr(itt_domain_enum domain, void *addr, unsigned long long addr_extra, string_resource_index key, void *value) { r1::itt_metadata_ptr_add(domain, addr, addr_extra, key, value); } inline void itt_relation_add( itt_domain_enum domain, void *addr0, unsigned long long addr0_extra, itt_relation relation, void *addr1, unsigned long long addr1_extra ) { r1::itt_relation_add( domain, addr0, addr0_extra, relation, addr1, addr1_extra ); } inline void itt_task_begin( itt_domain_enum domain, void *task, unsigned long long task_extra, void *parent, unsigned long long parent_extra, string_resource_index name_index ) { r1::itt_task_begin( domain, task, task_extra, parent, parent_extra, name_index ); } inline void itt_task_end( itt_domain_enum domain ) { r1::itt_task_end( domain ); } inline void itt_region_begin( itt_domain_enum domain, void *region, unsigned long long region_extra, void *parent, unsigned long long parent_extra, string_resource_index name_index ) { r1::itt_region_begin( domain, region, region_extra, parent, parent_extra, name_index ); } inline void itt_region_end( itt_domain_enum domain, void *region, unsigned long long region_extra ) { r1::itt_region_end( domain, region, region_extra ); } #else inline void create_itt_sync(void* /*ptr*/, const char* /*objtype*/, const char* /*objname*/) {} inline void call_itt_notify(notify_type /*t*/, void* /*ptr*/) {} inline void call_itt_task_notify(notify_type /*t*/, void* /*ptr*/) {} #endif // TBB_USE_PROFILING_TOOLS 
#if TBB_USE_PROFILING_TOOLS && !(TBB_USE_PROFILING_TOOLS == 2) class event { /** This class supports user event traces through itt. Common use-case is tagging data flow graph tasks (data-id) and visualization by Intel Advisor Flow Graph Analyzer (FGA) **/ // TODO: Replace implementation by itt user event api. const std::string my_name; static void emit_trace(const std::string &input) { itt_metadata_str_add( ITT_DOMAIN_FLOW, nullptr, FLOW_NULL, USER_EVENT, ( "FGA::DATAID::" + input ).c_str() ); } public: event(const std::string &input) : my_name( input ) { } void emit() { emit_trace(my_name); } static void emit(const std::string &description) { emit_trace(description); } }; #else // TBB_USE_PROFILING_TOOLS && !(TBB_USE_PROFILING_TOOLS == 2) // Using empty struct if user event tracing is disabled: struct event { event(const std::string &) { } void emit() { } static void emit(const std::string &) { } }; #endif // TBB_USE_PROFILING_TOOLS && !(TBB_USE_PROFILING_TOOLS == 2) } // namespace d1 } // namespace detail namespace profiling { using detail::d1::event; } } // namespace tbb #endif /* __TBB_profiling_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/queuing_mutex.h ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_queuing_mutex_H #define __TBB_queuing_mutex_H #include "detail/_namespace_injection.h" #include "detail/_assert.h" #include "detail/_utils.h" #include "detail/_mutex_common.h" #include "profiling.h" #include namespace tbb { namespace detail { namespace d1 { //! Queuing mutex with local-only spinning. /** @ingroup synchronization */ class queuing_mutex { public: //! Construct unacquired mutex. queuing_mutex() noexcept { create_itt_sync(this, "tbb::queuing_mutex", ""); }; queuing_mutex(const queuing_mutex&) = delete; queuing_mutex& operator=(const queuing_mutex&) = delete; //! The scoped locking pattern /** It helps to avoid the common problem of forgetting to release lock. It also nicely provides the "node" for queuing locks. */ class scoped_lock { //! Reset fields to mean "no lock held". void reset() { m_mutex = nullptr; } public: //! Construct lock that has not acquired a mutex. /** Equivalent to zero-initialization of *this. */ scoped_lock() = default; //! Acquire lock on given mutex. scoped_lock(queuing_mutex& m) { acquire(m); } //! Release lock (if lock is held). ~scoped_lock() { if (m_mutex) release(); } //! No Copy scoped_lock( const scoped_lock& ) = delete; scoped_lock& operator=( const scoped_lock& ) = delete; //! Acquire lock on given mutex. void acquire( queuing_mutex& m ) { __TBB_ASSERT(!m_mutex, "scoped_lock is already holding a mutex"); // Must set all fields before the exchange, because once the // exchange executes, *this becomes accessible to other threads. m_mutex = &m; m_next.store(nullptr, std::memory_order_relaxed); m_going.store(0U, std::memory_order_relaxed); // x86 compare exchange operation always has a strong fence // "sending" the fields initialized above to other processors. 
scoped_lock* pred = m.q_tail.exchange(this); if (pred) { call_itt_notify(prepare, &m); __TBB_ASSERT(pred->m_next.load(std::memory_order_relaxed) == nullptr, "the predecessor has another successor!"); pred->m_next.store(this, std::memory_order_release); spin_wait_while_eq(m_going, 0U); } call_itt_notify(acquired, &m); } //! Acquire lock on given mutex if free (i.e. non-blocking) bool try_acquire( queuing_mutex& m ) { __TBB_ASSERT(!m_mutex, "scoped_lock is already holding a mutex"); // Must set all fields before the compare_exchange_strong, because once the // compare_exchange_strong executes, *this becomes accessible to other threads. m_next.store(nullptr, std::memory_order_relaxed); m_going.store(0U, std::memory_order_relaxed); scoped_lock* expected = nullptr; // The compare_exchange_strong must have release semantics, because we are // "sending" the fields initialized above to other processors. // x86 compare exchange operation always has a strong fence if (!m.q_tail.compare_exchange_strong(expected, this, std::memory_order_acq_rel)) return false; m_mutex = &m; call_itt_notify(acquired, &m); return true; } //! Release lock. void release() { __TBB_ASSERT(this->m_mutex, "no lock acquired"); call_itt_notify(releasing, this->m_mutex); if (m_next.load(std::memory_order_relaxed) == nullptr) { scoped_lock* expected = this; if (m_mutex->q_tail.compare_exchange_strong(expected, nullptr)) { // this was the only item in the queue, and the queue is now empty. reset(); return; } // Someone in the queue spin_wait_while_eq(m_next, nullptr); } m_next.load(std::memory_order_acquire)->m_going.store(1U, std::memory_order_release); reset(); } private: //! The pointer to the mutex owned, or nullptr if not holding a mutex. queuing_mutex* m_mutex{nullptr}; //! The pointer to the next competitor for a mutex std::atomic m_next{nullptr}; //! The local spin-wait variable /** Inverted (0 - blocked, 1 - acquired the mutex) for the sake of zero-initialization. 
Defining it as an entire word instead of a byte seems to help performance slightly. */ std::atomic m_going{0U}; }; // Mutex traits static constexpr bool is_rw_mutex = false; static constexpr bool is_recursive_mutex = false; static constexpr bool is_fair_mutex = true; private: //! The last competitor requesting the lock std::atomic q_tail{nullptr}; }; #if TBB_USE_PROFILING_TOOLS inline void set_name(queuing_mutex& obj, const char* name) { itt_set_sync_name(&obj, name); } #if (_WIN32||_WIN64) inline void set_name(queuing_mutex& obj, const wchar_t* name) { itt_set_sync_name(&obj, name); } #endif //WIN #else inline void set_name(queuing_mutex&, const char*) {} #if (_WIN32||_WIN64) inline void set_name(queuing_mutex&, const wchar_t*) {} #endif //WIN #endif } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::queuing_mutex; } // namespace v1 namespace profiling { using detail::d1::set_name; } } // namespace tbb #endif /* __TBB_queuing_mutex_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/queuing_rw_mutex.h ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_queuing_rw_mutex_H #define __TBB_queuing_rw_mutex_H #include "detail/_config.h" #include "detail/_namespace_injection.h" #include "detail/_assert.h" #include "detail/_mutex_common.h" #include "profiling.h" #include #include namespace tbb { namespace detail { namespace r1 { struct queuing_rw_mutex_impl; } namespace d1 { //! Queuing reader-writer mutex with local-only spinning. /** Adapted from Krieger, Stumm, et al. pseudocode at https://www.researchgate.net/publication/221083709_A_Fair_Fast_Scalable_Reader-Writer_Lock @ingroup synchronization */ class queuing_rw_mutex { friend r1::queuing_rw_mutex_impl; public: //! Construct unacquired mutex. queuing_rw_mutex() noexcept { create_itt_sync(this, "tbb::queuing_rw_mutex", ""); } //! Destructor asserts if the mutex is acquired, i.e. q_tail is non-null ~queuing_rw_mutex() { __TBB_ASSERT(q_tail.load(std::memory_order_relaxed) == nullptr, "destruction of an acquired mutex"); } //! No Copy queuing_rw_mutex(const queuing_rw_mutex&) = delete; queuing_rw_mutex& operator=(const queuing_rw_mutex&) = delete; //! The scoped locking pattern /** It helps to avoid the common problem of forgetting to release lock. It also nicely provides the "node" for queuing locks. */ class scoped_lock { friend r1::queuing_rw_mutex_impl; //! Initialize fields to mean "no lock held". void initialize() { my_mutex = nullptr; my_internal_lock.store(0, std::memory_order_relaxed); my_going.store(0, std::memory_order_relaxed); #if TBB_USE_ASSERT my_state = 0xFF; // Set to invalid state my_next.store(reinterpret_cast(reinterpret_cast(-1)), std::memory_order_relaxed); my_prev.store(reinterpret_cast(reinterpret_cast(-1)), std::memory_order_relaxed); #endif /* TBB_USE_ASSERT */ } public: //! Construct lock that has not acquired a mutex. /** Equivalent to zero-initialization of *this. */ scoped_lock() {initialize();} //! Acquire lock on given mutex. scoped_lock( queuing_rw_mutex& m, bool write=true ) { initialize(); acquire(m,write); } //! 
Release lock (if lock is held). ~scoped_lock() { if( my_mutex ) release(); } //! No Copy scoped_lock(const scoped_lock&) = delete; scoped_lock& operator=(const scoped_lock&) = delete; //! Acquire lock on given mutex. void acquire( queuing_rw_mutex& m, bool write=true ); //! Acquire lock on given mutex if free (i.e. non-blocking) bool try_acquire( queuing_rw_mutex& m, bool write=true ); //! Release lock. void release(); //! Upgrade reader to become a writer. /** Returns whether the upgrade happened without releasing and re-acquiring the lock */ bool upgrade_to_writer(); //! Downgrade writer to become a reader. bool downgrade_to_reader(); bool is_writer() const; private: //! The pointer to the mutex owned, or nullptr if not holding a mutex. queuing_rw_mutex* my_mutex; //! The 'pointer' to the previous and next competitors for a mutex std::atomic my_prev; std::atomic my_next; using state_t = unsigned char ; //! State of the request: reader, writer, active reader, other service states std::atomic my_state; //! The local spin-wait variable /** Corresponds to "spin" in the pseudocode but inverted for the sake of zero-initialization */ std::atomic my_going; //! A tiny internal lock std::atomic my_internal_lock; }; // Mutex traits static constexpr bool is_rw_mutex = true; static constexpr bool is_recursive_mutex = false; static constexpr bool is_fair_mutex = true; private: //! 
The last competitor requesting the lock std::atomic q_tail{nullptr}; }; #if TBB_USE_PROFILING_TOOLS inline void set_name(queuing_rw_mutex& obj, const char* name) { itt_set_sync_name(&obj, name); } #if (_WIN32||_WIN64) inline void set_name(queuing_rw_mutex& obj, const wchar_t* name) { itt_set_sync_name(&obj, name); } #endif //WIN #else inline void set_name(queuing_rw_mutex&, const char*) {} #if (_WIN32||_WIN64) inline void set_name(queuing_rw_mutex&, const wchar_t*) {} #endif //WIN #endif } // namespace d1 namespace r1 { TBB_EXPORT void acquire(d1::queuing_rw_mutex&, d1::queuing_rw_mutex::scoped_lock&, bool); TBB_EXPORT bool try_acquire(d1::queuing_rw_mutex&, d1::queuing_rw_mutex::scoped_lock&, bool); TBB_EXPORT void release(d1::queuing_rw_mutex::scoped_lock&); TBB_EXPORT bool upgrade_to_writer(d1::queuing_rw_mutex::scoped_lock&); TBB_EXPORT bool downgrade_to_reader(d1::queuing_rw_mutex::scoped_lock&); TBB_EXPORT bool is_writer(const d1::queuing_rw_mutex::scoped_lock&); } // namespace r1 namespace d1 { inline void queuing_rw_mutex::scoped_lock::acquire(queuing_rw_mutex& m,bool write) { r1::acquire(m, *this, write); } inline bool queuing_rw_mutex::scoped_lock::try_acquire(queuing_rw_mutex& m, bool write) { return r1::try_acquire(m, *this, write); } inline void queuing_rw_mutex::scoped_lock::release() { r1::release(*this); } inline bool queuing_rw_mutex::scoped_lock::upgrade_to_writer() { return r1::upgrade_to_writer(*this); } inline bool queuing_rw_mutex::scoped_lock::downgrade_to_reader() { return r1::downgrade_to_reader(*this); } inline bool queuing_rw_mutex::scoped_lock::is_writer() const { return r1::is_writer(*this); } } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::queuing_rw_mutex; } // namespace v1 namespace profiling { using detail::d1::set_name; } } // namespace tbb #endif /* __TBB_queuing_rw_mutex_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/rw_mutex.h 
================================================ /* Copyright (c) 2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_rw_mutex_H #define __TBB_rw_mutex_H #include "detail/_namespace_injection.h" #include "detail/_utils.h" #include "detail/_waitable_atomic.h" #include "detail/_scoped_lock.h" #include "detail/_mutex_common.h" #include "profiling.h" namespace tbb { namespace detail { namespace d1 { class rw_mutex { public: //! Constructors rw_mutex() noexcept : m_state(0) { create_itt_sync(this, "tbb::rw_mutex", ""); } //! Destructor ~rw_mutex() { __TBB_ASSERT(!m_state.load(std::memory_order_relaxed), "destruction of an acquired mutex"); } //! No Copy rw_mutex(const rw_mutex&) = delete; rw_mutex& operator=(const rw_mutex&) = delete; using scoped_lock = rw_scoped_lock; //! Mutex traits static constexpr bool is_rw_mutex = true; static constexpr bool is_recursive_mutex = false; static constexpr bool is_fair_mutex = false; //! Acquire lock void lock() { call_itt_notify(prepare, this); while (!try_lock()) { if (!(m_state.load(std::memory_order_relaxed) & WRITER_PENDING)) { // no pending writers m_state |= WRITER_PENDING; } auto wakeup_condition = [&] { return !(m_state.load(std::memory_order_relaxed) & BUSY); }; adaptive_wait_on_address(this, wakeup_condition, WRITER_CONTEXT); } call_itt_notify(acquired, this); } //! Try acquiring lock (non-blocking) /** Return true if lock acquired; false otherwise. 
*/ bool try_lock() { // for a writer: only possible to acquire if no active readers or writers // Use relaxed memory fence is OK here because // Acquire memory fence guaranteed by compare_exchange_strong() state_type s = m_state.load(std::memory_order_relaxed); if (!(s & BUSY)) { // no readers, no writers; mask is 1..1101 if (m_state.compare_exchange_strong(s, WRITER)) { call_itt_notify(acquired, this); return true; // successfully stored writer flag } } return false; } //! Release lock void unlock() { call_itt_notify(releasing, this); state_type curr_state = (m_state &= READERS | WRITER_PENDING); // Returns current state if (curr_state & WRITER_PENDING) { r1::notify_by_address(this, WRITER_CONTEXT); } else { // It's possible that WRITER sleeps without WRITER_PENDING, // because other thread might clear this bit at upgrade() r1::notify_by_address_all(this); } } //! Lock shared ownership mutex void lock_shared() { call_itt_notify(prepare, this); while (!try_lock_shared()) { state_type has_writer = WRITER | WRITER_PENDING; auto wakeup_condition = [&] { return !(m_state.load(std::memory_order_relaxed) & has_writer); }; adaptive_wait_on_address(this, wakeup_condition, READER_CONTEXT); } __TBB_ASSERT(m_state.load(std::memory_order_relaxed) & READERS, "invalid state of a read lock: no readers"); } //! Try lock shared ownership mutex bool try_lock_shared() { // for a reader: acquire if no active or waiting writers // Use relaxed memory fence is OK here because // Acquire memory fence guaranteed by fetch_add() state_type has_writer = WRITER | WRITER_PENDING; if (!(m_state.load(std::memory_order_relaxed) & has_writer)) { if (m_state.fetch_add(ONE_READER) & has_writer) { m_state -= ONE_READER; r1::notify_by_address(this, WRITER_CONTEXT); } else { call_itt_notify(acquired, this); return true; // successfully stored increased number of readers } } return false; } //! 
Unlock shared ownership mutex void unlock_shared() { __TBB_ASSERT(m_state.load(std::memory_order_relaxed) & READERS, "invalid state of a read lock: no readers"); call_itt_notify(releasing, this); state_type curr_state = (m_state -= ONE_READER); // Returns current state if (curr_state & (WRITER_PENDING)) { r1::notify_by_address(this, WRITER_CONTEXT); } else { // It's possible that WRITER sleeps without WRITER_PENDING, // because other thread might clear this bit at upgrade() r1::notify_by_address_all(this); } } private: /** Internal non ISO C++ standard API **/ //! This API is used through the scoped_lock class //! Upgrade reader to become a writer. /** Returns whether the upgrade happened without releasing and re-acquiring the lock */ bool upgrade() { state_type s = m_state.load(std::memory_order_relaxed); __TBB_ASSERT(s & READERS, "invalid state before upgrade: no readers "); // Check and set writer-pending flag. // Required conditions: either no pending writers, or we are the only reader // (with multiple readers and pending writer, another upgrade could have been requested) while ((s & READERS) == ONE_READER || !(s & WRITER_PENDING)) { if (m_state.compare_exchange_strong(s, s | WRITER | WRITER_PENDING)) { auto wakeup_condition = [&] { return (m_state.load(std::memory_order_relaxed) & READERS) == ONE_READER; }; while ((m_state.load(std::memory_order_relaxed) & READERS) != ONE_READER) { adaptive_wait_on_address(this, wakeup_condition, WRITER_CONTEXT); } __TBB_ASSERT((m_state.load(std::memory_order_relaxed) & (WRITER_PENDING|WRITER)) == (WRITER_PENDING | WRITER), "invalid state when upgrading to writer"); // Both new readers and writers are blocked at this time m_state -= (ONE_READER + WRITER_PENDING); return true; // successfully upgraded } } // Slow reacquire unlock_shared(); lock(); return false; } //! 
Downgrade writer to a reader void downgrade() { __TBB_ASSERT(m_state.load(std::memory_order_relaxed) & WRITER, nullptr), call_itt_notify(releasing, this); m_state += (ONE_READER - WRITER); if (!(m_state & WRITER_PENDING)) { r1::notify_by_address(this, READER_CONTEXT); } __TBB_ASSERT(m_state.load(std::memory_order_relaxed) & READERS, "invalid state after downgrade: no readers"); } using state_type = std::intptr_t; static constexpr state_type WRITER = 1; static constexpr state_type WRITER_PENDING = 2; static constexpr state_type READERS = ~(WRITER | WRITER_PENDING); static constexpr state_type ONE_READER = 4; static constexpr state_type BUSY = WRITER | READERS; using context_type = std::uintptr_t; static constexpr context_type WRITER_CONTEXT = 0; static constexpr context_type READER_CONTEXT = 1; friend scoped_lock; //! State of lock /** Bit 0 = writer is holding lock Bit 1 = request by a writer to acquire lock (hint to readers to wait) Bit 2..N = number of readers holding lock */ std::atomic m_state; }; // class rw_mutex } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::rw_mutex; } // namespace v1 } // namespace tbb #endif // __TBB_rw_mutex_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/scalable_allocator.h ================================================ /* Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_scalable_allocator_H #define __TBB_scalable_allocator_H #ifdef __cplusplus #include "oneapi/tbb/detail/_config.h" #include "oneapi/tbb/detail/_utils.h" #include "oneapi/tbb/detail/_namespace_injection.h" #include #include #include /* std::bad_alloc() */ #else #include "oneapi/tbb/detail/_export.h" #include /* Need ptrdiff_t and size_t from here. */ #if !defined(_MSC_VER) || defined(__clang__) #include /* Need intptr_t from here. */ #endif #endif #if __TBB_CPP17_MEMORY_RESOURCE_PRESENT #include #endif #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ #if _MSC_VER #define __TBB_EXPORTED_FUNC __cdecl #else #define __TBB_EXPORTED_FUNC #endif /** The "malloc" analogue to allocate block of memory of size bytes. * @ingroup memory_allocation */ TBBMALLOC_EXPORT void* __TBB_EXPORTED_FUNC scalable_malloc(size_t size); /** The "free" analogue to discard a previously allocated piece of memory. @ingroup memory_allocation */ TBBMALLOC_EXPORT void __TBB_EXPORTED_FUNC scalable_free(void* ptr); /** The "realloc" analogue complementing scalable_malloc. @ingroup memory_allocation */ TBBMALLOC_EXPORT void* __TBB_EXPORTED_FUNC scalable_realloc(void* ptr, size_t size); /** The "calloc" analogue complementing scalable_malloc. @ingroup memory_allocation */ TBBMALLOC_EXPORT void* __TBB_EXPORTED_FUNC scalable_calloc(size_t nobj, size_t size); /** The "posix_memalign" analogue. @ingroup memory_allocation */ TBBMALLOC_EXPORT int __TBB_EXPORTED_FUNC scalable_posix_memalign(void** memptr, size_t alignment, size_t size); /** The "_aligned_malloc" analogue. @ingroup memory_allocation */ TBBMALLOC_EXPORT void* __TBB_EXPORTED_FUNC scalable_aligned_malloc(size_t size, size_t alignment); /** The "_aligned_realloc" analogue. @ingroup memory_allocation */ TBBMALLOC_EXPORT void* __TBB_EXPORTED_FUNC scalable_aligned_realloc(void* ptr, size_t size, size_t alignment); /** The "_aligned_free" analogue. 
@ingroup memory_allocation */ TBBMALLOC_EXPORT void __TBB_EXPORTED_FUNC scalable_aligned_free(void* ptr); /** The analogue of _msize/malloc_size/malloc_usable_size. Returns the usable size of a memory block previously allocated by scalable_*, or 0 (zero) if ptr does not point to such a block. @ingroup memory_allocation */ TBBMALLOC_EXPORT size_t __TBB_EXPORTED_FUNC scalable_msize(void* ptr); /* Results for scalable_allocation_* functions */ typedef enum { TBBMALLOC_OK, TBBMALLOC_INVALID_PARAM, TBBMALLOC_UNSUPPORTED, TBBMALLOC_NO_MEMORY, TBBMALLOC_NO_EFFECT } ScalableAllocationResult; /* Setting TBB_MALLOC_USE_HUGE_PAGES environment variable to 1 enables huge pages. scalable_allocation_mode call has priority over environment variable. */ typedef enum { TBBMALLOC_USE_HUGE_PAGES, /* value turns using huge pages on and off */ /* deprecated, kept for backward compatibility only */ USE_HUGE_PAGES = TBBMALLOC_USE_HUGE_PAGES, /* try to limit memory consumption value (Bytes), clean internal buffers if limit is exceeded, but not prevents from requesting memory from OS */ TBBMALLOC_SET_SOFT_HEAP_LIMIT, /* Lower bound for the size (Bytes), that is interpreted as huge * and not released during regular cleanup operations. */ TBBMALLOC_SET_HUGE_SIZE_THRESHOLD } AllocationModeParam; /** Set TBB allocator-specific allocation modes. @ingroup memory_allocation */ TBBMALLOC_EXPORT int __TBB_EXPORTED_FUNC scalable_allocation_mode(int param, intptr_t value); typedef enum { /* Clean internal allocator buffers for all threads. Returns TBBMALLOC_NO_EFFECT if no buffers cleaned, TBBMALLOC_OK if some memory released from buffers. */ TBBMALLOC_CLEAN_ALL_BUFFERS, /* Clean internal allocator buffer for current thread only. Return values same as for TBBMALLOC_CLEAN_ALL_BUFFERS. */ TBBMALLOC_CLEAN_THREAD_BUFFERS } ScalableAllocationCmd; /** Call TBB allocator-specific commands. 
@ingroup memory_allocation */ TBBMALLOC_EXPORT int __TBB_EXPORTED_FUNC scalable_allocation_command(int cmd, void *param); #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ #ifdef __cplusplus //! The namespace rml contains components of low-level memory pool interface. namespace rml { class MemoryPool; typedef void *(*rawAllocType)(std::intptr_t pool_id, std::size_t &bytes); // returns non-zero in case of error typedef int (*rawFreeType)(std::intptr_t pool_id, void* raw_ptr, std::size_t raw_bytes); struct MemPoolPolicy { enum { TBBMALLOC_POOL_VERSION = 1 }; rawAllocType pAlloc; rawFreeType pFree; // granularity of pAlloc allocations. 0 means default used. std::size_t granularity; int version; // all memory consumed at 1st pAlloc call and never returned, // no more pAlloc calls after 1st unsigned fixedPool : 1, // memory consumed but returned only at pool termination keepAllMemory : 1, reserved : 30; MemPoolPolicy(rawAllocType pAlloc_, rawFreeType pFree_, std::size_t granularity_ = 0, bool fixedPool_ = false, bool keepAllMemory_ = false) : pAlloc(pAlloc_), pFree(pFree_), granularity(granularity_), version(TBBMALLOC_POOL_VERSION), fixedPool(fixedPool_), keepAllMemory(keepAllMemory_), reserved(0) {} }; // enums have same values as appropriate enums from ScalableAllocationResult // TODO: use ScalableAllocationResult in pool_create directly enum MemPoolError { // pool created successfully POOL_OK = TBBMALLOC_OK, // invalid policy parameters found INVALID_POLICY = TBBMALLOC_INVALID_PARAM, // requested pool policy is not supported by allocator library UNSUPPORTED_POLICY = TBBMALLOC_UNSUPPORTED, // lack of memory during pool creation NO_MEMORY = TBBMALLOC_NO_MEMORY, // action takes no effect NO_EFFECT = TBBMALLOC_NO_EFFECT }; TBBMALLOC_EXPORT MemPoolError pool_create_v1(std::intptr_t pool_id, const MemPoolPolicy *policy, rml::MemoryPool **pool); TBBMALLOC_EXPORT bool pool_destroy(MemoryPool* memPool); TBBMALLOC_EXPORT void *pool_malloc(MemoryPool* memPool, 
std::size_t size); TBBMALLOC_EXPORT void *pool_realloc(MemoryPool* memPool, void *object, std::size_t size); TBBMALLOC_EXPORT void *pool_aligned_malloc(MemoryPool* mPool, std::size_t size, std::size_t alignment); TBBMALLOC_EXPORT void *pool_aligned_realloc(MemoryPool* mPool, void *ptr, std::size_t size, std::size_t alignment); TBBMALLOC_EXPORT bool pool_reset(MemoryPool* memPool); TBBMALLOC_EXPORT bool pool_free(MemoryPool *memPool, void *object); TBBMALLOC_EXPORT MemoryPool *pool_identify(void *object); TBBMALLOC_EXPORT std::size_t pool_msize(MemoryPool *memPool, void *object); } // namespace rml namespace tbb { namespace detail { namespace d1 { // keep throw in a separate function to prevent code bloat template void throw_exception(const E &e) { #if TBB_USE_EXCEPTIONS throw e; #else suppress_unused_warning(e); #endif } template class scalable_allocator { public: using value_type = T; using propagate_on_container_move_assignment = std::true_type; //! Always defined for TBB containers using is_always_equal = std::true_type; scalable_allocator() = default; template scalable_allocator(const scalable_allocator&) noexcept {} //! Allocate space for n objects. __TBB_nodiscard T* allocate(std::size_t n) { T* p = static_cast(scalable_malloc(n * sizeof(value_type))); if (!p) { throw_exception(std::bad_alloc()); } return p; } //! Free previously allocated block of memory void deallocate(T* p, std::size_t) { scalable_free(p); } #if TBB_ALLOCATOR_TRAITS_BROKEN using pointer = value_type*; using const_pointer = const value_type*; using reference = value_type&; using const_reference = const value_type&; using difference_type = std::ptrdiff_t; using size_type = std::size_t; template struct rebind { using other = scalable_allocator; }; //! Largest value for which method allocate might succeed. size_type max_size() const noexcept { size_type absolutemax = static_cast(-1) / sizeof (value_type); return (absolutemax > 0 ? absolutemax : 1); } template void construct(U *p, Args&&... 
args) { ::new((void *)p) U(std::forward(args)...); } void destroy(pointer p) { p->~value_type(); } pointer address(reference x) const { return &x; } const_pointer address(const_reference x) const { return &x; } #endif // TBB_ALLOCATOR_TRAITS_BROKEN }; #if TBB_ALLOCATOR_TRAITS_BROKEN template<> class scalable_allocator { public: using pointer = void*; using const_pointer = const void*; using value_type = void; template struct rebind { using other = scalable_allocator; }; }; #endif template inline bool operator==(const scalable_allocator&, const scalable_allocator&) noexcept { return true; } #if !__TBB_CPP20_COMPARISONS_PRESENT template inline bool operator!=(const scalable_allocator&, const scalable_allocator&) noexcept { return false; } #endif #if __TBB_CPP17_MEMORY_RESOURCE_PRESENT //! C++17 memory resource implementation for scalable allocator //! ISO C++ Section 23.12.2 class scalable_resource_impl : public std::pmr::memory_resource { private: void* do_allocate(std::size_t bytes, std::size_t alignment) override { void* p = scalable_aligned_malloc(bytes, alignment); if (!p) { throw_exception(std::bad_alloc()); } return p; } void do_deallocate(void* ptr, std::size_t /*bytes*/, std::size_t /*alignment*/) override { scalable_free(ptr); } //! Memory allocated by one instance of scalable_resource_impl could be deallocated by any //! other instance of this class bool do_is_equal(const std::pmr::memory_resource& other) const noexcept override { return this == &other || #if __TBB_USE_OPTIONAL_RTTI dynamic_cast(&other) != nullptr; #else false; #endif } }; //! 
Global scalable allocator memory resource provider inline std::pmr::memory_resource* scalable_memory_resource() noexcept { static tbb::detail::d1::scalable_resource_impl scalable_res; return &scalable_res; } #endif // __TBB_CPP17_MEMORY_RESOURCE_PRESENT } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::scalable_allocator; #if __TBB_CPP17_MEMORY_RESOURCE_PRESENT using detail::d1::scalable_memory_resource; #endif } // namespace v1 } // namespace tbb #endif /* __cplusplus */ #endif /* __TBB_scalable_allocator_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/spin_mutex.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_spin_mutex_H #define __TBB_spin_mutex_H #include "detail/_namespace_injection.h" #include "detail/_mutex_common.h" #include "profiling.h" #include "detail/_assert.h" #include "detail/_utils.h" #include "detail/_scoped_lock.h" #include namespace tbb { namespace detail { namespace d1 { #if __TBB_TSX_INTRINSICS_PRESENT class rtm_mutex; #endif /** A spin_mutex is a low-level synchronization primitive. While locked, it causes the waiting threads to spin in a loop until the lock is released. It should be used only for locking short critical sections (typically less than 20 instructions) when fairness is not an issue. If zero-initialized, the mutex is considered unheld. 
@ingroup synchronization */ class spin_mutex { public: //! Constructors spin_mutex() noexcept : m_flag(false) { create_itt_sync(this, "tbb::spin_mutex", ""); }; //! Destructor ~spin_mutex() = default; //! No Copy spin_mutex(const spin_mutex&) = delete; spin_mutex& operator=(const spin_mutex&) = delete; using scoped_lock = unique_scoped_lock; //! Mutex traits static constexpr bool is_rw_mutex = false; static constexpr bool is_recursive_mutex = false; static constexpr bool is_fair_mutex = false; //! Acquire lock /** Spin if the lock is taken */ void lock() { atomic_backoff backoff; call_itt_notify(prepare, this); while (m_flag.exchange(true)) backoff.pause(); call_itt_notify(acquired, this); } //! Try acquiring lock (non-blocking) /** Return true if lock acquired; false otherwise. */ bool try_lock() { bool result = !m_flag.exchange(true); if (result) { call_itt_notify(acquired, this); } return result; } //! Release lock void unlock() { call_itt_notify(releasing, this); m_flag.store(false, std::memory_order_release); } protected: std::atomic m_flag; }; // class spin_mutex #if TBB_USE_PROFILING_TOOLS inline void set_name(spin_mutex& obj, const char* name) { itt_set_sync_name(&obj, name); } #if (_WIN32||_WIN64) inline void set_name(spin_mutex& obj, const wchar_t* name) { itt_set_sync_name(&obj, name); } #endif //WIN #else inline void set_name(spin_mutex&, const char*) {} #if (_WIN32||_WIN64) inline void set_name(spin_mutex&, const wchar_t*) {} #endif // WIN #endif } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::spin_mutex; } // namespace v1 namespace profiling { using detail::d1::set_name; } } // namespace tbb #include "detail/_rtm_mutex.h" namespace tbb { inline namespace v1 { #if __TBB_TSX_INTRINSICS_PRESENT using speculative_spin_mutex = detail::d1::rtm_mutex; #else using speculative_spin_mutex = detail::d1::spin_mutex; #endif } } #endif /* __TBB_spin_mutex_H */ ================================================ FILE: 
third-party/tbb/include/oneapi/tbb/spin_rw_mutex.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_spin_rw_mutex_H #define __TBB_spin_rw_mutex_H #include "detail/_namespace_injection.h" #include "detail/_mutex_common.h" #include "profiling.h" #include "detail/_assert.h" #include "detail/_utils.h" #include "detail/_scoped_lock.h" #include namespace tbb { namespace detail { namespace d1 { #if __TBB_TSX_INTRINSICS_PRESENT class rtm_rw_mutex; #endif //! Fast, unfair, spinning reader-writer lock with backoff and writer-preference /** @ingroup synchronization */ class spin_rw_mutex { public: //! Constructors spin_rw_mutex() noexcept : m_state(0) { create_itt_sync(this, "tbb::spin_rw_mutex", ""); } //! Destructor ~spin_rw_mutex() { __TBB_ASSERT(!m_state, "destruction of an acquired mutex"); } //! No Copy spin_rw_mutex(const spin_rw_mutex&) = delete; spin_rw_mutex& operator=(const spin_rw_mutex&) = delete; using scoped_lock = rw_scoped_lock; //! Mutex traits static constexpr bool is_rw_mutex = true; static constexpr bool is_recursive_mutex = false; static constexpr bool is_fair_mutex = false; //! 
Acquire lock void lock() { call_itt_notify(prepare, this); for (atomic_backoff backoff; ; backoff.pause()) { state_type s = m_state.load(std::memory_order_relaxed); if (!(s & BUSY)) { // no readers, no writers if (m_state.compare_exchange_strong(s, WRITER)) break; // successfully stored writer flag backoff.reset(); // we could be very close to complete op. } else if (!(s & WRITER_PENDING)) { // no pending writers m_state |= WRITER_PENDING; } } call_itt_notify(acquired, this); } //! Try acquiring lock (non-blocking) /** Return true if lock acquired; false otherwise. */ bool try_lock() { // for a writer: only possible to acquire if no active readers or writers state_type s = m_state.load(std::memory_order_relaxed); if (!(s & BUSY)) { // no readers, no writers; mask is 1..1101 if (m_state.compare_exchange_strong(s, WRITER)) { call_itt_notify(acquired, this); return true; // successfully stored writer flag } } return false; } //! Release lock void unlock() { call_itt_notify(releasing, this); m_state &= READERS; } //! Lock shared ownership mutex void lock_shared() { call_itt_notify(prepare, this); for (atomic_backoff b; ; b.pause()) { state_type s = m_state.load(std::memory_order_relaxed); if (!(s & (WRITER | WRITER_PENDING))) { // no writer or write requests state_type prev_state = m_state.fetch_add(ONE_READER); if (!(prev_state & WRITER)) { break; // successfully stored increased number of readers } // writer got there first, undo the increment m_state -= ONE_READER; } } call_itt_notify(acquired, this); __TBB_ASSERT(m_state & READERS, "invalid state of a read lock: no readers"); } //! 
Try lock shared ownership mutex bool try_lock_shared() { // for a reader: acquire if no active or waiting writers state_type s = m_state.load(std::memory_order_relaxed); if (!(s & (WRITER | WRITER_PENDING))) { // no writers state_type prev_state = m_state.fetch_add(ONE_READER); if (!(prev_state & WRITER)) { // got the lock call_itt_notify(acquired, this); return true; // successfully stored increased number of readers } // writer got there first, undo the increment m_state -= ONE_READER; } return false; } //! Unlock shared ownership mutex void unlock_shared() { __TBB_ASSERT(m_state & READERS, "invalid state of a read lock: no readers"); call_itt_notify(releasing, this); m_state -= ONE_READER; } protected: /** Internal non ISO C++ standard API **/ //! This API is used through the scoped_lock class //! Upgrade reader to become a writer. /** Returns whether the upgrade happened without releasing and re-acquiring the lock */ bool upgrade() { state_type s = m_state.load(std::memory_order_relaxed); __TBB_ASSERT(s & READERS, "invalid state before upgrade: no readers "); // Check and set writer-pending flag. // Required conditions: either no pending writers, or we are the only reader // (with multiple readers and pending writer, another upgrade could have been requested) while ((s & READERS) == ONE_READER || !(s & WRITER_PENDING)) { if (m_state.compare_exchange_strong(s, s | WRITER | WRITER_PENDING)) { atomic_backoff backoff; while ((m_state.load(std::memory_order_relaxed) & READERS) != ONE_READER) backoff.pause(); __TBB_ASSERT((m_state & (WRITER_PENDING|WRITER)) == (WRITER_PENDING | WRITER), "invalid state when upgrading to writer"); // Both new readers and writers are blocked at this time m_state -= (ONE_READER + WRITER_PENDING); return true; // successfully upgraded } } // Slow reacquire unlock_shared(); lock(); return false; } //! 
Downgrade writer to a reader void downgrade() { call_itt_notify(releasing, this); m_state += (ONE_READER - WRITER); __TBB_ASSERT(m_state & READERS, "invalid state after downgrade: no readers"); } using state_type = std::intptr_t; static constexpr state_type WRITER = 1; static constexpr state_type WRITER_PENDING = 2; static constexpr state_type READERS = ~(WRITER | WRITER_PENDING); static constexpr state_type ONE_READER = 4; static constexpr state_type BUSY = WRITER | READERS; friend scoped_lock; //! State of lock /** Bit 0 = writer is holding lock Bit 1 = request by a writer to acquire lock (hint to readers to wait) Bit 2..N = number of readers holding lock */ std::atomic m_state; }; // class spin_rw_mutex #if TBB_USE_PROFILING_TOOLS inline void set_name(spin_rw_mutex& obj, const char* name) { itt_set_sync_name(&obj, name); } #if (_WIN32||_WIN64) inline void set_name(spin_rw_mutex& obj, const wchar_t* name) { itt_set_sync_name(&obj, name); } #endif // WIN #else inline void set_name(spin_rw_mutex&, const char*) {} #if (_WIN32||_WIN64) inline void set_name(spin_rw_mutex&, const wchar_t*) {} #endif // WIN #endif } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::spin_rw_mutex; } // namespace v1 namespace profiling { using detail::d1::set_name; } } // namespace tbb #include "detail/_rtm_rw_mutex.h" namespace tbb { inline namespace v1 { #if __TBB_TSX_INTRINSICS_PRESENT using speculative_spin_rw_mutex = detail::d1::rtm_rw_mutex; #else using speculative_spin_rw_mutex = detail::d1::spin_rw_mutex; #endif } } #endif /* __TBB_spin_rw_mutex_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/task.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_task_H #define __TBB_task_H #include "detail/_config.h" #include "detail/_namespace_injection.h" #include "detail/_task.h" namespace tbb { inline namespace v1 { namespace task { #if __TBB_RESUMABLE_TASKS using detail::d1::suspend_point; using detail::d1::resume; using detail::d1::suspend; #endif /* __TBB_RESUMABLE_TASKS */ using detail::d1::current_context; } // namespace task } // namespace v1 } // namespace tbb #endif /* __TBB_task_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/task_arena.h ================================================ /* Copyright (c) 2005-2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_task_arena_H #define __TBB_task_arena_H #include "detail/_config.h" #include "detail/_aligned_space.h" #include "detail/_attach.h" #include "detail/_exception.h" #include "detail/_namespace_injection.h" #include "detail/_small_object_pool.h" #include "detail/_task.h" #include "detail/_task_handle.h" #if __TBB_ARENA_BINDING #include "info.h" #endif /*__TBB_ARENA_BINDING*/ namespace tbb { namespace detail { namespace d1 { template class task_arena_function : public delegate_base { F &my_func; aligned_space my_return_storage; bool my_constructed{false}; // The function should be called only once. bool operator()() const override { new (my_return_storage.begin()) R(my_func()); return true; } public: task_arena_function(F& f) : my_func(f) {} // The function can be called only after operator() and only once. R consume_result() { my_constructed = true; return std::move(*(my_return_storage.begin())); } ~task_arena_function() override { if (my_constructed) { my_return_storage.begin()->~R(); } } }; template class task_arena_function : public delegate_base { F &my_func; bool operator()() const override { my_func(); return true; } public: task_arena_function(F& f) : my_func(f) {} void consume_result() const {} friend class task_arena_base; }; class task_arena_base; class task_scheduler_observer; } // namespace d1 namespace r1 { class arena; struct task_arena_impl; TBB_EXPORT void __TBB_EXPORTED_FUNC observe(d1::task_scheduler_observer&, bool); TBB_EXPORT void __TBB_EXPORTED_FUNC initialize(d1::task_arena_base&); TBB_EXPORT void __TBB_EXPORTED_FUNC terminate(d1::task_arena_base&); TBB_EXPORT bool __TBB_EXPORTED_FUNC attach(d1::task_arena_base&); TBB_EXPORT void __TBB_EXPORTED_FUNC execute(d1::task_arena_base&, d1::delegate_base&); TBB_EXPORT void __TBB_EXPORTED_FUNC wait(d1::task_arena_base&); TBB_EXPORT int __TBB_EXPORTED_FUNC max_concurrency(const d1::task_arena_base*); TBB_EXPORT void __TBB_EXPORTED_FUNC isolate_within_arena(d1::delegate_base& d, 
std::intptr_t); TBB_EXPORT void __TBB_EXPORTED_FUNC enqueue(d1::task&, d1::task_arena_base*); TBB_EXPORT void __TBB_EXPORTED_FUNC enqueue(d1::task&, d1::task_group_context&, d1::task_arena_base*); TBB_EXPORT void __TBB_EXPORTED_FUNC submit(d1::task&, d1::task_group_context&, arena*, std::uintptr_t); #if __TBB_PREVIEW_PARALLEL_PHASE TBB_EXPORT void __TBB_EXPORTED_FUNC enter_parallel_phase(d1::task_arena_base*, std::uintptr_t); TBB_EXPORT void __TBB_EXPORTED_FUNC exit_parallel_phase(d1::task_arena_base*, std::uintptr_t); #endif } // namespace r1 namespace d2 { inline void enqueue_impl(task_handle&& th, d1::task_arena_base* ta) { __TBB_ASSERT(th != nullptr, "Attempt to schedule empty task_handle"); auto& ctx = task_handle_accessor::ctx_of(th); // Do not access th after release r1::enqueue(*task_handle_accessor::release(th), ctx, ta); } } //namespace d2 namespace d1 { static constexpr unsigned num_priority_levels = 3; static constexpr int priority_stride = INT_MAX / (num_priority_levels + 1); class task_arena_base { friend struct r1::task_arena_impl; friend void r1::observe(d1::task_scheduler_observer&, bool); public: enum class priority : int { low = 1 * priority_stride, normal = 2 * priority_stride, high = 3 * priority_stride }; #if __TBB_PREVIEW_PARALLEL_PHASE enum class leave_policy : int { automatic = 0, fast = 1 }; #endif #if __TBB_ARENA_BINDING using constraints = tbb::detail::d1::constraints; #endif /*__TBB_ARENA_BINDING*/ protected: //! Special settings intptr_t my_version_and_traits; std::atomic my_initialization_state; //! nullptr if not currently initialized. std::atomic my_arena; static_assert(sizeof(std::atomic) == sizeof(r1::arena*), "To preserve backward compatibility we need the equal size of an atomic pointer and a pointer"); //! Concurrency level for deferred initialization int my_max_concurrency; //! Reserved slots for external threads unsigned my_num_reserved_slots; //! Arena priority priority my_priority; //! 
The NUMA node index to which the arena will be attached numa_node_id my_numa_id; //! The core type index to which arena will be attached core_type_id my_core_type; //! Number of threads per core int my_max_threads_per_core; // Backward compatibility checks. core_type_id core_type() const { return (my_version_and_traits & core_type_support_flag) == core_type_support_flag ? my_core_type : automatic; } int max_threads_per_core() const { return (my_version_and_traits & core_type_support_flag) == core_type_support_flag ? my_max_threads_per_core : automatic; } #if __TBB_PREVIEW_PARALLEL_PHASE leave_policy get_leave_policy() const { return (my_version_and_traits & fast_leave_policy_flag) ? leave_policy::fast : leave_policy::automatic; } int leave_policy_trait(leave_policy lp) const { return lp == leave_policy::fast ? fast_leave_policy_flag : 0; } void set_leave_policy(leave_policy lp) { my_version_and_traits |= leave_policy_trait(lp); } #endif enum { default_flags = 0, core_type_support_flag = 1, fast_leave_policy_flag = 1 << 1 }; task_arena_base(int max_concurrency, unsigned reserved_for_masters, priority a_priority #if __TBB_PREVIEW_PARALLEL_PHASE , leave_policy lp #endif ) : my_version_and_traits(default_flags | core_type_support_flag #if __TBB_PREVIEW_PARALLEL_PHASE | leave_policy_trait(lp) #endif ) , my_initialization_state(do_once_state::uninitialized) , my_arena(nullptr) , my_max_concurrency(max_concurrency) , my_num_reserved_slots(reserved_for_masters) , my_priority(a_priority) , my_numa_id(automatic) , my_core_type(automatic) , my_max_threads_per_core(automatic) {} #if __TBB_ARENA_BINDING task_arena_base(const constraints& constraints_, unsigned reserved_for_masters, priority a_priority #if __TBB_PREVIEW_PARALLEL_PHASE , leave_policy lp #endif ) : my_version_and_traits(default_flags | core_type_support_flag #if __TBB_PREVIEW_PARALLEL_PHASE | leave_policy_trait(lp) #endif ) , my_initialization_state(do_once_state::uninitialized) , my_arena(nullptr) , 
my_max_concurrency(constraints_.max_concurrency) , my_num_reserved_slots(reserved_for_masters) , my_priority(a_priority) , my_numa_id(constraints_.numa_id) , my_core_type(constraints_.core_type) , my_max_threads_per_core(constraints_.max_threads_per_core) {} #endif /*__TBB_ARENA_BINDING*/ public: //! Typedef for number of threads that is automatic. static const int automatic = -1; static const int not_initialized = -2; }; template R isolate_impl(F& f) { task_arena_function func(f); r1::isolate_within_arena(func, /*isolation*/ 0); return func.consume_result(); } template class enqueue_task : public task { small_object_allocator m_allocator; const F m_func; void finalize(const execution_data& ed) { m_allocator.delete_object(this, ed); } task* execute(execution_data& ed) override { m_func(); finalize(ed); return nullptr; } task* cancel(execution_data&) override { __TBB_ASSERT_RELEASE(false, "Unhandled exception from enqueue task is caught"); return nullptr; } public: enqueue_task(const F& f, small_object_allocator& alloc) : m_allocator(alloc), m_func(f) {} enqueue_task(F&& f, small_object_allocator& alloc) : m_allocator(alloc), m_func(std::move(f)) {} }; template void enqueue_impl(F&& f, task_arena_base* ta) { small_object_allocator alloc{}; r1::enqueue(*alloc.new_object::type>>(std::forward(f), alloc), ta); } /** 1-to-1 proxy representation class of scheduler's arena * Constructors set up settings only, real construction is deferred till the first method invocation * Destructor only removes one of the references to the inner arena representation. * Final destruction happens when all the references (and the work) are gone. 
*/ class task_arena : public task_arena_base { void mark_initialized() { __TBB_ASSERT( my_arena.load(std::memory_order_relaxed), "task_arena initialization is incomplete" ); my_initialization_state.store(do_once_state::initialized, std::memory_order_release); } template R execute_impl(F& f) { initialize(); task_arena_function func(f); r1::execute(*this, func); return func.consume_result(); } public: //! Creates task_arena with certain concurrency limits /** Sets up settings only, real construction is deferred till the first method invocation * @arg max_concurrency specifies total number of slots in arena where threads work * @arg reserved_for_masters specifies number of slots to be used by external threads only. * Value of 1 is default and reflects behavior of implicit arenas. **/ task_arena(int max_concurrency_ = automatic, unsigned reserved_for_masters = 1, priority a_priority = priority::normal #if __TBB_PREVIEW_PARALLEL_PHASE , leave_policy lp = leave_policy::automatic #endif ) : task_arena_base(max_concurrency_, reserved_for_masters, a_priority #if __TBB_PREVIEW_PARALLEL_PHASE , lp #endif ) {} #if __TBB_ARENA_BINDING //! Creates task arena pinned to certain NUMA node task_arena(const constraints& constraints_, unsigned reserved_for_masters = 1, priority a_priority = priority::normal #if __TBB_PREVIEW_PARALLEL_PHASE , leave_policy lp = leave_policy::automatic #endif ) : task_arena_base(constraints_, reserved_for_masters, a_priority #if __TBB_PREVIEW_PARALLEL_PHASE , lp #endif ) {} //! Copies settings from another task_arena task_arena(const task_arena &a) // copy settings but not the reference or instance : task_arena_base( constraints{} .set_numa_id(a.my_numa_id) .set_max_concurrency(a.my_max_concurrency) .set_core_type(a.my_core_type) .set_max_threads_per_core(a.my_max_threads_per_core) , a.my_num_reserved_slots, a.my_priority #if __TBB_PREVIEW_PARALLEL_PHASE , a.get_leave_policy() #endif ) {} #else //! 
Copies settings from another task_arena task_arena(const task_arena& a) // copy settings but not the reference or instance : task_arena_base(a.my_max_concurrency, a.my_num_reserved_slots, a.my_priority, #if __TBB_PREVIEW_PARALLEL_PHASE a.get_leave_policy() #endif ) {} #endif /*__TBB_ARENA_BINDING*/ //! Tag class used to indicate the "attaching" constructor struct attach {}; //! Creates an instance of task_arena attached to the current arena of the thread explicit task_arena( attach ) : task_arena_base(automatic, 1, priority::normal #if __TBB_PREVIEW_PARALLEL_PHASE , leave_policy::automatic #endif ) // use default settings if attach fails { if (r1::attach(*this)) { mark_initialized(); } } //! Creates an instance of task_arena attached to the current arena of the thread explicit task_arena(d1::attach) : task_arena(attach{}) {} //! Forces allocation of the resources for the task_arena as specified in constructor arguments void initialize() { atomic_do_once([this]{ r1::initialize(*this); }, my_initialization_state); } //! 
Overrides concurrency level and forces initialization of internal representation void initialize(int max_concurrency_, unsigned reserved_for_masters = 1, priority a_priority = priority::normal #if __TBB_PREVIEW_PARALLEL_PHASE , leave_policy lp = leave_policy::automatic #endif ) { __TBB_ASSERT(!my_arena.load(std::memory_order_relaxed), "Impossible to modify settings of an already initialized task_arena"); if( !is_active() ) { my_max_concurrency = max_concurrency_; my_num_reserved_slots = reserved_for_masters; my_priority = a_priority; #if __TBB_PREVIEW_PARALLEL_PHASE set_leave_policy(lp); #endif r1::initialize(*this); mark_initialized(); } } #if __TBB_ARENA_BINDING void initialize(constraints constraints_, unsigned reserved_for_masters = 1, priority a_priority = priority::normal #if __TBB_PREVIEW_PARALLEL_PHASE , leave_policy lp = leave_policy::automatic #endif ) { __TBB_ASSERT(!my_arena.load(std::memory_order_relaxed), "Impossible to modify settings of an already initialized task_arena"); if( !is_active() ) { my_numa_id = constraints_.numa_id; my_max_concurrency = constraints_.max_concurrency; my_core_type = constraints_.core_type; my_max_threads_per_core = constraints_.max_threads_per_core; my_num_reserved_slots = reserved_for_masters; my_priority = a_priority; #if __TBB_PREVIEW_PARALLEL_PHASE set_leave_policy(lp); #endif r1::initialize(*this); mark_initialized(); } } #endif /*__TBB_ARENA_BINDING*/ //! Attaches this instance to the current arena of the thread void initialize(attach) { // TODO: decide if this call must be thread-safe __TBB_ASSERT(!my_arena.load(std::memory_order_relaxed), "Impossible to modify settings of an already initialized task_arena"); if( !is_active() ) { if ( !r1::attach(*this) ) { r1::initialize(*this); } mark_initialized(); } } //! Attaches this instance to the current arena of the thread void initialize(d1::attach) { initialize(attach{}); } //! Removes the reference to the internal arena representation. //! 
Not thread safe wrt concurrent invocations of other methods. void terminate() { if( is_active() ) { r1::terminate(*this); my_initialization_state.store(do_once_state::uninitialized, std::memory_order_relaxed); } } //! Removes the reference to the internal arena representation, and destroys the external object. //! Not thread safe wrt concurrent invocations of other methods. ~task_arena() { terminate(); } //! Returns true if the arena is active (initialized); false otherwise. //! The name was chosen to match a task_scheduler_init method with the same semantics. bool is_active() const { return my_initialization_state.load(std::memory_order_acquire) == do_once_state::initialized; } //! Enqueues a task into the arena to process a functor, and immediately returns. //! Does not require the calling thread to join the arena template void enqueue(F&& f) { initialize(); enqueue_impl(std::forward(f), this); } //! Enqueues a task into the arena to process a functor wrapped in task_handle, and immediately returns. //! Does not require the calling thread to join the arena void enqueue(d2::task_handle&& th) { initialize(); d2::enqueue_impl(std::move(th), this); } //! Joins the arena and executes a mutable functor, then returns //! If not possible to join, wraps the functor into a task, enqueues it and waits for task completion //! Can decrement the arena demand for workers, causing a worker to leave and free a slot to the calling thread //! Since C++11, the method returns the value returned by functor (prior to C++11 it returns void). 
template auto execute(F&& f) -> decltype(f()) { return execute_impl(f); } #if __TBB_PREVIEW_PARALLEL_PHASE void start_parallel_phase() { initialize(); r1::enter_parallel_phase(this, /*reserved*/0); } void end_parallel_phase(bool with_fast_leave = false) { __TBB_ASSERT(my_initialization_state.load(std::memory_order_relaxed) == do_once_state::initialized, nullptr); // It is guaranteed by the standard that conversion of boolean to integral type will result in either 0 or 1 r1::exit_parallel_phase(this, static_cast(with_fast_leave)); } class scoped_parallel_phase { task_arena& arena; bool one_time_fast_leave; public: scoped_parallel_phase(task_arena& ta, bool with_fast_leave = false) : arena(ta), one_time_fast_leave(with_fast_leave) { arena.start_parallel_phase(); } ~scoped_parallel_phase() { arena.end_parallel_phase(one_time_fast_leave); } }; #endif #if __TBB_EXTRA_DEBUG //! Returns my_num_reserved_slots int debug_reserved_slots() const { // Handle special cases inside the library return my_num_reserved_slots; } //! Returns my_max_concurrency int debug_max_concurrency() const { // Handle special cases inside the library return my_max_concurrency; } //! Wait for all work in the arena to be completed //! Even submitted by other application threads //! Joins arena if/when possible (in the same way as execute()) void debug_wait_until_empty() { initialize(); r1::wait(*this); } #endif //__TBB_EXTRA_DEBUG //! Returns the maximal number of threads that can work inside the arena int max_concurrency() const { // Handle special cases inside the library return (my_max_concurrency > 1) ? my_max_concurrency : r1::max_concurrency(this); } friend void submit(task& t, task_arena& ta, task_group_context& ctx, bool as_critical) { __TBB_ASSERT(ta.is_active(), nullptr); call_itt_task_notify(releasing, &t); r1::submit(t, ctx, ta.my_arena.load(std::memory_order_relaxed), as_critical ? 1 : 0); } }; //! Executes a mutable functor in isolation within the current task arena. //! 
Since C++11, the method returns the value returned by functor (prior to C++11 it returns void). template inline auto isolate(F&& f) -> decltype(f()) { return isolate_impl(f); } //! Returns the index, aka slot number, of the calling thread in its current arena inline int current_thread_index() { slot_id idx = r1::execution_slot(nullptr); return idx == slot_id(-1) ? task_arena_base::not_initialized : int(idx); } #if __TBB_PREVIEW_TASK_GROUP_EXTENSIONS inline bool is_inside_task() { return nullptr != current_context(); } #endif //__TBB_PREVIEW_TASK_GROUP_EXTENSIONS //! Returns the maximal number of threads that can work inside the arena inline int max_concurrency() { return r1::max_concurrency(nullptr); } inline void enqueue(d2::task_handle&& th) { d2::enqueue_impl(std::move(th), nullptr); } template inline void enqueue(F&& f) { enqueue_impl(std::forward(f), nullptr); } #if __TBB_PREVIEW_PARALLEL_PHASE inline void start_parallel_phase() { r1::enter_parallel_phase(nullptr, /*reserved*/0); } inline void end_parallel_phase(bool with_fast_leave) { // It is guaranteed by the standard that conversion of boolean to integral type will result in either 0 or 1 r1::exit_parallel_phase(nullptr, static_cast(with_fast_leave)); } #endif using r1::submit; } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::task_arena; using detail::d1::attach; #if __TBB_PREVIEW_TASK_GROUP_EXTENSIONS using detail::d1::is_inside_task; #endif namespace this_task_arena { using detail::d1::current_thread_index; using detail::d1::max_concurrency; using detail::d1::isolate; using detail::d1::enqueue; #if __TBB_PREVIEW_PARALLEL_PHASE using detail::d1::start_parallel_phase; using detail::d1::end_parallel_phase; #endif } // namespace this_task_arena } // inline namespace v1 } // namespace tbb #endif /* __TBB_task_arena_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/task_group.h ================================================ /* 
Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_task_group_H #define __TBB_task_group_H #include "detail/_config.h" #include "detail/_namespace_injection.h" #include "detail/_assert.h" #include "detail/_utils.h" #include "detail/_template_helpers.h" #include "detail/_exception.h" #include "detail/_task.h" #include "detail/_small_object_pool.h" #include "detail/_intrusive_list_node.h" #include "detail/_task_handle.h" #include "profiling.h" #include #if _MSC_VER && !defined(__INTEL_COMPILER) // Suppress warning: structure was padded due to alignment specifier #pragma warning(push) #pragma warning(disable:4324) #endif namespace tbb { namespace detail { namespace d1 { class delegate_base; class task_arena_base; class task_group_context; } namespace r1 { // Forward declarations class tbb_exception_ptr; class cancellation_disseminator; class thread_data; class task_dispatcher; template class context_guard_helper; struct task_arena_impl; class context_list; TBB_EXPORT void __TBB_EXPORTED_FUNC execute(d1::task_arena_base&, d1::delegate_base&); TBB_EXPORT void __TBB_EXPORTED_FUNC isolate_within_arena(d1::delegate_base&, std::intptr_t); TBB_EXPORT void __TBB_EXPORTED_FUNC initialize(d1::task_group_context&); TBB_EXPORT void __TBB_EXPORTED_FUNC destroy(d1::task_group_context&); TBB_EXPORT void __TBB_EXPORTED_FUNC reset(d1::task_group_context&); TBB_EXPORT bool __TBB_EXPORTED_FUNC cancel_group_execution(d1::task_group_context&); TBB_EXPORT bool 
__TBB_EXPORTED_FUNC is_group_execution_cancelled(d1::task_group_context&); TBB_EXPORT void __TBB_EXPORTED_FUNC capture_fp_settings(d1::task_group_context&); struct task_group_context_impl; } namespace d2 { namespace { template d1::task* task_ptr_or_nullptr(F&& f); } template class function_task : public task_handle_task { //TODO: apply empty base optimization here const F m_func; private: d1::task* execute(d1::execution_data& ed) override { __TBB_ASSERT(ed.context == &this->ctx(), "The task group context should be used for all tasks"); task* res = task_ptr_or_nullptr(m_func); finalize(&ed); return res; } d1::task* cancel(d1::execution_data& ed) override { finalize(&ed); return nullptr; } public: template function_task(FF&& f, d1::wait_tree_vertex_interface* vertex, d1::task_group_context& ctx, d1::small_object_allocator& alloc) : task_handle_task{vertex, ctx, alloc}, m_func(std::forward(f)) {} }; #if __TBB_PREVIEW_TASK_GROUP_EXTENSIONS namespace { template d1::task* task_ptr_or_nullptr_impl(std::false_type, F&& f){ task_handle th = std::forward(f)(); return task_handle_accessor::release(th); } template d1::task* task_ptr_or_nullptr_impl(std::true_type, F&& f){ std::forward(f)(); return nullptr; } template d1::task* task_ptr_or_nullptr(F&& f){ using is_void_t = std::is_void< decltype(std::forward(f)()) >; return task_ptr_or_nullptr_impl(is_void_t{}, std::forward(f)); } } #else namespace { template d1::task* task_ptr_or_nullptr(F&& f){ std::forward(f)(); return nullptr; } } // namespace #endif // __TBB_PREVIEW_TASK_GROUP_EXTENSIONS } // namespace d2 namespace d1 { // This structure is left here for backward compatibility check struct context_list_node { std::atomic prev{}; std::atomic next{}; }; //! Used to form groups of tasks /** @ingroup task_scheduling The context services explicit cancellation requests from user code, and unhandled exceptions intercepted during tasks execution. 
Intercepting an exception results in generating internal cancellation requests (which is processed in exactly the same way as external ones). The context is associated with one or more root tasks and defines the cancellation group that includes all the descendants of the corresponding root task(s). Association is established when a context object is passed as an argument to the task::allocate_root() method. See task_group_context::task_group_context for more details. The context can be bound to another one, and other contexts can be bound to it, forming a tree-like structure: parent -> this -> children. Arrows here designate cancellation propagation direction. If a task in a cancellation group is cancelled all the other tasks in this group and groups bound to it (as children) get cancelled too. **/ class task_group_context : no_copy { public: enum traits_type { fp_settings = 1 << 1, concurrent_wait = 1 << 2, default_traits = 0 }; enum kind_type { isolated, bound }; private: //! Space for platform-specific FPU settings. /** Must only be accessed inside TBB binaries, and never directly in user code or inline methods. */ std::uint64_t my_cpu_ctl_env; //! Specifies whether cancellation was requested for this task group. std::atomic my_cancellation_requested; //! Versioning for run-time checks and behavioral traits of the context. enum class task_group_context_version : std::uint8_t { unused = 1 // ensure that new versions, if any, will not clash with previously used ones }; task_group_context_version my_version; //! The context traits. struct context_traits { bool fp_settings : 1; bool concurrent_wait : 1; bool bound : 1; bool reserved1 : 1; bool reserved2 : 1; bool reserved3 : 1; bool reserved4 : 1; bool reserved5 : 1; } my_traits; static_assert(sizeof(context_traits) == 1, "Traits shall fit into one byte."); static constexpr std::uint8_t may_have_children = 1; //! The context internal state (currently only may_have_children). 
std::atomic my_may_have_children; enum class state : std::uint8_t { created, locked, isolated, bound, dead, proxy = std::uint8_t(-1) //the context is not the real one, but proxy to other one }; //! The synchronization machine state to manage lifetime. std::atomic my_state; union { //! Pointer to the context of the parent cancellation group. nullptr for isolated contexts. task_group_context* my_parent; //! Pointer to the actual context 'this' context represents a proxy of. task_group_context* my_actual_context; }; //! Thread data instance that registered this context in its list. r1::context_list* my_context_list; static_assert(sizeof(std::atomic) == sizeof(r1::context_list*), "To preserve backward compatibility these types should have the same size"); //! Used to form the thread specific list of contexts without additional memory allocation. /** A context is included into the list of the current thread when its binding to its parent happens. Any context can be present in the list of one thread only. **/ intrusive_list_node my_node; static_assert(sizeof(intrusive_list_node) == sizeof(context_list_node), "To preserve backward compatibility these types should have the same size"); //! Pointer to the container storing exception being propagated across this task group. std::atomic my_exception; static_assert(sizeof(std::atomic) == sizeof(r1::tbb_exception_ptr*), "backward compatibility check"); //! Used to set and maintain stack stitching point for Intel Performance Tools. void* my_itt_caller; //! Description of algorithm for scheduler based instrumentation. 
string_resource_index my_name; char padding[max_nfs_size - sizeof(std::uint64_t) // my_cpu_ctl_env - sizeof(std::atomic) // my_cancellation_requested - sizeof(std::uint8_t) // my_version - sizeof(context_traits) // my_traits - sizeof(std::atomic) // my_may_have_children - sizeof(std::atomic) // my_state - sizeof(task_group_context*) // my_parent - sizeof(r1::context_list*) // my_context_list - sizeof(intrusive_list_node) // my_node - sizeof(std::atomic) // my_exception - sizeof(void*) // my_itt_caller - sizeof(string_resource_index) // my_name ]; task_group_context(context_traits t, string_resource_index name) : my_version{task_group_context_version::unused}, my_name{name} { my_traits = t; // GCC4.8 issues warning list initialization for bitset (missing-field-initializers) r1::initialize(*this); } task_group_context(task_group_context* actual_context) : my_version{task_group_context_version::unused} , my_state{state::proxy} , my_actual_context{actual_context} { __TBB_ASSERT(my_actual_context, "Passed pointer value points to nothing."); my_name = actual_context->my_name; // no need to initialize 'this' context as it acts as a proxy for my_actual_context, which // initialization is a user-side responsibility. 
} static context_traits make_traits(kind_type relation_with_parent, std::uintptr_t user_traits) { context_traits ct; ct.fp_settings = (user_traits & fp_settings) == fp_settings; ct.concurrent_wait = (user_traits & concurrent_wait) == concurrent_wait; ct.bound = relation_with_parent == bound; ct.reserved1 = ct.reserved2 = ct.reserved3 = ct.reserved4 = ct.reserved5 = false; return ct; } bool is_proxy() const { return my_state.load(std::memory_order_relaxed) == state::proxy; } task_group_context& actual_context() noexcept { if (is_proxy()) { __TBB_ASSERT(my_actual_context, "Actual task_group_context is not set."); return *my_actual_context; } return *this; } const task_group_context& actual_context() const noexcept { if (is_proxy()) { __TBB_ASSERT(my_actual_context, "Actual task_group_context is not set."); return *my_actual_context; } return *this; } public: //! Default & binding constructor. /** By default a bound context is created. That is, this context will be bound (as child) to the context of the currently executing task. Cancellation requests passed to the parent context are propagated to all the contexts bound to it. Similarly priority change is propagated from the parent context to its children. If task_group_context::isolated is used as the argument, then the tasks associated with this context will never be affected by events in any other context. Creating isolated contexts involves much less overhead, but they have limited utility. Normally when an exception occurs in an algorithm that has nested ones running, it is desirable to have all the nested algorithms cancelled as well. Such a behavior requires nested algorithms to use bound contexts. There is one good place where using isolated algorithms is beneficial. It is an external thread. That is, if a particular algorithm is invoked directly from the external thread (not from a TBB task), supplying it with an explicitly created isolated context will result in a faster algorithm startup. 
VERSIONING NOTE: Implementation(s) of task_group_context constructor(s) cannot be made entirely out-of-line because the run-time version must be set by the user code. This will become critically important for binary compatibility, if we ever have to change the size of the context object. **/ task_group_context(kind_type relation_with_parent = bound, std::uintptr_t t = default_traits) : task_group_context(make_traits(relation_with_parent, t), CUSTOM_CTX) {} // Custom constructor for instrumentation of oneTBB algorithm task_group_context(string_resource_index name ) : task_group_context(make_traits(bound, default_traits), name) {} // Do not introduce any logic on user side since it might break state propagation assumptions ~task_group_context() { // When 'this' serves as a proxy, the initialization does not happen - nor should the // destruction. if (!is_proxy()) { r1::destroy(*this); } } //! Forcefully reinitializes the context after the task tree it was associated with is completed. /** Because the method assumes that all the tasks that used to be associated with this context have already finished, calling it while the context is still in use somewhere in the task hierarchy leads to undefined behavior. IMPORTANT: This method is not thread safe! The method does not change the context's parent if it is set. **/ void reset() { r1::reset(actual_context()); } //! Initiates cancellation of all tasks in this cancellation group and its subordinate groups. /** \return false if cancellation has already been requested, true otherwise. Note that canceling never fails. When false is returned, it just means that another thread (or this one) has already sent cancellation request to this context or to one of its ancestors (if this context is bound). It is guaranteed that when this method is concurrently called on the same not yet cancelled context, true will be returned by one and only one invocation. 
**/ bool cancel_group_execution() { return r1::cancel_group_execution(actual_context()); } //! Returns true if the context received cancellation request. bool is_group_execution_cancelled() { return r1::is_group_execution_cancelled(actual_context()); } #if __TBB_FP_CONTEXT //! Captures the current FPU control settings to the context. /** Because the method assumes that all the tasks that used to be associated with this context have already finished, calling it while the context is still in use somewhere in the task hierarchy leads to undefined behavior. IMPORTANT: This method is not thread safe! The method does not change the FPU control settings of the context's parent. **/ void capture_fp_settings() { r1::capture_fp_settings(actual_context()); } #endif //! Returns the user visible context trait std::uintptr_t traits() const { std::uintptr_t t{}; const task_group_context& ctx = actual_context(); t |= ctx.my_traits.fp_settings ? fp_settings : 0; t |= ctx.my_traits.concurrent_wait ? concurrent_wait : 0; return t; } private: //// TODO: cleanup friends friend class r1::cancellation_disseminator; friend class r1::thread_data; friend class r1::task_dispatcher; template friend class r1::context_guard_helper; friend struct r1::task_arena_impl; friend struct r1::task_group_context_impl; friend class d2::task_group_base; }; // class task_group_context static_assert(sizeof(task_group_context) == 128, "Wrong size of task_group_context"); inline bool is_current_task_group_canceling() { task_group_context* ctx = current_context(); return ctx ? 
ctx->is_group_execution_cancelled() : false; } } // namespace d1 namespace d2 { enum task_group_status { not_complete, complete, canceled }; class task_group; class structured_task_group; #if TBB_PREVIEW_ISOLATED_TASK_GROUP class isolated_task_group; #endif template class function_stack_task : public d1::task { const F& m_func; d1::wait_tree_vertex_interface* m_wait_tree_vertex; void finalize() { m_wait_tree_vertex->release(); } task* execute(d1::execution_data&) override { task* res = d2::task_ptr_or_nullptr(m_func); finalize(); return res; } task* cancel(d1::execution_data&) override { finalize(); return nullptr; } public: function_stack_task(const F& f, d1::wait_tree_vertex_interface* vertex) : m_func(f), m_wait_tree_vertex(vertex) { m_wait_tree_vertex->reserve(); } }; class task_group_base : no_copy { protected: d1::wait_context_vertex m_wait_vertex; d1::task_group_context m_context; template task_group_status internal_run_and_wait(const F& f) { function_stack_task t{ f, r1::get_thread_reference_vertex(&m_wait_vertex) }; bool cancellation_status = false; try_call([&] { execute_and_wait(t, context(), m_wait_vertex.get_context(), context()); }).on_completion([&] { // TODO: the reset method is not thread-safe. Ensure the correct behavior. cancellation_status = context().is_group_execution_cancelled(); context().reset(); }); return cancellation_status ? canceled : complete; } task_group_status internal_run_and_wait(d2::task_handle&& h) { __TBB_ASSERT(h != nullptr, "Attempt to schedule empty task_handle"); using acs = d2::task_handle_accessor; __TBB_ASSERT(&acs::ctx_of(h) == &context(), "Attempt to schedule task_handle into different task_group"); bool cancellation_status = false; try_call([&] { execute_and_wait(*acs::release(h), context(), m_wait_vertex.get_context(), context()); }).on_completion([&] { // TODO: the reset method is not thread-safe. Ensure the correct behavior. 
cancellation_status = context().is_group_execution_cancelled(); context().reset(); }); return cancellation_status ? canceled : complete; } template d1::task* prepare_task(F&& f) { d1::small_object_allocator alloc{}; return alloc.new_object::type>>(std::forward(f), r1::get_thread_reference_vertex(&m_wait_vertex), context(), alloc); } d1::task_group_context& context() noexcept { return m_context.actual_context(); } template d2::task_handle prepare_task_handle(F&& f) { d1::small_object_allocator alloc{}; using function_task_t = d2::function_task::type>; d2::task_handle_task* function_task_p = alloc.new_object(std::forward(f), r1::get_thread_reference_vertex(&m_wait_vertex), context(), alloc); return d2::task_handle_accessor::construct(function_task_p); } public: task_group_base(uintptr_t traits = 0) : m_wait_vertex(0) , m_context(d1::task_group_context::bound, d1::task_group_context::default_traits | traits) {} task_group_base(d1::task_group_context& ctx) : m_wait_vertex(0) , m_context(&ctx) {} ~task_group_base() noexcept(false) { if (m_wait_vertex.continue_execution()) { #if __TBB_CPP17_UNCAUGHT_EXCEPTIONS_PRESENT bool stack_unwinding_in_progress = std::uncaught_exceptions() > 0; #else bool stack_unwinding_in_progress = std::uncaught_exception(); #endif // Always attempt to do proper cleanup to avoid inevitable memory corruption // in case of missing wait (for the sake of better testability & debuggability) if (!context().is_group_execution_cancelled()) cancel(); d1::wait(m_wait_vertex.get_context(), context()); if (!stack_unwinding_in_progress) throw_exception(exception_id::missing_wait); } } task_group_status wait() { bool cancellation_status = false; try_call([&] { d1::wait(m_wait_vertex.get_context(), context()); }).on_completion([&] { // TODO: the reset method is not thread-safe. Ensure the correct behavior. cancellation_status = m_context.is_group_execution_cancelled(); context().reset(); }); return cancellation_status ? 
canceled : complete; } void cancel() { context().cancel_group_execution(); } }; // class task_group_base class task_group : public task_group_base { public: task_group() : task_group_base(d1::task_group_context::concurrent_wait) {} task_group(d1::task_group_context& ctx) : task_group_base(ctx) {} template void run(F&& f) { d1::spawn(*prepare_task(std::forward(f)), context()); } void run(d2::task_handle&& h) { __TBB_ASSERT(h != nullptr, "Attempt to schedule empty task_handle"); using acs = d2::task_handle_accessor; __TBB_ASSERT(&acs::ctx_of(h) == &context(), "Attempt to schedule task_handle into different task_group"); d1::spawn(*acs::release(h), context()); } template d2::task_handle defer(F&& f) { return prepare_task_handle(std::forward(f)); } template task_group_status run_and_wait(const F& f) { return internal_run_and_wait(f); } task_group_status run_and_wait(d2::task_handle&& h) { return internal_run_and_wait(std::move(h)); } }; // class task_group #if TBB_PREVIEW_ISOLATED_TASK_GROUP class spawn_delegate : public d1::delegate_base { d1::task* task_to_spawn; d1::task_group_context& context; bool operator()() const override { spawn(*task_to_spawn, context); return true; } public: spawn_delegate(d1::task* a_task, d1::task_group_context& ctx) : task_to_spawn(a_task), context(ctx) {} }; class wait_delegate : public d1::delegate_base { bool operator()() const override { status = tg.wait(); return true; } protected: task_group& tg; task_group_status& status; public: wait_delegate(task_group& a_group, task_group_status& tgs) : tg(a_group), status(tgs) {} }; template class run_wait_delegate : public wait_delegate { F& func; bool operator()() const override { status = tg.run_and_wait(func); return true; } public: run_wait_delegate(task_group& a_group, F& a_func, task_group_status& tgs) : wait_delegate(a_group, tgs), func(a_func) {} }; class isolated_task_group : public task_group { intptr_t this_isolation() { return reinterpret_cast(this); } public: isolated_task_group() 
: task_group() {} isolated_task_group(d1::task_group_context& ctx) : task_group(ctx) {} template void run(F&& f) { spawn_delegate sd(prepare_task(std::forward(f)), context()); r1::isolate_within_arena(sd, this_isolation()); } void run(d2::task_handle&& h) { __TBB_ASSERT(h != nullptr, "Attempt to schedule empty task_handle"); using acs = d2::task_handle_accessor; __TBB_ASSERT(&acs::ctx_of(h) == &context(), "Attempt to schedule task_handle into different task_group"); spawn_delegate sd(acs::release(h), context()); r1::isolate_within_arena(sd, this_isolation()); } template task_group_status run_and_wait( const F& f ) { task_group_status result = not_complete; run_wait_delegate rwd(*this, f, result); r1::isolate_within_arena(rwd, this_isolation()); __TBB_ASSERT(result != not_complete, "premature exit from wait?"); return result; } task_group_status wait() { task_group_status result = not_complete; wait_delegate wd(*this, result); r1::isolate_within_arena(wd, this_isolation()); __TBB_ASSERT(result != not_complete, "premature exit from wait?"); return result; } }; // class isolated_task_group #endif // TBB_PREVIEW_ISOLATED_TASK_GROUP } // namespace d2 } // namespace detail inline namespace v1 { using detail::d1::task_group_context; using detail::d2::task_group; #if TBB_PREVIEW_ISOLATED_TASK_GROUP using detail::d2::isolated_task_group; #endif using detail::d2::task_group_status; using detail::d2::not_complete; using detail::d2::complete; using detail::d2::canceled; using detail::d1::is_current_task_group_canceling; using detail::r1::missing_wait; using detail::d2::task_handle; } } // namespace tbb #if _MSC_VER && !defined(__INTEL_COMPILER) #pragma warning(pop) // 4324 warning #endif #endif // __TBB_task_group_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/task_scheduler_observer.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the 
"License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_task_scheduler_observer_H #define __TBB_task_scheduler_observer_H #include "detail/_namespace_injection.h" #include "task_arena.h" #include namespace tbb { namespace detail { namespace d1 { class task_scheduler_observer; } namespace r1 { class observer_proxy; class observer_list; //! Enable or disable observation /** For local observers the method can be used only when the current thread has the task scheduler initialized or is attached to an arena. Repeated calls with the same state are no-ops. **/ TBB_EXPORT void __TBB_EXPORTED_FUNC observe(d1::task_scheduler_observer&, bool state = true); } namespace d1 { class task_scheduler_observer { friend class r1::observer_proxy; friend class r1::observer_list; friend void r1::observe(d1::task_scheduler_observer&, bool); //! Pointer to the proxy holding this observer. /** Observers are proxied by the scheduler to maintain persistent lists of them. **/ std::atomic my_proxy{ nullptr }; //! Counter preventing the observer from being destroyed while in use by the scheduler. /** Valid only when observation is on. **/ std::atomic my_busy_count{ 0 }; //! Contains task_arena pointer task_arena* my_task_arena{ nullptr }; public: //! Returns true if observation is enabled, false otherwise. bool is_observing() const { return my_proxy.load(std::memory_order_relaxed) != nullptr; } //! Entry notification /** Invoked from inside observe(true) call and whenever a worker enters the arena this observer is associated with. 
If a thread is already in the arena when the observer is activated, the entry notification is called before it executes the first stolen task. **/ virtual void on_scheduler_entry( bool /*is_worker*/ ) {} //! Exit notification /** Invoked from inside observe(false) call and whenever a worker leaves the arena this observer is associated with. **/ virtual void on_scheduler_exit( bool /*is_worker*/ ) {} //! Construct local or global observer in inactive state (observation disabled). /** For a local observer entry/exit notifications are invoked whenever a worker thread joins/leaves the arena of the observer's owner thread. If a thread is already in the arena when the observer is activated, the entry notification is called before it executes the first stolen task. **/ explicit task_scheduler_observer() = default; //! Construct local observer for a given arena in inactive state (observation disabled). /** entry/exit notifications are invoked whenever a thread joins/leaves arena. If a thread is already in the arena when the observer is activated, the entry notification is called before it executes the first stolen task. **/ explicit task_scheduler_observer(task_arena& a) : my_task_arena(&a) {} /** Destructor protects instance of the observer from concurrent notification. It is recommended to disable observation before destructor of a derived class starts, otherwise it can lead to concurrent notification callback on partly destroyed object **/ virtual ~task_scheduler_observer() { if (my_proxy.load(std::memory_order_acquire)) { observe(false); } } //! Enable or disable observation /** Warning: concurrent invocations of this method are not safe. Repeated calls with the same state are no-ops. 
**/ void observe(bool state = true) { if( state && !my_proxy.load(std::memory_order_relaxed) ) { __TBB_ASSERT( my_busy_count.load(std::memory_order_relaxed) == 0, "Inconsistent state of task_scheduler_observer instance"); } r1::observe(*this, state); } }; } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::task_scheduler_observer; } } // namespace tbb #endif /* __TBB_task_scheduler_observer_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/tbb_allocator.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_tbb_allocator_H #define __TBB_tbb_allocator_H #include "oneapi/tbb/detail/_utils.h" #include "detail/_namespace_injection.h" #include #include #if __TBB_CPP17_MEMORY_RESOURCE_PRESENT #include #endif namespace tbb { namespace detail { namespace r1 { TBB_EXPORT void* __TBB_EXPORTED_FUNC allocate_memory(std::size_t size); TBB_EXPORT void __TBB_EXPORTED_FUNC deallocate_memory(void* p); TBB_EXPORT bool __TBB_EXPORTED_FUNC is_tbbmalloc_used(); } namespace d1 { template class tbb_allocator { public: using value_type = T; using propagate_on_container_move_assignment = std::true_type; //! Always defined for TBB containers (supported since C++17 for std containers) using is_always_equal = std::true_type; //! 
Specifies current allocator enum malloc_type { scalable, standard }; tbb_allocator() = default; template tbb_allocator(const tbb_allocator&) noexcept {} //! Allocate space for n objects. __TBB_nodiscard T* allocate(std::size_t n) { return static_cast(r1::allocate_memory(n * sizeof(value_type))); } //! Free previously allocated block of memory. void deallocate(T* p, std::size_t) { r1::deallocate_memory(p); } //! Returns current allocator /* NOTE(review): this yields `standard` when r1::is_tbbmalloc_used() returns true, which reads inverted against that function's name; its return convention is not visible here - confirm it before relying on or changing this mapping. */ static malloc_type allocator_type() { return r1::is_tbbmalloc_used() ? standard : scalable; } #if TBB_ALLOCATOR_TRAITS_BROKEN using pointer = value_type*; using const_pointer = const value_type*; using reference = value_type&; using const_reference = const value_type&; using difference_type = std::ptrdiff_t; using size_type = std::size_t; template struct rebind { using other = tbb_allocator; }; //! Largest value for which method allocate might succeed. size_type max_size() const noexcept { size_type max = ~(std::size_t(0)) / sizeof(value_type); return (max > 0 ? max : 1); } template void construct(U *p, Args&&... 
args) { ::new (p) U(std::forward(args)...); } void destroy( pointer p ) { p->~value_type(); } pointer address(reference x) const { return &x; } const_pointer address(const_reference x) const { return &x; } #endif // TBB_ALLOCATOR_TRAITS_BROKEN }; #if TBB_ALLOCATOR_TRAITS_BROKEN template<> class tbb_allocator { public: using pointer = void*; using const_pointer = const void*; using value_type = void; template struct rebind { using other = tbb_allocator; }; }; #endif template inline bool operator==(const tbb_allocator&, const tbb_allocator&) noexcept { return true; } #if !__TBB_CPP20_COMPARISONS_PRESENT template inline bool operator!=(const tbb_allocator&, const tbb_allocator&) noexcept { return false; } #endif } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::tbb_allocator; } // namespace v1 } // namespace tbb #endif /* __TBB_tbb_allocator_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/tbbmalloc_proxy.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* Replacing the standard memory allocation routines in Microsoft* C/C++ RTL (malloc/free, global new/delete, etc.) with the TBB memory allocator. 
Include the following header to a source of any binary which is loaded during application startup #include "oneapi/tbb/tbbmalloc_proxy.h" or add following parameters to the linker options for the binary which is loaded during application startup. It can be either exe-file or dll. For win32 tbbmalloc_proxy.lib /INCLUDE:"___TBB_malloc_proxy" win64 tbbmalloc_proxy.lib /INCLUDE:"__TBB_malloc_proxy" */ #ifndef __TBB_tbbmalloc_proxy_H #define __TBB_tbbmalloc_proxy_H #if _MSC_VER #ifdef _DEBUG #pragma comment(lib, "tbbmalloc_proxy_debug.lib") #else #pragma comment(lib, "tbbmalloc_proxy.lib") #endif #if defined(_WIN64) #pragma comment(linker, "/include:__TBB_malloc_proxy") #else #pragma comment(linker, "/include:___TBB_malloc_proxy") #endif #else /* Primarily to support MinGW */ extern "C" void __TBB_malloc_proxy(); struct __TBB_malloc_proxy_caller { __TBB_malloc_proxy_caller() { __TBB_malloc_proxy(); } } volatile __TBB_malloc_proxy_helper_object; #endif // _MSC_VER /* Public Windows API */ extern "C" int TBB_malloc_replacement_log(char *** function_replacement_log_ptr); #endif //__TBB_tbbmalloc_proxy_H ================================================ FILE: third-party/tbb/include/oneapi/tbb/tick_count.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_tick_count_H #define __TBB_tick_count_H #include #include "detail/_namespace_injection.h" namespace tbb { namespace detail { namespace d1 { //! 
Absolute timestamp /** @ingroup timing */ class tick_count { public: using clock_type = typename std::conditional::type; //! Relative time interval. class interval_t : public clock_type::duration { public: //! Construct a time interval representing zero time duration interval_t() : clock_type::duration(clock_type::duration::zero()) {} //! Construct a time interval representing sec seconds time duration explicit interval_t( double sec ) : clock_type::duration(std::chrono::duration_cast(std::chrono::duration(sec))) {} //! Return the length of a time interval in seconds double seconds() const { return std::chrono::duration_cast>(*this).count(); } //! Extract the intervals from the tick_counts and subtract them. friend interval_t operator-( const tick_count& t1, const tick_count& t0 ); //! Add two intervals. friend interval_t operator+( const interval_t& i, const interval_t& j ) { return interval_t(std::chrono::operator+(i, j)); } //! Subtract two intervals. friend interval_t operator-( const interval_t& i, const interval_t& j ) { return interval_t(std::chrono::operator-(i, j)); } private: explicit interval_t( clock_type::duration value_ ) : clock_type::duration(value_) {} }; tick_count() = default; //! Return current time. static tick_count now() { return clock_type::now(); } //! Subtract two timestamps to get the time interval between friend interval_t operator-( const tick_count& t1, const tick_count& t0 ) { return tick_count::interval_t(t1.my_time_point - t0.my_time_point); } //! Return the resolution of the clock in seconds per tick. 
static double resolution() { return static_cast(interval_t::period::num) / interval_t::period::den; } private: clock_type::time_point my_time_point; tick_count( clock_type::time_point tp ) : my_time_point(tp) {} }; } // namespace d1 } // namespace detail inline namespace v1 { using detail::d1::tick_count; } // namespace v1 } // namespace tbb #endif /* __TBB_tick_count_H */ ================================================ FILE: third-party/tbb/include/oneapi/tbb/version.h ================================================ /* Copyright (c) 2005-2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_version_H #define __TBB_version_H // Exclude all includes during .rc files compilation #ifndef RC_INVOKED #include "detail/_config.h" #include "detail/_namespace_injection.h" #else #define __TBB_STRING_AUX(x) #x #define __TBB_STRING(x) __TBB_STRING_AUX(x) #endif // Product version #define TBB_VERSION_MAJOR 2022 // Update version #define TBB_VERSION_MINOR 1 // "Patch" version for custom releases #define TBB_VERSION_PATCH 0 // Suffix string #define __TBB_VERSION_SUFFIX "" // Full official version string #define TBB_VERSION_STRING \ __TBB_STRING(TBB_VERSION_MAJOR) "." \ __TBB_STRING(TBB_VERSION_MINOR) "." 
\ __TBB_STRING(TBB_VERSION_PATCH) \ __TBB_VERSION_SUFFIX // OneAPI oneTBB specification version #define ONETBB_SPEC_VERSION 104 // Full interface version #define TBB_INTERFACE_VERSION 12150 // Major interface version #define TBB_INTERFACE_VERSION_MAJOR (TBB_INTERFACE_VERSION/1000) // Minor interface version #define TBB_INTERFACE_VERSION_MINOR (TBB_INTERFACE_VERSION%1000/10) // The binary compatibility version // To be used in SONAME, manifests, etc. #define __TBB_BINARY_VERSION 12 //! TBB_VERSION support #ifndef TBB_ENDL #define TBB_ENDL "\n" #endif //TBB_REVAMP_TODO: consider enabling version_string.ver generation //TBB_REVAMP_TODO: #include "version_string.ver" #define __TBB_ONETBB_SPEC_VERSION(N) #N ": SPECIFICATION VERSION\t" __TBB_STRING(ONETBB_SPEC_VERSION) TBB_ENDL #define __TBB_VERSION_NUMBER(N) #N ": VERSION\t\t" TBB_VERSION_STRING TBB_ENDL #define __TBB_INTERFACE_VERSION_NUMBER(N) #N ": INTERFACE VERSION\t" __TBB_STRING(TBB_INTERFACE_VERSION) TBB_ENDL #ifndef TBB_USE_DEBUG #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\tundefined" TBB_ENDL #elif TBB_USE_DEBUG==0 #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t0" TBB_ENDL #elif TBB_USE_DEBUG==1 #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t1" TBB_ENDL #elif TBB_USE_DEBUG==2 #define __TBB_VERSION_USE_DEBUG(N) #N ": TBB_USE_DEBUG\t2" TBB_ENDL #else #error Unexpected value for TBB_USE_DEBUG #endif #ifndef TBB_USE_ASSERT #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\tundefined" TBB_ENDL #elif TBB_USE_ASSERT==0 #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t0" TBB_ENDL #elif TBB_USE_ASSERT==1 #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t1" TBB_ENDL #elif TBB_USE_ASSERT==2 #define __TBB_VERSION_USE_ASSERT(N) #N ": TBB_USE_ASSERT\t2" TBB_ENDL #else #error Unexpected value for TBB_USE_ASSERT #endif #define TBB_VERSION_STRINGS_P(N) \ __TBB_ONETBB_SPEC_VERSION(N) \ __TBB_VERSION_NUMBER(N) \ __TBB_INTERFACE_VERSION_NUMBER(N) \ __TBB_VERSION_USE_DEBUG(N) \ 
__TBB_VERSION_USE_ASSERT(N) #define TBB_VERSION_STRINGS TBB_VERSION_STRINGS_P(oneTBB) #define TBBMALLOC_VERSION_STRINGS TBB_VERSION_STRINGS_P(TBBmalloc) //! The function returns the version string for the Intel(R) oneAPI Threading Building Blocks (oneTBB) //! shared library being used. /** * The returned pointer is an address of a string in the shared library. * It can be different than the TBB_VERSION_STRING obtained at compile time. */ extern "C" TBB_EXPORT const char* __TBB_EXPORTED_FUNC TBB_runtime_version(); //! The function returns the interface version of the oneTBB shared library being used. /** * The returned version is determined at runtime, not at compile/link time. * It can be different than the value of TBB_INTERFACE_VERSION obtained at compile time. */ extern "C" TBB_EXPORT int __TBB_EXPORTED_FUNC TBB_runtime_interface_version(); #endif // __TBB_version_H ================================================ FILE: third-party/tbb/include/oneapi/tbb.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_tbb_H #define __TBB_tbb_H /** This header bulk-includes declarations or definitions of all the functionality provided by TBB (save for tbbmalloc and 3rd party dependent headers). If you use only a few TBB constructs, consider including specific headers only. Any header listed below can be included independently of others. 
**/ #include "oneapi/tbb/blocked_range.h" #include "oneapi/tbb/blocked_range2d.h" #include "oneapi/tbb/blocked_range3d.h" #include "oneapi/tbb/blocked_nd_range.h" #include "oneapi/tbb/cache_aligned_allocator.h" #include "oneapi/tbb/combinable.h" #include "oneapi/tbb/concurrent_hash_map.h" #if TBB_PREVIEW_CONCURRENT_LRU_CACHE #include "tbb/concurrent_lru_cache.h" #endif #include "oneapi/tbb/collaborative_call_once.h" #include "oneapi/tbb/concurrent_priority_queue.h" #include "oneapi/tbb/concurrent_queue.h" #include "oneapi/tbb/concurrent_unordered_map.h" #include "oneapi/tbb/concurrent_unordered_set.h" #include "oneapi/tbb/concurrent_map.h" #include "oneapi/tbb/concurrent_set.h" #include "oneapi/tbb/concurrent_vector.h" #include "oneapi/tbb/enumerable_thread_specific.h" #include "oneapi/tbb/flow_graph.h" #include "oneapi/tbb/global_control.h" #include "oneapi/tbb/info.h" #include "oneapi/tbb/null_mutex.h" #include "oneapi/tbb/null_rw_mutex.h" #include "oneapi/tbb/parallel_for.h" #include "oneapi/tbb/parallel_for_each.h" #include "oneapi/tbb/parallel_invoke.h" #include "oneapi/tbb/parallel_pipeline.h" #include "oneapi/tbb/parallel_reduce.h" #include "oneapi/tbb/parallel_scan.h" #include "oneapi/tbb/parallel_sort.h" #include "oneapi/tbb/partitioner.h" #include "oneapi/tbb/queuing_mutex.h" #include "oneapi/tbb/queuing_rw_mutex.h" #include "oneapi/tbb/spin_mutex.h" #include "oneapi/tbb/spin_rw_mutex.h" #include "oneapi/tbb/mutex.h" #include "oneapi/tbb/rw_mutex.h" #include "oneapi/tbb/task.h" #include "oneapi/tbb/task_arena.h" #include "oneapi/tbb/task_group.h" #include "oneapi/tbb/task_scheduler_observer.h" #include "oneapi/tbb/tbb_allocator.h" #include "oneapi/tbb/tick_count.h" #include "oneapi/tbb/version.h" #endif /* __TBB_tbb_H */ ================================================ FILE: third-party/tbb/include/tbb/blocked_nd_range.h ================================================ /* Copyright (c) 2017-2024 Intel Corporation Licensed under the Apache License, Version 
2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/blocked_nd_range.h" ================================================ FILE: third-party/tbb/include/tbb/blocked_range.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/blocked_range.h" ================================================ FILE: third-party/tbb/include/tbb/blocked_range2d.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "../oneapi/tbb/blocked_range2d.h" ================================================ FILE: third-party/tbb/include/tbb/blocked_range3d.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/blocked_range3d.h" ================================================ FILE: third-party/tbb/include/tbb/blocked_rangeNd.h ================================================ /* Copyright (c) 2017-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/blocked_rangeNd.h" ================================================ FILE: third-party/tbb/include/tbb/cache_aligned_allocator.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/cache_aligned_allocator.h" ================================================ FILE: third-party/tbb/include/tbb/collaborative_call_once.h ================================================ /* Copyright (c) 2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/collaborative_call_once.h" ================================================ FILE: third-party/tbb/include/tbb/combinable.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "../oneapi/tbb/combinable.h" ================================================ FILE: third-party/tbb/include/tbb/concurrent_hash_map.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/concurrent_hash_map.h" ================================================ FILE: third-party/tbb/include/tbb/concurrent_lru_cache.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/concurrent_lru_cache.h" ================================================ FILE: third-party/tbb/include/tbb/concurrent_map.h ================================================ /* Copyright (c) 2019-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/concurrent_map.h" ================================================ FILE: third-party/tbb/include/tbb/concurrent_priority_queue.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/concurrent_priority_queue.h" ================================================ FILE: third-party/tbb/include/tbb/concurrent_queue.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "../oneapi/tbb/concurrent_queue.h" ================================================ FILE: third-party/tbb/include/tbb/concurrent_set.h ================================================ /* Copyright (c) 2019-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/concurrent_set.h" ================================================ FILE: third-party/tbb/include/tbb/concurrent_unordered_map.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/concurrent_unordered_map.h" ================================================ FILE: third-party/tbb/include/tbb/concurrent_unordered_set.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/concurrent_unordered_set.h" ================================================ FILE: third-party/tbb/include/tbb/concurrent_vector.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/concurrent_vector.h" ================================================ FILE: third-party/tbb/include/tbb/enumerable_thread_specific.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "../oneapi/tbb/enumerable_thread_specific.h" ================================================ FILE: third-party/tbb/include/tbb/flow_graph.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/flow_graph.h" ================================================ FILE: third-party/tbb/include/tbb/flow_graph_abstractions.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/flow_graph_abstractions.h" ================================================ FILE: third-party/tbb/include/tbb/global_control.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/global_control.h" ================================================ FILE: third-party/tbb/include/tbb/info.h ================================================ /* Copyright (c) 2019-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/info.h" ================================================ FILE: third-party/tbb/include/tbb/memory_pool.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "../oneapi/tbb/memory_pool.h" ================================================ FILE: third-party/tbb/include/tbb/mutex.h ================================================ /* Copyright (c) 2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/mutex.h" ================================================ FILE: third-party/tbb/include/tbb/null_mutex.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/null_mutex.h" ================================================ FILE: third-party/tbb/include/tbb/null_rw_mutex.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/null_rw_mutex.h" ================================================ FILE: third-party/tbb/include/tbb/parallel_for.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/parallel_for.h" ================================================ FILE: third-party/tbb/include/tbb/parallel_for_each.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "../oneapi/tbb/parallel_for_each.h" ================================================ FILE: third-party/tbb/include/tbb/parallel_invoke.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/parallel_invoke.h" ================================================ FILE: third-party/tbb/include/tbb/parallel_pipeline.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/parallel_pipeline.h" ================================================ FILE: third-party/tbb/include/tbb/parallel_reduce.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/parallel_reduce.h" ================================================ FILE: third-party/tbb/include/tbb/parallel_scan.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/parallel_scan.h" ================================================ FILE: third-party/tbb/include/tbb/parallel_sort.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "../oneapi/tbb/parallel_sort.h" ================================================ FILE: third-party/tbb/include/tbb/partitioner.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/partitioner.h" ================================================ FILE: third-party/tbb/include/tbb/profiling.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/profiling.h" ================================================ FILE: third-party/tbb/include/tbb/queuing_mutex.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/queuing_mutex.h" ================================================ FILE: third-party/tbb/include/tbb/queuing_rw_mutex.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/queuing_rw_mutex.h" ================================================ FILE: third-party/tbb/include/tbb/rw_mutex.h ================================================ /* Copyright (c) 2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "../oneapi/tbb/rw_mutex.h" ================================================ FILE: third-party/tbb/include/tbb/scalable_allocator.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/scalable_allocator.h" ================================================ FILE: third-party/tbb/include/tbb/spin_mutex.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/spin_mutex.h" ================================================ FILE: third-party/tbb/include/tbb/spin_rw_mutex.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/spin_rw_mutex.h" ================================================ FILE: third-party/tbb/include/tbb/task.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/task.h" ================================================ FILE: third-party/tbb/include/tbb/task_arena.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "../oneapi/tbb/task_arena.h" ================================================ FILE: third-party/tbb/include/tbb/task_group.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/task_group.h" ================================================ FILE: third-party/tbb/include/tbb/task_scheduler_observer.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/task_scheduler_observer.h" ================================================ FILE: third-party/tbb/include/tbb/tbb.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb.h" ================================================ FILE: third-party/tbb/include/tbb/tbb_allocator.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/tbb_allocator.h" ================================================ FILE: third-party/tbb/include/tbb/tbbmalloc_proxy.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "../oneapi/tbb/tbbmalloc_proxy.h" ================================================ FILE: third-party/tbb/include/tbb/tick_count.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/tick_count.h" ================================================ FILE: third-party/tbb/include/tbb/version.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "../oneapi/tbb/version.h" ================================================ FILE: third-party/tbb/integration/cmake/generate_vars.cmake ================================================ # Copyright (c) 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Required parameters: # SOURCE_DIR - incoming path to oneTBB source directory. # BINARY_DIR - incoming path to oneTBB build directory. # BIN_PATH - incoming path to oneTBB binaries directory. # TBB_INSTALL_VARS - install vars generation trigger # TBB_CMAKE_INSTALL_LIBDIR - subdir for shared object files installation path (used only in TBB_INSTALL_VARS mode) # VARS_TEMPLATE - path to the vars template file # VARS_NAME - name of the output vars script set(INPUT_FILE "${SOURCE_DIR}/integration/${VARS_TEMPLATE}") set(OUTPUT_FILE "${BIN_PATH}/${VARS_NAME}") file(TO_NATIVE_PATH "${SOURCE_DIR}" TBBROOT_REPLACEMENT) file(TO_NATIVE_PATH "${BIN_PATH}" LIBRARY_PATH_REPLACEMENT) if (WIN32) file(TO_NATIVE_PATH "${BIN_PATH}" BINARY_PATH_REPLACEMENT) endif() if (NOT EXISTS ${OUTPUT_FILE}) configure_file(${INPUT_FILE} ${OUTPUT_FILE} @ONLY) endif() if (TBB_INSTALL_VARS) set(OUTPUT_FILE "${BINARY_DIR}/internal_install_vars") if (UNIX) set(TBBROOT_REPLACEMENT "$(cd $(dirname \${BASH_SOURCE}) && pwd -P)/..") set(LIBRARY_PATH_REPLACEMENT "$TBBROOT/${TBB_CMAKE_INSTALL_LIBDIR}/") set(CMAKE_ENVIRONMENT_SOURCING_STRING "CMAKE_PREFIX_PATH=\"\${TBBROOT}/${TBB_CMAKE_INSTALL_LIBDIR}/cmake/TBB:${CMAKE_PREFIX_PATH}\"; export CMAKE_PREFIX_PATH") else() set(TBBROOT_REPLACEMENT "%~d0%~p0..") set(LIBRARY_PATH_REPLACEMENT "%TBBROOT%\\${TBB_CMAKE_INSTALL_LIBDIR}") set(BINARY_PATH_REPLACEMENT "%TBBROOT%\\bin") set(CMAKE_ENVIRONMENT_SOURCING_STRING "set \"CMAKE_PREFIX_PATH=%TBBROOT%\\${TBB_CMAKE_INSTALL_LIBDIR}\\cmake\\TBB;%CMAKE_PREFIX_PATH%\"") endif() configure_file( ${INPUT_FILE} ${OUTPUT_FILE} @ONLY
) endif() ================================================ FILE: third-party/tbb/integration/linux/env/vars.sh ================================================ #!/bin/sh # shellcheck shell=sh # # Copyright (c) 2005-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # The script is setting up environment for oneTBB. # Supported arguments: # intel64|ia32 - architecture, intel64 is default. # Get absolute path to script. Gets a relative path as argument and outputs an absolute path. get_script_path() ( script_path="$1" while [ -L "$script_path" ] ; do script_dir=$(command dirname -- "$script_path") script_dir=$(cd "$script_dir" && command pwd -P) script_path="$(readlink "$script_path")" case $script_path in (/*) ;; (*) script_path="$script_dir/$script_path" ;; esac done script_dir=$(command dirname -- "$script_path") script_dir=$(cd "$script_dir" && command pwd -P) printf "%s" "$script_dir" ) _vars_get_proc_name() { if [ -n "${ZSH_VERSION:-}" ] ; then script="$(ps -p "$$" -o comm=)" else script="$1" while [ -L "$script" ] ; do script="$(readlink "$script")" done fi basename -- "$script" } _vars_this_script_name="vars.sh" if [ "$_vars_this_script_name" = "$(_vars_get_proc_name "$0")" ] ; then echo ":: ERROR: Incorrect usage: this script must be sourced." echo " Usage: . path/to/${_vars_this_script_name}" return 255 2>/dev/null || exit 255 fi # Prepend path segment(s) to path-like env vars (PATH, CPATH, etc.). 
# prepend_path() avoids dangling ":" that affects some env vars (PATH and CPATH) # PATH > https://www.gnu.org/software/libc/manual/html_node/Standard-Environment.html # Usage: # env_var=$(prepend_path "$prepend_to_var" "$existing_env_var") # export env_var # # Inputs: # $1 == path segment to be prepended to $2 # $2 == value of existing path-like environment variable prepend_path() ( path_to_add="$1" path_is_now="$2" if [ "" = "${path_is_now}" ] ; then # avoid dangling ":" printf "%s" "${path_to_add}" else printf "%s" "${path_to_add}:${path_is_now}" fi ) # Extract the name and location of this sourced script. # Generally, "ps -o comm=" is limited to a 15 character result, but it works # fine for this usage, because we are primarily interested in finding the name # of the execution shell, not the name of any calling script. vars_script_name="" vars_script_shell="$(ps -p "$$" -o comm=)" # ${var:-} needed to pass "set -eu" checks if [ -n "${ZSH_VERSION:-}" ] && [ -n "${ZSH_EVAL_CONTEXT:-}" ] ; then # zsh 5.x and later # shellcheck disable=2249 case $ZSH_EVAL_CONTEXT in (*:file*) vars_script_name="${(%):-%x}" ;; esac ; elif [ -n "${KSH_VERSION:-}" ] ; then # ksh, mksh or lksh if [ "$(set | grep -Fq "KSH_VERSION=.sh.version" ; echo $?)" -eq 0 ] ; then # ksh vars_script_name="${.sh.file}" ; else # mksh or lksh or [lm]ksh masquerading as ksh or sh # force [lm]ksh to issue error msg; which contains this script's path/filename, e.g.: # mksh: /home/ubuntu/intel/oneapi/vars.sh[137]: ${.sh.file}: bad substitution vars_script_name="$( (echo "${.sh.file}") 2>&1 )" || : ; vars_script_name="$(expr "${vars_script_name:-}" : '^.*sh: \(.*\)\[[0-9]*\]:')" ; fi elif [ -n "${BASH_VERSION:-}" ] ; then # bash # shellcheck disable=2128 (return 0 2>/dev/null) && vars_script_name="${BASH_SOURCE}" ; elif [ "dash" = "$vars_script_shell" ] ; then # dash # force dash to issue error msg; which contains this script's rel/path/filename, e.g.: # dash: 146: /home/ubuntu/intel/oneapi/vars.sh: Bad 
substitution vars_script_name="$( (echo "${.sh.file}") 2>&1 )" || : ; vars_script_name="$(expr "${vars_script_name:-}" : '^.*dash: [0-9]*: \(.*\):')" ; elif [ "sh" = "$vars_script_shell" ] ; then # could be dash masquerading as /bin/sh # force a shell error msg; which should contain this script's path/filename # sample error msg shown; assume this file is named "vars.sh"; as required by setvars.sh vars_script_name="$( (echo "${.sh.file}") 2>&1 )" || : ; if [ "$(printf "%s" "$vars_script_name" | grep -Eq "sh: [0-9]+: .*vars\.sh: " ; echo $?)" -eq 0 ] ; then # dash as sh # sh: 155: /home/ubuntu/intel/oneapi/vars.sh: Bad substitution vars_script_name="$(expr "${vars_script_name:-}" : '^.*sh: [0-9]*: \(.*\):')" ; fi else # unrecognized shell or dash being sourced from within a user's script # force a shell error msg; which should contain this script's path/filename # sample error msg shown; assume this file is named "vars.sh"; as required by setvars.sh vars_script_name="$( (echo "${.sh.file}") 2>&1 )" || : ; if [ "$(printf "%s" "$vars_script_name" | grep -Eq "^.+: [0-9]+: .*vars\.sh: " ; echo $?)" -eq 0 ] ; then # dash # .*: 164: intel/oneapi/vars.sh: Bad substitution vars_script_name="$(expr "${vars_script_name:-}" : '^.*: [0-9]*: \(.*\):')" ; else vars_script_name="" ; fi fi if [ "" = "$vars_script_name" ] ; then >&2 echo ":: ERROR: Unable to proceed: possible causes listed below." >&2 echo " This script must be sourced. Did you execute or source this script?" ; >&2 echo " Unrecognized/unsupported shell (supported: bash, zsh, ksh, m/lksh, dash)." ; >&2 echo " Can be caused by sourcing from ZSH version 4.x or older." ; return 255 2>/dev/null || exit 255 fi TBBROOT=$(get_script_path "${vars_script_name:-}")/.. 
TBB_TARGET_ARCH="intel64" TBB_ARCH_SUFFIX="" if [ -n "${SETVARS_ARGS:-}" ]; then tbb_arg_ia32="$(expr "${SETVARS_ARGS:-}" : '^.*\(ia32\)')" || true if [ -n "${tbb_arg_ia32:-}" ]; then TBB_TARGET_ARCH="ia32" fi else for arg do case "$arg" in (intel64|ia32) TBB_TARGET_ARCH="${arg}" ;; (*) ;; esac done fi TBB_LIB_NAME="libtbb.so.12" # Parse layout if [ -e "$TBBROOT/lib/$TBB_TARGET_ARCH" ]; then TBB_LIB_DIR="$TBB_TARGET_ARCH/gcc4.8" else if [ "$TBB_TARGET_ARCH" = "ia32" ] ; then TBB_ARCH_SUFFIX="32" fi TBB_LIB_DIR="" fi if [ -e "$TBBROOT/lib$TBB_ARCH_SUFFIX/$TBB_LIB_DIR/$TBB_LIB_NAME" ]; then export TBBROOT LIBRARY_PATH=$(prepend_path "${TBBROOT}/lib$TBB_ARCH_SUFFIX/$TBB_LIB_DIR" "${LIBRARY_PATH:-}") ; export LIBRARY_PATH LD_LIBRARY_PATH=$(prepend_path "${TBBROOT}/lib$TBB_ARCH_SUFFIX/$TBB_LIB_DIR" "${LD_LIBRARY_PATH:-}") ; export LD_LIBRARY_PATH CPATH=$(prepend_path "${TBBROOT}/include" "${CPATH:-}") ; export CPATH CMAKE_PREFIX_PATH=$(prepend_path "${TBBROOT}" "${CMAKE_PREFIX_PATH:-}") ; export CMAKE_PREFIX_PATH PKG_CONFIG_PATH=$(prepend_path "${TBBROOT}/lib$TBB_ARCH_SUFFIX/pkgconfig" "${PKG_CONFIG_PATH:-}") ; export PKG_CONFIG_PATH else >&2 echo "ERROR: $TBB_LIB_NAME library does not exist in $TBBROOT/lib$TBB_ARCH_SUFFIX/$TBB_LIB_DIR." return 255 2>/dev/null || exit 255 fi ================================================ FILE: third-party/tbb/integration/linux/env/vars.sh.in ================================================ #!/bin/sh # # Copyright (c) 2005-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. export TBBROOT=@TBBROOT_REPLACEMENT@ LD_LIBRARY_PATH="@LIBRARY_PATH_REPLACEMENT@:${LD_LIBRARY_PATH}"; export LD_LIBRARY_PATH LIBRARY_PATH="@LIBRARY_PATH_REPLACEMENT@:${LIBRARY_PATH}"; export LIBRARY_PATH CPATH="${TBBROOT}/include:${CPATH}"; export CPATH PKG_CONFIG_PATH="@LIBRARY_PATH_REPLACEMENT@/pkgconfig:${PKG_CONFIG_PATH}"; export PKG_CONFIG_PATH @CMAKE_ENVIRONMENT_SOURCING_STRING@ ================================================ FILE: third-party/tbb/integration/linux/modulefiles/tbb ================================================ #%Module1.0################################################################### # # Copyright (c) 2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # This modulefile requires Environment Modules 4.1 or later. # Type `module --version` to determine the current installed version. ############################################################################## set min_tcl_ver 8.4 if { $tcl_version < $min_tcl_ver } { puts stderr " " puts stderr "ERROR: This modulefile requires tcl $min_tcl_ver or greater." puts stderr "Your system reports that tclsh version $tcl_version is installed." 
exit 1 } # if modulefile script name is a symlink, resolve it to get the fully # qualified pathname that points to the actual modulefile script # see: https://wiki.tcl-lang.org/page/file+normalize set scriptpath "${ModulesCurrentModulefile}" set scriptpath "[file dirname [file normalize "$scriptpath/___"]]" # define componentroot, modulefilepath, modulefilename and modulefilever set modulefilename "[file tail [file dirname "${scriptpath}"]]" set modulefilever "[file tail "${scriptpath}"]" set modulefilepath "${scriptpath}" set componentroot "[file dirname [file dirname [file dirname [file dirname "${scriptpath}"]]]]" ############################################################################## module-whatis "Name: Intel(R) oneAPI Threading Building Blocks" module-whatis "Version: $modulefilename/$modulefilever" module-whatis "Description: Flexible threading library for adding parallelism to complex applications across accelerated architectures." module-whatis "URL: https://www.intel.com/content/www/us/en/developer/tools/oneapi/onetbb.html" module-whatis "Dependencies: none" proc ModulesHelp { } { global modulefilename global modulefilever puts "module whatis ${modulefilename}/${modulefilever}" } ############################################################################## # Define environment variables needed for an isolated component install. 
set tbbroot "$componentroot" set tbb_target_arch "intel64" setenv TBBROOT "$tbbroot" prepend-path CPATH "$tbbroot/include" prepend-path LIBRARY_PATH "$tbbroot/lib" prepend-path LD_LIBRARY_PATH "$tbbroot/lib" prepend-path CMAKE_PREFIX_PATH "$tbbroot" prepend-path PKG_CONFIG_PATH "$tbbroot/lib/pkgconfig" ================================================ FILE: third-party/tbb/integration/linux/modulefiles/tbb32 ================================================ #%Module1.0################################################################### # # Copyright (c) 2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # This modulefile requires Environment Modules 4.1 or later. # Type `module --version` to determine the current installed version. ############################################################################## set min_tcl_ver 8.4 if { $tcl_version < $min_tcl_ver } { puts stderr " " puts stderr "ERROR: This modulefile requires tcl $min_tcl_ver or greater." puts stderr "Your system reports that tclsh version $tcl_version is installed." 
exit 1 } # if modulefile script name is a symlink, resolve it to get the fully # qualified pathname that points to the actual modulefile script # see: https://wiki.tcl-lang.org/page/file+normalize set scriptpath "${ModulesCurrentModulefile}" set scriptpath "[file dirname [file normalize "$scriptpath/___"]]" # define componentroot, modulefilepath, modulefilename and modulefilever set modulefilename "[file tail [file dirname "${scriptpath}"]]" set modulefilever "[file tail "${scriptpath}"]" set modulefilepath "${scriptpath}" set componentroot "[file dirname [file dirname [file dirname [file dirname "${scriptpath}"]]]]" ############################################################################## module-whatis "Name: Intel(R) oneAPI Threading Building Blocks" module-whatis "Version: $modulefilename/$modulefilever" module-whatis "Description: Flexible threading library for adding parallelism to complex applications across accelerated architectures." module-whatis "URL: https://www.intel.com/content/www/us/en/developer/tools/oneapi/onetbb.html" module-whatis "Dependencies: none" proc ModulesHelp { } { global modulefilename global modulefilever puts "module whatis ${modulefilename}/${modulefilever}" } ############################################################################## # Define environment variables needed for an isolated component install. 
set tbbroot "$componentroot" set tbb_target_arch "ia32" setenv TBBROOT "$tbbroot" prepend-path CPATH "$tbbroot/include32:$tbbroot/include" prepend-path LIBRARY_PATH "$tbbroot/lib32" prepend-path LD_LIBRARY_PATH "$tbbroot/lib32" prepend-path CMAKE_PREFIX_PATH "$tbbroot" prepend-path PKG_CONFIG_PATH "$tbbroot/lib32/pkgconfig" ================================================ FILE: third-party/tbb/integration/linux/oneapi/vars.sh ================================================ #!/bin/sh # shellcheck shell=sh # # Copyright (c) 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. if [ -z "${SETVARS_CALL:-}" ] ; then >&2 echo " " >&2 echo ":: ERROR: This script must be sourced by setvars.sh." >&2 echo " Try 'source /setvars.sh --help' for help." >&2 echo " " return 255 fi if [ -z "${ONEAPI_ROOT:-}" ] ; then >&2 echo " " >&2 echo ":: ERROR: This script requires that the ONEAPI_ROOT env variable is set." >&2 echo " Try 'source \setvars.sh --help' for help." >&2 echo " " return 254 fi TBBROOT="${ONEAPI_ROOT}"; export TBBROOT ================================================ FILE: third-party/tbb/integration/linux/sys_check/sys_check.sh ================================================ #!/bin/sh # # Copyright (c) 2019-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Locate the directory holding this (sourced) script and pull in the shared
# helper script three directories above it, forwarding the caller's arguments.
# Every expansion is quoted so that an install path or an argument containing
# whitespace is passed through as a single word instead of being split.
LOC=$(realpath "$(dirname "${BASH_SOURCE[0]}")")
source "$LOC/../../../common.sh" "$@"

# This check itself never reports a failure: it always returns 0.
ERRORSTATE=0
return $ERRORSTATE

================================================
FILE: third-party/tbb/integration/mac/env/vars.sh
================================================
#!/bin/sh
# shellcheck shell=sh
#
# Copyright (c) 2005-2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Get absolute path to script. Gets a relative path as argument and outputs an absolute path.
get_script_path() (
  # Runs in a subshell, so neither the variables nor the `cd` leak to the caller.
  target="$1"

  # Follow the symlink chain one hop at a time; a relative link target is
  # interpreted relative to the physical directory that holds the link.
  while [ -L "$target" ] ; do
    link_home=$(command dirname -- "$target")
    link_home=$(cd "$link_home" && command pwd -P)
    target="$(readlink "$target")"
    case $target in
      (/*) ;;
      (*) target="$link_home/$target" ;;
    esac
  done

  # Print the physical directory of the fully resolved path (no trailing newline).
  resolved_dir=$(command dirname -- "$target")
  resolved_dir=$(cd "$resolved_dir" && command pwd -P)
  printf "%s" "$resolved_dir"
)

# Print the base name of the running process/script.
#   zsh:    the command name that ps reports for the current shell process
#   others: $1 with any chain of symlinks followed, reduced to its base name
_vars_get_proc_name() {
  if [ -z "${ZSH_VERSION:-}" ] ; then
    script="$1"
    until [ ! -L "$script" ] ; do
      script="$(readlink "$script")"
    done
  else
    script="$(ps -p "$$" -o comm=)"
  fi
  basename -- "$script"
}

# If $0 resolves to this file's own name, the script was executed rather than
# sourced: report the usage error and stop.
_vars_this_script_name="vars.sh"
if [ "$(_vars_get_proc_name "$0")" = "$_vars_this_script_name" ] ; then
  echo ":: ERROR: Incorrect usage: this script must be sourced."
  echo " Usage: . path/to/${_vars_this_script_name}"
  return 255 2>/dev/null || exit 255
fi

# Prepend path segment(s) to path-like env vars (PATH, CPATH, etc.).
# prepend_path() avoids the dangling ":" that affects some env vars (PATH and CPATH)
# PATH > https://www.gnu.org/software/libc/manual/html_node/Standard-Environment.html
# Usage:
#   env_var=$(prepend_path "$prepend_to_var" "$existing_env_var")
#   export env_var
#
# Inputs:
#   $1 == path segment to be prepended to $2
#   $2 == value of existing path-like environment variable
prepend_path() (
  new_segment="$1"
  current_value="$2"

  case $current_value in
    ("") printf "%s" "${new_segment}" ;;   # nothing to join with: no separator
    (*)  printf "%s" "${new_segment}:${current_value}" ;;
  esac
)

# Extract the name and location of this sourced script.
# Generally, "ps -o comm=" is limited to a 15 character result, but it works
# fine for this usage, because we are primarily interested in finding the name
# of the execution shell, not the name of any calling script.
vars_script_name="" vars_script_shell="$(ps -p "$$" -o comm=)" # ${var:-} needed to pass "set -eu" checks if [ -n "${ZSH_VERSION:-}" ] && [ -n "${ZSH_EVAL_CONTEXT:-}" ] ; then # zsh 5.x and later # shellcheck disable=2249 case $ZSH_EVAL_CONTEXT in (*:file*) vars_script_name="${(%):-%x}" ;; esac ; elif [ -n "${KSH_VERSION:-}" ] ; then # ksh, mksh or lksh if [ "$(set | grep -Fq "KSH_VERSION=.sh.version" ; echo $?)" -eq 0 ] ; then # ksh vars_script_name="${.sh.file}" ; else # mksh or lksh or [lm]ksh masquerading as ksh or sh # force [lm]ksh to issue error msg; which contains this script's path/filename, e.g.: # mksh: /home/ubuntu/intel/oneapi/vars.sh[137]: ${.sh.file}: bad substitution vars_script_name="$( (echo "${.sh.file}") 2>&1 )" || : ; vars_script_name="$(expr "${vars_script_name:-}" : '^.*sh: \(.*\)\[[0-9]*\]:')" ; fi elif [ -n "${BASH_VERSION:-}" ] ; then # bash # shellcheck disable=2128 (return 0 2>/dev/null) && vars_script_name="${BASH_SOURCE}" ; elif [ "dash" = "$vars_script_shell" ] ; then # dash # force dash to issue error msg; which contains this script's rel/path/filename, e.g.: # dash: 146: /home/ubuntu/intel/oneapi/vars.sh: Bad substitution vars_script_name="$( (echo "${.sh.file}") 2>&1 )" || : ; vars_script_name="$(expr "${vars_script_name:-}" : '^.*dash: [0-9]*: \(.*\):')" ; elif [ "sh" = "$vars_script_shell" ] ; then # could be dash masquerading as /bin/sh # force a shell error msg; which should contain this script's path/filename # sample error msg shown; assume this file is named "vars.sh"; as required by setvars.sh vars_script_name="$( (echo "${.sh.file}") 2>&1 )" || : ; if [ "$(printf "%s" "$vars_script_name" | grep -Eq "sh: [0-9]+: .*vars\.sh: " ; echo $?)" -eq 0 ] ; then # dash as sh # sh: 155: /home/ubuntu/intel/oneapi/vars.sh: Bad substitution vars_script_name="$(expr "${vars_script_name:-}" : '^.*sh: [0-9]*: \(.*\):')" ; fi else # unrecognized shell or dash being sourced from within a user's script # force a shell error msg; which should 
contain this script's path/filename # sample error msg shown; assume this file is named "vars.sh"; as required by setvars.sh vars_script_name="$( (echo "${.sh.file}") 2>&1 )" || : ; if [ "$(printf "%s" "$vars_script_name" | grep -Eq "^.+: [0-9]+: .*vars\.sh: " ; echo $?)" -eq 0 ] ; then # dash # .*: 164: intel/oneapi/vars.sh: Bad substitution vars_script_name="$(expr "${vars_script_name:-}" : '^.*: [0-9]*: \(.*\):')" ; else vars_script_name="" ; fi fi if [ "" = "$vars_script_name" ] ; then >&2 echo ":: ERROR: Unable to proceed: possible causes listed below." >&2 echo " This script must be sourced. Did you execute or source this script?" ; >&2 echo " Unrecognized/unsupported shell (supported: bash, zsh, ksh, m/lksh, dash)." ; >&2 echo " Can be caused by sourcing from ZSH version 4.x or older." ; return 255 2>/dev/null || exit 255 fi TBBROOT=$(get_script_path "${vars_script_name:-}")/.. LIBTBB_NAME="libtbb.dylib" if [ -e "$TBBROOT/lib/$LIBTBB_NAME" ]; then export TBBROOT LIBRARY_PATH=$(prepend_path "${TBBROOT}/lib" "${LIBRARY_PATH:-}") ; export LIBRARY_PATH DYLD_LIBRARY_PATH=$(prepend_path "${TBBROOT}/lib" "${DYLD_LIBRARY_PATH:-}") ; export DYLD_LIBRARY_PATH CPATH=$(prepend_path "${TBBROOT}/include" "${CPATH:-}") ; export CPATH CMAKE_PREFIX_PATH=$(prepend_path "${TBBROOT}" "${CMAKE_PREFIX_PATH:-}") ; export CMAKE_PREFIX_PATH PKG_CONFIG_PATH=$(prepend_path "${TBBROOT}/lib/pkgconfig" "${PKG_CONFIG_PATH:-}") ; export PKG_CONFIG_PATH else >&2 echo "ERROR: $LIBTBB_NAME library does not exist in $TBBROOT/lib." return 255 2>/dev/null || exit 255 fi ================================================ FILE: third-party/tbb/integration/mac/env/vars.sh.in ================================================ #!/bin/sh # # Copyright (c) 2005-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. export TBBROOT=@TBBROOT_REPLACEMENT@ DYLD_LIBRARY_PATH="@LIBRARY_PATH_REPLACEMENT@:${DYLD_LIBRARY_PATH}"; export DYLD_LIBRARY_PATH LIBRARY_PATH="@LIBRARY_PATH_REPLACEMENT@:${LIBRARY_PATH}"; export LIBRARY_PATH CPATH="${TBBROOT}/include:${CPATH}"; export CPATH PKG_CONFIG_PATH="@LIBRARY_PATH_REPLACEMENT@/pkgconfig:${PKG_CONFIG_PATH}"; export PKG_CONFIG_PATH @CMAKE_ENVIRONMENT_SOURCING_STRING@ ================================================ FILE: third-party/tbb/integration/pkg-config/tbb.pc.in ================================================ # Copyright (c) 2021-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. prefix=@_prefix_for_pc_file@ libdir=@_libdir_for_pc_file@ includedir=@_includedir_for_pc_file@ Name: oneAPI Threading Building Blocks (oneTBB) Description: C++ library for parallel programming on multi-core processors. 
URL: https://github.com/uxlfoundation/oneTBB Version: @TBB_VERSION@ Libs: -L${libdir} @_tbb_pc_extra_libdir@ -l@_tbb_pc_lib_name@ Cflags: -I${includedir} ================================================ FILE: third-party/tbb/integration/windows/env/vars.bat ================================================ @echo off REM REM Copyright (c) 2005-2023 Intel Corporation REM REM Licensed under the Apache License, Version 2.0 (the "License"); REM you may not use this file except in compliance with the License. REM You may obtain a copy of the License at REM REM http://www.apache.org/licenses/LICENSE-2.0 REM REM Unless required by applicable law or agreed to in writing, software REM distributed under the License is distributed on an "AS IS" BASIS, REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. REM See the License for the specific language governing permissions and REM limitations under the License. REM REM Syntax: REM %SCRIPT_NAME% [^] [^] REM ^ should be one of the following REM ia32 : Set up for IA-32 architecture REM intel64 : Set up for Intel(R) 64 architecture REM if ^ is not set Intel(R) 64 architecture will be used REM ^ should be one of the following REM vs2019 : Set to use with Microsoft Visual Studio 2019 runtime DLLs REM vs2022 : Set to use with Microsoft Visual Studio 2022 runtime DLLs REM all : Set to use oneTBB statically linked with Microsoft Visual C++ runtime REM if ^ is not set oneTBB dynamically linked with Microsoft Visual C++ runtime will be used. set "SCRIPT_NAME=%~nx0" set "TBB_SCRIPT_DIR=%~d0%~p0" set "TBBROOT=%TBB_SCRIPT_DIR%.." 
:: Set the default arguments set TBB_TARGET_ARCH=intel64 set TBB_ARCH_SUFFIX= set TBB_TARGET_VS=vc14 :ParseArgs :: Parse the incoming arguments if /i "%1"=="" goto ParseLayout if /i "%1"=="ia32" (set TBB_TARGET_ARCH=ia32) & shift & goto ParseArgs if /i "%1"=="intel64" (set TBB_TARGET_ARCH=intel64) & shift & goto ParseArgs if /i "%1"=="vs2019" (set TBB_TARGET_VS=vc14) & shift & goto ParseArgs if /i "%1"=="vs2022" (set TBB_TARGET_VS=vc14) & shift & goto ParseArgs if /i "%1"=="all" (set TBB_TARGET_VS=vc_mt) & shift & goto ParseArgs :ParseLayout if exist "%TBBROOT%\redist\" ( set "TBB_BIN_DIR=%TBBROOT%\redist" set "TBB_SUBDIR=%TBB_TARGET_ARCH%" goto SetEnv ) if "%TBB_TARGET_ARCH%" == "ia32" ( set TBB_ARCH_SUFFIX=32 ) if exist "%TBBROOT%\bin%TBB_ARCH_SUFFIX%" ( set "TBB_BIN_DIR=%TBBROOT%\bin%TBB_ARCH_SUFFIX%" if "%TBB_TARGET_VS%" == "vc14" ( set TBB_TARGET_VS= ) goto SetEnv ) :: Couldn't parse TBBROOT/bin, unset variable set TBB_ARCH_SUFFIX= if exist "%TBBROOT%\..\redist\" ( set "TBB_BIN_DIR=%TBBROOT%\..\redist" set "TBB_SUBDIR=%TBB_TARGET_ARCH%\tbb" goto SetEnv ) :SetEnv if exist "%TBB_BIN_DIR%\%TBB_SUBDIR%\%TBB_TARGET_VS%\tbb12.dll" ( set "TBB_DLL_PATH=%TBB_BIN_DIR%\%TBB_SUBDIR%\%TBB_TARGET_VS%" ) else ( echo: echo :: ERROR: tbb12.dll library does not exist in "%TBB_BIN_DIR%\%TBB_SUBDIR%\%TBB_TARGET_VS%\" echo: exit /b 255 ) set "PATH=%TBB_DLL_PATH%;%PATH%" set "LIB=%TBBROOT%\lib%TBB_ARCH_SUFFIX%\%TBB_SUBDIR%\%TBB_TARGET_VS%;%LIB%" set "INCLUDE=%TBBROOT%\include;%INCLUDE%" set "CPATH=%TBBROOT%\include;%CPATH%" set "CMAKE_PREFIX_PATH=%TBBROOT%;%CMAKE_PREFIX_PATH%" set "PKG_CONFIG_PATH=%TBBROOT%\lib%TBB_ARCH_SUFFIX%\pkgconfig;%PKG_CONFIG_PATH%" :End exit /B 0 ================================================ FILE: third-party/tbb/integration/windows/env/vars.bat.in ================================================ @echo off REM REM Copyright (c) 2005-2021 Intel Corporation REM REM Licensed under the Apache License, Version 2.0 (the "License"); REM you may not use this 
file except in compliance with the License. REM You may obtain a copy of the License at REM REM http://www.apache.org/licenses/LICENSE-2.0 REM REM Unless required by applicable law or agreed to in writing, software REM distributed under the License is distributed on an "AS IS" BASIS, REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. REM See the License for the specific language governing permissions and REM limitations under the License. REM @echo off set "TBBROOT=@TBBROOT_REPLACEMENT@" set "TBB_DLL_PATH=@BINARY_PATH_REPLACEMENT@" set "INCLUDE=%TBBROOT%\include;%INCLUDE%" set "CPATH=%TBBROOT%\include;%CPATH%" set "LIB=@LIBRARY_PATH_REPLACEMENT@;%LIB%" set "PATH=@BINARY_PATH_REPLACEMENT@;%PATH%" set "PKG_CONFIG_PATH=@LIBRARY_PATH_REPLACEMENT@\pkgconfig;%PKG_CONFIG_PATH%" @CMAKE_ENVIRONMENT_SOURCING_STRING@ ================================================ FILE: third-party/tbb/integration/windows/nuget/inteltbb.devel.win.targets ================================================ $(MSBuildThisFileDirectory)..\..\build\native\include;%(AdditionalIncludeDirectories) TBB_USE_DEBUG;%(PreprocessorDefinitions) $(MSBuildThisFileDirectory)..\..\build\native\win-x86;%(AdditionalLibraryDirectories) tbb12.lib;tbbmalloc.lib;tbbmalloc_proxy.lib;%(AdditionalDependencies) $(MSBuildThisFileDirectory)..\..\build\native\win-x64;%(AdditionalLibraryDirectories) tbb12.lib;tbbmalloc.lib;tbbmalloc_proxy.lib;%(AdditionalDependencies) $(MSBuildThisFileDirectory)..\..\build\native\win-x86;%(AdditionalLibraryDirectories) tbb12_debug.lib;tbbmalloc_debug.lib;tbbmalloc_proxy_debug.lib;%(AdditionalDependencies) $(MSBuildThisFileDirectory)..\..\build\native\win-x64;%(AdditionalLibraryDirectories) tbb12_debug.lib;tbbmalloc_debug.lib;tbbmalloc_proxy_debug.lib;%(AdditionalDependencies) ================================================ FILE: third-party/tbb/integration/windows/nuget/inteltbb.redist.win.targets ================================================ 
================================================
FILE: third-party/tbb/integration/windows/oneapi/vars.bat
================================================
@echo off
REM
REM Copyright (c) 2023 Intel Corporation
REM
REM Licensed under the Apache License, Version 2.0 (the "License");
REM you may not use this file except in compliance with the License.
REM You may obtain a copy of the License at
REM
REM     http://www.apache.org/licenses/LICENSE-2.0
REM
REM Unless required by applicable law or agreed to in writing, software
REM distributed under the License is distributed on an "AS IS" BASIS,
REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
REM See the License for the specific language governing permissions and
REM limitations under the License.
REM

REM This script is meant to be driven by setvars.bat: refuse to run unless the
REM SETVARS_CALL and ONEAPI_ROOT variables are defined.
if not defined SETVARS_CALL (
    echo:
    echo :: ERROR: This script must be executed by setvars.bat.
    echo: Try '[install-dir]\setvars.bat --help' for help.
    echo:
    exit /b 255
)

if not defined ONEAPI_ROOT (
    echo:
    echo :: ERROR: This script requires that the ONEAPI_ROOT env variable is set.
    echo: Try '[install-dir]\setvars.bat --help' for help.
    echo:
    exit /b 254
)

set "TBBROOT=%ONEAPI_ROOT%"

:: Set the default arguments
set "TBB_TARGET_ARCH=%INTEL_TARGET_ARCH%"
set TBB_TARGET_VS=
set ARCH_SUFFIX=

:ParseArgs
:: Parse the incoming arguments
:: An empty or unrecognized argument ends parsing and falls through to SetEnv.
if /i "%1"=="" goto SetEnv
if /i "%1"=="vs2019" (set TBB_TARGET_VS= ) & shift & goto ParseArgs
if /i "%1"=="vs2022" (set TBB_TARGET_VS= ) & shift & goto ParseArgs
if /i "%1"=="all" (set TBB_TARGET_VS=vc_mt) & shift & goto ParseArgs

:SetEnv
:: The architecture suffix is derived after the SetEnv label so that it is
:: applied on every path, including the "goto SetEnv" taken once the argument
:: list is exhausted.
if "%TBB_TARGET_ARCH%"=="ia32" set ARCH_SUFFIX=32

if exist "%TBBROOT%\bin%ARCH_SUFFIX%\%TBB_TARGET_VS%\tbb12.dll" (
    set "TBB_DLL_PATH=%TBBROOT%\bin%ARCH_SUFFIX%\%TBB_TARGET_VS%"
)

:End
exit /B 0

================================================
FILE: third-party/tbb/integration/windows/sys_check/sys_check.bat
================================================
@echo off
REM
REM Copyright (c) 2019-2021 Intel Corporation
REM
REM Licensed under the Apache License, Version 2.0 (the "License");
REM you may not use this file except in compliance with the License.
REM You may obtain a copy of the License at
REM
REM     http://www.apache.org/licenses/LICENSE-2.0
REM
REM Unless required by applicable law or agreed to in writing, software
REM distributed under the License is distributed on an "AS IS" BASIS,
REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
REM See the License for the specific language governing permissions and
REM limitations under the License.
REM

exit /B 0

================================================
FILE: third-party/tbb/python/CMakeLists.txt
================================================
# Copyright (c) 2020-2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. find_package(PythonInterp 3.5 REQUIRED) set(PYTHON_BUILD_WORK_DIR python_build) add_custom_target( python_copy COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/tbb ${PYTHON_BUILD_WORK_DIR}/tbb COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/TBB.py ${PYTHON_BUILD_WORK_DIR} COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/setup.py ${PYTHON_BUILD_WORK_DIR} ) # Python build requires path to TBB headers get_target_property(TBB_INCLUDES tbb INTERFACE_INCLUDE_DIRECTORIES) foreach(dir ${TBB_INCLUDES}) if (${dir} MATCHES " install --prefix build -f COMMENT "Build and install to work directory the oneTBB Python module" ) add_test(NAME python_test COMMAND ${CMAKE_COMMAND} -DTBB_BINARIES_PATH=$ -DPYTHON_MODULE_BUILD_PATH=${PYTHON_BUILD_WORK_DIR}/build -P ${PROJECT_SOURCE_DIR}/cmake/python/test_launcher.cmake) install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${PYTHON_BUILD_WORK_DIR}/build/ DESTINATION . COMPONENT tbb4py) if (UNIX AND NOT APPLE) add_subdirectory(rml) add_dependencies(python_build irml) endif() ================================================ FILE: third-party/tbb/python/README.md ================================================ # Python* API for Intel® oneAPI Threading Building Blocks (oneTBB) . ## Overview It is a preview Python* module which unlocks opportunities for additional performance in multi-threaded and multiprocess Python programs by enabling threading composability between two or more thread-enabled libraries like Numpy, Scipy, Sklearn, Dask, Joblib, and etc. 
The biggest improvement can be achieved when a task pool like the ThreadPool or Pool from the Python standard library or libraries like Dask or Joblib (used either in multi-threading or multi-processing mode) execute tasks calling compute-intensive functions of Numpy/Scipy/Sklearn/PyDAAL which in turn are parallelized using Intel® oneAPI Math Kernel Library or/and oneTBB. The module implements Pool class with the standard interface using oneTBB which can be used to replace Python's ThreadPool. Thanks to the monkey-patching technique implemented in class Monkey, no source code change is needed in order to enable threading composability in Python programs. For more information and examples, please refer to [forum discussion](https://community.intel.com/t5/Intel-Distribution-for-Python/TBB-module-Unleash-parallel-performance-of-Python-programs/m-p/1074459). ## Directories - **rml** - The folder contains sources for building the plugin with cross-process dynamic thread scheduler implementation. - **tbb** - The folder contains Python module sources. ## Files - **setup.py** - Standard Python setup script. - **TBB.py** - Alternative entry point for Python module. ## CMake predefined targets - `irml` - compilation of plugin with cross-process dynamic thread scheduler implementation. - `python_build` - building of oneTBB module for Python. ## Command-line interface - `python3 -m tbb -h` - Print documentation on command-line interface. - `pydoc tbb` - Read built-in documentation for Python interfaces. - `python3 -m tbb your_script.py` - Run your_script.py in context of `with tbb.Monkey():` when oneTBB is enabled. By default only multi-threading will be covered. - `python3 -m tbb --ipc your_script.py` - Run your_script.py in context of `with tbb.Monkey():` when oneTBB enabled in both multi-threading and multi-processing modes. - `python3 setup.py build -b -f check` - Build oneTBB module for Python. 
(Prerequisites: built and sourced oneTBB and IRML libraries) - `python3 setup.py build -b build_ext -I -L install -f ` - Build and install oneTBB module for Python. (Prerequisites: built oneTBB and IRML libraries) - `python3 -m TBB test` - Run the tests for oneTBB module for Python. - `python3 -m tbb test` - Run the tests for oneTBB module for Python. ## System Requirements - The Python module was not tested on older versions of Python, so Python 3.5 or higher is required. - SWIG must be of version 3.0.6 or higher. ================================================ FILE: third-party/tbb/python/TBB.py ================================================ # Copyright (c) 2016-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from tbb import * from tbb import __all__, __doc__ if __name__ == "__main__": from tbb import _main import sys sys.exit(_main()) ================================================ FILE: third-party/tbb/python/rml/CMakeLists.txt ================================================ # Copyright (c) 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. add_library(irml ../../src/tbb/allocator.cpp ../../src/tbb/dynamic_link.cpp ../../src/tbb/misc.cpp ../../src/tbb/misc_ex.cpp ../../src/tbb/exception.cpp ipc_server.cpp ipc_utils.cpp ) add_library(TBB::irml ALIAS irml) set_target_properties(irml PROPERTIES VERSION 1) target_include_directories(irml PUBLIC $ $) target_compile_options(irml PRIVATE ${TBB_CXX_STD_FLAG} # TODO: consider making it PUBLIC. ${TBB_MMD_FLAG} ${TBB_DSE_FLAG} ${TBB_WARNING_LEVEL} ${TBB_LIB_COMPILE_FLAGS} ${TBB_COMMON_COMPILE_FLAGS} ) target_compile_definitions(irml PUBLIC $<$:TBB_USE_DEBUG>) # Prefer using target_link_options instead of target_link_libraries to specify link options because # target_link_libraries may incorrectly handle some options (on Windows, for example). 
if (COMMAND target_link_options) target_link_options(irml PRIVATE ${TBB_LIB_LINK_FLAGS} ${TBB_COMMON_LINK_FLAGS} ) else() target_link_libraries(irml PRIVATE ${TBB_LIB_LINK_FLAGS} ${TBB_COMMON_LINK_FLAGS} ) endif() target_link_libraries(irml PRIVATE Threads::Threads ${TBB_LIB_LINK_LIBS} ${TBB_COMMON_LINK_LIBS} ) if (DEFINED TBB_SIGNTOOL) string(REPLACE " " ";" TBB_SIGNTOOL_ARGS "${TBB_SIGNTOOL_ARGS}") add_custom_command(TARGET irml POST_BUILD COMMAND ${TBB_SIGNTOOL} $ ${TBB_SIGNTOOL_ARGS}) endif() install(TARGETS irml EXPORT TBBTargets LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT tbb4py RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT tbb4py ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT tbb4py) ================================================ FILE: third-party/tbb/python/rml/ipc_server.cpp ================================================ /* Copyright (c) 2017-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include #include #include "../../src/tbb/rml_tbb.h" #include "../../src/tbb/rml_thread_monitor.h" #include "../../src/tbb/scheduler_common.h" #include "../../src/tbb/governor.h" #include "../../src/tbb/misc.h" #include "tbb/cache_aligned_allocator.h" #include "ipc_utils.h" #include #include namespace rml { namespace internal { static const char* IPC_ENABLE_VAR_NAME = "IPC_ENABLE"; typedef versioned_object::version_type version_type; extern "C" factory::status_type __RML_open_factory(factory& f, version_type& /*server_version*/, version_type /*client_version*/) { if( !tbb::internal::rml::get_enable_flag( IPC_ENABLE_VAR_NAME ) ) { return factory::st_incompatible; } // Hack to keep this library from being closed static std::atomic one_time_flag{false}; bool expected = false; if( one_time_flag.compare_exchange_strong(expected, true) ) { __TBB_ASSERT( (size_t)f.library_handle!=factory::c_dont_unload, nullptr ); #if _WIN32||_WIN64 f.library_handle = reinterpret_cast(factory::c_dont_unload); #else f.library_handle = reinterpret_cast(factory::c_dont_unload); #endif } // End of hack return factory::st_success; } extern "C" void __RML_close_factory(factory& /*f*/) { } class ipc_thread_monitor : public tbb::detail::r1::rml::internal::thread_monitor { public: ipc_thread_monitor() : thread_monitor() {} #if USE_WINTHREAD #elif USE_PTHREAD static handle_type launch(thread_routine_type thread_routine, void* arg, size_t stack_size); #endif }; #if USE_WINTHREAD #elif USE_PTHREAD inline ipc_thread_monitor::handle_type ipc_thread_monitor::launch(void* (*thread_routine)(void*), void* arg, size_t stack_size) { pthread_attr_t s; if( pthread_attr_init( &s ) ) return 0; if( stack_size>0 ) { if( pthread_attr_setstacksize( &s, stack_size ) ) return 0; } pthread_t handle; if( pthread_create( &handle, &s, thread_routine, arg ) ) return 0; if( pthread_attr_destroy( &s ) ) return 0; return handle; } #endif }} // rml::internal using rml::internal::ipc_thread_monitor; using 
tbb::internal::rml::get_shared_name; namespace tbb { namespace detail { namespace r1 { bool terminate_on_exception() { return false; } } namespace rml { typedef ipc_thread_monitor::handle_type thread_handle; class ipc_server; static const char* IPC_MAX_THREADS_VAR_NAME = "MAX_THREADS"; static const char* IPC_ACTIVE_SEM_PREFIX = "/__IPC_active"; static const char* IPC_STOP_SEM_PREFIX = "/__IPC_stop"; static const char* IPC_ACTIVE_SEM_VAR_NAME = "IPC_ACTIVE_SEMAPHORE"; static const char* IPC_STOP_SEM_VAR_NAME = "IPC_STOP_SEMAPHORE"; static const mode_t IPC_SEM_MODE = 0660; static std::atomic my_global_thread_count; using tbb_client = tbb::detail::r1::rml::tbb_client; using tbb_server = tbb::detail::r1::rml::tbb_server; using tbb_factory = tbb::detail::r1::rml::tbb_factory; using tbb::detail::r1::runtime_warning; char* get_sem_name(const char* name, const char* prefix) { __TBB_ASSERT(name != nullptr, nullptr); __TBB_ASSERT(prefix != nullptr, nullptr); char* value = std::getenv(name); std::size_t len = value == nullptr ? 0 : std::strlen(value); if (len > 0) { // TODO: consider returning the original string instead of the copied string. 
char* sem_name = new char[len + 1]; __TBB_ASSERT(sem_name != nullptr, nullptr); std::strncpy(sem_name, value, len+1); __TBB_ASSERT(sem_name[len] == 0, nullptr); return sem_name; } else { return get_shared_name(prefix); } } char* get_active_sem_name() { return get_sem_name(IPC_ACTIVE_SEM_VAR_NAME, IPC_ACTIVE_SEM_PREFIX); } char* get_stop_sem_name() { return get_sem_name(IPC_STOP_SEM_VAR_NAME, IPC_STOP_SEM_PREFIX); } static void release_thread_sem(sem_t* my_sem) { int old = my_global_thread_count.load(std::memory_order_relaxed); do { if( old<=0 ) return; } while( !my_global_thread_count.compare_exchange_strong(old, old-1) ); if( old>0 ) { sem_post( my_sem ); } } void set_sem_name(const char* name, const char* prefix) { __TBB_ASSERT(name != nullptr, nullptr); __TBB_ASSERT(prefix != nullptr, nullptr); const char* postfix = "_XXXXXX"; std::size_t plen = std::strlen(prefix); std::size_t xlen = std::strlen(postfix); char* templ = new char[plen + xlen + 1]; __TBB_ASSERT(templ != nullptr, nullptr); strncpy(templ, prefix, plen+1); __TBB_ASSERT(templ[plen] == 0, nullptr); strncat(templ, postfix, xlen + 1); __TBB_ASSERT(std::strlen(templ) == plen + xlen + 1, nullptr); // TODO: consider using mkstemp instead of mktemp. 
char* sem_name = mktemp(templ); if (sem_name != nullptr) { int status = setenv(name, sem_name, /*overwrite*/ 1); __TBB_ASSERT_EX(status == 0, nullptr); } delete[] templ; } extern "C" void set_active_sem_name() { set_sem_name(IPC_ACTIVE_SEM_VAR_NAME, IPC_ACTIVE_SEM_PREFIX); } extern "C" void set_stop_sem_name() { set_sem_name(IPC_STOP_SEM_VAR_NAME, IPC_STOP_SEM_PREFIX); } extern "C" void release_resources() { if( my_global_thread_count.load(std::memory_order_acquire)!=0 ) { char* active_sem_name = get_active_sem_name(); sem_t* my_active_sem = sem_open( active_sem_name, O_CREAT ); __TBB_ASSERT( my_active_sem, "Unable to open active threads semaphore" ); delete[] active_sem_name; do { release_thread_sem( my_active_sem ); } while( my_global_thread_count.load(std::memory_order_acquire)!=0 ); } } extern "C" void release_semaphores() { int status = 0; char* sem_name = nullptr; sem_name = get_active_sem_name(); if( sem_name==nullptr ) { runtime_warning("Can not get RML semaphore name"); return; } status = sem_unlink( sem_name ); if( status!=0 ) { if( errno==ENOENT ) { /* There is no semaphore with the given name, nothing to do */ } else { runtime_warning("Can not release RML semaphore"); return; } } delete[] sem_name; sem_name = get_stop_sem_name(); if( sem_name==nullptr ) { runtime_warning( "Can not get RML semaphore name" ); return; } status = sem_unlink( sem_name ); if( status!=0 ) { if( errno==ENOENT ) { /* There is no semaphore with the given name, nothing to do */ } else { runtime_warning("Can not release RML semaphore"); return; } } delete[] sem_name; } class ipc_worker: no_copy { protected: //! State in finite-state machine that controls the worker. /** State diagram: /----------stop---\ | ^ | V | | init --> starting --> normal | | | | | | V | | \------> quit <-------/<----/ */ enum state_t { //! *this is initialized st_init, //! *this has associated thread that is starting up. st_starting, //! Associated thread is doing normal life sequence. st_normal, //! 
Associated thread is stopped but can be started again. st_stop, //! Associated thread has ended normal life sequence and promises to never touch *this again. st_quit }; std::atomic my_state; //! Associated server ipc_server& my_server; //! Associated client tbb_client& my_client; //! index used for avoiding the 64K aliasing problem const size_t my_index; //! Monitor for sleeping when there is no work to do. /** The invariant that holds for sleeping workers is: "my_slack<=0 && my_state==st_normal && I am on server's list of asleep threads" */ ipc_thread_monitor my_thread_monitor; //! Handle of the OS thread associated with this worker thread_handle my_handle; //! Link for list of workers that are sleeping or have no associated thread. ipc_worker* my_next; friend class ipc_server; //! Actions executed by the associated thread void run(); //! Wake up associated thread (or launch a thread if there is none) bool wake_or_launch(); //! Called by a thread (usually not the associated thread) to commence termination. void start_shutdown(bool join); //! Called by a thread (usually not the associated thread) to commence stopping. 
void start_stopping(bool join); static __RML_DECL_THREAD_ROUTINE thread_routine(void* arg); static void release_handle(thread_handle my_handle, bool join); protected: ipc_worker(ipc_server& server, tbb_client& client, const size_t i) : my_server(server), my_client(client), my_index(i) { my_state = st_init; } }; //TODO: cannot bind to nfs_size from allocator.cpp since nfs_size is constexpr defined in another translation unit constexpr static size_t cache_line_sz = 128; #if _MSC_VER && !defined(__INTEL_COMPILER) // Suppress overzealous compiler warnings about uninstantiable class #pragma warning(push) #pragma warning(disable:4510 4610) #endif class padded_ipc_worker: public ipc_worker { char pad[cache_line_sz - sizeof(ipc_worker)%cache_line_sz]; public: padded_ipc_worker(ipc_server& server, tbb_client& client, const size_t i) : ipc_worker( server,client,i ) { suppress_unused_warning(pad); } }; #if _MSC_VER && !defined(__INTEL_COMPILER) #pragma warning(pop) #endif class ipc_waker : public padded_ipc_worker { private: static __RML_DECL_THREAD_ROUTINE thread_routine(void* arg); void run(); bool wake_or_launch(); friend class ipc_server; public: ipc_waker(ipc_server& server, tbb_client& client, const size_t i) : padded_ipc_worker( server, client, i ) {} }; class ipc_stopper : public padded_ipc_worker { private: static __RML_DECL_THREAD_ROUTINE thread_routine(void* arg); void run(); bool wake_or_launch(); friend class ipc_server; public: ipc_stopper(ipc_server& server, tbb_client& client, const size_t i) : padded_ipc_worker( server, client, i ) {} }; class ipc_server: public tbb_server, no_copy { private: tbb_client& my_client; //! Maximum number of threads to be created. /** Threads are created lazily, so maximum might not actually be reached. */ tbb_client::size_type my_n_thread; //! Stack size for each thread. */ const size_t my_stack_size; //! Number of jobs that could use their associated thread minus number of active threads. 
/** If negative, indicates oversubscription. If positive, indicates that more threads should run. Can be lowered asynchronously, but must be raised only while holding my_asleep_list_mutex, because raising it impacts the invariant for sleeping threads. */ std::atomic my_slack; //! Counter used to determine when to delete this. std::atomic my_ref_count; padded_ipc_worker* my_thread_array; //! List of workers that are asleep or committed to sleeping until notified by another thread. std::atomic my_asleep_list_root; //! Protects my_asleep_list_root typedef scheduler_mutex_type asleep_list_mutex_type; asleep_list_mutex_type my_asleep_list_mutex; //! Should server wait workers while terminate const bool my_join_workers; //! Service thread for waking of workers ipc_waker* my_waker; //! Service thread to stop threads ipc_stopper* my_stopper; //! Semaphore to account active threads sem_t* my_active_sem; //! Semaphore to account stop threads sem_t* my_stop_sem; #if TBB_USE_ASSERT std::atomic my_net_slack_requests; #endif /* TBB_USE_ASSERT */ //! Wake up to two sleeping workers, if there are any sleeping. /** The call is used to propagate a chain reaction where each thread wakes up two threads, which in turn each wake up two threads, etc. */ void propagate_chain_reaction() { // First test of a double-check idiom. Second test is inside wake_some(0). if( my_slack.load(std::memory_order_acquire)>0 ) { int active_threads = 0; if( try_get_active_thread() ) { ++active_threads; if( try_get_active_thread() ) { ++active_threads; } wake_some( 0, active_threads ); } } } //! Try to add t to list of sleeping workers bool try_insert_in_asleep_list(ipc_worker& t); //! Try to add t to list of sleeping workers even if there is some work to do bool try_insert_in_asleep_list_forced(ipc_worker& t); //! Equivalent of adding additional_slack to my_slack and waking up to 2 threads if my_slack permits. void wake_some(int additional_slack, int active_threads); //! 
Equivalent of adding additional_slack to my_slack and waking up to 1 thread if my_slack permits. void wake_one_forced(int additional_slack); //! Stop one thread from asleep list bool stop_one(); //! Wait for active thread bool wait_active_thread(); //! Try to get active thread bool try_get_active_thread(); //! Release active thread void release_active_thread(); //! Wait for thread to stop bool wait_stop_thread(); //! Add thread to stop list void add_stop_thread(); void remove_server_ref() { if( --my_ref_count==0 ) { my_client.acknowledge_close_connection(); this->~ipc_server(); tbb::cache_aligned_allocator().deallocate( this, 1 ); } } friend class ipc_worker; friend class ipc_waker; friend class ipc_stopper; public: ipc_server(tbb_client& client); virtual ~ipc_server(); version_type version() const override { return 0; } void request_close_connection(bool /*exiting*/) override { my_waker->start_shutdown(false); my_stopper->start_shutdown(false); for( size_t i=0; i=2 && !__MINGW64__ // ensure that stack is properly aligned __attribute__((force_align_arg_pointer)) #endif __RML_DECL_THREAD_ROUTINE ipc_worker::thread_routine(void* arg) { ipc_worker* self = static_cast(arg); AVOID_64K_ALIASING( self->my_index ); self->run(); return 0; } #if _MSC_VER && !defined(__INTEL_COMPILER) #pragma warning(pop) #endif void ipc_worker::release_handle(thread_handle handle, bool join) { if( join ) ipc_thread_monitor::join( handle ); else ipc_thread_monitor::detach_thread( handle ); } void ipc_worker::start_shutdown(bool join) { state_t s = my_state.load(std::memory_order_relaxed); do { __TBB_ASSERT( s!=st_quit, nullptr ); } while( !my_state.compare_exchange_strong( s, st_quit ) ); if( s==st_normal || s==st_starting ) { // May have invalidated invariant for sleeping, so wake up the thread. // Note that the notify() here occurs without maintaining invariants for my_slack. // It does not matter, because my_state==st_quit overrides checking of my_slack. 
my_thread_monitor.notify(); // Do not need release handle in st_init state, // because in this case the thread wasn't started yet. // For st_starting release is done at launch site. if( s==st_normal ) release_handle( my_handle, join ); } } void ipc_worker::start_stopping(bool join) { state_t s = my_state.load(std::memory_order_relaxed); while( !my_state.compare_exchange_strong( s, st_quit ) ) {}; if( s==st_normal || s==st_starting ) { // May have invalidated invariant for sleeping, so wake up the thread. // Note that the notify() here occurs without maintaining invariants for my_slack. // It does not matter, because my_state==st_quit overrides checking of my_slack. my_thread_monitor.notify(); // Do not need release handle in st_init state, // because in this case the thread wasn't started yet. // For st_starting release is done at launch site. if( s==st_normal ) release_handle( my_handle, join ); } } void ipc_worker::run() { my_server.propagate_chain_reaction(); // Transiting to st_normal here would require setting my_handle, // which would create race with the launching thread and // complications in handle management on Windows. 
::rml::job& j = *my_client.create_one_job(); state_t state = my_state.load(std::memory_order_acquire); while( state!=st_quit && state!=st_stop ) { if( my_server.my_slack>=0 ) { my_client.process(j); } else { // Check/set the invariant for sleeping state = my_state.load(std::memory_order_seq_cst); if( state!=st_quit && state!=st_stop && my_server.try_insert_in_asleep_list(*this) ) { if( my_server.my_n_thread > 1 ) my_server.release_active_thread(); my_thread_monitor.wait(); my_server.propagate_chain_reaction(); } } // memory_order_seq_cst to be strictly ordered after thread_monitor::wait state = my_state.load(std::memory_order_seq_cst); } my_client.cleanup(j); my_server.remove_server_ref(); } inline bool ipc_worker::wake_or_launch() { state_t excepted_stop = st_stop, expected_init = st_init; if( ( my_state.load(std::memory_order_acquire)==st_init && my_state.compare_exchange_strong( expected_init, st_starting ) ) || ( my_state.load(std::memory_order_acquire)==st_stop && my_state.compare_exchange_strong( excepted_stop, st_starting ) ) ) { // after this point, remove_server_ref() must be done by created thread #if USE_WINTHREAD my_handle = ipc_thread_monitor::launch( thread_routine, this, my_server.my_stack_size, &this->my_index ); #elif USE_PTHREAD { affinity_helper fpa; fpa.protect_affinity_mask( /*restore_process_mask=*/true ); my_handle = ipc_thread_monitor::launch( thread_routine, this, my_server.my_stack_size ); if( my_handle == 0 ) { // Unable to create new thread for process // However, this is expected situation for the use cases of this coordination server state_t s = st_starting; my_state.compare_exchange_strong( s, st_init ); if (st_starting != s) { // Do shutdown during startup. my_handle can't be released // by start_shutdown, because my_handle value might be not set yet // at time of transition from st_starting to st_quit. 
__TBB_ASSERT( s==st_quit, nullptr ); release_handle( my_handle, my_server.my_join_workers ); } return false; } else { my_server.my_ref_count++; } // Implicit destruction of fpa resets original affinity mask. } #endif /* USE_PTHREAD */ state_t s = st_starting; my_state.compare_exchange_strong( s, st_normal ); if( st_starting!=s ) { // Do shutdown during startup. my_handle can't be released // by start_shutdown, because my_handle value might be not set yet // at time of transition from st_starting to st_quit. __TBB_ASSERT( s==st_quit, nullptr ); release_handle( my_handle, my_server.my_join_workers ); } } else { my_thread_monitor.notify(); } return true; } //------------------------------------------------------------------------ // Methods of ipc_waker //------------------------------------------------------------------------ #if _MSC_VER && !defined(__INTEL_COMPILER) // Suppress overzealous compiler warnings about an initialized variable 'sink_for_alloca' not referenced #pragma warning(push) #pragma warning(disable:4189) #endif #if __MINGW32__ && __GNUC__==4 &&__GNUC_MINOR__>=2 && !__MINGW64__ // ensure that stack is properly aligned __attribute__((force_align_arg_pointer)) #endif __RML_DECL_THREAD_ROUTINE ipc_waker::thread_routine(void* arg) { ipc_waker* self = static_cast(arg); AVOID_64K_ALIASING( self->my_index ); self->run(); return 0; } #if _MSC_VER && !defined(__INTEL_COMPILER) #pragma warning(pop) #endif void ipc_waker::run() { // Transiting to st_normal here would require setting my_handle, // which would create race with the launching thread and // complications in handle management on Windows. 
// memory_order_seq_cst to be strictly ordered after thread_monitor::wait on the next iteration while( my_state.load(std::memory_order_seq_cst)!=st_quit ) { bool have_to_sleep = false; if( my_server.my_slack.load(std::memory_order_acquire)>0 ) { if( my_server.wait_active_thread() ) { if( my_server.my_slack.load(std::memory_order_acquire)>0 ) { my_server.wake_some( 0, 1 ); } else { my_server.release_active_thread(); have_to_sleep = true; } } } else { have_to_sleep = true; } if( have_to_sleep ) { // Check/set the invariant for sleeping if( my_server.my_slack.load(std::memory_order_acquire)<0 ) { my_thread_monitor.wait(); } } } my_server.remove_server_ref(); } inline bool ipc_waker::wake_or_launch() { state_t excepted = st_init; if( ( my_state.load(std::memory_order_acquire)==st_init && my_state.compare_exchange_strong( excepted, st_starting ) ) ) { // after this point, remove_server_ref() must be done by created thread #if USE_WINTHREAD my_handle = ipc_thread_monitor::launch( thread_routine, this, my_server.my_stack_size, &this->my_index ); #elif USE_PTHREAD { affinity_helper fpa; fpa.protect_affinity_mask( /*restore_process_mask=*/true ); my_handle = ipc_thread_monitor::launch( thread_routine, this, my_server.my_stack_size ); if( my_handle == 0 ) { runtime_warning( "Unable to create new thread for process %d", getpid() ); state_t s = st_starting; my_state.compare_exchange_strong(s, st_init); if (st_starting != s) { // Do shutdown during startup. my_handle can't be released // by start_shutdown, because my_handle value might be not set yet // at time of transition from st_starting to st_quit. __TBB_ASSERT( s==st_quit, nullptr ); release_handle( my_handle, my_server.my_join_workers ); } return false; } else { my_server.my_ref_count++; } // Implicit destruction of fpa resets original affinity mask. } #endif /* USE_PTHREAD */ state_t s = st_starting; my_state.compare_exchange_strong(s, st_normal); if( st_starting!=s ) { // Do shutdown during startup. 
my_handle can't be released // by start_shutdown, because my_handle value might be not set yet // at time of transition from st_starting to st_quit. __TBB_ASSERT( s==st_quit, nullptr ); release_handle( my_handle, my_server.my_join_workers ); } } else { my_thread_monitor.notify(); } return true; } //------------------------------------------------------------------------ // Methods of ipc_stopper //------------------------------------------------------------------------ #if _MSC_VER && !defined(__INTEL_COMPILER) // Suppress overzealous compiler warnings about an initialized variable 'sink_for_alloca' not referenced #pragma warning(push) #pragma warning(disable:4189) #endif #if __MINGW32__ && __GNUC__==4 &&__GNUC_MINOR__>=2 && !__MINGW64__ // ensure that stack is properly aligned __attribute__((force_align_arg_pointer)) #endif __RML_DECL_THREAD_ROUTINE ipc_stopper::thread_routine(void* arg) { ipc_stopper* self = static_cast(arg); AVOID_64K_ALIASING( self->my_index ); self->run(); return 0; } #if _MSC_VER && !defined(__INTEL_COMPILER) #pragma warning(pop) #endif void ipc_stopper::run() { // Transiting to st_normal here would require setting my_handle, // which would create race with the launching thread and // complications in handle management on Windows. 
while( my_state.load(std::memory_order_acquire)!=st_quit ) { if( my_server.wait_stop_thread() ) { if( my_state.load(std::memory_order_acquire)!=st_quit ) { if( !my_server.stop_one() ) { my_server.add_stop_thread(); tbb::detail::r1::prolonged_pause(); } } } } my_server.remove_server_ref(); } inline bool ipc_stopper::wake_or_launch() { state_t excepted = st_init; if( ( my_state.load(std::memory_order_acquire)==st_init && my_state.compare_exchange_strong( excepted, st_starting ) ) ) { // after this point, remove_server_ref() must be done by created thread #if USE_WINTHREAD my_handle = ipc_thread_monitor::launch( thread_routine, this, my_server.my_stack_size, &this->my_index ); #elif USE_PTHREAD { affinity_helper fpa; fpa.protect_affinity_mask( /*restore_process_mask=*/true ); my_handle = ipc_thread_monitor::launch( thread_routine, this, my_server.my_stack_size ); if( my_handle == 0 ) { runtime_warning( "Unable to create new thread for process %d", getpid() ); state_t s = st_starting; my_state.compare_exchange_strong(s, st_init); if (st_starting != s) { // Do shutdown during startup. my_handle can't be released // by start_shutdown, because my_handle value might be not set yet // at time of transition from st_starting to st_quit. __TBB_ASSERT( s==st_quit, nullptr ); release_handle( my_handle, my_server.my_join_workers ); } return false; } else { my_server.my_ref_count++; } // Implicit destruction of fpa resets original affinity mask. } #endif /* USE_PTHREAD */ state_t s = st_starting; my_state.compare_exchange_strong(s, st_normal); if( st_starting!=s ) { // Do shutdown during startup. my_handle can't be released // by start_shutdown, because my_handle value might be not set yet // at time of transition from st_starting to st_quit. 
__TBB_ASSERT( s==st_quit, nullptr ); release_handle( my_handle, my_server.my_join_workers ); } } else { my_thread_monitor.notify(); } return true; } //------------------------------------------------------------------------ // Methods of ipc_server //------------------------------------------------------------------------ ipc_server::ipc_server(tbb_client& client) : my_client( client ), my_stack_size( client.min_stack_size() ), my_thread_array(nullptr), my_join_workers(false), my_waker(nullptr), my_stopper(nullptr) { my_ref_count = 1; my_slack = 0; #if TBB_USE_ASSERT my_net_slack_requests = 0; #endif /* TBB_USE_ASSERT */ my_n_thread = tbb::internal::rml::get_num_threads(IPC_MAX_THREADS_VAR_NAME); if( my_n_thread==0 ) { my_n_thread = tbb::detail::r1::AvailableHwConcurrency(); __TBB_ASSERT( my_n_thread>0, nullptr ); } my_asleep_list_root = nullptr; my_thread_array = tbb::cache_aligned_allocator().allocate( my_n_thread ); for( size_t i=0; imy_next = my_asleep_list_root; my_asleep_list_root = t; } my_waker = tbb::cache_aligned_allocator().allocate(1); new( my_waker ) ipc_waker( *this, client, my_n_thread ); my_stopper = tbb::cache_aligned_allocator().allocate(1); new( my_stopper ) ipc_stopper( *this, client, my_n_thread + 1 ); char* active_sem_name = get_active_sem_name(); my_active_sem = sem_open( active_sem_name, O_CREAT, IPC_SEM_MODE, my_n_thread - 1 ); __TBB_ASSERT( my_active_sem, "Unable to open active threads semaphore" ); delete[] active_sem_name; char* stop_sem_name = get_stop_sem_name(); my_stop_sem = sem_open( stop_sem_name, O_CREAT, IPC_SEM_MODE, 0 ); __TBB_ASSERT( my_stop_sem, "Unable to open stop threads semaphore" ); delete[] stop_sem_name; } ipc_server::~ipc_server() { __TBB_ASSERT( my_net_slack_requests.load(std::memory_order_relaxed)==0, nullptr ); for( size_t i=my_n_thread; i--; ) my_thread_array[i].~padded_ipc_worker(); tbb::cache_aligned_allocator().deallocate( my_thread_array, my_n_thread ); tbb::detail::d0::poison_pointer( my_thread_array ); 
my_waker->~ipc_waker(); tbb::cache_aligned_allocator().deallocate( my_waker, 1 ); tbb::detail::d0::poison_pointer( my_waker ); my_stopper->~ipc_stopper(); tbb::cache_aligned_allocator().deallocate( my_stopper, 1 ); tbb::detail::d0::poison_pointer( my_stopper ); sem_close( my_active_sem ); sem_close( my_stop_sem ); } inline bool ipc_server::try_insert_in_asleep_list(ipc_worker& t) { asleep_list_mutex_type::scoped_lock lock; if( !lock.try_acquire( my_asleep_list_mutex ) ) return false; // Contribute to slack under lock so that if another takes that unit of slack, // it sees us sleeping on the list and wakes us up. int k = ++my_slack; if( k<=0 ) { t.my_next = my_asleep_list_root.load(std::memory_order_relaxed); my_asleep_list_root.store(&t, std::memory_order_relaxed); return true; } else { --my_slack; return false; } } inline bool ipc_server::try_insert_in_asleep_list_forced(ipc_worker& t) { asleep_list_mutex_type::scoped_lock lock; if( !lock.try_acquire( my_asleep_list_mutex ) ) return false; // Contribute to slack under lock so that if another takes that unit of slack, // it sees us sleeping on the list and wakes us up. 
++my_slack; t.my_next = my_asleep_list_root.load(std::memory_order_relaxed); my_asleep_list_root.store(&t, std::memory_order_relaxed); return true; } inline bool ipc_server::wait_active_thread() { if( sem_wait( my_active_sem ) == 0 ) { ++my_global_thread_count; return true; } return false; } inline bool ipc_server::try_get_active_thread() { if( sem_trywait( my_active_sem ) == 0 ) { ++my_global_thread_count; return true; } return false; } inline void ipc_server::release_active_thread() { release_thread_sem( my_active_sem ); } inline bool ipc_server::wait_stop_thread() { struct timespec ts; if( clock_gettime( CLOCK_REALTIME, &ts )==0 ) { ts.tv_sec++; if( sem_timedwait( my_stop_sem, &ts )==0 ) { return true; } } return false; } inline void ipc_server::add_stop_thread() { sem_post( my_stop_sem ); } void ipc_server::wake_some( int additional_slack, int active_threads ) { __TBB_ASSERT( additional_slack>=0, nullptr ); ipc_worker* wakee[2]; ipc_worker **w = wakee; { asleep_list_mutex_type::scoped_lock lock(my_asleep_list_mutex); while( active_threads>0 && my_asleep_list_root.load(std::memory_order_relaxed) && w0 ) { if( additional_slack+my_slack.load(std::memory_order_acquire)<=0 ) // additional demand does not exceed surplus supply break; --additional_slack; } else { // Chain reaction; Try to claim unit of slack int old; do { old = my_slack.load(std::memory_order_relaxed); if( old<=0 ) goto done; } while( !my_slack.compare_exchange_strong( old, old-1 ) ); } // Pop sleeping worker to combine with claimed unit of slack my_asleep_list_root.store( (*w++ = my_asleep_list_root.load(std::memory_order_relaxed))->my_next, std::memory_order_relaxed ); --active_threads; } if( additional_slack ) { // Contribute our unused slack to my_slack. 
my_slack += additional_slack; } } done: while( w>wakee ) { if( !(*--w)->wake_or_launch() ) { add_stop_thread(); do { } while( !try_insert_in_asleep_list_forced(**w) ); release_active_thread(); } } while( active_threads ) { release_active_thread(); --active_threads; } } void ipc_server::wake_one_forced( int additional_slack ) { __TBB_ASSERT( additional_slack>=0, nullptr ); ipc_worker* wakee[1]; ipc_worker **w = wakee; { asleep_list_mutex_type::scoped_lock lock(my_asleep_list_mutex); while( my_asleep_list_root.load(std::memory_order_relaxed) && w0 ) { if( additional_slack+my_slack.load(std::memory_order_acquire)<=0 ) // additional demand does not exceed surplus supply break; --additional_slack; } else { // Chain reaction; Try to claim unit of slack int old; do { old = my_slack.load(std::memory_order_relaxed); if( old<=0 ) goto done; } while( !my_slack.compare_exchange_strong( old, old-1 ) ); } // Pop sleeping worker to combine with claimed unit of slack my_asleep_list_root.store( (*w++ = my_asleep_list_root.load(std::memory_order_relaxed))->my_next, std::memory_order_relaxed); } if( additional_slack ) { // Contribute our unused slack to my_slack. 
my_slack += additional_slack; } } done: while( w>wakee ) { if( !(*--w)->wake_or_launch() ) { add_stop_thread(); do { } while( !try_insert_in_asleep_list_forced(**w) ); } } } bool ipc_server::stop_one() { ipc_worker* current = nullptr; ipc_worker* next = nullptr; { asleep_list_mutex_type::scoped_lock lock(my_asleep_list_mutex); if( my_asleep_list_root.load(std::memory_order_relaxed) ) { current = my_asleep_list_root.load(std::memory_order_relaxed); if( current->my_state.load(std::memory_order_relaxed)==ipc_worker::st_normal ) { next = current->my_next; while( next!= nullptr && next->my_state.load(std::memory_order_relaxed)==ipc_worker::st_normal ) { current = next; next = current->my_next; } current->start_stopping( my_join_workers ); return true; } } } return false; } void ipc_server::adjust_job_count_estimate( int delta ) { #if TBB_USE_ASSERT my_net_slack_requests+=delta; #endif /* TBB_USE_ASSERT */ if( my_n_thread > 1 ) { if( delta<0 ) { my_slack+=delta; } else if( delta>0 ) { int active_threads = 0; if( try_get_active_thread() ) { ++active_threads; if( try_get_active_thread() ) { ++active_threads; } } wake_some( delta, active_threads ); if( !my_waker->wake_or_launch() ) { add_stop_thread(); } if( !my_stopper->wake_or_launch() ) { add_stop_thread(); } } } else { // Corner case when RML shouldn't provide any worker thread but client has to have at least one if( delta<0 ) { my_slack += delta; } else { wake_one_forced( delta ); } } } //------------------------------------------------------------------------ // RML factory methods //------------------------------------------------------------------------ #if USE_PTHREAD static tbb_client* my_global_client = nullptr; static tbb_server* my_global_server = nullptr; void rml_atexit() { release_resources(); } void rml_atfork_child() { if( my_global_server!=nullptr && my_global_client!=nullptr ) { ipc_server* server = static_cast( my_global_server ); server->~ipc_server(); // memset( server, 0, sizeof(ipc_server) ); new( 
server ) ipc_server( *my_global_client ); pthread_atfork( nullptr, nullptr, rml_atfork_child ); atexit( rml_atexit ); } } #endif /* USE_PTHREAD */ extern "C" tbb_factory::status_type __TBB_make_rml_server(tbb_factory& /*f*/, tbb_server*& server, tbb_client& client) { server = new( tbb::cache_aligned_allocator().allocate(1) ) ipc_server(client); #if USE_PTHREAD my_global_client = &client; my_global_server = server; pthread_atfork( nullptr, nullptr, rml_atfork_child ); atexit( rml_atexit ); #endif /* USE_PTHREAD */ if( getenv( "RML_DEBUG" ) ) { runtime_warning("IPC server is started"); } return tbb_factory::st_success; } extern "C" void __TBB_call_with_my_server_info(::rml::server_info_callback_t /*cb*/, void* /*arg*/) { } } // namespace rml } // namespace detail } // namespace tbb ================================================ FILE: third-party/tbb/python/rml/ipc_utils.cpp ================================================ /* Copyright (c) 2017-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "ipc_utils.h" #include #include #include #include #include namespace tbb { namespace internal { namespace rml { #define MAX_STR_LEN 255 #define STARTTIME_ITEM_ID 21 static char* get_stat_item(char* line, int item_id) { int id = 0, i = 0; while( id!=item_id ) { while( line[i]!='(' && line[i]!=' ' && line[i]!='\0' ) { ++i; } if( line[i]==' ' ) { ++id; ++i; } else if( line[i]=='(' ) { while( line[i]!=')' && line[i]!='\0' ) { ++i; } if( line[i]==')' ) { ++i; } else { return nullptr; } } else { return nullptr; } } return line + i; } unsigned long long get_start_time(int pid) { const char* stat_file_path_template = "/proc/%d/stat"; char stat_file_path[MAX_STR_LEN + 1]; sprintf( stat_file_path, stat_file_path_template, pid ); FILE* stat_file = fopen( stat_file_path, "rt" ); if( stat_file==nullptr ) { return 0; } char stat_line[MAX_STR_LEN + 1]; char* line = fgets( stat_line, MAX_STR_LEN, stat_file ); if( line==nullptr ) { return 0; } char* starttime_str = get_stat_item( stat_line, STARTTIME_ITEM_ID ); if( starttime_str==nullptr ) { return 0; } unsigned long long starttime = strtoull( starttime_str, nullptr, 10 ); if( starttime==ULLONG_MAX ) { return 0; } return starttime; } char* get_shared_name(const char* prefix, int pid, unsigned long long time) { const char* name_template = "%s_%d_%llu"; const int digits_in_int = 10; const int digits_in_long = 20; int len = strlen( name_template ) + strlen( prefix ) + digits_in_int + digits_in_long + 1; char* name = new char[len]; sprintf( name, name_template, prefix, pid, time ); return name; } char* get_shared_name(const char* prefix) { int pid = getpgrp(); unsigned long long time = get_start_time( pid ); return get_shared_name( prefix, pid, time ); } int get_num_threads(const char* env_var) { if( env_var==nullptr ) { return 0; } char* value = getenv( env_var ); if( value==nullptr ) { return 0; } int num_threads = (int)strtol( value, nullptr, 10 ); return num_threads; } bool get_enable_flag(const char* env_var) { if( 
env_var==nullptr ) { return false; } char* value = getenv( env_var ); if( value==nullptr ) { return false; } if( strcmp( value, "0" ) == 0 || strcmp( value, "false" ) == 0 || strcmp( value, "False" ) == 0 || strcmp( value, "FALSE" ) == 0 ) { return false; } return true; } }}} // namespace tbb::internal::rml ================================================ FILE: third-party/tbb/python/rml/ipc_utils.h ================================================ /* Copyright (c) 2017-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __IPC_UTILS_H #define __IPC_UTILS_H namespace tbb { namespace internal { namespace rml { char* get_shared_name(const char* prefix); int get_num_threads(const char* env_var); bool get_enable_flag(const char* env_var); }}} // namespace tbb::internal::rml #endif ================================================ FILE: third-party/tbb/python/setup.py ================================================ # Copyright (c) 2016-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.


# System imports
import platform
import os

from distutils.core import *
from distutils.command.build import build

# Remember where the build was started (used below as an extra library
# search path), then run the rest of the script from this file's directory.
rundir = os.getcwd()
os.chdir(os.path.abspath(os.path.dirname(__file__)))

# If only one of CC/CXX is given, mirror it into the other so that both
# compile and link steps use the same toolchain.
if any(i in os.environ for i in ["CC", "CXX"]):
    if "CC" not in os.environ:
        os.environ['CC'] = os.environ['CXX']
    if "CXX" not in os.environ:
        os.environ['CXX'] = os.environ['CC']
    if platform.system() == 'Linux':
        os.environ['LDSHARED'] = os.environ['CXX'] + " -shared"
    print("Environment specifies CC=%s CXX=%s"%(os.environ['CC'], os.environ['CXX']))

intel_compiler = os.getenv('CC', '') in ['icl', 'icpc', 'icc']
try:
    tbb_root = os.environ['TBBROOT']
    print("Using TBBROOT=", tbb_root)
except KeyError:
    # Only a missing TBBROOT is expected here; anything else should surface.
    tbb_root = '..'
    if not intel_compiler:
        print("Warning: TBBROOT env var is not set and Intel's compiler is not used. It might lead\n"
              " !!!: to compile/link problems. Source tbbvars.sh/.csh file to set environment")
# With an Intel compiler and no explicit TBBROOT, rely on the oneTBB that
# ships with the compiler instead of explicit include/library paths.
use_compiler_tbb = intel_compiler and tbb_root == '..'
if use_compiler_tbb:
    print("Using oneTBB from Intel(R) C++ Compiler")
# Select compiler/linker flags and the TBB library name per platform.
if platform.system() == 'Windows':
    if intel_compiler:
        os.environ['DISTUTILS_USE_SDK'] = '1'  # Enable environment settings in distutils
        os.environ['MSSdk'] = '1'
        print("Using compiler settings from environment")
    tbb_flag = ['/Qtbb'] if use_compiler_tbb else []
    compile_flags = ['/Qstd=c++11'] if intel_compiler else []
    tbb_lib_name = 'tbb12'
else:
    tbb_flag = ['-tbb'] if use_compiler_tbb else []
    compile_flags = ['-std=c++11', '-Wno-unused-variable']
    tbb_lib_name = 'tbb'

# SWIG-generated extension module tbb._api built from tbb/api.i.
# Explicit include/library paths are only passed when the compiler-bundled
# oneTBB is not used; on Linux the extension is also linked against irml.
_tbb = Extension("tbb._api", ["tbb/api.i"],
        include_dirs=[os.path.join(tbb_root, 'include')] if not use_compiler_tbb else [],
        swig_opts =['-c++', '-O', '-threads'] + (  # add '-builtin' later
              ['-I' + os.path.join(tbb_root, 'include')] if not use_compiler_tbb else []),
        extra_compile_args=compile_flags + tbb_flag,
        extra_link_args=tbb_flag,
        libraries =([tbb_lib_name] if not use_compiler_tbb else []) +
                   (['irml'] if platform.system() == "Linux" else []),
        library_dirs=[ rundir,                                              # for custom-builds
                       os.path.join(tbb_root, 'lib', 'intel64', 'gcc4.8'),  # for Linux
                       os.path.join(tbb_root, 'lib'),                       # for MacOS
                       os.path.join(tbb_root, 'lib', 'intel64', 'vc_mt'),   # for Windows
                     ] if not use_compiler_tbb else [],
        language ='c++',
        )


class TBBBuild(build):
    # Build the extension first, then the pure-Python modules.
    sub_commands = [  # define build order
        ('build_ext', build.has_ext_modules),
        ('build_py', build.has_pure_modules),
    ]


setup(  name ="TBB",
        description ="Python API for oneTBB",
        long_description="Python API to Intel(R) oneAPI Threading Building Blocks library (oneTBB) "
                         "extended with standard Pool implementation and monkey-patching",
        url ="https://www.intel.com/content/www/us/en/developer/tools/oneapi/onetbb.html",
        author ="Intel Corporation",
        author_email="inteltbbdevelopers@intel.com",
        license ="Dual license: Apache or Proprietary",
        version ="0.2",
        classifiers =[
            'Development Status :: 4 - Beta',
            'Environment :: Console',
            'Environment :: Plugins',
            'Intended Audience :: Developers',
            'Intended Audience :: System Administrators',
            'Intended Audience :: Other Audience',
            'Intended Audience :: Science/Research',
            'License :: OSI Approved :: Apache Software License',
            'Operating System :: MacOS :: MacOS X',
            'Operating System :: Microsoft :: Windows',
            'Operating System :: POSIX :: Linux',
            'Programming Language :: Python',
            'Programming Language :: Python :: 3',
            'Programming Language :: C++',
            'Topic :: System :: Hardware :: Symmetric Multi-processing',
            'Topic :: Software Development :: Libraries',
          ],
        keywords='TBB multiprocessing multithreading composable parallelism',
        ext_modules=[_tbb],
        packages=['tbb'],
        py_modules=['TBB'],
        cmdclass={'build': TBBBuild}
)

================================================
FILE: third-party/tbb/python/tbb/__init__.py
================================================
# Copyright (c) 2016-2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import multiprocessing.pool import ctypes import atexit import sys import os import platform if "Windows" in platform.system(): import site path_to_env = site.getsitepackages()[0] path_to_libs = os.path.join(path_to_env, "Library", "bin") if sys.version_info.minor >= 8: os.add_dll_directory(path_to_libs) os.environ['PATH'] += os.pathsep + path_to_libs from .api import * from .api import __all__ as api__all from .pool import * from .pool import __all__ as pool__all __all__ = ["Monkey", "is_active"] + api__all + pool__all __doc__ = """ Python API for Intel(R) oneAPI Threading Building Blocks (oneTBB) extended with standard Python's pools implementation and monkey-patching. Command-line interface example: $ python3 -m tbb $your_script.py Runs your_script.py in context of tbb.Monkey """ is_active = False """ Indicates whether oneTBB context is activated """ ipc_enabled = False """ Indicates whether IPC mode is enabled """ libirml = "libirml.so.1" def _test(arg=None): """Some tests""" import platform if platform.system() == "Linux": ctypes.CDLL(libirml) assert 256 == os.system("ldd "+_api.__file__+"| grep -E 'libimf|libsvml|libintlc'") # nosec from .test import test test(arg) print("done") def tbb_process_pool_worker27(inqueue, outqueue, initializer=None, initargs=(), maxtasks=None): from multiprocessing.pool import worker worker(inqueue, outqueue, initializer, initargs, maxtasks) if ipc_enabled: try: librml = ctypes.CDLL(libirml) librml.release_resources() except: print("Warning: Can not load ", libirml, file=sys.stderr) class TBBProcessPool27(multiprocessing.pool.Pool): def _repopulate_pool(self): """Bring the number of pool processes up to the specified number, for use after reaping workers which have exited. 
""" from multiprocessing.util import debug for i in range(self._processes - len(self._pool)): w = self.Process(target=tbb_process_pool_worker27, args=(self._inqueue, self._outqueue, self._initializer, self._initargs, self._maxtasksperchild) ) self._pool.append(w) w.name = w.name.replace('Process', 'PoolWorker') w.daemon = True w.start() debug('added worker') def __del__(self): self.close() for p in self._pool: p.join() def __exit__(self, *args): self.close() for p in self._pool: p.join() def tbb_process_pool_worker3(inqueue, outqueue, initializer=None, initargs=(), maxtasks=None, wrap_exception=False): from multiprocessing.pool import worker worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception) if ipc_enabled: try: librml = ctypes.CDLL(libirml) librml.release_resources() except: print("Warning: Can not load ", libirml, file=sys.stderr) class TBBProcessPool3(multiprocessing.pool.Pool): def _repopulate_pool(self): """Bring the number of pool processes up to the specified number, for use after reaping workers which have exited. """ from multiprocessing.util import debug for i in range(self._processes - len(self._pool)): w = self.Process(target=tbb_process_pool_worker3, args=(self._inqueue, self._outqueue, self._initializer, self._initargs, self._maxtasksperchild, self._wrap_exception) ) self._pool.append(w) w.name = w.name.replace('Process', 'PoolWorker') w.daemon = True w.start() debug('added worker') def __del__(self): self.close() for p in self._pool: p.join() def __exit__(self, *args): self.close() for p in self._pool: p.join() class Monkey: """ Context manager which replaces standard multiprocessing.pool implementations with tbb.pool using monkey-patching. It also enables oneTBB threading for Intel(R) oneAPI Math Kernel Library (oneMKL). 
For example: with tbb.Monkey(): run_my_numpy_code() It allows multiple parallel tasks to be executed on the same thread pool and coordinate number of threads across multiple processes thus avoiding overheads from oversubscription. """ _items = {} _modules = {} def __init__(self, max_num_threads=None, benchmark=False): """ Create context manager for running under TBB scheduler. :param max_num_threads: if specified, limits maximal number of threads :param benchmark: if specified, blocks in initialization until requested number of threads are ready """ if max_num_threads: self.ctl = global_control(global_control.max_allowed_parallelism, int(max_num_threads)) if benchmark: if not max_num_threads: max_num_threads = default_num_threads() from .api import _concurrency_barrier _concurrency_barrier(int(max_num_threads)) def _patch(self, class_name, module_name, obj): m = self._modules[class_name] = __import__(module_name, globals(), locals(), [class_name]) if m == None: return oldattr = getattr(m, class_name, None) if oldattr == None: self._modules[class_name] = None return self._items[class_name] = oldattr setattr(m, class_name, obj) def __enter__(self): global is_active assert is_active == False, "tbb.Monkey does not support nesting yet" is_active = True self.env_mkl = os.getenv('MKL_THREADING_LAYER') os.environ['MKL_THREADING_LAYER'] = 'TBB' self.env_numba = os.getenv('NUMBA_THREADING_LAYER') os.environ['NUMBA_THREADING_LAYER'] = 'TBB' if ipc_enabled: if sys.version_info.major == 2 and sys.version_info.minor >= 7: self._patch("Pool", "multiprocessing.pool", TBBProcessPool27) elif sys.version_info.major == 3 and sys.version_info.minor >= 5: self._patch("Pool", "multiprocessing.pool", TBBProcessPool3) self._patch("ThreadPool", "multiprocessing.pool", Pool) return self def __exit__(self, exc_type, exc_value, traceback): global is_active assert is_active == True, "modified?" 
is_active = False if self.env_mkl is None: del os.environ['MKL_THREADING_LAYER'] else: os.environ['MKL_THREADING_LAYER'] = self.env_mkl if self.env_numba is None: del os.environ['NUMBA_THREADING_LAYER'] else: os.environ['NUMBA_THREADING_LAYER'] = self.env_numba for name in self._items.keys(): setattr(self._modules[name], name, self._items[name]) def init_sem_name(): try: librml = ctypes.CDLL(libirml) librml.set_active_sem_name() librml.set_stop_sem_name() except Exception as e: print("Warning: Can not initialize name of shared semaphores:", e, file=sys.stderr) def tbb_atexit(): if ipc_enabled: try: librml = ctypes.CDLL(libirml) librml.release_semaphores() except: print("Warning: Can not release shared semaphores", file=sys.stderr) def _main(): # Run the module specified as the next command line argument # python3 -m TBB user_app.py global ipc_enabled import platform import argparse parser = argparse.ArgumentParser(prog="python3 -m tbb", description=""" Run your Python script in context of tbb.Monkey, which replaces standard Python pools and threading layer of Intel(R) oneAPI Math Kernel Library (oneMKL) by implementation based on Intel(R) oneAPI Threading Building Blocks (oneTBB). It enables multiple parallel tasks to be executed on the same thread pool and coordinate number of threads across multiple processes thus avoiding overheads from oversubscription. 
""", formatter_class=argparse.ArgumentDefaultsHelpFormatter) if platform.system() == "Linux": parser.add_argument('--ipc', action='store_true', help="Enable inter-process (IPC) coordination between oneTBB schedulers") parser.add_argument('-a', '--allocator', action='store_true', help="Enable oneTBB scalable allocator as a replacement for standard memory allocator") parser.add_argument('--allocator-huge-pages', action='store_true', help="Enable huge pages for oneTBB allocator (implies: -a)") parser.add_argument('-p', '--max-num-threads', default=default_num_threads(), type=int, help="Initialize oneTBB with P max number of threads per process", metavar='P') parser.add_argument('-b', '--benchmark', action='store_true', help="Block oneTBB initialization until all the threads are created before continue the script. " "This is necessary for performance benchmarks that want to exclude lazy scheduler initialization effects from the measurements") parser.add_argument('-v', '--verbose', action='store_true', help="Request verbose and version information") parser.add_argument('-m', action='store_true', dest='module', help="Executes following as a module") parser.add_argument('name', help="Script or module name") parser.add_argument('args', nargs=argparse.REMAINDER, help="Command line arguments") args = parser.parse_args() if args.verbose: os.environ["TBB_VERSION"] = "1" if platform.system() == "Linux": if args.allocator_huge_pages: args.allocator = True if args.allocator and not os.environ.get("_TBB_MALLOC_PRELOAD"): libtbbmalloc_lib = 'libtbbmalloc_proxy.so.2' ld_preload = 'LD_PRELOAD' os.environ["_TBB_MALLOC_PRELOAD"] = "1" preload_list = filter(None, os.environ.get(ld_preload, "").split(':')) if libtbbmalloc_lib in preload_list: print('Info:', ld_preload, "contains", libtbbmalloc_lib, "already\n") else: os.environ[ld_preload] = ':'.join([libtbbmalloc_lib] + list(preload_list)) if args.allocator_huge_pages: assert platform.system() == "Linux" try: with 
open('/proc/sys/vm/nr_hugepages', 'r') as f: pages = int(f.read()) if pages == 0: print("oneTBB: Pre-allocated huge pages are not currently reserved in the system. To reserve, run e.g.:\n" "\tsudo sh -c 'echo 2000 > /proc/sys/vm/nr_hugepages'") os.environ["TBB_MALLOC_USE_HUGE_PAGES"] = "1" except: print("oneTBB: Failed to read number of pages from /proc/sys/vm/nr_hugepages\n" "\tIs the Linux kernel configured with the huge pages feature?") sys.exit(1) os.execl(sys.executable, sys.executable, '-m', 'tbb', *sys.argv[1:]) assert False, "Re-execution failed" sys.argv = [args.name] + args.args ipc_enabled = platform.system() == "Linux" and args.ipc os.environ["IPC_ENABLE"] = "1" if ipc_enabled else "0" if ipc_enabled: atexit.register(tbb_atexit) init_sem_name() if not os.environ.get("KMP_BLOCKTIME"): # TODO move os.environ["KMP_BLOCKTIME"] = "0" if '_' + args.name in globals(): return globals()['_' + args.name](*args.args) else: import runpy runf = runpy.run_module if args.module else runpy.run_path with Monkey(max_num_threads=args.max_num_threads, benchmark=args.benchmark): runf(args.name, run_name='__main__') ================================================ FILE: third-party/tbb/python/tbb/__main__.py ================================================ # Copyright (c) 2016-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from . 
import _main from sys import exit exit(_main()) ================================================ FILE: third-party/tbb/python/tbb/api.i ================================================ %pythonbegin %{ # # Copyright (c) 2016-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. __all__ = ["task_arena", "task_group", "global_control", "default_num_threads", "this_task_arena_max_concurrency", "this_task_arena_current_thread_index", "runtime_version", "runtime_interface_version"] %} %begin %{ /* Defines Python wrappers for Intel(R) oneAPI Threading Building Blocks (oneTBB) */ %} %module api #if SWIG_VERSION < 0x030001 #error SWIG version 3.0.6 or newer is required for correct functioning #endif %{ #define TBB_PREVIEW_WAITING_FOR_WORKERS 1 #include "tbb/task_arena.h" #include "tbb/task_group.h" #include "tbb/global_control.h" #include "tbb/version.h" #include #include #include using namespace tbb; class PyCaller : public swig::SwigPtr_PyObject { public: // icpc 2013 does not support simple using SwigPtr_PyObject::SwigPtr_PyObject; PyCaller(const PyCaller& s) : SwigPtr_PyObject(s) {} PyCaller(PyObject *p, bool initial = true) : SwigPtr_PyObject(p, initial) {} void operator()() const { SWIG_PYTHON_THREAD_BEGIN_BLOCK; PyObject* r = PyObject_CallFunctionObjArgs((PyObject*)*this, nullptr); if(r) Py_DECREF(r); SWIG_PYTHON_THREAD_END_BLOCK; } }; struct ArenaPyCaller { task_arena *my_arena; PyObject *my_callable; ArenaPyCaller(task_arena *a, PyObject *c) : my_arena(a), 
my_callable(c) { SWIG_PYTHON_THREAD_BEGIN_BLOCK; Py_XINCREF(c); SWIG_PYTHON_THREAD_END_BLOCK; } void operator()() const { my_arena->execute(PyCaller(my_callable, false)); } }; struct barrier_data { std::condition_variable event; std::mutex m; int worker_threads, full_threads; }; void _concurrency_barrier(int threads = tbb::task_arena::automatic) { if(threads == tbb::task_arena::automatic) threads = tbb::this_task_arena::max_concurrency(); if(threads < 2) return; std::unique_ptr g( (global_control::active_value(global_control::max_allowed_parallelism) < unsigned(threads))? new global_control(global_control::max_allowed_parallelism, threads) : nullptr); tbb::task_group tg; barrier_data b; b.worker_threads = 0; b.full_threads = threads-1; for(int i = 0; i < b.full_threads; i++) tg.run([&b]{ std::unique_lock lock(b.m); if(++b.worker_threads >= b.full_threads) b.event.notify_all(); else while(b.worker_threads < b.full_threads) b.event.wait(lock); }); std::unique_lock lock(b.m); b.event.wait(lock); tg.wait(); }; %} void _concurrency_barrier(int threads = tbb::task_arena::automatic); namespace tbb { class task_arena { public: static const int automatic = -1; task_arena(int max_concurrency = automatic, unsigned reserved_for_masters = 1); task_arena(const task_arena &s); ~task_arena(); void initialize(); void initialize(int max_concurrency, unsigned reserved_for_masters = 1); void terminate(); bool is_active(); %extend { void enqueue( PyObject *c ) { $self->enqueue(PyCaller(c)); } void execute( PyObject *c ) { $self->execute(PyCaller(c)); } }; }; class task_group { public: task_group(); ~task_group(); void wait(); void cancel(); %extend { void run( PyObject *c ) { $self->run(PyCaller(c)); } void run( PyObject *c, task_arena *a ) { $self->run(ArenaPyCaller(a, c)); } }; }; class global_control { public: enum parameter { max_allowed_parallelism, thread_stack_size, parameter_max // insert new parameters above this point }; global_control(parameter param, size_t value); 
~global_control(); static size_t active_value(parameter param); }; } // tbb %inline { inline const char* runtime_version() { return TBB_runtime_version();} inline int runtime_interface_version() { return TBB_runtime_interface_version();} inline int this_task_arena_max_concurrency() { return this_task_arena::max_concurrency();} inline int this_task_arena_current_thread_index() { return this_task_arena::current_thread_index();} }; // Additional definitions for Python part of the module %pythoncode %{ default_num_threads = this_task_arena_max_concurrency %} ================================================ FILE: third-party/tbb/python/tbb/pool.py ================================================ # Copyright (c) 2016-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Based on the software developed by: # Copyright (c) 2008,2016 david decotigny (Pool of threads) # Copyright (c) 2006-2008, R Oudkerk (multiprocessing.Pool) # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. 
Neither the name of author nor the names of any contributors may be # used to endorse or promote products derived from this software # without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS "AS IS" AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # # @brief Python Pool implementation based on TBB with monkey-patching # # See http://docs.python.org/dev/library/multiprocessing.html # Differences: added imap_async and imap_unordered_async, and terminate() # has to be called explicitly (it's not registered by atexit). # # The general idea is that we submit works to a workqueue, either as # single Jobs (one function to call), or JobSequences (batch of # Jobs). Each Job is associated with an ApplyResult object which has 2 # states: waiting for the Job to complete, or Ready. Instead of # waiting for the jobs to finish, we wait for their ApplyResult object # to become ready: an event mechanism is used for that. # When we apply a function to several arguments in "parallel", we need # a way to wait for all/part of the Jobs to be processed: that's what # "collectors" are for; they group and wait for a set of ApplyResult # objects. Once a collector is ready to be used, we can use a # CollectorIterator to iterate over the result values it's collecting. 
# # The methods of a Pool object use all these concepts and expose # them to their caller in a very simple way. import sys import threading import traceback from .api import * __all__ = ["Pool", "TimeoutError"] __doc__ = """ Standard Python Pool implementation based on Python API for Intel(R) oneAPI Threading Building Blocks (oneTBB) """ class TimeoutError(Exception): """Raised when a result is not available within the given timeout""" pass class Pool(object): """ The Pool class provides standard multiprocessing.Pool interface which is mapped onto oneTBB tasks executing in its thread pool """ def __init__(self, nworkers=0, name="Pool"): """ :param nworkers (integer) number of worker threads to start :param name (string) prefix for the worker threads' name """ self._closed = False self._tasks = task_group() self._pool = [None,]*default_num_threads() # Dask asks for len(_pool) def apply(self, func, args=(), kwds=dict()): """Equivalent of the apply() builtin function. It blocks till the result is ready.""" return self.apply_async(func, args, kwds).get() def map(self, func, iterable, chunksize=None): """A parallel equivalent of the map() builtin function. It blocks till the result is ready. This method chops the iterable into a number of chunks which it submits to the process pool as separate tasks. The (approximate) size of these chunks can be specified by setting chunksize to a positive integer.""" return self.map_async(func, iterable, chunksize).get() def imap(self, func, iterable, chunksize=1): """ An equivalent of itertools.imap(). The chunksize argument is the same as the one used by the map() method. For very long iterables using a large value for chunksize can make the job complete much faster than using the default value of 1. Also if chunksize is 1 then the next() method of the iterator returned by the imap() method has an optional timeout parameter: next(timeout) will raise processing.TimeoutError if the result cannot be returned within timeout seconds. 
""" collector = OrderedResultCollector(as_iterator=True) self._create_sequences(func, iterable, chunksize, collector) return iter(collector) def imap_unordered(self, func, iterable, chunksize=1): """The same as imap() except that the ordering of the results from the returned iterator should be considered arbitrary. (Only when there is only one worker process is the order guaranteed to be "correct".)""" collector = UnorderedResultCollector() self._create_sequences(func, iterable, chunksize, collector) return iter(collector) def apply_async(self, func, args=(), kwds=dict(), callback=None): """A variant of the apply() method which returns an ApplyResult object. If callback is specified then it should be a callable which accepts a single argument. When the result becomes ready, callback is applied to it (unless the call failed). callback should complete immediately since otherwise the thread which handles the results will get blocked.""" assert not self._closed # No lock here. We assume it's atomic... apply_result = ApplyResult(callback=callback) job = Job(func, args, kwds, apply_result) self._tasks.run(job) return apply_result def map_async(self, func, iterable, chunksize=None, callback=None): """A variant of the map() method which returns a ApplyResult object. If callback is specified then it should be a callable which accepts a single argument. When the result becomes ready callback is applied to it (unless the call failed). callback should complete immediately since otherwise the thread which handles the results will get blocked.""" apply_result = ApplyResult(callback=callback) collector = OrderedResultCollector(apply_result, as_iterator=False) if not self._create_sequences(func, iterable, chunksize, collector): apply_result._set_value([]) return apply_result def imap_async(self, func, iterable, chunksize=None, callback=None): """A variant of the imap() method which returns an ApplyResult object that provides an iterator (next method(timeout) available). 
If callback is specified then it should be a callable which accepts a single argument. When the resulting iterator becomes ready, callback is applied to it (unless the call failed). callback should complete immediately since otherwise the thread which handles the results will get blocked.""" apply_result = ApplyResult(callback=callback) collector = OrderedResultCollector(apply_result, as_iterator=True) if not self._create_sequences(func, iterable, chunksize, collector): apply_result._set_value(iter([])) return apply_result def imap_unordered_async(self, func, iterable, chunksize=None, callback=None): """A variant of the imap_unordered() method which returns an ApplyResult object that provides an iterator (next method(timeout) available). If callback is specified then it should be a callable which accepts a single argument. When the resulting iterator becomes ready, callback is applied to it (unless the call failed). callback should complete immediately since otherwise the thread which handles the results will get blocked.""" apply_result = ApplyResult(callback=callback) collector = UnorderedResultCollector(apply_result) if not self._create_sequences(func, iterable, chunksize, collector): apply_result._set_value(iter([])) return apply_result def close(self): """Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit.""" # No lock here. We assume it's sufficiently atomic... self._closed = True def terminate(self): """Stops the worker processes immediately without completing outstanding work. When the pool object is garbage collected terminate() will be called immediately.""" self.close() self._tasks.cancel() def join(self): """Wait for the worker processes to exit. 
One must call close() or terminate() before using join().""" self._tasks.wait() def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): self.join() def __del__(self): self.terminate() self.join() def _create_sequences(self, func, iterable, chunksize, collector): """ Create callable objects to process and pushes them on the work queue. Each work unit is meant to process a slice of iterable of size chunksize. If collector is specified, then the ApplyResult objects associated with the jobs will notify collector when their result becomes ready. \return the list callable objects (basically: JobSequences) pushed onto the work queue """ assert not self._closed # No lock here. We assume it's atomic... it_ = iter(iterable) exit_loop = False sequences = [] while not exit_loop: seq = [] for _ in range(chunksize or 1): try: arg = next(it_) except StopIteration: exit_loop = True break apply_result = ApplyResult(collector) job = Job(func, (arg,), {}, apply_result) seq.append(job) if seq: sequences.append(JobSequence(seq)) for t in sequences: self._tasks.run(t) return sequences class Job: """A work unit that corresponds to the execution of a single function""" def __init__(self, func, args, kwds, apply_result): """ :param func/args/kwds used to call the function :param apply_result ApplyResult object that holds the result of the function call """ self._func = func self._args = args self._kwds = kwds self._result = apply_result def __call__(self): """ Call the function with the args/kwds and tell the ApplyResult that its result is ready. 
Correctly handles the exceptions happening during the execution of the function """ try: result = self._func(*self._args, **self._kwds) except: self._result._set_exception() else: self._result._set_value(result) class JobSequence: """A work unit that corresponds to the processing of a continuous sequence of Job objects""" def __init__(self, jobs): self._jobs = jobs def __call__(self): """ Call all the Job objects that have been specified """ for job in self._jobs: job() class ApplyResult(object): """An object associated with a Job object that holds its result: it's available during the whole life the Job and after, even when the Job didn't process yet. It's possible to use this object to wait for the result/exception of the job to be available. The result objects returns by the Pool::*_async() methods are of this type""" def __init__(self, collector=None, callback=None): """ :param collector when not None, the notify_ready() method of the collector will be called when the result from the Job is ready :param callback when not None, function to call when the result becomes available (this is the parameter passed to the Pool::*_async() methods. """ self._success = False self._event = threading.Event() self._data = None self._collector = None self._callback = callback if collector is not None: collector.register_result(self) self._collector = collector def get(self, timeout=None): """ Returns the result when it arrives. If timeout is not None and the result does not arrive within timeout seconds then TimeoutError is raised. If the remote call raised an exception then that exception will be reraised by get(). 
""" if not self.wait(timeout): raise TimeoutError("Result not available within %fs" % timeout) if self._success: return self._data raise self._data[0](self._data[1]).with_traceback(self._data[2]) def wait(self, timeout=None): """Waits until the result is available or until timeout seconds pass.""" self._event.wait(timeout) return self._event.isSet() def ready(self): """Returns whether the call has completed.""" return self._event.isSet() def successful(self): """Returns whether the call completed without raising an exception. Will raise AssertionError if the result is not ready.""" assert self.ready() return self._success def _set_value(self, value): """Called by a Job object to tell the result is ready, and provides the value of this result. The object will become ready and successful. The collector's notify_ready() method will be called, and the callback method too""" assert not self.ready() self._data = value self._success = True self._event.set() if self._collector is not None: self._collector.notify_ready(self) if self._callback is not None: try: self._callback(value) except: traceback.print_exc() def _set_exception(self): """Called by a Job object to tell that an exception occurred during the processing of the function. The object will become ready but not successful. The collector's notify_ready() method will be called, but NOT the callback method""" # traceback.print_exc() assert not self.ready() self._data = sys.exc_info() self._success = False self._event.set() if self._collector is not None: self._collector.notify_ready(self) class AbstractResultCollector(object): """ABC to define the interface of a ResultCollector object. It is basically an object which knows whuich results it's waiting for, and which is able to get notify when they get available. 
class CollectorIterator(object):
    """Iterator over the result values held by a collector object.

    Offers an extended next() method that accepts a timeout argument.
    Created by the AbstractResultCollector::__iter__() method."""

    def __init__(self, collector):
        """:param collector AbstractResultCollector instance"""
        self._collector = collector
        self._idx = 0

    def __iter__(self):
        return self

    def next(self, timeout=None):
        """Return the next result value in the sequence.

        Raises StopIteration at the end, TimeoutError when the next
        result is not ready within timeout seconds, and re-raises the
        exception raised by the Job if it failed. On any of these
        errors from the collector the position is reset to the start.
        """
        try:
            apply_result = self._collector._get_result(self._idx, timeout)
        except IndexError:
            # Reset for next time
            self._idx = 0
            raise StopIteration
        except BaseException:
            self._idx = 0
            raise
        if not apply_result.ready():
            # The collector handed back a result that is still pending
            # after the timeout. Report it as a timeout, the same way a
            # collector-raised TimeoutError is handled above, instead
            # of advancing the index and failing an assertion.
            self._idx = 0
            raise TimeoutError("Timeout while waiting for results")
        self._idx += 1
        return apply_result.get(0)

    def __next__(self):
        return self.next()
class OrderedResultCollector(AbstractResultCollector):
    """An AbstractResultCollector implementation that hands results
    out in the order they were registered (i.e. submitted). The
    CollectorIterator object returned by __iter__() iterates over them
    in that order."""

    def __init__(self, to_notify=None, as_iterator=True):
        """
        :param to_notify ApplyResult object to notify when the results
          we're waiting for become available. Can be None.
        :param as_iterator boolean telling whether the result value
          set on to_notify should be an iterator (available as soon as
          1 result arrived) or a list (available only after the last
          result arrived)
        """
        AbstractResultCollector.__init__(self, to_notify)
        self._results = []
        self._lock = threading.Lock()
        self._remaining = 0
        self._as_iterator = as_iterator

    def register_result(self, apply_result):
        """Used to identify which results we're waiting for. Will
        always be called BEFORE the Jobs get submitted to the work
        queue, and BEFORE the __iter__ and _get_result() methods can
        be called
        :param apply_result ApplyResult object to add in our collection
        """
        self._results.append(apply_result)
        self._remaining += 1

    def _get_result(self, idx, timeout=None):
        """Called by the CollectorIterator object to retrieve the
        results one after another, in registration order.
        :param idx The index of the result we want, wrt collector's order
        :param timeout integer telling how long to wait (in seconds)
          for the result at index idx to be available, or None (wait
          forever)
        Raises IndexError when idx is past the last registered result
        and TimeoutError when the result is not ready in time.
        """
        res = self._results[idx]
        if not res.wait(timeout):
            # Same error as UnorderedResultCollector._get_result():
            # never hand a still-pending result to the iterator.
            raise TimeoutError("Timeout while waiting for results")
        return res

    def notify_ready(self, apply_result):
        """Called by the ApplyResult object (already registered via
        register_result()) that it is now ready (ie. the Job's result
        is available or an exception has been raised).
        :param apply_result ApplyResult object telling us that the job
          has been processed
        """
        with self._lock:
            assert self._remaining > 0
            # Nothing has been counted down yet, so this is the first
            # result to complete.
            got_first = (len(self._results) == self._remaining)
            self._remaining -= 1
            got_last = (self._remaining == 0)

        if self._to_notify is None:
            return
        if self._as_iterator:
            if got_first:
                self._to_notify._set_value(iter(self))
        elif got_last:
            try:
                lst = [r.get(0) for r in self._results]
            except:
                # _set_exception() reads sys.exc_info(), so it must be
                # called from inside this handler.
                self._to_notify._set_exception()
            else:
                self._to_notify._set_value(lst)
def test(arg=None):
    """Run the Pool scenario below end to end.

    :param arg pass "-v" to print progress messages through say();
      with any other value say() is a no-op. The two oneTBB version
      lines are always printed.
    """
    if arg == "-v":
        def say(*x):
            print(*x)
    else:
        def say(*x):
            pass
    say("Start Pool testing")
    print("oneTBB version is %s" % runtime_version())
    print("oneTBB interface version is %s" % runtime_interface_version())

    get_tid = lambda: threading.current_thread().ident

    assert default_num_threads() == this_task_arena_max_concurrency()

    def return42():
        return 42

    def f(x):
        return x * x

    def work(mseconds):
        # Sleeps abs(mseconds)/100 seconds and returns str() of the
        # ORIGINAL (possibly negative) argument.
        res = str(mseconds)
        if mseconds < 0:
            mseconds = -mseconds
        say("[%d] Start to work for %fms..." % (get_tid(), mseconds*10))
        time.sleep(mseconds/100.)
        say("[%d] Work done (%fms)." % (get_tid(), mseconds*10))
        return res

    # Flag set by the thread that submits async work; while it is true,
    # timeout_work() keeps spinning.
    spin_flag = None

    def timeout_work(param):
        say("[%d] Spin wait work start..." % get_tid())
        while spin_flag:
            time.sleep(0.0001)  # yield equivalent
        say("[%d] Work done." % get_tid())
        return str(param) if param != None else None

    def prepare_timeout_exception():
        nonlocal spin_flag
        spin_flag = True  # lock threads in timeout_work

    def check_timeout_exception(pool_object, func):
        # func(pool_object) must raise TimeoutError; afterwards the
        # spinning workers are released so the real result can arrive.
        nonlocal spin_flag
        try:
            func(pool_object)
        except TimeoutError:
            say("Good. Got expected timeout exception.")
        else:
            assert False, "Expected exception !"
        spin_flag = False  # unlock threads in timeout_work

    ### Test copy/pasted from multiprocessing
    pool = Pool(4)  # start worker threads

    # edge cases
    assert pool.map(return42, []) == []
    assert pool.apply_async(return42, []).get() == 42
    assert pool.apply(return42, []) == 42
    assert list(pool.imap(return42, iter([]))) == []
    assert list(pool.imap_unordered(return42, iter([]))) == []
    assert pool.map_async(return42, []).get() == []
    assert list(pool.imap_async(return42, iter([])).get()) == []
    assert list(pool.imap_unordered_async(return42, iter([])).get()) == []

    # basic tests
    result = pool.apply_async(f, (10,))  # evaluate "f(10)" asynchronously
    assert result.get(timeout=1) == 100  # ... unless slow computer
    assert list(pool.map(f, range(10))) == list(map(f, range(10)))
    it = pool.imap(f, range(10))
    assert next(it) == 0
    assert next(it) == 1
    assert next(it) == 4

    # Test apply_async exceptions
    prepare_timeout_exception()
    result = pool.apply_async(timeout_work, (None,))
    check_timeout_exception(result, lambda result : say(result.get(timeout=1)))
    assert result.get() is None  # timeout_work(None) returns None

    def cb(s):
        say("Result ready: %s" % s)

    # Test imap()
    assert list(pool.imap(work, range(10, 3, -1), chunksize=4)) == list(map(
        str, range(10, 3, -1)))

    # Test imap_unordered()
    assert sorted(pool.imap_unordered(work, range(10, 3, -1))) == sorted(map(
        str, range(10, 3, -1)))

    # Test map_async()
    prepare_timeout_exception()
    result = pool.map_async(timeout_work, range(10), callback=cb)
    check_timeout_exception(result, lambda result : result.get(timeout=0.01))
    say(result.get())

    # Test imap_async()
    prepare_timeout_exception()
    result = pool.imap_async(timeout_work, range(3, 10), callback=cb)
    check_timeout_exception(result, lambda result : result.get(timeout=0.01))
    for i in result.get():
        say("Item:", i)
    say("### Loop again:")
    for i in result.get():
        say("Item2:", i)

    # Test imap_unordered_async()
    prepare_timeout_exception()
    result = pool.imap_unordered_async(timeout_work, range(10, 3, -1), callback=cb)
    check_timeout_exception(result, lambda result : result.get(timeout=0.01))
    for i in result.get():
        say("Item1:", i)
    for i in result.get():
        say("Item2:", i)
    # Iterate the same returned object several times.
    r = result.get()
    for i in r:
        say("Item3:", i)
    for i in r:
        say("Item4:", i)
    for i in r:
        say("Item5:", i)

    #
    # The case for the exceptions
    #
    # NOTE(review): work() negates negative inputs before sleeping, so
    # it is not evident from this function what would raise IOError or
    # ValueError below -- confirm whether these handlers are still
    # expected to trigger.

    # Exceptions in imap_unordered_async()
    result = pool.imap_unordered_async(work, range(2, -10, -1), callback=cb)
    time.sleep(3)
    try:
        for i in result.get():
            say("Got item:", i)
    except (IOError, ValueError):
        say("Good. Got expected exception")

    # Exceptions in imap_async()
    result = pool.imap_async(work, range(2, -10, -1), callback=cb)
    time.sleep(3)
    try:
        for i in result.get():
            say("Got item:", i)
    except (IOError, ValueError):
        say("Good. Got expected exception")

    # Stop the test: need to stop the pool !!!
    pool.terminate()
    pool.join()
Use [template.md](template.md) to create the `README.md` file in that directory. The folder can contain other files referenced by the `README.md` file, such as figures. Once two maintainers approve the PR, it is merged into the `rfcs/proposed` directory. Update the RFC document with additional information as the RFC moves to different states. A proposal that is subsequently implemented and released in oneTBB as a preview feature is moved into the `rfcs/experimental` folder. The RFC for a preview feature in `rfcs/experimental` should include a description of what is required to move from experimental to fully supported -- for example, feedback from users, demonstrated performance improvements, etc. A proposal that is implemented, added to the oneTBB specification, and supported as a full feature appears in the `rfcs/supported` directory. An RFC for a fully supported feature in the `rfcs/supported` directory should have a link to the section in the oneTBB specification with its formal wording. A feature that is removed or a proposal that is abandoned or rejected will be moved to the `rfcs/archived` folder. ## Document Style The design documents are stored in the `rfcs` directory, and each RFC is placed in its subdirectory under `rfcs/proposed/_`. - There must be a `README.md` file that contains the main RFC itself (or links to a file that contains it in the same directory). - The RFC should follow the [template.md](template.md) structure. - The directory can contain other supporting files, such as images, tex formulas, and sub-proposals / sub-RFCs. - We highly recommend using a text-based file format like markdown for easy collaboration on GitHub, but other formats like PDFs may also be acceptable. - For the markdown-written RFC, keep the text width within 100 characters, unless there is a reason to violate this rule, e.g., long links or wide tables. 
- It is also recommended to read through existing RFCs to better understand the general writing style and required elements. ================================================ FILE: third-party/tbb/rfcs/archived/README.md ================================================ # Archived Design Documents Documents may appear in the `rfcs/archived` directory for one of two reasons: 1. The document describes a feature or extension that has been deprecated and then removed. 2. The document describes a proposed feature or extension that has not (ultimately) become a fully supported feature. Design documents that appear in the `rfcs/archived` folder should describe a reason for archiving. Documents may remain in this folder indefinitely to serve as a source of information about previous proposals and features. ================================================ FILE: third-party/tbb/rfcs/experimental/README.md ================================================ # Design Documents for Experimental Features Experimental proposals describe extensions that are implemented and released as preview features in the oneTBB library. A preview feature is expected to have an implementation that is of comparable quality to a fully supported feature. Sufficient tests are required. An experimental feature does not yet appear as part of the oneTBB specification. Therefore, the interface and design can change. There is no commitment to backward compatibility for a preview feature. The documents in this directory should include a list of the exit conditions that need to be met to move from preview to fully supported. These conditions might include demonstrated performance improvements, demonstrated interest from the community, acceptance of the required oneTBB specification changes, etc. For features that require oneTBB specification changes, the document might include wording for those changes or a link to any PRs opened against the specification.
Proposals should not remain in the experimental directory forever. They should move either to the supported folder when they become fully supported or to the archived folder if they are not fully accepted. It should be highly unusual for a proposal to stay in the experimental folder for longer than a year or two. ================================================ FILE: third-party/tbb/rfcs/experimental/blocked_nd_range_ctad/README.md ================================================ # Enabling CTAD for blocked_nd_range ## Introduction The `oneapi::tbb::blocked_nd_range` class was introduced as a representation of a recursively divisible N-dimensional range for oneTBB parallel algorithms. This document proposes extending its API with deduction guides (available since C++17) to allow dropping the explicit template arguments specification while creating the object if they can be determined using the arguments provided: ```cpp oneapi::tbb::blocked_range<int> range1(0, 100); oneapi::tbb::blocked_range<int> range2(0, 200); oneapi::tbb::blocked_range<int> range3(0, 300); // Since 3 ranges of type int are provided, the type of nd_range // can be deduced as oneapi::tbb::blocked_nd_range<int, 3> oneapi::tbb::blocked_nd_range nd_range(range1, range2, range3); ``` ## Proposal ### Supported constructors The `oneapi::tbb::blocked_nd_range` supports the following set of constructors: ```cpp template <typename Value, unsigned int N> class blocked_nd_range { public: using value_type = Value; using dim_range_type = blocked_range<value_type>; using size_type = typename dim_range_type::size_type; blocked_nd_range(const dim_range_type& dim0, /*exactly N arguments of type const dim_range_type&*/); // [1] blocked_nd_range(const value_type (&dim_size)[N], size_type grainsize = 1); // [2] blocked_nd_range(blocked_nd_range& r, split); // [3] blocked_nd_range(blocked_nd_range& r, proportional_split proportion); // [4] }; ``` The constructor `[1]` is intended to create an n-dimensional interval by providing N one-dimensional ranges.
Each element represents the dimension of the N-dimensional interval being constructed. It also allows constructing these one-dimensional intervals in-place from brace-enclosed initializer lists: ```cpp // Passing blocked_range objects themselves tbb::blocked_range<int> dim_range(0, 100); tbb::blocked_nd_range<int, 2> nd_range_1(dim_range, tbb::blocked_range<int>(0, 200)); // Constructing in-place from braced-init-lists tbb::blocked_nd_range<int, 2> nd_range_2({0, 100}, {0, 200, 5}); // Combined approach tbb::blocked_nd_range<int, 2> nd_range_3(dim_range, {0, 200, 5}); ``` The constructor `[2]` is intended to create an interval by providing a C-array each element of which represents a size of the corresponding dimension of the interval being created. This constructor also allows passing a braced-init-list instead of an array object: ```cpp // Passing array object itself int sizes[3] = {100, 200, 300}; // Constructing the 3-dim range [0, 100), [0, 200), [0, 300) tbb::blocked_nd_range<int, 3> nd_range_1(sizes); // Using the grainsize - each dim range will have grainsize 5 tbb::blocked_nd_range<int, 3> nd_range_2(sizes, 5); // Passing the braced-init-list tbb::blocked_nd_range<int, 3> nd_range_3({100, 200, 300}); ``` In case of passing the template arguments explicitly, using a braced-init-list in both constructors `[1]` and `[2]` does not introduce any ambiguity, since the number of braced-init-lists provided is always equal to the number of dimensions of the range for constructor `[1]`, and the number of elements in the braced-init-list is equal to the number of dimensions for constructor `[2]`. Constructors `[3]` and `[4]` are intended to split the range into two parts. They are part of _Range_ Named Requirements and used internally in the implementation of oneTBB parallel algorithms. ### Proposed deduction guides This paper proposes to add explicit deduction guides for the `blocked_nd_range` class: ```cpp // [g1] template <typename Value, typename... Values> blocked_nd_range(blocked_range<Value>, blocked_range<Values>...)
-> blocked_nd_range<Value, 1 + sizeof...(Values)>; ``` This deduction guide corresponds to the constructor `[1]` for the case of passing _N_ `blocked_range` objects themselves. It only participates in overload resolution if all of the types in `Values` are the same as `Value`. To cover the case where `blocked_range` objects are passed as braced-init-lists, it is proposed to add a deduction guide taking a set of C-array objects. There are currently two options for defining the deduction guide (or a function) taking the braced-init-list of any type: a C-array and `std::initializer_list`. The issue with `std::initializer_list` is that it does not allow tracking the size at compile time. ```cpp // [g2] template <typename Value, unsigned int... Ns> blocked_nd_range(const Value (&...)[Ns]) -> blocked_nd_range<Value, sizeof...(Ns)>; ``` This deduction guide only participates in overload resolution if 1. the number of C-arrays provided is more than 1 (`sizeof...(Ns) > 1`), 2. each C-array has the size 2 or 3. The first constraint is intended to disambiguate between `[1]` and `[2]`. See [separate section](#ambiguity-while-passing-the-single-braced-init-list-of-size-2-or-3) for more details. The second constraint is intended to accept only the braced-init-lists that can be used to initialize the `blocked_range` object. Currently it supports the constructor with 2 parameters, taking _begin_ and _end_ of the interval, and the constructor with 3 parameters, taking additional _grainsize_ parameter. The important limitation of the deduction guide `[g2]` is that all of the elements in the braced-init-list should be of the same type.
It would be impossible to use this constructor for initializing the `blocked_range` objects of types that are not convertible to `size_type` together with the grainsize: ```cpp std::vector<int> vector; // OK, deduced as blocked_nd_range<std::vector<int>::iterator, 1> blocked_nd_range range1({vector.begin(), vector.end()}); // FAIL, all items in braced-init-lists should be objects of the same type // It is impossible to provide grainsize as iterator since iterator is not convertible to size_type blocked_nd_range range({vector.begin(), vector.end(), /*grainsize = */5}); ``` For the constructor `[2]`, the following deduction guide is proposed: ```cpp // [g3] template <typename Value, unsigned int N> blocked_nd_range(const Value (&)[N]) -> blocked_nd_range<Value, N>; ``` For service constructors `[3]` and `[4]`, the following guides are proposed: ```cpp // [g4] template <typename Value, unsigned int N> blocked_nd_range(blocked_nd_range<Value, N>, split) -> blocked_nd_range<Value, N>; // [g5] template <typename Value, unsigned int N> blocked_nd_range(blocked_nd_range<Value, N>, proportional_split) -> blocked_nd_range<Value, N>; ``` From the specification perspective, such deduction guides can be generated as implicit deduction guides, in the same manner as copy and move constructors. But in the current oneTBB implementation, these deduction guides are not generated implicitly, so the explicit guides are required. Guides `[g4]` and `[g5]` are not proposed to be a part of the spec, only a part of oneTBB implementation. ## Open Questions ### Ambiguity while passing the single braced-init-list of size 2 or 3 While using the CTAD with `blocked_nd_range`, there is an ambiguity between two approaches while using a single braced-init-list of size 2 or 3: ```cpp blocked_nd_range range1({10, 20}); blocked_nd_range range2({10, 20, 5}); ``` Since the template arguments for `blocked_nd_range` are not specified, there can be two possible resolutions: 1. Be interpreted as one-dimensional range _[10, 20)_ (with grainsize 1 or 5). In this case it should be deduced as `blocked_nd_range<int, 1>` and constructed using the constructor `[1]`. 2.
Be interpreted as two (or three) dimensional range _[0, 10)_, _[0, 20)_ (and _[0, 5)_). In this case it should be deduced as `blocked_nd_range<int, 2>` (or `blocked_nd_range<int, 3>`) and constructed using the constructor `[2]`. Since it is unclear which resolution should be chosen, the current proposal is not to support this use case in CTAD and to require the user either to explicitly specify the template arguments, or to use an array or the `blocked_range` type itself to initialize the object. ### Passing single C-array object of size 2 or 3 Another interesting issue that should be resolved is passing a single C-array object of size 2 or 3 to the constructor: ```cpp int array[2] = {100, 200}; tbb::blocked_nd_range range(array); ``` Since the `blocked_range` is not constructible from a C-array and the braced-init-list is not used, the user expects the range to be deduced as `blocked_nd_range<int, 2>` and the constructor `[2]` to be used. If we add one more explicit deduction guide to support the code above, the single braced-init-list of size 2 or 3 would also match this guide. This issue can be resolved in one of the following ways: * Add a new deduction guide to support the code above. The downside of this approach is that it also resolves the ambiguity discussed in the [previous section](#ambiguity-while-passing-the-single-braced-init-list-of-size-2-or-3), always producing a 2- or 3-dimensional range. If the user provided the single braced-init-list to have a one-dimensional range, they would face the unexpected behavior without any diagnostics. * Document the code above as a limitation and do not support it. The downside is that the code above is considered valid, but cannot be supported because of the implementation of CTAD and the current set of constructors. * Support the use-case above but do not support CTAD for braced-init-lists at all. The major downside is that the user would need to always specify the exact type `tbb::blocked_range` while using the braced-init-list construction.
### Using the constructor `[1]` with "mixed" arguments There is a limitation of the deduction guides proposed if the constructor `[1]` is used with both arguments of exact `tbb::blocked_range` type and the braced-init-lists: ```cpp tbb::blocked_range<int> dim_range(0, 100); tbb::blocked_nd_range nd_range(dim_range, {0, 200}, {0, 300}, dim_range); ``` These arguments would match neither `[g1]` nor `[g2]`, and it is unclear how to define the deduction guide that covers this case. Current proposal is to keep this scenario a limitation for using the CTAD and always require using a consistent set of parameters: either a set of braced-init-lists or a set of `tbb::blocked_range` objects. ## Exit criteria The following conditions need to be met to move the feature from experimental to fully supported: * Collecting feedback on user experience confirming the choices made on the open questions and limitations: * Preference of multi-dimensional range while deducing from the C-array or braced-init-list of size 2 or 3. See [separate section](#passing-single-c-array-object-of-size-2-or-3) for more details. * Limitation for the deduction from the braced-init-list to accept only the lists of items of the same type. * Limitation for the deduction guide `[g1]` in case of mixing `blocked_range` objects and braced-init-lists. See [separate section](#using-the-constructor-1-with-mixed-arguments) for more details. * The corresponding oneTBB specification update should be done backed by the user feedback provided. ================================================ FILE: third-party/tbb/rfcs/experimental/parallel_phase_for_task_arena/README.md ================================================ # Adding API for parallel phase to task_arena to warm-up/retain/release worker threads ## Introduction In oneTBB, there has never been an API that allows users to block worker threads within the arena. This design choice was made to preserve the composability of the application.
Before PR#1352, workers moved to the thread pool to sleep once there were no arenas with active demand. However, PR#1352 introduced a delayed leave behavior to the library that results in blocking threads for an _implementation-defined_ duration inside an arena if there is no active demand across all arenas. This change significantly improved performance for various applications on high thread count systems. The main idea is that usually, after one parallel computation ends, another will start after some time. The delayed leave behavior is a heuristic to utilize this, covering most cases within _implementation-defined_ duration. However, the new behavior is not the perfect match for all the scenarios: * The heuristic of delayed leave is unsuitable for tasks that are submitted in an unpredictable pattern and/or with unpredictable durations. * If oneTBB is used in composable scenarios it is not behaving as a good citizen consuming CPU resources. * For example, if an application runs a series of stages where oneTBB is used for one stage and OpenMP is used for a subsequent stage, there is a chance that oneTBB workers will interfere with OpenMP threads. This interference might result in slight oversubscription, which in turn might lead to underperformance. So there are two related problems but with different resolutions: * Completely disable new behavior for scenarios where the heuristic of delayed leave is unsuitable. * Optimize library behavior so customers can benefit from the heuristic of delayed leave but make it possible to indicate that "it is the time for the TBB arena to release threads". ## Proposal Let's tackle these problems one by one. ### Completely disable new behavior Let’s consider both “Delayed leave” and “Fast leave” as 2 different states in a state machine.
* The "Delayed leave" heuristic benefits most of the workloads. Therefore, this is the default behavior for arena. * Workloads that have rather negative performance impact from the heuristic of delayed leave can create an arena in “Fast leave” state. There will be a question that we need to answer: * Do we see any value if arena potentially can transition from one to another state? To answer this question, the following scenarios should be considered: * What if different types of workloads are mixed in one application? * Different types of arenas can be used for different types of workloads. ### When threads should leave? oneTBB itself can only guess when the ideal time to release threads from the arena is. Therefore, it does its best effort to preserve and enhance performance without completely messing up composability guarantees (that is how delayed leave is implemented). As we already discussed, there are cases where it does not work perfectly, therefore customers that want to further optimize this aspect of oneTBB behavior should be able to do it. This problem can be considered from another angle. Essentially, if the user can indicate where parallel computation ends, they can also indicate where it starts. With this approach, the user not only releases threads when necessary but also specifies a programmable block where worker threads should expect new work coming regularly to the executing arena. Let’s add a new state to the existing state machine to represent the "Parallel Phase" state. > **_NOTE:_** The "Fast leave" state is colored Grey just for simplicity of the chart. Let's assume that arena was created with the "Delayed leave". The logic demonstrated below is applicable to the "Fast leave" as well. This state diagram leads to several questions: * What if there are multiple Parallel Phases? * If “End of Parallel Phase” leads back to “Delayed leave” how soon will threads be released from arena?
* What if we indicated that threads should leave arena after the "Parallel Phase"? * What if we just indicated the end of the "Parallel Phase"? The extended state machine aims to answer these questions. * The first call to the “Start of Phase” will transition into the “Parallel Phase” state. * The last call to the “End of Phase” will transition back to the “Delayed leave” state or into the "One-time Fast leave" if it is indicated that threads should leave sooner. * Concurrent or nested calls to the “Start of Phase” or the “End of Phase” increment/decrement a reference counter. Let's consider the semantics that an API for explicit parallel phases can provide: * Start of a parallel phase: * Indicates the point from which the scheduler can use a hint and keep threads in the arena for longer. * Serves as a warm-up hint to the scheduler: * Allows reducing delays of computation start by initiating the wake-up of worker threads in advance. * "Parallel phase" itself: * Scheduler can implement different policies to retain threads in the arena. * For instance, more aggressive policy might be implemented for _parallel phase_. It can be beneficial in cases when the default arena leave policy is not sufficient enough. * The semantics for retaining threads is a hint to the scheduler; thus, no real guarantee is provided. The scheduler can ignore the hint and move threads to another arena or to sleep if conditions are met. * End of a parallel phase: * Indicates the point from which the scheduler may drop the hint and no longer retain threads in the arena. * Indicates that worker threads should avoid busy-waiting once there is no more work in the arena. * Temporarily overrides the default arena leave policy, which will be restored when new work is submitted. ### Proposed API Summary of API changes: * Add enumeration class for the arena leave policy. * Add the policy as the last parameter to the arena constructor and initializer defaulted to "automatic". 
* Add functions to start and end the parallel phase to the `task_arena` class and the `this_task_arena` namespace. * Add RAII class to map a parallel phase to a code scope. ```cpp class task_arena { enum class leave_policy : /* unspecified type */ { automatic = /* unspecified */, fast = /* unspecified */, }; task_arena(int max_concurrency = automatic, unsigned reserved_for_masters = 1, priority a_priority = priority::normal, leave_policy a_leave_policy = leave_policy::automatic); task_arena(const constraints& constraints_, unsigned reserved_for_masters = 1, priority a_priority = priority::normal, leave_policy a_leave_policy = leave_policy::automatic); void initialize(int max_concurrency, unsigned reserved_for_masters = 1, priority a_priority = priority::normal, leave_policy a_leave_policy = leave_policy::automatic); void initialize(constraints a_constraints, unsigned reserved_for_masters = 1, priority a_priority = priority::normal, leave_policy a_leave_policy = leave_policy::automatic); void start_parallel_phase(); void end_parallel_phase(bool with_fast_leave = false); class scoped_parallel_phase { scoped_parallel_phase(task_arena& ta, bool with_fast_leave = false); }; }; namespace this_task_arena { void start_parallel_phase(); void end_parallel_phase(bool with_fast_leave = false); } ``` The _parallel phase_ continues until each previous `start_parallel_phase` call to the same arena has a matching `end_parallel_phase` call.
Let's introduce RAII scoped object that will help to manage the contract. If the end of the parallel phase is not indicated by the user, it will be done automatically when the last public reference is removed from the arena (i.e., task_arena has been destroyed or, for an implicitly created arena, the thread that owns it has completed). This ensures correctness is preserved (threads will not be retained forever). ### Examples Following code snippets show how the new API can be used. ```cpp void task_arena_leave_policy_example() { tbb::task_arena ta{tbb::task_arena::automatic, 1, priority::normal, leave_policy::fast}; ta.execute([]() { // Parallel computation }); // Different parallel runtime is used // so it is preferred that worker threads won't be retained // in the arena at this point. #pragma omp parallel for for (int i = 0; i < work_size; ++i) { // Computation } } void parallel_phase_example() { tbb::this_task_arena::start_parallel_phase(); tbb::parallel_for(0, work_size, [] (int idx) { // User defined body }); // Some serial computation tbb::parallel_for(0, work_size, [] (int idx) { // User defined body }); tbb::this_task_arena::end_parallel_phase(/*with_fast_leave=*/true); // Different parallel runtime (for example, OpenMP) is used // so it is preferred that worker threads won't be retained // in the arena at this point. #pragma omp parallel for for (int i = 0; i < work_size; ++i) { // Computation } } void scoped_parallel_phase_example() { tbb::task_arena ta{/*arena constraints*/}; { // Start of the parallel phase tbb::task_arena::scoped_parallel_phase phase{ta, /*with_fast_leave=*/true}; ta.execute([]() { // Parallel computation }); // Serial computation ta.execute([]() { // Parallel computation }); } // End of the parallel phase // Different parallel runtime (for example, OpenMP) is used // so it is preferred that worker threads won't be retained // in the arena at this point. 
#pragma omp parallel for for (int i = 0; i < work_size; ++i) { // Computation } } ``` ## Considerations The alternative approaches were also considered.
We can express this state machine as complete graph and provide low-level interface that will give control over state transition. We considered this approach too low-level. Plus, it leaves a question: "How to manage concurrent changes of the state?". The retaining of worker threads should be implemented with care because it might introduce performance problems if: * Threads cannot migrate to another arena because they are retained in the current arena. * Compute resources are not homogeneous, e.g., the CPU is hybrid. Heavier involvement of less performant core types might result in artificial work imbalance in the arena. ## Technical Details To implement the proposed feature, the following changes were made: * Added a new entity `thread_leave_manager` to the `r1::arena` which is responsible for managing the state of workers' arena leaving behaviour. * Introduced two new entry points to the library. * `r1::enter_parallel_phase(d1::task_arena_base*, std::uintptr_t)` - used to communicate the start of parallel phase with the library. * `r1::exit_parallel_phase(d1::task_arena_base*, std::uintptr_t)` - used to communicate the end of parallel phase with the library. ### Thread Leave Manager `thread_leave_manager` class implements the state machine described in the proposal. Specifically, it controls when worker threads are allowed to be retained in the arena. `thread_leave_manager` is initialized with a state that determines the default behavior for workers leaving the arena. To support `start/end_parallel_phase` API, it provides functionality to override the default state with a "Parallel Phase" state. It also keeps track of the number of active parallel phases. The following sequence diagram illustrates the interaction between the user and the `thread_leave_manager` during the execution of parallel phases. It shows how the `thread_leave_manager` manages the state transitions when using `start/end_parallel_phase`.
## Open Questions in Design Some open questions that remain: * Are the suggested APIs sufficient? * In the current version of proposed API, the `scoped_parallel_phase` object can be created only for already existing `task_arena`. Should it be possible for `this_task_arena` as well? * What should be expected from "Parallel Phase" API for `this_task_arena` when a calling thread doesn't yet have any associated arena? * Should parallel phase API be limited only to RAII-only style? * Are there any scenarios where inconvenience of handling `scoped_parallel_phase` object is not acceptable? * Are there additional use cases that should be considered that we missed in our analysis? * Do we see any value if arena potentially can transition from one to another state? * What if different types of workloads are mixed in one application? * What if there are concurrent calls to this API? ## Conditions to become fully supported Following conditions need to be met for the feature to move from experimental to fully supported: * Open questions regarding API should be resolved. * The feature should demonstrate performance improvements in scenarios mentioned. * oneTBB specification needs to be updated to reflect the new feature. ================================================ FILE: third-party/tbb/rfcs/proposed/README.md ================================================ # Design Documents for Proposed Features Proposed features in this directory have reached some level of consensus within the community, indicating that they have potential and deserve further development. However, the proposed changes have not yet been released as a preview or fully supported feature of the library. RFCs in the `rfcs/proposed` directory should explain the motivation, design, and open questions related to the proposed extension.
================================================ FILE: third-party/tbb/rfcs/proposed/loading-dependencies/loading-dependencies-by-module-name.org ================================================ #+title: Loading Dependencies By Module Name * Introduction There is a well-known attack that involves loading of a malicious dependency instead of the original one without notice to the party that does this loading. In the industry it is usually called /DLL injection/ or /DLL preloading attack/ and it is mostly associated with the Windows platform as it is known to be particularly vulnerable to this kind of attack [1]. One of the recommendations that safeguards against this type of attack is to specify fully qualified path to a dependency [2]. Historically, oneTBB loads its optional dependencies during its initialization process when these dependencies are used for the first time. The way oneTBB does this is by building full paths to their dependencies using the path where the oneTBB library itself resides. It is the only sensible path which can be obtained by oneTBB, whose usage conditions are not known at the time of development. The purpose is to minimize the risk of a DLL injection attack issue so that only certain paths are probed by the system loader. However, dependencies of a dependency are still searched by their module names only [3]. So, the risk is minimized only for a dependency itself and not for the libraries it depends on, not to mention that the file of a dependency can be replaced in the file system by an attacker, which breaks even that protection. Besides that, loading of a dependency by specifying full path represents an inconvenience to the developers who want to make use of their own variant of a dependency. 
Not only they need to place their variant of a dependency to all of the places from which it is going to be found and loaded by every client component that depends on it, but also this makes problematic the implementation (if not impossible) of some scenarios where the dependency being loaded maintains single state shared among all its clients. Such scenarios are hard to implement because copies of the same DLL loaded from different paths are considered to be different DLLs and in certain cases there is no support for filesystem linking mechanism to point to a single file [4, 5]. So, what is the main problem due to which loading by a module name makes Windows much more vulnerable to DLL injection than Linux? Besides difference in the order of accessing paths specified in the environment, Windows also prioritizes searching in the directory from which the application is loaded and current working directory [2]. Assuming that application is loaded from a directory that requires administrative permission on write, which is usually the case, it is the current working directory that forms the main DLL preloading attack scenario [1]. There are approaches to exclude the current working directory from the search order. However, for a library to avoid process-wide changes to the search order the only viable solution for run-time loading is to pass ~LOAD_LIBRARY_SAFE_CURRENT_DIRS~ flag to the ~LoadLibraryEx~ Windows API [6]. With the removal of the current working directory from loader's consideration, the search order on Windows starts having little difference with the search order on Linux. The difference includes the order in which directories specified in the environment and system directories are considered, and the presence of the first step of looking into an application directory on Windows [2, 7]. Since the system environment variables and the environment of other processes cannot be changed, the only vulnerable place is an application directory [8, 9]. 
Because the application can be installed in a directory that does not require administrative permissions on write, it still can be started by an account having them. Unlike Linux systems, for the process started with administrative permissions, the paths specified in the environment and the application directory are still considered by the Windows system loader [2, 7]. Therefore, an attacker can update permissive installation directory with a malicious version of a binary, hence making it loaded in a process with elevated permissions. Note that specifying fully qualified path to the dependency does not help in this case. Fortunately, there is a signature verification process that helps validating the authenticity of a binary before loading it into process address space and starting its execution. This allows making use of the established search order while checking that genuine version of the dependency is used. However, not loading the binary because of the failed signature verification might not be always desired. Especially, during the development phase or for a software distributor who does not have the signature with which to sign the binary. Therefore, to preserve backward compatibility of such usage models, it is essential to have the possibility to disable signature verification. * Proposal Based on the analysis in the "Introduction" section and to support versatile distribution models of oneTBB this RFC proposes to: On Windows only: 1. Introduce signature verification step to the run-time dependency loading process. 2. Introduce the ~TBB_VERIFY_DEPENDENCY_SIGNATURE~ compilation option that would enable signature verification, and set it ~ON~ by default. 3. Update documentation to include information about new ~TBB_VERIFY_DEPENDENCY_SIGNATURE~ flag. 4. 
Pass ~LOAD_LIBRARY_SAFE_CURRENT_DIRS~ flag to the ~LoadLibraryEx~ calls so that current working directory is excluded from the list of directories in which the system loader looks when trying to find and resolve dependency. On all OSes: - Change dependency loading approach to load by module names only. * References 1. [[https://support.microsoft.com/en-us/topic/secure-loading-of-libraries-to-prevent-dll-preloading-attacks-d41303ec-0748-9211-f317-2edc819682e1][Microsoft, "Secure loading of libraries to prevent DLL preloading attacks".]] 2. [[https://learn.microsoft.com/en-us/windows/win32/dlls/dynamic-link-library-security][Microsoft, "Dynamic-Link Library Security", 7 January 2021]] 3. [[https://learn.microsoft.com/en-us/windows/win32/dlls/dynamic-link-library-search-order#factors-that-affect-searching][Microsoft, "Dynamic-link library search order", 9 February 2023]]. 4. [[https://learn.microsoft.com/en-us/windows/win32/dlls/run-time-dynamic-linking][Microsoft, "Run-Time Dynamic Linking", 7 January 2021]] 5. [[https://github.com/NuGet/Home/issues/10734][NuGet project issue on GitHub, "NuGet packaging should support symlinks within packages", 7 April 2021]] 6. [[https://learn.microsoft.com/en-us/windows/win32/api/LibLoaderAPI/nf-libloaderapi-loadlibraryexa][Microsoft, "LoadLibraryExA function (libloaderapi.h)", 9 February 2023]] 7. [[https://www.man7.org/linux/man-pages/man8/ld.so.8.html][Linux man-pages 6.9.1, "ld.so(8) — Linux manual page", 8 May 2024]] 8. [[https://learn.microsoft.com/en-us/windows/win32/procthread/environment-variables][Microsoft, "Environment Variables", 7 January 2021]] 9. 
[[https://learn.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-setenvironmentvariable][Microsoft, "SetEnvironmentVariable function (winbase.h)", 23 September 2022]] ================================================ FILE: third-party/tbb/rfcs/proposed/numa_support/README.md ================================================ # NUMA support ## Introduction In Non-Uniform Memory Access (NUMA) systems, the cost of memory accesses depends on the *nearness* of the processor to the memory resource on which the accessed data resides. While oneTBB has core support that enables developers to tune for Non-Uniform Memory Access (NUMA) systems, we believe this support can be simplified and improved to provide an improved user experience. This RFC acts as an umbrella for sub-proposals that address four areas for improvement: 1. improved reliability of HWLOC-dependent topology and pinning support in, 2. addition of a NUMA-aware allocation, 3. simplified approaches to associate task distribution with data placement and 4. where possible, improved out-of-the-box performance for high-level oneTBB features. We expect that this draft proposal will spawn sub-proposals that will progress independently based on feedback and prioritization of the suggested features. The features for NUMA tuning already available in the oneTBB 1.3 specification include: - Functions in the `tbb::info` namespace **[info_namespace]** - `std::vector numa_nodes()` - `int default_concurrency(numa_node_id id = oneapi::tbb::task_arena::automatic)` - `tbb::task_arena::constraints` in **[scheduler.task_arena]** Below is the example based on existing oneTBB documentation that demonstrates the use of these APIs to pin threads to different arenas to each of the NUMA nodes available on a system, submit work across those `task_arena` objects and into associated `task_group` objects, and then wait for work again using both the `task_arena` and `task_group` objects. 
void constrain_for_numa_nodes() { std::vector numa_nodes = tbb::info::numa_nodes(); std::vector arenas(numa_nodes.size()); std::vector task_groups(numa_nodes.size()); // initialize each arena, each constrained to a different NUMA node for (int i = 0; i < numa_nodes.size(); i++) arenas[i].initialize(tbb::task_arena::constraints(numa_nodes[i]), 0); // enqueue work to all but the first arena, using the task_group to track work // by using defer, the task_group reference count is incremented immediately for (int i = 1; i < numa_nodes.size(); i++) arenas[i].enqueue( task_groups[i].defer([] { tbb::parallel_for(0, N, [](int j) { f(w); }); }) ); // directly execute the work to completion in the remaining arena arenas[0].execute([] { tbb::parallel_for(0, N, [](int j) { f(w); }); }); // join the other arenas to wait on their task_groups for (int i = 1; i < numa_nodes.size(); i++) arenas[i].execute([&task_groups, i] { task_groups[i].wait(); }); } ### The need for application-specific knowledge In general when tuning a parallel application for NUMA systems, the goal is to expose sufficient parallelism while minimizing (or at least controlling) data access and communication costs. The tradeoffs involved in this tuning often rely on application-specific knowledge. In particular, NUMA tuning typically involves: 1. Understanding the overall application problem and its use of algorithms and data containers 2. Placement/allocation of data container objects onto memory resources 3. Distribution of tasks to hardware resources that optimize for data placement As shown in the previous example, the oneTBB 1.3 specification only provides low-level support for NUMA optimization. The `tbb::info` namespace provides topology discovery. And the combination of `task_arena`, `task_arena::constraints` and `task_group` provide a mechanism for placing tasks onto specific processors. 
There is no high-level support for memory allocation or placement, or for guiding the task distribution of algorithms. ### Issues that should be resolved in the oneTBB library **The behavior of existing features is not always predictable.** There is a note in section **[info_namespace]** of the oneTBB specification that describes the function `std::vector numa_nodes()`, "If error occurs during system topology parsing, returns vector containing single element that equals to `task_arena::automatic`." In practice, the error can occur because HWLOC is not detected on the system. While the oneTBB documentation states in several places that HWLOC is required for NUMA support and even provides guidance on [how to check for HWLOC](https://www.intel.com/content/www/us/en/docs/onetbb/get-started-guide/2021-12/next-steps.html), the inability to resolve HWLOC at runtime silently returns a default of `task_arena::automatic`. This default does not pin threads to NUMA nodes. It is too easy to write code similar to the preceding example and be unaware that a HWLOC installation error (or lack of HWLOC) has undone all your effort. **Getting good performance using these tools requires notable manual coding effort by users.** As we can see in the preceding example, if we want to spread work across the NUMA nodes in a system we might need to query the topology using functions in the `tbb::info` namespace, create one `task_arena` per NUMA node, along with one `task_group` per NUMA node, and then add an extra loop that iterates over these `task_arena` and `task_group` objects to execute the work on the desired NUMA nodes. We also need to handle all container allocations using OS-specific APIs (or behaviors, such as first-touch) to allocate or place them on the appropriate NUMA nodes. **The out-of-the-box performance of the generic TBB APIs on NUMA systems is not good enough.** Should the oneTBB library do anything special by default if the system is a NUMA system?
Or should regular random stealing distribute the work across all of the cores, regardless of which NUMA first touched the data? Is it reasonable for a developer to expect that a series of loops, such as the ones that follow, will try to create a NUMA-friendly distribution of tasks so that accesses to the same elements of `b` and `c` in the two loops are from the same NUMA nodes? Or is this too much to expect without providing hints? tbb::parallel_for(0, N, [](int i) { b[i] = f(i); c[i] = g(i); }); tbb::parallel_for(0, N, [](int i) { a[i] = b[i] + c[i]; }); ## Possible Sub-Proposals ### Increased availability of NUMA support See [sub-RFC for increased availability of NUMA API](tbbbind-link-static-hwloc.org) ### Add NUMA-constrained arenas See [sub-RFC for creation and use of NUMA-constrained arenas](numa-arenas-creation-and-use.org) ### NUMA-aware allocation Define allocators or other features that simplify the process of allocating or placing data onto specific NUMA nodes. ### Simplified approaches to associate task distribution with data placement As discussed earlier, NUMA-aware allocation is just the first step in optimizing for NUMA architectures. We also need to deliver mechanisms to guide task distribution so that tasks are executed on execution resources that are near to the data they access. oneTBB already provides low-level support through `tbb::info` and `tbb::task_arena`, but we should up-level this support into the high-level algorithms, flow graph and containers where appropriate. ### Improved out-of-the-box performance for high-level oneTBB features. For high-level oneTBB features that are modified to provide improved NUMA support, we can try to align default behaviors for those features with user-expectations when used on NUMA systems. ## Open Questions 1. Do we need simplified support, or are users that want NUMA support in oneTBB willing to, or perhaps even prefer, to manage the details manually? 2. 
Is it reasonable to expect good out-of-the-box performance on NUMA systems without user hints or guidance. ================================================ FILE: third-party/tbb/rfcs/proposed/numa_support/tbbbind-link-static-hwloc.org ================================================ # -*- fill-column: 80; -*- #+title: Link ~tbbbind~ with Static HWLOC for NUMA API predictability *Note:* This document is a sub-RFC of the [[file:README.md][umbrella RFC about improving NUMA support]]. Specifically, the "Increased availability of NUMA support" section. * Introduction oneTBB has a soft dependency on several variants of ~tbbbind~, which the library loads during the initialization stage. Each ~tbbbind~, in turn, has a hard dependency on a specific version of the HWLOC library [1, 2]. The soft dependency means that the library continues the execution even if the system loader fails to resolve the hard dependency on HWLOC for ~tbbbind~. In this case, oneTBB does not discover the hardware topology. Instead, it defaults to viewing all CPU cores as uniform, consistent with TBB behavior when NUMA constraints are not used. As a result, the following code returns the irrelevant values that do not reflect the actual topology: #+begin_src C++ std::vector numa_nodes = oneapi::tbb::info::numa_nodes(); std::vector core_types = oneapi::tbb::info::core_types(); #+end_src This lack of valid HW topology, caused by the absence of a third-party library, is the major problem with the current oneTBB behavior. The problem lies in the lack of diagnostics making it difficult for developers to detect. As a result, the code continues to run but fails to use NUMA as intended. Dependency on a shared HWLOC library has the following benefits: 1. Code reuse with all of the positive consequences out of this, including relying on the same code that has been tested and debugged, allowing the OS to share it among different processes, which consequently improves on cache locality and memory footprint. 
That's the primary purpose of shared libraries. 2. A drop-in replacement. Users are able to use their own version of HWLOC without recompilation of oneTBB. This specific version of HWLOC could include a hotfix to support a particular and/or new hardware that a customer has, but whose support is not yet upstreamed to HWLOC project. It is also possible that such support won't be upstreamed at all if that hardware is not going to be available for massive users. It could also be a development version of HWLOC that someone wants to test on their systems first. Of course, they can do it with the static version as well, but that's more cumbersome as it requires recompilation of every dependent component. The only disadvantage from depending on HWLOC library dynamically is that the developers that use oneTBB's NUMA support API need to make sure the library is available and can be found by oneTBB. Depending on the distribution model of a developer's code, this is achieved either by: 1. Asking the end user to have necessary version of a dependency pre-installed. 2. Bundling necessary HWLOC version together with other pieces of a product release. However, the requirement to fulfill one of the above steps for the NUMA API to start paying off may be considered as an inconvenience and, what is more important, it is not always obvious that one of these steps is needed. Especially, due to silent behavior in case HWLOC library cannot be found in the environment. The proposal is to reduce the effect of the disadvantage of relying on a dynamic HWLOC library. The improvements involve statically linking HWLOC with one of the ~tbbbind~ libraries distributed together with oneTBB. At the same time, you retain the flexibility to specify different version of HWLOC library if needed. Since HWLOC 1.x is an older version and modern operating systems install HWLOC 2.x by default, the probability of users being restricted to HWLOC 1.x is relatively small.
Thus, we can reuse the filename of the ~tbbbind~ library linked to HWLOC 1.x for the library linked against a static HWLOC 2.x. * Proposal 1. Replace the dynamic link of ~tbbbind~ library currently linked against HWLOC 1.x with a link to a static HWLOC library version 2.x. 2. Add loading of that ~tbbbind~ variant as the last attempt to resolve the dependency on functionality provided by the ~tbbbind~ layer. 3. Update the oneTBB documentation, including [[https://uxlfoundation.github.io/oneTBB/search.html?q=tbb%3A%3Ainfo][these pages]], to detail the steps for identifying which ~tbbbind~ is being used. ** Advantages 1. The proposed behavior introduces a fallback mechanism for resolving the HWLOC library dependency when it is not in the environment, while still preferring user-provided versions. As a result, the problematic oneTBB API usage works as expected, returning an enumerated list of actual NUMA nodes and core types on the system the code is running on, provided that the loaded HWLOC library works on that system and that an application properly distributes all binaries of oneTBB, sets the environment so that the necessary variant of ~tbbbind~ library can be found and loaded. 2. Dropping support for HWLOC 1.x, does not introduce an additional ~tbbbind~ variant while maintaining support for widely used versions of HWLOC. ** Disadvantages By default, there is still no diagnostics if you fail to correctly setup an environment with your version of HWLOC. Although, specifying the ~TBB_VERSION=1~ environment variable helps identify configuration issues quickly. * Alternative Handling for Missing System Topology The other behavior in case HWLOC library cannot be found is to be more explicit about the problem of a missing component and to either issue a warning or to refuse working requiring one of the ~tbbbind~ variant to be loaded (e.g., throw an exception). Comparing these alternative approaches to the one proposed. 
** Common Advantages - Explicitly indicates that the functionality being used does not work, instead of failing silently. - Avoids the need to distribute an additional variant of ~tbbbind~ library. ** Common Disadvantages - Requires an additional step from the user to resolve the problem. In other words, it does not provide a complete solution to the problem. *** Disadvantages of Issuing a Warning - The warning may go unnoticed, especially if standard streams are closed. *** Disadvantages of Throwing an Exception - May break existing code that does not expect an exception to be thrown. - Requires introduction of an additional exception hierarchy. * References 1. [[https://www.open-mpi.org/projects/hwloc/][HWLOC project main page]] 2. [[https://github.com/open-mpi/hwloc][HWLOC project repository on GitHub]] ================================================ FILE: third-party/tbb/rfcs/supported/README.md ================================================ # Design Documents for Supported Features Supported proposals describe extensions implemented and released as fully supported features of the oneTBB library. A fully supported feature has a high-quality implementation. If the proposal impacted the public API of the library, it should appear in the oneTBB specification and have supporting documentation in the oneTBB Reference as needed. A fully supported feature is regularly tested. Proposals that appear in `rfcs/supported` may be retained indefinitely to provide insight into the design of existing features. ================================================ FILE: third-party/tbb/rfcs/template.md ================================================ # Descriptive Name for the Proposal ## Introduction Short description of the idea proposed with explained motivation. The motivation could be: - Improved user experience for API changes and extensions. Code snippets to showcase the benefits would be nice here. - Performance improvements, with supporting data if available.
- Improved engineering practices. Introduction may also include any additional information that sheds light on the proposal, such as history of the matter, links to relevant issues and discussions, etc. ## Proposal A full and detailed description of the proposal with highlighted consequences. Depending on the kind of the proposal, the description should cover: - New use cases supported by the extension. - The expected performance benefit for a modification. - The interface of extensions including class definitions or function declarations. A proposal should clearly outline the alternatives that were considered, along with their pros and cons. Each alternative should be clearly separated to make discussions easier to follow. Pay close attention to the following aspects of the library: - API and ABI backward compatibility. The library follows semantic versioning so if any of those interfaces are to be broken, the RFC needs to state that explicitly. - Performance implications, as performance is one of the main goals of the library. - Changes to the build system. While the library's primary building system is CMake, there are some frameworks that may build the library directly from the sources. - Dependencies and support matrix: does the proposal bring any new dependencies or affect the supported configurations? Some other common subsections here are: - Discussion: some people like to list all the options first (as separate subsections), and then have a dedicated section with the discussion. - List of the proposed API and examples of its usage. - Testing aspects. - Short explanation and links to the related sub-proposals, if any. Such sub-proposals could be organized as separate standalone RFCs, but this is not mandatory. If the change is insignificant or doesn't make any sense without the original proposal, you can have it in the RFC. - Execution plan (next steps), if approved. 
## Open Questions

For new proposals (i.e., those in the `rfcs/proposed` directory), list any open questions.

================================================
FILE: third-party/tbb/src/tbb/CMakeLists.txt
================================================
# Copyright (c) 2020-2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Core oneTBB runtime library target.
add_library(tbb
    address_waiter.cpp
    allocator.cpp
    arena.cpp
    arena_slot.cpp
    concurrent_bounded_queue.cpp
    dynamic_link.cpp
    exception.cpp
    governor.cpp
    global_control.cpp
    itt_notify.cpp
    main.cpp
    market.cpp
    tcm_adaptor.cpp
    misc.cpp
    misc_ex.cpp
    observer_proxy.cpp
    parallel_pipeline.cpp
    private_server.cpp
    profiling.cpp
    rml_tbb.cpp
    rtm_mutex.cpp
    rtm_rw_mutex.cpp
    semaphore.cpp
    small_object_pool.cpp
    task.cpp
    task_dispatcher.cpp
    task_group_context.cpp
    thread_dispatcher.cpp
    thread_request_serializer.cpp
    threading_control.cpp
    version.cpp
    queuing_rw_mutex.cpp)

add_library(TBB::tbb ALIAS tbb)

if (WIN32)
    target_sources(tbb PRIVATE tbb.rc)
    # On Windows the binary carries the binary version in its file name.
    set_target_properties(tbb PROPERTIES OUTPUT_NAME "tbb${TBB_BINARY_VERSION}")
endif()

# TODO: Add statistics.cpp

target_compile_definitions(tbb
                           PUBLIC
                           $<$<CONFIG:DEBUG>:TBB_USE_DEBUG>
                           PRIVATE
                           __TBB_BUILD
                           ${TBB_RESUMABLE_TASKS_USE_THREADS}
                           $<$<NOT:$<BOOL:${BUILD_SHARED_LIBS}>>:__TBB_DYNAMIC_LOAD_ENABLED=0>
                           $<$<NOT:$<BOOL:${BUILD_SHARED_LIBS}>>:__TBB_SOURCE_DIRECTLY_INCLUDED=1>)

if (NOT ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(armv7-a|aarch64|mips|arm64|riscv)" OR
         "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64" OR
         WINDOWS_STORE OR
         TBB_WINDOWS_DRIVER))
    target_compile_definitions(tbb PRIVATE __TBB_USE_ITT_NOTIFY)
endif()

target_include_directories(tbb
    PUBLIC
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../../include>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)

target_compile_options(tbb
    PRIVATE
    ${TBB_CXX_STD_FLAG} # TODO: consider making it PUBLIC.
    ${TBB_MMD_FLAG}
    ${TBB_DSE_FLAG}
    ${TBB_WARNING_LEVEL}
    ${TBB_WARNING_SUPPRESS}
    ${TBB_LIB_COMPILE_FLAGS}
    ${TBB_COMMON_COMPILE_FLAGS}
)

# Avoid use of target_link_libraries here as it changes /DEF option to \DEF on Windows.
set_target_properties(tbb PROPERTIES
    DEFINE_SYMBOL ""
    VERSION ${TBB_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION}
    SOVERSION ${TBB_BINARY_VERSION}
)

tbb_handle_ipo(tbb)

if (TBB_DEF_FILE_PREFIX) # If there's no prefix, assume we're using export directives
    set_target_properties(tbb PROPERTIES
        LINK_FLAGS "${TBB_LINK_DEF_FILE_FLAG}\"${CMAKE_CURRENT_SOURCE_DIR}/def/${TBB_DEF_FILE_PREFIX}-tbb.def\""
        LINK_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/def/${TBB_DEF_FILE_PREFIX}-tbb.def"
    )
endif()

# Prefer using target_link_options instead of target_link_libraries to specify link options because
# target_link_libraries may incorrectly handle some options (on Windows, for example).
if (COMMAND target_link_options)
    target_link_options(tbb
        PRIVATE
        ${TBB_LIB_LINK_FLAGS}
        ${TBB_COMMON_LINK_FLAGS}
    )
else()
    target_link_libraries(tbb
        PRIVATE
        ${TBB_LIB_LINK_FLAGS}
        ${TBB_COMMON_LINK_FLAGS}
    )
endif()

target_link_libraries(tbb
    PRIVATE
    Threads::Threads
    ${TBB_LIB_LINK_LIBS}
    ${TBB_COMMON_LINK_LIBS}
)

# Strip debug symbols into a separate .dbg file
if(TBB_LINUX_SEPARATE_DBG)
    if(NOT CMAKE_BUILD_TYPE STREQUAL "release")
        find_program(OBJCOPY_COMMAND objcopy)
        if(NOT OBJCOPY_COMMAND)
            message(WARNING "objcopy command not found in the system")
        else()
            # Invoke the objcopy that find_program() located rather than relying on
            # whatever "objcopy" happens to resolve to when the build runs.
            add_custom_command(TARGET tbb POST_BUILD
                COMMAND ${OBJCOPY_COMMAND} --only-keep-debug $<TARGET_FILE:tbb> $<TARGET_FILE:tbb>.dbg
                COMMAND ${OBJCOPY_COMMAND} --strip-debug $<TARGET_FILE:tbb>
                COMMAND ${OBJCOPY_COMMAND} --add-gnu-debuglink=$<TARGET_FILE:tbb>.dbg $<TARGET_FILE:tbb>
                COMMENT "Creating and associating .dbg file with tbb"
            )
        endif()
    else()
        message(WARNING " TBB_LINUX_SEPARATE_DBG flag is not used on release config")
    endif()
endif()

if(TBB_BUILD_APPLE_FRAMEWORKS)
    set_target_properties(tbb PROPERTIES
        FRAMEWORK TRUE
        FRAMEWORK_VERSION ${TBB_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION}
        XCODE_ATTRIBUTE_PRODUCT_BUNDLE_IDENTIFIER com.intel.tbb
        MACOSX_FRAMEWORK_IDENTIFIER com.intel.tbb
        MACOSX_FRAMEWORK_BUNDLE_VERSION ${TBB_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION}
        MACOSX_FRAMEWORK_SHORT_VERSION_STRING ${TBB_BINARY_VERSION})
endif()

tbb_install_target(tbb)

if (TBB_INSTALL)
    if (MSVC)
        # Create a copy of target linker file (tbb<version>[_debug].lib) with legacy name (tbb[_debug].lib)
        # to support previous user experience for linkage.
        install(FILES
                $<TARGET_LINKER_FILE:tbb>
                DESTINATION lib
                CONFIGURATIONS RelWithDebInfo Release MinSizeRel
                RENAME tbb.lib
                COMPONENT devel
        )

        install(FILES
                $<TARGET_LINKER_FILE:tbb>
                DESTINATION lib
                CONFIGURATIONS Debug
                RENAME tbb_debug.lib
                COMPONENT devel
        )
    endif()

    if(TBB_LINUX_SEPARATE_DBG)
        install(FILES
                $<TARGET_FILE:tbb>.dbg
                DESTINATION lib
                COMPONENT devel
        )
    endif()

    # Values substituted into the pkg-config template below.
    set(_tbb_pc_lib_name tbb)

    if (WIN32)
        set(_tbb_pc_lib_name ${_tbb_pc_lib_name}${TBB_BINARY_VERSION})
    endif()

    if (CMAKE_SIZEOF_VOID_P EQUAL 8)
        set(TBB_PC_NAME tbb)
    else()
        set(TBB_PC_NAME tbb32)
    endif()

    set(_prefix_for_pc_file "${CMAKE_INSTALL_PREFIX}")

    if (IS_ABSOLUTE "${CMAKE_INSTALL_LIBDIR}")
        set(_libdir_for_pc_file "${CMAKE_INSTALL_LIBDIR}")
    else()
        set(_libdir_for_pc_file "\${prefix}/${CMAKE_INSTALL_LIBDIR}")
    endif()

    if (IS_ABSOLUTE "${CMAKE_INSTALL_INCLUDEDIR}")
        set(_includedir_for_pc_file "${CMAKE_INSTALL_INCLUDEDIR}")
    else()
        set(_includedir_for_pc_file "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
    endif()

    configure_file(${PROJECT_SOURCE_DIR}/integration/pkg-config/tbb.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${TBB_PC_NAME}.pc @ONLY)
    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${TBB_PC_NAME}.pc
            DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/
            COMPONENT devel)
endif()

if (COMMAND tbb_gen_vars)
    tbb_gen_vars(tbb)
endif()

================================================
FILE: third-party/tbb/src/tbb/address_waiter.cpp
================================================
/*
Copyright (c) 2021 Intel Corporation

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/ #include "oneapi/tbb/detail/_utils.h" #include "governor.h" #include "concurrent_monitor.h" #include "oneapi/tbb/detail/_waitable_atomic.h" #include namespace tbb { namespace detail { namespace r1 { struct address_context { address_context() = default; address_context(void* address, std::uintptr_t context) : my_address(address), my_context(context) {} void* my_address{nullptr}; std::uintptr_t my_context{0}; }; class address_waiter : public concurrent_monitor_base { using base_type = concurrent_monitor_base; public: using base_type::base_type; /** per-thread descriptor for concurrent_monitor */ using thread_context = sleep_node; }; // 1024 is a rough estimate based on two assumptions: // 1) there are no more than 1000 threads in the application; // 2) the mutexes are optimized for short critical sections less than a couple of microseconds, // which is less than 1/1000 of a time slice. // In the worst case, we have single mutex that is locked and its thread is preempted. // Therefore, the probability of a collision while taking unrelated mutex is about 1/size of a table. 
static constexpr std::size_t num_address_waiters = 2 << 10; static_assert(std::is_standard_layout::value, "address_waiter must be with standard layout"); static address_waiter address_waiter_table[num_address_waiters]; void clear_address_waiter_table() { for (std::size_t i = 0; i < num_address_waiters; ++i) { address_waiter_table[i].destroy(); } } static address_waiter& get_address_waiter(void* address) { std::uintptr_t tag = std::uintptr_t(address); return address_waiter_table[((tag >> 5) ^ tag) % num_address_waiters]; } void wait_on_address(void* address, d1::delegate_base& predicate, std::uintptr_t context) { address_waiter& waiter = get_address_waiter(address); waiter.wait(predicate, address_context{address, context}); } void notify_by_address(void* address, std::uintptr_t target_context) { address_waiter& waiter = get_address_waiter(address); auto predicate = [address, target_context] (address_context ctx) { return ctx.my_address == address && ctx.my_context == target_context; }; waiter.notify_relaxed(predicate); } void notify_by_address_one(void* address) { address_waiter& waiter = get_address_waiter(address); auto predicate = [address] (address_context ctx) { return ctx.my_address == address; }; waiter.notify_one_relaxed(predicate); } void notify_by_address_all(void* address) { address_waiter& waiter = get_address_waiter(address); auto predicate = [address] (address_context ctx) { return ctx.my_address == address; }; waiter.notify_relaxed(predicate); } } // namespace r1 } // namespace detail } // namespace tbb ================================================ FILE: third-party/tbb/src/tbb/allocator.cpp ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "oneapi/tbb/version.h" #include "oneapi/tbb/detail/_exception.h" #include "oneapi/tbb/detail/_assert.h" #include "oneapi/tbb/detail/_utils.h" #include "oneapi/tbb/tbb_allocator.h" // Is this OK? #include "oneapi/tbb/cache_aligned_allocator.h" #include "dynamic_link.h" #include "misc.h" #include #ifdef _WIN32 #include #else #include #endif #if (!defined(_WIN32) && !defined(_WIN64)) || defined(__CYGWIN__) #include // posix_memalign, free // With glibc, uClibc and musl on Linux and bionic on Android it is safe to use memalign(), as the allocated memory // can be freed with free(). It is also better to use memalign() since posix_memalign() is just a wrapper on top of // memalign() and it offers nothing but overhead due to inconvenient interface. This is likely the case with other // standard libraries as well, and more libraries can be added to the preprocessor check below. Unfortunately, we // can't detect musl, so we simply enable memalign() on Linux and Android in general. 
#if defined(linux) || defined(__linux) || defined(__linux__) || defined(__ANDROID__) #include // memalign #define __TBB_USE_MEMALIGN #else #define __TBB_USE_POSIX_MEMALIGN #endif #elif defined(_MSC_VER) || defined(__MINGW32__) #include // _aligned_malloc, _aligned_free #define __TBB_USE_MSVC_ALIGNED_MALLOC #endif #if __TBB_WEAK_SYMBOLS_PRESENT #pragma weak scalable_malloc #pragma weak scalable_free #pragma weak scalable_aligned_malloc #pragma weak scalable_aligned_free extern "C" { void* scalable_malloc(std::size_t); void scalable_free(void*); void* scalable_aligned_malloc(std::size_t, std::size_t); void scalable_aligned_free(void*); } #endif /* __TBB_WEAK_SYMBOLS_PRESENT */ namespace tbb { namespace detail { namespace r1 { //! Initialization routine used for first indirect call via allocate_handler. static void* initialize_allocate_handler(std::size_t size); //! Handler for memory allocation using allocate_handler_type = void* (*)(std::size_t size); static std::atomic allocate_handler{ &initialize_allocate_handler }; allocate_handler_type allocate_handler_unsafe = nullptr; //! Handler for memory deallocation static void (*deallocate_handler)(void* pointer) = nullptr; //! Initialization routine used for first indirect call via cache_aligned_allocate_handler. static void* initialize_cache_aligned_allocate_handler(std::size_t n, std::size_t alignment); //! Allocates overaligned memory using standard memory allocator. It is used when scalable_allocator is not available. static void* std_cache_aligned_allocate(std::size_t n, std::size_t alignment); //! Deallocates overaligned memory using standard memory allocator. It is used when scalable_allocator is not available. static void std_cache_aligned_deallocate(void* p); //! 
Handler for padded memory allocation using cache_aligned_allocate_handler_type = void* (*)(std::size_t n, std::size_t alignment); static std::atomic cache_aligned_allocate_handler{ &initialize_cache_aligned_allocate_handler }; cache_aligned_allocate_handler_type cache_aligned_allocate_handler_unsafe = nullptr; //! Handler for padded memory deallocation static void (*cache_aligned_deallocate_handler)(void* p) = nullptr; //! Table describing how to link the handlers. static const dynamic_link_descriptor MallocLinkTable[] = { DLD(scalable_malloc, allocate_handler_unsafe), DLD(scalable_free, deallocate_handler), DLD(scalable_aligned_malloc, cache_aligned_allocate_handler_unsafe), DLD(scalable_aligned_free, cache_aligned_deallocate_handler), }; #if TBB_USE_DEBUG #define DEBUG_SUFFIX "_debug" #else #define DEBUG_SUFFIX #endif /* TBB_USE_DEBUG */ // MALLOCLIB_NAME is the name of the oneTBB memory allocator library. #if _WIN32||_WIN64 #define MALLOCLIB_NAME "tbbmalloc" DEBUG_SUFFIX ".dll" #elif __APPLE__ #define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX ".2.dylib" #elif __FreeBSD__ || __NetBSD__ || __OpenBSD__ || __sun || _AIX || __ANDROID__ #define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX ".so" #elif __unix__ // Note that order of these #elif's is important! #define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX ".so.2" #else #error Unknown OS #endif //! Initialize the allocation/free handler pointers. /** Caller is responsible for ensuring this routine is called exactly once. The routine attempts to dynamically link with the TBB memory allocator. If that allocator is not found, it links to malloc and free. */ void initialize_handler_pointers() { __TBB_ASSERT(allocate_handler == &initialize_allocate_handler, nullptr); bool success = dynamic_link(MALLOCLIB_NAME, MallocLinkTable, 4); if(!success) { // If unsuccessful, set the handlers to the default routines. 
// This must be done now, and not before FillDynamicLinks runs, because if other // threads call the handlers, we want them to go through the DoOneTimeInitializations logic, // which forces them to wait. allocate_handler_unsafe = &std::malloc; deallocate_handler = &std::free; cache_aligned_allocate_handler_unsafe = &std_cache_aligned_allocate; cache_aligned_deallocate_handler = &std_cache_aligned_deallocate; } allocate_handler.store(allocate_handler_unsafe, std::memory_order_release); cache_aligned_allocate_handler.store(cache_aligned_allocate_handler_unsafe, std::memory_order_release); PrintExtraVersionInfo( "ALLOCATOR", success?"scalable_malloc":"malloc" ); } static std::once_flag initialization_state; void initialize_cache_aligned_allocator() { std::call_once(initialization_state, &initialize_handler_pointers); } //! Executed on very first call through allocate_handler /** Only one of initialize_allocate_handler() and initialize_cache_aligned_allocate_handler() is called, since each one of them also initializes the other. In the current implementation of oneTBB library initialization, cache_aligned_allocate() is used, which in turn calls initialize_cache_aligned_allocate_handler(). As mentioned above, that also initializes the regular allocate_handler. Therefore, initialize_allocate_handler() is not called in the current library implementation. */ static void* initialize_allocate_handler(std::size_t size) { initialize_cache_aligned_allocator(); __TBB_ASSERT(allocate_handler != &initialize_allocate_handler, nullptr); return (*allocate_handler)(size); } //! 
Executed on very first call through cache_aligned_allocate_handler static void* initialize_cache_aligned_allocate_handler(std::size_t bytes, std::size_t alignment) { initialize_cache_aligned_allocator(); __TBB_ASSERT(cache_aligned_allocate_handler != &initialize_cache_aligned_allocate_handler, nullptr); return (*cache_aligned_allocate_handler)(bytes, alignment); } // TODO: use CPUID to find actual line size, though consider backward compatibility // nfs - no false sharing static constexpr std::size_t nfs_size = 128; std::size_t __TBB_EXPORTED_FUNC cache_line_size() { return nfs_size; } void* __TBB_EXPORTED_FUNC cache_aligned_allocate(std::size_t size) { const std::size_t cache_line_size = nfs_size; __TBB_ASSERT(is_power_of_two(cache_line_size), "must be power of two"); // Check for overflow if (size + cache_line_size < size) { throw_exception(exception_id::bad_alloc); } // scalable_aligned_malloc considers zero size request an error, and returns nullptr if (size == 0) size = 1; void* result = cache_aligned_allocate_handler.load(std::memory_order_acquire)(size, cache_line_size); if (!result) { throw_exception(exception_id::bad_alloc); } __TBB_ASSERT(is_aligned(result, cache_line_size), "The returned address isn't aligned"); return result; } void __TBB_EXPORTED_FUNC cache_aligned_deallocate(void* p) { __TBB_ASSERT(cache_aligned_deallocate_handler, "Initialization has not been yet."); (*cache_aligned_deallocate_handler)(p); } static void* std_cache_aligned_allocate(std::size_t bytes, std::size_t alignment) { #if defined(__TBB_USE_MEMALIGN) return memalign(alignment, bytes); #elif defined(__TBB_USE_POSIX_MEMALIGN) void* p = nullptr; int res = posix_memalign(&p, alignment, bytes); if (res != 0) p = nullptr; return p; #elif defined(__TBB_USE_MSVC_ALIGNED_MALLOC) return _aligned_malloc(bytes, alignment); #else // TODO: make it common with cache_aligned_resource std::size_t space = alignment + bytes; std::uintptr_t base = reinterpret_cast(std::malloc(space)); if (!base) { 
return nullptr; } std::uintptr_t result = (base + nfs_size) & ~(nfs_size - 1); // Round up to the next cache line (align the base address) __TBB_ASSERT((result - base) >= sizeof(std::uintptr_t), "Cannot store a base pointer to the header"); __TBB_ASSERT(space - (result - base) >= bytes, "Not enough space for the storage"); // Record where block actually starts. (reinterpret_cast(result))[-1] = base; return reinterpret_cast(result); #endif } static void std_cache_aligned_deallocate(void* p) { #if defined(__TBB_USE_MEMALIGN) || defined(__TBB_USE_POSIX_MEMALIGN) free(p); #elif defined(__TBB_USE_MSVC_ALIGNED_MALLOC) _aligned_free(p); #else if (p) { __TBB_ASSERT(reinterpret_cast(p) >= 0x4096, "attempt to free block not obtained from cache_aligned_allocator"); // Recover where block actually starts std::uintptr_t base = (reinterpret_cast(p))[-1]; __TBB_ASSERT(((base + nfs_size) & ~(nfs_size - 1)) == reinterpret_cast(p), "Incorrect alignment or not allocated by std_cache_aligned_deallocate?"); std::free(reinterpret_cast(base)); } #endif } void* __TBB_EXPORTED_FUNC allocate_memory(std::size_t size) { void* result = allocate_handler.load(std::memory_order_acquire)(size); if (!result) { throw_exception(exception_id::bad_alloc); } return result; } void __TBB_EXPORTED_FUNC deallocate_memory(void* p) { if (p) { __TBB_ASSERT(deallocate_handler, "Initialization has not been yet."); (*deallocate_handler)(p); } } bool __TBB_EXPORTED_FUNC is_tbbmalloc_used() { auto handler_snapshot = allocate_handler.load(std::memory_order_acquire); if (handler_snapshot == &initialize_allocate_handler) { initialize_cache_aligned_allocator(); } handler_snapshot = allocate_handler.load(std::memory_order_relaxed); __TBB_ASSERT(handler_snapshot != &initialize_allocate_handler && deallocate_handler != nullptr, nullptr); // Cast to void avoids type mismatch errors on some compilers (e.g. 
__IBMCPP__) __TBB_ASSERT((reinterpret_cast(handler_snapshot) == reinterpret_cast(&std::malloc)) == (reinterpret_cast(deallocate_handler) == reinterpret_cast(&std::free)), "Both shim pointers must refer to routines from the same package (either TBB or CRT)"); return reinterpret_cast(handler_snapshot) == reinterpret_cast(&std::malloc); } } // namespace r1 } // namespace detail } // namespace tbb ================================================ FILE: third-party/tbb/src/tbb/arena.cpp ================================================ /* Copyright (c) 2005-2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "task_dispatcher.h" #include "governor.h" #include "threading_control.h" #include "arena.h" #include "itt_notify.h" #include "semaphore.h" #include "waiters.h" #include "oneapi/tbb/detail/_task.h" #include "oneapi/tbb/info.h" #include "oneapi/tbb/tbb_allocator.h" #include #include #include namespace tbb { namespace detail { namespace r1 { #if __TBB_ARENA_BINDING class numa_binding_observer : public tbb::task_scheduler_observer { binding_handler* my_binding_handler; public: numa_binding_observer( d1::task_arena* ta, int num_slots, int numa_id, core_type_id core_type, int max_threads_per_core ) : task_scheduler_observer(*ta) , my_binding_handler(construct_binding_handler(num_slots, numa_id, core_type, max_threads_per_core)) {} void on_scheduler_entry( bool ) override { apply_affinity_mask(my_binding_handler, this_task_arena::current_thread_index()); } void on_scheduler_exit( bool ) override { restore_affinity_mask(my_binding_handler, this_task_arena::current_thread_index()); } ~numa_binding_observer() override{ destroy_binding_handler(my_binding_handler); } }; numa_binding_observer* construct_binding_observer( d1::task_arena* ta, int num_slots, int numa_id, core_type_id core_type, int max_threads_per_core ) { numa_binding_observer* binding_observer = nullptr; if ((core_type >= 0 && core_type_count() > 1) || (numa_id >= 0 && numa_node_count() > 1) || max_threads_per_core > 0) { binding_observer = new(allocate_memory(sizeof(numa_binding_observer))) numa_binding_observer(ta, num_slots, numa_id, core_type, max_threads_per_core); __TBB_ASSERT(binding_observer, "Failure during NUMA binding observer allocation and construction"); } return binding_observer; } void destroy_binding_observer( numa_binding_observer* binding_observer ) { __TBB_ASSERT(binding_observer, "Trying to deallocate nullptr pointer"); binding_observer->observe(false); binding_observer->~numa_binding_observer(); deallocate_memory(binding_observer); } #endif /*!__TBB_ARENA_BINDING*/ void 
arena::on_thread_leaving(unsigned ref_param) { // // Implementation of arena destruction synchronization logic contained various // bugs/flaws at the different stages of its evolution, so below is a detailed // description of the issues taken into consideration in the framework of the // current design. // // In case of using fire-and-forget tasks (scheduled via task::enqueue()) // external thread is allowed to leave its arena before all its work is executed, // and market may temporarily revoke all workers from this arena. Since revoked // workers never attempt to reset arena state to EMPTY and cancel its request // to RML for threads, the arena object is destroyed only when both the last // thread is leaving it and arena's state is EMPTY (that is its external thread // left and it does not contain any work). // Thus resetting arena to EMPTY state (as earlier TBB versions did) should not // be done here (or anywhere else in the external thread to that matter); doing so // can result either in arena's premature destruction (at least without // additional costly checks in workers) or in unnecessary arena state changes // (and ensuing workers migration). // // A worker that checks for work presence and transitions arena to the EMPTY // state (in snapshot taking procedure arena::out_of_work()) updates // arena::my_pool_state first and only then arena::my_num_workers_requested. // So the check for work absence must be done against the latter field. // // In a time window between decrementing the active threads count and checking // if there is an outstanding request for workers. New worker thread may arrive, // finish remaining work, set arena state to empty, and leave decrementing its // refcount and destroying. Then the current thread will destroy the arena // the second time. To preclude it a local copy of the outstanding request // value can be stored before decrementing active threads count. // // But this technique may cause two other problem. 
When the stored request is // zero, it is possible that arena still has threads and they can generate new // tasks and thus re-establish non-zero requests. Then all the threads can be // revoked (as described above) leaving this thread the last one, and causing // it to destroy non-empty arena. // // The other problem takes place when the stored request is non-zero. Another // thread may complete the work, set arena state to empty, and leave without // arena destruction before this thread decrements the refcount. This thread // cannot destroy the arena either. Thus the arena may be "orphaned". // // In both cases we cannot dereference arena pointer after the refcount is // decremented, as our arena may already be destroyed. // // If this is the external thread, the market is protected by refcount to it. // In case of workers market's liveness is ensured by the RML connection // rundown protocol, according to which the client (i.e. the market) lives // until RML server notifies it about connection termination, and this // notification is fired only after all workers return into RML. // // Thus if we decremented refcount to zero we ask the market to check arena // state (including the fact if it is alive) under the lock. // __TBB_ASSERT(my_references.load(std::memory_order_relaxed) >= ref_param, "broken arena reference counter"); // When there is no workers someone must free arena, as // without workers, no one calls out_of_work(). 
if (ref_param == ref_external && !my_mandatory_concurrency.test()) { out_of_work(); } threading_control* tc = my_threading_control; auto tc_client_snapshot = tc->prepare_client_destruction(my_tc_client); // Release our reference to sync with destroy_client unsigned remaining_ref = my_references.fetch_sub(ref_param, std::memory_order_release) - ref_param; // do not access `this` it might be destroyed already if (remaining_ref == 0) { if (tc->try_destroy_client(tc_client_snapshot)) { // We are requested to destroy ourself free_arena(); } } } std::size_t arena::occupy_free_slot_in_range( thread_data& tls, std::size_t lower, std::size_t upper ) { if ( lower >= upper ) return out_of_arena; // Start search for an empty slot from the one we occupied the last time std::size_t index = tls.my_arena_index; if ( index < lower || index >= upper ) index = tls.my_random.get() % (upper - lower) + lower; __TBB_ASSERT( index >= lower && index < upper, nullptr); // Find a free slot for ( std::size_t i = index; i < upper; ++i ) if (my_slots[i].try_occupy()) return i; for ( std::size_t i = lower; i < index; ++i ) if (my_slots[i].try_occupy()) return i; return out_of_arena; } template std::size_t arena::occupy_free_slot(thread_data& tls) { // Firstly, external threads try to occupy reserved slots std::size_t index = as_worker ? 
out_of_arena : occupy_free_slot_in_range( tls, 0, my_num_reserved_slots ); if ( index == out_of_arena ) { // Secondly, all threads try to occupy all non-reserved slots index = occupy_free_slot_in_range(tls, my_num_reserved_slots, my_num_slots ); // Likely this arena is already saturated if ( index == out_of_arena ) return out_of_arena; } atomic_update( my_limit, (unsigned)(index + 1), std::less() ); return index; } std::uintptr_t arena::calculate_stealing_threshold() { stack_anchor_type anchor; return r1::calculate_stealing_threshold(reinterpret_cast(&anchor), my_threading_control->worker_stack_size()); } void arena::process(thread_data& tls) { governor::set_thread_data(tls); // TODO: consider moving to create_one_job. __TBB_ASSERT( is_alive(my_guard), nullptr); __TBB_ASSERT( my_num_slots >= 1, nullptr); std::size_t index = occupy_free_slot(tls); if (index == out_of_arena) { on_thread_leaving(ref_worker); return; } __TBB_ASSERT( index >= my_num_reserved_slots, "Workers cannot occupy reserved slots" ); tls.attach_arena(*this, index); // worker thread enters the dispatch loop to look for a work tls.my_inbox.set_is_idle(true); if (tls.my_arena_slot->is_task_pool_published()) { tls.my_inbox.set_is_idle(false); } task_dispatcher& task_disp = tls.my_arena_slot->default_task_dispatcher(); tls.enter_task_dispatcher(task_disp, calculate_stealing_threshold()); __TBB_ASSERT(task_disp.can_steal(), nullptr); __TBB_ASSERT( !tls.my_last_observer, "There cannot be notified local observers when entering arena" ); my_observers.notify_entry_observers(tls.my_last_observer, tls.my_is_worker); // Waiting on special object tied to this arena outermost_worker_waiter waiter(*this); d1::task* t = tls.my_task_dispatcher->local_wait_for_all(nullptr, waiter); // For purposes of affinity support, the slot's mailbox is considered idle while no thread is // attached to it. 
tls.my_inbox.set_is_idle(true); __TBB_ASSERT_EX(t == nullptr, "Outermost worker must not leave dispatch loop with a task"); __TBB_ASSERT(governor::is_thread_data_set(&tls), nullptr); __TBB_ASSERT(tls.my_task_dispatcher == &task_disp, nullptr); my_observers.notify_exit_observers(tls.my_last_observer, tls.my_is_worker); tls.my_last_observer = nullptr; tls.leave_task_dispatcher(); // Arena slot detach (arena may be used in market::process) // TODO: Consider moving several calls below into a new method(e.g.detach_arena). tls.my_arena_slot->release(); tls.my_arena_slot = nullptr; tls.my_inbox.detach(); __TBB_ASSERT(tls.my_inbox.is_idle_state(true), nullptr); __TBB_ASSERT(is_alive(my_guard), nullptr); // In contrast to earlier versions of TBB (before 3.0 U5) now it is possible // that arena may be temporarily left unpopulated by threads. See comments in // arena::on_thread_leaving() for more details. on_thread_leaving(ref_worker); __TBB_ASSERT(tls.my_arena == this, "my_arena is used as a hint when searching the arena to join"); } arena::arena(threading_control* control, unsigned num_slots, unsigned num_reserved_slots, unsigned priority_level #if __TBB_PREVIEW_PARALLEL_PHASE , tbb::task_arena::leave_policy lp #endif ) { __TBB_ASSERT( !my_guard, "improperly allocated arena?" ); __TBB_ASSERT( sizeof(my_slots[0]) % cache_line_size()==0, "arena::slot size not multiple of cache line size" ); __TBB_ASSERT( is_aligned(this, cache_line_size()), "arena misaligned" ); my_threading_control = control; my_limit = 1; // Two slots are mandatory: for the external thread, and for 1 worker (required to support starvation resistant tasks). 
my_num_slots = num_arena_slots(num_slots, num_reserved_slots); my_num_reserved_slots = num_reserved_slots; my_max_num_workers = num_slots-num_reserved_slots; my_priority_level = priority_level; my_references = ref_external; // accounts for the external thread my_observers.my_arena = this; my_co_cache.init(4 * num_slots); __TBB_ASSERT ( my_max_num_workers <= my_num_slots, nullptr); // Initialize the default context. It should be allocated before task_dispatch construction. my_default_ctx = new (cache_aligned_allocate(sizeof(d1::task_group_context))) d1::task_group_context{ d1::task_group_context::isolated, d1::task_group_context::fp_settings }; // Construct slots. Mark internal synchronization elements for the tools. task_dispatcher* base_td_pointer = reinterpret_cast(my_slots + my_num_slots); for( unsigned i = 0; i < my_num_slots; ++i ) { // __TBB_ASSERT( !my_slots[i].my_scheduler && !my_slots[i].task_pool, nullptr); __TBB_ASSERT( !my_slots[i].task_pool_ptr, nullptr); __TBB_ASSERT( !my_slots[i].my_task_pool_size, nullptr); mailbox(i).construct(); my_slots[i].init_task_streams(i); my_slots[i].my_default_task_dispatcher = new(base_td_pointer + i) task_dispatcher(this); my_slots[i].my_is_occupied.store(false, std::memory_order_relaxed); } my_fifo_task_stream.initialize(my_num_slots); my_resume_task_stream.initialize(my_num_slots); #if __TBB_PREVIEW_CRITICAL_TASKS my_critical_task_stream.initialize(my_num_slots); #endif my_mandatory_requests = 0; #if __TBB_PREVIEW_PARALLEL_PHASE my_thread_leave.set_initial_state(lp); #endif } arena& arena::allocate_arena(threading_control* control, unsigned num_slots, unsigned num_reserved_slots, unsigned priority_level #if __TBB_PREVIEW_PARALLEL_PHASE , tbb::task_arena::leave_policy lp #endif ) { __TBB_ASSERT( sizeof(base_type) + sizeof(arena_slot) == sizeof(arena), "All arena data fields must go to arena_base" ); __TBB_ASSERT( sizeof(base_type) % cache_line_size() == 0, "arena slots area misaligned: wrong padding" ); __TBB_ASSERT( 
sizeof(mail_outbox) == max_nfs_size, "Mailbox padding is wrong" ); std::size_t n = allocation_size(num_arena_slots(num_slots, num_reserved_slots)); unsigned char* storage = (unsigned char*)cache_aligned_allocate(n); // Zero all slots to indicate that they are empty std::memset( storage, 0, n ); return *new( storage + num_arena_slots(num_slots, num_reserved_slots) * sizeof(mail_outbox) ) arena(control, num_slots, num_reserved_slots, priority_level #if __TBB_PREVIEW_PARALLEL_PHASE , lp #endif ); } void arena::free_arena () { __TBB_ASSERT( is_alive(my_guard), nullptr); __TBB_ASSERT( !my_references.load(std::memory_order_relaxed), "There are threads in the dying arena" ); __TBB_ASSERT( !my_total_num_workers_requested && !my_num_workers_allotted, "Dying arena requests workers" ); __TBB_ASSERT( is_empty(), "Inconsistent state of a dying arena" ); #if __TBB_ARENA_BINDING if (my_numa_binding_observer != nullptr) { destroy_binding_observer(my_numa_binding_observer); my_numa_binding_observer = nullptr; } #endif /*__TBB_ARENA_BINDING*/ poison_value( my_guard ); for ( unsigned i = 0; i < my_num_slots; ++i ) { // __TBB_ASSERT( !my_slots[i].my_scheduler, "arena slot is not empty" ); // TODO: understand the assertion and modify // __TBB_ASSERT( my_slots[i].task_pool == EmptyTaskPool, nullptr); __TBB_ASSERT( my_slots[i].head == my_slots[i].tail, nullptr); // TODO: replace by is_quiescent_local_task_pool_empty my_slots[i].free_task_pool(); mailbox(i).drain(); my_slots[i].my_default_task_dispatcher->~task_dispatcher(); } __TBB_ASSERT(my_fifo_task_stream.empty(), "Not all enqueued tasks were executed"); __TBB_ASSERT(my_resume_task_stream.empty(), "Not all enqueued tasks were executed"); // Cleanup coroutines/schedulers cache my_co_cache.cleanup(); my_default_ctx->~task_group_context(); cache_aligned_deallocate(my_default_ctx); #if __TBB_PREVIEW_CRITICAL_TASKS __TBB_ASSERT( my_critical_task_stream.empty(), "Not all critical tasks were executed"); #endif // Clear enfources 
synchronization with observe(false) my_observers.clear(); void* storage = &mailbox(my_num_slots-1); __TBB_ASSERT( my_references.load(std::memory_order_relaxed) == 0, nullptr); this->~arena(); #if TBB_USE_ASSERT > 1 std::memset( storage, 0, allocation_size(my_num_slots) ); #endif /* TBB_USE_ASSERT */ cache_aligned_deallocate( storage ); } bool arena::has_enqueued_tasks() { return !my_fifo_task_stream.empty(); } void arena::request_workers(int mandatory_delta, int workers_delta, bool wakeup_threads) { my_threading_control->adjust_demand(my_tc_client, mandatory_delta, workers_delta); if (wakeup_threads) { // Notify all sleeping threads that work has appeared in the arena. get_waiting_threads_monitor().notify([&] (market_context context) { return this == context.my_arena_addr; }); } } bool arena::has_tasks() { // TODO: rework it to return at least a hint about where a task was found; better if the task itself. std::size_t n = my_limit.load(std::memory_order_acquire); bool tasks_are_available = false; for (std::size_t k = 0; k < n && !tasks_are_available; ++k) { tasks_are_available = !my_slots[k].is_empty(); } tasks_are_available = tasks_are_available || has_enqueued_tasks() || !my_resume_task_stream.empty(); #if __TBB_PREVIEW_CRITICAL_TASKS tasks_are_available = tasks_are_available || !my_critical_task_stream.empty(); #endif return tasks_are_available; } void arena::out_of_work() { // We should try unset my_pool_state first due to keep arena invariants in consistent state // Otherwise, we might have my_pool_state = false and my_mandatory_concurrency = true that is broken invariant bool disable_mandatory = my_mandatory_concurrency.try_clear_if([this] { return !has_enqueued_tasks(); }); bool release_workers = my_pool_state.try_clear_if([this] { return !has_tasks(); }); if (disable_mandatory || release_workers) { int mandatory_delta = disable_mandatory ? -1 : 0; int workers_delta = release_workers ? 
-(int)my_max_num_workers : 0; if (disable_mandatory && is_arena_workerless()) { // We had set workers_delta to 1 when enabled mandatory concurrency, so revert it now workers_delta = -1; } request_workers(mandatory_delta, workers_delta); } } void arena::set_top_priority(bool is_top_priority) { my_is_top_priority.store(is_top_priority, std::memory_order_relaxed); } bool arena::is_top_priority() const { return my_is_top_priority.load(std::memory_order_relaxed); } bool arena::try_join() { if (is_joinable()) { my_references += arena::ref_worker; return true; } return false; } void arena::set_allotment(unsigned allotment) { if (my_num_workers_allotted.load(std::memory_order_relaxed) != allotment) { my_num_workers_allotted.store(allotment, std::memory_order_relaxed); } } int arena::update_concurrency(unsigned allotment) { int delta = allotment - my_num_workers_allotted.load(std::memory_order_relaxed); if (delta != 0) { my_num_workers_allotted.store(allotment, std::memory_order_relaxed); } return delta; } std::pair arena::update_request(int mandatory_delta, int workers_delta) { __TBB_ASSERT(-1 <= mandatory_delta && mandatory_delta <= 1, nullptr); int min_workers_request = 0; int max_workers_request = 0; // Calculate min request my_mandatory_requests += mandatory_delta; min_workers_request = my_mandatory_requests > 0 ? 1 : 0; // Calculate max request my_total_num_workers_requested += workers_delta; // Clamp worker request into interval [0, my_max_num_workers] max_workers_request = clamp(my_total_num_workers_requested, 0, min_workers_request > 0 && is_arena_workerless() ? 
1 : (int)my_max_num_workers); return { min_workers_request, max_workers_request }; } thread_control_monitor& arena::get_waiting_threads_monitor() { return my_threading_control->get_waiting_threads_monitor(); } void arena::enqueue_task(d1::task& t, d1::task_group_context& ctx, thread_data& td) { task_group_context_impl::bind_to(ctx, &td); task_accessor::context(t) = &ctx; task_accessor::isolation(t) = no_isolation; my_fifo_task_stream.push( &t, random_lane_selector(td.my_random) ); advertise_new_work(); } arena &arena::create(threading_control *control, unsigned num_slots, unsigned num_reserved_slots, unsigned arena_priority_level, d1::constraints constraints #if __TBB_PREVIEW_PARALLEL_PHASE , tbb::task_arena::leave_policy lp #endif ) { __TBB_ASSERT(num_slots > 0, NULL); __TBB_ASSERT(num_reserved_slots <= num_slots, NULL); // Add public market reference for an external thread/task_arena (that adds an internal reference in exchange). arena& a = arena::allocate_arena(control, num_slots, num_reserved_slots, arena_priority_level #if __TBB_PREVIEW_PARALLEL_PHASE , lp #endif ); a.my_tc_client = control->create_client(a); // We should not publish arena until all fields are initialized control->publish_client(a.my_tc_client, constraints); return a; } } // namespace r1 } // namespace detail } // namespace tbb // Enable task_arena.h #include "oneapi/tbb/task_arena.h" // task_arena_base namespace tbb { namespace detail { namespace r1 { #if TBB_USE_ASSERT void assert_arena_priority_valid( tbb::task_arena::priority a_priority ) { bool is_arena_priority_correct = a_priority == tbb::task_arena::priority::high || a_priority == tbb::task_arena::priority::normal || a_priority == tbb::task_arena::priority::low; __TBB_ASSERT( is_arena_priority_correct, "Task arena priority should be equal to one of the predefined values." 
); } #else void assert_arena_priority_valid( tbb::task_arena::priority ) {} #endif unsigned arena_priority_level( tbb::task_arena::priority a_priority ) { assert_arena_priority_valid( a_priority ); return d1::num_priority_levels - unsigned(int(a_priority) / d1::priority_stride); } tbb::task_arena::priority arena_priority( unsigned priority_level ) { auto priority = tbb::task_arena::priority( (d1::num_priority_levels - priority_level) * d1::priority_stride ); assert_arena_priority_valid( priority ); return priority; } struct task_arena_impl { static void initialize(d1::task_arena_base&); static void terminate(d1::task_arena_base&); static bool attach(d1::task_arena_base&); static void execute(d1::task_arena_base&, d1::delegate_base&); static void wait(d1::task_arena_base&); static int max_concurrency(const d1::task_arena_base*); static void enqueue(d1::task&, d1::task_group_context*, d1::task_arena_base*); static d1::slot_id execution_slot(const d1::task_arena_base&); static void enter_parallel_phase(d1::task_arena_base*, std::uintptr_t); static void exit_parallel_phase(d1::task_arena_base*, std::uintptr_t); }; void __TBB_EXPORTED_FUNC initialize(d1::task_arena_base& ta) { task_arena_impl::initialize(ta); } void __TBB_EXPORTED_FUNC terminate(d1::task_arena_base& ta) { task_arena_impl::terminate(ta); } bool __TBB_EXPORTED_FUNC attach(d1::task_arena_base& ta) { return task_arena_impl::attach(ta); } void __TBB_EXPORTED_FUNC execute(d1::task_arena_base& ta, d1::delegate_base& d) { task_arena_impl::execute(ta, d); } void __TBB_EXPORTED_FUNC wait(d1::task_arena_base& ta) { task_arena_impl::wait(ta); } int __TBB_EXPORTED_FUNC max_concurrency(const d1::task_arena_base* ta) { return task_arena_impl::max_concurrency(ta); } void __TBB_EXPORTED_FUNC enqueue(d1::task& t, d1::task_arena_base* ta) { task_arena_impl::enqueue(t, nullptr, ta); } void __TBB_EXPORTED_FUNC enqueue(d1::task& t, d1::task_group_context& ctx, d1::task_arena_base* ta) { task_arena_impl::enqueue(t, &ctx, ta); 
} d1::slot_id __TBB_EXPORTED_FUNC execution_slot(const d1::task_arena_base& arena) { return task_arena_impl::execution_slot(arena); } void __TBB_EXPORTED_FUNC enter_parallel_phase(d1::task_arena_base* ta, std::uintptr_t flags) { task_arena_impl::enter_parallel_phase(ta, flags); } void __TBB_EXPORTED_FUNC exit_parallel_phase(d1::task_arena_base* ta, std::uintptr_t flags) { task_arena_impl::exit_parallel_phase(ta, flags); } void task_arena_impl::initialize(d1::task_arena_base& ta) { // Enforce global market initialization to properly initialize soft limit (void)governor::get_thread_data(); d1::constraints arena_constraints; #if __TBB_ARENA_BINDING arena_constraints = d1::constraints{} .set_core_type(ta.core_type()) .set_max_threads_per_core(ta.max_threads_per_core()) .set_numa_id(ta.my_numa_id); #endif /*__TBB_ARENA_BINDING*/ if (ta.my_max_concurrency < 1) { #if __TBB_ARENA_BINDING ta.my_max_concurrency = (int)default_concurrency(arena_constraints); #else /*!__TBB_ARENA_BINDING*/ ta.my_max_concurrency = (int)governor::default_num_threads(); #endif /*!__TBB_ARENA_BINDING*/ } #if __TBB_CPUBIND_PRESENT numa_binding_observer* observer = construct_binding_observer( static_cast(&ta), arena::num_arena_slots(ta.my_max_concurrency, ta.my_num_reserved_slots), ta.my_numa_id, ta.core_type(), ta.max_threads_per_core()); if (observer) { // TODO: Consider lazy initialization for internal arena so // the direct calls to observer might be omitted until actual initialization. 
observer->on_scheduler_entry(true); } #endif /*__TBB_CPUBIND_PRESENT*/ __TBB_ASSERT(ta.my_arena.load(std::memory_order_relaxed) == nullptr, "Arena already initialized"); unsigned priority_level = arena_priority_level(ta.my_priority); threading_control* thr_control = threading_control::register_public_reference(); arena& a = arena::create(thr_control, unsigned(ta.my_max_concurrency), ta.my_num_reserved_slots, priority_level, arena_constraints #if __TBB_PREVIEW_PARALLEL_PHASE , ta.get_leave_policy() #endif ); ta.my_arena.store(&a, std::memory_order_release); #if __TBB_CPUBIND_PRESENT a.my_numa_binding_observer = observer; if (observer) { observer->on_scheduler_exit(true); observer->observe(true); } #endif /*__TBB_CPUBIND_PRESENT*/ } void task_arena_impl::terminate(d1::task_arena_base& ta) { arena* a = ta.my_arena.load(std::memory_order_relaxed); assert_pointer_valid(a); threading_control::unregister_public_reference(/*blocking_terminate=*/false); a->on_thread_leaving(arena::ref_external); ta.my_arena.store(nullptr, std::memory_order_relaxed); } bool task_arena_impl::attach(d1::task_arena_base& ta) { __TBB_ASSERT(!ta.my_arena.load(std::memory_order_relaxed), nullptr); thread_data* td = governor::get_thread_data_if_initialized(); if( td && td->my_arena ) { arena* a = td->my_arena; // There is an active arena to attach to. // It's still used by s, so won't be destroyed right away. 
__TBB_ASSERT(a->my_references > 0, nullptr); a->my_references += arena::ref_external; ta.my_num_reserved_slots = a->my_num_reserved_slots; ta.my_priority = arena_priority(a->my_priority_level); ta.my_max_concurrency = ta.my_num_reserved_slots + a->my_max_num_workers; __TBB_ASSERT(arena::num_arena_slots(ta.my_max_concurrency, ta.my_num_reserved_slots) == a->my_num_slots, nullptr); ta.my_arena.store(a, std::memory_order_release); // increases threading_control's ref count for task_arena threading_control::register_public_reference(); return true; } return false; } void task_arena_impl::enqueue(d1::task& t, d1::task_group_context* c, d1::task_arena_base* ta) { thread_data* td = governor::get_thread_data(); // thread data is only needed for FastRandom instance assert_pointer_valid(td, "thread_data pointer should not be null"); arena* a = ta ? ta->my_arena.load(std::memory_order_relaxed) : td->my_arena ; assert_pointer_valid(a, "arena pointer should not be null"); auto* ctx = c ? c : a->my_default_ctx; assert_pointer_valid(ctx, "context pointer should not be null"); // Is there a better place for checking the state of ctx? 
__TBB_ASSERT(!a->my_default_ctx->is_group_execution_cancelled(), "The task will not be executed because its task_group_context is cancelled."); a->enqueue_task(t, *ctx, *td); } d1::slot_id task_arena_impl::execution_slot(const d1::task_arena_base& ta) { thread_data* td = governor::get_thread_data_if_initialized(); if (td && (td->is_attached_to(ta.my_arena.load(std::memory_order_relaxed)))) { return td->my_arena_index; } return d1::slot_id(-1); } class nested_arena_context : no_copy { public: nested_arena_context(thread_data& td, arena& nested_arena, std::size_t slot_index) : m_orig_execute_data_ext(td.my_task_dispatcher->m_execute_data_ext) { if (td.my_arena != &nested_arena) { m_orig_arena = td.my_arena; m_orig_slot_index = td.my_arena_index; m_orig_last_observer = td.my_last_observer; m_orig_is_thread_registered = td.my_is_registered; td.detach_task_dispatcher(); td.attach_arena(nested_arena, slot_index); td.my_is_registered = false; if (td.my_inbox.is_idle_state(true)) td.my_inbox.set_is_idle(false); task_dispatcher& task_disp = td.my_arena_slot->default_task_dispatcher(); td.enter_task_dispatcher(task_disp, m_orig_execute_data_ext.task_disp->m_stealing_threshold); // If the calling thread occupies the slots out of external thread reserve we need to notify the // market that this arena requires one worker less. if (td.my_arena_index >= td.my_arena->my_num_reserved_slots) { td.my_arena->request_workers(/* mandatory_delta = */ 0, /* workers_delta = */ -1); } td.my_last_observer = nullptr; // The task_arena::execute method considers each calling thread as an external thread. 
td.my_arena->my_observers.notify_entry_observers(td.my_last_observer, /* worker*/false); } m_task_dispatcher = td.my_task_dispatcher; m_orig_fifo_tasks_allowed = m_task_dispatcher->allow_fifo_task(true); m_orig_critical_task_allowed = m_task_dispatcher->m_properties.critical_task_allowed; m_task_dispatcher->m_properties.critical_task_allowed = true; execution_data_ext& ed_ext = td.my_task_dispatcher->m_execute_data_ext; ed_ext.context = td.my_arena->my_default_ctx; ed_ext.original_slot = td.my_arena_index; ed_ext.affinity_slot = d1::no_slot; ed_ext.task_disp = td.my_task_dispatcher; ed_ext.isolation = no_isolation; __TBB_ASSERT(td.my_arena_slot, nullptr); __TBB_ASSERT(td.my_arena_slot->is_occupied(), nullptr); __TBB_ASSERT(td.my_task_dispatcher, nullptr); } ~nested_arena_context() { thread_data& td = *m_task_dispatcher->m_thread_data; __TBB_ASSERT(governor::is_thread_data_set(&td), nullptr); m_task_dispatcher->allow_fifo_task(m_orig_fifo_tasks_allowed); m_task_dispatcher->m_properties.critical_task_allowed = m_orig_critical_task_allowed; if (m_orig_arena) { td.my_arena->my_observers.notify_exit_observers(td.my_last_observer, /*worker*/ false); td.my_last_observer = m_orig_last_observer; // Notify the market that this thread releasing a one slot // that can be used by a worker thread. if (td.my_arena_index >= td.my_arena->my_num_reserved_slots) { td.my_arena->request_workers(/* mandatory_delta = */ 0, /* workers_delta = */ 1); } td.leave_task_dispatcher(); td.my_arena_slot->release(); td.my_arena->my_exit_monitors.notify_one(); // do not relax! 
td.my_is_registered = m_orig_is_thread_registered; td.attach_arena(*m_orig_arena, m_orig_slot_index); td.attach_task_dispatcher(*m_orig_execute_data_ext.task_disp); __TBB_ASSERT(td.my_inbox.is_idle_state(false), nullptr); } td.my_task_dispatcher->m_execute_data_ext = m_orig_execute_data_ext; } private: execution_data_ext m_orig_execute_data_ext{}; arena* m_orig_arena{ nullptr }; observer_proxy* m_orig_last_observer{ nullptr }; task_dispatcher* m_task_dispatcher{ nullptr }; unsigned m_orig_slot_index{}; bool m_orig_fifo_tasks_allowed{}; bool m_orig_critical_task_allowed{}; bool m_orig_is_thread_registered{}; }; class delegated_task : public d1::task { d1::delegate_base& m_delegate; concurrent_monitor& m_monitor; d1::wait_context& m_wait_ctx; std::atomic m_completed; d1::task* execute(d1::execution_data& ed) override { const execution_data_ext& ed_ext = static_cast(ed); execution_data_ext orig_execute_data_ext = ed_ext.task_disp->m_execute_data_ext; __TBB_ASSERT(&ed_ext.task_disp->m_execute_data_ext == &ed, "The execute data shall point to the current task dispatcher execute data"); __TBB_ASSERT(ed_ext.task_disp->m_execute_data_ext.isolation == no_isolation, nullptr); ed_ext.task_disp->m_execute_data_ext.context = ed_ext.task_disp->get_thread_data().my_arena->my_default_ctx; bool fifo_task_allowed = ed_ext.task_disp->allow_fifo_task(true); try_call([&] { m_delegate(); }).on_completion([&] { ed_ext.task_disp->m_execute_data_ext = orig_execute_data_ext; ed_ext.task_disp->allow_fifo_task(fifo_task_allowed); }); finalize(); return nullptr; } d1::task* cancel(d1::execution_data&) override { finalize(); return nullptr; } void finalize() { m_wait_ctx.release(); // must precede the wakeup m_monitor.notify([this] (std::uintptr_t ctx) { return ctx == std::uintptr_t(&m_delegate); }); // do not relax, it needs a fence! 
m_completed.store(true, std::memory_order_release); } public: delegated_task(d1::delegate_base& d, concurrent_monitor& s, d1::wait_context& wo) : m_delegate(d), m_monitor(s), m_wait_ctx(wo), m_completed{ false }{} ~delegated_task() override { // The destructor can be called earlier than the m_monitor is notified // because the waiting thread can be released after m_wait_ctx.release_wait. // To close that race we wait for the m_completed signal. spin_wait_until_eq(m_completed, true); } }; void task_arena_impl::execute(d1::task_arena_base& ta, d1::delegate_base& d) { arena* a = ta.my_arena.load(std::memory_order_relaxed); __TBB_ASSERT(a != nullptr, nullptr); thread_data* td = governor::get_thread_data(); bool same_arena = td->my_arena == a; std::size_t index1 = td->my_arena_index; if (!same_arena) { index1 = a->occupy_free_slot(*td); if (index1 == arena::out_of_arena) { concurrent_monitor::thread_context waiter((std::uintptr_t)&d); d1::wait_context wo(1); d1::task_group_context exec_context(d1::task_group_context::isolated); task_group_context_impl::copy_fp_settings(exec_context, *a->my_default_ctx); delegated_task dt(d, a->my_exit_monitors, wo); a->enqueue_task( dt, exec_context, *td); size_t index2 = arena::out_of_arena; do { a->my_exit_monitors.prepare_wait(waiter); if (!wo.continue_execution()) { a->my_exit_monitors.cancel_wait(waiter); break; } index2 = a->occupy_free_slot(*td); if (index2 != arena::out_of_arena) { a->my_exit_monitors.cancel_wait(waiter); nested_arena_context scope(*td, *a, index2 ); r1::wait(wo, exec_context); __TBB_ASSERT(!exec_context.my_exception.load(std::memory_order_relaxed), nullptr); // exception can be thrown above, not deferred break; } a->my_exit_monitors.commit_wait(waiter); } while (wo.continue_execution()); if (index2 == arena::out_of_arena) { // notify a waiting thread even if this thread did not enter arena, // in case it was woken by a leaving thread but did not need to enter a->my_exit_monitors.notify_one(); // do not relax! 
} // process possible exception auto exception = exec_context.my_exception.load(std::memory_order_acquire); if (exception) { __TBB_ASSERT(exec_context.is_group_execution_cancelled(), "The task group context with an exception should be canceled."); exception->throw_self(); } __TBB_ASSERT(governor::is_thread_data_set(td), nullptr); return; } // if (index1 == arena::out_of_arena) } // if (!same_arena) context_guard_helper context_guard; context_guard.set_ctx(a->my_default_ctx); nested_arena_context scope(*td, *a, index1); #if _WIN64 try { #endif d(); __TBB_ASSERT(same_arena || governor::is_thread_data_set(td), nullptr); #if _WIN64 } catch (...) { context_guard.restore_default(); throw; } #endif } void task_arena_impl::wait(d1::task_arena_base& ta) { arena* a = ta.my_arena.load(std::memory_order_relaxed); __TBB_ASSERT(a != nullptr, nullptr); thread_data* td = governor::get_thread_data(); __TBB_ASSERT_EX(td, "Scheduler is not initialized"); __TBB_ASSERT(td->my_arena != a || td->my_arena_index == 0, "internal_wait is not supported within a worker context" ); if (a->my_max_num_workers != 0) { while (a->num_workers_active() || !a->is_empty()) { yield(); } } } int task_arena_impl::max_concurrency(const d1::task_arena_base *ta) { arena* a = nullptr; if( ta ) // for special cases of ta->max_concurrency() a = ta->my_arena.load(std::memory_order_relaxed); else if( thread_data* td = governor::get_thread_data_if_initialized() ) a = td->my_arena; // the current arena if any if( a ) { // Get parameters from the arena __TBB_ASSERT( !ta || ta->my_max_concurrency==1, nullptr); int mandatory_worker = 0; if (a->is_arena_workerless() && a->my_num_reserved_slots == 1) { mandatory_worker = a->my_mandatory_concurrency.test() ? 
1 : 0; } return a->my_num_reserved_slots + a->my_max_num_workers + mandatory_worker; } if (ta && ta->my_max_concurrency == 1) { return 1; } #if __TBB_ARENA_BINDING if (ta) { d1::constraints arena_constraints = d1::constraints{} .set_numa_id(ta->my_numa_id) .set_core_type(ta->core_type()) .set_max_threads_per_core(ta->max_threads_per_core()); return (int)default_concurrency(arena_constraints); } #endif /*!__TBB_ARENA_BINDING*/ __TBB_ASSERT(!ta || ta->my_max_concurrency==d1::task_arena_base::automatic, nullptr); return int(governor::default_num_threads()); } #if __TBB_PREVIEW_PARALLEL_PHASE void task_arena_impl::enter_parallel_phase(d1::task_arena_base* ta, std::uintptr_t /*reserved*/) { arena* a = ta ? ta->my_arena.load(std::memory_order_relaxed) : governor::get_thread_data()->my_arena; __TBB_ASSERT(a, nullptr); a->my_thread_leave.register_parallel_phase(); a->advertise_new_work(); } void task_arena_impl::exit_parallel_phase(d1::task_arena_base* ta, std::uintptr_t flags) { arena* a = ta ? ta->my_arena.load(std::memory_order_relaxed) : governor::get_thread_data()->my_arena; __TBB_ASSERT(a, nullptr); a->my_thread_leave.unregister_parallel_phase(/*with_fast_leave=*/static_cast(flags)); } #endif void isolate_within_arena(d1::delegate_base& d, std::intptr_t isolation) { // TODO: Decide what to do if the scheduler is not initialized. Is there a use case for it? thread_data* tls = governor::get_thread_data(); assert_pointers_valid(tls, tls->my_task_dispatcher); task_dispatcher* dispatcher = tls->my_task_dispatcher; isolation_type previous_isolation = dispatcher->m_execute_data_ext.isolation; try_call([&] { // We temporarily change the isolation tag of the currently running task. It will be restored in the destructor of the guard. isolation_type current_isolation = isolation ? 
isolation : reinterpret_cast(&d); // Save the current isolation value and set new one previous_isolation = dispatcher->set_isolation(current_isolation); // Isolation within this callable d(); }).on_completion([&] { __TBB_ASSERT(governor::get_thread_data()->my_task_dispatcher == dispatcher, nullptr); dispatcher->set_isolation(previous_isolation); }); } } // namespace r1 } // namespace detail } // namespace tbb ================================================ FILE: third-party/tbb/src/tbb/arena.h ================================================ /* Copyright (c) 2005-2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef _TBB_arena_H #define _TBB_arena_H #include #include #include "oneapi/tbb/detail/_task.h" #include "oneapi/tbb/detail/_utils.h" #include "oneapi/tbb/spin_mutex.h" #include "scheduler_common.h" #include "intrusive_list.h" #include "task_stream.h" #include "arena_slot.h" #include "rml_tbb.h" #include "mailbox.h" #include "governor.h" #include "concurrent_monitor.h" #include "observer_proxy.h" #include "thread_control_monitor.h" #include "threading_control_client.h" namespace tbb { namespace detail { namespace r1 { class task_dispatcher; class task_group_context; class threading_control; class allocate_root_with_context_proxy; #if __TBB_ARENA_BINDING class numa_binding_observer; #endif /*__TBB_ARENA_BINDING*/ //! Bounded coroutines cache LIFO ring buffer class arena_co_cache { //! Ring buffer storage task_dispatcher** my_co_scheduler_cache; //! 
Current cache index unsigned my_head; //! Cache capacity for arena unsigned my_max_index; //! Accessor lock for modification operations tbb::spin_mutex my_co_cache_mutex; unsigned next_index() { return ( my_head == my_max_index ) ? 0 : my_head + 1; } unsigned prev_index() { return ( my_head == 0 ) ? my_max_index : my_head - 1; } bool internal_empty() { return my_co_scheduler_cache[prev_index()] == nullptr; } void internal_task_dispatcher_cleanup(task_dispatcher* to_cleanup) { to_cleanup->~task_dispatcher(); cache_aligned_deallocate(to_cleanup); } public: void init(unsigned cache_capacity) { std::size_t alloc_size = cache_capacity * sizeof(task_dispatcher*); my_co_scheduler_cache = (task_dispatcher**)cache_aligned_allocate(alloc_size); std::memset( my_co_scheduler_cache, 0, alloc_size ); my_head = 0; my_max_index = cache_capacity - 1; } void cleanup() { while (task_dispatcher* to_cleanup = pop()) { internal_task_dispatcher_cleanup(to_cleanup); } cache_aligned_deallocate(my_co_scheduler_cache); } //! Insert scheduler to the current available place. //! Replace an old value, if necessary. void push(task_dispatcher* s) { task_dispatcher* to_cleanup = nullptr; { tbb::spin_mutex::scoped_lock lock(my_co_cache_mutex); // Check if we are replacing some existing buffer entrance if (my_co_scheduler_cache[my_head] != nullptr) { to_cleanup = my_co_scheduler_cache[my_head]; } // Store the cached value my_co_scheduler_cache[my_head] = s; // Move head index to the next slot my_head = next_index(); } // Cleanup replaced buffer if any if (to_cleanup) { internal_task_dispatcher_cleanup(to_cleanup); } } //! 
Get a cached scheduler if any task_dispatcher* pop() { tbb::spin_mutex::scoped_lock lock(my_co_cache_mutex); // No cached coroutine if (internal_empty()) { return nullptr; } // Move head index to the currently available value my_head = prev_index(); // Retrieve the value from the buffer task_dispatcher* to_return = my_co_scheduler_cache[my_head]; // Clear the previous entrance value my_co_scheduler_cache[my_head] = nullptr; return to_return; } }; struct stack_anchor_type { stack_anchor_type() = default; stack_anchor_type(const stack_anchor_type&) = delete; }; class atomic_flag { static const std::uintptr_t SET = 1; static const std::uintptr_t UNSET = 0; std::atomic my_state{UNSET}; public: bool test_and_set() { std::uintptr_t state = my_state.load(std::memory_order_acquire); switch (state) { case SET: return false; default: /* busy */ if (my_state.compare_exchange_strong(state, SET)) { // We interrupted clear transaction return false; } if (state != UNSET) { // We lost our epoch return false; } // We are too late but still in the same epoch __TBB_fallthrough; case UNSET: return my_state.compare_exchange_strong(state, SET); } } template bool try_clear_if(Pred&& pred) { std::uintptr_t busy = std::uintptr_t(&busy); std::uintptr_t state = my_state.load(std::memory_order_acquire); if (state == SET && my_state.compare_exchange_strong(state, busy)) { if (pred()) { return my_state.compare_exchange_strong(busy, UNSET); } // The result of the next operation is discarded, always false should be returned. 
my_state.compare_exchange_strong(busy, SET); } return false; } bool test(std::memory_order order = std::memory_order_acquire) { return my_state.load(order) != UNSET; } }; #if __TBB_PREVIEW_PARALLEL_PHASE class thread_leave_manager { static const std::uintptr_t DELAYED_LEAVE = 0; static const std::uintptr_t FAST_LEAVE = 1; static const std::uintptr_t ONE_TIME_FAST_LEAVE = 1 << 1; static const std::uintptr_t PARALLEL_PHASE = 1 << 2; std::atomic my_state{UINTPTR_MAX}; public: // This method is not thread-safe! // Required to be called after construction to set initial state of the state machine. void set_initial_state(tbb::task_arena::leave_policy lp) { if (lp == tbb::task_arena::leave_policy::automatic) { std::uintptr_t platform_policy = governor::hybrid_cpu() ? FAST_LEAVE : DELAYED_LEAVE; my_state.store(platform_policy, std::memory_order_relaxed); } else { __TBB_ASSERT(lp == tbb::task_arena::leave_policy::fast, "Was the new value introduced for leave policy?"); my_state.store(FAST_LEAVE, std::memory_order_relaxed); } } void reset_if_needed() { std::uintptr_t curr = my_state.load(std::memory_order_relaxed); if (curr == ONE_TIME_FAST_LEAVE) { // Potentially can override decision of the parallel phase from future epoch // but it is not a problem because it does not violate the correctness my_state.fetch_and(~ONE_TIME_FAST_LEAVE); } } // Indicate start of parallel phase in the state machine void register_parallel_phase() { __TBB_ASSERT(my_state.load(std::memory_order_relaxed) != UINTPTR_MAX, "The initial state was not set"); std::uintptr_t prev = my_state.fetch_add(PARALLEL_PHASE); __TBB_ASSERT(prev + PARALLEL_PHASE > prev, "Overflow detected"); if (prev & ONE_TIME_FAST_LEAVE) { // State was previously transitioned to "One-time Fast leave", thus with the start // of new parallel phase, it should be reset my_state.fetch_and(~ONE_TIME_FAST_LEAVE); } } // Indicate the end of parallel phase in the state machine void unregister_parallel_phase(bool enable_fast_leave) { 
std::uintptr_t prev = my_state.load(std::memory_order_relaxed); __TBB_ASSERT(prev != UINTPTR_MAX, "The initial state was not set"); std::uintptr_t desired{}; do { __TBB_ASSERT(prev - PARALLEL_PHASE < prev, "A call to unregister without its register complement"); desired = prev - PARALLEL_PHASE; // Mark the end of this phase in reference counter if (enable_fast_leave && /*it was the last parallel phase*/desired == DELAYED_LEAVE) { desired = ONE_TIME_FAST_LEAVE; } } while (!my_state.compare_exchange_strong(prev, desired)); } bool is_retention_allowed() { std::uintptr_t curr = my_state.load(std::memory_order_relaxed); __TBB_ASSERT(curr != UINTPTR_MAX, "The initial state was not set"); return curr != FAST_LEAVE && curr != ONE_TIME_FAST_LEAVE; } }; #endif /* __TBB_PREVIEW_PARALLEL_PHASE */ //! The structure of an arena, except the array of slots. /** Separated in order to simplify padding. Intrusive list node base class is used by market to form a list of arenas. **/ // TODO: Analyze arena_base cache lines placement struct arena_base : padded { //! The number of workers that have been marked out by the resource manager to service the arena. std::atomic my_num_workers_allotted; // heavy use in stealing loop //! Reference counter for the arena. /** Worker and external thread references are counted separately: first several bits are for references from external thread threads or explicit task_arenas (see arena::ref_external_bits below); the rest counts the number of workers servicing the arena. */ std::atomic my_references; // heavy use in stealing loop //! The maximal number of currently busy slots. std::atomic my_limit; // heavy use in stealing loop //! Task pool for the tasks scheduled via task::enqueue() method. /** Such scheduling guarantees eventual execution even if - new tasks are constantly coming (by extracting scheduled tasks in relaxed FIFO order); - the enqueuing thread does not call any of wait_for_all methods. 
**/ task_stream my_fifo_task_stream; // heavy use in stealing loop //! Task pool for the tasks scheduled via tbb::resume() function. task_stream my_resume_task_stream; // heavy use in stealing loop #if __TBB_PREVIEW_CRITICAL_TASKS //! Task pool for the tasks with critical property set. /** Critical tasks are scheduled for execution ahead of other sources (including local task pool and even bypassed tasks) unless the thread already executes a critical task in an outer dispatch loop **/ // used on the hot path of the task dispatch loop task_stream my_critical_task_stream; #endif //! The total number of workers that are requested from the resource manager. int my_total_num_workers_requested; //! The index in the array of per priority lists of arenas this object is in. /*const*/ unsigned my_priority_level; //! The max priority level of arena in permit manager. std::atomic my_is_top_priority{false}; //! Current task pool state and estimate of available tasks amount. atomic_flag my_pool_state; //! The list of local observers attached to this arena. observer_list my_observers; #if __TBB_ARENA_BINDING //! Pointer to internal observer that allows to bind threads in arena to certain NUMA node. numa_binding_observer* my_numa_binding_observer{nullptr}; #endif /*__TBB_ARENA_BINDING*/ // Below are rarely modified members threading_control* my_threading_control; //! Default task group context. d1::task_group_context* my_default_ctx; //! Waiting object for external threads that cannot join the arena. concurrent_monitor my_exit_monitors; #if __TBB_PREVIEW_PARALLEL_PHASE //! Manages state of thread_leave state machine thread_leave_manager my_thread_leave; #endif //! Coroutines (task_dispathers) cache buffer arena_co_cache my_co_cache; // arena needs an extra worker despite the arena limit atomic_flag my_mandatory_concurrency; // the number of local mandatory concurrency requests int my_mandatory_requests; //! The number of slots in the arena. unsigned my_num_slots; //! 
The number of reserved slots (can be occupied only by external threads). unsigned my_num_reserved_slots; //! The number of workers requested by the external thread owning the arena. unsigned my_max_num_workers; threading_control_client my_tc_client; #if TBB_USE_ASSERT //! Used to trap accesses to the object after its destruction. std::uintptr_t my_guard; #endif /* TBB_USE_ASSERT */ }; // struct arena_base class arena: public padded { public: using base_type = padded; //! Types of work advertised by advertise_new_work() enum new_work_type { work_spawned, wakeup, work_enqueued }; //! Constructor arena(threading_control* control, unsigned max_num_workers, unsigned num_reserved_slots, unsigned priority_level #if __TBB_PREVIEW_PARALLEL_PHASE , tbb::task_arena::leave_policy lp #endif ); //! Allocate an instance of arena. static arena& allocate_arena(threading_control* control, unsigned num_slots, unsigned num_reserved_slots, unsigned priority_level #if __TBB_PREVIEW_PARALLEL_PHASE , tbb::task_arena::leave_policy lp #endif ); static arena& create(threading_control* control, unsigned num_slots, unsigned num_reserved_slots, unsigned arena_priority_level, d1::constraints constraints = d1::constraints{} #if __TBB_PREVIEW_PARALLEL_PHASE , tbb::task_arena::leave_policy lp = tbb::task_arena::leave_policy::automatic #endif ); static int unsigned num_arena_slots ( unsigned num_slots, unsigned num_reserved_slots ) { return num_reserved_slots == 0 ? num_slots : max(2u, num_slots); } static int allocation_size( unsigned num_slots ) { return sizeof(base_type) + num_slots * (sizeof(mail_outbox) + sizeof(arena_slot) + sizeof(task_dispatcher)); } //! Get reference to mailbox corresponding to given slot_id mail_outbox& mailbox( d1::slot_id slot ) { __TBB_ASSERT( slot != d1::no_slot, "affinity should be specified" ); return reinterpret_cast(this)[-(int)(slot+1)]; // cast to 'int' is redundant but left for readability } //! Completes arena shutdown, destructs and deallocates it. 
void free_arena(); //! The number of least significant bits for external references static const unsigned ref_external_bits = 12; // up to 4095 external and 1M workers //! Reference increment values for externals and workers static const unsigned ref_external = 1; static const unsigned ref_worker = 1 << ref_external_bits; //! The number of workers active in the arena. unsigned num_workers_active() const { return my_references.load(std::memory_order_acquire) >> ref_external_bits; } //! Check if the recall is requested by the market. bool is_recall_requested() const { return num_workers_active() > my_num_workers_allotted.load(std::memory_order_relaxed); } void request_workers(int mandatory_delta, int workers_delta, bool wakeup_threads = false); //! If necessary, raise a flag that there is new job in arena. template void advertise_new_work(); //! Attempts to steal a task from a randomly chosen arena slot d1::task* steal_task(unsigned arena_index, FastRandom& frnd, execution_data_ext& ed, isolation_type isolation); //! Get a task from a global starvation resistant queue template d1::task* get_stream_task(task_stream& stream, unsigned& hint); #if __TBB_PREVIEW_CRITICAL_TASKS //! Tries to find a critical task in global critical task stream d1::task* get_critical_task(unsigned& hint, isolation_type isolation); #endif //! Check if there is job anywhere in arena. void out_of_work(); //! enqueue a task into starvation-resistance queue void enqueue_task(d1::task&, d1::task_group_context&, thread_data&); //! Registers the worker with the arena and enters TBB scheduler dispatch loop void process(thread_data&); //! Notification that the thread leaves its arena void on_thread_leaving(unsigned ref_param); //! Check for the presence of enqueued tasks bool has_enqueued_tasks(); //! 
Check for the presence of any tasks bool has_tasks(); bool is_empty() { return my_pool_state.test() == /* EMPTY */ false; } thread_control_monitor& get_waiting_threads_monitor(); static const std::size_t out_of_arena = ~size_t(0); //! Tries to occupy a slot in the arena. On success, returns the slot index; if no slot is available, returns out_of_arena. template std::size_t occupy_free_slot(thread_data&); //! Tries to occupy a slot in the specified range. std::size_t occupy_free_slot_in_range(thread_data& tls, std::size_t lower, std::size_t upper); std::uintptr_t calculate_stealing_threshold(); unsigned priority_level() { return my_priority_level; } bool has_request() { return my_total_num_workers_requested; } unsigned references() const { return my_references.load(std::memory_order_acquire); } bool is_arena_workerless() const { return my_max_num_workers == 0; } void set_top_priority(bool); bool is_top_priority() const; bool is_joinable() const { return num_workers_active() < my_num_workers_allotted.load(std::memory_order_relaxed); } bool try_join(); void set_allotment(unsigned allotment); int update_concurrency(unsigned concurrency); std::pair update_request(int mandatory_delta, int workers_delta); /** Must be the last data field */ arena_slot my_slots[1]; }; // class arena template void arena::advertise_new_work() { bool is_mandatory_needed = false; bool are_workers_needed = false; if (work_type != work_spawned) { // Local memory fence here and below is required to avoid missed wakeups; see the comment below. // Starvation resistant tasks require concurrency, so missed wakeups are unacceptable. atomic_fence_seq_cst(); } if (work_type == work_enqueued && my_num_slots > my_num_reserved_slots) { is_mandatory_needed = my_mandatory_concurrency.test_and_set(); } // Double-check idiom that, in case of spawning, is deliberately sloppy about memory fences. 
// Technically, to avoid missed wakeups, there should be a full memory fence between the point we // released the task pool (i.e. spawned task) and read the arena's state. However, adding such a // fence might hurt overall performance more than it helps, because the fence would be executed // on every task pool release, even when stealing does not occur. Since TBB allows parallelism, // but never promises parallelism, the missed wakeup is not a correctness problem. are_workers_needed = my_pool_state.test_and_set(); if (is_mandatory_needed || are_workers_needed) { int mandatory_delta = is_mandatory_needed ? 1 : 0; int workers_delta = are_workers_needed ? my_max_num_workers : 0; if (is_mandatory_needed && is_arena_workerless()) { // Set workers_delta to 1 to keep arena invariants consistent workers_delta = 1; } #if __TBB_PREVIEW_PARALLEL_PHASE my_thread_leave.reset_if_needed(); #endif request_workers(mandatory_delta, workers_delta, /* wakeup_threads = */ true); } } inline d1::task* arena::steal_task(unsigned arena_index, FastRandom& frnd, execution_data_ext& ed, isolation_type isolation) { auto slot_num_limit = my_limit.load(std::memory_order_relaxed); if (slot_num_limit == 1) { // No slots to steal from return nullptr; } // Try to steal a task from a random victim. std::size_t k = frnd.get() % (slot_num_limit - 1); // The following condition excludes the external thread that might have // already taken our previous place in the arena from the list . // of potential victims. But since such a situation can take // place only in case of significant oversubscription, keeping // the checks simple seems to be preferable to complicating the code. 
if (k >= arena_index) { ++k; // Adjusts random distribution to exclude self } arena_slot* victim = &my_slots[k]; d1::task **pool = victim->task_pool.load(std::memory_order_relaxed); d1::task *t = nullptr; if (pool == EmptyTaskPool || !(t = victim->steal_task(*this, isolation, k))) { return nullptr; } if (task_accessor::is_proxy_task(*t)) { task_proxy &tp = *(task_proxy*)t; d1::slot_id slot = tp.slot; t = tp.extract_task(); if (!t) { // Proxy was empty, so it's our responsibility to free it tp.allocator.delete_object(&tp, ed); return nullptr; } // Note affinity is called for any stolen task (proxy or general) ed.affinity_slot = slot; } else { // Note affinity is called for any stolen task (proxy or general) ed.affinity_slot = d1::any_slot; } // Update task owner thread id to identify stealing ed.original_slot = k; return t; } template inline d1::task* arena::get_stream_task(task_stream& stream, unsigned& hint) { if (stream.empty()) return nullptr; return stream.pop(subsequent_lane_selector(hint)); } #if __TBB_PREVIEW_CRITICAL_TASKS // Retrieves critical task respecting isolation level, if provided. The rule is: // 1) If no outer critical task and no isolation => take any critical task // 2) If working on an outer critical task and no isolation => cannot take any critical task // 3) If no outer critical task but isolated => respect isolation // 4) If working on an outer critical task and isolated => respect isolation // Hint is used to keep some LIFO-ness, start search with the lane that was used during push operation. 
inline d1::task* arena::get_critical_task(unsigned& hint, isolation_type isolation) { if (my_critical_task_stream.empty()) return nullptr; if ( isolation != no_isolation ) { return my_critical_task_stream.pop_specific( hint, isolation ); } else { return my_critical_task_stream.pop(preceding_lane_selector(hint)); } } #endif // __TBB_PREVIEW_CRITICAL_TASKS } // namespace r1 } // namespace detail } // namespace tbb #endif /* _TBB_arena_H */ ================================================ FILE: third-party/tbb/src/tbb/arena_slot.cpp ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "arena_slot.h" #include "arena.h" #include "thread_data.h" namespace tbb { namespace detail { namespace r1 { //------------------------------------------------------------------------ // Arena Slot //------------------------------------------------------------------------ d1::task* arena_slot::get_task_impl(size_t T, execution_data_ext& ed, bool& tasks_omitted, isolation_type isolation) { __TBB_ASSERT(tail.load(std::memory_order_relaxed) <= T || is_local_task_pool_quiescent(), "Is it safe to get a task at position T?"); d1::task* result = task_pool_ptr[T]; __TBB_ASSERT(!is_poisoned( result ), "The poisoned task is going to be processed"); if (!result) { return nullptr; } bool omit = isolation != no_isolation && isolation != task_accessor::isolation(*result); if (!omit && !task_accessor::is_proxy_task(*result)) { return result; } else if (omit) { tasks_omitted = true; return nullptr; } task_proxy& tp = static_cast(*result); d1::slot_id aff_id = tp.slot; if ( d1::task *t = tp.extract_task() ) { ed.affinity_slot = aff_id; return t; } // Proxy was empty, so it's our responsibility to free it tp.allocator.delete_object(&tp, ed); if ( tasks_omitted ) { task_pool_ptr[T] = nullptr; } return nullptr; } d1::task* arena_slot::get_task(execution_data_ext& ed, isolation_type isolation) { __TBB_ASSERT(is_task_pool_published(), nullptr); // The current task position in the task pool. std::size_t T0 = tail.load(std::memory_order_relaxed); // The bounds of available tasks in the task pool. H0 is only used when the head bound is reached. std::size_t H0 = (std::size_t)-1, T = T0; d1::task* result = nullptr; bool task_pool_empty = false; bool tasks_omitted = false; do { __TBB_ASSERT( !result, nullptr ); // The full fence is required to sync the store of `tail` with the load of `head` (write-read barrier) T = --tail; // The acquire load of head is required to guarantee consistency of our task pool // when a thief rolls back the head. 
if ( (std::intptr_t)( head.load(std::memory_order_acquire) ) > (std::intptr_t)T ) { acquire_task_pool(); H0 = head.load(std::memory_order_relaxed); if ( (std::intptr_t)H0 > (std::intptr_t)T ) { // The thief has not backed off - nothing to grab. __TBB_ASSERT( H0 == head.load(std::memory_order_relaxed) && T == tail.load(std::memory_order_relaxed) && H0 == T + 1, "victim/thief arbitration algorithm failure" ); reset_task_pool_and_leave(); // No tasks in the task pool. task_pool_empty = true; break; } else if ( H0 == T ) { // There is only one task in the task pool. reset_task_pool_and_leave(); task_pool_empty = true; } else { // Release task pool if there are still some tasks. // After the release, the tail will be less than T, thus a thief // will not attempt to get a task at position T. release_task_pool(); } } result = get_task_impl( T, ed, tasks_omitted, isolation ); if ( result ) { poison_pointer( task_pool_ptr[T] ); break; } else if ( !tasks_omitted ) { poison_pointer( task_pool_ptr[T] ); __TBB_ASSERT( T0 == T+1, nullptr ); T0 = T; } } while ( !result && !task_pool_empty ); if ( tasks_omitted ) { if ( task_pool_empty ) { // All tasks have been checked. The task pool should be in reset state. // We just restore the bounds for the available tasks. // TODO: Does it have sense to move them to the beginning of the task pool? __TBB_ASSERT( is_quiescent_local_task_pool_reset(), nullptr ); if ( result ) { // If we have a task, it should be at H0 position. __TBB_ASSERT( H0 == T, nullptr ); ++H0; } __TBB_ASSERT( H0 <= T0, nullptr ); if ( H0 < T0 ) { // Restore the task pool if there are some tasks. head.store(H0, std::memory_order_relaxed); tail.store(T0, std::memory_order_relaxed); // The release fence is used in publish_task_pool. publish_task_pool(); // Synchronize with snapshot as we published some tasks. ed.task_disp->m_thread_data->my_arena->advertise_new_work(); } } else { // A task has been obtained. We need to make a hole in position T. 
__TBB_ASSERT( is_task_pool_published(), nullptr ); __TBB_ASSERT( result, nullptr ); task_pool_ptr[T] = nullptr; tail.store(T0, std::memory_order_release); // Synchronize with snapshot as we published some tasks. // TODO: consider some approach not to call wakeup for each time. E.g. check if the tail reached the head. ed.task_disp->m_thread_data->my_arena->advertise_new_work(); } } __TBB_ASSERT( (std::intptr_t)tail.load(std::memory_order_relaxed) >= 0, nullptr ); __TBB_ASSERT( result || tasks_omitted || is_quiescent_local_task_pool_reset(), nullptr ); return result; } d1::task* arena_slot::steal_task(arena& a, isolation_type isolation, std::size_t slot_index) { d1::task** victim_pool = lock_task_pool(); if (!victim_pool) { return nullptr; } d1::task* result = nullptr; std::size_t H = head.load(std::memory_order_relaxed); // mirror std::size_t H0 = H; bool tasks_omitted = false; do { // The full fence is required to sync the store of `head` with the load of `tail` (write-read barrier) H = ++head; // The acquire load of tail is required to guarantee consistency of victim_pool // because the owner synchronizes task spawning via tail. if ((std::intptr_t)H > (std::intptr_t)(tail.load(std::memory_order_acquire))) { // Stealing attempt failed, deque contents has not been changed by us head.store( /*dead: H = */ H0, std::memory_order_relaxed ); __TBB_ASSERT( !result, nullptr ); goto unlock; } result = victim_pool[H-1]; __TBB_ASSERT( !is_poisoned( result ), nullptr ); if (result) { if (isolation == no_isolation || isolation == task_accessor::isolation(*result)) { if (!task_accessor::is_proxy_task(*result)) { break; } task_proxy& tp = *static_cast(result); // If mailed task is likely to be grabbed by its destination thread, skip it. if (!task_proxy::is_shared(tp.task_and_tag) || !tp.outbox->recipient_is_idle() || a.mailbox(slot_index).recipient_is_idle()) { break; } } // The task cannot be executed either due to isolation or proxy constraints. 
result = nullptr; tasks_omitted = true; } else if (!tasks_omitted) { // Cleanup the task pool from holes until a task is skipped. __TBB_ASSERT( H0 == H-1, nullptr ); poison_pointer( victim_pool[H0] ); H0 = H; } } while (!result); __TBB_ASSERT( result, nullptr ); // emit "task was consumed" signal poison_pointer( victim_pool[H-1] ); if (tasks_omitted) { // Some proxies in the task pool have been omitted. Set the stolen task to nullptr. victim_pool[H-1] = nullptr; // The release store synchronizes the victim_pool update(the store of nullptr). head.store( /*dead: H = */ H0, std::memory_order_release ); } unlock: unlock_task_pool(victim_pool); #if __TBB_PREFETCHING __TBB_cl_evict(&victim_slot.head); __TBB_cl_evict(&victim_slot.tail); #endif if (tasks_omitted) { // Synchronize with snapshot as the head and tail can be bumped which can falsely trigger EMPTY state a.advertise_new_work(); } return result; } } // namespace r1 } // namespace detail } // namespace tbb ================================================ FILE: third-party/tbb/src/tbb/arena_slot.h ================================================ /* Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef _TBB_arena_slot_H #define _TBB_arena_slot_H #include "oneapi/tbb/detail/_config.h" #include "oneapi/tbb/detail/_utils.h" #include "oneapi/tbb/detail/_template_helpers.h" #include "oneapi/tbb/detail/_task.h" #include "oneapi/tbb/cache_aligned_allocator.h" #include "misc.h" #include "mailbox.h" #include "scheduler_common.h" #include namespace tbb { namespace detail { namespace r1 { class arena; class task_group_context; //-------------------------------------------------------------------------------------------------------- // Arena Slot //-------------------------------------------------------------------------------------------------------- static d1::task** const EmptyTaskPool = nullptr; static d1::task** const LockedTaskPool = reinterpret_cast(~std::intptr_t(0)); struct alignas(max_nfs_size) arena_slot_shared_state { //! Scheduler of the thread attached to the slot /** Marks the slot as busy, and is used to iterate through the schedulers belonging to this arena **/ std::atomic my_is_occupied; // Synchronization of access to Task pool /** Also is used to specify if the slot is empty or locked: 0 - empty -1 - locked **/ std::atomic task_pool; //! Index of the first ready task in the deque. /** Modified by thieves, and by the owner during compaction/reallocation **/ std::atomic head; }; struct alignas(max_nfs_size) arena_slot_private_state { //! Hint provided for operations with the container of starvation-resistant tasks. /** Modified by the owner thread (during these operations). **/ unsigned hint_for_fifo_stream; #if __TBB_PREVIEW_CRITICAL_TASKS //! Similar to 'hint_for_fifo_stream' but for critical tasks. unsigned hint_for_critical_stream; #endif //! Similar to 'hint_for_fifo_stream' but for the resume tasks. unsigned hint_for_resume_stream; //! Index of the element following the last ready task in the deque. /** Modified by the owner thread. **/ std::atomic tail; //! Capacity of the primary task pool (number of elements - pointers to task). 
std::size_t my_task_pool_size; //! Task pool of the scheduler that owns this slot // TODO: previously was task**__TBB_atomic, but seems like not accessed on other thread d1::task** task_pool_ptr; }; class arena_slot : private arena_slot_shared_state, private arena_slot_private_state { friend class arena; friend class outermost_worker_waiter; friend class task_dispatcher; friend class thread_data; friend class nested_arena_context; //! The original task dispather associated with this slot task_dispatcher* my_default_task_dispatcher; #if TBB_USE_ASSERT void fill_with_canary_pattern ( std::size_t first, std::size_t last ) { for ( std::size_t i = first; i < last; ++i ) poison_pointer(task_pool_ptr[i]); } #else void fill_with_canary_pattern ( size_t, std::size_t ) {} #endif /* TBB_USE_ASSERT */ static constexpr std::size_t min_task_pool_size = 64; void allocate_task_pool( std::size_t n ) { std::size_t byte_size = ((n * sizeof(d1::task*) + max_nfs_size - 1) / max_nfs_size) * max_nfs_size; my_task_pool_size = byte_size / sizeof(d1::task*); task_pool_ptr = (d1::task**)cache_aligned_allocate(byte_size); // No need to clear the fresh deque since valid items are designated by the head and tail members. // But fill it with a canary pattern in the high vigilance debug mode. fill_with_canary_pattern( 0, my_task_pool_size ); } public: //! Deallocate task pool that was allocated by means of allocate_task_pool. void free_task_pool( ) { // TODO: understand the assertion and modify // __TBB_ASSERT( !task_pool /* TODO: == EmptyTaskPool */, nullptr); if( task_pool_ptr ) { __TBB_ASSERT( my_task_pool_size, nullptr); cache_aligned_deallocate( task_pool_ptr ); task_pool_ptr = nullptr; my_task_pool_size = 0; } } //! Get a task from the local pool. /** Called only by the pool owner. Returns the pointer to the task or nullptr if a suitable task is not found. Resets the pool if it is empty. **/ d1::task* get_task(execution_data_ext&, isolation_type); //! 
Steal task from slot's ready pool d1::task* steal_task(arena&, isolation_type, std::size_t); //! Some thread is now the owner of this slot void occupy() { __TBB_ASSERT(!my_is_occupied.load(std::memory_order_relaxed), nullptr); my_is_occupied.store(true, std::memory_order_release); } //! Try to occupy the slot bool try_occupy() { return !is_occupied() && my_is_occupied.exchange(true) == false; } //! Some thread is now the owner of this slot void release() { __TBB_ASSERT(my_is_occupied.load(std::memory_order_relaxed), nullptr); my_is_occupied.store(false, std::memory_order_release); } //! Spawn newly created tasks void spawn(d1::task& t) { std::size_t T = prepare_task_pool(1); __TBB_ASSERT(is_poisoned(task_pool_ptr[T]), nullptr); task_pool_ptr[T] = &t; commit_spawned_tasks(T + 1); if (!is_task_pool_published()) { publish_task_pool(); } } bool is_task_pool_published() const { return task_pool.load(std::memory_order_relaxed) != EmptyTaskPool; } bool is_empty() const { return task_pool.load(std::memory_order_relaxed) == EmptyTaskPool || head.load(std::memory_order_relaxed) >= tail.load(std::memory_order_relaxed); } bool is_occupied() const { return my_is_occupied.load(std::memory_order_relaxed); } task_dispatcher& default_task_dispatcher() { __TBB_ASSERT(my_default_task_dispatcher != nullptr, nullptr); return *my_default_task_dispatcher; } void init_task_streams(unsigned h) { hint_for_fifo_stream = h; #if __TBB_RESUMABLE_TASKS hint_for_resume_stream = h; #endif #if __TBB_PREVIEW_CRITICAL_TASKS hint_for_critical_stream = h; #endif } #if __TBB_PREVIEW_CRITICAL_TASKS unsigned& critical_hint() { return hint_for_critical_stream; } #endif private: //! Get a task from the local pool at specified location T. /** Returns the pointer to the task or nullptr if the task cannot be executed, e.g. proxy has been deallocated or isolation constraint is not met. tasks_omitted tells if some tasks have been omitted. Called only by the pool owner. 
The caller should guarantee that the position T is not available for a thief. **/
    d1::task* get_task_impl(size_t T, execution_data_ext& ed, bool& tasks_omitted, isolation_type isolation);

    //! Makes sure that the task pool can accommodate at least n more elements
    /** If necessary relocates existing task pointers or grows the ready task deque.
     *  Returns (possible updated) tail index (not accounting for n).
     *  Three outcomes: enough room after tail (returns tail unchanged), first allocation
     *  (returns 0), or compaction/growth under the pool lock (returns the new tail). **/
    std::size_t prepare_task_pool(std::size_t num_tasks) {
        std::size_t T = tail.load(std::memory_order_relaxed); // mirror
        if ( T + num_tasks <= my_task_pool_size ) {
            return T;
        }

        std::size_t new_size = num_tasks;
        if ( !my_task_pool_size ) {
            // No pool yet: allocate one of at least min_task_pool_size entries
            __TBB_ASSERT( !is_task_pool_published() && is_quiescent_local_task_pool_reset(), nullptr);
            __TBB_ASSERT( !task_pool_ptr, nullptr);
            if ( num_tasks < min_task_pool_size ) new_size = min_task_pool_size;
            allocate_task_pool( new_size );
            return 0;
        }
        // Lock out thieves while entries are moved
        acquire_task_pool();
        std::size_t H = head.load(std::memory_order_relaxed); // mirror
        // Despite its name this is the CURRENT pool; it only becomes the "old" one if
        // allocate_task_pool() below replaces task_pool_ptr.
        d1::task** new_task_pool = task_pool_ptr;
        __TBB_ASSERT( my_task_pool_size >= min_task_pool_size, nullptr);
        // Count not skipped tasks. Consider using std::count_if.
        for ( std::size_t i = H; i < T; ++i )
            if ( new_task_pool[i] ) ++new_size;
        // If the free space at the beginning of the task pool is too short, we
        // are likely facing a pathological single-producer-multiple-consumers
        // scenario, and thus it's better to expand the task pool
        bool allocate = new_size > my_task_pool_size - min_task_pool_size/4;
        if ( allocate ) {
            // Grow task pool. As this operation is rare, and its cost is asymptotically
            // amortizable, we can tolerate new task pool allocation done under the lock.
            if ( new_size < 2 * my_task_pool_size )
                new_size = 2 * my_task_pool_size;
            allocate_task_pool( new_size ); // updates my_task_pool_size
        }
        // Filter out skipped tasks. Consider using std::copy_if.
        // Live entries are packed to the front of task_pool_ptr (new storage, or the same
        // storage when compacting in place).
        std::size_t T1 = 0;
        for ( std::size_t i = H; i < T; ++i ) {
            if ( new_task_pool[i] ) {
                task_pool_ptr[T1++] = new_task_pool[i];
            }
        }
        // Deallocate the previous task pool if a new one has been allocated.
        if ( allocate )
            cache_aligned_deallocate( new_task_pool );
        else
            fill_with_canary_pattern( T1, tail );
        // Publish the new state.
        commit_relocated_tasks( T1 );
        // assert_task_pool_valid();
        return T1;
    }

    //! Makes newly spawned tasks visible to thieves
    void commit_spawned_tasks(std::size_t new_tail) {
        __TBB_ASSERT (new_tail <= my_task_pool_size, "task deque end was overwritten");
        // emit "task was released" signal
        // Release fence is necessary to make sure that previously stored task pointers
        // are visible to thieves.
        tail.store(new_tail, std::memory_order_release);
    }

    //! Used by workers to enter the task pool
    /** Does not lock the task pool in case if arena slot has been successfully grabbed. **/
    void publish_task_pool() {
        __TBB_ASSERT ( task_pool == EmptyTaskPool, "someone else grabbed my arena slot?" );
        __TBB_ASSERT ( head.load(std::memory_order_relaxed) < tail.load(std::memory_order_relaxed),
                "entering arena without tasks to share" );
        // Release signal on behalf of previously spawned tasks (when this thread was not in arena yet)
        task_pool.store(task_pool_ptr, std::memory_order_release );
    }

    //! Locks the local task pool
    /** Garbles task_pool for the duration of the lock. Requires correctly set task_pool_ptr.
        ATTENTION: This method is mostly the same as generic_scheduler::lock_task_pool(), with
        a little different logic of slot state checks (slot is either locked or points
        to our task pool). Thus if either of them is changed, consider changing the counterpart as well.
**/
    void acquire_task_pool() {
        if (!is_task_pool_published()) {
            return; // we are not in arena - nothing to lock
        }
        // NOTE(review): sync_prepare_done is only ever written in this function; nothing here reads it.
        bool sync_prepare_done = false;
        // Spin, pausing between attempts, until the CAS below swaps task_pool_ptr for LockedTaskPool.
        for( atomic_backoff b;;b.pause() ) {
#if TBB_USE_ASSERT
            // Local copy of the arena slot task pool pointer is necessary for the next
            // assertion to work correctly to exclude asynchronous state transition effect.
            d1::task** tp = task_pool.load(std::memory_order_relaxed);
            __TBB_ASSERT( tp == LockedTaskPool || tp == task_pool_ptr, "slot ownership corrupt?" );
#endif
            d1::task** expected = task_pool_ptr;
            // The relaxed load is a cheap pre-check that skips the CAS while the slot is visibly locked.
            if( task_pool.load(std::memory_order_relaxed) != LockedTaskPool &&
                task_pool.compare_exchange_strong(expected, LockedTaskPool ) ) {
                // We acquired our own slot
                break;
            } else if( !sync_prepare_done ) {
                // Start waiting
                sync_prepare_done = true;
            }
            // Someone else acquired a lock, so pause and do exponential backoff.
        }
        __TBB_ASSERT( task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "not really acquired task pool" );
    }

    //! Unlocks the local task pool
    /** Restores task_pool munged by acquire_task_pool. Requires
        correctly set task_pool_ptr.
        Does nothing when task_pool holds EmptyTaskPool. **/
    void release_task_pool() {
        if ( !(task_pool.load(std::memory_order_relaxed) != EmptyTaskPool) )
            return; // we are not in arena - nothing to unlock
        __TBB_ASSERT( task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "arena slot is not locked" );
        // Release store: writes made while the pool was locked are published together with the pointer.
        task_pool.store( task_pool_ptr, std::memory_order_release );
    }

    //! Locks victim's task pool, and returns pointer to it. The pointer can be nullptr.
    /** Garbles victim_arena_slot->task_pool for the duration of the lock.
        Returns the value task_pool held before it was replaced by LockedTaskPool, or
        EmptyTaskPool (in which case nothing was locked) when the slot holds EmptyTaskPool. **/
    d1::task** lock_task_pool() {
        d1::task** victim_task_pool;
        for ( atomic_backoff backoff;; /*backoff pause embedded in the loop*/) {
            victim_task_pool = task_pool.load(std::memory_order_relaxed);
            // Microbenchmarks demonstrated that aborting stealing attempt when the
            // victim's task pool is locked degrades performance.
            // NOTE: Do not use comparison of head and tail indices to check for
            // the presence of work in the victim's task pool, as they may give
            // incorrect indication because of task pool relocations and resizes.
            if (victim_task_pool == EmptyTaskPool) {
                break;
            }
            d1::task** expected = victim_task_pool;
            if (victim_task_pool != LockedTaskPool && task_pool.compare_exchange_strong(expected, LockedTaskPool) ) {
                // We've locked victim's task pool
                break;
            }
            // Someone else acquired a lock, so pause and do exponential backoff.
            backoff.pause();
        }
        __TBB_ASSERT(victim_task_pool == EmptyTaskPool ||
                    (task_pool.load(std::memory_order_relaxed) == LockedTaskPool &&
                    victim_task_pool != LockedTaskPool), "not really locked victim's task pool?");
        return victim_task_pool;
    }

    //! Unlocks victim's task pool
    /** Restores victim_arena_slot->task_pool munged by lock_task_pool.
        victim_task_pool is the pointer to store back into task_pool; it must not be LockedTaskPool. **/
    void unlock_task_pool(d1::task** victim_task_pool) {
        __TBB_ASSERT(task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "victim arena slot is not locked");
        __TBB_ASSERT(victim_task_pool != LockedTaskPool, nullptr);
        task_pool.store(victim_task_pool, std::memory_order_release);
    }

#if TBB_USE_ASSERT
    //! True when task_pool holds either EmptyTaskPool or LockedTaskPool.
    bool is_local_task_pool_quiescent() const {
        d1::task** tp = task_pool.load(std::memory_order_relaxed);
        return tp == EmptyTaskPool || tp == LockedTaskPool;
    }

    //! True when head equals tail. Asserts that the pool is quiescent.
    bool is_quiescent_local_task_pool_empty() const {
        __TBB_ASSERT(is_local_task_pool_quiescent(), "Task pool is not quiescent");
        return head.load(std::memory_order_relaxed) == tail.load(std::memory_order_relaxed);
    }

    //! True when both head and tail are 0. Asserts that the pool is quiescent.
    bool is_quiescent_local_task_pool_reset() const {
        __TBB_ASSERT(is_local_task_pool_quiescent(), "Task pool is not quiescent");
        return head.load(std::memory_order_relaxed) == 0 && tail.load(std::memory_order_relaxed) == 0;
    }
#endif // TBB_USE_ASSERT

    //! Leave the task pool
    /** Leaving task pool automatically releases the task pool if it is locked. **/
    void leave_task_pool() {
        __TBB_ASSERT(is_task_pool_published(), "Not in arena");
        // Do not reset my_arena_index. It will be used to (attempt to) re-acquire the slot next time
        __TBB_ASSERT(task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "Task pool must be locked when leaving arena");
        __TBB_ASSERT(is_quiescent_local_task_pool_empty(), "Cannot leave arena when the task pool is not empty");
        // No release fence is necessary here as this assignment precludes external
        // accesses to the local task pool when becomes visible. Thus it is harmless
        // if it gets hoisted above preceding local bookkeeping manipulations.
        task_pool.store(EmptyTaskPool, std::memory_order_relaxed);
    }

    //! Resets head and tail indices to 0, and leaves task pool
    /** The task pool must be locked by the owner (via acquire_task_pool).**/
    void reset_task_pool_and_leave() {
        __TBB_ASSERT(task_pool.load(std::memory_order_relaxed) == LockedTaskPool, "Task pool must be locked when resetting task pool");
        tail.store(0, std::memory_order_relaxed);
        head.store(0, std::memory_order_relaxed);
        leave_task_pool();
    }

    //! Makes relocated tasks visible to thieves and releases the local task pool.
    /** Obviously, the task pool must be locked when calling this method.
        Sets head to 0 and tail to new_tail, then calls release_task_pool(). **/
    void commit_relocated_tasks(std::size_t new_tail) {
        __TBB_ASSERT(is_local_task_pool_quiescent(), "Task pool must be locked when calling commit_relocated_tasks()");
        head.store(0, std::memory_order_relaxed);
        // Tail is updated last to minimize probability of a thread making arena
        // snapshot being misguided into thinking that this task pool is empty.
        tail.store(new_tail, std::memory_order_release);
        release_task_pool();
    }
};

} // namespace r1
} // namespace detail
} // namespace tbb

#endif // __TBB_arena_slot_H

================================================
FILE: third-party/tbb/src/tbb/assert_impl.h
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef __TBB_assert_impl_H
#define __TBB_assert_impl_H

#include "oneapi/tbb/detail/_config.h"
#include "oneapi/tbb/detail/_utils.h"

// NOTE(review): the header names of the bare #include directives below are missing in this copy of the file.
#include
#include
#include
#include
#if _MSC_VER && _DEBUG
#include
#endif

#include

#if __TBBMALLOC_BUILD
namespace rml { namespace internal {
#else
namespace tbb {
namespace detail {
namespace r1 {
#endif
// TODO: consider extension for formatted error description string
//! Prints the failed assertion to stderr and terminates.
/** Writes the expression, the function name passed as location, and the line number,
    followed by the comment when it is non-null. It then flushes stderr and calls
    std::abort(). In MSVC debug builds _CrtDbgReport() is consulted first; when it
    returns 1, _CrtDbgBreak() is called instead of aborting. **/
static void assertion_failure_impl(const char* location, int line, const char* expression, const char* comment) {

    std::fprintf(stderr, "Assertion %s failed (located in the %s function, line in file: %d)\n",
        expression, location, line);

    if (comment) {
        std::fprintf(stderr, "Detailed description: %s\n", comment);
    }
#if _MSC_VER && _DEBUG
    if (1 == _CrtDbgReport(_CRT_ASSERT, location, line, "tbb_debug.dll", "%s\r\n%s", expression, comment?comment:"")) {
        _CrtDbgBreak();
    } else
#endif
    {
        std::fflush(stderr);
        std::abort();
    }
}

// Do not move the definition into the assertion_failure function because it will require "magic statics".
// It will bring a dependency on C++ runtime on some platforms while assert_impl.h is reused in tbbmalloc
// that should not depend on C++ runtime
static std::atomic assertion_state;

//! Exported entry point for assertion failures.
/** Forwards its arguments to assertion_failure_impl() through atomic_do_once() keyed on
    assertion_state. **/
void __TBB_EXPORTED_FUNC assertion_failure(const char* location, int line, const char* expression, const char* comment) {
#if __TBB_MSVC_UNREACHABLE_CODE_IGNORED
    // Workaround for erroneous "unreachable code" during assertion throwing using call_once
    #pragma warning (push)
    #pragma warning (disable: 4702)
#endif
    // We cannot use std::call_once because it brings a dependency on C++ runtime on some platforms
    // while assert_impl.h is reused in tbbmalloc that should not depend on C++ runtime
    atomic_do_once([&](){ assertion_failure_impl(location, line, expression, comment); }, assertion_state);
#if __TBB_MSVC_UNREACHABLE_CODE_IGNORED
    #pragma warning (pop)
#endif
}

//! Report a runtime warning.
/** Formats the printf-style arguments into a zero-filled 1024-byte stack buffer (at most
    1023 bytes are handed to vsnprintf, so longer messages are truncated) and prints the
    result to stderr prefixed with "TBB Warning: ".
    NOTE(review): vsnprintf/fprintf are called unqualified here while assertion_failure_impl
    uses std::fprintf - confirm whether the inconsistency is intentional. **/
void runtime_warning( const char* format, ... ) {
    char str[1024];
    std::memset(str, 0, 1024);
    va_list args;
    va_start(args, format);
    vsnprintf( str, 1024-1, format, args);
    va_end(args);
    fprintf(stderr, "TBB Warning: %s\n", str);
}

#if __TBBMALLOC_BUILD
}} // namespaces rml::internal
#else
} // namespace r1
} // namespace detail
} // namespace tbb
#endif

#endif // __TBB_assert_impl_H

================================================
FILE: third-party/tbb/src/tbb/cancellation_disseminator.h
================================================
/*
    Copyright (c) 2022-2023 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef _TBB_cancellation_disseminator_H
#define _TBB_cancellation_disseminator_H

#include "oneapi/tbb/mutex.h"
#include "oneapi/tbb/task_group.h"

#include "intrusive_list.h"
#include "thread_data.h"

namespace tbb {
namespace detail {
namespace r1 {

//! Keeps a mutex-protected list of thread_data objects and relays context state changes to each of them.
class cancellation_disseminator {
public:
    //! Finds all contexts affected by the state change and propagates the new state to them.
    /*  The propagation is relayed to the cancellation_disseminator because tasks created by one
        external thread can be passed to and executed by other external threads. This means
        that context trees can span several arenas at once and thus state change propagation
        cannot be generally localized to one arena only.

        mptr_state selects the state member of task_group_context being propagated, src is the
        context whose state changed and new_state is the value to propagate.
        Returns true when src is not marked as possibly having children (nothing to do) or after
        the state has been passed to every registered thread. Returns false when, once the lock
        is held, the selected member of src no longer equals new_state.
    */
    bool propagate_task_group_state(std::atomic d1::task_group_context::*mptr_state, d1::task_group_context& src, uint32_t new_state) {
        if (src.my_may_have_children.load(std::memory_order_relaxed) != d1::task_group_context::may_have_children) {
            return true;
        }

        // The whole propagation algorithm is under the lock in order to ensure correctness
        // in case of concurrent state changes at the different levels of the context tree.
        threads_list_mutex_type::scoped_lock lock(my_threads_list_mutex);
        // TODO: consider to use double-check idiom
        if ((src.*mptr_state).load(std::memory_order_relaxed) != new_state) {
            // Another thread has concurrently changed the state. Back down.
            return false;
        }

        // Advance global state propagation epoch
        ++the_context_state_propagation_epoch;
        // Propagate to all workers and external threads and sync up their local epochs with the global one
        // The whole propagation sequence is locked, thus no contention is expected
        for (auto& thr_data : my_threads_list) {
            thr_data.propagate_task_group_state(mptr_state, src, new_state);
        }

        return true;
    }

    //! Adds td to the front of the thread list under the list mutex.
    void register_thread(thread_data& td) {
        threads_list_mutex_type::scoped_lock lock(my_threads_list_mutex);
        my_threads_list.push_front(td);
    }

    //! Removes td from the thread list under the list mutex.
    void unregister_thread(thread_data& td) {
        threads_list_mutex_type::scoped_lock lock(my_threads_list_mutex);
        my_threads_list.remove(td);
    }

private:
    using thread_data_list_type = intrusive_list;
    using threads_list_mutex_type = d1::mutex;

    // Guards my_threads_list and serializes whole propagation passes.
    threads_list_mutex_type my_threads_list_mutex;
    // Threads registered through register_thread().
    thread_data_list_type my_threads_list;
};

} // namespace r1
} // namespace detail
} // namespace tbb

#endif // _TBB_cancellation_disseminator_H

================================================
FILE: third-party/tbb/src/tbb/co_context.h
================================================
/*
    Copyright (c) 2005-2022 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef _TBB_co_context_H
#define _TBB_co_context_H

#include "oneapi/tbb/detail/_config.h"

#if __TBB_RESUMABLE_TASKS

// NOTE(review): the header names of the bare #include directives in this file are missing in this copy.
#include
#include

#if __TBB_RESUMABLE_TASKS_USE_THREADS

#if _WIN32 || _WIN64
#include
#else
#include
#endif

#include
#include "governor.h"

#elif _WIN32 || _WIN64
#include
#else
// ucontext.h API is deprecated since macOS 10.6
#if __APPLE__
    #if __INTEL_COMPILER
        #pragma warning(push)
        #pragma warning(disable:1478)
    #elif __clang__
        #pragma clang diagnostic push
        #pragma clang diagnostic ignored "-Wdeprecated-declarations"
    #endif
#endif // __APPLE__

#include
#include // mprotect

#include "governor.h" // default_page_size()

#ifndef MAP_STACK
// macOS* does not define MAP_STACK
#define MAP_STACK 0
#endif
#ifndef MAP_ANONYMOUS
// macOS* defines MAP_ANON, which is deprecated in Linux*.
#define MAP_ANONYMOUS MAP_ANON
#endif
#endif // _WIN32 || _WIN64

namespace tbb {
namespace detail {
namespace r1 {

// coroutine_type has three alternative definitions, selected at compile time:
// a thread plus condition variable, a Windows fiber handle, or a ucontext with its own stack.
#if __TBB_RESUMABLE_TASKS_USE_THREADS
    struct coroutine_type {
#if _WIN32 || _WIN64
        using handle_type = HANDLE;
#else
        using handle_type = pthread_t;
#endif

        handle_type my_thread;
        // my_condvar and my_mutex are used to hand control to and from my_thread.
        std::condition_variable my_condvar;
        std::mutex my_mutex;
        // Thread data handed over by swap_coroutine(); nullptr while the coroutine is not running.
        thread_data* my_thread_data{ nullptr };
        // True while this coroutine is the one allowed to run.
        bool my_is_active{ true };
    };
#elif _WIN32 || _WIN64
    typedef LPVOID coroutine_type;
#else
    struct coroutine_type {
        coroutine_type() : my_context(), my_stack(), my_stack_size() {}
        ucontext_t my_context;
        // First usable byte of the stack, i.e. just past the leading guard page (see create_coroutine).
        void* my_stack;
        // Usable stack size in bytes, guard pages excluded.
        std::size_t my_stack_size;
    };
#endif

    // Forward declaration of the coroutine API.
    void create_coroutine(coroutine_type& c, std::size_t stack_size, void* arg);
    void current_coroutine(coroutine_type& c);
    void swap_coroutine(coroutine_type& prev_coroutine, coroutine_type& new_coroutine);
    void destroy_coroutine(coroutine_type& c);

//! Wraps a coroutine_type together with a small state machine (suspended / executing / destroyed).
class co_context {
    enum co_state {
        co_invalid,
        co_suspended,
        co_executing,
        co_destroyed
    };
    coroutine_type      my_coroutine;
    co_state            my_state;

public:
    //! With a non-zero stack_size creates a new, suspended coroutine that receives arg;
    //! with stack_size == 0 captures the currently running context and marks it executing.
    co_context(std::size_t stack_size, void* arg)
        : my_state(stack_size ? co_suspended : co_executing)
    {
        if (stack_size) {
            __TBB_ASSERT(arg != nullptr, nullptr);
            create_coroutine(my_coroutine, stack_size, arg);
        } else {
            current_coroutine(my_coroutine);
        }
    }

    //! Destroys the underlying coroutine when it is suspended; an executing one is only marked destroyed.
    ~co_context() {
        // The state must be either co_suspended or co_executing.
        __TBB_ASSERT(1 << my_state & (1 << co_suspended | 1 << co_executing), nullptr);
        if (my_state == co_suspended) {
#if __TBB_RESUMABLE_TASKS_USE_THREADS
            my_state = co_executing;
#endif
            destroy_coroutine(my_coroutine);
        }
        my_state = co_destroyed;
    }

    //! Suspends this (executing) context and switches to target, which must be suspended.
    void resume(co_context& target) {
        // Do not create non-trivial objects on the stack of this function. They might never be destroyed.
        __TBB_ASSERT(my_state == co_executing, nullptr);
        __TBB_ASSERT(target.my_state == co_suspended, nullptr);

        my_state = co_suspended;
        target.my_state = co_executing;

        // 'target' can reference an invalid object after swap_coroutine. Do not access it.
        swap_coroutine(my_coroutine, target.my_coroutine);

        __TBB_ASSERT(my_state == co_executing, nullptr);
    }
};

// Entry point run by a newly created coroutine; defined elsewhere.
#if _WIN32 || _WIN64
/* [[noreturn]] */ void __stdcall co_local_wait_for_all(void* arg) noexcept;
#else
/* [[noreturn]] */ void co_local_wait_for_all(unsigned hi, unsigned lo) noexcept;
#endif

#if __TBB_RESUMABLE_TASKS_USE_THREADS
void handle_perror(int error_code, const char* what);

//! Calls handle_perror() when error_code is non-zero.
inline void check(int error_code, const char* routine) {
    if (error_code) {
        handle_perror(error_code, routine);
    }
}

using thread_data_t = std::pair;

//! Body of the thread that backs a coroutine.
/** d points to the thread_data_t built in create_coroutine(). The thread copies the argument,
    marks the coroutine inactive, clears data.second and notifies the creator, then sleeps on
    the condition variable until my_is_active becomes true. After waking it installs
    c.my_thread_data as this thread's data and runs co_local_wait_for_all(). **/
#if _WIN32 || _WIN64
inline unsigned WINAPI coroutine_thread_func(void* d)
#else
inline void* coroutine_thread_func(void* d)
#endif
{
    thread_data_t& data = *static_cast(d);
    coroutine_type& c = data.first;
    void* arg = data.second;
    {
        std::unique_lock lock(c.my_mutex);
        __TBB_ASSERT(c.my_thread_data == nullptr, nullptr);
        c.my_is_active = false;

        // We have read the data; notify the waiting thread
        data.second = nullptr;
        c.my_condvar.notify_one();

        c.my_condvar.wait(lock, [&c] { return c.my_is_active == true; });
    }
    __TBB_ASSERT(c.my_thread_data != nullptr, nullptr);
    governor::set_thread_data(*c.my_thread_data);

#if _WIN32 || _WIN64
    co_local_wait_for_all(arg);

    return 0;
#else
    // The argument pointer is passed as two unsigned halves, matching co_local_wait_for_all(hi, lo).
    std::uintptr_t addr = std::uintptr_t(arg);
    unsigned lo = unsigned(addr);
    unsigned hi = unsigned(std::uint64_t(addr) >> 32);
    __TBB_ASSERT(sizeof(addr) == 8 || hi == 0, nullptr);

    co_local_wait_for_all(hi, lo);

    return nullptr;
#endif
};

//! Thread-based back end: starts a thread running coroutine_thread_func() and waits until it has read arg.
/** stack_size is passed to the thread-creation call (on the pthread path only when it is > 0).
    data lives on this function's stack, which is why the function blocks until the new thread
    signals that it has consumed it. **/
inline void create_coroutine(coroutine_type& c, std::size_t stack_size, void* arg) {
    thread_data_t data{ c, arg };

#if _WIN32 || _WIN64
    c.my_thread = (HANDLE)_beginthreadex(nullptr, unsigned(stack_size), coroutine_thread_func, &data, STACK_SIZE_PARAM_IS_A_RESERVATION, nullptr);
    if (!c.my_thread) {
        handle_perror(0, "create_coroutine: _beginthreadex failed\n");
    }
#else
    pthread_attr_t s;
    check(pthread_attr_init(&s), "pthread_attr_init has failed");
    if (stack_size > 0) {
        check(pthread_attr_setstacksize(&s, stack_size), "pthread_attr_setstack_size has failed");
    }
    check(pthread_create(&c.my_thread, &s, coroutine_thread_func, &data), "pthread_create has failed");
    check(pthread_attr_destroy(&s), "pthread_attr_destroy has failed");
#endif

    // Wait for the just created thread to read the data
    // NOTE(review): the wait tests the local arg while the thread clears data.second; this presumably
    // works because thread_data_t holds a reference to arg - its template arguments are not visible here.
    std::unique_lock lock(c.my_mutex);
    c.my_condvar.wait(lock, [&arg] { return arg == nullptr; });
}

//! Thread-based back end: records the calling thread as the coroutine's thread.
inline void current_coroutine(coroutine_type& c) {
#if _WIN32 || _WIN64
    c.my_thread = GetCurrentThread();
#else
    c.my_thread = pthread_self();
#endif
}

//! Thread-based back end: hands this thread's thread_data to new_coroutine, wakes it, and
//! blocks until prev_coroutine is made active again.
inline void swap_coroutine(coroutine_type& prev_coroutine, coroutine_type& new_coroutine) {
    thread_data* td = governor::get_thread_data();
    __TBB_ASSERT(prev_coroutine.my_is_active == true, "The current thread should be active");

    // Detach our state before notifying the other thread
    // (because we might be notified just after other thread notification)
    prev_coroutine.my_thread_data = nullptr;
    prev_coroutine.my_is_active = false;
    governor::clear_thread_data();

    {
        std::unique_lock lock(new_coroutine.my_mutex);
        __TBB_ASSERT(new_coroutine.my_is_active == false, "The sleeping thread should not be active");
        __TBB_ASSERT(new_coroutine.my_thread_data == nullptr, "The sleeping thread should not be active");

        new_coroutine.my_thread_data = td;
        new_coroutine.my_is_active = true;
        new_coroutine.my_condvar.notify_one();
    }

    std::unique_lock lock(prev_coroutine.my_mutex);
    prev_coroutine.my_condvar.wait(lock, [&prev_coroutine] { return prev_coroutine.my_is_active == true; });
    // NOTE(review): this assertion queries governor::get_thread_data() right after clear_thread_data()
    // was called above, while the next line dereferences prev_coroutine.my_thread_data, which the
    // assertion does not check - confirm that this is the intended condition.
    __TBB_ASSERT(governor::get_thread_data() != nullptr, nullptr);
    governor::set_thread_data(*prev_coroutine.my_thread_data);
}

//! Thread-based back end: wakes the sleeping coroutine thread and waits for it to finish.
inline void destroy_coroutine(coroutine_type& c) {
    {
        std::unique_lock lock(c.my_mutex);
        __TBB_ASSERT(c.my_thread_data == nullptr, "The sleeping thread should not be active");
        __TBB_ASSERT(c.my_is_active == false, "The sleeping thread should not be active");
        c.my_is_active = true;
        c.my_condvar.notify_one();
    }
#if _WIN32 || _WIN64
    WaitForSingleObject(c.my_thread, INFINITE);
    CloseHandle(c.my_thread);
#else
    check(pthread_join(c.my_thread, nullptr), "pthread_join has failed");
#endif
}
#elif _WIN32 || _WIN64
//! Fiber back end: creates a fiber that runs co_local_wait_for_all(arg).
inline void create_coroutine(coroutine_type& c, std::size_t stack_size, void* arg) {
    __TBB_ASSERT(arg, nullptr);
    c = CreateFiber(stack_size, co_local_wait_for_all, arg);
    __TBB_ASSERT(c, nullptr);
}

//! Fiber back end: stores the current fiber in c, converting the thread to a fiber first if needed.
inline void current_coroutine(coroutine_type& c) {
    c = IsThreadAFiber() ? GetCurrentFiber() :
        ConvertThreadToFiberEx(nullptr, FIBER_FLAG_FLOAT_SWITCH);
    __TBB_ASSERT(c, nullptr);
}

//! Fiber back end: records the current fiber in prev_coroutine and switches to new_coroutine.
inline void swap_coroutine(coroutine_type& prev_coroutine, coroutine_type& new_coroutine) {
    if (!IsThreadAFiber()) {
        ConvertThreadToFiberEx(nullptr, FIBER_FLAG_FLOAT_SWITCH);
    }
    __TBB_ASSERT(new_coroutine, nullptr);
    prev_coroutine = GetCurrentFiber();
    __TBB_ASSERT(prev_coroutine, nullptr);
    SwitchToFiber(new_coroutine);
}

//! Fiber back end: deletes the fiber.
inline void destroy_coroutine(coroutine_type& c) {
    __TBB_ASSERT(c, nullptr);
    DeleteFiber(c);
}
#else // !(_WIN32 || _WIN64)

//! ucontext back end: maps a guarded stack and prepares a context that runs co_local_wait_for_all.
/** stack_size is rounded up to a multiple of the page size; one extra page is mapped on each
    side and left PROT_NONE. arg is split into two unsigned halves and handed to makecontext().
    NOTE(review): mmap/mprotect/getcontext failures are only checked through assertions. **/
inline void create_coroutine(coroutine_type& c, std::size_t stack_size, void* arg) {
    const std::size_t REG_PAGE_SIZE = governor::default_page_size();
    const std::size_t page_aligned_stack_size = (stack_size + (REG_PAGE_SIZE - 1)) & ~(REG_PAGE_SIZE - 1);
    const std::size_t protected_stack_size = page_aligned_stack_size + 2 * REG_PAGE_SIZE;

    // Allocate the stack with protection property
    std::uintptr_t stack_ptr = (std::uintptr_t)mmap(nullptr, protected_stack_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
    __TBB_ASSERT((void*)stack_ptr != MAP_FAILED, nullptr);

    // Allow read write on our stack (guarded pages are still protected)
    int err = mprotect((void*)(stack_ptr + REG_PAGE_SIZE), page_aligned_stack_size, PROT_READ | PROT_WRITE);
    __TBB_ASSERT_EX(!err, nullptr);

    // Remember the stack state
    c.my_stack = (void*)(stack_ptr + REG_PAGE_SIZE);
    c.my_stack_size = page_aligned_stack_size;

    err = getcontext(&c.my_context);
    __TBB_ASSERT_EX(!err, nullptr);

    c.my_context.uc_link = nullptr;
    // cast to char* to disable FreeBSD clang-3.4.1 'incompatible type' error
    c.my_context.uc_stack.ss_sp = (char*)c.my_stack;
    c.my_context.uc_stack.ss_size = c.my_stack_size;
    c.my_context.uc_stack.ss_flags = 0;

    typedef void(*coroutine_func_t)();

    // Split the pointer into two unsigned values: low 32 bits and the bits above them.
    std::uintptr_t addr = std::uintptr_t(arg);
    unsigned lo = unsigned(addr);
    unsigned hi = unsigned(std::uint64_t(addr) >> 32);
    __TBB_ASSERT(sizeof(addr) == 8 || hi == 0, nullptr);

    makecontext(&c.my_context, (coroutine_func_t)co_local_wait_for_all, 2, hi, lo);
}

//! ucontext back end: captures the calling context into c.
inline void current_coroutine(coroutine_type& c) {
    int err = getcontext(&c.my_context);
    __TBB_ASSERT_EX(!err, nullptr);
}

//! ucontext back end: saves the current context into prev_coroutine and activates new_coroutine.
inline void swap_coroutine(coroutine_type& prev_coroutine, coroutine_type& new_coroutine) {
    int err = swapcontext(&prev_coroutine.my_context, &new_coroutine.my_context);
    __TBB_ASSERT_EX(!err, nullptr);
}

//! ucontext back end: unmaps the stack together with both guard pages and clears the bookkeeping.
inline void destroy_coroutine(coroutine_type& c) {
    const std::size_t REG_PAGE_SIZE = governor::default_page_size();
    // Free stack memory with guarded pages
    munmap((void*)((std::uintptr_t)c.my_stack - REG_PAGE_SIZE), c.my_stack_size + 2 * REG_PAGE_SIZE);
    // Clear the stack state afterwards
    c.my_stack = nullptr;
    c.my_stack_size = 0;
}

#if __APPLE__
    #if __INTEL_COMPILER
        #pragma warning(pop) // 1478 warning
    #elif __clang__
        #pragma clang diagnostic pop // "-Wdeprecated-declarations"
    #endif
#endif

#endif // _WIN32 || _WIN64

} // namespace r1
} // namespace detail
} // namespace tbb

#endif /* __TBB_RESUMABLE_TASKS */

#endif /* _TBB_co_context_H */

================================================
FILE: third-party/tbb/src/tbb/concurrent_bounded_queue.cpp
================================================
/*
    Copyright (c) 2020-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include "oneapi/tbb/detail/_utils.h"
#include "oneapi/tbb/concurrent_queue.h"
#include "oneapi/tbb/cache_aligned_allocator.h"
#include "concurrent_monitor.h"

namespace tbb {
namespace detail {
namespace r1 {

// Number of concurrent_monitor objects stored right after each queue representation.
static constexpr std::size_t monitors_number = 2;

//! Allocates one cache-aligned chunk holding queue_rep_size bytes followed by monitors_number monitors.
/** The monitors are constructed in place at offset queue_rep_size; the first queue_rep_size
    bytes are returned untouched. Returns the start of the chunk. **/
std::uint8_t* __TBB_EXPORTED_FUNC allocate_bounded_queue_rep( std::size_t queue_rep_size )
{
    std::size_t monitors_mem_size = sizeof(concurrent_monitor) * monitors_number;
    std::uint8_t* mem = static_cast(cache_aligned_allocate(queue_rep_size + monitors_mem_size));

    concurrent_monitor* monitors = reinterpret_cast(mem + queue_rep_size);
    for (std::size_t i = 0; i < monitors_number; ++i) {
        new (monitors + i) concurrent_monitor();
    }

    return mem;
}

//! Destroys the monitors located at mem + queue_rep_size and frees the chunk.
/** queue_rep_size must be the value that was passed to allocate_bounded_queue_rep(), because
    it is used to locate the monitors. **/
void __TBB_EXPORTED_FUNC deallocate_bounded_queue_rep( std::uint8_t* mem, std::size_t queue_rep_size )
{
    concurrent_monitor* monitors = reinterpret_cast(mem + queue_rep_size);
    for (std::size_t i = 0; i < monitors_number; ++i) {
        monitors[i].~concurrent_monitor();
    }

    cache_aligned_deallocate(mem);
}

//! Waits on monitors[monitor_tag], passing target (converted to std::uintptr_t) as the wait context.
/** The monitor is given the negation of predicate as its condition. **/
void __TBB_EXPORTED_FUNC wait_bounded_queue_monitor( concurrent_monitor* monitors, std::size_t monitor_tag,
                                                        std::ptrdiff_t target, d1::delegate_base& predicate )
{
    __TBB_ASSERT(monitor_tag < monitors_number, nullptr);
    concurrent_monitor& monitor = monitors[monitor_tag];

    monitor.wait([&] { return !predicate(); }, std::uintptr_t(target));
}

//! Calls abort_all() on the items-available and the slots-available monitors.
void __TBB_EXPORTED_FUNC abort_bounded_queue_monitors( concurrent_monitor* monitors ) {
    concurrent_monitor& items_avail = monitors[d2::cbq_items_avail_tag];
    concurrent_monitor& slots_avail = monitors[d2::cbq_slots_avail_tag];

    items_avail.abort_all();
    slots_avail.abort_all();
}

//! Predicate that is true for every wait context whose value is <= the stored ticket.
struct predicate_leq {
    std::size_t my_ticket;
    predicate_leq( std::size_t ticket ) : my_ticket(ticket) {}
    bool operator() ( std::uintptr_t ticket ) const { return static_cast(ticket) <= my_ticket; }
};

//! Notifies the waiters of monitors[monitor_tag] whose context satisfies predicate_leq(ticket).
void __TBB_EXPORTED_FUNC notify_bounded_queue_monitor( concurrent_monitor* monitors,
                                                            std::size_t monitor_tag, std::size_t ticket)
{
    __TBB_ASSERT(monitor_tag < monitors_number, nullptr);
    concurrent_monitor& monitor = monitors[monitor_tag];
    monitor.notify(predicate_leq(ticket));
}

} // namespace r1
} // namespace detail
} // namespace tbb

================================================
FILE: third-party/tbb/src/tbb/concurrent_monitor.h
================================================
/*
    Copyright (c) 2005-2024 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef __TBB_concurrent_monitor_H
#define __TBB_concurrent_monitor_H

#include "oneapi/tbb/spin_mutex.h"
#include "oneapi/tbb/detail/_exception.h"
#include "oneapi/tbb/detail/_aligned_space.h"
#include "concurrent_monitor_mutex.h"
#include "semaphore.h"

// NOTE(review): the header name of the bare #include directive below is missing in this copy of the file.
#include

namespace tbb {
namespace detail {
namespace r1 {

//!
//! Circular doubly-linked list with sentinel
/** head.next points to the front and head.prev points to the back */
class circular_doubly_linked_list_with_sentinel {
public:
    struct base_node {
        base_node* next;
        base_node* prev;

        constexpr base_node(base_node* n, base_node* p) : next(n), prev(p) {}
        // Default-constructed nodes get both links filled with the 0xcdcdcdcd pattern.
        explicit base_node() : next((base_node*)(uintptr_t)0xcdcdcdcd), prev((base_node*)(uintptr_t)0xcdcdcdcd) {}
    };

    // ctor
    // An empty list is a sentinel whose next and prev both point at itself.
    constexpr circular_doubly_linked_list_with_sentinel() : count(0), head(&head, &head) {}

    circular_doubly_linked_list_with_sentinel(const circular_doubly_linked_list_with_sentinel&) = delete;
    circular_doubly_linked_list_with_sentinel& operator=(const circular_doubly_linked_list_with_sentinel&) = delete;

    //! Number of nodes, read with a relaxed load.
    inline std::size_t size() const { return count.load(std::memory_order_relaxed); }
    inline bool empty() const { return size() == 0; }
    //! First node, or end() when the list is empty.
    inline base_node* front() const { return head.next; }
    //! Last node, or end() when the list is empty.
    inline base_node* last() const { return head.prev; }
    //! The sentinel; iteration in either direction stops when it is reached.
    inline const base_node* end() const { return &head; }

    //! add to the back of the list
    inline void add( base_node* n ) {
        // The counter is updated with a separate relaxed load and store, not an atomic read-modify-write.
        count.store(count.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed);
        n->prev = head.prev;
        n->next = &head;
        head.prev->next = n;
        head.prev = n;
    }

    //! remove node 'n'
    /** Only the neighbours are relinked; n.next and n.prev themselves are left unchanged. */
    inline void remove( base_node& n ) {
        __TBB_ASSERT(count.load(std::memory_order_relaxed) > 0, "attempt to remove an item from an empty list");
        count.store(count.load( std::memory_order_relaxed ) - 1, std::memory_order_relaxed);
        n.prev->next = n.next;
        n.next->prev = n.prev;
    }

    //! move all elements to 'lst' and initialize the 'this' list
    /** Does nothing when this list is empty. Otherwise the previous count and links of 'lst'
        are overwritten, not merged. */
    inline void flush_to( circular_doubly_linked_list_with_sentinel& lst ) {
        const std::size_t l_count = size();
        if (l_count > 0) {
            lst.count.store(l_count, std::memory_order_relaxed);
            lst.head.next = head.next;
            lst.head.prev = head.prev;
            head.next->prev = &lst.head;
            head.prev->next = &lst.head;
            clear();
        }
    }

    //! Resets the list to the empty state without touching the nodes it referenced.
    void clear() {
        head.next = &head;
        head.prev = &head;
        count.store(0, std::memory_order_relaxed);
    }
private:
    std::atomic count;
    base_node head;
};

using base_list = circular_doubly_linked_list_with_sentinel;
using base_node = circular_doubly_linked_list_with_sentinel::base_node;

template class concurrent_monitor_base;

//! List node that carries a wait context plus the bookkeeping flags used by concurrent_monitor_base.
template
class wait_node : public base_node {
public:

#if __TBB_GLIBCXX_VERSION >= 40800 && __TBB_GLIBCXX_VERSION < 40900
    wait_node(Context ctx) : my_context(ctx), my_is_in_list(false) {}
#else
    wait_node(Context ctx) : my_context(ctx) {}
#endif

    virtual ~wait_node() = default;

    //! Marks the node as initialized; must not be called on an already initialized node.
    virtual void init() {
        __TBB_ASSERT(!my_initialized, nullptr);
        my_initialized = true;
    }

    //! Blocks the caller until notify() is called.
    virtual void wait() = 0;

    //! Clears the skipped-wakeup flag; may only be called while that flag is set.
    virtual void reset() {
        __TBB_ASSERT(my_skipped_wakeup, nullptr);
        my_skipped_wakeup = false;
    }

    //! Wakes the thread blocked in wait().
    virtual void notify() = 0;

protected:
    friend class concurrent_monitor_base;
    friend class thread_data;

    Context my_context{};
#if __TBB_GLIBCXX_VERSION >= 40800 && __TBB_GLIBCXX_VERSION < 40900
    std::atomic my_is_in_list;
#else
    std::atomic my_is_in_list{false};
#endif

    bool my_initialized{false};
    bool my_skipped_wakeup{false};
    bool my_aborted{false};
    unsigned my_epoch{0};
};

//! wait_node whose wait()/notify() are implemented with a binary_semaphore built lazily in init().
template
class sleep_node : public wait_node {
    using base_type = wait_node;
public:
    using base_type::base_type;

    ~sleep_node() override {
        if (this->my_initialized) {
            // Consume a pending wakeup, if any, before the semaphore is destroyed.
            if (this->my_skipped_wakeup) semaphore().P();
            semaphore().~binary_semaphore();
        }
    }

    //! The semaphore stored in sema; only valid once init() has run.
    binary_semaphore& semaphore() { return *sema.begin(); }

    //! Constructs the semaphore in place on first use; later calls do nothing.
    void init() override {
        if (!this->my_initialized) {
            new (sema.begin()) binary_semaphore;
            base_type::init();
        }
    }

    //! Blocks on the semaphore; throws user_abort when the node was marked aborted.
    void wait() override {
        __TBB_ASSERT(this->my_initialized,
            "Use of commit_wait() without prior prepare_wait()");
        semaphore().P();
        __TBB_ASSERT(!this->my_is_in_list.load(std::memory_order_relaxed), "Still in the queue?");
        if (this->my_aborted)
            throw_exception(exception_id::user_abort);
    }

    //! Clears the skipped-wakeup flag and consumes the corresponding semaphore signal.
    void reset() override {
        base_type::reset();
        semaphore().P();
    }

    void notify() override {
        semaphore().V();
    }

private:
    // Raw storage for the semaphore; constructed in init() and destroyed in the destructor.
    tbb::detail::aligned_space sema;
};

//! concurrent_monitor
/** fine-grained concurrent_monitor implementation */
template
class concurrent_monitor_base {
public:
    //! ctor
    constexpr concurrent_monitor_base() {}
    //! dtor
    ~concurrent_monitor_base() = default;

    concurrent_monitor_base(const concurrent_monitor_base&) = delete;
    concurrent_monitor_base& operator=(const concurrent_monitor_base&) = delete;

    //! prepare wait by inserting 'thr' into the wait queue
    /** Initializes the node on first use, or drains a previously skipped wakeup; then records
        the current epoch in the node and appends it to the wait set under the mutex. */
    void prepare_wait( wait_node& node) {
        // TODO: consider making even more lazy instantiation of the semaphore, that is only when it is actually needed, e.g. move it in node::wait()
        if (!node.my_initialized) {
            node.init();
        }
        // this is good place to pump previous skipped wakeup
        else if (node.my_skipped_wakeup) {
            node.reset();
        }

        node.my_is_in_list.store(true, std::memory_order_relaxed);

        {
            concurrent_monitor_mutex::scoped_lock l(my_mutex);
            node.my_epoch = my_epoch.load(std::memory_order_relaxed);
            my_waitset.add(&node);
        }

        // Prepare wait guarantees Write Read memory barrier.
        // In C++ only full fence covers this type of barrier.
        atomic_fence_seq_cst();
    }

    //! Commit wait if event count has not changed; otherwise, cancel wait.
    /** Returns true if committed, false if canceled. */
    inline bool commit_wait( wait_node& node ) {
        const bool do_it = node.my_epoch == my_epoch.load(std::memory_order_relaxed);
        // this check is just an optimization
        if (do_it) {
            node.wait();
        } else {
            cancel_wait( node );
        }
        return do_it;
    }

    //! Cancel the wait. Removes the thread from the wait queue if not removed yet.
void cancel_wait( wait_node& node ) {
    // possible skipped wakeup will be pumped in the following prepare_wait()
    node.my_skipped_wakeup = true;
    // try to remove node from waitset
    // Cancel wait guarantees acquire memory barrier.
    bool in_list = node.my_is_in_list.load(std::memory_order_acquire);
    if (in_list) {
        concurrent_monitor_mutex::scoped_lock l(my_mutex);
        // Re-check under the lock: a notifier may have removed the node in the meantime.
        if (node.my_is_in_list.load(std::memory_order_relaxed)) {
            my_waitset.remove(node);
            // node is removed from waitset, so there will be no wakeup
            node.my_is_in_list.store(false, std::memory_order_relaxed);
            node.my_skipped_wakeup = false;
        }
    }
}

//! Wait for a condition to be satisfied with waiting-on my_context
/** Repeats prepare_wait / pred / commit_wait. Returns true as soon as a wait has been
    committed (the thread actually slept and was woken); returns false, after cancelling the
    wait, once pred evaluates to true. */
template
bool wait(Pred&& pred, NodeType&& node) {
    prepare_wait(node);
    while (!guarded_call(std::forward(pred), node)) {
        if (commit_wait(node)) {
            return true;
        }

        prepare_wait(node);
    }

    cancel_wait(node);
    return false;
}

//! Notify one thread about the event
void notify_one() {
    atomic_fence_seq_cst();
    notify_one_relaxed();
}

//! Notify one thread about the event. Relaxed version.
/** Returns immediately when the wait set looks empty. Otherwise advances the epoch, detaches
    the front node under the mutex and notifies it after the mutex has been released. */
void notify_one_relaxed() {
    if (my_waitset.empty()) {
        return;
    }

    base_node* n;
    const base_node* end = my_waitset.end();
    {
        concurrent_monitor_mutex::scoped_lock l(my_mutex);
        my_epoch.store(my_epoch.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed);
        n = my_waitset.front();
        if (n != end) {
            my_waitset.remove(*n);

// GCC 12.x-14.x issues a warning here that to_wait_node(n)->my_is_in_list might have size 0, since n is
// a base_node pointer. (This cannot happen, because only wait_node pointers are added to my_waitset.)
#if (__TBB_GCC_VERSION >= 120100 && __TBB_GCC_VERSION < 150000 ) && !__clang__ && !__INTEL_COMPILER
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif
            to_wait_node(n)->my_is_in_list.store(false, std::memory_order_relaxed);
#if (__TBB_GCC_VERSION >= 120100 && __TBB_GCC_VERSION < 150000 ) && !__clang__ && !__INTEL_COMPILER
#pragma GCC diagnostic pop
#endif
        }
    }

    if (n != end) {
        to_wait_node(n)->notify();
    }
}

//! Notify all waiting threads of the event
void notify_all() {
    atomic_fence_seq_cst();
    notify_all_relaxed();
}

//! Notify all waiting threads of the event; Relaxed version
/** Moves the whole wait set into a local list under the mutex, then notifies every node
    after the mutex has been released. */
void notify_all_relaxed() {
    if (my_waitset.empty()) {
        return;
    }

    base_list temp;
    const base_node* end;
    {
        concurrent_monitor_mutex::scoped_lock l(my_mutex);
        my_epoch.store(my_epoch.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed);
        // TODO: Possible optimization, don't change node state under lock, just do flush
        my_waitset.flush_to(temp);
        end = temp.end();
        for (base_node* n = temp.front(); n != end; n = n->next) {
            to_wait_node(n)->my_is_in_list.store(false, std::memory_order_relaxed);
        }
    }

    base_node* nxt;
    for (base_node* n = temp.front(); n != end; n=nxt) {
        // Read the link before notifying; the node is not touched again after notify().
        nxt = n->next;
        to_wait_node(n)->notify();
    }
#if TBB_USE_ASSERT
    temp.clear();
#endif
}

//! Notify waiting threads of the event that satisfies the given predicate
template
void notify( const P& predicate ) {
    atomic_fence_seq_cst();
    notify_relaxed( predicate );
}

//! Notify waiting threads of the event that satisfies the given predicate;
//! the predicate is called under the lock. Relaxed version.
template void notify_relaxed( const P& predicate ) { if (my_waitset.empty()) { return; } base_list temp; base_node* nxt; const base_node* end = my_waitset.end(); { concurrent_monitor_mutex::scoped_lock l(my_mutex); my_epoch.store(my_epoch.load( std::memory_order_relaxed ) + 1, std::memory_order_relaxed); for (base_node* n = my_waitset.last(); n != end; n = nxt) { nxt = n->prev; auto* node = static_cast*>(n); if (predicate(node->my_context)) { my_waitset.remove(*n); node->my_is_in_list.store(false, std::memory_order_relaxed); temp.add(n); } } } end = temp.end(); for (base_node* n=temp.front(); n != end; n = nxt) { nxt = n->next; to_wait_node(n)->notify(); } #if TBB_USE_ASSERT temp.clear(); #endif } //! Notify waiting threads of the event that satisfies the given predicate; //! the predicate is called under the lock. Relaxed version. template void notify_one_relaxed( const P& predicate ) { if (my_waitset.empty()) { return; } base_node* tmp = nullptr; base_node* next{}; const base_node* end = my_waitset.end(); { concurrent_monitor_mutex::scoped_lock l(my_mutex); my_epoch.store(my_epoch.load( std::memory_order_relaxed ) + 1, std::memory_order_relaxed); for (base_node* n = my_waitset.last(); n != end; n = next) { next = n->prev; auto* node = static_cast*>(n); if (predicate(node->my_context)) { my_waitset.remove(*n); node->my_is_in_list.store(false, std::memory_order_relaxed); tmp = n; break; } } } if (tmp) { to_wait_node(tmp)->notify(); } } //! Abort any sleeping threads at the time of the call void abort_all() { atomic_fence_seq_cst(); abort_all_relaxed(); } //! 
Abort any sleeping threads at the time of the call; Relaxed version void abort_all_relaxed() { if (my_waitset.empty()) { return; } base_list temp; const base_node* end; { concurrent_monitor_mutex::scoped_lock l(my_mutex); my_epoch.store(my_epoch.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed); my_waitset.flush_to(temp); end = temp.end(); for (base_node* n = temp.front(); n != end; n = n->next) { to_wait_node(n)->my_is_in_list.store(false, std::memory_order_relaxed); } } base_node* nxt; for (base_node* n = temp.front(); n != end; n = nxt) { nxt = n->next; to_wait_node(n)->my_aborted = true; to_wait_node(n)->notify(); } #if TBB_USE_ASSERT temp.clear(); #endif } void destroy() { this->abort_all(); my_mutex.destroy(); __TBB_ASSERT(this->my_waitset.empty(), "waitset not empty?"); } private: template bool guarded_call(Pred&& predicate, NodeType& node) { bool res = false; tbb::detail::d0::try_call( [&] { res = std::forward(predicate)(); }).on_exception( [&] { cancel_wait(node); }); return res; } concurrent_monitor_mutex my_mutex{}; base_list my_waitset{}; std::atomic my_epoch{}; wait_node* to_wait_node( base_node* node ) { return static_cast*>(node); } }; class concurrent_monitor : public concurrent_monitor_base { using base_type = concurrent_monitor_base; public: using base_type::base_type; ~concurrent_monitor() { destroy(); } /** per-thread descriptor for concurrent_monitor */ using thread_context = sleep_node; }; } // namespace r1 } // namespace detail } // namespace tbb #endif /* __TBB_concurrent_monitor_H */ ================================================ FILE: third-party/tbb/src/tbb/concurrent_monitor_mutex.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_monitor_mutex_H #define __TBB_monitor_mutex_H #include "oneapi/tbb/detail/_utils.h" #include "oneapi/tbb/detail/_aligned_space.h" #include "semaphore.h" #include namespace tbb { namespace detail { namespace r1 { class concurrent_monitor_mutex { public: using scoped_lock = std::lock_guard; constexpr concurrent_monitor_mutex() {} ~concurrent_monitor_mutex() = default; void destroy() { #if !__TBB_USE_FUTEX if (my_init_flag.load(std::memory_order_relaxed)) { get_semaphore().~semaphore(); } #endif } void lock() { auto wakeup_condition = [&] { return my_flag.load(std::memory_order_relaxed) == 0; }; while (my_flag.exchange(1)) { if (!timed_spin_wait_until(wakeup_condition)) { ++my_waiters; while (!wakeup_condition()) { wait(); } --my_waiters; } } } void unlock() { my_flag.exchange(0); // full fence, so the next load is relaxed if (my_waiters.load(std::memory_order_relaxed)) { wakeup(); } } private: void wait() { #if __TBB_USE_FUTEX futex_wait(&my_flag, 1); #else get_semaphore().P(); #endif } void wakeup() { #if __TBB_USE_FUTEX futex_wakeup_one(&my_flag); #else get_semaphore().V(); #endif } // The flag should be int for the futex operations std::atomic my_flag{0}; std::atomic my_waiters{0}; #if !__TBB_USE_FUTEX semaphore& get_semaphore() { if (!my_init_flag.load(std::memory_order_acquire)) { std::lock_guard lock(my_init_mutex); if (!my_init_flag.load(std::memory_order_relaxed)) { new (my_semaphore.begin()) semaphore(); my_init_flag.store(true, std::memory_order_release); } } return *my_semaphore.begin(); } static std::mutex my_init_mutex; std::atomic my_init_flag{false}; 
aligned_space my_semaphore{}; #endif }; } // namespace r1 } // namespace detail } // namespace tbb #endif // __TBB_monitor_mutex_H ================================================ FILE: third-party/tbb/src/tbb/def/lin32-tbb.def ================================================ /* Copyright (c) 2005-2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ { global: /* Assertions (assert.cpp) */ _ZN3tbb6detail2r117assertion_failureEPKciS3_S3_; /* ITT (profiling.cpp) */ _ZN3tbb6detail2r112itt_task_endENS0_2d115itt_domain_enumE; _ZN3tbb6detail2r114itt_region_endENS0_2d115itt_domain_enumEPvy; _ZN3tbb6detail2r114itt_task_beginENS0_2d115itt_domain_enumEPvyS4_yNS0_2d021string_resource_indexE; _ZN3tbb6detail2r115call_itt_notifyEiPv; _ZN3tbb6detail2r115create_itt_syncEPvPKcS4_; _ZN3tbb6detail2r116itt_region_beginENS0_2d115itt_domain_enumEPvyS4_yNS0_2d021string_resource_indexE; _ZN3tbb6detail2r116itt_relation_addENS0_2d115itt_domain_enumEPvyNS0_2d012itt_relationES4_y; _ZN3tbb6detail2r117itt_set_sync_nameEPvPKc; _ZN3tbb6detail2r119itt_make_task_groupENS0_2d115itt_domain_enumEPvyS4_yNS0_2d021string_resource_indexE; _ZN3tbb6detail2r120itt_metadata_str_addENS0_2d115itt_domain_enumEPvyNS0_2d021string_resource_indexEPKc; _ZN3tbb6detail2r120itt_metadata_ptr_addENS0_2d115itt_domain_enumEPvyNS0_2d021string_resource_indexES4_; /* Allocators (allocator.cpp) */ _ZN3tbb6detail2r115allocate_memoryEj; _ZN3tbb6detail2r117deallocate_memoryEPv; _ZN3tbb6detail2r122cache_aligned_allocateEj; 
_ZN3tbb6detail2r124cache_aligned_deallocateEPv; _ZN3tbb6detail2r115cache_line_sizeEv; _ZN3tbb6detail2r117is_tbbmalloc_usedEv; /* Small object pool (small_object_pool.cpp) */ _ZN3tbb6detail2r18allocateERPNS0_2d117small_object_poolEj; _ZN3tbb6detail2r18allocateERPNS0_2d117small_object_poolEjRKNS2_14execution_dataE; _ZN3tbb6detail2r110deallocateERNS0_2d117small_object_poolEPvj; _ZN3tbb6detail2r110deallocateERNS0_2d117small_object_poolEPvjRKNS2_14execution_dataE; /* Error handling (exception.cpp) */ _ZN3tbb6detail2r115throw_exceptionENS0_2d012exception_idE; _ZTIN3tbb6detail2r114bad_last_allocE; _ZTVN3tbb6detail2r114bad_last_allocE; _ZTIN3tbb6detail2r112missing_waitE; _ZTVN3tbb6detail2r112missing_waitE; _ZTIN3tbb6detail2r110user_abortE; _ZTVN3tbb6detail2r110user_abortE; _ZTIN3tbb6detail2r111unsafe_waitE; _ZTVN3tbb6detail2r111unsafe_waitE; /* RTM Mutex (rtm_mutex.cpp) */ _ZN3tbb6detail2r17acquireERNS0_2d19rtm_mutexERNS3_11scoped_lockEb; _ZN3tbb6detail2r17releaseERNS0_2d19rtm_mutex11scoped_lockE; _ZN3tbb6detail2r111try_acquireERNS0_2d19rtm_mutexERNS3_11scoped_lockE; /* RTM RW Mutex (rtm_rw_mutex.cpp) */ _ZN3tbb6detail2r114acquire_readerERNS0_2d112rtm_rw_mutexERNS3_11scoped_lockEb; _ZN3tbb6detail2r114acquire_writerERNS0_2d112rtm_rw_mutexERNS3_11scoped_lockEb; _ZN3tbb6detail2r118try_acquire_readerERNS0_2d112rtm_rw_mutexERNS3_11scoped_lockE; _ZN3tbb6detail2r118try_acquire_writerERNS0_2d112rtm_rw_mutexERNS3_11scoped_lockE; _ZN3tbb6detail2r17releaseERNS0_2d112rtm_rw_mutex11scoped_lockE; _ZN3tbb6detail2r17upgradeERNS0_2d112rtm_rw_mutex11scoped_lockE; _ZN3tbb6detail2r19downgradeERNS0_2d112rtm_rw_mutex11scoped_lockE; /* Tasks and partitioners (task.cpp) */ _ZN3tbb6detail2r17suspendEPFvPvPNS1_18suspend_point_typeEES2_; _ZN3tbb6detail2r16resumeEPNS1_18suspend_point_typeE; _ZN3tbb6detail2r121current_suspend_pointEv; _ZN3tbb6detail2r114notify_waitersEj; _ZN3tbb6detail2r127get_thread_reference_vertexEPNS0_2d126wait_tree_vertex_interfaceE; /* Task dispatcher (task_dispatcher.cpp) */ 
_ZN3tbb6detail2r114execution_slotEPKNS0_2d114execution_dataE; _ZN3tbb6detail2r14waitERNS0_2d112wait_contextERNS2_18task_group_contextE; _ZN3tbb6detail2r15spawnERNS0_2d14taskERNS2_18task_group_contextE; _ZN3tbb6detail2r15spawnERNS0_2d14taskERNS2_18task_group_contextEt; _ZN3tbb6detail2r116execute_and_waitERNS0_2d14taskERNS2_18task_group_contextERNS2_12wait_contextES6_; _ZN3tbb6detail2r16submitERNS0_2d14taskERNS2_18task_group_contextEPNS1_5arenaEj; _ZN3tbb6detail2r115current_contextEv; /* Task group context (task_group_context.cpp) */ _ZN3tbb6detail2r110initializeERNS0_2d118task_group_contextE; _ZN3tbb6detail2r122cancel_group_executionERNS0_2d118task_group_contextE; _ZN3tbb6detail2r128is_group_execution_cancelledERNS0_2d118task_group_contextE; _ZN3tbb6detail2r15resetERNS0_2d118task_group_contextE; _ZN3tbb6detail2r17destroyERNS0_2d118task_group_contextE; _ZN3tbb6detail2r119capture_fp_settingsERNS0_2d118task_group_contextE; /* Task arena (arena.cpp) */ _ZN3tbb6detail2r115max_concurrencyEPKNS0_2d115task_arena_baseE; _ZN3tbb6detail2r110initializeERNS0_2d115task_arena_baseE; _ZN3tbb6detail2r16attachERNS0_2d115task_arena_baseE; _ZN3tbb6detail2r17executeERNS0_2d115task_arena_baseERNS2_13delegate_baseE; _ZN3tbb6detail2r19terminateERNS0_2d115task_arena_baseE; _ZN3tbb6detail2r120isolate_within_arenaERNS0_2d113delegate_baseEi; _ZN3tbb6detail2r17enqueueERNS0_2d14taskEPNS2_15task_arena_baseE; _ZN3tbb6detail2r17enqueueERNS0_2d14taskERNS2_18task_group_contextEPNS2_15task_arena_baseE; _ZN3tbb6detail2r14waitERNS0_2d115task_arena_baseE; _ZN3tbb6detail2r114execution_slotERKNS0_2d115task_arena_baseE; _ZN3tbb6detail2r119exit_parallel_phaseEPNS0_2d115task_arena_baseEj; _ZN3tbb6detail2r120enter_parallel_phaseEPNS0_2d115task_arena_baseEj; /* System topology parsing and threads pinning (governor.cpp) */ _ZN3tbb6detail2r115numa_node_countEv; _ZN3tbb6detail2r117fill_numa_indicesEPi; _ZN3tbb6detail2r115core_type_countEi; _ZN3tbb6detail2r122fill_core_type_indicesEPii; 
_ZN3tbb6detail2r131constraints_default_concurrencyERKNS0_2d111constraintsEi; _ZN3tbb6detail2r128constraints_threads_per_coreERKNS0_2d111constraintsEi; _ZN3tbb6detail2r124numa_default_concurrencyEi; /* Observer (observer_proxy.cpp) */ _ZN3tbb6detail2r17observeERNS0_2d123task_scheduler_observerEb; /* Queuing RW Mutex (queuing_rw_mutex.cpp) */ _ZN3tbb6detail2r111try_acquireERNS0_2d116queuing_rw_mutexERNS3_11scoped_lockEb; _ZN3tbb6detail2r117upgrade_to_writerERNS0_2d116queuing_rw_mutex11scoped_lockE; _ZN3tbb6detail2r119downgrade_to_readerERNS0_2d116queuing_rw_mutex11scoped_lockE; _ZN3tbb6detail2r17acquireERNS0_2d116queuing_rw_mutexERNS3_11scoped_lockEb; _ZN3tbb6detail2r17releaseERNS0_2d116queuing_rw_mutex11scoped_lockE; _ZN3tbb6detail2r19constructERNS0_2d116queuing_rw_mutexE; _ZN3tbb6detail2r19is_writerERKNS0_2d116queuing_rw_mutex11scoped_lockE; /* Global control (global_control.cpp) */ _ZN3tbb6detail2r16createERNS0_2d114global_controlE; _ZN3tbb6detail2r17destroyERNS0_2d114global_controlE; _ZN3tbb6detail2r127global_control_active_valueEi; _ZN3tbb6detail2r18finalizeERNS0_2d121task_scheduler_handleEi; _ZN3tbb6detail2r13getERNS0_2d121task_scheduler_handleE; /* Parallel pipeline (parallel_pipeline.cpp) */ _ZN3tbb6detail2r117parallel_pipelineERNS0_2d118task_group_contextEjRKNS2_11filter_nodeE; _ZN3tbb6detail2r116set_end_of_inputERNS0_2d111base_filterE; /* Concurrent bounded queue (concurrent_bounded_queue.cpp) */ _ZN3tbb6detail2r126allocate_bounded_queue_repEj; _ZN3tbb6detail2r126wait_bounded_queue_monitorEPNS1_18concurrent_monitorEjiRNS0_2d113delegate_baseE; _ZN3tbb6detail2r128abort_bounded_queue_monitorsEPNS1_18concurrent_monitorE; _ZN3tbb6detail2r128deallocate_bounded_queue_repEPhj; _ZN3tbb6detail2r128notify_bounded_queue_monitorEPNS1_18concurrent_monitorEjj; /* Concurrent monitor (address_waiter.cpp) */ _ZN3tbb6detail2r115wait_on_addressEPvRNS0_2d113delegate_baseEj; _ZN3tbb6detail2r117notify_by_addressEPvj; _ZN3tbb6detail2r121notify_by_address_oneEPv; 
_ZN3tbb6detail2r121notify_by_address_allEPv; /* Versioning (version.cpp) */ TBB_runtime_interface_version; TBB_runtime_version; local: /* TODO: fill more precisely */ *; }; ================================================ FILE: third-party/tbb/src/tbb/def/lin64-tbb.def ================================================ /* Copyright (c) 2005-2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ { global: /* Assertions (assert.cpp) */ _ZN3tbb6detail2r117assertion_failureEPKciS3_S3_; /* ITT (profiling.cpp) */ _ZN3tbb6detail2r112itt_task_endENS0_2d115itt_domain_enumE; _ZN3tbb6detail2r114itt_region_endENS0_2d115itt_domain_enumEPvy; _ZN3tbb6detail2r114itt_task_beginENS0_2d115itt_domain_enumEPvyS4_yNS0_2d021string_resource_indexE; _ZN3tbb6detail2r115call_itt_notifyEiPv; _ZN3tbb6detail2r115create_itt_syncEPvPKcS4_; _ZN3tbb6detail2r116itt_region_beginENS0_2d115itt_domain_enumEPvyS4_yNS0_2d021string_resource_indexE; _ZN3tbb6detail2r116itt_relation_addENS0_2d115itt_domain_enumEPvyNS0_2d012itt_relationES4_y; _ZN3tbb6detail2r117itt_set_sync_nameEPvPKc; _ZN3tbb6detail2r119itt_make_task_groupENS0_2d115itt_domain_enumEPvyS4_yNS0_2d021string_resource_indexE; _ZN3tbb6detail2r120itt_metadata_str_addENS0_2d115itt_domain_enumEPvyNS0_2d021string_resource_indexEPKc; _ZN3tbb6detail2r120itt_metadata_ptr_addENS0_2d115itt_domain_enumEPvyNS0_2d021string_resource_indexES4_; /* Allocators (allocator.cpp) */ _ZN3tbb6detail2r115allocate_memoryEm; _ZN3tbb6detail2r117deallocate_memoryEPv; 
_ZN3tbb6detail2r122cache_aligned_allocateEm; _ZN3tbb6detail2r124cache_aligned_deallocateEPv; _ZN3tbb6detail2r115cache_line_sizeEv; _ZN3tbb6detail2r117is_tbbmalloc_usedEv; /* Small object pool (small_object_pool.cpp) */ _ZN3tbb6detail2r18allocateERPNS0_2d117small_object_poolEm; _ZN3tbb6detail2r18allocateERPNS0_2d117small_object_poolEmRKNS2_14execution_dataE; _ZN3tbb6detail2r110deallocateERNS0_2d117small_object_poolEPvm; _ZN3tbb6detail2r110deallocateERNS0_2d117small_object_poolEPvmRKNS2_14execution_dataE; /* Error handling (exception.cpp) */ _ZN3tbb6detail2r115throw_exceptionENS0_2d012exception_idE; _ZTIN3tbb6detail2r114bad_last_allocE; _ZTVN3tbb6detail2r114bad_last_allocE; _ZTIN3tbb6detail2r112missing_waitE; _ZTVN3tbb6detail2r112missing_waitE; _ZTIN3tbb6detail2r110user_abortE; _ZTVN3tbb6detail2r110user_abortE; _ZTIN3tbb6detail2r111unsafe_waitE; _ZTVN3tbb6detail2r111unsafe_waitE; /* RTM Mutex (rtm_mutex.cpp) */ _ZN3tbb6detail2r17acquireERNS0_2d19rtm_mutexERNS3_11scoped_lockEb; _ZN3tbb6detail2r17releaseERNS0_2d19rtm_mutex11scoped_lockE; _ZN3tbb6detail2r111try_acquireERNS0_2d19rtm_mutexERNS3_11scoped_lockE; /* RTM RW Mutex (rtm_rw_mutex.cpp) */ _ZN3tbb6detail2r114acquire_readerERNS0_2d112rtm_rw_mutexERNS3_11scoped_lockEb; _ZN3tbb6detail2r114acquire_writerERNS0_2d112rtm_rw_mutexERNS3_11scoped_lockEb; _ZN3tbb6detail2r118try_acquire_readerERNS0_2d112rtm_rw_mutexERNS3_11scoped_lockE; _ZN3tbb6detail2r118try_acquire_writerERNS0_2d112rtm_rw_mutexERNS3_11scoped_lockE; _ZN3tbb6detail2r17releaseERNS0_2d112rtm_rw_mutex11scoped_lockE; _ZN3tbb6detail2r17upgradeERNS0_2d112rtm_rw_mutex11scoped_lockE; _ZN3tbb6detail2r19downgradeERNS0_2d112rtm_rw_mutex11scoped_lockE; /* Tasks and partitioners (task.cpp) */ _ZN3tbb6detail2r17suspendEPFvPvPNS1_18suspend_point_typeEES2_; _ZN3tbb6detail2r16resumeEPNS1_18suspend_point_typeE; _ZN3tbb6detail2r121current_suspend_pointEv; _ZN3tbb6detail2r114notify_waitersEm; _ZN3tbb6detail2r127get_thread_reference_vertexEPNS0_2d126wait_tree_vertex_interfaceE; 
/* Task dispatcher (task_dispatcher.cpp) */ _ZN3tbb6detail2r114execution_slotEPKNS0_2d114execution_dataE; _ZN3tbb6detail2r14waitERNS0_2d112wait_contextERNS2_18task_group_contextE; _ZN3tbb6detail2r15spawnERNS0_2d14taskERNS2_18task_group_contextE; _ZN3tbb6detail2r15spawnERNS0_2d14taskERNS2_18task_group_contextEt; _ZN3tbb6detail2r116execute_and_waitERNS0_2d14taskERNS2_18task_group_contextERNS2_12wait_contextES6_; _ZN3tbb6detail2r16submitERNS0_2d14taskERNS2_18task_group_contextEPNS1_5arenaEm; _ZN3tbb6detail2r115current_contextEv; /* Task group context (task_group_context.cpp) */ _ZN3tbb6detail2r110initializeERNS0_2d118task_group_contextE; _ZN3tbb6detail2r122cancel_group_executionERNS0_2d118task_group_contextE; _ZN3tbb6detail2r128is_group_execution_cancelledERNS0_2d118task_group_contextE; _ZN3tbb6detail2r15resetERNS0_2d118task_group_contextE; _ZN3tbb6detail2r17destroyERNS0_2d118task_group_contextE; _ZN3tbb6detail2r119capture_fp_settingsERNS0_2d118task_group_contextE; /* Task arena (arena.cpp) */ _ZN3tbb6detail2r115max_concurrencyEPKNS0_2d115task_arena_baseE; _ZN3tbb6detail2r110initializeERNS0_2d115task_arena_baseE; _ZN3tbb6detail2r16attachERNS0_2d115task_arena_baseE; _ZN3tbb6detail2r17executeERNS0_2d115task_arena_baseERNS2_13delegate_baseE; _ZN3tbb6detail2r19terminateERNS0_2d115task_arena_baseE; _ZN3tbb6detail2r120isolate_within_arenaERNS0_2d113delegate_baseEl; _ZN3tbb6detail2r17enqueueERNS0_2d14taskEPNS2_15task_arena_baseE; _ZN3tbb6detail2r17enqueueERNS0_2d14taskERNS2_18task_group_contextEPNS2_15task_arena_baseE; _ZN3tbb6detail2r14waitERNS0_2d115task_arena_baseE; _ZN3tbb6detail2r114execution_slotERKNS0_2d115task_arena_baseE; _ZN3tbb6detail2r119exit_parallel_phaseEPNS0_2d115task_arena_baseEm; _ZN3tbb6detail2r120enter_parallel_phaseEPNS0_2d115task_arena_baseEm; /* System topology parsing and threads pinning (governor.cpp) */ _ZN3tbb6detail2r115numa_node_countEv; _ZN3tbb6detail2r117fill_numa_indicesEPi; _ZN3tbb6detail2r115core_type_countEl; 
_ZN3tbb6detail2r122fill_core_type_indicesEPil; _ZN3tbb6detail2r131constraints_default_concurrencyERKNS0_2d111constraintsEl; _ZN3tbb6detail2r128constraints_threads_per_coreERKNS0_2d111constraintsEl; _ZN3tbb6detail2r124numa_default_concurrencyEi; /* Observer (observer_proxy.cpp) */ _ZN3tbb6detail2r17observeERNS0_2d123task_scheduler_observerEb; /* Queuing RW Mutex (queuing_rw_mutex.cpp) */ _ZN3tbb6detail2r111try_acquireERNS0_2d116queuing_rw_mutexERNS3_11scoped_lockEb; _ZN3tbb6detail2r117upgrade_to_writerERNS0_2d116queuing_rw_mutex11scoped_lockE; _ZN3tbb6detail2r119downgrade_to_readerERNS0_2d116queuing_rw_mutex11scoped_lockE; _ZN3tbb6detail2r17acquireERNS0_2d116queuing_rw_mutexERNS3_11scoped_lockEb; _ZN3tbb6detail2r17releaseERNS0_2d116queuing_rw_mutex11scoped_lockE; _ZN3tbb6detail2r19constructERNS0_2d116queuing_rw_mutexE; _ZN3tbb6detail2r19is_writerERKNS0_2d116queuing_rw_mutex11scoped_lockE; /* Global control (global_control.cpp) */ _ZN3tbb6detail2r16createERNS0_2d114global_controlE; _ZN3tbb6detail2r17destroyERNS0_2d114global_controlE; _ZN3tbb6detail2r127global_control_active_valueEi; _ZN3tbb6detail2r18finalizeERNS0_2d121task_scheduler_handleEl; _ZN3tbb6detail2r13getERNS0_2d121task_scheduler_handleE; /* Parallel pipeline (parallel_pipeline.cpp) */ _ZN3tbb6detail2r117parallel_pipelineERNS0_2d118task_group_contextEmRKNS2_11filter_nodeE; _ZN3tbb6detail2r116set_end_of_inputERNS0_2d111base_filterE; /* Concurrent bounded queue (concurrent_bounded_queue.cpp) */ _ZN3tbb6detail2r126allocate_bounded_queue_repEm; _ZN3tbb6detail2r126wait_bounded_queue_monitorEPNS1_18concurrent_monitorEmlRNS0_2d113delegate_baseE; _ZN3tbb6detail2r128abort_bounded_queue_monitorsEPNS1_18concurrent_monitorE; _ZN3tbb6detail2r128deallocate_bounded_queue_repEPhm; _ZN3tbb6detail2r128notify_bounded_queue_monitorEPNS1_18concurrent_monitorEmm; /* Concurrent monitor (address_waiter.cpp) */ _ZN3tbb6detail2r115wait_on_addressEPvRNS0_2d113delegate_baseEm; _ZN3tbb6detail2r117notify_by_addressEPvm; 
_ZN3tbb6detail2r121notify_by_address_oneEPv; _ZN3tbb6detail2r121notify_by_address_allEPv; /* Versioning (version.cpp) */ TBB_runtime_interface_version; TBB_runtime_version; local: /* TODO: fill more precisely */ *; }; ================================================ FILE: third-party/tbb/src/tbb/def/mac64-tbb.def ================================================ # Copyright (c) 2005-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # TODO: check the legacy comment below, currently use extra leading underscore everywhere. # Sometimes macOS* requires leading underscore (e. g. in export list file), but sometimes not # (e. g. when searching symbol in a dynamic library via dlsym()). Symbols in this file SHOULD # be listed WITHOUT one leading underscore. __TBB_SYMBOL macro should add underscore when # necessary, depending on the intended usage. 
# Assertions (assert.cpp) __ZN3tbb6detail2r117assertion_failureEPKciS3_S3_ # ITT (profiling.cpp) __ZN3tbb6detail2r112itt_task_endENS0_2d115itt_domain_enumE __ZN3tbb6detail2r114itt_region_endENS0_2d115itt_domain_enumEPvy __ZN3tbb6detail2r114itt_task_beginENS0_2d115itt_domain_enumEPvyS4_yNS0_2d021string_resource_indexE __ZN3tbb6detail2r115call_itt_notifyEiPv __ZN3tbb6detail2r115create_itt_syncEPvPKcS4_ __ZN3tbb6detail2r116itt_region_beginENS0_2d115itt_domain_enumEPvyS4_yNS0_2d021string_resource_indexE __ZN3tbb6detail2r116itt_relation_addENS0_2d115itt_domain_enumEPvyNS0_2d012itt_relationES4_y __ZN3tbb6detail2r117itt_set_sync_nameEPvPKc __ZN3tbb6detail2r119itt_make_task_groupENS0_2d115itt_domain_enumEPvyS4_yNS0_2d021string_resource_indexE __ZN3tbb6detail2r120itt_metadata_str_addENS0_2d115itt_domain_enumEPvyNS0_2d021string_resource_indexEPKc __ZN3tbb6detail2r120itt_metadata_ptr_addENS0_2d115itt_domain_enumEPvyNS0_2d021string_resource_indexES4_ # Allocators (allocator.cpp) __ZN3tbb6detail2r115allocate_memoryEm __ZN3tbb6detail2r117deallocate_memoryEPv __ZN3tbb6detail2r122cache_aligned_allocateEm __ZN3tbb6detail2r124cache_aligned_deallocateEPv __ZN3tbb6detail2r115cache_line_sizeEv __ZN3tbb6detail2r117is_tbbmalloc_usedEv # Small object pool (small_object_pool.cpp) __ZN3tbb6detail2r18allocateERPNS0_2d117small_object_poolEm __ZN3tbb6detail2r18allocateERPNS0_2d117small_object_poolEmRKNS2_14execution_dataE __ZN3tbb6detail2r110deallocateERNS0_2d117small_object_poolEPvm __ZN3tbb6detail2r110deallocateERNS0_2d117small_object_poolEPvmRKNS2_14execution_dataE # Error handling (exception.cpp) __ZN3tbb6detail2r115throw_exceptionENS0_2d012exception_idE __ZTIN3tbb6detail2r114bad_last_allocE __ZTVN3tbb6detail2r114bad_last_allocE __ZTIN3tbb6detail2r112missing_waitE __ZTVN3tbb6detail2r112missing_waitE __ZTIN3tbb6detail2r110user_abortE __ZTVN3tbb6detail2r110user_abortE __ZTIN3tbb6detail2r111unsafe_waitE __ZTVN3tbb6detail2r111unsafe_waitE # RTM Mutex (rtm_mutex.cpp) 
__ZN3tbb6detail2r17acquireERNS0_2d19rtm_mutexERNS3_11scoped_lockEb __ZN3tbb6detail2r17releaseERNS0_2d19rtm_mutex11scoped_lockE __ZN3tbb6detail2r111try_acquireERNS0_2d19rtm_mutexERNS3_11scoped_lockE # RTM RW Mutex (rtm_rw_mutex.cpp) __ZN3tbb6detail2r114acquire_readerERNS0_2d112rtm_rw_mutexERNS3_11scoped_lockEb __ZN3tbb6detail2r114acquire_writerERNS0_2d112rtm_rw_mutexERNS3_11scoped_lockEb __ZN3tbb6detail2r118try_acquire_readerERNS0_2d112rtm_rw_mutexERNS3_11scoped_lockE __ZN3tbb6detail2r118try_acquire_writerERNS0_2d112rtm_rw_mutexERNS3_11scoped_lockE __ZN3tbb6detail2r17releaseERNS0_2d112rtm_rw_mutex11scoped_lockE __ZN3tbb6detail2r17upgradeERNS0_2d112rtm_rw_mutex11scoped_lockE __ZN3tbb6detail2r19downgradeERNS0_2d112rtm_rw_mutex11scoped_lockE # Tasks and partitioners (task.cpp) __ZN3tbb6detail2r17suspendEPFvPvPNS1_18suspend_point_typeEES2_ __ZN3tbb6detail2r16resumeEPNS1_18suspend_point_typeE __ZN3tbb6detail2r121current_suspend_pointEv __ZN3tbb6detail2r114notify_waitersEm __ZN3tbb6detail2r127get_thread_reference_vertexEPNS0_2d126wait_tree_vertex_interfaceE # Task dispatcher (task_dispatcher.cpp) __ZN3tbb6detail2r114execution_slotEPKNS0_2d114execution_dataE __ZN3tbb6detail2r14waitERNS0_2d112wait_contextERNS2_18task_group_contextE __ZN3tbb6detail2r15spawnERNS0_2d14taskERNS2_18task_group_contextE __ZN3tbb6detail2r15spawnERNS0_2d14taskERNS2_18task_group_contextEt __ZN3tbb6detail2r116execute_and_waitERNS0_2d14taskERNS2_18task_group_contextERNS2_12wait_contextES6_ __ZN3tbb6detail2r16submitERNS0_2d14taskERNS2_18task_group_contextEPNS1_5arenaEm __ZN3tbb6detail2r115current_contextEv # Task group context (task_group_context.cpp) __ZN3tbb6detail2r110initializeERNS0_2d118task_group_contextE __ZN3tbb6detail2r122cancel_group_executionERNS0_2d118task_group_contextE __ZN3tbb6detail2r128is_group_execution_cancelledERNS0_2d118task_group_contextE __ZN3tbb6detail2r15resetERNS0_2d118task_group_contextE __ZN3tbb6detail2r17destroyERNS0_2d118task_group_contextE 
__ZN3tbb6detail2r119capture_fp_settingsERNS0_2d118task_group_contextE # Task arena (arena.cpp) __ZN3tbb6detail2r115max_concurrencyEPKNS0_2d115task_arena_baseE __ZN3tbb6detail2r110initializeERNS0_2d115task_arena_baseE __ZN3tbb6detail2r16attachERNS0_2d115task_arena_baseE __ZN3tbb6detail2r17executeERNS0_2d115task_arena_baseERNS2_13delegate_baseE __ZN3tbb6detail2r19terminateERNS0_2d115task_arena_baseE __ZN3tbb6detail2r120isolate_within_arenaERNS0_2d113delegate_baseEl __ZN3tbb6detail2r17enqueueERNS0_2d14taskEPNS2_15task_arena_baseE __ZN3tbb6detail2r17enqueueERNS0_2d14taskERNS2_18task_group_contextEPNS2_15task_arena_baseE __ZN3tbb6detail2r14waitERNS0_2d115task_arena_baseE __ZN3tbb6detail2r114execution_slotERKNS0_2d115task_arena_baseE __ZN3tbb6detail2r119exit_parallel_phaseEPNS0_2d115task_arena_baseEm __ZN3tbb6detail2r120enter_parallel_phaseEPNS0_2d115task_arena_baseEm # System topology parsing and threads pinning (governor.cpp) __ZN3tbb6detail2r115numa_node_countEv __ZN3tbb6detail2r117fill_numa_indicesEPi __ZN3tbb6detail2r115core_type_countEl __ZN3tbb6detail2r122fill_core_type_indicesEPil __ZN3tbb6detail2r131constraints_default_concurrencyERKNS0_2d111constraintsEl __ZN3tbb6detail2r128constraints_threads_per_coreERKNS0_2d111constraintsEl __ZN3tbb6detail2r124numa_default_concurrencyEi # Observer (observer_proxy.cpp) __ZN3tbb6detail2r17observeERNS0_2d123task_scheduler_observerEb # Queuing RW Mutex (queuing_rw_mutex.cpp) __ZN3tbb6detail2r111try_acquireERNS0_2d116queuing_rw_mutexERNS3_11scoped_lockEb __ZN3tbb6detail2r117upgrade_to_writerERNS0_2d116queuing_rw_mutex11scoped_lockE __ZN3tbb6detail2r119downgrade_to_readerERNS0_2d116queuing_rw_mutex11scoped_lockE __ZN3tbb6detail2r17acquireERNS0_2d116queuing_rw_mutexERNS3_11scoped_lockEb __ZN3tbb6detail2r17releaseERNS0_2d116queuing_rw_mutex11scoped_lockE __ZN3tbb6detail2r19constructERNS0_2d116queuing_rw_mutexE __ZN3tbb6detail2r19is_writerERKNS0_2d116queuing_rw_mutex11scoped_lockE # Global control (global_control.cpp) 
__ZN3tbb6detail2r16createERNS0_2d114global_controlE __ZN3tbb6detail2r17destroyERNS0_2d114global_controlE __ZN3tbb6detail2r127global_control_active_valueEi __ZN3tbb6detail2r18finalizeERNS0_2d121task_scheduler_handleEl __ZN3tbb6detail2r13getERNS0_2d121task_scheduler_handleE # Parallel pipeline (parallel_pipeline.cpp) __ZN3tbb6detail2r117parallel_pipelineERNS0_2d118task_group_contextEmRKNS2_11filter_nodeE __ZN3tbb6detail2r116set_end_of_inputERNS0_2d111base_filterE # Concurrent bounded queue (concurrent_bounded_queue.cpp) __ZN3tbb6detail2r126allocate_bounded_queue_repEm __ZN3tbb6detail2r126wait_bounded_queue_monitorEPNS1_18concurrent_monitorEmlRNS0_2d113delegate_baseE __ZN3tbb6detail2r128abort_bounded_queue_monitorsEPNS1_18concurrent_monitorE __ZN3tbb6detail2r128deallocate_bounded_queue_repEPhm __ZN3tbb6detail2r128notify_bounded_queue_monitorEPNS1_18concurrent_monitorEmm # Concurrent monitor (address_waiter.cpp) __ZN3tbb6detail2r115wait_on_addressEPvRNS0_2d113delegate_baseEm __ZN3tbb6detail2r117notify_by_addressEPvm __ZN3tbb6detail2r121notify_by_address_oneEPv __ZN3tbb6detail2r121notify_by_address_allEPv # Versioning (version.cpp) _TBB_runtime_interface_version _TBB_runtime_version ================================================ FILE: third-party/tbb/src/tbb/def/win32-tbb.def ================================================ ; Copyright (c) 2005-2025 Intel Corporation ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. ; You may obtain a copy of the License at ; ; http://www.apache.org/licenses/LICENSE-2.0 ; ; Unless required by applicable law or agreed to in writing, software ; distributed under the License is distributed on an "AS IS" BASIS, ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ; See the License for the specific language governing permissions and ; limitations under the License. ; This file is organized with a section for each .cpp file. 
EXPORTS ; Assertions (assert.cpp) ?assertion_failure@r1@detail@tbb@@YAXPBDH00@Z ; ITT (tbb_profiling.cpp) ?call_itt_notify@r1@detail@tbb@@YAXHPAX@Z ?create_itt_sync@r1@detail@tbb@@YAXPAXPB_W1@Z ?itt_make_task_group@r1@detail@tbb@@YAXW4itt_domain_enum@d1@23@PAX_K12W4string_resource_index@d0@23@@Z ?itt_task_begin@r1@detail@tbb@@YAXW4itt_domain_enum@d1@23@PAX_K12W4string_resource_index@d0@23@@Z ?itt_task_end@r1@detail@tbb@@YAXW4itt_domain_enum@d1@23@@Z ?itt_metadata_str_add@r1@detail@tbb@@YAXW4itt_domain_enum@d1@23@PAX_KW4string_resource_index@d0@23@PBD@Z ?itt_relation_add@r1@detail@tbb@@YAXW4itt_domain_enum@d1@23@PAX_KW4itt_relation@d0@23@12@Z ?itt_region_begin@r1@detail@tbb@@YAXW4itt_domain_enum@d1@23@PAX_K12W4string_resource_index@d0@23@@Z ?itt_region_end@r1@detail@tbb@@YAXW4itt_domain_enum@d1@23@PAX_K@Z ?itt_set_sync_name@r1@detail@tbb@@YAXPAXPB_W@Z ?itt_metadata_ptr_add@r1@detail@tbb@@YAXW4itt_domain_enum@d1@23@PAX_KW4string_resource_index@d0@23@1@Z ; Allocators (tbb_allocator.cpp) ?cache_aligned_allocate@r1@detail@tbb@@YAPAXI@Z ?cache_aligned_deallocate@r1@detail@tbb@@YAXPAX@Z ?cache_line_size@r1@detail@tbb@@YAIXZ ?allocate_memory@r1@detail@tbb@@YAPAXI@Z ?deallocate_memory@r1@detail@tbb@@YAXPAX@Z ?is_tbbmalloc_used@r1@detail@tbb@@YA_NXZ ; Small object pool (small_object_pool.cpp) ?allocate@r1@detail@tbb@@YAPAXAAPAVsmall_object_pool@d1@23@IABUexecution_data@523@@Z ?allocate@r1@detail@tbb@@YAPAXAAPAVsmall_object_pool@d1@23@I@Z ?deallocate@r1@detail@tbb@@YAXAAVsmall_object_pool@d1@23@PAXIABUexecution_data@523@@Z ?deallocate@r1@detail@tbb@@YAXAAVsmall_object_pool@d1@23@PAXI@Z ; Error handling (exception.cpp) ?throw_exception@r1@detail@tbb@@YAXW4exception_id@d0@23@@Z ?what@bad_last_alloc@r1@detail@tbb@@UBEPBDXZ ?what@user_abort@r1@detail@tbb@@UBEPBDXZ ?what@missing_wait@r1@detail@tbb@@UBEPBDXZ ; RTM Mutex (rtm_mutex.cpp) ?acquire@r1@detail@tbb@@YAXAAVrtm_mutex@d1@23@AAVscoped_lock@4523@_N@Z ?release@r1@detail@tbb@@YAXAAVscoped_lock@rtm_mutex@d1@23@@Z 
?try_acquire@r1@detail@tbb@@YA_NAAVrtm_mutex@d1@23@AAVscoped_lock@4523@@Z ; RTM RW Mutex (rtm_rw_mutex.cpp) ?acquire_reader@r1@detail@tbb@@YAXAAVrtm_rw_mutex@d1@23@AAVscoped_lock@4523@_N@Z ?acquire_writer@r1@detail@tbb@@YAXAAVrtm_rw_mutex@d1@23@AAVscoped_lock@4523@_N@Z ?downgrade@r1@detail@tbb@@YA_NAAVscoped_lock@rtm_rw_mutex@d1@23@@Z ?release@r1@detail@tbb@@YAXAAVscoped_lock@rtm_rw_mutex@d1@23@@Z ?try_acquire_reader@r1@detail@tbb@@YA_NAAVrtm_rw_mutex@d1@23@AAVscoped_lock@4523@@Z ?try_acquire_writer@r1@detail@tbb@@YA_NAAVrtm_rw_mutex@d1@23@AAVscoped_lock@4523@@Z ?upgrade@r1@detail@tbb@@YA_NAAVscoped_lock@rtm_rw_mutex@d1@23@@Z ; Tasks and partitioners (task.cpp) ?current_suspend_point@r1@detail@tbb@@YAPAUsuspend_point_type@123@XZ ?resume@r1@detail@tbb@@YAXPAUsuspend_point_type@123@@Z ?suspend@r1@detail@tbb@@YAXP6AXPAXPAUsuspend_point_type@123@@Z0@Z ?notify_waiters@r1@detail@tbb@@YAXI@Z ?get_thread_reference_vertex@r1@detail@tbb@@YAPAVwait_tree_vertex_interface@d1@23@PAV4523@@Z ; Task dispatcher (task_dispatcher.cpp) ?spawn@r1@detail@tbb@@YAXAAVtask@d1@23@AAVtask_group_context@523@G@Z ?spawn@r1@detail@tbb@@YAXAAVtask@d1@23@AAVtask_group_context@523@@Z ?execute_and_wait@r1@detail@tbb@@YAXAAVtask@d1@23@AAVtask_group_context@523@AAVwait_context@523@1@Z ?execution_slot@r1@detail@tbb@@YAGPBUexecution_data@d1@23@@Z ?wait@r1@detail@tbb@@YAXAAVwait_context@d1@23@AAVtask_group_context@523@@Z ?submit@r1@detail@tbb@@YAXAAVtask@d1@23@AAVtask_group_context@523@PAVarena@123@I@Z ?current_context@r1@detail@tbb@@YAPAVtask_group_context@d1@23@XZ ; Task group context (task_group_context.cpp) ?cancel_group_execution@r1@detail@tbb@@YA_NAAVtask_group_context@d1@23@@Z ?capture_fp_settings@r1@detail@tbb@@YAXAAVtask_group_context@d1@23@@Z ?destroy@r1@detail@tbb@@YAXAAVtask_group_context@d1@23@@Z ?initialize@r1@detail@tbb@@YAXAAVtask_group_context@d1@23@@Z ?is_group_execution_cancelled@r1@detail@tbb@@YA_NAAVtask_group_context@d1@23@@Z ?reset@r1@detail@tbb@@YAXAAVtask_group_context@d1@23@@Z ; 
Task arena (arena.cpp) ?attach@r1@detail@tbb@@YA_NAAVtask_arena_base@d1@23@@Z ?enqueue@r1@detail@tbb@@YAXAAVtask@d1@23@PAVtask_arena_base@523@@Z ?execute@r1@detail@tbb@@YAXAAVtask_arena_base@d1@23@AAVdelegate_base@523@@Z ?initialize@r1@detail@tbb@@YAXAAVtask_arena_base@d1@23@@Z ?isolate_within_arena@r1@detail@tbb@@YAXAAVdelegate_base@d1@23@H@Z ?max_concurrency@r1@detail@tbb@@YAHPBVtask_arena_base@d1@23@@Z ?terminate@r1@detail@tbb@@YAXAAVtask_arena_base@d1@23@@Z ?wait@r1@detail@tbb@@YAXAAVtask_arena_base@d1@23@@Z ?enqueue@r1@detail@tbb@@YAXAAVtask@d1@23@AAVtask_group_context@523@PAVtask_arena_base@523@@Z ?execution_slot@r1@detail@tbb@@YAGABVtask_arena_base@d1@23@@Z ?enter_parallel_phase@r1@detail@tbb@@YAXPAVtask_arena_base@d1@23@I@Z ?exit_parallel_phase@r1@detail@tbb@@YAXPAVtask_arena_base@d1@23@I@Z ; System topology parsing and threads pinning (governor.cpp) ?numa_node_count@r1@detail@tbb@@YAIXZ ?fill_numa_indices@r1@detail@tbb@@YAXPAH@Z ?core_type_count@r1@detail@tbb@@YAIH@Z ?fill_core_type_indices@r1@detail@tbb@@YAXPAHH@Z ?numa_default_concurrency@r1@detail@tbb@@YAHH@Z ?constraints_default_concurrency@r1@detail@tbb@@YAHABUconstraints@d1@23@H@Z ?constraints_threads_per_core@r1@detail@tbb@@YAHABUconstraints@d1@23@H@Z ; Observer (observer_proxy.cpp) ?observe@r1@detail@tbb@@YAXAAVtask_scheduler_observer@d1@23@_N@Z ; Queuing RW Mutex (queuing_rw_mutex.cpp) ?acquire@r1@detail@tbb@@YAXAAVqueuing_rw_mutex@d1@23@AAVscoped_lock@4523@_N@Z ?construct@r1@detail@tbb@@YAXAAVqueuing_rw_mutex@d1@23@@Z ?downgrade_to_reader@r1@detail@tbb@@YA_NAAVscoped_lock@queuing_rw_mutex@d1@23@@Z ?release@r1@detail@tbb@@YAXAAVscoped_lock@queuing_rw_mutex@d1@23@@Z ?try_acquire@r1@detail@tbb@@YA_NAAVqueuing_rw_mutex@d1@23@AAVscoped_lock@4523@_N@Z ?upgrade_to_writer@r1@detail@tbb@@YA_NAAVscoped_lock@queuing_rw_mutex@d1@23@@Z ?is_writer@r1@detail@tbb@@YA_NABVscoped_lock@queuing_rw_mutex@d1@23@@Z ; Global control (global_control.cpp) ?create@r1@detail@tbb@@YAXAAVglobal_control@d1@23@@Z 
?destroy@r1@detail@tbb@@YAXAAVglobal_control@d1@23@@Z ?global_control_active_value@r1@detail@tbb@@YAIH@Z ?get@r1@detail@tbb@@YAXAAVtask_scheduler_handle@d1@23@@Z ?finalize@r1@detail@tbb@@YA_NAAVtask_scheduler_handle@d1@23@H@Z ; Parallel pipeline (parallel_pipeline.cpp) ?parallel_pipeline@r1@detail@tbb@@YAXAAVtask_group_context@d1@23@IABVfilter_node@523@@Z ?set_end_of_input@r1@detail@tbb@@YAXAAVbase_filter@d1@23@@Z ; Concurrent bounded queue (concurrent_bounded_queue.cpp) ?abort_bounded_queue_monitors@r1@detail@tbb@@YAXPAVconcurrent_monitor@123@@Z ?allocate_bounded_queue_rep@r1@detail@tbb@@YAPAEI@Z ?deallocate_bounded_queue_rep@r1@detail@tbb@@YAXPAEI@Z ?notify_bounded_queue_monitor@r1@detail@tbb@@YAXPAVconcurrent_monitor@123@II@Z ?wait_bounded_queue_monitor@r1@detail@tbb@@YAXPAVconcurrent_monitor@123@IHAAVdelegate_base@d1@23@@Z ; Concurrent monitor (address_waiter.cpp) ?wait_on_address@r1@detail@tbb@@YAXPAXAAVdelegate_base@d1@23@I@Z ?notify_by_address@r1@detail@tbb@@YAXPAXI@Z ?notify_by_address_one@r1@detail@tbb@@YAXPAX@Z ?notify_by_address_all@r1@detail@tbb@@YAXPAX@Z ;; Versioning (version.cpp) TBB_runtime_interface_version TBB_runtime_version ================================================ FILE: third-party/tbb/src/tbb/def/win64-tbb.def ================================================ ; Copyright (c) 2005-2025 Intel Corporation ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. ; You may obtain a copy of the License at ; ; http://www.apache.org/licenses/LICENSE-2.0 ; ; Unless required by applicable law or agreed to in writing, software ; distributed under the License is distributed on an "AS IS" BASIS, ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ; See the License for the specific language governing permissions and ; limitations under the License. ; This file is organized with a section for each .cpp file. 
EXPORTS ; Assertions (assert.cpp) ?assertion_failure@r1@detail@tbb@@YAXPEBDH00@Z ; ITT (tbb_profiling.cpp) ?call_itt_notify@r1@detail@tbb@@YAXHPEAX@Z ?create_itt_sync@r1@detail@tbb@@YAXPEAXPEB_W1@Z ?itt_make_task_group@r1@detail@tbb@@YAXW4itt_domain_enum@d1@23@PEAX_K12W4string_resource_index@d0@23@@Z ?itt_task_begin@r1@detail@tbb@@YAXW4itt_domain_enum@d1@23@PEAX_K12W4string_resource_index@d0@23@@Z ?itt_task_end@r1@detail@tbb@@YAXW4itt_domain_enum@d1@23@@Z ?itt_set_sync_name@r1@detail@tbb@@YAXPEAXPEB_W@Z ?itt_metadata_str_add@r1@detail@tbb@@YAXW4itt_domain_enum@d1@23@PEAX_KW4string_resource_index@d0@23@PEBD@Z ?itt_relation_add@r1@detail@tbb@@YAXW4itt_domain_enum@d1@23@PEAX_KW4itt_relation@d0@23@12@Z ?itt_region_begin@r1@detail@tbb@@YAXW4itt_domain_enum@d1@23@PEAX_K12W4string_resource_index@d0@23@@Z ?itt_region_end@r1@detail@tbb@@YAXW4itt_domain_enum@d1@23@PEAX_K@Z ?itt_metadata_ptr_add@r1@detail@tbb@@YAXW4itt_domain_enum@d1@23@PEAX_KW4string_resource_index@d0@23@1@Z ; Allocators (tbb_allocator.cpp) ?cache_aligned_allocate@r1@detail@tbb@@YAPEAX_K@Z ?cache_aligned_deallocate@r1@detail@tbb@@YAXPEAX@Z ?cache_line_size@r1@detail@tbb@@YA_KXZ ?allocate_memory@r1@detail@tbb@@YAPEAX_K@Z ?deallocate_memory@r1@detail@tbb@@YAXPEAX@Z ?is_tbbmalloc_used@r1@detail@tbb@@YA_NXZ ; Small object pool (small_object_pool.cpp) ?allocate@r1@detail@tbb@@YAPEAXAEAPEAVsmall_object_pool@d1@23@_KAEBUexecution_data@523@@Z ?allocate@r1@detail@tbb@@YAPEAXAEAPEAVsmall_object_pool@d1@23@_K@Z ?deallocate@r1@detail@tbb@@YAXAEAVsmall_object_pool@d1@23@PEAX_KAEBUexecution_data@523@@Z ?deallocate@r1@detail@tbb@@YAXAEAVsmall_object_pool@d1@23@PEAX_K@Z ; Error handling (exception.cpp) ?throw_exception@r1@detail@tbb@@YAXW4exception_id@d0@23@@Z ?what@bad_last_alloc@r1@detail@tbb@@UEBAPEBDXZ ?what@user_abort@r1@detail@tbb@@UEBAPEBDXZ ?what@missing_wait@r1@detail@tbb@@UEBAPEBDXZ ; RTM Mutex (rtm_mutex.cpp) ?try_acquire@r1@detail@tbb@@YA_NAEAVrtm_mutex@d1@23@AEAVscoped_lock@4523@@Z 
?acquire@r1@detail@tbb@@YAXAEAVrtm_mutex@d1@23@AEAVscoped_lock@4523@_N@Z ?release@r1@detail@tbb@@YAXAEAVscoped_lock@rtm_mutex@d1@23@@Z ; RTM RW Mutex (rtm_rw_mutex.cpp) ?acquire_writer@r1@detail@tbb@@YAXAEAVrtm_rw_mutex@d1@23@AEAVscoped_lock@4523@_N@Z ?acquire_reader@r1@detail@tbb@@YAXAEAVrtm_rw_mutex@d1@23@AEAVscoped_lock@4523@_N@Z ?upgrade@r1@detail@tbb@@YA_NAEAVscoped_lock@rtm_rw_mutex@d1@23@@Z ?downgrade@r1@detail@tbb@@YA_NAEAVscoped_lock@rtm_rw_mutex@d1@23@@Z ?try_acquire_writer@r1@detail@tbb@@YA_NAEAVrtm_rw_mutex@d1@23@AEAVscoped_lock@4523@@Z ?try_acquire_reader@r1@detail@tbb@@YA_NAEAVrtm_rw_mutex@d1@23@AEAVscoped_lock@4523@@Z ?release@r1@detail@tbb@@YAXAEAVscoped_lock@rtm_rw_mutex@d1@23@@Z ; Tasks and partitioners (task.cpp) ?suspend@r1@detail@tbb@@YAXP6AXPEAXPEAUsuspend_point_type@123@@Z0@Z ?resume@r1@detail@tbb@@YAXPEAUsuspend_point_type@123@@Z ?current_suspend_point@r1@detail@tbb@@YAPEAUsuspend_point_type@123@XZ ?notify_waiters@r1@detail@tbb@@YAX_K@Z ?get_thread_reference_vertex@r1@detail@tbb@@YAPEAVwait_tree_vertex_interface@d1@23@PEAV4523@@Z ; Task dispatcher (task_dispatcher.cpp) ?spawn@r1@detail@tbb@@YAXAEAVtask@d1@23@AEAVtask_group_context@523@@Z ?spawn@r1@detail@tbb@@YAXAEAVtask@d1@23@AEAVtask_group_context@523@G@Z ?execute_and_wait@r1@detail@tbb@@YAXAEAVtask@d1@23@AEAVtask_group_context@523@AEAVwait_context@523@1@Z ?execution_slot@r1@detail@tbb@@YAGPEBUexecution_data@d1@23@@Z ?wait@r1@detail@tbb@@YAXAEAVwait_context@d1@23@AEAVtask_group_context@523@@Z ?submit@r1@detail@tbb@@YAXAEAVtask@d1@23@AEAVtask_group_context@523@PEAVarena@123@_K@Z ?current_context@r1@detail@tbb@@YAPEAVtask_group_context@d1@23@XZ ; Task group context (task_group_context.cpp) ?initialize@r1@detail@tbb@@YAXAEAVtask_group_context@d1@23@@Z ?destroy@r1@detail@tbb@@YAXAEAVtask_group_context@d1@23@@Z ?is_group_execution_cancelled@r1@detail@tbb@@YA_NAEAVtask_group_context@d1@23@@Z ?reset@r1@detail@tbb@@YAXAEAVtask_group_context@d1@23@@Z 
?cancel_group_execution@r1@detail@tbb@@YA_NAEAVtask_group_context@d1@23@@Z ?capture_fp_settings@r1@detail@tbb@@YAXAEAVtask_group_context@d1@23@@Z ; Task arena (arena.cpp) ?max_concurrency@r1@detail@tbb@@YAHPEBVtask_arena_base@d1@23@@Z ?initialize@r1@detail@tbb@@YAXAEAVtask_arena_base@d1@23@@Z ?terminate@r1@detail@tbb@@YAXAEAVtask_arena_base@d1@23@@Z ?execute@r1@detail@tbb@@YAXAEAVtask_arena_base@d1@23@AEAVdelegate_base@523@@Z ?wait@r1@detail@tbb@@YAXAEAVtask_arena_base@d1@23@@Z ?attach@r1@detail@tbb@@YA_NAEAVtask_arena_base@d1@23@@Z ?isolate_within_arena@r1@detail@tbb@@YAXAEAVdelegate_base@d1@23@_J@Z ?enqueue@r1@detail@tbb@@YAXAEAVtask@d1@23@PEAVtask_arena_base@523@@Z ?enqueue@r1@detail@tbb@@YAXAEAVtask@d1@23@AEAVtask_group_context@523@PEAVtask_arena_base@523@@Z ?execution_slot@r1@detail@tbb@@YAGAEBVtask_arena_base@d1@23@@Z ?enter_parallel_phase@r1@detail@tbb@@YAXPEAVtask_arena_base@d1@23@_K@Z ?exit_parallel_phase@r1@detail@tbb@@YAXPEAVtask_arena_base@d1@23@_K@Z ; System topology parsing and threads pinning (governor.cpp) ?numa_node_count@r1@detail@tbb@@YAIXZ ?fill_numa_indices@r1@detail@tbb@@YAXPEAH@Z ?core_type_count@r1@detail@tbb@@YAI_J@Z ?fill_core_type_indices@r1@detail@tbb@@YAXPEAH_J@Z ?numa_default_concurrency@r1@detail@tbb@@YAHH@Z ?constraints_default_concurrency@r1@detail@tbb@@YAHAEBUconstraints@d1@23@_J@Z ?constraints_threads_per_core@r1@detail@tbb@@YAHAEBUconstraints@d1@23@_J@Z ; Observer (observer_proxy.cpp) ?observe@r1@detail@tbb@@YAXAEAVtask_scheduler_observer@d1@23@_N@Z ; Queuing RW Mutex (queuing_rw_mutex.cpp) ?construct@r1@detail@tbb@@YAXAEAVqueuing_rw_mutex@d1@23@@Z ?try_acquire@r1@detail@tbb@@YA_NAEAVqueuing_rw_mutex@d1@23@AEAVscoped_lock@4523@_N@Z ?acquire@r1@detail@tbb@@YAXAEAVqueuing_rw_mutex@d1@23@AEAVscoped_lock@4523@_N@Z ?release@r1@detail@tbb@@YAXAEAVscoped_lock@queuing_rw_mutex@d1@23@@Z ?upgrade_to_writer@r1@detail@tbb@@YA_NAEAVscoped_lock@queuing_rw_mutex@d1@23@@Z 
?downgrade_to_reader@r1@detail@tbb@@YA_NAEAVscoped_lock@queuing_rw_mutex@d1@23@@Z ?is_writer@r1@detail@tbb@@YA_NAEBVscoped_lock@queuing_rw_mutex@d1@23@@Z ; Global control (global_control.cpp) ?global_control_active_value@r1@detail@tbb@@YA_KH@Z ?create@r1@detail@tbb@@YAXAEAVglobal_control@d1@23@@Z ?destroy@r1@detail@tbb@@YAXAEAVglobal_control@d1@23@@Z ?get@r1@detail@tbb@@YAXAEAVtask_scheduler_handle@d1@23@@Z ?finalize@r1@detail@tbb@@YA_NAEAVtask_scheduler_handle@d1@23@_J@Z ; Parallel pipeline (parallel_pipeline.cpp) ?set_end_of_input@r1@detail@tbb@@YAXAEAVbase_filter@d1@23@@Z ?parallel_pipeline@r1@detail@tbb@@YAXAEAVtask_group_context@d1@23@_KAEBVfilter_node@523@@Z ; Concurrent bounded queue (concurrent_bounded_queue.cpp) ?allocate_bounded_queue_rep@r1@detail@tbb@@YAPEAE_K@Z ?deallocate_bounded_queue_rep@r1@detail@tbb@@YAXPEAE_K@Z ?wait_bounded_queue_monitor@r1@detail@tbb@@YAXPEAVconcurrent_monitor@123@_K_JAEAVdelegate_base@d1@23@@Z ?abort_bounded_queue_monitors@r1@detail@tbb@@YAXPEAVconcurrent_monitor@123@@Z ?notify_bounded_queue_monitor@r1@detail@tbb@@YAXPEAVconcurrent_monitor@123@_K1@Z ; Concurrent monitor (address_waiter.cpp) ?wait_on_address@r1@detail@tbb@@YAXPEAXAEAVdelegate_base@d1@23@_K@Z ?notify_by_address@r1@detail@tbb@@YAXPEAX_K@Z ?notify_by_address_one@r1@detail@tbb@@YAXPEAX@Z ?notify_by_address_all@r1@detail@tbb@@YAXPEAX@Z ;; Versioning (version.cpp) TBB_runtime_interface_version TBB_runtime_version ================================================ FILE: third-party/tbb/src/tbb/dynamic_link.cpp ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "dynamic_link.h" #include "environment.h" #include "oneapi/tbb/detail/_template_helpers.h" #include "oneapi/tbb/detail/_utils.h" /* This file is used by both TBB and OpenMP RTL. Do not use __TBB_ASSERT() macro and runtime_warning() function because they are not available in OpenMP. Use __TBB_ASSERT_EX and DYNAMIC_LINK_WARNING instead. */ #include // va_list etc. #include // strrchr #if _WIN32 #include // Unify system calls #define dlopen( name, flags ) LoadLibrary( name ) #define dlsym( handle, name ) GetProcAddress( handle, name ) // FreeLibrary return bool value that is not used. #define dlclose( handle ) (void)( ! FreeLibrary( handle ) ) #define dlerror() GetLastError() #ifndef PATH_MAX #define PATH_MAX MAX_PATH #endif #else /* _WIN32 */ #include #include #include #include #endif /* _WIN32 */ #if __TBB_WEAK_SYMBOLS_PRESENT && !__TBB_DYNAMIC_LOAD_ENABLED //TODO: use function attribute for weak symbols instead of the pragma. #pragma weak dlopen #pragma weak dlsym #pragma weak dlclose #endif /* __TBB_WEAK_SYMBOLS_PRESENT && !__TBB_DYNAMIC_LOAD_ENABLED */ #define __USE_STATIC_DL_INIT ( !__ANDROID__ ) /* dynamic_link is a common interface for searching for required symbols in an executable and dynamic libraries. dynamic_link provides certain guarantees: 1. Either all or none of the requested symbols are resolved. Moreover, if symbols are not resolved, the dynamic_link_descriptor table is not modified; 2. All returned symbols have secured lifetime: this means that none of them can be invalidated until dynamic_unlink is called; 3. 
Any loaded library is loaded only via the full path. The full path is that from which the runtime itself was loaded. (This is done to avoid security issues caused by loading libraries from insecure paths). dynamic_link searches for the requested symbols in three stages, stopping as soon as all of the symbols have been resolved. 1. Search the global scope: a. On Windows: dynamic_link tries to obtain the handle of the requested library and if it succeeds it resolves the symbols via that handle. b. On Linux: dynamic_link tries to search for the symbols in the global scope via the main program handle. If the symbols are present in the global scope their lifetime is not guaranteed (since dynamic_link does not know anything about the library from which they are exported). Therefore it tries to "pin" the symbols by obtaining the library name and reopening it. dlopen may fail to reopen the library in two cases: i. The symbols are exported from the executable. Currently dynamic _link cannot handle this situation, so it will not find these symbols in this step. ii. The necessary library has been unloaded and cannot be reloaded. It seems there is nothing that can be done in this case. No symbols are returned. 2. Dynamic load: an attempt is made to load the requested library via the full path. The full path used is that from which the runtime itself was loaded. If the library can be loaded, then an attempt is made to resolve the requested symbols in the newly loaded library. If the symbols are not found the library is unloaded. 3. Weak symbols: if weak symbols are available they are returned. */ namespace tbb { namespace detail { namespace r1 { #if __TBB_WEAK_SYMBOLS_PRESENT || __TBB_DYNAMIC_LOAD_ENABLED #if !defined(DYNAMIC_LINK_WARNING) && !__TBB_WIN8UI_SUPPORT && __TBB_DYNAMIC_LOAD_ENABLED // Report runtime errors and continue. #define DYNAMIC_LINK_WARNING dynamic_link_warning static void dynamic_link_warning( dynamic_link_error_t code, ... 
) { suppress_unused_warning(code); } // library_warning #endif /* !defined(DYNAMIC_LINK_WARNING) && !__TBB_WIN8UI_SUPPORT && __TBB_DYNAMIC_LOAD_ENABLED */ static bool resolve_symbols( dynamic_link_handle module, const dynamic_link_descriptor descriptors[], std::size_t required ) { if ( !module ) return false; #if !__TBB_DYNAMIC_LOAD_ENABLED /* only __TBB_WEAK_SYMBOLS_PRESENT is defined */ if ( !dlsym ) return false; #endif /* !__TBB_DYNAMIC_LOAD_ENABLED */ const std::size_t n_desc=20; // Usually we don't have more than 20 descriptors per library __TBB_ASSERT_EX( required <= n_desc, "Too many descriptors is required" ); if ( required > n_desc ) return false; pointer_to_handler h[n_desc]; for ( std::size_t k = 0; k < required; ++k ) { dynamic_link_descriptor const & desc = descriptors[k]; pointer_to_handler addr = (pointer_to_handler)dlsym( module, desc.name ); if ( !addr ) { return false; } h[k] = addr; } // Commit the entry points. // Cannot use memset here, because the writes must be atomic. for( std::size_t k = 0; k < required; ++k ) *descriptors[k].handler = h[k]; return true; } #if __TBB_WIN8UI_SUPPORT bool dynamic_link( const char* library, const dynamic_link_descriptor descriptors[], std::size_t required, dynamic_link_handle*, int flags ) { dynamic_link_handle tmp_handle = nullptr; TCHAR wlibrary[256]; if ( MultiByteToWideChar(CP_UTF8, 0, library, -1, wlibrary, 255) == 0 ) return false; if ( flags & DYNAMIC_LINK_LOAD ) tmp_handle = LoadPackagedLibrary( wlibrary, 0 ); if (tmp_handle != nullptr){ return resolve_symbols(tmp_handle, descriptors, required); }else{ return false; } } void dynamic_unlink( dynamic_link_handle ) {} void dynamic_unlink_all() {} #else #if __TBB_DYNAMIC_LOAD_ENABLED /* There is a security issue on Windows: LoadLibrary() may load and execute malicious code. See http://www.microsoft.com/technet/security/advisory/2269637.mspx for details. To avoid the issue, we have to pass full path (not just library name) to LoadLibrary. 
This function constructs full path to the specified library (it is assumed the library
    is located side-by-side with the tbb.dll).

    The function constructs absolute path for given relative path. Important: Base directory is not
    current one, it is the directory tbb.dll loaded from.

    Example:
    Let us assume "tbb.dll" is located in "c:\program files\common\intel\" directory, e.g.
    absolute path of the library is "c:\program files\common\intel\tbb.dll". Absolute path for
    "tbbmalloc.dll" would be "c:\program files\common\intel\tbbmalloc.dll". Absolute path for
    "malloc\tbbmalloc.dll" would be "c:\program files\common\intel\malloc\tbbmalloc.dll".
*/

// Struct handles_t is used by dynamic_link routine to store handles of
// all loaded or pinned dynamic libraries. When TBB is shut down, it calls
// dynamic_unlink_all() that unloads modules referenced by the `handles` object.
// This struct should not have any constructors since it may be used before
// the constructor is called.
#define MAX_LOADED_MODULES 8 // The number of maximum possible modules which can be loaded

// NOTE(review): the template argument of std::atomic is missing in this copy of the
// file (text inside angle brackets appears to have been lost); restore it from upstream.
using atomic_incrementer = std::atomic;

static struct handles_t {
    atomic_incrementer my_size;                          // number of slots handed out so far
    dynamic_link_handle my_handles[MAX_LOADED_MODULES];  // stored library handles

    // Reserves the next slot by post-incrementing my_size and stores `handle` there.
    // NOTE(review): the slot index is only checked through __TBB_ASSERT_EX; whether that
    // prevents an out-of-range write in non-debug builds is not visible here - confirm.
    void add(const dynamic_link_handle &handle) {
        const std::size_t ind = my_size++;
        __TBB_ASSERT_EX( ind < MAX_LOADED_MODULES, "Too many modules are loaded" );
        my_handles[ind] = handle;
    }

    void free() {
        const std::size_t size = my_size;
        // NOTE(review): the source text is truncated on the next line. Everything between
        // the loop condition and the "( PATH_MAX ) )" argument is missing from this copy:
        // the rest of free(), the end of handles_t, the definition of ap_data and the
        // beginning of init_ap_data(). The statements that follow belong to the _WIN32
        // branch of init_ap_data(), judging by the matching "#else ... #endif /* _WIN32 */"
        // further down. Restore the lost text from upstream; do not build from this copy.
        for (std::size_t i=0; i( PATH_MAX ) );
    if ( drc == 0 ) { // Error occurred.
        int err = GetLastError();
        DYNAMIC_LINK_WARNING( dl_sys_fail, "GetModuleFileName", err );
        return;
    }
    if ( drc >= PATH_MAX ) { // Buffer too short.
        DYNAMIC_LINK_WARNING( dl_buff_too_small );
        return;
    }
    // Find the position of the last backslash.
    char *backslash = std::strrchr( ap_data._path, '\\' );

    if ( !backslash ) {    // Backslash not found.
        __TBB_ASSERT_EX( backslash != nullptr, "Unbelievable.");
        return;
    }
    __TBB_ASSERT_EX( backslash >= ap_data._path, "Unbelievable.");
    // Keep the directory part including the trailing backslash and cut the file name off.
    ap_data._len = (std::size_t)(backslash - ap_data._path) + 1;
    *(backslash+1) = 0;
#else
    // Get the library path
    Dl_info dlinfo;
    int res = dladdr( (void*)&dynamic_link, &dlinfo ); // any function inside the library can be used for the address
    if ( !res ) {
        char const * err = dlerror();
        DYNAMIC_LINK_WARNING( dl_sys_fail, "dladdr", err );
        return;
    } else {
        __TBB_ASSERT_EX( dlinfo.dli_fname!=nullptr, "Unbelievable." );
    }

    // fname_len is the length of the directory part of dli_fname including the last '/',
    // or 0 when dli_fname contains no '/'.
    char const *slash = std::strrchr( dlinfo.dli_fname, '/' );
    std::size_t fname_len=0;
    if ( slash ) {
        __TBB_ASSERT_EX( slash >= dlinfo.dli_fname, "Unbelievable.");
        fname_len = (std::size_t)(slash - dlinfo.dli_fname) + 1;
    }

    // rc is the offset in ap_data._path at which the directory part of dli_fname is copied.
    std::size_t rc;
    if ( dlinfo.dli_fname[0]=='/' ) {
        // The library path is absolute
        rc = 0;
        ap_data._len = 0;
    } else {
        // The library path is relative so get the current working directory
        if ( !getcwd( ap_data._path, sizeof(ap_data._path)/sizeof(ap_data._path[0]) ) ) {
            DYNAMIC_LINK_WARNING( dl_buff_too_small );
            return;
        }
        ap_data._len = std::strlen( ap_data._path );
        // The '/' replaces the terminator written by getcwd.
        // NOTE(review): when fname_len is 0 no new terminator is written below; presumably
        // this relies on ap_data being zero-initialized - its definition is not visible
        // in this copy, confirm.
        ap_data._path[ap_data._len++]='/';
        rc = ap_data._len;
    }

    if ( fname_len>0 ) {
        ap_data._len += fname_len;
        if ( ap_data._len>PATH_MAX ) {
            DYNAMIC_LINK_WARNING( dl_buff_too_small );
            // A zero length marks the base path as unavailable (abs_path() then returns 0).
            ap_data._len=0;
            return;
        }
        std::strncpy( ap_data._path+rc, dlinfo.dli_fname, fname_len );
        ap_data._path[ap_data._len]=0;
    }
#endif /* _WIN32 */
}

// One-time initialization routine; currently it only computes the base path.
static void init_dl_data() {
    init_ap_data();
}

/*
    The function constructs absolute path for given relative path. Important: Base directory is not
    current one, it is the directory libtbb.so loaded from.

    Arguments:
    in  name -- Name of a file (may be with relative path; it must not be an absolute one).
    out path -- Buffer to save result (absolute path) to.
    in  len  -- Size of buffer.
    ret      -- 0         -- Error occurred.
                > len     -- Buffer too short, required size returned.
                otherwise -- Ok, number of characters (incl. terminating null) written to buffer.
*/
// Returns 0 when the base directory is unknown (ap_data._len == 0). Otherwise returns
// strlen(name) + ap_data._len + 1; `path` is written only when that many characters
// (terminating null included) fit into a buffer of `len` characters.
static std::size_t abs_path( char const * name, char * path, std::size_t len ) {
    if ( ap_data._len == 0 )
        return 0;

    std::size_t name_len = std::strlen( name );
    std::size_t full_len = name_len+ap_data._len;
    if ( full_len < len ) {
        // NOTE(review): the comment at the top of this file asks for __TBB_ASSERT_EX
        // instead of __TBB_ASSERT; these four assertions use the latter - confirm intent.
        __TBB_ASSERT( ap_data._path[ap_data._len] == 0, nullptr);
        __TBB_ASSERT( std::strlen(ap_data._path) == ap_data._len, nullptr);
        // Copy the base directory together with its terminating null, then append the name.
        std::strncpy( path, ap_data._path, ap_data._len + 1 );
        __TBB_ASSERT( path[ap_data._len] == 0, nullptr);
        std::strncat( path, name, len - ap_data._len );
        __TBB_ASSERT( std::strlen(path) == full_len, nullptr);
    }
    return full_len+1; // +1 for null character
}
#endif  // __TBB_DYNAMIC_LOAD_ENABLED

// Runs init_dl_data() exactly once (via std::call_once) when dynamic loading is
// enabled; does nothing otherwise.
void init_dynamic_link_data() {
#if __TBB_DYNAMIC_LOAD_ENABLED
    std::call_once( init_dl_data_state, init_dl_data );
#endif
}

#if __USE_STATIC_DL_INIT
// ap_data structure is initialized with current directory on Linux.
// So it should be initialized as soon as possible since the current directory may be changed.
// static_init_dl_data object provides this initialization during library loading.
static struct static_init_dl_data_t {
    static_init_dl_data_t() {
        init_dynamic_link_data();
    }
} static_init_dl_data;
#endif

#if __TBB_WEAK_SYMBOLS_PRESENT
// Links against weak symbols recorded in the descriptors themselves (the `ptr` field).
// All-or-nothing: handler slots are written only if every one of the first `required`
// descriptors has a non-null `ptr`.
static bool weak_symbol_link( const dynamic_link_descriptor descriptors[], std::size_t required )
{
    // Check if the required entries are present in what was loaded into our process.
    for ( std::size_t k = 0; k < required; ++k )
        if ( !descriptors[k].ptr )
            return false;
    // Commit the entry points.
    for ( std::size_t k = 0; k < required; ++k )
        *descriptors[k].handler = (pointer_to_handler) descriptors[k].ptr;
    return true;
}
#else
// Weak symbols are not available in this configuration: this stage always fails.
static bool weak_symbol_link( const dynamic_link_descriptor[], std::size_t ) {
    return false;
}
#endif /* __TBB_WEAK_SYMBOLS_PRESENT */

// Releases one library handle via dlclose(); a null handle is ignored.
void dynamic_unlink( dynamic_link_handle handle ) {
#if !__TBB_DYNAMIC_LOAD_ENABLED /* only __TBB_WEAK_SYMBOLS_PRESENT is defined */
    // dlclose is declared weak in this configuration and may be absent.
    if ( !dlclose ) return;
#endif
    if ( handle ) {
        dlclose( handle );
    }
}

// Releases every handle recorded in `handles` (only when dynamic loading is enabled).
void dynamic_unlink_all() {
#if __TBB_DYNAMIC_LOAD_ENABLED
    handles.free();
#endif
}

// Stage 1 of dynamic_link: obtain a handle to `library` without loading it anew
// (GetModuleHandleEx on Windows, dlopen with RTLD_NOLOAD elsewhere) and resolve the
// required symbols through it. Returns the handle on success; on a failed resolution
// the handle is released again and nullptr is returned.
static dynamic_link_handle global_symbols_link( const char* library, const dynamic_link_descriptor descriptors[], std::size_t required ) {
    dynamic_link_handle library_handle{};
#if _WIN32
    auto res = GetModuleHandleEx(0, library, &library_handle);
    // The result flag and the returned handle must agree with each other.
    __TBB_ASSERT_EX((res && library_handle) || (!res && !library_handle), nullptr);
#else /* _WIN32 */
    #if !__TBB_DYNAMIC_LOAD_ENABLED /* only __TBB_WEAK_SYMBOLS_PRESENT is defined */
        // dlopen is declared weak in this configuration and may be absent.
        if ( !dlopen ) return 0;
    #endif /* !__TBB_DYNAMIC_LOAD_ENABLED */
    // RTLD_GLOBAL - to guarantee that old TBB will find the loaded library
    // RTLD_NOLOAD - not to load the library without the full path
    library_handle = dlopen(library, RTLD_LAZY | RTLD_GLOBAL | RTLD_NOLOAD);
#endif /* _WIN32 */
    if (library_handle) {
        if (!resolve_symbols(library_handle, descriptors, required)) {
            dynamic_unlink(library_handle);
            library_handle = nullptr;
        }
    }
    return library_handle;
}

// Hands the handle `src` over to the caller through `dst` when `dst` is non-null.
// Otherwise, and only when dynamic loading is enabled, records it in `handles` so that
// dynamic_unlink_all() can release it later. Note that the `else` branch exists only
// inside the #if below.
static void save_library_handle( dynamic_link_handle src, dynamic_link_handle *dst ) {
    __TBB_ASSERT_EX( src, "The library handle to store must be non-zero" );
    if ( dst )
        *dst = src;
#if __TBB_DYNAMIC_LOAD_ENABLED
    else
        handles.add( src );
#endif /* __TBB_DYNAMIC_LOAD_ENABLED */
}

#if !_WIN32
// Computes the dlopen() flags for dynamic_load(): always RTLD_NOW, plus RTLD_LOCAL
// (local binding) or RTLD_GLOBAL. With local binding on Linux/glibc builds without
// sanitizers, RTLD_DEEPBIND is added unless GetBoolEnvironmentVariable reports
// TBB_ENABLE_SANITIZERS as set (that helper is defined elsewhere).
int loading_flags(bool local_binding) {
    int flags = RTLD_NOW;
    if (local_binding) {
        flags = flags | RTLD_LOCAL;
#if (__linux__ && __GLIBC__) && !__TBB_USE_SANITIZERS
        if( !GetBoolEnvironmentVariable("TBB_ENABLE_SANITIZERS") ) {
            flags = flags | RTLD_DEEPBIND;
        }
#endif
    } else {
        flags = flags | RTLD_GLOBAL;
    }
    return flags;
}
#endif

// Stage 2 of dynamic_link: load `library` through the absolute path produced by
// abs_path() and resolve the required symbols in it. Returns the library handle on
// success. Returns nullptr if the path cannot be built, the library cannot be opened,
// or a required symbol is missing (in which case the library is unloaded again).
// When dynamic loading is disabled the function always returns nullptr.
dynamic_link_handle dynamic_load( const char* library, const dynamic_link_descriptor descriptors[], std::size_t required, bool local_binding ) {
    ::tbb::detail::suppress_unused_warning( library, descriptors, required, local_binding );
#if __TBB_DYNAMIC_LOAD_ENABLED
    std::size_t const len = PATH_MAX + 1;
    char path[ len ];
    std::size_t rc = abs_path( library, path, len );
    // 0 < rc <= len means abs_path() wrote a complete, null-terminated path.
    if ( 0 < rc && rc <= len ) {
#if _WIN32
        // Prevent Windows from displaying silly message boxes if it fails to load library
        // (e.g. because of MS runtime problems - one of those crazy manifest related ones)
        UINT prev_mode = SetErrorMode (SEM_FAILCRITICALERRORS);
#endif /* _WIN32 */
        // The second argument (loading_flags) is ignored on Windows
        dynamic_link_handle library_handle = dlopen( path, loading_flags(local_binding) );
#if _WIN32
        SetErrorMode (prev_mode);
#endif /* _WIN32 */
        if( library_handle ) {
            if( !resolve_symbols( library_handle, descriptors, required ) ) {
                // The loaded library does not contain all the expected entry points
                dynamic_unlink( library_handle );
                library_handle = nullptr;
            }
        } else
            DYNAMIC_LINK_WARNING( dl_lib_not_found, path, dlerror() );
        return library_handle;
    } else if ( rc>len )
            DYNAMIC_LINK_WARNING( dl_buff_too_small );
            // rc == 0 means failing of init_ap_data so the warning has already been issued.

#endif /* __TBB_DYNAMIC_LOAD_ENABLED */
    return nullptr;
}

// Entry point. Tries the linking stages enabled in `flags` in this order:
//   1. DYNAMIC_LINK_GLOBAL - global_symbols_link()
//   2. DYNAMIC_LINK_LOAD   - dynamic_load(), with local binding if DYNAMIC_LINK_LOCAL is set
//   3. DYNAMIC_LINK_WEAK   - weak_symbol_link(); its result is returned directly and no
//                            library handle is stored in this case
// When stage 1 or 2 yields a handle it is passed to save_library_handle() and true is
// returned.
bool dynamic_link( const char* library, const dynamic_link_descriptor descriptors[], std::size_t required, dynamic_link_handle *handle, int flags ) {
    init_dynamic_link_data();

    // TODO: May global_symbols_link find weak symbols?
    dynamic_link_handle library_handle = ( flags & DYNAMIC_LINK_GLOBAL ) ? global_symbols_link( library, descriptors, required ) : nullptr;

#if defined(_MSC_VER) && _MSC_VER <= 1900
#pragma warning (push)
// MSVC 2015 warning: 'int': forcing value to bool 'true' or 'false'
#pragma warning (disable: 4800)
#endif
    if ( !library_handle && ( flags & DYNAMIC_LINK_LOAD ) )
        library_handle = dynamic_load( library, descriptors, required, flags & DYNAMIC_LINK_LOCAL );

#if defined(_MSC_VER) && _MSC_VER <= 1900
#pragma warning (pop)
#endif
    if ( !library_handle && ( flags & DYNAMIC_LINK_WEAK ) )
        return weak_symbol_link( descriptors, required );

    if ( library_handle ) {
        save_library_handle( library_handle, handle );
        return true;
    }
    return false;
}

#endif /*__TBB_WIN8UI_SUPPORT*/
#else /* __TBB_WEAK_SYMBOLS_PRESENT || __TBB_DYNAMIC_LOAD_ENABLED */
// Neither weak symbols nor dynamic loading are available: linking always fails.
// Unlike the other variants, this stub resets *handle to zero when a handle pointer is given.
bool dynamic_link( const char*, const dynamic_link_descriptor*, std::size_t, dynamic_link_handle *handle, int ) {
    if ( handle )
        *handle=0;
    return false;
}
void dynamic_unlink( dynamic_link_handle ) {}
void dynamic_unlink_all() {}
#endif /* __TBB_WEAK_SYMBOLS_PRESENT || __TBB_DYNAMIC_LOAD_ENABLED */

} // namespace r1
} // namespace detail
} // namespace tbb

================================================
FILE: third-party/tbb/src/tbb/dynamic_link.h
================================================
/*
    Copyright (c) 2005-2022 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef __TBB_dynamic_link
#define __TBB_dynamic_link

// Support for dynamic loading entry points from other shared libraries.
namespace tbb {
namespace detail {
namespace r1 {

//! Signature shared by all dynamically resolved entry points: void f(void).
using pointer_to_handler = void (*)();

//! Initializer builders for dynamic_link_descriptor.
// The handler address goes through void* on purpose: the double cast
// avoids warnings from some compilers (g++ 4.1).
#if __TBB_WEAK_SYMBOLS_PRESENT
#define DLD(s,h) {#s, (pointer_to_handler*)(void*)(&h), (pointer_to_handler)&s}
#define DLD_NOWEAK(s,h) {#s, (pointer_to_handler*)(void*)(&h), nullptr}
#else
#define DLD(s,h) {#s, (pointer_to_handler*)(void*)(&h)}
#define DLD_NOWEAK(s,h) DLD(s,h)
#endif /* __TBB_WEAK_SYMBOLS_PRESENT */

//! Ties the name of a handler to the location that receives its address.
struct dynamic_link_descriptor {
    //! Symbol name of the handler
    const char* name;
    //! Where the resolved handler pointer is stored
    pointer_to_handler* handler;
#if __TBB_WEAK_SYMBOLS_PRESENT
    //! Address of the weak symbol
    pointer_to_handler ptr;
#endif
};

//! Native handle of a loaded library.
#if _WIN32
using dynamic_link_handle = HMODULE;
#else
using dynamic_link_handle = void*;
#endif /* _WIN32 */

//! Linking stages; each flag enables the stage of the same name.
constexpr int DYNAMIC_LINK_GLOBAL = 0x01;
constexpr int DYNAMIC_LINK_LOAD   = 0x02;
constexpr int DYNAMIC_LINK_WEAK   = 0x04;
constexpr int DYNAMIC_LINK_LOCAL  = 0x08;

constexpr int DYNAMIC_LINK_LOCAL_BINDING = DYNAMIC_LINK_LOCAL | DYNAMIC_LINK_LOAD;
constexpr int DYNAMIC_LINK_DEFAULT       = DYNAMIC_LINK_GLOBAL | DYNAMIC_LINK_LOAD | DYNAMIC_LINK_WEAK;

//! Fill in dynamically linked handlers.
/** 'library' is the name of the requested library. Pass a bare name, not a
    full path: dynamic_link prepends the directory the runtime itself was
    loaded from.

    'required' is how many leading entries of descriptors[] must be found for
    the call to succeed. When the library and all required handlers are found,
    the corresponding handler pointers are set and true is returned. Otherwise
    the descriptors array is left untouched and false is returned.
    'required' is limited by 20; a larger value makes symbol loading fail and
    the call return false.

    'handle' receives the handle of the library if it is loaded; otherwise it
    is left untouched.

    'flags' is a set of DYNAMIC_LINK_* flags, each enabling its linking stage. **/
bool dynamic_link( const char* library,
                   const dynamic_link_descriptor descriptors[],
                   std::size_t required,
                   dynamic_link_handle* handle = nullptr,
                   int flags = DYNAMIC_LINK_DEFAULT );

void dynamic_unlink( dynamic_link_handle handle );

void dynamic_unlink_all();

//! Codes reported through the dynamic-link warning channel.
// The trailing comments list the extra arguments that accompany each code.
// Note: dlerr_t depends on OS: it is char const * on Linux* and macOS*, int on Windows*.
enum dynamic_link_error_t {
    dl_success = 0,
    dl_lib_not_found = 1,     // char const * lib, dlerr_t err
    dl_sym_not_found = 2,     // char const * sym, dlerr_t err
    dl_sys_fail = 3,          // char const * func, int err
    dl_buff_too_small = 4     // none
}; // dynamic_link_error_t

} // namespace r1
} // namespace detail
} // namespace tbb
#ifndef __TBB_tbb_environment_H
#define __TBB_tbb_environment_H

#include <cstdlib>  // std::getenv, std::strtol
#include <cstring>  // std::strspn
#include <cerrno>   // errno, ERANGE
#include <cctype>   // std::isspace

namespace tbb {
namespace detail {
namespace r1 {

#if __TBB_WIN8UI_SUPPORT
//! Stub for this configuration: the environment is not consulted, always false.
static inline bool GetBoolEnvironmentVariable( const char * ) {
    return false;
}

//! Stub for this configuration: the environment is not consulted, always -1.
static inline long GetIntegralEnvironmentVariable( const char * ) {
    return -1;
}
#else  /* __TBB_WIN8UI_SUPPORT */
//! Read a boolean switch from the environment.
/** Returns true only if variable `name` is set and consists of exactly one
    '1' character optionally surrounded by space characters. Any other content,
    or an unset variable, yields false. **/
static inline bool GetBoolEnvironmentVariable( const char * name ) {
    if ( const char* s = std::getenv(name) ) {
        // Skip leading spaces; the next character must be the single '1'.
        std::size_t index = std::strspn(s, " ");
        if (s[index] != '1') return false;
        index++;
        // Reading s[index] after the increment is safe: getenv() returns a
        // null-terminated string, so in the worst case index now refers to
        // the terminating '\0'.
        index += std::strspn(&s[index], " ");
        // Only trailing spaces are allowed after the '1'.
        return !s[index];
    }
    return false;
}

//! Read a non-negative decimal integer from the environment.
/** Returns the parsed value of variable `name`, or -1 if the variable is
    unset, is not a number, is negative, is out of range for long, or is
    followed by anything other than whitespace. **/
static inline long GetIntegralEnvironmentVariable( const char * name ) {
    if ( const char* s = std::getenv(name) ) {
        char* end = nullptr;
        errno = 0;
        long value = std::strtol(s, &end, 10);

        // The value is out of range, negative, or the string is inconvertible
        if ( errno == ERANGE || value < 0 || end==s ) {
            return -1;
        }
        for ( ; *end != '\0'; end++ ) {
            // std::isspace requires a value representable as unsigned char
            // (or EOF); passing a plain char with the high bit set is
            // undefined behaviour, hence the cast.
            if ( !std::isspace(static_cast<unsigned char>(*end)) ) {
                return -1;
            }
        }
        return value;
    }
    return -1;
}
#endif /* __TBB_WIN8UI_SUPPORT */

} // namespace r1
} // namespace detail
} // namespace tbb

#endif // __TBB_tbb_environment_H
"License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "oneapi/tbb/detail/_exception.h" #include "oneapi/tbb/detail/_assert.h" #include "oneapi/tbb/detail/_template_helpers.h" #include #include #include // std::runtime_error #include #include #define __TBB_STD_RETHROW_EXCEPTION_POSSIBLY_BROKEN \ (__GLIBCXX__ && __TBB_GLIBCXX_VERSION>=40700 && __TBB_GLIBCXX_VERSION<60000 && TBB_USE_EXCEPTIONS) #if __TBB_STD_RETHROW_EXCEPTION_POSSIBLY_BROKEN // GCC ABI declarations necessary for a workaround #include #endif namespace tbb { namespace detail { namespace r1 { const char* bad_last_alloc::what() const noexcept(true) { return "bad allocation in previous or concurrent attempt"; } const char* user_abort::what() const noexcept(true) { return "User-initiated abort has terminated this operation"; } const char* missing_wait::what() const noexcept(true) { return "wait() was not called on the structured_task_group"; } #if TBB_USE_EXCEPTIONS template /*[[noreturn]]*/ void do_throw_noexcept(F throw_func) noexcept { throw_func(); } /*[[noreturn]]*/ void do_throw_noexcept(void (*throw_func)()) noexcept { throw_func(); #if __GNUC__ == 7 // In release, GCC 7 loses noexcept attribute during tail call optimization. // The following statement prevents tail call optimization. 
volatile bool reach_this_point = true; suppress_unused_warning(reach_this_point); #endif } bool terminate_on_exception(); // defined in global_control.cpp and ipc_server.cpp template /*[[noreturn]]*/ void do_throw(F throw_func) { if (terminate_on_exception()) { do_throw_noexcept(throw_func); } throw_func(); } #define DO_THROW(exc, init_args) do_throw( []{ throw exc init_args; } ); #else /* !TBB_USE_EXCEPTIONS */ #define PRINT_ERROR_AND_ABORT(exc_name, msg) \ std::fprintf (stderr, "Exception %s with message %s would have been thrown, " \ "if exception handling had not been disabled. Aborting.\n", exc_name, msg); \ std::fflush(stderr); \ std::abort(); #define DO_THROW(exc, init_args) PRINT_ERROR_AND_ABORT(#exc, #init_args) #endif /* !TBB_USE_EXCEPTIONS */ void throw_exception ( exception_id eid ) { switch ( eid ) { case exception_id::bad_alloc: DO_THROW(std::bad_alloc, ()); break; case exception_id::bad_last_alloc: DO_THROW(bad_last_alloc, ()); break; case exception_id::user_abort: DO_THROW( user_abort, () ); break; case exception_id::nonpositive_step: DO_THROW(std::invalid_argument, ("Step must be positive") ); break; case exception_id::out_of_range: DO_THROW(std::out_of_range, ("Index out of requested size range")); break; case exception_id::reservation_length_error: DO_THROW(std::length_error, ("Attempt to exceed implementation defined length limits")); break; case exception_id::missing_wait: DO_THROW(missing_wait, ()); break; case exception_id::invalid_load_factor: DO_THROW(std::out_of_range, ("Invalid hash load factor")); break; case exception_id::invalid_key: DO_THROW(std::out_of_range, ("invalid key")); break; case exception_id::bad_tagged_msg_cast: DO_THROW(std::runtime_error, ("Illegal tagged_msg cast")); break; case exception_id::unsafe_wait: DO_THROW(unsafe_wait, ("Unsafe to wait further")); break; default: __TBB_ASSERT ( false, "Unknown exception ID" ); } __TBB_ASSERT(false, "Unreachable code"); } /* The "what" should be fairly short, not more than about 
128 characters. Because we control all the call sites to handle_perror, it is pointless to bullet-proof it for very long strings. Design note: ADR put this routine off to the side in tbb_misc.cpp instead of Task.cpp because the throw generates a pathetic lot of code, and ADR wanted this large chunk of code to be placed on a cold page. */ void handle_perror( int error_code, const char* what ) { const int BUF_SIZE = 255; char buf[BUF_SIZE + 1] = { 0 }; std::strncat(buf, what, BUF_SIZE); std::size_t buf_len = std::strlen(buf); if (error_code) { std::strncat(buf, ": ", BUF_SIZE - buf_len); buf_len = std::strlen(buf); std::strncat(buf, std::strerror(error_code), BUF_SIZE - buf_len); buf_len = std::strlen(buf); } __TBB_ASSERT(buf_len <= BUF_SIZE && buf[buf_len] == 0, nullptr); #if TBB_USE_EXCEPTIONS do_throw([&buf] { throw std::runtime_error(buf); }); #else PRINT_ERROR_AND_ABORT( "runtime_error", buf); #endif /* !TBB_USE_EXCEPTIONS */ } #if __TBB_STD_RETHROW_EXCEPTION_POSSIBLY_BROKEN // Runtime detection and workaround for the GCC bug 62258. // The problem is that std::rethrow_exception() does not increment a counter // of active exceptions, causing std::uncaught_exception() to return a wrong value. // The code is created after, and roughly reflects, the workaround // at https://gcc.gnu.org/bugzilla/attachment.cgi?id=34683 void fix_broken_rethrow() { struct gcc_eh_data { void * caughtExceptions; unsigned int uncaughtExceptions; }; gcc_eh_data* eh_data = punned_cast( abi::__cxa_get_globals() ); ++eh_data->uncaughtExceptions; } bool gcc_rethrow_exception_broken() { bool is_broken; __TBB_ASSERT( !std::uncaught_exception(), "gcc_rethrow_exception_broken() must not be called when an exception is active" ); try { // Throw, catch, and rethrow an exception try { throw __TBB_GLIBCXX_VERSION; } catch(...) { std::rethrow_exception( std::current_exception() ); } } catch(...) 
{ // Check the bug presence is_broken = std::uncaught_exception(); } if( is_broken ) fix_broken_rethrow(); __TBB_ASSERT( !std::uncaught_exception(), nullptr); return is_broken; } #else void fix_broken_rethrow() {} bool gcc_rethrow_exception_broken() { return false; } #endif /* __TBB_STD_RETHROW_EXCEPTION_POSSIBLY_BROKEN */ } // namespace r1 } // namespace detail } // namespace tbb ================================================ FILE: third-party/tbb/src/tbb/global_control.cpp ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "oneapi/tbb/detail/_config.h" #include "oneapi/tbb/detail/_template_helpers.h" #include "oneapi/tbb/cache_aligned_allocator.h" #include "oneapi/tbb/global_control.h" #include "oneapi/tbb/tbb_allocator.h" #include "oneapi/tbb/spin_mutex.h" #include "governor.h" #include "threading_control.h" #include "market.h" #include "misc.h" #include #include namespace tbb { namespace detail { namespace r1 { //! 
Comparator for a set of global_control objects struct control_storage_comparator { bool operator()(const d1::global_control* lhs, const d1::global_control* rhs) const; }; class control_storage { friend struct global_control_impl; friend std::size_t global_control_active_value(int); friend void global_control_lock(); friend void global_control_unlock(); friend std::size_t global_control_active_value_unsafe(d1::global_control::parameter); protected: std::size_t my_active_value{0}; std::set> my_list{}; spin_mutex my_list_mutex{}; public: virtual ~control_storage() = default; virtual std::size_t default_value() const = 0; virtual void apply_active(std::size_t new_active) { my_active_value = new_active; } virtual bool is_first_arg_preferred(std::size_t a, std::size_t b) const { return a>b; // prefer max by default } virtual std::size_t active_value() { spin_mutex::scoped_lock lock(my_list_mutex); // protect my_list.empty() call return !my_list.empty() ? my_active_value : default_value(); } std::size_t active_value_unsafe() { return !my_list.empty() ? my_active_value : default_value(); } }; class alignas(max_nfs_size) allowed_parallelism_control : public control_storage { std::size_t default_value() const override { return max(1U, governor::default_num_threads()); } bool is_first_arg_preferred(std::size_t a, std::size_t b) const override { return a= 1, nullptr); // -1 to take external thread into account threading_control::set_active_num_workers(my_active_value - 1); } std::size_t active_value() override { spin_mutex::scoped_lock lock(my_list_mutex); // protect my_list.empty() call if (my_list.empty()) { return default_value(); } // non-zero, if market is active const std::size_t workers = threading_control::max_num_workers(); // We can't exceed market's maximal number of workers. // +1 to take external thread into account return workers ? 
min(workers + 1, my_active_value) : my_active_value; } }; class alignas(max_nfs_size) stack_size_control : public control_storage { std::size_t default_value() const override { #if _WIN32_WINNT >= 0x0602 /* _WIN32_WINNT_WIN8 */ static auto ThreadStackSizeDefault = [] { ULONG_PTR hi, lo; GetCurrentThreadStackLimits(&lo, &hi); return hi - lo; }(); return ThreadStackSizeDefault; #elif defined(EMSCRIPTEN) return __TBB_EMSCRIPTEN_STACK_SIZE; #else return ThreadStackSize; #endif } void apply_active(std::size_t new_active) override { control_storage::apply_active(new_active); #if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00) __TBB_ASSERT( false, "For Windows 8 Store* apps we must not set stack size" ); #endif } }; class alignas(max_nfs_size) terminate_on_exception_control : public control_storage { std::size_t default_value() const override { return 0; } }; class alignas(max_nfs_size) lifetime_control : public control_storage { bool is_first_arg_preferred(std::size_t, std::size_t) const override { return false; // not interested } std::size_t default_value() const override { return 0; } void apply_active(std::size_t new_active) override { if (new_active == 1) { // reserve the market reference threading_control::register_lifetime_control(); } else if (new_active == 0) { // new_active == 0 threading_control::unregister_lifetime_control(/*blocking_terminate*/ false); } control_storage::apply_active(new_active); } }; static control_storage* controls[] = {nullptr, nullptr, nullptr, nullptr}; void global_control_acquire() { controls[0] = new (cache_aligned_allocate(sizeof(allowed_parallelism_control))) allowed_parallelism_control{}; controls[1] = new (cache_aligned_allocate(sizeof(stack_size_control))) stack_size_control{}; controls[2] = new (cache_aligned_allocate(sizeof(terminate_on_exception_control))) terminate_on_exception_control{}; controls[3] = new (cache_aligned_allocate(sizeof(lifetime_control))) lifetime_control{}; } void global_control_release() { for (auto& ptr : 
controls) { ptr->~control_storage(); cache_aligned_deallocate(ptr); ptr = nullptr; } } void global_control_lock() { for (auto& ctl : controls) { ctl->my_list_mutex.lock(); } } void global_control_unlock() { int N = std::distance(std::begin(controls), std::end(controls)); for (int i = N - 1; i >= 0; --i) { controls[i]->my_list_mutex.unlock(); } } std::size_t global_control_active_value_unsafe(d1::global_control::parameter param) { __TBB_ASSERT_RELEASE(param < d1::global_control::parameter_max, nullptr); return controls[param]->active_value_unsafe(); } //! Comparator for a set of global_control objects inline bool control_storage_comparator::operator()(const d1::global_control* lhs, const d1::global_control* rhs) const { __TBB_ASSERT_RELEASE(lhs->my_param < d1::global_control::parameter_max , nullptr); return lhs->my_value < rhs->my_value || (lhs->my_value == rhs->my_value && lhs < rhs); } bool terminate_on_exception() { return d1::global_control::active_value(d1::global_control::terminate_on_exception) == 1; } struct global_control_impl { private: static bool erase_if_present(control_storage* const c, d1::global_control& gc) { auto it = c->my_list.find(&gc); if (it != c->my_list.end()) { c->my_list.erase(it); return true; } return false; } public: static void create(d1::global_control& gc) { __TBB_ASSERT_RELEASE(gc.my_param < d1::global_control::parameter_max, nullptr); control_storage* const c = controls[gc.my_param]; spin_mutex::scoped_lock lock(c->my_list_mutex); if (c->my_list.empty() || c->is_first_arg_preferred(gc.my_value, c->my_active_value)) { // to guarantee that apply_active() is called with current active value, // calls it here and in internal_destroy() under my_list_mutex c->apply_active(gc.my_value); } c->my_list.insert(&gc); } static void destroy(d1::global_control& gc) { __TBB_ASSERT_RELEASE(gc.my_param < d1::global_control::parameter_max, nullptr); control_storage* const c = controls[gc.my_param]; // Concurrent reading and changing global parameter 
is possible. spin_mutex::scoped_lock lock(c->my_list_mutex); __TBB_ASSERT(gc.my_param == d1::global_control::scheduler_handle || !c->my_list.empty(), nullptr); std::size_t new_active = (std::size_t)(-1), old_active = c->my_active_value; if (!erase_if_present(c, gc)) { __TBB_ASSERT(gc.my_param == d1::global_control::scheduler_handle , nullptr); return; } if (c->my_list.empty()) { __TBB_ASSERT(new_active == (std::size_t) - 1, nullptr); new_active = c->default_value(); } else { new_active = (*c->my_list.begin())->my_value; } if (new_active != old_active) { c->apply_active(new_active); } } static bool remove_and_check_if_empty(d1::global_control& gc) { __TBB_ASSERT_RELEASE(gc.my_param < d1::global_control::parameter_max, nullptr); control_storage* const c = controls[gc.my_param]; spin_mutex::scoped_lock lock(c->my_list_mutex); __TBB_ASSERT(!c->my_list.empty(), nullptr); erase_if_present(c, gc); return c->my_list.empty(); } #if TBB_USE_ASSERT static bool is_present(d1::global_control& gc) { __TBB_ASSERT_RELEASE(gc.my_param < d1::global_control::parameter_max, nullptr); control_storage* const c = controls[gc.my_param]; spin_mutex::scoped_lock lock(c->my_list_mutex); auto it = c->my_list.find(&gc); if (it != c->my_list.end()) { return true; } return false; } #endif // TBB_USE_ASSERT }; void __TBB_EXPORTED_FUNC create(d1::global_control& gc) { global_control_impl::create(gc); } void __TBB_EXPORTED_FUNC destroy(d1::global_control& gc) { global_control_impl::destroy(gc); } bool remove_and_check_if_empty(d1::global_control& gc) { return global_control_impl::remove_and_check_if_empty(gc); } #if TBB_USE_ASSERT bool is_present(d1::global_control& gc) { return global_control_impl::is_present(gc); } #endif // TBB_USE_ASSERT std::size_t __TBB_EXPORTED_FUNC global_control_active_value(int param) { __TBB_ASSERT_RELEASE(param < d1::global_control::parameter_max, nullptr); return controls[param]->active_value(); } } // namespace r1 } // namespace detail } // namespace tbb 
================================================ FILE: third-party/tbb/src/tbb/governor.cpp ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "governor.h" #include "threading_control.h" #include "main.h" #include "thread_data.h" #include "market.h" #include "arena.h" #include "dynamic_link.h" #include "concurrent_monitor.h" #include "thread_dispatcher.h" #include "oneapi/tbb/task_group.h" #include "oneapi/tbb/global_control.h" #include "oneapi/tbb/tbb_allocator.h" #include "oneapi/tbb/info.h" #include "task_dispatcher.h" #include #include #include #include #include #ifdef EMSCRIPTEN #include #endif namespace tbb { namespace detail { namespace r1 { #if TBB_USE_ASSERT std::atomic the_observer_proxy_count; #endif /* TBB_USE_ASSERT */ void clear_address_waiter_table(); void global_control_acquire(); void global_control_release(); //! 
global_control.cpp contains definition bool remove_and_check_if_empty(d1::global_control& gc); bool is_present(d1::global_control& gc); namespace rml { tbb_server* make_private_server( tbb_client& client ); } // namespace rml namespace system_topology { void destroy(); } //------------------------------------------------------------------------ // governor //------------------------------------------------------------------------ void governor::acquire_resources () { global_control_acquire(); #if __TBB_USE_POSIX int status = theTLS.create(auto_terminate); #else int status = theTLS.create(); #endif if( status ) handle_perror(status, "TBB failed to initialize task scheduler TLS\n"); detect_cpu_features(cpu_features); is_rethrow_broken = gcc_rethrow_exception_broken(); } void governor::release_resources () { theRMLServerFactory.close(); destroy_process_mask(); __TBB_ASSERT(!(__TBB_InitOnce::initialization_done() && theTLS.get()), "TBB is unloaded while thread data still alive?"); int status = theTLS.destroy(); if( status ) runtime_warning("failed to destroy task scheduler TLS: %s", std::strerror(status)); clear_address_waiter_table(); #if TBB_USE_ASSERT if (the_observer_proxy_count != 0) { runtime_warning("Leaked %ld observer_proxy objects\n", long(the_observer_proxy_count)); } #endif /* TBB_USE_ASSERT */ system_topology::destroy(); dynamic_unlink_all(); global_control_release(); } rml::tbb_server* governor::create_rml_server ( rml::tbb_client& client ) { rml::tbb_server* server = nullptr; if( !UsePrivateRML ) { ::rml::factory::status_type status = theRMLServerFactory.make_server( server, client ); if( status != ::rml::factory::st_success ) { UsePrivateRML = true; runtime_warning( "rml::tbb_factory::make_server failed with status %x, falling back on private rml", status ); } } if ( !server ) { __TBB_ASSERT( UsePrivateRML, nullptr); server = rml::make_private_server( client ); } __TBB_ASSERT( server, "Failed to create RML server" ); return server; } void 
governor::one_time_init() { if ( !__TBB_InitOnce::initialization_done() ) { DoOneTimeInitialization(); } } bool governor::does_client_join_workers(const rml::tbb_client &client) { return ((const thread_dispatcher&)client).must_join_workers(); } /* There is no portable way to get stack base address in Posix, however the modern Linux versions provide pthread_attr_np API that can be used to obtain thread's stack size and base address. Unfortunately even this function does not provide enough information for the main thread on IA-64 architecture (RSE spill area and memory stack are allocated as two separate discontinuous chunks of memory), and there is no portable way to discern the main and the secondary threads. Thus for macOS* and IA-64 architecture for Linux* OS we use the TBB worker stack size for all threads and use the current stack top as the stack base. This simplified approach is based on the following assumptions: 1) If the default stack size is insufficient for the user app needs, the required amount will be explicitly specified by the user at the point of the TBB scheduler initialization (as an argument to tbb::task_scheduler_init constructor). 2) When an external thread initializes the scheduler, it has enough space on its stack. Here "enough" means "at least as much as worker threads have". 3) If the user app strives to conserve the memory by cutting stack size, it should do this for TBB workers too (as in the #1). */ static void get_stack_attributes(std::uintptr_t& stack_base, std::size_t& stack_size, std::size_t fallback_stack_size) { // Stacks are growing top-down. Highest address is called "stack base", // and the lowest is "stack limit". 
stack_size = fallback_stack_size; #if __TBB_USE_WINAPI NT_TIB* pteb = (NT_TIB*)NtCurrentTeb(); __TBB_ASSERT(&pteb < pteb->StackBase && &pteb > pteb->StackLimit, "invalid stack info in TEB"); stack_base = reinterpret_cast(pteb->StackBase); #elif defined(EMSCRIPTEN) stack_base = reinterpret_cast(emscripten_stack_get_base()); #else // There is no portable way to get stack base address in Posix, so we use // non-portable method (on all modern Linux) or the simplified approach // based on the common sense assumptions. The most important assumption // is that the main thread's stack size is not less than that of other threads. // Points to the lowest addressable byte of a stack. void* stack_limit = nullptr; #if __linux__ && !__bg__ size_t np_stack_size = 0; pthread_attr_t np_attr_stack; if (0 == pthread_getattr_np(pthread_self(), &np_attr_stack)) { if (0 == pthread_attr_getstack(&np_attr_stack, &stack_limit, &np_stack_size)) { __TBB_ASSERT( &stack_limit > stack_limit, "stack size must be positive" ); if (np_stack_size > 0) stack_size = np_stack_size; } pthread_attr_destroy(&np_attr_stack); } #endif /* __linux__ */ if (stack_limit) { stack_base = reinterpret_cast(stack_limit) + stack_size; } else { // Use an anchor as a base stack address. 
int anchor{}; stack_base = reinterpret_cast(&anchor); } #endif /* __TBB_USE_WINAPI */ } #if (_WIN32||_WIN64) && !__TBB_DYNAMIC_LOAD_ENABLED static void register_external_thread_destructor() { struct thread_destructor { ~thread_destructor() { governor::terminate_external_thread(); } }; // ~thread_destructor() will be call during the calling thread termination static thread_local thread_destructor thr_destructor; } #endif // (_WIN32||_WIN64) && !__TBB_DYNAMIC_LOAD_ENABLED void governor::init_external_thread() { one_time_init(); // Create new scheduler instance with arena int num_slots = default_num_threads(); // TODO_REVAMP: support an external thread without an implicit arena int num_reserved_slots = 1; unsigned arena_priority_level = 1; // corresponds to tbb::task_arena::priority::normal std::size_t stack_size = 0; threading_control* thr_control = threading_control::register_public_reference(); arena& a = arena::create(thr_control, num_slots, num_reserved_slots, arena_priority_level); // External thread always occupies the first slot thread_data& td = *new(cache_aligned_allocate(sizeof(thread_data))) thread_data(0, false); td.attach_arena(a, /*slot index*/ 0); __TBB_ASSERT(td.my_inbox.is_idle_state(false), nullptr); std::uintptr_t stack_base{}; get_stack_attributes(stack_base, stack_size, a.my_threading_control->worker_stack_size()); task_dispatcher& task_disp = td.my_arena_slot->default_task_dispatcher(); td.enter_task_dispatcher(task_disp, calculate_stealing_threshold(stack_base, stack_size)); td.my_arena_slot->occupy(); thr_control->register_thread(td); set_thread_data(td); #if (_WIN32||_WIN64) && !__TBB_DYNAMIC_LOAD_ENABLED // The external thread destructor is called from dllMain but it is not available with a static build. // Therefore, we need to register the current thread to call the destructor during thread termination. 
register_external_thread_destructor(); #endif } void governor::auto_terminate(void* tls) { __TBB_ASSERT(get_thread_data_if_initialized() == nullptr || get_thread_data_if_initialized() == tls, nullptr); if (tls) { thread_data* td = static_cast(tls); auto clear_tls = [td] { td->~thread_data(); cache_aligned_deallocate(td); clear_thread_data(); }; // Only external thread can be inside an arena during termination. if (td->my_arena_slot) { arena* a = td->my_arena; threading_control* thr_control = a->my_threading_control; // If the TLS slot is already cleared by OS or underlying concurrency // runtime, restore its value to properly clean up arena if (!is_thread_data_set(td)) { set_thread_data(*td); } a->my_observers.notify_exit_observers(td->my_last_observer, td->my_is_worker); td->leave_task_dispatcher(); td->my_arena_slot->release(); // Release an arena a->on_thread_leaving(arena::ref_external); thr_control->unregister_thread(*td); // The tls should be cleared before market::release because // market can destroy the tls key if we keep the last reference clear_tls(); // If there was an associated arena, it added a public market reference thr_control->unregister_public_reference(/* blocking terminate =*/ false); } else { clear_tls(); } } __TBB_ASSERT(get_thread_data_if_initialized() == nullptr, nullptr); } void governor::initialize_rml_factory () { ::rml::factory::status_type res = theRMLServerFactory.open(); UsePrivateRML = res != ::rml::factory::st_success; } void __TBB_EXPORTED_FUNC get(d1::task_scheduler_handle& handle) { handle.m_ctl = new(allocate_memory(sizeof(global_control))) global_control(global_control::scheduler_handle, 1); } void release_impl(d1::task_scheduler_handle& handle) { if (handle.m_ctl != nullptr) { handle.m_ctl->~global_control(); deallocate_memory(handle.m_ctl); handle.m_ctl = nullptr; } } bool finalize_impl(d1::task_scheduler_handle& handle) { __TBB_ASSERT_RELEASE(handle, "trying to finalize with null handle"); 
__TBB_ASSERT(is_present(*handle.m_ctl), "finalize or release was already called on this object"); bool ok = true; // ok if threading_control does not exist yet if (threading_control::is_present()) { thread_data* td = governor::get_thread_data_if_initialized(); if (td) { task_dispatcher* task_disp = td->my_task_dispatcher; __TBB_ASSERT(task_disp, nullptr); if (task_disp->m_properties.outermost && !td->my_is_worker) { // is not inside a parallel region governor::auto_terminate(td); } } if (remove_and_check_if_empty(*handle.m_ctl)) { ok = threading_control::unregister_lifetime_control(/*blocking_terminate*/ true); } else { ok = false; } } return ok; } bool __TBB_EXPORTED_FUNC finalize(d1::task_scheduler_handle& handle, std::intptr_t mode) { if (mode == d1::release_nothrowing) { release_impl(handle); return true; } else { bool ok = finalize_impl(handle); // TODO: it is unsafe when finalize is called concurrently and further library unload release_impl(handle); if (mode == d1::finalize_throwing && !ok) { throw_exception(exception_id::unsafe_wait); } return ok; } } #if __TBB_ARENA_BINDING #if __TBB_WEAK_SYMBOLS_PRESENT #pragma weak __TBB_internal_initialize_system_topology #pragma weak __TBB_internal_destroy_system_topology #pragma weak __TBB_internal_allocate_binding_handler #pragma weak __TBB_internal_deallocate_binding_handler #pragma weak __TBB_internal_apply_affinity #pragma weak __TBB_internal_restore_affinity #pragma weak __TBB_internal_get_default_concurrency extern "C" { void __TBB_internal_initialize_system_topology( size_t groups_num, int& numa_nodes_count, int*& numa_indexes_list, int& core_types_count, int*& core_types_indexes_list ); void __TBB_internal_destroy_system_topology( ); //TODO: consider renaming to `create_binding_handler` and `destroy_binding_handler` binding_handler* __TBB_internal_allocate_binding_handler( int slot_num, int numa_id, int core_type_id, int max_threads_per_core ); void __TBB_internal_deallocate_binding_handler( binding_handler* 
handler_ptr ); void __TBB_internal_apply_affinity( binding_handler* handler_ptr, int slot_num ); void __TBB_internal_restore_affinity( binding_handler* handler_ptr, int slot_num ); int __TBB_internal_get_default_concurrency( int numa_id, int core_type_id, int max_threads_per_core ); } #endif /* __TBB_WEAK_SYMBOLS_PRESENT */ // Stubs that will be used if TBBbind library is unavailable. static void dummy_destroy_system_topology ( ) { } static binding_handler* dummy_allocate_binding_handler ( int, int, int, int ) { return nullptr; } static void dummy_deallocate_binding_handler ( binding_handler* ) { } static void dummy_apply_affinity ( binding_handler*, int ) { } static void dummy_restore_affinity ( binding_handler*, int ) { } static int dummy_get_default_concurrency( int, int, int ) { return governor::default_num_threads(); } // Handlers for communication with TBBbind static void (*initialize_system_topology_ptr)( size_t groups_num, int& numa_nodes_count, int*& numa_indexes_list, int& core_types_count, int*& core_types_indexes_list ) = nullptr; static void (*destroy_system_topology_ptr)( ) = dummy_destroy_system_topology; static binding_handler* (*allocate_binding_handler_ptr)( int slot_num, int numa_id, int core_type_id, int max_threads_per_core ) = dummy_allocate_binding_handler; static void (*deallocate_binding_handler_ptr)( binding_handler* handler_ptr ) = dummy_deallocate_binding_handler; static void (*apply_affinity_ptr)( binding_handler* handler_ptr, int slot_num ) = dummy_apply_affinity; static void (*restore_affinity_ptr)( binding_handler* handler_ptr, int slot_num ) = dummy_restore_affinity; int (*get_default_concurrency_ptr)( int numa_id, int core_type_id, int max_threads_per_core ) = dummy_get_default_concurrency; #if _WIN32 || _WIN64 || __unix__ || __APPLE__ // Table describing how to link the handlers. 
// Each DLD entry pairs a symbol exported by TBBbind with the local handler pointer declared above;
// the table is handed to dynamic_link() when the library is loaded.
static const dynamic_link_descriptor TbbBindLinkTable[] = {
    DLD(__TBB_internal_initialize_system_topology, initialize_system_topology_ptr),
    DLD(__TBB_internal_destroy_system_topology, destroy_system_topology_ptr),
#if __TBB_CPUBIND_PRESENT
    // Binding-handler entry points are only linked when CPU binding support is compiled in.
    DLD(__TBB_internal_allocate_binding_handler, allocate_binding_handler_ptr),
    DLD(__TBB_internal_deallocate_binding_handler, deallocate_binding_handler_ptr),
    DLD(__TBB_internal_apply_affinity, apply_affinity_ptr),
    DLD(__TBB_internal_restore_affinity, restore_affinity_ptr),
#endif /* __TBB_CPUBIND_PRESENT */
    DLD(__TBB_internal_get_default_concurrency, get_default_concurrency_ptr)
};

// Number of entries in TbbBindLinkTable.
static const unsigned LinkTableSize = sizeof(TbbBindLinkTable) / sizeof(dynamic_link_descriptor);

// Debug builds look for the "_debug" flavour of the TBBbind library.
#if TBB_USE_DEBUG
#define DEBUG_SUFFIX "_debug"
#else
#define DEBUG_SUFFIX
#endif /* TBB_USE_DEBUG */

// Platform-specific shared library prefix and extension.
#if _WIN32 || _WIN64
#define LIBRARY_EXTENSION ".dll"
#define LIBRARY_PREFIX
#elif __APPLE__
#define LIBRARY_EXTENSION __TBB_STRING(.3.dylib)
#define LIBRARY_PREFIX "lib"
#elif __unix__
#define LIBRARY_EXTENSION __TBB_STRING(.so.3)
#define LIBRARY_PREFIX "lib"
#endif /* _WIN32 || _WIN64, __APPLE__, __unix__ */

// Full file names of the TBBbind library variants.
#define TBBBIND_NAME            LIBRARY_PREFIX "tbbbind"     DEBUG_SUFFIX LIBRARY_EXTENSION
#define TBBBIND_2_0_NAME        LIBRARY_PREFIX "tbbbind_2_0" DEBUG_SUFFIX LIBRARY_EXTENSION

#define TBBBIND_2_5_NAME        LIBRARY_PREFIX "tbbbind_2_5" DEBUG_SUFFIX LIBRARY_EXTENSION
#endif /* _WIN32 || _WIN64 || __unix__ || __APPLE__ */

// Representation of system hardware topology information on the TBB side.
// System topology may be initialized by third-party component (e.g. hwloc)
// or just filled in with default stubs.
namespace system_topology { constexpr int automatic = -1; static std::atomic initialization_state; namespace { int numa_nodes_count = 0; int* numa_nodes_indexes = nullptr; int core_types_count = 0; int* core_types_indexes = nullptr; const char* load_tbbbind_shared_object() { #if _WIN32 || _WIN64 || __unix__ || __APPLE__ #if _WIN32 && !_WIN64 // For 32-bit Windows applications, process affinity masks can only support up to 32 logical CPUs. SYSTEM_INFO si; GetNativeSystemInfo(&si); if (si.dwNumberOfProcessors > 32) return nullptr; #endif /* _WIN32 && !_WIN64 */ for (const auto& tbbbind_version : {TBBBIND_2_5_NAME, TBBBIND_2_0_NAME, TBBBIND_NAME}) { if (dynamic_link(tbbbind_version, TbbBindLinkTable, LinkTableSize, nullptr, DYNAMIC_LINK_LOCAL_BINDING)) { return tbbbind_version; } } #endif /* _WIN32 || _WIN64 || __unix__ || __APPLE__ */ return nullptr; } int processor_groups_num() { #if _WIN32 return NumberOfProcessorGroups(); #else // Stub to improve code readability by reducing number of the compile-time conditions return 1; #endif } } // internal namespace // Tries to load TBBbind library API, if success, gets NUMA topology information from it, // in another case, fills NUMA topology by stubs. 
void initialization_impl() { governor::one_time_init(); if (const char* tbbbind_name = load_tbbbind_shared_object()) { initialize_system_topology_ptr( processor_groups_num(), numa_nodes_count, numa_nodes_indexes, core_types_count, core_types_indexes ); PrintExtraVersionInfo("TBBBIND", tbbbind_name); return; } static int dummy_index = automatic; numa_nodes_count = 1; numa_nodes_indexes = &dummy_index; core_types_count = 1; core_types_indexes = &dummy_index; PrintExtraVersionInfo("TBBBIND", "UNAVAILABLE"); } void initialize() { atomic_do_once(initialization_impl, initialization_state); } void destroy() { destroy_system_topology_ptr(); } } // namespace system_topology binding_handler* construct_binding_handler(int slot_num, int numa_id, int core_type_id, int max_threads_per_core) { system_topology::initialize(); return allocate_binding_handler_ptr(slot_num, numa_id, core_type_id, max_threads_per_core); } void destroy_binding_handler(binding_handler* handler_ptr) { __TBB_ASSERT(deallocate_binding_handler_ptr, "tbbbind loading was not performed"); deallocate_binding_handler_ptr(handler_ptr); } void apply_affinity_mask(binding_handler* handler_ptr, int slot_index) { __TBB_ASSERT(slot_index >= 0, "Negative thread index"); __TBB_ASSERT(apply_affinity_ptr, "tbbbind loading was not performed"); apply_affinity_ptr(handler_ptr, slot_index); } void restore_affinity_mask(binding_handler* handler_ptr, int slot_index) { __TBB_ASSERT(slot_index >= 0, "Negative thread index"); __TBB_ASSERT(restore_affinity_ptr, "tbbbind loading was not performed"); restore_affinity_ptr(handler_ptr, slot_index); } unsigned __TBB_EXPORTED_FUNC numa_node_count() { system_topology::initialize(); return system_topology::numa_nodes_count; } void __TBB_EXPORTED_FUNC fill_numa_indices(int* index_array) { system_topology::initialize(); std::memcpy(index_array, system_topology::numa_nodes_indexes, system_topology::numa_nodes_count * sizeof(int)); } int __TBB_EXPORTED_FUNC numa_default_concurrency(int node_id) 
{ if (node_id >= 0) { system_topology::initialize(); int result = get_default_concurrency_ptr( node_id, /*core_type*/system_topology::automatic, /*threads_per_core*/system_topology::automatic ); if (result > 0) return result; } return governor::default_num_threads(); } unsigned __TBB_EXPORTED_FUNC core_type_count(intptr_t /*reserved*/) { system_topology::initialize(); return system_topology::core_types_count; } void __TBB_EXPORTED_FUNC fill_core_type_indices(int* index_array, intptr_t /*reserved*/) { system_topology::initialize(); std::memcpy(index_array, system_topology::core_types_indexes, system_topology::core_types_count * sizeof(int)); } void constraints_assertion(d1::constraints c) { bool is_topology_initialized = system_topology::initialization_state == do_once_state::initialized; __TBB_ASSERT_RELEASE(c.max_threads_per_core == system_topology::automatic || c.max_threads_per_core > 0, "Wrong max_threads_per_core constraints field value."); auto numa_nodes_begin = system_topology::numa_nodes_indexes; auto numa_nodes_end = system_topology::numa_nodes_indexes + system_topology::numa_nodes_count; __TBB_ASSERT_RELEASE( c.numa_id == system_topology::automatic || (is_topology_initialized && std::find(numa_nodes_begin, numa_nodes_end, c.numa_id) != numa_nodes_end), "The constraints::numa_id value is not known to the library. Use tbb::info::numa_nodes() to get the list of possible values."); int* core_types_begin = system_topology::core_types_indexes; int* core_types_end = system_topology::core_types_indexes + system_topology::core_types_count; __TBB_ASSERT_RELEASE(c.core_type == system_topology::automatic || (is_topology_initialized && std::find(core_types_begin, core_types_end, c.core_type) != core_types_end), "The constraints::core_type value is not known to the library. 
Use tbb::info::core_types() to get the list of possible values."); } int __TBB_EXPORTED_FUNC constraints_default_concurrency(const d1::constraints& c, intptr_t /*reserved*/) { constraints_assertion(c); if (c.numa_id >= 0 || c.core_type >= 0 || c.max_threads_per_core > 0) { system_topology::initialize(); return get_default_concurrency_ptr(c.numa_id, c.core_type, c.max_threads_per_core); } return governor::default_num_threads(); } int __TBB_EXPORTED_FUNC constraints_threads_per_core(const d1::constraints&, intptr_t /*reserved*/) { return system_topology::automatic; } #endif /* __TBB_ARENA_BINDING */ } // namespace r1 } // namespace detail } // namespace tbb ================================================ FILE: third-party/tbb/src/tbb/governor.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef _TBB_governor_H #define _TBB_governor_H #include "rml_tbb.h" #include "misc.h" // for AvailableHwConcurrency #include "tls.h" namespace tbb { namespace detail { namespace r1 { class market; class thread_data; class __TBB_InitOnce; #if __TBB_USE_ITT_NOTIFY //! Defined in profiling.cpp extern bool ITT_Present; #endif typedef std::size_t stack_size_type; //------------------------------------------------------------------------ // Class governor //------------------------------------------------------------------------ //! 
The class handles access to the single instance of market, and to TLS to keep scheduler instances. /** It also supports automatic on-demand initialization of the TBB scheduler. The class contains only static data members and methods.*/ class governor { private: friend class __TBB_InitOnce; friend class thread_dispatcher; friend class threading_control_impl; // TODO: consider using thread_local (measure performance and side effects) //! TLS for scheduler instances associated with individual threads static basic_tls theTLS; // TODO (TBB_REVAMP_TODO): reconsider constant names static rml::tbb_factory theRMLServerFactory; static bool UsePrivateRML; // Flags for runtime-specific conditions static cpu_features_type cpu_features; static bool is_rethrow_broken; //! Create key for thread-local storage and initialize RML. static void acquire_resources (); //! Destroy the thread-local storage key and deinitialize RML. static void release_resources (); static rml::tbb_server* create_rml_server ( rml::tbb_client& ); public: static unsigned default_num_threads () { // Caches the maximal level of parallelism supported by the hardware static unsigned num_threads = AvailableHwConcurrency(); return num_threads; } static std::size_t default_page_size () { // Caches the size of OS regular memory page static std::size_t page_size = DefaultSystemPageSize(); return page_size; } static void one_time_init(); //! Processes scheduler initialization request (possibly nested) in an external thread /** If necessary creates new instance of arena and/or local scheduler. The auto_init argument specifies if the call is due to automatic initialization. **/ static void init_external_thread(); //! The routine to undo automatic initialization. /** The signature is written with void* so that the routine can be the destructor argument to pthread_key_create. */ static void auto_terminate(void* tls); //! Obtain the thread-local instance of the thread data. 
/** If the scheduler has not been initialized yet, initialization is done automatically. Note that auto-initialized scheduler instance is destroyed only when its thread terminates. **/ static thread_data* get_thread_data() { thread_data* td = theTLS.get(); if (td) { return td; } init_external_thread(); td = theTLS.get(); __TBB_ASSERT(td, nullptr); return td; } static void set_thread_data(thread_data& td) { theTLS.set(&td); } static void clear_thread_data() { theTLS.set(nullptr); } static thread_data* get_thread_data_if_initialized () { return theTLS.get(); } static bool is_thread_data_set(thread_data* td) { return theTLS.get() == td; } //! Undo automatic initialization if necessary; call when a thread exits. static void terminate_external_thread() { auto_terminate(get_thread_data_if_initialized()); } static void initialize_rml_factory (); static bool does_client_join_workers (const rml::tbb_client &client); static bool speculation_enabled() { return cpu_features.rtm_enabled; } #if __TBB_WAITPKG_INTRINSICS_PRESENT static bool wait_package_enabled() { return cpu_features.waitpkg_enabled; } #endif static bool hybrid_cpu() { return cpu_features.hybrid; } static bool rethrow_exception_broken() { return is_rethrow_broken; } static bool is_itt_present() { #if __TBB_USE_ITT_NOTIFY return ITT_Present; #else return false; #endif } }; // class governor } // namespace r1 } // namespace detail } // namespace tbb #endif /* _TBB_governor_H */ ================================================ FILE: third-party/tbb/src/tbb/intrusive_list.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef _TBB_intrusive_list_H #define _TBB_intrusive_list_H #include "oneapi/tbb/detail/_intrusive_list_node.h" namespace tbb { namespace detail { namespace r1 { using d1::intrusive_list_node; //! List of element of type T, where T is derived from intrusive_list_node /** The class is not thread safe. **/ template class intrusive_list_base { //! Pointer to the head node intrusive_list_node my_head; //! Number of list elements std::size_t my_size; static intrusive_list_node& node ( T& item ) { return List::node(item); } static T& item ( intrusive_list_node* node ) { return List::item(node); } static const T& item( const intrusive_list_node* node ) { return List::item(node); } template class iterator_impl { static_assert(std::is_same::value || std::is_same::value, "Incorrect DereferenceType in iterator_impl"); using pointer_type = typename std::conditional::value, intrusive_list_node*, const intrusive_list_node*>::type; public: iterator_impl() : my_pos(nullptr) {} iterator_impl( pointer_type pos ) : my_pos(pos) {} iterator_impl& operator++() { my_pos = my_pos->my_next_node; return *this; } iterator_impl operator++( int ) { iterator_impl it(*this); ++*this; return it; } iterator_impl& operator--() { my_pos = my_pos->my_prev_node; return *this; } iterator_impl operator--( int ) { iterator_impl it(*this); --*this; return it; } bool operator==( const iterator_impl& rhs ) const { return my_pos == rhs.my_pos; } bool operator!=( const iterator_impl& rhs ) const { return my_pos != rhs.my_pos; } DereferenceType& operator*() const { return intrusive_list_base::item(my_pos); } DereferenceType* 
operator->() const { return &intrusive_list_base::item(my_pos); } private: // Node the iterator points to at the moment pointer_type my_pos; }; // class iterator_impl void assert_ok () const { __TBB_ASSERT( (my_head.my_prev_node == &my_head && !my_size) || (my_head.my_next_node != &my_head && my_size >0), "intrusive_list_base corrupted" ); #if TBB_USE_ASSERT >= 2 std::size_t i = 0; for ( intrusive_list_node *n = my_head.my_next_node; n != &my_head; n = n->my_next_node ) ++i; __TBB_ASSERT( my_size == i, "Wrong size" ); #endif /* TBB_USE_ASSERT >= 2 */ } public: using iterator = iterator_impl; using const_iterator = iterator_impl; intrusive_list_base () : my_size(0) { my_head.my_prev_node = &my_head; my_head.my_next_node = &my_head; } bool empty () const { return my_head.my_next_node == &my_head; } std::size_t size () const { return my_size; } iterator begin () { return iterator(my_head.my_next_node); } iterator end () { return iterator(&my_head); } const_iterator begin () const { return const_iterator(my_head.my_next_node); } const_iterator end () const { return const_iterator(&my_head); } void push_front ( T& val ) { __TBB_ASSERT( node(val).my_prev_node == &node(val) && node(val).my_next_node == &node(val), "Object with intrusive list node can be part of only one intrusive list simultaneously" ); // An object can be part of only one intrusive list at the given moment via the given node member node(val).my_prev_node = &my_head; node(val).my_next_node = my_head.my_next_node; my_head.my_next_node->my_prev_node = &node(val); my_head.my_next_node = &node(val); ++my_size; assert_ok(); } void remove( T& val ) { __TBB_ASSERT( node(val).my_prev_node != &node(val) && node(val).my_next_node != &node(val), "Element to remove is not in the list" ); __TBB_ASSERT( node(val).my_prev_node->my_next_node == &node(val) && node(val).my_next_node->my_prev_node == &node(val), "Element to remove is not in the list" ); --my_size; node(val).my_next_node->my_prev_node = 
node(val).my_prev_node; node(val).my_prev_node->my_next_node = node(val).my_next_node; #if TBB_USE_ASSERT node(val).my_prev_node = node(val).my_next_node = &node(val); #endif assert_ok(); } iterator erase ( iterator it ) { T& val = *it; ++it; remove( val ); return it; } }; // intrusive_list_base #if __TBB_TODO // With standard compliant compilers memptr_intrusive_list could be named simply intrusive_list, // and inheritance based intrusive_list version would become its partial specialization. // Here are the corresponding declarations: struct dummy_intrusive_list_item { intrusive_list_node my_node; }; template class intrusive_list : public intrusive_list_base, T>; template class intrusive_list : public intrusive_list_base, T>; #endif /* __TBB_TODO */ //! Double linked list of items of type T containing a member of type intrusive_list_node. /** NodePtr is a member pointer to the node data field. Class U is either T or a base class of T containing the node member. Default values exist for the sake of a partial specialization working with inheritance case. The list does not have ownership of its items. Its purpose is to avoid dynamic memory allocation when forming lists of existing objects. The class is not thread safe. **/ template class memptr_intrusive_list : public intrusive_list_base, T> { friend class intrusive_list_base, T>; static intrusive_list_node& node ( T& val ) { return val.*NodePtr; } static T& item ( intrusive_list_node* node ) { // Cannot use __TBB_offsetof (and consequently __TBB_get_object_ref) macro // with *NodePtr argument because gcc refuses to interpret pasted "->" and "*" // as member pointer dereferencing operator, and explicit usage of ## in // __TBB_offsetof implementation breaks operations with normal member names. return *reinterpret_cast((char*)node - ((ptrdiff_t)&(reinterpret_cast(0x1000)->*NodePtr) - 0x1000)); } static const T& item( const intrusive_list_node* node ) { return item(const_cast(node)); } }; // intrusive_list //! 
Double linked list of items of type T that is derived from intrusive_list_node class. /** The list does not have ownership of its items. Its purpose is to avoid dynamic memory allocation when forming lists of existing objects. The class is not thread safe. **/ template class intrusive_list : public intrusive_list_base, T> { friend class intrusive_list_base, T>; static intrusive_list_node& node ( T& val ) { return val; } static T& item ( intrusive_list_node* node ) { return *static_cast(node); } static const T& item( const intrusive_list_node* node ) { return *static_cast(node); } }; // intrusive_list } // namespace r1 } // namespace detail } // namespace tbb #endif /* _TBB_intrusive_list_H */ ================================================ FILE: third-party/tbb/src/tbb/itt_notify.cpp ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #if __TBB_USE_ITT_NOTIFY #if _WIN32||_WIN64 #ifndef UNICODE #define UNICODE #endif #else #pragma weak dlopen #pragma weak dlsym #pragma weak dlerror #endif /* WIN */ #if __TBB_BUILD extern "C" void ITT_DoOneTimeInitialization(); #define __itt_init_ittlib_name(x,y) (ITT_DoOneTimeInitialization(), true) #elif __TBBMALLOC_BUILD extern "C" void MallocInitializeITT(); #define __itt_init_ittlib_name(x,y) (MallocInitializeITT(), true) #else #error This file is expected to be used for either TBB or TBB allocator build. 
#endif // __TBB_BUILD #include "tools_api/ittnotify_static.c" namespace tbb { namespace detail { namespace r1 { /** This extra proxy method is necessary since __itt_init_lib is declared as static **/ int __TBB_load_ittnotify() { #if !(_WIN32||_WIN64) // tool_api crashes without dlopen, check that it's present. Common case // for lack of dlopen is static binaries, i.e. ones build with -static. if (dlopen == nullptr) return 0; #endif return __itt_init_ittlib(nullptr, // groups for: (__itt_group_id)(__itt_group_sync // prepare/cancel/acquired/releasing | __itt_group_thread // name threads | __itt_group_stitch // stack stitching | __itt_group_structure )); } } //namespace r1 } //namespace detail } // namespace tbb #endif /* __TBB_USE_ITT_NOTIFY */ ================================================ FILE: third-party/tbb/src/tbb/itt_notify.h ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef _TBB_ITT_NOTIFY #define _TBB_ITT_NOTIFY #include "oneapi/tbb/detail/_config.h" #if __TBB_USE_ITT_NOTIFY #if _WIN32||_WIN64 #ifndef UNICODE #define UNICODE #endif #endif /* WIN */ #ifndef INTEL_ITTNOTIFY_API_PRIVATE #define INTEL_ITTNOTIFY_API_PRIVATE #endif #include "tools_api/ittnotify.h" #include "tools_api/legacy/ittnotify.h" extern "C" void __itt_fini_ittlib(void); extern "C" void __itt_release_resources(void); #if _WIN32||_WIN64 #undef _T #endif /* WIN */ #endif /* __TBB_USE_ITT_NOTIFY */ #if !ITT_CALLER_NULL #define ITT_CALLER_NULL ((__itt_caller)0) #endif namespace tbb { namespace detail { namespace r1 { //! Unicode support #if (_WIN32||_WIN64) //! Unicode character type. Always wchar_t on Windows. /** We do not use typedefs from Windows TCHAR family to keep consistence of TBB coding style. **/ using tchar = wchar_t; //! Standard Windows macro to markup the string literals. #define _T(string_literal) L ## string_literal #else /* !WIN */ using tchar = char; //! Standard Windows style macro to markup the string literals. #define _T(string_literal) string_literal #endif /* !WIN */ //! Display names of internal synchronization types extern const tchar *SyncType_Scheduler; //! Display names of internal synchronization components/scenarios extern const tchar *SyncObj_ContextsList ; #if __TBB_USE_ITT_NOTIFY // const_cast() is necessary to cast off volatility #define ITT_NOTIFY(name,obj) __itt_##name(const_cast(static_cast(obj))) #define ITT_THREAD_SET_NAME(name) __itt_thread_set_name(name) #define ITT_FINI_ITTLIB() __itt_fini_ittlib() #define ITT_RELEASE_RESOURCES() __itt_release_resources() #define ITT_SYNC_CREATE(obj, type, name) __itt_sync_create((void*)(obj), type, name, 2) #define ITT_STACK_CREATE(obj) obj = __itt_stack_caller_create() #define ITT_STACK_DESTROY(obj) (obj!=nullptr) ? 
__itt_stack_caller_destroy(static_cast<__itt_caller>(obj)) : ((void)0) #define ITT_CALLEE_ENTER(cond, t, obj) if(cond) {\ __itt_stack_callee_enter(static_cast<__itt_caller>(obj));\ __itt_sync_acquired(t);\ } #define ITT_CALLEE_LEAVE(cond, obj) (cond) ? __itt_stack_callee_leave(static_cast<__itt_caller>(obj)) : ((void)0) #define ITT_TASK_GROUP(obj,name,parent) r1::itt_make_task_group(d1::ITT_DOMAIN_MAIN,(void*)(obj),ALGORITHM,(void*)(parent),(parent!=nullptr) ? ALGORITHM : FLOW_NULL,name) #define ITT_TASK_BEGIN(obj,name,id) r1::itt_task_begin(d1::ITT_DOMAIN_MAIN,(void*)(id),ALGORITHM,(void*)(obj),ALGORITHM,name) #define ITT_TASK_END r1::itt_task_end(d1::ITT_DOMAIN_MAIN) #else /* !__TBB_USE_ITT_NOTIFY */ #define ITT_NOTIFY(name,obj) ((void)0) #define ITT_THREAD_SET_NAME(name) ((void)0) #define ITT_FINI_ITTLIB() ((void)0) #define ITT_RELEASE_RESOURCES() ((void)0) #define ITT_SYNC_CREATE(obj, type, name) ((void)0) #define ITT_STACK_CREATE(obj) ((void)0) #define ITT_STACK_DESTROY(obj) ((void)0) #define ITT_CALLEE_ENTER(cond, t, obj) ((void)0) #define ITT_CALLEE_LEAVE(cond, obj) ((void)0) #define ITT_TASK_GROUP(type,name,parent) ((void)0) #define ITT_TASK_BEGIN(type,name,id) ((void)0) #define ITT_TASK_END ((void)0) #endif /* !__TBB_USE_ITT_NOTIFY */ int __TBB_load_ittnotify(); } // namespace r1 } // namespace detail } // namespace tbb #endif /* _TBB_ITT_NOTIFY */ ================================================ FILE: third-party/tbb/src/tbb/mailbox.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ #ifndef _TBB_mailbox_H #define _TBB_mailbox_H #include "oneapi/tbb/cache_aligned_allocator.h" #include "oneapi/tbb/detail/_small_object_pool.h" #include "scheduler_common.h" #include namespace tbb { namespace detail { namespace r1 { struct task_proxy : public d1::task { static const intptr_t pool_bit = 1<<0; static const intptr_t mailbox_bit = 1<<1; static const intptr_t location_mask = pool_bit | mailbox_bit; /* All but two low-order bits represent a (task*). Two low-order bits mean: 1 = proxy is/was/will be in task pool 2 = proxy is/was/will be in mailbox */ std::atomic task_and_tag; //! Pointer to next task_proxy in a mailbox std::atomic next_in_mailbox; //! Mailbox to which this was mailed. mail_outbox* outbox; //! Task affinity id which is referenced d1::slot_id slot; d1::small_object_allocator allocator; //! True if the proxy is stored both in its sender's pool and in the destination mailbox. static bool is_shared ( intptr_t tat ) { return (tat & location_mask) == location_mask; } //! Returns a pointer to the encapsulated task or nullptr. static task* task_ptr ( intptr_t tat ) { return (task*)(tat & ~location_mask); } //! Returns a pointer to the encapsulated task or nullptr, and frees proxy if necessary. template inline task* extract_task () { // __TBB_ASSERT( prefix().extra_state == es_task_proxy, "Normal task misinterpreted as a proxy?" ); intptr_t tat = task_and_tag.load(std::memory_order_acquire); __TBB_ASSERT( tat == from_bit || (is_shared(tat) && task_ptr(tat)), "Proxy's tag cannot specify both locations if the proxy " "was retrieved from one of its original locations" ); if ( tat != from_bit ) { const intptr_t cleaner_bit = location_mask & ~from_bit; // Attempt to transition the proxy to the "empty" state with // cleaner_bit specifying entity responsible for its eventual freeing. // Explicit cast to void* is to work around a seeming ICC 11.1 bug. 
if ( task_and_tag.compare_exchange_strong(tat, cleaner_bit) ) { // Successfully grabbed the task, and left new owner with the job of freeing the proxy return task_ptr(tat); } } // Proxied task has already been claimed from another proxy location. __TBB_ASSERT( task_and_tag.load(std::memory_order_relaxed) == from_bit, "Empty proxy cannot contain non-zero task pointer" ); return nullptr; } task* execute(d1::execution_data&) override { __TBB_ASSERT_RELEASE(false, nullptr); return nullptr; } task* cancel(d1::execution_data&) override { __TBB_ASSERT_RELEASE(false, nullptr); return nullptr; } }; // struct task_proxy //! Internal representation of mail_outbox, without padding. class unpadded_mail_outbox { protected: typedef std::atomic atomic_proxy_ptr; //! Pointer to first task_proxy in mailbox, or nullptr if box is empty. atomic_proxy_ptr my_first; //! Pointer to pointer that will point to next item in the queue. Never nullptr. std::atomic my_last; //! Owner of mailbox is not executing a task, and has drained its own task pool. std::atomic my_is_idle; }; // TODO: - consider moving to arena slot //! Class representing where mail is put. /** Padded to occupy a cache line. */ class mail_outbox : padded { task_proxy* internal_pop( isolation_type isolation ) { task_proxy* curr = my_first.load(std::memory_order_acquire); if ( !curr ) return nullptr; atomic_proxy_ptr* prev_ptr = &my_first; if ( isolation != no_isolation ) { while ( task_accessor::isolation(*curr) != isolation ) { prev_ptr = &curr->next_in_mailbox; // The next_in_mailbox should be read with acquire to guarantee (*curr) consistency. curr = curr->next_in_mailbox.load(std::memory_order_acquire); if ( !curr ) return nullptr; } } // There is a first item in the mailbox. See if there is a second. // The next_in_mailbox should be read with acquire to guarantee (*second) consistency. 
if ( task_proxy* second = curr->next_in_mailbox.load(std::memory_order_acquire) ) { // There are at least two items, so first item can be popped easily. prev_ptr->store(second, std::memory_order_relaxed); } else { // There is only one item. Some care is required to pop it. prev_ptr->store(nullptr, std::memory_order_relaxed); atomic_proxy_ptr* expected = &curr->next_in_mailbox; if ( my_last.compare_exchange_strong( expected, prev_ptr ) ) { // Successfully transitioned mailbox from having one item to having none. __TBB_ASSERT( !curr->next_in_mailbox.load(std::memory_order_relaxed), nullptr); } else { // Some other thread updated my_last but has not filled in first->next_in_mailbox // Wait until first item points to second item. atomic_backoff backoff; // The next_in_mailbox should be read with acquire to guarantee (*second) consistency. while ( !(second = curr->next_in_mailbox.load(std::memory_order_acquire)) ) backoff.pause(); prev_ptr->store( second, std::memory_order_relaxed); } } assert_pointer_valid(curr); return curr; } public: friend class mail_inbox; //! Push task_proxy onto the mailbox queue of another thread. /** Implementation is wait-free. */ void push( task_proxy* t ) { assert_pointer_valid(t); t->next_in_mailbox.store(nullptr, std::memory_order_relaxed); atomic_proxy_ptr* const link = my_last.exchange(&t->next_in_mailbox); // Logically, the release fence is not required because the exchange above provides the // release-acquire semantic that guarantees that (*t) will be consistent when another thread // loads the link atomic. However, C++11 memory model guarantees consistency of(*t) only // when the same atomic is used for synchronization. link->store(t, std::memory_order_release); } //! Return true if mailbox is empty bool empty() { return my_first.load(std::memory_order_relaxed) == nullptr; } //! Construct *this as a mailbox from zeroed memory. /** Raise assertion if *this is not previously zeroed, or sizeof(this) is wrong. 
This method is provided instead of a full constructor since we know the object will be constructed in zeroed memory. */ void construct() { __TBB_ASSERT( sizeof(*this)==max_nfs_size, nullptr ); __TBB_ASSERT( !my_first.load(std::memory_order_relaxed), nullptr ); __TBB_ASSERT( !my_last.load(std::memory_order_relaxed), nullptr ); __TBB_ASSERT( !my_is_idle.load(std::memory_order_relaxed), nullptr ); my_last = &my_first; suppress_unused_warning(pad); } //! Drain the mailbox void drain() { // No fences here because other threads have already quit. for( ; task_proxy* t = my_first; ) { my_first.store(t->next_in_mailbox, std::memory_order_relaxed); t->allocator.delete_object(t); } } //! True if thread that owns this mailbox is looking for work. bool recipient_is_idle() { return my_is_idle.load(std::memory_order_relaxed); } }; // class mail_outbox //! Class representing source of mail. class mail_inbox { //! Corresponding sink where mail that we receive will be put. mail_outbox* my_putter; public: //! Construct unattached inbox mail_inbox() : my_putter(nullptr) {} //! Attach inbox to a corresponding outbox. void attach( mail_outbox& putter ) { my_putter = &putter; } //! Detach inbox from its outbox void detach() { __TBB_ASSERT(my_putter,"not attached"); my_putter = nullptr; } //! Get next piece of mail, or nullptr if mailbox is empty. task_proxy* pop( isolation_type isolation ) { return my_putter->internal_pop( isolation ); } //! Return true if mailbox is empty bool empty() { return my_putter->empty(); } //! Indicate whether thread that reads this mailbox is idle. /** Raises assertion failure if mailbox is redundantly marked as not idle. */ void set_is_idle( bool value ) { if( my_putter ) { __TBB_ASSERT( my_putter->my_is_idle.load(std::memory_order_relaxed) || value, "attempt to redundantly mark mailbox as not idle" ); my_putter->my_is_idle.store(value, std::memory_order_relaxed); } } //! Indicate whether thread that reads this mailbox is idle. 
bool is_idle_state ( bool value ) const { return !my_putter || my_putter->my_is_idle.load(std::memory_order_relaxed) == value; } }; // class mail_inbox } // namespace r1 } // namespace detail } // namespace tbb #endif /* _TBB_mailbox_H */ ================================================ FILE: third-party/tbb/src/tbb/main.cpp ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "oneapi/tbb/detail/_config.h" #include "main.h" #include "governor.h" #include "threading_control.h" #include "environment.h" #include "market.h" #include "tcm_adaptor.h" #include "misc.h" #include "itt_notify.h" namespace tbb { namespace detail { namespace r1 { //------------------------------------------------------------------------ // Begin shared data layout. // The following global data items are mostly read-only after initialization. 
//------------------------------------------------------------------------ //------------------------------------------------------------------------ // governor data basic_tls governor::theTLS; rml::tbb_factory governor::theRMLServerFactory; bool governor::UsePrivateRML; bool governor::is_rethrow_broken; //------------------------------------------------------------------------ // threading_control data threading_control* threading_control::g_threading_control; threading_control::global_mutex_type threading_control::g_threading_control_mutex; //------------------------------------------------------------------------ // context propagation data context_state_propagation_mutex_type the_context_state_propagation_mutex; std::atomic the_context_state_propagation_epoch{}; //------------------------------------------------------------------------ // One time initialization data //! Counter of references to global shared resources such as TLS. std::atomic __TBB_InitOnce::count{}; std::atomic_flag __TBB_InitOnce::InitializationLock = ATOMIC_FLAG_INIT; //! Flag that is set to true after one-time initializations are done. std::atomic __TBB_InitOnce::InitializationDone{}; #if __TBB_USE_ITT_NOTIFY //! 
Defined in profiling.cpp extern bool ITT_Present; void ITT_DoUnsafeOneTimeInitialization(); #endif #if !(_WIN32||_WIN64) || __TBB_SOURCE_DIRECTLY_INCLUDED static __TBB_InitOnce __TBB_InitOnceHiddenInstance; #endif //------------------------------------------------------------------------ // __TBB_InitOnce //------------------------------------------------------------------------ void __TBB_InitOnce::add_ref() { if (++count == 1) { governor::acquire_resources(); tcm_adaptor::initialize(); } } void __TBB_InitOnce::remove_ref() { int k = --count; __TBB_ASSERT(k>=0,"removed __TBB_InitOnce ref that was not added?"); if( k==0 ) { governor::release_resources(); ITT_FINI_ITTLIB(); ITT_RELEASE_RESOURCES(); } } //------------------------------------------------------------------------ // One-time Initializations //------------------------------------------------------------------------ //! Defined in cache_aligned_allocator.cpp void initialize_cache_aligned_allocator(); //! Performs thread-safe lazy one-time general TBB initialization. void DoOneTimeInitialization() { __TBB_InitOnce::lock(); // No fence required for load of InitializationDone, because we are inside a critical section. if( !__TBB_InitOnce::InitializationDone ) { __TBB_InitOnce::add_ref(); if( GetBoolEnvironmentVariable("TBB_VERSION") ) { PrintVersion(); tcm_adaptor::print_version(); } bool itt_present = false; #if __TBB_USE_ITT_NOTIFY ITT_DoUnsafeOneTimeInitialization(); itt_present = ITT_Present; #endif /* __TBB_USE_ITT_NOTIFY */ initialize_cache_aligned_allocator(); governor::initialize_rml_factory(); // Force processor groups support detection governor::default_num_threads(); // Force OS regular page size detection governor::default_page_size(); PrintExtraVersionInfo( "TOOLS SUPPORT", itt_present ? "enabled" : "disabled" ); __TBB_InitOnce::InitializationDone = true; } __TBB_InitOnce::unlock(); } #if (_WIN32||_WIN64) && !__TBB_SOURCE_DIRECTLY_INCLUDED //! 
Windows "DllMain" that handles startup and shutdown of dynamic library. extern "C" bool WINAPI DllMain( HANDLE /*hinstDLL*/, DWORD reason, LPVOID lpvReserved ) { switch( reason ) { case DLL_PROCESS_ATTACH: __TBB_InitOnce::add_ref(); break; case DLL_PROCESS_DETACH: // Since THREAD_DETACH is not called for the main thread, call auto-termination // here as well - but not during process shutdown (due to risk of a deadlock). if ( lpvReserved == nullptr ) { // library unload governor::terminate_external_thread(); } __TBB_InitOnce::remove_ref(); // It is assumed that InitializationDone is not set after DLL_PROCESS_DETACH, // and thus no race on InitializationDone is possible. if ( __TBB_InitOnce::initialization_done() ) { // Remove reference that we added in DoOneTimeInitialization. __TBB_InitOnce::remove_ref(); } break; case DLL_THREAD_DETACH: governor::terminate_external_thread(); break; } return true; } #endif /* (_WIN32||_WIN64) && !__TBB_SOURCE_DIRECTLY_INCLUDED */ } // namespace r1 } // namespace detail } // namespace tbb ================================================ FILE: third-party/tbb/src/tbb/main.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef _TBB_main_H #define _TBB_main_H #include "governor.h" #include namespace tbb { namespace detail { namespace r1 { void DoOneTimeInitialization(); //------------------------------------------------------------------------ // __TBB_InitOnce //------------------------------------------------------------------------ // TODO (TBB_REVAMP_TODO): consider better names //! Class that supports TBB initialization. /** It handles acquisition and release of global resources (e.g. TLS) during startup and shutdown, as well as synchronization for DoOneTimeInitialization. */ class __TBB_InitOnce { friend void DoOneTimeInitialization(); friend void ITT_DoUnsafeOneTimeInitialization(); static std::atomic count; //! Platform specific code to acquire resources. static void acquire_resources(); //! Platform specific code to release resources. static void release_resources(); //! Specifies if the one-time initializations has been done. static std::atomic InitializationDone; //! Global initialization lock /** Scenarios are possible when tools interop has to be initialized before the TBB itself. This imposes a requirement that the global initialization lock has to support valid static initialization, and does not issue any tool notifications in any build mode. **/ static std::atomic_flag InitializationLock; public: static void lock() { tbb::detail::atomic_backoff backoff; while( InitializationLock.test_and_set() ) backoff.pause(); } static void unlock() { InitializationLock.clear(std::memory_order_release); } static bool initialization_done() { return InitializationDone.load(std::memory_order_acquire); } //! Add initial reference to resources. /** We assume that dynamic loading of the library prevents any other threads from entering the library until this constructor has finished running. **/ __TBB_InitOnce() { add_ref(); } //! Remove the initial reference to resources. /** This is not necessarily the last reference if other threads are still running. 
**/ ~__TBB_InitOnce() { governor::terminate_external_thread(); // TLS dtor not called for the main thread remove_ref(); // We assume that InitializationDone is not set after file-scope destructors // start running, and thus no race on InitializationDone is possible. if ( initialization_done() ) { // Remove an extra reference that was added in DoOneTimeInitialization. remove_ref(); } } //! Add reference to resources. If first reference added, acquire the resources. static void add_ref(); //! Remove reference to resources. If last reference removed, release the resources. static void remove_ref(); }; // class __TBB_InitOnce } // namespace r1 } // namespace detail } // namespace tbb #endif /* _TBB_main_H */ ================================================ FILE: third-party/tbb/src/tbb/market.cpp ================================================ /* Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "arena.h" #include "market.h" #include // std::find namespace tbb { namespace detail { namespace r1 { class tbb_permit_manager_client : public pm_client { public: tbb_permit_manager_client(arena& a) : pm_client(a) {} void register_thread() override {} void unregister_thread() override {} void set_allotment(unsigned allotment) { my_arena.set_allotment(allotment); } }; //------------------------------------------------------------------------ // market //------------------------------------------------------------------------ market::market(unsigned workers_soft_limit) : my_num_workers_soft_limit(workers_soft_limit) {} pm_client* market::create_client(arena& a) { return new (cache_aligned_allocate(sizeof(tbb_permit_manager_client))) tbb_permit_manager_client(a); } void market::register_client(pm_client* c, d1::constraints&) { mutex_type::scoped_lock lock(my_mutex); my_clients[c->priority_level()].push_back(c); } void market::unregister_and_destroy_client(pm_client& c) { { mutex_type::scoped_lock lock(my_mutex); auto& clients = my_clients[c.priority_level()]; auto it = std::find(clients.begin(), clients.end(), &c); __TBB_ASSERT(it != clients.end(), "Destroying of an unregistered client"); clients.erase(it); } auto client = static_cast(&c); client->~tbb_permit_manager_client(); cache_aligned_deallocate(client); } void market::update_allotment() { int effective_soft_limit = my_mandatory_num_requested > 0 && my_num_workers_soft_limit == 0 ? 
1 : my_num_workers_soft_limit; int max_workers = min(my_total_demand, effective_soft_limit); __TBB_ASSERT(max_workers >= 0, nullptr); int unassigned_workers = max_workers; int assigned = 0; int carry = 0; unsigned max_priority_level = num_priority_levels; for (unsigned list_idx = 0; list_idx < num_priority_levels; ++list_idx ) { int assigned_per_priority = min(my_priority_level_demand[list_idx], unassigned_workers); unassigned_workers -= assigned_per_priority; // We use reverse iterator there to serve last added clients first for (auto it = my_clients[list_idx].rbegin(); it != my_clients[list_idx].rend(); ++it) { tbb_permit_manager_client& client = static_cast(**it); if (client.max_workers() == 0) { client.set_allotment(0); continue; } if (max_priority_level == num_priority_levels) { max_priority_level = list_idx; } int allotted = 0; if (my_num_workers_soft_limit == 0) { __TBB_ASSERT(max_workers == 0 || max_workers == 1, nullptr); allotted = client.min_workers() > 0 && assigned < max_workers ? 
1 : 0; } else { int tmp = client.max_workers() * assigned_per_priority + carry; allotted = tmp / my_priority_level_demand[list_idx]; carry = tmp % my_priority_level_demand[list_idx]; __TBB_ASSERT(allotted <= client.max_workers(), nullptr); } client.set_allotment(allotted); client.set_top_priority(list_idx == max_priority_level); assigned += allotted; } } __TBB_ASSERT(assigned == max_workers, nullptr); } void market::set_active_num_workers(int soft_limit) { mutex_type::scoped_lock lock(my_mutex); if (my_num_workers_soft_limit != soft_limit) { my_num_workers_soft_limit = soft_limit; update_allotment(); } } void market::adjust_demand(pm_client& c, int mandatory_delta, int workers_delta) { __TBB_ASSERT(-1 <= mandatory_delta && mandatory_delta <= 1, nullptr); int delta{}; { mutex_type::scoped_lock lock(my_mutex); // Update client's state delta = c.update_request(mandatory_delta, workers_delta); // Update market's state my_total_demand += delta; my_priority_level_demand[c.priority_level()] += delta; my_mandatory_num_requested += mandatory_delta; update_allotment(); } notify_thread_request(delta); } } // namespace r1 } // namespace detail } // namespace tbb ================================================ FILE: third-party/tbb/src/tbb/market.h ================================================ /* Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/

#ifndef _TBB_market_H
#define _TBB_market_H

#include "oneapi/tbb/rw_mutex.h"
#include "oneapi/tbb/tbb_allocator.h"
#include "oneapi/tbb/task_arena.h"

#include "permit_manager.h"
#include "pm_client.h"

// NOTE(review): the header names of the two bare #include lines below appear to have been
// stripped (text between angle brackets is missing throughout this excerpt). std::vector is
// used in this header, so one of them is presumably <vector>; confirm both against upstream.
#include
#include

namespace tbb {
namespace detail {
namespace r1 {

//! permit_manager implementation that distributes workers among registered clients by priority level.
class market : public permit_manager {
public:
    //! soft_limit initializes my_num_workers_soft_limit.
    market(unsigned soft_limit);

    pm_client* create_client(arena& a) override;
    void register_client(pm_client* client, d1::constraints&) override;
    void unregister_and_destroy_client(pm_client& c) override;

    //! Request that arena's need in workers should be adjusted.
    void adjust_demand(pm_client&, int mandatory_delta, int workers_delta) override;

    //! Set number of active workers
    void set_active_num_workers(int soft_limit) override;
private:
    //! Recalculates the number of workers assigned to each arena in the list.
    void update_allotment();

    //! Keys for the arena map array. The lower the value the higher priority of the arena list.
    static constexpr unsigned num_priority_levels = d1::num_priority_levels;

    using mutex_type = d1::rw_mutex;
    //! Guards the demand counters and the client lists below.
    mutex_type my_mutex;

    //! Current application-imposed limit on the number of workers
    int my_num_workers_soft_limit;

    //! Number of workers that were requested by all arenas on all priority levels
    int my_total_demand{0};

    //! Number of workers that were requested by arenas per single priority list item
    int my_priority_level_demand[num_priority_levels] = {0};

    //! How many times mandatory concurrency was requested from the market
    int my_mandatory_num_requested{0};

    //! Per priority list of registered arenas
    // NOTE(review): the template arguments of std::vector were stripped in this excerpt. The
    // elements are pm_client* (see register_client), and the trailing `>` suggests a nested
    // allocator argument - restore from upstream.
    using clients_container_type = std::vector>;
    clients_container_type my_clients[num_priority_levels];
}; // class market

} // namespace r1
} // namespace detail
} // namespace tbb

#endif /* _TBB_market_H */

================================================
FILE: third-party/tbb/src/tbb/market_concurrent_monitor.h
================================================
/*
    Copyright (c) 2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef __TBB_market_concurrent_monitor_H
#define __TBB_market_concurrent_monitor_H

#include "concurrent_monitor.h"
#include "scheduler_common.h"

// NOTE(review): header name stripped; std::atomic is used below, so presumably <atomic> - confirm.
#include

namespace tbb {
namespace detail {
namespace r1 {

//! Context value carried by the wait nodes below: an address-sized id plus an arena pointer.
struct market_context {
    market_context() = default;

    market_context(std::uintptr_t first_addr, arena* a) :
        my_uniq_addr(first_addr), my_arena_addr(a)
    {}

    std::uintptr_t my_uniq_addr{0};
    arena* my_arena_addr{nullptr};
};

#if __TBB_RESUMABLE_TASKS
// NOTE(review): the template arguments of `wait_node` (here), `concurrent_monitor_base` and
// `sleep_node` (further down) appear to have been stripped; given that `base_type(ctx)` is
// passed a market_context they are presumably instantiated on market_context - confirm.
//! Wait node that "waits" by resuming another task dispatcher instead of blocking the thread.
class resume_node : public wait_node {
    using base_type = wait_node;
public:
    //! Captures the current dispatcher (from ed_ext), the dispatcher to switch to, and the
    //! current dispatcher's suspend point.
    resume_node(market_context ctx, execution_data_ext& ed_ext, task_dispatcher& target)
        : base_type(ctx), my_curr_dispatcher(ed_ext.task_disp), my_target_dispatcher(&target)
        , my_suspend_point(my_curr_dispatcher->get_suspend_point())
    {}

    ~resume_node() override {
        // If the wakeup was skipped, wait until exactly one notify() has been counted
        // before the members are poisoned.
        if (this->my_skipped_wakeup) {
            spin_wait_until_eq(this->my_notify_calls, 1);
        }

        poison_pointer(my_curr_dispatcher);
        poison_pointer(my_target_dispatcher);
        poison_pointer(my_suspend_point);
    }

    void init() override {
        base_type::init();
    }

    //! Switch execution to the target dispatcher.
    void wait() override {
        my_curr_dispatcher->resume(*my_target_dispatcher);
        __TBB_ASSERT(!this->my_is_in_list.load(std::memory_order_relaxed), "Still in the queue?");
    }

    //! Reset the node for reuse: waits for one notify() to be counted, then zeroes the counter.
    void reset() override {
        base_type::reset();
        spin_wait_until_eq(this->my_notify_calls, 1);
        my_notify_calls.store(0, std::memory_order_relaxed);
    }

    // notify is called (perhaps, concurrently) twice from:
    //   - concurrent_monitor::notify
    //   - post_resume_action::register_waiter
    // The second notify is called after thread switches the stack
    // (Because we can not call resume while the stack is occupied)
    // We need calling resume only when both notifications are performed.
    void notify() override {
        if (++my_notify_calls == 2) {
            r1::resume(my_suspend_point);
        }
    }

private:
    friend class thread_data;
    friend struct suspend_point_type::resume_task;
    task_dispatcher* my_curr_dispatcher;
    task_dispatcher* my_target_dispatcher;
    suspend_point_type* my_suspend_point;
    // NOTE(review): template argument stripped; the counter is incremented and compared with
    // small integers, so presumably std::atomic<int> - confirm.
    std::atomic my_notify_calls{0};
};
#endif // __TBB_RESUMABLE_TASKS

//! Concurrent monitor used by the market; inherits the base constructors and calls destroy() on destruction.
class market_concurrent_monitor : public concurrent_monitor_base {
    using base_type = concurrent_monitor_base;
public:
    using base_type::base_type;

    ~market_concurrent_monitor() {
        destroy();
    }

    /** per-thread descriptor for concurrent_monitor */
    using thread_context = sleep_node;
#if __TBB_RESUMABLE_TASKS
    using resume_context = resume_node;
#endif
};

} // namespace r1
} // namespace detail
} // namespace tbb

#endif // __TBB_market_concurrent_monitor_H

================================================
FILE: third-party/tbb/src/tbb/misc.cpp
================================================
/*
    Copyright (c) 2005-2024 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

// Source file for miscellaneous entities that are infrequently referenced by
// an executing program.

#include "oneapi/tbb/detail/_exception.h"
#include "oneapi/tbb/detail/_machine.h"

#include "oneapi/tbb/version.h"

#include "misc.h"
#include "governor.h"
#include "assert_impl.h" // Out-of-line TBB assertion handling routines are instantiated here.
#include "concurrent_monitor_mutex.h"

// NOTE(review): every bare #include in this excerpt has lost its header name (text between
// angle brackets is missing). The code below uses std::fputs/std::fprintf/std::vsnprintf,
// std::memset, va_list and std::mutex, so the matching standard headers must be among
// them - restore the exact list from upstream.
#include
#include
#include
#include
#include

#if _WIN32||_WIN64
#include
#endif

#if !_WIN32
#include // sysconf(_SC_PAGESIZE)
#endif

namespace tbb {
namespace detail {
namespace r1 {

//------------------------------------------------------------------------
// governor data
//------------------------------------------------------------------------
cpu_features_type governor::cpu_features;

//------------------------------------------------------------------------
// concurrent_monitor_mutex data
//------------------------------------------------------------------------
#if !__TBB_USE_FUTEX
std::mutex concurrent_monitor_mutex::my_init_mutex;
#endif

//! Returns the OS page size: dwPageSize from GetSystemInfo on Windows, sysconf(_SC_PAGESIZE) otherwise.
// NOTE(review): the sysconf result is converted to size_t without checking for an error return.
size_t DefaultSystemPageSize() {
#if _WIN32
    SYSTEM_INFO si;
    GetSystemInfo(&si);
    return si.dwPageSize;
#else
    return sysconf(_SC_PAGESIZE);
#endif
}

/** The leading "\0" is here so that applying "strings" to the binary delivers a clean result. */
static const char VersionString[] = "\0" TBB_VERSION_STRINGS;

//! Set by PrintVersion(); gates the output of PrintExtraVersionInfo().
static bool PrintVersionFlag = false;

//! Writes the version string (skipping its leading NUL) to stderr and enables extra version output.
void PrintVersion() {
    PrintVersionFlag = true;
    std::fputs(VersionString+1,stderr);
}

//! Formats a printf-style message and prints it to stderr as "oneTBB: <category>\t<message>".
/** Prints nothing unless PrintVersion() has been called before. The message is formatted into
    a zero-filled 1024-byte buffer with room for at most 1023 characters, so longer output is
    truncated and the result is always NUL-terminated. */
void PrintExtraVersionInfo( const char* category, const char* format, ... ) {
    if( PrintVersionFlag ) {
        char str[1024];
        std::memset(str, 0, 1024);
        va_list args;
        va_start(args, format);
        // Note: correct vsnprintf definition obtained from tbb_assert_impl.h
        std::vsnprintf( str, 1024-1, format, args);
        va_end(args);
        std::fprintf(stderr, "oneTBB: %s\t%s\n", category, str );
    }
}

//! check for transaction support.
#if _MSC_VER
#include // for __cpuid
#elif __APPLE__
#include
#endif

#if __TBB_x86_32 || __TBB_x86_64
//! Executes CPUID for the given leaf/sub-leaf and stores EAX, EBX, ECX, EDX in registers[0..3].
void check_cpuid(int leaf, int sub_leaf, int registers[4]) {
#if _MSC_VER
    __cpuidex(registers, leaf, sub_leaf);
#else
    int reg_eax = 0;
    int reg_ebx = 0;
    int reg_ecx = 0;
    int reg_edx = 0;
#if __TBB_x86_32 && __PIC__
    // On 32-bit systems with position-independent code GCC fails to work around the stuff in EBX
    // register. We help it using backup and restore.
    __asm__("mov %%ebx, %%esi\n\t"
            "cpuid\n\t"
            "xchg %%ebx, %%esi"
            : "=a"(reg_eax), "=S"(reg_ebx), "=c"(reg_ecx), "=d"(reg_edx)
            : "0"(leaf), "2"(sub_leaf) // read value from eax and ecx
    );
#else
    __asm__("cpuid"
            : "=a"(reg_eax), "=b"(reg_ebx), "=c"(reg_ecx), "=d"(reg_edx)
            : "0"(leaf), "2"(sub_leaf) // read value from eax and ecx
    );
#endif
    registers[0] = reg_eax;
    registers[1] = reg_ebx;
    registers[2] = reg_ecx;
    registers[3] = reg_edx;
#endif
}
#endif

//! Fills cpu_features from CPUID leaf 7 / sub-leaf 0 on x86, or from the hw.nperflevels sysctl on Apple.
/** On x86: rtm_enabled is EBX bit 11, waitpkg_enabled is ECX bit 5, hybrid is EDX bit 15.
    On other targets cpu_features is left untouched. */
void detect_cpu_features(cpu_features_type& cpu_features) {
    suppress_unused_warning(cpu_features);
#if __TBB_x86_32 || __TBB_x86_64
    const int rtm_ebx_mask = 1 << 11;
    const int waitpkg_ecx_mask = 1 << 5;
    const int hybrid_edx_mask = 1 << 15;
    int registers[4] = {0};

    // Check RTM, WAITPKG, HYBRID
    check_cpuid(7, 0, registers);
    cpu_features.rtm_enabled = (registers[1] & rtm_ebx_mask) != 0;
    cpu_features.waitpkg_enabled = (registers[2] & waitpkg_ecx_mask) != 0;
    cpu_features.hybrid = (registers[3] & hybrid_edx_mask) != 0;
#elif __APPLE__
    // Check HYBRID (hw.nperflevels > 1)
    uint64_t nperflevels = 0;
    size_t nperflevels_size = sizeof(nperflevels);
    // hybrid is only written when the sysctl call reports success (returns 0).
    if (!sysctlbyname("hw.nperflevels", &nperflevels, &nperflevels_size, nullptr, 0)) {
        cpu_features.hybrid = (nperflevels > 1);
    }
#endif
}

} // namespace r1
} // namespace detail
} // namespace tbb

================================================
FILE: third-party/tbb/src/tbb/misc.h
================================================
/*
    Copyright (c) 2005-2024 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef _TBB_tbb_misc_H
#define _TBB_tbb_misc_H

#include "oneapi/tbb/detail/_config.h"
#include "oneapi/tbb/detail/_assert.h"
#include "oneapi/tbb/detail/_utils.h"

#if __TBB_ARENA_BINDING
#include "oneapi/tbb/info.h"
#endif /*__TBB_ARENA_BINDING*/

// NOTE(review): the bare #include lines below have lost their header names (see the trailing
// comment for the first one); restore from upstream.
#if __unix__
#include // __FreeBSD_version
#if __FreeBSD_version >= 701000
#include
#endif
#endif

#include

// Does the operating system have a system call to pin a thread to a set of OS processors?
#define __TBB_OS_AFFINITY_SYSCALL_PRESENT ((__linux__ && !__ANDROID__) || (__FreeBSD_version >= 701000))
// On IBM* Blue Gene* CNK nodes, the affinity API has restrictions that prevent its usability for TBB,
// and also sysconf(_SC_NPROCESSORS_ONLN) already takes process affinity into account.
#define __TBB_USE_OS_AFFINITY_SYSCALL (__TBB_OS_AFFINITY_SYSCALL_PRESENT && !__bg__)

namespace tbb {
namespace detail {
namespace r1 {

void runtime_warning(const char* format, ... );

#if __TBB_ARENA_BINDING
class task_arena;
class task_scheduler_observer;
#endif /*__TBB_ARENA_BINDING*/

const std::size_t MByte = 1024*1024;

#if __TBB_USE_WINAPI
// The Microsoft Documentation about Thread Stack Size states that
// "The default stack reservation size used by the linker is 1 MB"
const std::size_t ThreadStackSize = 1*MByte;
#else
// 2 MB when uintptr_t is at most 4 bytes wide, 4 MB otherwise.
const std::size_t ThreadStackSize = (sizeof(uintptr_t) <= 4 ? 2 : 4 )*MByte;
#endif

#ifndef __TBB_HardwareConcurrency

//! Returns maximal parallelism level supported by the current OS configuration.
int AvailableHwConcurrency();

#else

inline int AvailableHwConcurrency() {
    int n = __TBB_HardwareConcurrency();
    return n > 0 ? n : 1; // Fail safety strap
}
#endif /* __TBB_HardwareConcurrency */

//! Returns OS regular memory page size
size_t DefaultSystemPageSize();

//! Returns number of processor groups in the current OS configuration.
/** AvailableHwConcurrency must be called at least once before calling this method. **/
int NumberOfProcessorGroups();

#if _WIN32||_WIN64

//! Retrieves index of processor group containing processor with the given index
int FindProcessorGroupIndex ( int processorIndex );

//! Affinitizes the thread to the specified processor group
void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex );

#endif /* _WIN32||_WIN64 */

//! Prints TBB version information on stderr
void PrintVersion();

//! Prints arbitrary extra TBB version information on stderr
void PrintExtraVersionInfo( const char* category, const char* format, ... );

//! A callback routine to print RML version information on stderr
void PrintRMLVersionInfo( void* arg, const char* server_info );

// For TBB compilation only; not to be used in public headers
#if defined(min) || defined(max)
#undef min
#undef max
#endif

//! Utility template function returning lesser of the two values.
/** Provided here to avoid including not strict safe .\n In case operands cause signed/unsigned or size mismatch warnings it is caller's responsibility to do the appropriate cast before calling the function. **/ template T min ( const T& val1, const T& val2 ) { return val1 < val2 ? val1 : val2; } //! Utility template function returning greater of the two values. /** Provided here to avoid including not strict safe .\n In case operands cause signed/unsigned or size mismatch warnings it is caller's responsibility to do the appropriate cast before calling the function. **/ template T max ( const T& val1, const T& val2 ) { return val1 < val2 ? val2 : val1; } //! Utility helper structure to ease overload resolution template struct int_to_type {}; //------------------------------------------------------------------------ // FastRandom //------------------------------------------------------------------------ //! A fast random number generator. /** Uses linear congruential method. */ class FastRandom { private: unsigned x, c; static const unsigned a = 0x9e3779b1; // a big prime number public: //! Get a random number. unsigned short get() { return get(x); } //! Get a random number for the given seed; update the seed for next use. unsigned short get( unsigned& seed ) { unsigned short r = (unsigned short)(seed>>16); __TBB_ASSERT(c&1, "c must be odd for big rng period"); seed = seed*a+c; return r; } //! Construct a random number generator. 
FastRandom( void* unique_ptr ) { init(uintptr_t(unique_ptr)); } template void init( T seed ) { init(seed,int_to_type()); } void init( uint64_t seed , int_to_type<8> ) { init(uint32_t((seed>>32)+seed), int_to_type<4>()); } void init( uint32_t seed, int_to_type<4> ) { // threads use different seeds for unique sequences c = (seed|1)*0xba5703f5; // c must be odd, shuffle by a prime number x = c^(seed>>1); // also shuffle x for the first get() invocation } }; //------------------------------------------------------------------------ // Atomic extensions //------------------------------------------------------------------------ //! Atomically replaces value of dst with newValue if they satisfy condition of compare predicate /** Return value semantics is the same as for CAS. **/ template T1 atomic_update(std::atomic& dst, T1 newValue, Pred compare) { T1 oldValue = dst.load(std::memory_order_acquire); while ( compare(oldValue, newValue) ) { if ( dst.compare_exchange_strong(oldValue, newValue) ) break; } return oldValue; } #if __TBB_USE_OS_AFFINITY_SYSCALL #if __linux__ typedef cpu_set_t basic_mask_t; #elif __FreeBSD_version >= 701000 typedef cpuset_t basic_mask_t; #else #error affinity_helper is not implemented in this OS #endif class affinity_helper : no_copy { basic_mask_t* threadMask; int is_changed; public: affinity_helper() : threadMask(nullptr), is_changed(0) {} ~affinity_helper(); void protect_affinity_mask( bool restore_process_mask ); void dismiss(); }; void destroy_process_mask(); #else class affinity_helper : no_copy { public: void protect_affinity_mask( bool ) {} }; inline void destroy_process_mask(){} #endif /* __TBB_USE_OS_AFFINITY_SYSCALL */ struct cpu_features_type { bool rtm_enabled{false}; bool waitpkg_enabled{false}; bool hybrid{false}; }; void detect_cpu_features(cpu_features_type& cpu_features); #if __TBB_ARENA_BINDING class binding_handler; binding_handler* construct_binding_handler(int slot_num, int numa_id, int core_type_id, int 
max_threads_per_core); void destroy_binding_handler(binding_handler* handler_ptr); void apply_affinity_mask(binding_handler* handler_ptr, int slot_num); void restore_affinity_mask(binding_handler* handler_ptr, int slot_num); #endif /*__TBB_ARENA_BINDING*/ // RTM specific section // abort code for mutexes that detect a conflict with another thread. enum { speculation_not_supported = 0x00, speculation_transaction_aborted = 0x01, speculation_can_retry = 0x02, speculation_memadd_conflict = 0x04, speculation_buffer_overflow = 0x08, speculation_breakpoint_hit = 0x10, speculation_nested_abort = 0x20, speculation_xabort_mask = 0xFF000000, speculation_xabort_shift = 24, speculation_xabort_not_free = 0xFF, // The value (0xFF) below comes from the Intel(R) 64 and IA-32 Architectures Optimization Reference Manual 12.4.5 lock not free speculation_successful_begin = 0xFFFFFFFF, speculation_retry = speculation_transaction_aborted | speculation_can_retry | speculation_memadd_conflict }; // We suppose that successful transactions are sequentially ordered and // do not require additional memory fences around them. // Technically it can be achieved only if xbegin has implicit // acquire memory semantics an xend/xabort has release memory semantics on compiler and hardware level. 
// See the article: https://arxiv.org/pdf/1710.04839.pdf static inline unsigned int begin_transaction() { #if __TBB_TSX_INTRINSICS_PRESENT return _xbegin(); #else return speculation_not_supported; // return unsuccessful code #endif } static inline void end_transaction() { #if __TBB_TSX_INTRINSICS_PRESENT _xend(); #endif } static inline void abort_transaction() { #if __TBB_TSX_INTRINSICS_PRESENT _xabort(speculation_xabort_not_free); #endif } #if TBB_USE_ASSERT static inline unsigned char is_in_transaction() { #if __TBB_TSX_INTRINSICS_PRESENT return _xtest(); #else return 0; #endif } #endif // TBB_USE_ASSERT } // namespace r1 } // namespace detail } // namespace tbb #endif /* _TBB_tbb_misc_H */ ================================================ FILE: third-party/tbb/src/tbb/misc_ex.cpp ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ // Source file for miscellaneous entities that are infrequently referenced by // an executing program, and implementation of which requires dynamic linking. 
#include "misc.h" #if !defined(__TBB_HardwareConcurrency) #include "dynamic_link.h" #include #include #if _WIN32||_WIN64 #include #if __TBB_WIN8UI_SUPPORT #include #endif #else #include #if __unix__ #if __linux__ #include #endif #include #include #include #elif __sun #include #elif __FreeBSD__ #include #include #include // Required by #include #endif #endif namespace tbb { namespace detail { namespace r1 { #if __TBB_USE_OS_AFFINITY_SYSCALL #if __unix__ // Handlers for interoperation with libiomp static int (*libiomp_try_restoring_original_mask)(); // Table for mapping to libiomp entry points static const dynamic_link_descriptor iompLinkTable[] = { DLD_NOWEAK( kmp_set_thread_affinity_mask_initial, libiomp_try_restoring_original_mask ) }; #endif static void set_thread_affinity_mask( std::size_t maskSize, const basic_mask_t* threadMask ) { #if __FreeBSD__ || __NetBSD__ || __OpenBSD__ if( cpuset_setaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) ) #else /* __unix__ */ if( sched_setaffinity( 0, maskSize, threadMask ) ) #endif // Here and below the error severity is lowered from critical level // because it may happen during TBB library unload because of not // waiting for workers to complete (current RML policy, to be fixed). 
// handle_perror( errno, "setaffinity syscall" ); runtime_warning( "setaffinity syscall failed" ); } static void get_thread_affinity_mask( std::size_t maskSize, basic_mask_t* threadMask ) { #if __FreeBSD__ || __NetBSD__ || __OpenBSD__ if( cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) ) #else /* __unix__ */ if( sched_getaffinity( 0, maskSize, threadMask ) ) #endif runtime_warning( "getaffinity syscall failed" ); } static basic_mask_t* process_mask; static int num_masks; void destroy_process_mask() { delete [] process_mask; process_mask = nullptr; } #define curMaskSize sizeof(basic_mask_t) * num_masks affinity_helper::~affinity_helper() { if( threadMask ) { if( is_changed ) { set_thread_affinity_mask( curMaskSize, threadMask ); } delete [] threadMask; } } void affinity_helper::protect_affinity_mask( bool restore_process_mask ) { if( threadMask == nullptr && num_masks ) { // TODO: assert num_masks validity? threadMask = new basic_mask_t [num_masks]; std::memset( threadMask, 0, curMaskSize ); get_thread_affinity_mask( curMaskSize, threadMask ); if( restore_process_mask ) { __TBB_ASSERT( process_mask, "A process mask is requested but not yet stored" ); is_changed = memcmp( process_mask, threadMask, curMaskSize ); if( is_changed ) set_thread_affinity_mask( curMaskSize, process_mask ); } else { // Assume that the mask will be changed by the caller. 
is_changed = 1; } } } void affinity_helper::dismiss() { delete [] threadMask; threadMask = nullptr; is_changed = 0; } #undef curMaskSize static std::atomic hardware_concurrency_info; static int theNumProcs; static void initialize_hardware_concurrency_info () { int err; int availableProcs = 0; int numMasks = 1; int maxProcs = sysconf(_SC_NPROCESSORS_ONLN); basic_mask_t* processMask; const std::size_t BasicMaskSize = sizeof(basic_mask_t); for (;;) { const int curMaskSize = BasicMaskSize * numMasks; processMask = new basic_mask_t[numMasks]; std::memset( processMask, 0, curMaskSize ); #if __FreeBSD__ || __NetBSD__ || __OpenBSD__ // CPU_LEVEL_WHICH - anonymous (current) mask, CPU_LEVEL_CPUSET - assigned mask err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, curMaskSize, processMask ); if ( !err || errno != ERANGE || curMaskSize * CHAR_BIT >= 16 * 1024 ) break; #else /* __unix__ */ int pid = getpid(); err = sched_getaffinity( pid, curMaskSize, processMask ); if ( !err || errno != EINVAL || curMaskSize * CHAR_BIT >= 256 * 1024 ) break; #endif delete[] processMask; numMasks <<= 1; } if ( !err ) { // We have found the mask size and captured the process affinity mask into processMask. num_masks = numMasks; // do here because it's needed for affinity_helper to work #if __unix__ // For better coexistence with libiomp which might have changed the mask already, // check for its presence and ask it to restore the mask. dynamic_link_handle libhandle; if ( dynamic_link( "libiomp5.so", iompLinkTable, 1, &libhandle, DYNAMIC_LINK_GLOBAL ) ) { // We have found the symbol provided by libiomp5 for restoring original thread affinity. affinity_helper affhelp; affhelp.protect_affinity_mask( /*restore_process_mask=*/false ); if ( libiomp_try_restoring_original_mask()==0 ) { // Now we have the right mask to capture, restored by libiomp. 
const int curMaskSize = BasicMaskSize * numMasks; std::memset( processMask, 0, curMaskSize ); get_thread_affinity_mask( curMaskSize, processMask ); } else affhelp.dismiss(); // thread mask has not changed dynamic_unlink( libhandle ); // Destructor of affinity_helper restores the thread mask (unless dismissed). } #endif for ( int m = 0; availableProcs < maxProcs && m < numMasks; ++m ) { for ( std::size_t i = 0; (availableProcs < maxProcs) && (i < BasicMaskSize * CHAR_BIT); ++i ) { if ( CPU_ISSET( i, processMask + m ) ) ++availableProcs; } } process_mask = processMask; } else { // Failed to get the process affinity mask; assume the whole machine can be used. availableProcs = (maxProcs == INT_MAX) ? sysconf(_SC_NPROCESSORS_ONLN) : maxProcs; delete[] processMask; } theNumProcs = availableProcs > 0 ? availableProcs : 1; // Fail safety strap __TBB_ASSERT( theNumProcs <= sysconf(_SC_NPROCESSORS_ONLN), nullptr); } int AvailableHwConcurrency() { atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info ); return theNumProcs; } /* End of __TBB_USE_OS_AFFINITY_SYSCALL implementation */ #elif __ANDROID__ // Work-around for Android that reads the correct number of available CPUs since system calls are unreliable. // Format of "present" file is: ([-|],)+ int AvailableHwConcurrency() { FILE *fp = fopen("/sys/devices/system/cpu/present", "r"); if (fp == nullptr) return 1; int num_args, lower, upper, num_cpus=0; while ((num_args = fscanf(fp, "%u-%u", &lower, &upper)) != EOF) { switch(num_args) { case 2: num_cpus += upper - lower + 1; break; case 1: num_cpus += 1; break; } fscanf(fp, ","); } fclose(fp); return (num_cpus > 0) ? num_cpus : 1; } #elif defined(_SC_NPROCESSORS_ONLN) int AvailableHwConcurrency() { int n = sysconf(_SC_NPROCESSORS_ONLN); return (n > 0) ? n : 1; } #elif _WIN32||_WIN64 static std::atomic hardware_concurrency_info; static const WORD TBB_ALL_PROCESSOR_GROUPS = 0xffff; // Statically allocate an array for processor group information. 
// Windows 7 supports maximum 4 groups, but let's look ahead a little. static const WORD MaxProcessorGroups = 64; struct ProcessorGroupInfo { DWORD_PTR mask; ///< Affinity mask covering the whole group int numProcs; ///< Number of processors in the group int numProcsRunningTotal; ///< Subtotal of processors in this and preceding groups //! Total number of processor groups in the system static int NumGroups; //! Index of the group with a slot reserved for the first external thread /** In the context of multiple processor groups support current implementation defines "the first external thread" as the first thread to invoke AvailableHwConcurrency(). TODO: Implement a dynamic scheme remapping workers depending on the pending external threads affinity. **/ static int HoleIndex; }; int ProcessorGroupInfo::NumGroups = 1; int ProcessorGroupInfo::HoleIndex = 0; ProcessorGroupInfo theProcessorGroups[MaxProcessorGroups]; int calculate_numa[MaxProcessorGroups]; //Array needed for FindProcessorGroupIndex to calculate Processor Group when number of threads > number of cores to distribute threads evenly between processor groups int numaSum; struct TBB_GROUP_AFFINITY { DWORD_PTR Mask; WORD Group; WORD Reserved[3]; }; static DWORD (WINAPI *TBB_GetActiveProcessorCount)( WORD groupIndex ) = nullptr; static WORD (WINAPI *TBB_GetActiveProcessorGroupCount)() = nullptr; static BOOL (WINAPI *TBB_SetThreadGroupAffinity)( HANDLE hThread, const TBB_GROUP_AFFINITY* newAff, TBB_GROUP_AFFINITY *prevAff ); static BOOL (WINAPI *TBB_GetThreadGroupAffinity)( HANDLE hThread, TBB_GROUP_AFFINITY* ); static const dynamic_link_descriptor ProcessorGroupsApiLinkTable[] = { DLD(GetActiveProcessorCount, TBB_GetActiveProcessorCount) , DLD(GetActiveProcessorGroupCount, TBB_GetActiveProcessorGroupCount) , DLD(SetThreadGroupAffinity, TBB_SetThreadGroupAffinity) , DLD(GetThreadGroupAffinity, TBB_GetThreadGroupAffinity) }; static void initialize_hardware_concurrency_info () { 
suppress_unused_warning(TBB_ALL_PROCESSOR_GROUPS); #if __TBB_WIN8UI_SUPPORT // For these applications processor groups info is unavailable // Setting up a number of processors for one processor group theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = std::thread::hardware_concurrency(); #else /* __TBB_WIN8UI_SUPPORT */ dynamic_link( "Kernel32.dll", ProcessorGroupsApiLinkTable, sizeof(ProcessorGroupsApiLinkTable)/sizeof(dynamic_link_descriptor) ); SYSTEM_INFO si; GetNativeSystemInfo(&si); DWORD_PTR pam, sam, m = 1; GetProcessAffinityMask( GetCurrentProcess(), &pam, &sam ); int nproc = 0; for ( std::size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1 ) { if ( pam & m ) ++nproc; } int number_of_processors = (int)si.dwNumberOfProcessors; if (nproc > number_of_processors && TBB_GetThreadGroupAffinity) { // Sometimes on systems with multiple processor groups GetNativeSystemInfo // reports mask and processor count from the parent process TBB_GROUP_AFFINITY ga; if (TBB_GetThreadGroupAffinity(GetCurrentThread(), &ga)) { number_of_processors = (int)TBB_GetActiveProcessorCount(ga.Group); } } __TBB_ASSERT( nproc <= number_of_processors, nullptr); // By default setting up a number of processors for one processor group theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = nproc; // Setting up processor groups in case the process does not restrict affinity mask and more than one processor group is present if ( nproc == number_of_processors && TBB_GetActiveProcessorCount ) { // The process does not have restricting affinity mask and multiple processor groups are possible ProcessorGroupInfo::NumGroups = (int)TBB_GetActiveProcessorGroupCount(); __TBB_ASSERT( ProcessorGroupInfo::NumGroups <= MaxProcessorGroups, nullptr); // Fail safety bootstrap. Release versions will limit available concurrency // level, while debug ones would assert. 
if ( ProcessorGroupInfo::NumGroups > MaxProcessorGroups ) ProcessorGroupInfo::NumGroups = MaxProcessorGroups; if ( ProcessorGroupInfo::NumGroups > 1 ) { TBB_GROUP_AFFINITY ga; if ( TBB_GetThreadGroupAffinity( GetCurrentThread(), &ga ) ) ProcessorGroupInfo::HoleIndex = ga.Group; int nprocs = 0; int min_procs = INT_MAX; for ( WORD i = 0; i < ProcessorGroupInfo::NumGroups; ++i ) { ProcessorGroupInfo &pgi = theProcessorGroups[i]; pgi.numProcs = (int)TBB_GetActiveProcessorCount(i); if (pgi.numProcs < min_procs) min_procs = pgi.numProcs; //Finding the minimum number of processors in the Processor Groups calculate_numa[i] = pgi.numProcs; __TBB_ASSERT( pgi.numProcs <= (int)sizeof(DWORD_PTR) * CHAR_BIT, nullptr); pgi.mask = pgi.numProcs == sizeof(DWORD_PTR) * CHAR_BIT ? ~(DWORD_PTR)0 : (DWORD_PTR(1) << pgi.numProcs) - 1; pgi.numProcsRunningTotal = nprocs += pgi.numProcs; } __TBB_ASSERT( nprocs == (int)TBB_GetActiveProcessorCount( TBB_ALL_PROCESSOR_GROUPS ), nullptr); calculate_numa[0] = (calculate_numa[0] / min_procs)-1; for (WORD i = 1; i < ProcessorGroupInfo::NumGroups; ++i) { calculate_numa[i] = calculate_numa[i-1] + (calculate_numa[i] / min_procs); } numaSum = calculate_numa[ProcessorGroupInfo::NumGroups - 1]; } } #endif /* __TBB_WIN8UI_SUPPORT */ PrintExtraVersionInfo("Processor groups", "%d", ProcessorGroupInfo::NumGroups); if (ProcessorGroupInfo::NumGroups>1) for (int i=0; i= theProcessorGroups[current_grp_idx].numProcs && procIdx < theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal) { procIdx = procIdx - theProcessorGroups[current_grp_idx].numProcs; do { current_grp_idx = (current_grp_idx + 1) % (ProcessorGroupInfo::NumGroups); procIdx = procIdx - theProcessorGroups[current_grp_idx].numProcs; } while (procIdx >= 0); } else if (procIdx >= theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal) { int temp_grp_index = 0; procIdx = procIdx - theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal; procIdx = 
procIdx % (numaSum+1); //ProcIdx to stay between 0 and numaSum while (procIdx - calculate_numa[temp_grp_index] > 0) { temp_grp_index = (temp_grp_index + 1) % ProcessorGroupInfo::NumGroups; } current_grp_idx = temp_grp_index; } __TBB_ASSERT(current_grp_idx < ProcessorGroupInfo::NumGroups, nullptr); return current_grp_idx; } void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ) { __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "MoveThreadIntoProcessorGroup is used before AvailableHwConcurrency" ); if ( !TBB_SetThreadGroupAffinity ) return; TBB_GROUP_AFFINITY ga = { theProcessorGroups[groupIndex].mask, (WORD)groupIndex, {0,0,0} }; TBB_SetThreadGroupAffinity( hThread, &ga, nullptr); } int AvailableHwConcurrency() { atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info ); return theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal; } /* End of _WIN32||_WIN64 implementation */ #else #error AvailableHwConcurrency is not implemented for this OS #endif } // namespace r1 } // namespace detail } // namespace tbb #endif /* !__TBB_HardwareConcurrency */ ================================================ FILE: third-party/tbb/src/tbb/observer_proxy.cpp ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "oneapi/tbb/detail/_config.h" #include "oneapi/tbb/detail/_utils.h" #include "observer_proxy.h" #include "arena.h" #include "main.h" #include "thread_data.h" #include namespace tbb { namespace detail { namespace r1 { #if TBB_USE_ASSERT extern std::atomic the_observer_proxy_count; #endif /* TBB_USE_ASSERT */ observer_proxy::observer_proxy( d1::task_scheduler_observer& tso ) : my_ref_count(1), my_list(nullptr), my_next(nullptr), my_prev(nullptr), my_observer(&tso) { #if TBB_USE_ASSERT ++the_observer_proxy_count; #endif /* TBB_USE_ASSERT */ } observer_proxy::~observer_proxy() { __TBB_ASSERT( !my_ref_count, "Attempt to destroy proxy still in use" ); poison_value(my_ref_count); poison_pointer(my_prev); poison_pointer(my_next); #if TBB_USE_ASSERT --the_observer_proxy_count; #endif /* TBB_USE_ASSERT */ } void observer_list::clear() { { scoped_lock lock(mutex(), /*is_writer=*/true); observer_proxy *next = my_head.load(std::memory_order_relaxed); while ( observer_proxy *p = next ) { next = p->my_next; // Both proxy p and observer p->my_observer (if non-null) are guaranteed // to be alive while the list is locked. d1::task_scheduler_observer *obs = p->my_observer; // Make sure that possible concurrent observer destruction does not // conflict with the proxy list cleanup. if (!obs || !(p = obs->my_proxy.exchange(nullptr))) { continue; } // accessing 'obs' after detaching of obs->my_proxy leads to the race with observer destruction __TBB_ASSERT(!next || p == next->my_prev, nullptr); __TBB_ASSERT(is_alive(p->my_ref_count), "Observer's proxy died prematurely"); __TBB_ASSERT(p->my_ref_count.load(std::memory_order_relaxed) == 1, "Reference for observer is missing"); poison_pointer(p->my_observer); remove(p); --p->my_ref_count; delete p; } } // If observe(false) is called concurrently with the destruction of the arena, // need to wait until all proxies are removed. 
for (atomic_backoff backoff; ; backoff.pause()) { scoped_lock lock(mutex(), /*is_writer=*/false); if (my_head.load(std::memory_order_relaxed) == nullptr) { break; } } __TBB_ASSERT(my_head.load(std::memory_order_relaxed) == nullptr && my_tail.load(std::memory_order_relaxed) == nullptr, nullptr); } void observer_list::insert( observer_proxy* p ) { scoped_lock lock(mutex(), /*is_writer=*/true); if (my_head.load(std::memory_order_relaxed)) { p->my_prev = my_tail.load(std::memory_order_relaxed); my_tail.load(std::memory_order_relaxed)->my_next = p; } else { my_head.store(p, std::memory_order_relaxed); } my_tail.store(p, std::memory_order_relaxed); } void observer_list::remove(observer_proxy* p) { __TBB_ASSERT(my_head.load(std::memory_order_relaxed), "Attempt to remove an item from an empty list"); __TBB_ASSERT(!my_tail.load(std::memory_order_relaxed)->my_next, "Last item's my_next must be nullptr"); if (p == my_tail.load(std::memory_order_relaxed)) { __TBB_ASSERT(!p->my_next, nullptr); my_tail.store(p->my_prev, std::memory_order_relaxed); } else { __TBB_ASSERT(p->my_next, nullptr); p->my_next->my_prev = p->my_prev; } if (p == my_head.load(std::memory_order_relaxed)) { __TBB_ASSERT(!p->my_prev, nullptr); my_head.store(p->my_next, std::memory_order_relaxed); } else { __TBB_ASSERT(p->my_prev, nullptr); p->my_prev->my_next = p->my_next; } __TBB_ASSERT((my_head.load(std::memory_order_relaxed) && my_tail.load(std::memory_order_relaxed)) || (!my_head.load(std::memory_order_relaxed) && !my_tail.load(std::memory_order_relaxed)), nullptr); } void observer_list::remove_ref(observer_proxy* p) { std::uintptr_t r = p->my_ref_count.load(std::memory_order_acquire); __TBB_ASSERT(is_alive(r), nullptr); while (r > 1) { if (p->my_ref_count.compare_exchange_strong(r, r - 1)) { return; } } __TBB_ASSERT(r == 1, nullptr); // Reference count might go to zero { // Use lock to avoid resurrection by a thread concurrently walking the list observer_list::scoped_lock lock(mutex(), 
/*is_writer=*/true); r = --p->my_ref_count; if (!r) { remove(p); } } __TBB_ASSERT(r || !p->my_ref_count, nullptr); if (!r) { delete p; } } void observer_list::do_notify_entry_observers(observer_proxy*& last, bool worker) { // Pointer p marches though the list from last (exclusively) to the end. observer_proxy* p = last, * prev = p; for (;;) { d1::task_scheduler_observer* tso = nullptr; // Hold lock on list only long enough to advance to the next proxy in the list. { scoped_lock lock(mutex(), /*is_writer=*/false); do { if (p) { // We were already processing the list. if (observer_proxy* q = p->my_next) { if (p == prev) { remove_ref_fast(prev); // sets prev to nullptr if successful } p = q; } else { // Reached the end of the list. if (p == prev) { // Keep the reference as we store the 'last' pointer in scheduler __TBB_ASSERT(int(p->my_ref_count.load(std::memory_order_relaxed)) >= 1 + (p->my_observer ? 1 : 0), nullptr); } else { // The last few proxies were empty __TBB_ASSERT(int(p->my_ref_count.load(std::memory_order_relaxed)), nullptr); ++p->my_ref_count; if (prev) { lock.release(); remove_ref(prev); } } last = p; return; } } else { // Starting pass through the list p = my_head.load(std::memory_order_relaxed); if (!p) { return; } } tso = p->my_observer; } while (!tso); ++p->my_ref_count; ++tso->my_busy_count; } __TBB_ASSERT(!prev || p != prev, nullptr); // Release the proxy pinned before p if (prev) { remove_ref(prev); } // Do not hold any locks on the list while calling user's code. // Do not intercept any exceptions that may escape the callback so that // they are either handled by the TBB scheduler or passed to the debugger. 
tso->on_scheduler_entry(worker); __TBB_ASSERT(p->my_ref_count.load(std::memory_order_relaxed), nullptr); intptr_t bc = --tso->my_busy_count; __TBB_ASSERT_EX(bc >= 0, "my_busy_count underflowed"); prev = p; } } void observer_list::do_notify_exit_observers(observer_proxy* last, bool worker) { // Pointer p marches though the list from the beginning to last (inclusively). observer_proxy* p = nullptr, * prev = nullptr; for (;;) { d1::task_scheduler_observer* tso = nullptr; // Hold lock on list only long enough to advance to the next proxy in the list. { scoped_lock lock(mutex(), /*is_writer=*/false); do { if (p) { // We were already processing the list. if (p != last) { __TBB_ASSERT(p->my_next, "List items before 'last' must have valid my_next pointer"); if (p == prev) remove_ref_fast(prev); // sets prev to nullptr if successful p = p->my_next; } else { // remove the reference from the last item remove_ref_fast(p); if (p) { lock.release(); if (p != prev && prev) { remove_ref(prev); } remove_ref(p); } return; } } else { // Starting pass through the list p = my_head.load(std::memory_order_relaxed); __TBB_ASSERT(p, "Nonzero 'last' must guarantee that the global list is non-empty"); } tso = p->my_observer; } while (!tso); // The item is already refcounted if (p != last) // the last is already referenced since entry notification ++p->my_ref_count; ++tso->my_busy_count; } __TBB_ASSERT(!prev || p != prev, nullptr); if (prev) remove_ref(prev); // Do not hold any locks on the list while calling user's code. // Do not intercept any exceptions that may escape the callback so that // they are either handled by the TBB scheduler or passed to the debugger. 
tso->on_scheduler_exit(worker); __TBB_ASSERT(p->my_ref_count || p == last, nullptr); intptr_t bc = --tso->my_busy_count; __TBB_ASSERT_EX(bc >= 0, "my_busy_count underflowed"); prev = p; } } void __TBB_EXPORTED_FUNC observe(d1::task_scheduler_observer &tso, bool enable) { if( enable ) { if( !tso.my_proxy.load(std::memory_order_relaxed) ) { observer_proxy* p = new observer_proxy(tso); tso.my_proxy.store(p, std::memory_order_relaxed); tso.my_busy_count.store(0, std::memory_order_relaxed); thread_data* td = governor::get_thread_data_if_initialized(); if (p->my_observer->my_task_arena == nullptr) { if (!(td && td->my_arena)) { td = governor::get_thread_data(); } __TBB_ASSERT(__TBB_InitOnce::initialization_done(), nullptr); __TBB_ASSERT(td && td->my_arena, nullptr); p->my_list = &td->my_arena->my_observers; } else { d1::task_arena* ta = p->my_observer->my_task_arena; arena* a = ta->my_arena.load(std::memory_order_acquire); if (a == nullptr) { // Avoid recursion during arena initialization ta->initialize(); a = ta->my_arena.load(std::memory_order_relaxed); } __TBB_ASSERT(a != nullptr, nullptr); p->my_list = &a->my_observers; } p->my_list->insert(p); // Notify newly activated observer and other pending ones if it belongs to current arena if (td && td->my_arena && &td->my_arena->my_observers == p->my_list) { p->my_list->notify_entry_observers(td->my_last_observer, td->my_is_worker); } } } else { // Make sure that possible concurrent proxy list cleanup does not conflict // with the observer destruction here. if ( observer_proxy* proxy = tso.my_proxy.exchange(nullptr) ) { // List destruction should not touch this proxy after we've won the above interlocked exchange. 
__TBB_ASSERT( proxy->my_observer == &tso, nullptr); __TBB_ASSERT( is_alive(proxy->my_ref_count.load(std::memory_order_relaxed)), "Observer's proxy died prematurely" ); __TBB_ASSERT( proxy->my_ref_count.load(std::memory_order_relaxed) >= 1, "reference for observer missing" ); observer_list &list = *proxy->my_list; { // Ensure that none of the list walkers relies on observer pointer validity observer_list::scoped_lock lock(list.mutex(), /*is_writer=*/true); proxy->my_observer = nullptr; // Proxy may still be held by other threads (to track the last notified observer) if( !--proxy->my_ref_count ) {// nobody can increase it under exclusive lock list.remove(proxy); __TBB_ASSERT( !proxy->my_ref_count, nullptr); delete proxy; } } spin_wait_until_eq(tso.my_busy_count, 0); // other threads are still accessing the callback } } } } // namespace r1 } // namespace detail } // namespace tbb ================================================ FILE: third-party/tbb/src/tbb/observer_proxy.h ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_observer_proxy_H #define __TBB_observer_proxy_H #include "oneapi/tbb/detail/_config.h" #include "oneapi/tbb/detail/_aligned_space.h" #include "oneapi/tbb/task_scheduler_observer.h" #include "oneapi/tbb/spin_rw_mutex.h" namespace tbb { namespace detail { namespace r1 { class observer_list { friend class arena; // Mutex is wrapped with aligned_space to shut up warnings when its destructor // is called while threads are still using it. typedef aligned_space my_mutex_type; //! Pointer to the head of this list. std::atomic my_head{nullptr}; //! Pointer to the tail of this list. std::atomic my_tail{nullptr}; //! Mutex protecting this list. my_mutex_type my_mutex; //! Back-pointer to the arena this list belongs to. arena* my_arena; //! Decrement refcount of the proxy p if there are other outstanding references. /** In case of success sets p to nullptr. Must be invoked from under the list lock. **/ inline static void remove_ref_fast( observer_proxy*& p ); //! Implements notify_entry_observers functionality. void do_notify_entry_observers( observer_proxy*& last, bool worker ); //! Implements notify_exit_observers functionality. void do_notify_exit_observers( observer_proxy* last, bool worker ); public: observer_list () = default; //! Removes and destroys all observer proxies from the list. /** Cannot be used concurrently with other methods. **/ void clear (); //! Add observer proxy to the tail of the list. void insert ( observer_proxy* p ); //! Remove observer proxy from the list. void remove ( observer_proxy* p ); //! Decrement refcount of the proxy and destroy it if necessary. /** When refcount reaches zero removes the proxy from the list and destructs it. **/ void remove_ref( observer_proxy* p ); //! Type of the scoped lock for the reader-writer mutex associated with the list. typedef spin_rw_mutex::scoped_lock scoped_lock; //! Accessor to the reader-writer mutex associated with the list. spin_rw_mutex& mutex () { return my_mutex.begin()[0]; } //! 
Call entry notifications on observers added after last was notified. /** Updates last to become the last notified observer proxy (in the global list) or leaves it to be nullptr. The proxy has its refcount incremented. **/ inline void notify_entry_observers( observer_proxy*& last, bool worker ); //! Call exit notifications on last and observers added before it. inline void notify_exit_observers( observer_proxy*& last, bool worker ); }; // class observer_list //! Wrapper for an observer object /** To maintain shared lists of observers the scheduler first wraps each observer object into a proxy so that a list item remained valid even after the corresponding proxy object is destroyed by the user code. **/ class observer_proxy { friend class d1::task_scheduler_observer; friend class observer_list; friend void observe(d1::task_scheduler_observer&, bool); //! Reference count used for garbage collection. /** 1 for reference from my task_scheduler_observer. 1 for each task dispatcher's last observer pointer. No accounting for neighbors in the shared list. */ std::atomic my_ref_count; //! Reference to the list this observer belongs to. observer_list* my_list; //! Pointer to next observer in the list specified by my_head. /** nullptr for the last item in the list. **/ observer_proxy* my_next; //! Pointer to the previous observer in the list specified by my_head. /** For the head of the list points to the last item. **/ observer_proxy* my_prev; //! Associated observer d1::task_scheduler_observer* my_observer; //! Constructs proxy for the given observer and adds it to the specified list. observer_proxy( d1::task_scheduler_observer& ); ~observer_proxy(); }; // class observer_proxy void observer_list::remove_ref_fast( observer_proxy*& p ) { if( p->my_observer ) { // Can decrement refcount quickly, as it cannot drop to zero while under the lock. 
std::uintptr_t r = --p->my_ref_count; __TBB_ASSERT_EX( r, nullptr); p = nullptr; } else { // Use slow form of refcount decrementing, after the lock is released. } } void observer_list::notify_entry_observers(observer_proxy*& last, bool worker) { if (last == my_tail.load(std::memory_order_relaxed)) return; do_notify_entry_observers(last, worker); } void observer_list::notify_exit_observers( observer_proxy*& last, bool worker ) { if (last == nullptr) { return; } __TBB_ASSERT(!is_poisoned(last), nullptr); do_notify_exit_observers( last, worker ); __TBB_ASSERT(last != nullptr, nullptr); poison_pointer(last); } } // namespace r1 } // namespace detail } // namespace tbb #endif /* __TBB_observer_proxy_H */ ================================================ FILE: third-party/tbb/src/tbb/parallel_pipeline.cpp ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "oneapi/tbb/parallel_pipeline.h" #include "oneapi/tbb/spin_mutex.h" #include "oneapi/tbb/tbb_allocator.h" #include "oneapi/tbb/cache_aligned_allocator.h" #include "itt_notify.h" #include "tls.h" #include "oneapi/tbb/detail/_exception.h" #include "oneapi/tbb/detail/_small_object_pool.h" namespace tbb { namespace detail { namespace r1 { void handle_perror(int error_code, const char* aux_info); using Token = unsigned long; //! A processing pipeline that applies filters to items. 
/** @ingroup algorithms */ class pipeline { friend void parallel_pipeline(d1::task_group_context&, std::size_t, const d1::filter_node&); public: //! Construct empty pipeline. pipeline(d1::task_group_context& cxt, std::size_t max_token) : my_context(cxt), first_filter(nullptr), last_filter(nullptr), input_tokens(Token(max_token)), end_of_input(false), wait_ctx(0) { __TBB_ASSERT( max_token>0, "pipeline::run must have at least one token" ); } ~pipeline(); //! Add filter to end of pipeline. void add_filter( d1::base_filter& ); //! Traverse tree of fitler-node in-order and add filter for each leaf void fill_pipeline(const d1::filter_node& root) { if( root.left && root.right ) { fill_pipeline(*root.left); fill_pipeline(*root.right); } else { __TBB_ASSERT(!root.left && !root.right, "tree should be full"); add_filter(*root.create_filter()); } } private: friend class stage_task; friend class base_filter; friend void set_end_of_input(d1::base_filter& bf); task_group_context& my_context; //! Pointer to first filter in the pipeline. d1::base_filter* first_filter; //! Pointer to last filter in the pipeline. d1::base_filter* last_filter; //! Number of idle tokens waiting for input stage. std::atomic input_tokens; //! False until flow_control::stop() is called. std::atomic end_of_input; d1::wait_context wait_ctx; }; //! This structure is used to store task information in an input buffer struct task_info { void* my_object = nullptr; //! Invalid unless a task went through an ordered stage. Token my_token = 0; //! False until my_token is set. bool my_token_ready = false; //! True if my_object is valid. bool is_valid = false; //! Set to initial state (no object, no token) void reset() { my_object = nullptr; my_token = 0; my_token_ready = false; is_valid = false; } }; //! A buffer of input items for a filter. /** Each item is a task_info, inserted into a position in the buffer corresponding to a Token. 
*/ class input_buffer { friend class base_filter; friend class stage_task; friend class pipeline; friend void set_end_of_input(d1::base_filter& bf); using size_type = Token; //! Array of deferred tasks that cannot yet start executing. task_info* array; //! Size of array /** Always 0 or a power of 2 */ size_type array_size; //! Lowest token that can start executing. /** All prior Token have already been seen. */ Token low_token; //! Serializes updates. spin_mutex array_mutex; //! Resize "array". /** Caller is responsible to acquiring a lock on "array_mutex". */ void grow( size_type minimum_size ); //! Initial size for "array" /** Must be a power of 2 */ static const size_type initial_buffer_size = 4; //! Used for out of order buffer, and for assigning my_token if is_ordered and my_token not already assigned Token high_token; //! True for ordered filter, false otherwise. const bool is_ordered; //! for parallel filters that accepts nullptrs, thread-local flag for reaching end_of_input using end_of_input_tls_t = basic_tls; end_of_input_tls_t end_of_input_tls; bool end_of_input_tls_allocated; // no way to test pthread creation of TLS public: input_buffer(const input_buffer&) = delete; input_buffer& operator=(const input_buffer&) = delete; //! Construct empty buffer. input_buffer( bool ordered) : array(nullptr), array_size(0), low_token(0), high_token(0), is_ordered(ordered), end_of_input_tls(), end_of_input_tls_allocated(false) { grow(initial_buffer_size); __TBB_ASSERT( array, nullptr ); } //! Destroy the buffer. ~input_buffer() { __TBB_ASSERT( array, nullptr ); cache_aligned_allocator().deallocate(array,array_size); poison_pointer( array ); if( end_of_input_tls_allocated ) { destroy_my_tls(); } } //! Define order when the first filter is serial_in_order. Token get_ordered_token(){ return high_token++; } //! Put a token into the buffer. /** If task information was placed into buffer, returns true; otherwise returns false, informing the caller to create and spawn a task. 
*/ bool try_put_token( task_info& info ) { info.is_valid = true; spin_mutex::scoped_lock lock( array_mutex ); Token token; if( is_ordered ) { if( !info.my_token_ready ) { info.my_token = high_token++; info.my_token_ready = true; } token = info.my_token; } else token = high_token++; __TBB_ASSERT( (long)(token-low_token)>=0, nullptr ); if( token!=low_token ) { // Trying to put token that is beyond low_token. // Need to wait until low_token catches up before dispatching. if( token-low_token>=array_size ) grow( token-low_token+1 ); ITT_NOTIFY( sync_releasing, this ); array[token&(array_size-1)] = info; return true; } return false; } //! Note that processing of a token is finished. /** Fires up processing of the next token, if processing was deferred. */ // Uses template to avoid explicit dependency on stage_task. template void try_to_spawn_task_for_next_token(StageTask& spawner, d1::execution_data& ed) { task_info wakee; { spin_mutex::scoped_lock lock( array_mutex ); // Wake the next task task_info& item = array[++low_token & (array_size-1)]; ITT_NOTIFY( sync_acquired, this ); wakee = item; item.is_valid = false; } if( wakee.is_valid ) spawner.spawn_stage_task(wakee, ed); } // end_of_input signal for parallel_pipeline, parallel input filters with 0 tokens allowed. void create_my_tls() { int status = end_of_input_tls.create(); if(status) handle_perror(status, "TLS not allocated for filter"); end_of_input_tls_allocated = true; } void destroy_my_tls() { int status = end_of_input_tls.destroy(); if(status) handle_perror(status, "Failed to destroy filter TLS"); } bool my_tls_end_of_input() { return end_of_input_tls.get() != nullptr; } void set_my_tls_end_of_input() { end_of_input_tls.set(this); } }; void input_buffer::grow( size_type minimum_size ) { size_type old_size = array_size; size_type new_size = old_size ? 
2*old_size : initial_buffer_size; while( new_size().allocate(new_size); task_info* old_array = array; for( size_type i=0; i().deallocate(old_array,old_size); } class stage_task : public d1::task, public task_info { private: friend class pipeline; pipeline& my_pipeline; d1::base_filter* my_filter; d1::small_object_allocator m_allocator; //! True if this task has not yet read the input. bool my_at_start; //! True if this can be executed again. bool execute_filter(d1::execution_data& ed); //! Spawn task if token is available. void try_spawn_stage_task(d1::execution_data& ed) { ITT_NOTIFY( sync_releasing, &my_pipeline.input_tokens ); if( (my_pipeline.input_tokens.fetch_sub(1, std::memory_order_release)) > 1 ) { d1::small_object_allocator alloc{}; r1::spawn( *alloc.new_object(ed, my_pipeline, alloc ), my_pipeline.my_context ); } } public: //! Construct stage_task for first stage in a pipeline. /** Such a stage has not read any input yet. */ stage_task(pipeline& pipeline, d1::small_object_allocator& alloc ) : my_pipeline(pipeline), my_filter(pipeline.first_filter), m_allocator(alloc), my_at_start(true) { task_info::reset(); my_pipeline.wait_ctx.reserve(); } //! Construct stage_task for a subsequent stage in a pipeline. stage_task(pipeline& pipeline, d1::base_filter* filter, const task_info& info, d1::small_object_allocator& alloc) : task_info(info), my_pipeline(pipeline), my_filter(filter), m_allocator(alloc), my_at_start(false) { my_pipeline.wait_ctx.reserve(); } //! Roughly equivalent to the constructor of input stage task void reset() { task_info::reset(); my_filter = my_pipeline.first_filter; my_at_start = true; } void finalize(d1::execution_data& ed) { m_allocator.delete_object(this, ed); } //! 
The virtual task execution method task* execute(d1::execution_data& ed) override { if(!execute_filter(ed)) { finalize(ed); return nullptr; } return this; } task* cancel(d1::execution_data& ed) override { finalize(ed); return nullptr; } ~stage_task() override { if ( my_filter && my_object ) { my_filter->finalize(my_object); my_object = nullptr; } my_pipeline.wait_ctx.release(); } //! Creates and spawns stage_task from task_info void spawn_stage_task(const task_info& info, d1::execution_data& ed) { d1::small_object_allocator alloc{}; stage_task* clone = alloc.new_object(ed, my_pipeline, my_filter, info, alloc); r1::spawn(*clone, my_pipeline.my_context); } }; bool stage_task::execute_filter(d1::execution_data& ed) { __TBB_ASSERT( !my_at_start || !my_object, "invalid state of task" ); if( my_at_start ) { if( my_filter->is_serial() ) { my_object = (*my_filter)(my_object); if( my_object || ( my_filter->object_may_be_null() && !my_pipeline.end_of_input.load(std::memory_order_relaxed)) ) { if( my_filter->is_ordered() ) { my_token = my_filter->my_input_buffer->get_ordered_token(); my_token_ready = true; } if( !my_filter->next_filter_in_pipeline ) { // we're only filter in pipeline reset(); return true; } else { try_spawn_stage_task(ed); } } else { my_pipeline.end_of_input.store(true, std::memory_order_relaxed); return false; } } else /*not is_serial*/ { if ( my_pipeline.end_of_input.load(std::memory_order_relaxed) ) { return false; } try_spawn_stage_task(ed); my_object = (*my_filter)(my_object); if( !my_object && (!my_filter->object_may_be_null() || my_filter->my_input_buffer->my_tls_end_of_input()) ){ my_pipeline.end_of_input.store(true, std::memory_order_relaxed); return false; } } my_at_start = false; } else { my_object = (*my_filter)(my_object); if( my_filter->is_serial() ) my_filter->my_input_buffer->try_to_spawn_task_for_next_token(*this, ed); } my_filter = my_filter->next_filter_in_pipeline; if( my_filter ) { // There is another filter to execute. 
if( my_filter->is_serial() ) { // The next filter must execute tokens when they are available (in order for serial_in_order) if( my_filter->my_input_buffer->try_put_token(*this) ){ my_filter = nullptr; // To prevent deleting my_object twice if exception occurs return false; } } } else { // Reached end of the pipe. std::size_t ntokens_avail = my_pipeline.input_tokens.fetch_add(1, std::memory_order_acquire); if( ntokens_avail>0 // Only recycle if there is one available token || my_pipeline.end_of_input.load(std::memory_order_relaxed) ) { return false; // No need to recycle for new input } ITT_NOTIFY( sync_acquired, &my_pipeline.input_tokens ); // Recycle as an input stage task. reset(); } return true; } pipeline::~pipeline() { while( first_filter ) { d1::base_filter* f = first_filter; if( input_buffer* b = f->my_input_buffer ) { b->~input_buffer(); deallocate_memory(b); } first_filter = f->next_filter_in_pipeline; f->~base_filter(); deallocate_memory(f); } } void pipeline::add_filter( d1::base_filter& new_fitler ) { __TBB_ASSERT( new_fitler.next_filter_in_pipeline==d1::base_filter::not_in_pipeline(), "filter already part of pipeline?" 
); new_fitler.my_pipeline = this; if ( first_filter == nullptr ) first_filter = &new_fitler; else last_filter->next_filter_in_pipeline = &new_fitler; new_fitler.next_filter_in_pipeline = nullptr; last_filter = &new_fitler; if( new_fitler.is_serial() ) { new_fitler.my_input_buffer = new (allocate_memory(sizeof(input_buffer))) input_buffer( new_fitler.is_ordered() ); } else { if( first_filter == &new_fitler && new_fitler.object_may_be_null() ) { //TODO: buffer only needed to hold TLS; could improve new_fitler.my_input_buffer = new (allocate_memory(sizeof(input_buffer))) input_buffer( /*is_ordered*/false ); new_fitler.my_input_buffer->create_my_tls(); } } } void __TBB_EXPORTED_FUNC parallel_pipeline(d1::task_group_context& cxt, std::size_t max_token, const d1::filter_node& fn) { pipeline pipe(cxt, max_token); pipe.fill_pipeline(fn); d1::small_object_allocator alloc{}; stage_task& st = *alloc.new_object(pipe, alloc); // Start execution of tasks r1::execute_and_wait(st, cxt, pipe.wait_ctx, cxt); } void __TBB_EXPORTED_FUNC set_end_of_input(d1::base_filter& bf) { __TBB_ASSERT(bf.my_input_buffer, nullptr); __TBB_ASSERT(bf.object_may_be_null(), nullptr); if(bf.is_serial() ) { bf.my_pipeline->end_of_input.store(true, std::memory_order_relaxed); } else { __TBB_ASSERT(bf.my_input_buffer->end_of_input_tls_allocated, nullptr); bf.my_input_buffer->set_my_tls_end_of_input(); } } } // namespace r1 } // namespace detail } // namespace tbb ================================================ FILE: third-party/tbb/src/tbb/permit_manager.h ================================================ /* Copyright (c) 2022-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef _TBB_permit_manager_H #define _TBB_permit_manager_H #include "oneapi/tbb/info.h" #include "oneapi/tbb/detail/_utils.h" #include "thread_request_serializer.h" namespace tbb { namespace detail { namespace r1 { class arena; class pm_client; class permit_manager : no_copy { public: virtual ~permit_manager() {} virtual pm_client* create_client(arena& a) = 0; virtual void register_client(pm_client* client, d1::constraints& constraints) = 0; virtual void unregister_and_destroy_client(pm_client& c) = 0; virtual void set_active_num_workers(int soft_limit) = 0; virtual void adjust_demand(pm_client&, int mandatory_delta, int workers_delta) = 0; void set_thread_request_observer(thread_request_observer& tr_observer) { __TBB_ASSERT(!my_thread_request_observer, "set_thread_request_observer was called already?"); my_thread_request_observer = &tr_observer; } protected: void notify_thread_request(int delta) { __TBB_ASSERT(my_thread_request_observer, "set_thread_request_observer was not called?"); if (delta) { my_thread_request_observer->update(delta); } } private: thread_request_observer* my_thread_request_observer{nullptr}; }; } // namespace r1 } // namespace detail } // namespace tbb #endif // _TBB_permit_manager_H ================================================ FILE: third-party/tbb/src/tbb/pm_client.h ================================================ /* Copyright (c) 2022-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef _TBB_pm_client_H #define _TBB_pm_client_H #include "arena.h" namespace tbb { namespace detail { namespace r1 { class pm_client { public: pm_client(arena& a) : my_arena(a) {} virtual ~pm_client() {} unsigned priority_level() { return my_arena.priority_level(); } void set_top_priority(bool b) { my_arena.set_top_priority(b); } int min_workers() const { return my_min_workers; } int max_workers() const { return my_max_workers; } int update_request(int mandatory_delta, int workers_delta) { auto min_max_workers = my_arena.update_request(mandatory_delta, workers_delta); int delta = min_max_workers.second - my_max_workers; set_workers(min_max_workers.first, min_max_workers.second); return delta; } virtual void register_thread() = 0; virtual void unregister_thread() = 0; protected: void set_workers(int mn_w, int mx_w) { __TBB_ASSERT(mn_w >= 0, nullptr); __TBB_ASSERT(mx_w >= 0, nullptr); my_min_workers = mn_w; my_max_workers = mx_w; } arena& my_arena; int my_min_workers{0}; int my_max_workers{0}; }; } // namespace r1 } // namespace detail } // namespace tbb #endif // _TBB_pm_client_H ================================================ FILE: third-party/tbb/src/tbb/private_server.cpp ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "oneapi/tbb/cache_aligned_allocator.h" #include "oneapi/tbb/mutex.h" #include "rml_tbb.h" #include "rml_thread_monitor.h" #include "scheduler_common.h" #include "governor.h" #include "misc.h" #include namespace tbb { namespace detail { namespace r1 { namespace rml { using rml::internal::thread_monitor; typedef thread_monitor::handle_type thread_handle; class private_server; class private_worker: no_copy { private: //! State in finite-state machine that controls the worker. /** State diagram: init --> starting --> normal | | | | V | \------> quit <------/ */ enum state_t { //! *this is initialized st_init, //! *this has associated thread that is starting up. st_starting, //! Associated thread is doing normal life sequence. st_normal, //! Associated thread has ended normal life sequence and promises to never touch *this again. st_quit }; std::atomic my_state; //! Associated server private_server& my_server; //! Associated client tbb_client& my_client; //! index used for avoiding the 64K aliasing problem const std::size_t my_index; //! Monitor for sleeping when there is no work to do. /** The invariant that holds for sleeping workers is: "my_slack<=0 && my_state==st_normal && I am on server's list of asleep threads" */ thread_monitor my_thread_monitor; //! Handle of the OS thread associated with this worker thread_handle my_handle; //! Link for list of workers that are sleeping or have no associated thread. private_worker* my_next; friend class private_server; //! Actions executed by the associated thread void run() noexcept; //! 
Wake up associated thread (or launch a thread if there is none) void wake_or_launch(); //! Called by a thread (usually not the associated thread) to commence termination. void start_shutdown(); static __RML_DECL_THREAD_ROUTINE thread_routine( void* arg ); static void release_handle(thread_handle my_handle, bool join); protected: private_worker( private_server& server, tbb_client& client, const std::size_t i ) : my_state(st_init), my_server(server), my_client(client), my_index(i), my_handle(), my_next() {} }; static const std::size_t cache_line_size = tbb::detail::max_nfs_size; #if _MSC_VER && !defined(__INTEL_COMPILER) // Suppress overzealous compiler warnings about uninstantiable class #pragma warning(push) #pragma warning(disable:4510 4610) #endif class padded_private_worker: public private_worker { char pad[cache_line_size - sizeof(private_worker)%cache_line_size]; public: padded_private_worker( private_server& server, tbb_client& client, const std::size_t i ) : private_worker(server,client,i) { suppress_unused_warning(pad); } }; #if _MSC_VER && !defined(__INTEL_COMPILER) #pragma warning(pop) #endif class private_server: public tbb_server, no_copy { private: tbb_client& my_client; //! Maximum number of threads to be created. /** Threads are created lazily, so maximum might not actually be reached. */ const tbb_client::size_type my_n_thread; //! Stack size for each thread. */ const std::size_t my_stack_size; //! Number of jobs that could use their associated thread minus number of active threads. /** If negative, indicates oversubscription. If positive, indicates that more threads should run. Can be lowered asynchronously, but must be raised only while holding my_asleep_list_mutex, because raising it impacts the invariant for sleeping threads. */ std::atomic my_slack; //! Counter used to determine when to delete this. std::atomic my_ref_count; padded_private_worker* my_thread_array; //! 
List of workers that are asleep or committed to sleeping until notified by another thread. std::atomic my_asleep_list_root; //! Protects my_asleep_list_root typedef mutex asleep_list_mutex_type; asleep_list_mutex_type my_asleep_list_mutex; #if TBB_USE_ASSERT std::atomic my_net_slack_requests; #endif /* TBB_USE_ASSERT */ //! Wake up to two sleeping workers, if there are any sleeping. /** The call is used to propagate a chain reaction where each thread wakes up two threads, which in turn each wake up two threads, etc. */ void propagate_chain_reaction() { // First test of a double-check idiom. Second test is inside wake_some(0). if( my_asleep_list_root.load(std::memory_order_relaxed) ) wake_some(0); } //! Try to add t to list of sleeping workers bool try_insert_in_asleep_list( private_worker& t ); //! Equivalent of adding additional_slack to my_slack and waking up to 2 threads if my_slack permits. void wake_some( int additional_slack ); ~private_server() override; void remove_server_ref() { if( --my_ref_count==0 ) { my_client.acknowledge_close_connection(); this->~private_server(); tbb::cache_aligned_allocator().deallocate( this, 1 ); } } friend class private_worker; public: private_server( tbb_client& client ); version_type version() const override { return 0; } void request_close_connection( bool /*exiting*/ ) override { for( std::size_t i=0; i=2 && !__MINGW64__ // ensure that stack is properly aligned for TBB threads __attribute__((force_align_arg_pointer)) #endif __RML_DECL_THREAD_ROUTINE private_worker::thread_routine( void* arg ) { private_worker* self = static_cast(arg); AVOID_64K_ALIASING( self->my_index ); self->run(); // return 0 instead of nullptr due to the difference in the type __RML_DECL_THREAD_ROUTINE on various OSs return 0; } #if _MSC_VER && !defined(__INTEL_COMPILER) #pragma warning(pop) #endif void private_worker::release_handle(thread_handle handle, bool join) { if (join) thread_monitor::join(handle); else thread_monitor::detach_thread(handle); } 
void private_worker::start_shutdown() { __TBB_ASSERT(my_state.load(std::memory_order_relaxed) != st_quit, "The quit state is expected to be set only once"); // `acq` to acquire my_handle // `rel` to release market state state_t prev_state = my_state.exchange(st_quit, std::memory_order_acq_rel); if (prev_state == st_init) { // Perform action that otherwise would be performed by associated thread when it quits. my_server.remove_server_ref(); } else { __TBB_ASSERT(prev_state == st_normal || prev_state == st_starting, nullptr); // May have invalidated invariant for sleeping, so wake up the thread. // Note that the notify() here occurs without maintaining invariants for my_slack. // It does not matter, because my_state==st_quit overrides checking of my_slack. my_thread_monitor.notify(); // Do not need release handle in st_init state, // because in this case the thread wasn't started yet. // For st_starting release is done at launch site. if (prev_state == st_normal) release_handle(my_handle, governor::does_client_join_workers(my_client)); } } void private_worker::run() noexcept { my_server.propagate_chain_reaction(); // Transiting to st_normal here would require setting my_handle, // which would create race with the launching thread and // complications in handle management on Windows. ::rml::job& j = *my_client.create_one_job(); // memory_order_seq_cst to be strictly ordered after thread_monitor::wait on the next iteration while( my_state.load(std::memory_order_seq_cst)!=st_quit ) { if( my_server.my_slack.load(std::memory_order_acquire)>=0 ) { my_client.process(j); } else if( my_server.try_insert_in_asleep_list(*this) ) { my_thread_monitor.wait(); __TBB_ASSERT(my_state.load(std::memory_order_relaxed) == st_quit || !my_next, "Thread monitor missed a spurious wakeup?" 
); my_server.propagate_chain_reaction(); } } my_client.cleanup(j); ++my_server.my_slack; my_server.remove_server_ref(); } inline void private_worker::wake_or_launch() { state_t state = my_state.load(std::memory_order_relaxed); switch (state) { case st_starting: __TBB_fallthrough; case st_normal: __TBB_ASSERT(!my_next, "Should not wake a thread while it's still in asleep list"); my_thread_monitor.notify(); break; case st_init: if (my_state.compare_exchange_strong(state, st_starting)) { // after this point, remove_server_ref() must be done by created thread #if __TBB_USE_WINAPI // Win thread_monitor::launch is designed on the assumption that the workers thread id go from 1 to Hard limit set by TBB market::global_market const std::size_t worker_idx = my_server.my_n_thread - this->my_index; my_handle = thread_monitor::launch(thread_routine, this, my_server.my_stack_size, &worker_idx); #elif __TBB_USE_POSIX { affinity_helper fpa; fpa.protect_affinity_mask( /*restore_process_mask=*/true); my_handle = thread_monitor::launch(thread_routine, this, my_server.my_stack_size); // Implicit destruction of fpa resets original affinity mask. } #endif /* __TBB_USE_POSIX */ state = st_starting; if (!my_state.compare_exchange_strong(state, st_normal)) { // Do shutdown during startup. my_handle can't be released // by start_shutdown, because my_handle value might be not set yet // at time of transition from st_starting to st_quit. 
__TBB_ASSERT(state == st_quit, nullptr); release_handle(my_handle, governor::does_client_join_workers(my_client)); } } break; default: __TBB_ASSERT(state == st_quit, nullptr); } } //------------------------------------------------------------------------ // Methods of private_server //------------------------------------------------------------------------ private_server::private_server( tbb_client& client ) : my_client(client), my_n_thread(client.max_job_count()), my_stack_size(client.min_stack_size()), my_slack(0), my_ref_count(my_n_thread+1), my_thread_array(nullptr), my_asleep_list_root(nullptr) #if TBB_USE_ASSERT , my_net_slack_requests(0) #endif /* TBB_USE_ASSERT */ { my_thread_array = tbb::cache_aligned_allocator().allocate( my_n_thread ); for( std::size_t i=0; imy_next = my_asleep_list_root.load(std::memory_order_relaxed); my_asleep_list_root.store(t, std::memory_order_relaxed); } } private_server::~private_server() { __TBB_ASSERT( my_net_slack_requests==0, nullptr); for( std::size_t i=my_n_thread; i--; ) my_thread_array[i].~padded_private_worker(); tbb::cache_aligned_allocator().deallocate( my_thread_array, my_n_thread ); tbb::detail::poison_pointer( my_thread_array ); } inline bool private_server::try_insert_in_asleep_list( private_worker& t ) { asleep_list_mutex_type::scoped_lock lock; if( !lock.try_acquire(my_asleep_list_mutex) ) return false; // Contribute to slack under lock so that if another takes that unit of slack, // it sees us sleeping on the list and wakes us up. 
auto expected = my_slack.load(std::memory_order_relaxed); while (expected < 0) { if (my_slack.compare_exchange_strong(expected, expected + 1)) { t.my_next = my_asleep_list_root.load(std::memory_order_relaxed); my_asleep_list_root.store(&t, std::memory_order_relaxed); return true; } } return false; } void private_server::wake_some( int additional_slack ) { __TBB_ASSERT( additional_slack>=0, nullptr ); private_worker* wakee[2]; private_worker**w = wakee; if (additional_slack) { // Contribute our unused slack to my_slack. my_slack += additional_slack; } int allotted_slack = 0; while (allotted_slack < 2) { // Chain reaction; Try to claim unit of slack int old = my_slack.load(std::memory_order_relaxed); do { if (old <= 0) goto done; } while (!my_slack.compare_exchange_strong(old, old - 1)); ++allotted_slack; } done: if (allotted_slack) { asleep_list_mutex_type::scoped_lock lock(my_asleep_list_mutex); auto root = my_asleep_list_root.load(std::memory_order_relaxed); while( root && wmy_next; } my_asleep_list_root.store(root, std::memory_order_relaxed); if(allotted_slack) { // Contribute our unused slack to my_slack. my_slack += allotted_slack; } } while( w>wakee ) { private_worker* ww = *--w; ww->my_next = nullptr; ww->wake_or_launch(); } } void private_server::adjust_job_count_estimate( int delta ) { #if TBB_USE_ASSERT my_net_slack_requests+=delta; #endif /* TBB_USE_ASSERT */ if( delta<0 ) { my_slack+=delta; } else if( delta>0 ) { wake_some( delta ); } } //! Factory method called from task.cpp to create a private_server. 
tbb_server* make_private_server( tbb_client& client ) { return new( tbb::cache_aligned_allocator().allocate(1) ) private_server(client); } } // namespace rml } // namespace r1 } // namespace detail } // namespace tbb ================================================ FILE: third-party/tbb/src/tbb/profiling.cpp ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "oneapi/tbb/detail/_config.h" #include "oneapi/tbb/detail/_template_helpers.h" #include "main.h" #include "itt_notify.h" #include "oneapi/tbb/profiling.h" #include namespace tbb { namespace detail { namespace r1 { #if __TBB_USE_ITT_NOTIFY bool ITT_Present; static std::atomic ITT_InitializationDone; static __itt_domain *tbb_domains[d1::ITT_NUM_DOMAINS] = {}; struct resource_string { const char *str; __itt_string_handle *itt_str_handle; }; // // populate resource strings // #define TBB_STRING_RESOURCE( index_name, str ) { str, nullptr }, static resource_string strings_for_itt[] = { #include "oneapi/tbb/detail/_string_resource.h" { "num_resource_strings", nullptr } }; #undef TBB_STRING_RESOURCE static __itt_string_handle* ITT_get_string_handle(std::uintptr_t idx) { __TBB_ASSERT(idx < NUM_STRINGS, "string handle out of valid range"); return idx < NUM_STRINGS ? 
strings_for_itt[idx].itt_str_handle : nullptr; } static void ITT_init_domains() { tbb_domains[d1::ITT_DOMAIN_MAIN] = __itt_domain_create( _T("tbb") ); tbb_domains[d1::ITT_DOMAIN_MAIN]->flags = 1; tbb_domains[d1::ITT_DOMAIN_FLOW] = __itt_domain_create( _T("tbb.flow") ); tbb_domains[d1::ITT_DOMAIN_FLOW]->flags = 1; tbb_domains[d1::ITT_DOMAIN_ALGO] = __itt_domain_create( _T("tbb.algorithm") ); tbb_domains[d1::ITT_DOMAIN_ALGO]->flags = 1; } static void ITT_init_strings() { for ( std::uintptr_t i = 0; i < NUM_STRINGS; ++i ) { #if _WIN32||_WIN64 strings_for_itt[i].itt_str_handle = __itt_string_handle_createA( strings_for_itt[i].str ); #else strings_for_itt[i].itt_str_handle = __itt_string_handle_create( strings_for_itt[i].str ); #endif } } static void ITT_init() { ITT_init_domains(); ITT_init_strings(); } /** Thread-unsafe lazy one-time initialization of tools interop. Used by both dummy handlers and general TBB one-time initialization routine. **/ void ITT_DoUnsafeOneTimeInitialization () { // Double check ITT_InitializationDone is necessary because the first check // in ITT_DoOneTimeInitialization is not guarded with the __TBB_InitOnce lock. if ( !ITT_InitializationDone ) { ITT_Present = (__TBB_load_ittnotify()!=0); if (ITT_Present) ITT_init(); ITT_InitializationDone = true; } } /** Thread-safe lazy one-time initialization of tools interop. Used by dummy handlers only. 
**/ extern "C" void ITT_DoOneTimeInitialization() { if ( !ITT_InitializationDone ) { __TBB_InitOnce::lock(); ITT_DoUnsafeOneTimeInitialization(); __TBB_InitOnce::unlock(); } } void create_itt_sync(void* ptr, const tchar* objtype, const tchar* objname) { ITT_SYNC_CREATE(ptr, objtype, objname); } void call_itt_notify(int t, void *ptr) { switch (t) { case 0: ITT_NOTIFY(sync_prepare, ptr); break; case 1: ITT_NOTIFY(sync_cancel, ptr); break; case 2: ITT_NOTIFY(sync_acquired, ptr); break; case 3: ITT_NOTIFY(sync_releasing, ptr); break; case 4: ITT_NOTIFY(sync_destroy, ptr); break; } } void itt_set_sync_name(void* obj, const tchar* name) { __itt_sync_rename(obj, name); } const __itt_id itt_null_id = { 0, 0, 0 }; static inline __itt_domain* get_itt_domain(d1::itt_domain_enum idx) { if (tbb_domains[idx] == nullptr) { ITT_DoOneTimeInitialization(); } return tbb_domains[idx]; } static inline void itt_id_make(__itt_id* id, void* addr, unsigned long long extra) { *id = __itt_id_make(addr, extra); } static inline void itt_id_create(const __itt_domain* domain, __itt_id id) { __itt_id_create(domain, id); } void itt_make_task_group(d1::itt_domain_enum domain, void* group, unsigned long long group_extra, void* parent, unsigned long long parent_extra, string_resource_index name_index) { if (__itt_domain* d = get_itt_domain(domain)) { __itt_id group_id = itt_null_id; __itt_id parent_id = itt_null_id; itt_id_make(&group_id, group, group_extra); itt_id_create(d, group_id); if (parent) { itt_id_make(&parent_id, parent, parent_extra); } __itt_string_handle* n = ITT_get_string_handle(name_index); __itt_task_group(d, group_id, parent_id, n); } } void __TBB_EXPORTED_FUNC itt_metadata_str_add(d1::itt_domain_enum domain, void *addr, unsigned long long addr_extra, string_resource_index key, const char *value ) { if ( __itt_domain *d = get_itt_domain( domain ) ) { __itt_id id = itt_null_id; itt_id_make( &id, addr, addr_extra ); __itt_string_handle *k = ITT_get_string_handle(key); size_t 
value_length = strlen( value ); #if _WIN32||_WIN64 __itt_metadata_str_addA(d, id, k, value, value_length); #else __itt_metadata_str_add(d, id, k, value, value_length); #endif } } void __TBB_EXPORTED_FUNC itt_metadata_ptr_add(d1::itt_domain_enum domain, void *addr, unsigned long long addr_extra, string_resource_index key, void *value ) { if ( __itt_domain *d = get_itt_domain( domain ) ) { __itt_id id = itt_null_id; itt_id_make( &id, addr, addr_extra ); __itt_string_handle *k = ITT_get_string_handle(key); #if __TBB_x86_32 __itt_metadata_add(d, id, k, __itt_metadata_u32, 1, value); #else __itt_metadata_add(d, id, k, __itt_metadata_u64, 1, value); #endif } } void __TBB_EXPORTED_FUNC itt_relation_add(d1::itt_domain_enum domain, void *addr0, unsigned long long addr0_extra, itt_relation relation, void *addr1, unsigned long long addr1_extra ) { if ( __itt_domain *d = get_itt_domain( domain ) ) { __itt_id id0 = itt_null_id; __itt_id id1 = itt_null_id; itt_id_make( &id0, addr0, addr0_extra ); itt_id_make( &id1, addr1, addr1_extra ); __itt_relation_add( d, id0, (__itt_relation)relation, id1 ); } } void __TBB_EXPORTED_FUNC itt_task_begin(d1::itt_domain_enum domain, void* task, unsigned long long task_extra, void* parent, unsigned long long parent_extra, string_resource_index name_index) { if (__itt_domain* d = get_itt_domain(domain)) { __itt_id task_id = itt_null_id; __itt_id parent_id = itt_null_id; if (task) { itt_id_make(&task_id, task, task_extra); } if (parent) { itt_id_make(&parent_id, parent, parent_extra); } __itt_string_handle* n = ITT_get_string_handle(name_index); __itt_task_begin(d, task_id, parent_id, n); } } void __TBB_EXPORTED_FUNC itt_task_end(d1::itt_domain_enum domain) { if (__itt_domain* d = get_itt_domain(domain)) { __itt_task_end(d); } } void __TBB_EXPORTED_FUNC itt_region_begin(d1::itt_domain_enum domain, void *region, unsigned long long region_extra, void *parent, unsigned long long parent_extra, string_resource_index /* name_index */ ) { if ( 
__itt_domain *d = get_itt_domain( domain ) ) { __itt_id region_id = itt_null_id; __itt_id parent_id = itt_null_id; itt_id_make( ®ion_id, region, region_extra ); if ( parent ) { itt_id_make( &parent_id, parent, parent_extra ); } __itt_region_begin( d, region_id, parent_id, nullptr ); } } void __TBB_EXPORTED_FUNC itt_region_end(d1::itt_domain_enum domain, void *region, unsigned long long region_extra ) { if ( __itt_domain *d = get_itt_domain( domain ) ) { __itt_id region_id = itt_null_id; itt_id_make( ®ion_id, region, region_extra ); __itt_region_end( d, region_id ); } } #else void create_itt_sync(void* /*ptr*/, const tchar* /*objtype*/, const tchar* /*objname*/) {} void call_itt_notify(int /*t*/, void* /*ptr*/) {} void itt_set_sync_name(void* /*obj*/, const tchar* /*name*/) {} void itt_make_task_group(d1::itt_domain_enum /*domain*/, void* /*group*/, unsigned long long /*group_extra*/, void* /*parent*/, unsigned long long /*parent_extra*/, string_resource_index /*name_index*/) {} void itt_metadata_str_add(d1::itt_domain_enum /*domain*/, void* /*addr*/, unsigned long long /*addr_extra*/, string_resource_index /*key*/, const char* /*value*/ ) { } void itt_metadata_ptr_add(d1::itt_domain_enum /*domain*/, void * /*addr*/, unsigned long long /*addr_extra*/, string_resource_index /*key*/, void * /*value*/ ) {} void itt_relation_add(d1::itt_domain_enum /*domain*/, void* /*addr0*/, unsigned long long /*addr0_extra*/, itt_relation /*relation*/, void* /*addr1*/, unsigned long long /*addr1_extra*/ ) { } void itt_task_begin(d1::itt_domain_enum /*domain*/, void* /*task*/, unsigned long long /*task_extra*/, void* /*parent*/, unsigned long long /*parent_extra*/, string_resource_index /*name_index*/ ) { } void itt_task_end(d1::itt_domain_enum /*domain*/ ) { } void itt_region_begin(d1::itt_domain_enum /*domain*/, void* /*region*/, unsigned long long /*region_extra*/, void* /*parent*/, unsigned long long /*parent_extra*/, string_resource_index /*name_index*/ ) { } void 
itt_region_end(d1::itt_domain_enum /*domain*/, void* /*region*/, unsigned long long /*region_extra*/ ) { } #endif /* __TBB_USE_ITT_NOTIFY */ const tchar *SyncType_Scheduler = _T("%Constant") ; const tchar *SyncObj_ContextsList = _T("TBB Scheduler") ; } // namespace r1 } // namespace detail } // namespace tbb ================================================ FILE: third-party/tbb/src/tbb/queuing_rw_mutex.cpp ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /** Before making any changes in the implementation, please emulate algorithmic changes with SPIN tool using /tools/spin_models/ReaderWriterMutex.pml. There could be some code looking as "can be restructured" but its structure does matter! */ #include "oneapi/tbb/queuing_rw_mutex.h" #include "oneapi/tbb/detail/_assert.h" #include "oneapi/tbb/detail/_utils.h" #include "itt_notify.h" namespace tbb { namespace detail { namespace r1 { #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) // Workaround for overzealous compiler warnings #pragma warning (push) #pragma warning (disable: 4311 4312) #endif //! A view of a T* with additional functionality for twiddling low-order bits. 
template class tricky_atomic_pointer { public: using word = uintptr_t; static T* fetch_add( std::atomic& location, word addend, std::memory_order memory_order ) { return reinterpret_cast(location.fetch_add(addend, memory_order)); } static T* exchange( std::atomic& location, T* value, std::memory_order memory_order ) { return reinterpret_cast(location.exchange(reinterpret_cast(value), memory_order)); } static T* compare_exchange_strong( std::atomic& obj, const T* expected, const T* desired, std::memory_order memory_order ) { word expd = reinterpret_cast(expected); obj.compare_exchange_strong(expd, reinterpret_cast(desired), memory_order); return reinterpret_cast(expd); } static void store( std::atomic& location, const T* value, std::memory_order memory_order ) { location.store(reinterpret_cast(value), memory_order); } static T* load( std::atomic& location, std::memory_order memory_order ) { return reinterpret_cast(location.load(memory_order)); } static void spin_wait_while_eq(const std::atomic& location, const T* value) { tbb::detail::d0::spin_wait_while_eq(location, reinterpret_cast(value) ); } T* & ref; tricky_atomic_pointer( T*& original ) : ref(original) {}; tricky_atomic_pointer(const tricky_atomic_pointer&) = delete; tricky_atomic_pointer& operator=(const tricky_atomic_pointer&) = delete; T* operator&( const word operand2 ) const { return reinterpret_cast( reinterpret_cast(ref) & operand2 ); } T* operator|( const word operand2 ) const { return reinterpret_cast( reinterpret_cast(ref) | operand2 ); } }; using tricky_pointer = tricky_atomic_pointer; #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) // Workaround for overzealous compiler warnings #pragma warning (pop) #endif //! Flag bits in a state_t that specify information about a locking request. 
enum state_t_flags : unsigned char { STATE_NONE = 0, STATE_WRITER = 1<<0, STATE_READER = 1<<1, STATE_READER_UNBLOCKNEXT = 1<<2, STATE_ACTIVEREADER = 1<<3, STATE_UPGRADE_REQUESTED = 1<<4, STATE_UPGRADE_WAITING = 1<<5, STATE_UPGRADE_LOSER = 1<<6, STATE_COMBINED_WAITINGREADER = STATE_READER | STATE_READER_UNBLOCKNEXT, STATE_COMBINED_READER = STATE_COMBINED_WAITINGREADER | STATE_ACTIVEREADER, STATE_COMBINED_UPGRADING = STATE_UPGRADE_WAITING | STATE_UPGRADE_LOSER }; static const unsigned char RELEASED = 0; static const unsigned char ACQUIRED = 1; struct queuing_rw_mutex_impl { //! Try to acquire the internal lock /** Returns true if lock was successfully acquired. */ static bool try_acquire_internal_lock(d1::queuing_rw_mutex::scoped_lock& s) { auto expected = RELEASED; return s.my_internal_lock.compare_exchange_strong(expected, ACQUIRED); } //! Acquire the internal lock static void acquire_internal_lock(d1::queuing_rw_mutex::scoped_lock& s) { // Usually, we would use the test-test-and-set idiom here, with exponential backoff. // But so far, experiments indicate there is no value in doing so here. while( !try_acquire_internal_lock(s) ) { machine_pause(1); } } //! Release the internal lock static void release_internal_lock(d1::queuing_rw_mutex::scoped_lock& s) { s.my_internal_lock.store(RELEASED, std::memory_order_release); } //! Wait for internal lock to be released static void wait_for_release_of_internal_lock(d1::queuing_rw_mutex::scoped_lock& s) { spin_wait_until_eq(s.my_internal_lock, RELEASED); } //! A helper function static void unblock_or_wait_on_internal_lock(d1::queuing_rw_mutex::scoped_lock& s, uintptr_t flag ) { if( flag ) { wait_for_release_of_internal_lock(s); } else { release_internal_lock(s); } } //! Mask for low order bit of a pointer. 
static const tricky_pointer::word FLAG = 0x1; static uintptr_t get_flag( d1::queuing_rw_mutex::scoped_lock* ptr ) { return reinterpret_cast(ptr) & FLAG; } //------------------------------------------------------------------------ // Methods of queuing_rw_mutex::scoped_lock //------------------------------------------------------------------------ //! A method to acquire queuing_rw_mutex lock static void acquire(d1::queuing_rw_mutex& m, d1::queuing_rw_mutex::scoped_lock& s, bool write) { __TBB_ASSERT( !s.my_mutex, "scoped_lock is already holding a mutex"); // Must set all fields before the exchange, because once the // exchange executes, *this becomes accessible to other threads. s.my_mutex = &m; s.my_prev.store(0U, std::memory_order_relaxed); s.my_next.store(0U, std::memory_order_relaxed); s.my_going.store(0U, std::memory_order_relaxed); s.my_state.store(d1::queuing_rw_mutex::scoped_lock::state_t(write ? STATE_WRITER : STATE_READER), std::memory_order_relaxed); s.my_internal_lock.store(RELEASED, std::memory_order_relaxed); // The CAS must have release semantics, because we are // "sending" the fields initialized above to other actors. 
// We need acquire semantics, because we are acquiring the predecessor (or mutex if no predecessor) queuing_rw_mutex::scoped_lock* predecessor = m.q_tail.exchange(&s, std::memory_order_acq_rel); if( write ) { // Acquiring for write if( predecessor ) { ITT_NOTIFY(sync_prepare, s.my_mutex); predecessor = tricky_pointer(predecessor) & ~FLAG; __TBB_ASSERT( !predecessor->my_next, "the predecessor has another successor!"); tricky_pointer::store(predecessor->my_next, &s, std::memory_order_release); // We are acquiring the mutex spin_wait_until_eq(s.my_going, 1U, std::memory_order_acquire); } } else { // Acquiring for read #if __TBB_USE_ITT_NOTIFY bool sync_prepare_done = false; #endif if( predecessor ) { unsigned char pred_state{}; __TBB_ASSERT( !s.my_prev.load(std::memory_order_relaxed), "the predecessor is already set" ); if( tricky_pointer(predecessor) & FLAG ) { /* this is only possible if predecessor is an upgrading reader and it signals us to wait */ pred_state = STATE_UPGRADE_WAITING; predecessor = tricky_pointer(predecessor) & ~FLAG; } else { // Load predecessor->my_state now, because once predecessor->my_next becomes // non-null, we must assume that *predecessor might be destroyed. pred_state = predecessor->my_state.load(std::memory_order_relaxed); if (pred_state == STATE_READER) { // Notify the previous reader to unblock us. predecessor->my_state.compare_exchange_strong(pred_state, STATE_READER_UNBLOCKNEXT, std::memory_order_relaxed); } if (pred_state == STATE_ACTIVEREADER) { // either we initially read it or CAS failed // Active reader means that the predecessor already acquired the mutex and cannot notify us. // Therefore, we need to acquire the mutex ourselves by re-reading predecessor state. (void)predecessor->my_state.load(std::memory_order_acquire); } } tricky_pointer::store(s.my_prev, predecessor, std::memory_order_relaxed); __TBB_ASSERT( !( tricky_pointer(predecessor) & FLAG ), "use of corrupted pointer!" 
); __TBB_ASSERT( !predecessor->my_next.load(std::memory_order_relaxed), "the predecessor has another successor!"); tricky_pointer::store(predecessor->my_next, &s, std::memory_order_release); if( pred_state != STATE_ACTIVEREADER ) { #if __TBB_USE_ITT_NOTIFY sync_prepare_done = true; ITT_NOTIFY(sync_prepare, s.my_mutex); #endif // We are acquiring the mutex spin_wait_until_eq(s.my_going, 1U, std::memory_order_acquire); } } // The protected state must have been acquired here before it can be further released to any other reader(s): unsigned char old_state = STATE_READER; // When this reader is signaled by previous actor it acquires the mutex. // We need to build happens-before relation with all other coming readers that will read our ACTIVEREADER // without blocking on my_going. Therefore, we need to publish ACTIVEREADER with release semantics. // On fail it is relaxed, because we will build happens-before on my_going. s.my_state.compare_exchange_strong(old_state, STATE_ACTIVEREADER, std::memory_order_release, std::memory_order_relaxed); if( old_state!=STATE_READER ) { #if __TBB_USE_ITT_NOTIFY if( !sync_prepare_done ) ITT_NOTIFY(sync_prepare, s.my_mutex); #endif // Failed to become active reader -> need to unblock the next waiting reader first __TBB_ASSERT( s.my_state.load(std::memory_order_relaxed)==STATE_READER_UNBLOCKNEXT, "unexpected state" ); spin_wait_while_eq(s.my_next, 0U, std::memory_order_acquire); /* my_state should be changed before unblocking the next otherwise it might finish and another thread can get our old state and left blocked */ s.my_state.store(STATE_ACTIVEREADER, std::memory_order_relaxed); tricky_pointer::load(s.my_next, std::memory_order_relaxed)->my_going.store(1U, std::memory_order_release); } __TBB_ASSERT(s.my_state.load(std::memory_order_relaxed) == STATE_ACTIVEREADER, "unlocked reader is active reader"); } ITT_NOTIFY(sync_acquired, s.my_mutex); } //! 
A method to acquire queuing_rw_mutex if it is free static bool try_acquire(d1::queuing_rw_mutex& m, d1::queuing_rw_mutex::scoped_lock& s, bool write) { __TBB_ASSERT( !s.my_mutex, "scoped_lock is already holding a mutex"); if( m.q_tail.load(std::memory_order_relaxed) ) return false; // Someone already took the lock // Must set all fields before the exchange, because once the // exchange executes, *this becomes accessible to other threads. s.my_prev.store(0U, std::memory_order_relaxed); s.my_next.store(0U, std::memory_order_relaxed); s.my_going.store(0U, std::memory_order_relaxed); // TODO: remove dead assignment? s.my_state.store(d1::queuing_rw_mutex::scoped_lock::state_t(write ? STATE_WRITER : STATE_ACTIVEREADER), std::memory_order_relaxed); s.my_internal_lock.store(RELEASED, std::memory_order_relaxed); // The CAS must have release semantics, because we are // "sending" the fields initialized above to other actors. // We need acquire semantics, because we are acquiring the mutex d1::queuing_rw_mutex::scoped_lock* expected = nullptr; if (!m.q_tail.compare_exchange_strong(expected, &s, std::memory_order_acq_rel)) return false; // Someone already took the lock s.my_mutex = &m; ITT_NOTIFY(sync_acquired, s.my_mutex); return true; } //! A method to release queuing_rw_mutex lock static void release(d1::queuing_rw_mutex::scoped_lock& s) { __TBB_ASSERT(s.my_mutex!=nullptr, "no lock acquired"); ITT_NOTIFY(sync_releasing, s.my_mutex); if( s.my_state.load(std::memory_order_relaxed) == STATE_WRITER ) { // Acquired for write // The logic below is the same as "writerUnlock", but elides // "return" from the middle of the routine. // In the statement below, acquire semantics of reading my_next is required // so that following operations with fields of my_next are safe. 
d1::queuing_rw_mutex::scoped_lock* next = tricky_pointer::load(s.my_next, std::memory_order_acquire); if( !next ) { d1::queuing_rw_mutex::scoped_lock* expected = &s; // Release mutex on success otherwise wait for successor publication if( s.my_mutex->q_tail.compare_exchange_strong(expected, nullptr, std::memory_order_release, std::memory_order_relaxed) ) { // this was the only item in the queue, and the queue is now empty. goto done; } spin_wait_while_eq(s.my_next, 0U, std::memory_order_relaxed); next = tricky_pointer::load(s.my_next, std::memory_order_acquire); } next->my_going.store(2U, std::memory_order_relaxed); // protect next queue node from being destroyed too early // If the next is STATE_UPGRADE_WAITING, it is expected to acquire all other released readers via release // sequence in next->my_state. In that case, we need to preserve release sequence in next->my_state // contributed by other reader. So, there are two approaches not to break the release sequence: // 1. Use read-modify-write (exchange) operation to store with release the UPGRADE_LOSER state; // 2. Acquire the release sequence and store the sequence and UPGRADE_LOSER state. // The second approach seems better on x86 because it does not involve interlocked operations. // Therefore, we read next->my_state with acquire while it is not required for else branch to get the // release sequence. if( next->my_state.load(std::memory_order_acquire)==STATE_UPGRADE_WAITING ) { // the next waiting for upgrade means this writer was upgraded before. acquire_internal_lock(s); // Responsibility transition, the one who reads uncorrupted my_prev will do release. // Guarantee that above store of 2 into next->my_going happens-before resetting of next->my_prev d1::queuing_rw_mutex::scoped_lock* tmp = tricky_pointer::exchange(next->my_prev, nullptr, std::memory_order_release); // Pass the release sequence that we acquired with the above load of next->my_state. 
next->my_state.store(STATE_UPGRADE_LOSER, std::memory_order_release); // We are releasing the mutex next->my_going.store(1U, std::memory_order_release); unblock_or_wait_on_internal_lock(s, get_flag(tmp)); } else { // next->state cannot be STATE_UPGRADE_REQUESTED __TBB_ASSERT( next->my_state.load(std::memory_order_relaxed) & (STATE_COMBINED_WAITINGREADER | STATE_WRITER), "unexpected state" ); __TBB_ASSERT( !( next->my_prev.load(std::memory_order_relaxed) & FLAG ), "use of corrupted pointer!" ); // Guarantee that above store of 2 into next->my_going happens-before resetting of next->my_prev tricky_pointer::store(next->my_prev, nullptr, std::memory_order_release); // We are releasing the mutex next->my_going.store(1U, std::memory_order_release); } } else { // Acquired for read // The basic idea it to build happens-before relation with left and right readers via prev and next. In addition, // the first reader should acquire the left (prev) signal and propagate to right (next). To simplify, we always // build happens-before relation between left and right (left is happened before right). queuing_rw_mutex::scoped_lock *tmp = nullptr; retry: // Addition to the original paper: Mark my_prev as in use queuing_rw_mutex::scoped_lock *predecessor = tricky_pointer::fetch_add(s.my_prev, FLAG, std::memory_order_acquire); if( predecessor ) { if( !(try_acquire_internal_lock(*predecessor)) ) { // Failed to acquire the lock on predecessor. The predecessor either unlinks or upgrades. // In the second case, it could or could not know my "in use" flag - need to check // Responsibility transition, the one who reads uncorrupted my_prev will do release. 
tmp = tricky_pointer::compare_exchange_strong(s.my_prev, tricky_pointer(predecessor) | FLAG, predecessor, std::memory_order_acquire); if( !(tricky_pointer(tmp) & FLAG) ) { __TBB_ASSERT(tricky_pointer::load(s.my_prev, std::memory_order_relaxed) != (tricky_pointer(predecessor) | FLAG), nullptr); // Now owner of predecessor is waiting for _us_ to release its lock release_internal_lock(*predecessor); } // else the "in use" flag is back -> the predecessor didn't get it and will release itself; nothing to do tmp = nullptr; goto retry; } __TBB_ASSERT(predecessor && predecessor->my_internal_lock.load(std::memory_order_relaxed)==ACQUIRED, "predecessor's lock is not acquired"); tricky_pointer::store(s.my_prev, predecessor, std::memory_order_relaxed); acquire_internal_lock(s); tricky_pointer::store(predecessor->my_next, nullptr, std::memory_order_release); d1::queuing_rw_mutex::scoped_lock* expected = &s; if( !tricky_pointer::load(s.my_next, std::memory_order_acquire) && !s.my_mutex->q_tail.compare_exchange_strong(expected, predecessor, std::memory_order_release) ) { spin_wait_while_eq( s.my_next, 0U, std::memory_order_acquire ); } __TBB_ASSERT( !(s.my_next.load(std::memory_order_relaxed) & FLAG), "use of corrupted pointer" ); // my_next is acquired either with load or spin_wait. 
if(d1::queuing_rw_mutex::scoped_lock *const l_next = tricky_pointer::load(s.my_next, std::memory_order_relaxed) ) { // I->next != nil, TODO: rename to next after clearing up and adapting the n in the comment two lines below // Equivalent to I->next->prev = I->prev but protected against (prev[n]&FLAG)!=0 tmp = tricky_pointer::exchange(l_next->my_prev, predecessor, std::memory_order_release); // I->prev->next = I->next; __TBB_ASSERT(tricky_pointer::load(s.my_prev, std::memory_order_relaxed)==predecessor, nullptr); predecessor->my_next.store(s.my_next.load(std::memory_order_relaxed), std::memory_order_release); } // Safe to release in the order opposite to acquiring which makes the code simpler release_internal_lock(*predecessor); } else { // No predecessor when we looked acquire_internal_lock(s); // "exclusiveLock(&I->EL)" d1::queuing_rw_mutex::scoped_lock* next = tricky_pointer::load(s.my_next, std::memory_order_acquire); if( !next ) { d1::queuing_rw_mutex::scoped_lock* expected = &s; // Release mutex on success otherwise wait for successor publication if( !s.my_mutex->q_tail.compare_exchange_strong(expected, nullptr, std::memory_order_release, std::memory_order_relaxed) ) { spin_wait_while_eq( s.my_next, 0U, std::memory_order_relaxed ); next = tricky_pointer::load(s.my_next, std::memory_order_acquire); } else { goto unlock_self; } } next->my_going.store(2U, std::memory_order_relaxed); // Responsibility transition, the one who reads uncorrupted my_prev will do release. 
tmp = tricky_pointer::exchange(next->my_prev, nullptr, std::memory_order_release); next->my_going.store(1U, std::memory_order_release); } unlock_self: unblock_or_wait_on_internal_lock(s, get_flag(tmp)); } done: // Lifetime synchronization, no need to build happens-before relation spin_wait_while_eq( s.my_going, 2U, std::memory_order_relaxed ); s.initialize(); } static bool downgrade_to_reader(d1::queuing_rw_mutex::scoped_lock& s) { if ( s.my_state.load(std::memory_order_relaxed) == STATE_ACTIVEREADER ) return true; // Already a reader ITT_NOTIFY(sync_releasing, s.my_mutex); d1::queuing_rw_mutex::scoped_lock* next = tricky_pointer::load(s.my_next, std::memory_order_acquire); if( !next ) { s.my_state.store(STATE_READER, std::memory_order_seq_cst); // the following load of q_tail must not be reordered with setting STATE_READER above if( &s == s.my_mutex->q_tail.load(std::memory_order_seq_cst) ) { unsigned char old_state = STATE_READER; // When this reader is signaled by previous actor it acquires the mutex. // We need to build happens-before relation with all other coming readers that will read our ACTIVEREADER // without blocking on my_going. Therefore, we need to publish ACTIVEREADER with release semantics. // On fail it is relaxed, because we will build happens-before on my_going. s.my_state.compare_exchange_strong(old_state, STATE_ACTIVEREADER, std::memory_order_release, std::memory_order_relaxed); if( old_state==STATE_READER ) return true; // Downgrade completed } /* wait for the next to register */ spin_wait_while_eq(s.my_next, 0U, std::memory_order_relaxed); next = tricky_pointer::load(s.my_next, std::memory_order_acquire); } __TBB_ASSERT( next, "still no successor at this point!" ); if( next->my_state.load(std::memory_order_relaxed) & STATE_COMBINED_WAITINGREADER ) next->my_going.store(1U, std::memory_order_release); // If the next is STATE_UPGRADE_WAITING, it is expected to acquire all other released readers via release // sequence in next->my_state. 
In that case, we need to preserve release sequence in next->my_state // contributed by other reader. So, there are two approaches not to break the release sequence: // 1. Use read-modify-write (exchange) operation to store with release the UPGRADE_LOSER state; // 2. Acquire the release sequence and store the sequence and UPGRADE_LOSER state. // The second approach seems better on x86 because it does not involve interlocked operations. // Therefore, we read next->my_state with acquire while it is not required for else branch to get the // release sequence. else if( next->my_state.load(std::memory_order_acquire)==STATE_UPGRADE_WAITING ) // the next waiting for upgrade means this writer was upgraded before. // To safe release sequence on next->my_state read it with acquire next->my_state.store(STATE_UPGRADE_LOSER, std::memory_order_release); s.my_state.store(STATE_ACTIVEREADER, std::memory_order_release); return true; } static bool upgrade_to_writer(d1::queuing_rw_mutex::scoped_lock& s) { if (s.my_state.load(std::memory_order_relaxed) == STATE_WRITER) { // Already a writer return true; } __TBB_ASSERT(s.my_state.load(std::memory_order_relaxed) == STATE_ACTIVEREADER, "only active reader can be updated"); queuing_rw_mutex::scoped_lock* tmp{}; queuing_rw_mutex::scoped_lock* me = &s; ITT_NOTIFY(sync_releasing, s.my_mutex); // Publish ourselves into my_state that other UPGRADE_WAITING actors can acquire our state. s.my_state.store(STATE_UPGRADE_REQUESTED, std::memory_order_release); requested: __TBB_ASSERT( !(s.my_next.load(std::memory_order_relaxed) & FLAG), "use of corrupted pointer!" 
); acquire_internal_lock(s); d1::queuing_rw_mutex::scoped_lock* expected = &s; if( !s.my_mutex->q_tail.compare_exchange_strong(expected, tricky_pointer(me)|FLAG, std::memory_order_acq_rel) ) { spin_wait_while_eq( s.my_next, 0U, std::memory_order_relaxed ); queuing_rw_mutex::scoped_lock * next; next = tricky_pointer::fetch_add(s.my_next, FLAG, std::memory_order_acquire); // While we were READER the next READER might reach STATE_UPGRADE_WAITING state. // Therefore, it did not build happens before relation with us and we need to acquire the // next->my_state to build the happens before relation ourselves unsigned short n_state = next->my_state.load(std::memory_order_acquire); /* the next reader can be blocked by our state. the best thing to do is to unblock it */ if( n_state & STATE_COMBINED_WAITINGREADER ) next->my_going.store(1U, std::memory_order_release); // Responsibility transition, the one who reads uncorrupted my_prev will do release. tmp = tricky_pointer::exchange(next->my_prev, &s, std::memory_order_release); unblock_or_wait_on_internal_lock(s, get_flag(tmp)); if( n_state & (STATE_COMBINED_READER | STATE_UPGRADE_REQUESTED) ) { // save next|FLAG for simplicity of following comparisons tmp = tricky_pointer(next)|FLAG; for( atomic_backoff b; tricky_pointer::load(s.my_next, std::memory_order_relaxed)==tmp; b.pause() ) { if( s.my_state.load(std::memory_order_acquire) & STATE_COMBINED_UPGRADING ) { if( tricky_pointer::load(s.my_next, std::memory_order_acquire)==tmp ) tricky_pointer::store(s.my_next, next, std::memory_order_relaxed); goto waiting; } } __TBB_ASSERT(tricky_pointer::load(s.my_next, std::memory_order_relaxed) != (tricky_pointer(next)|FLAG), nullptr); goto requested; } else { __TBB_ASSERT( n_state & (STATE_WRITER | STATE_UPGRADE_WAITING), "unexpected state"); __TBB_ASSERT( (tricky_pointer(next)|FLAG) == tricky_pointer::load(s.my_next, std::memory_order_relaxed), nullptr); tricky_pointer::store(s.my_next, next, std::memory_order_relaxed); } } else { /* 
We are in the tail; whoever comes next is blocked by q_tail&FLAG */ release_internal_lock(s); } // if( this != my_mutex->q_tail... ) { unsigned char old_state = STATE_UPGRADE_REQUESTED; // If we reach STATE_UPGRADE_WAITING state we do not build happens-before relation with READER on // left. We delegate this responsibility to READER on left when it try upgrading. Therefore, we are releasing // on success. // Otherwise, on fail, we already acquired the next->my_state. s.my_state.compare_exchange_strong(old_state, STATE_UPGRADE_WAITING, std::memory_order_release, std::memory_order_relaxed); } waiting: __TBB_ASSERT( !( s.my_next.load(std::memory_order_relaxed) & FLAG ), "use of corrupted pointer!" ); __TBB_ASSERT( s.my_state & STATE_COMBINED_UPGRADING, "wrong state at upgrade waiting_retry" ); __TBB_ASSERT( me==&s, nullptr ); ITT_NOTIFY(sync_prepare, s.my_mutex); /* if no one was blocked by the "corrupted" q_tail, turn it back */ expected = tricky_pointer(me)|FLAG; s.my_mutex->q_tail.compare_exchange_strong(expected, &s, std::memory_order_release); queuing_rw_mutex::scoped_lock * predecessor; // Mark my_prev as 'in use' to prevent predecessor from releasing predecessor = tricky_pointer::fetch_add(s.my_prev, FLAG, std::memory_order_acquire); if( predecessor ) { bool success = try_acquire_internal_lock(*predecessor); { // While the predecessor pointer (my_prev) is in use (FLAG is set), we can safely update the node`s state. // Corrupted pointer transitions responsibility to release the predecessor`s node on us. unsigned char old_state = STATE_UPGRADE_REQUESTED; // Try to build happens before with the upgrading READER on left. If fail, the predecessor state is not // important for us because it will acquire our state. predecessor->my_state.compare_exchange_strong(old_state, STATE_UPGRADE_WAITING, std::memory_order_release, std::memory_order_relaxed); } if( !success ) { // Responsibility transition, the one who reads uncorrupted my_prev will do release. 
tmp = tricky_pointer::compare_exchange_strong(s.my_prev, tricky_pointer(predecessor)|FLAG, predecessor, std::memory_order_acquire); if( tricky_pointer(tmp) & FLAG ) { tricky_pointer::spin_wait_while_eq(s.my_prev, predecessor); predecessor = tricky_pointer::load(s.my_prev, std::memory_order_relaxed); } else { // TODO: spin_wait condition seems never reachable tricky_pointer::spin_wait_while_eq(s.my_prev, tricky_pointer(predecessor)|FLAG); release_internal_lock(*predecessor); } } else { tricky_pointer::store(s.my_prev, predecessor, std::memory_order_relaxed); release_internal_lock(*predecessor); tricky_pointer::spin_wait_while_eq(s.my_prev, predecessor); predecessor = tricky_pointer::load(s.my_prev, std::memory_order_relaxed); } if( predecessor ) goto waiting; } else { tricky_pointer::store(s.my_prev, nullptr, std::memory_order_relaxed); } __TBB_ASSERT( !predecessor && !s.my_prev, nullptr ); // additional lifetime issue prevention checks // wait for the successor to finish working with my fields wait_for_release_of_internal_lock(s); // now wait for the predecessor to finish working with my fields spin_wait_while_eq( s.my_going, 2U ); bool result = ( s.my_state != STATE_UPGRADE_LOSER ); s.my_state.store(STATE_WRITER, std::memory_order_relaxed); s.my_going.store(1U, std::memory_order_relaxed); ITT_NOTIFY(sync_acquired, s.my_mutex); return result; } static bool is_writer(const d1::queuing_rw_mutex::scoped_lock& m) { return m.my_state.load(std::memory_order_relaxed) == STATE_WRITER; } static void construct(d1::queuing_rw_mutex& m) { suppress_unused_warning(m); ITT_SYNC_CREATE(&m, _T("tbb::queuing_rw_mutex"), _T("")); } }; void __TBB_EXPORTED_FUNC acquire(d1::queuing_rw_mutex& m, d1::queuing_rw_mutex::scoped_lock& s, bool write) { queuing_rw_mutex_impl::acquire(m, s, write); } bool __TBB_EXPORTED_FUNC try_acquire(d1::queuing_rw_mutex& m, d1::queuing_rw_mutex::scoped_lock& s, bool write) { return queuing_rw_mutex_impl::try_acquire(m, s, write); } void __TBB_EXPORTED_FUNC 
release(d1::queuing_rw_mutex::scoped_lock& s) { queuing_rw_mutex_impl::release(s); } bool __TBB_EXPORTED_FUNC upgrade_to_writer(d1::queuing_rw_mutex::scoped_lock& s) { return queuing_rw_mutex_impl::upgrade_to_writer(s); } bool __TBB_EXPORTED_FUNC is_writer(const d1::queuing_rw_mutex::scoped_lock& s) { return queuing_rw_mutex_impl::is_writer(s); } bool __TBB_EXPORTED_FUNC downgrade_to_reader(d1::queuing_rw_mutex::scoped_lock& s) { return queuing_rw_mutex_impl::downgrade_to_reader(s); } TBB_EXPORT void __TBB_EXPORTED_FUNC construct(d1::queuing_rw_mutex& m) { queuing_rw_mutex_impl::construct(m); } } // namespace r1 } // namespace detail } // namespace tbb ================================================ FILE: third-party/tbb/src/tbb/rml_base.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ // Header guard and namespace names follow rml conventions. #ifndef __RML_rml_base_H #define __RML_rml_base_H #include #if _WIN32||_WIN64 #include #endif /* _WIN32||_WIN64 */ #ifdef RML_PURE_VIRTUAL_HANDLER #define RML_PURE(T) {RML_PURE_VIRTUAL_HANDLER(); return (T)0;} #else #define RML_PURE(T) = 0; #endif namespace rml { class server; class versioned_object { public: //! A version number typedef unsigned version_type; virtual ~versioned_object() {} //! Get version of this object /** The version number is incremented when a incompatible change is introduced. The version number is invariant for the lifetime of the object. 
*/ virtual version_type version() const RML_PURE(version_type) }; //! Represents a client's job for an execution context. /** A job object is constructed by the client. Not derived from versioned_object because version is same as for client. */ class job { friend class server; }; //! Information that client provides to server when asking for a server. /** The instance must endure at least until acknowledge_close_connection is called. */ class client: public versioned_object { public: //! Typedef for convenience of derived classes in other namespaces. typedef ::rml::job job; //! Index of a job in a job pool typedef unsigned size_type; //! Maximum number of threads that client can exploit profitably if nothing else is running on the machine. /** The returned value should remain invariant for the lifetime of the connection. [idempotent] */ virtual size_type max_job_count() const RML_PURE(size_type) //! Minimum stack size for each job. 0 means to use default stack size. [idempotent] virtual std::size_t min_stack_size() const RML_PURE(std::size_t) //! Server calls this routine when it needs client to create a job object. virtual job* create_one_job() RML_PURE(job*) //! Acknowledge that all jobs have been cleaned up. /** Called by server in response to request_close_connection after cleanup(job) has been called for each job. */ virtual void acknowledge_close_connection() RML_PURE(void) //! Inform client that server is done with *this. /** Client should destroy the job. Not necessarily called by execution context represented by *this. Never called while any other thread is working on the job. */ virtual void cleanup( job& ) RML_PURE(void) // In general, we should not add new virtual methods, because that would // break derived classes. Think about reserving some vtable slots. }; // Information that server provides to client. // Virtual functions are routines provided by the server for the client to call. class server: public versioned_object { public: //! 
Typedef for convenience of derived classes. typedef ::rml::job job; #if _WIN32||_WIN64 typedef void* execution_resource_t; #endif //! Request that connection to server be closed. /** Causes each job associated with the client to have its cleanup method called, possibly by a thread different than the thread that created the job. This method can return before all cleanup methods return. Actions that have to wait after all cleanup methods return should be part of client::acknowledge_close_connection. Pass true as exiting if request_close_connection() is called because exit() is called. In that case, it is the client's responsibility to make sure all threads are terminated. In all other cases, pass false. */ virtual void request_close_connection( bool exiting = false ) = 0; //! Called by client thread when it reaches a point where it cannot make progress until other threads do. virtual void yield() = 0; //! Called by client to indicate a change in the number of non-RML threads that are running. /** This is a performance hint to the RML to adjust how many threads it should let run concurrently. The delta is the change in the number of non-RML threads that are running. For example, a value of 1 means the client has started running another thread, and a value of -1 indicates that the client has blocked or terminated one of its threads. */ virtual void independent_thread_number_changed( int delta ) = 0; //! Default level of concurrency for which RML strives when there are no non-RML threads running. /** Normally, the value is the hardware concurrency minus one. The "minus one" accounts for the thread created by main(). */ virtual unsigned default_concurrency() const = 0; }; class factory { public: //! status results enum status_type { st_success=0, st_connection_exists, st_not_found, st_incompatible }; protected: //! Pointer to routine that waits for server to indicate when client can close itself. status_type (*my_wait_to_close_routine)( factory& ); public: //! 
Library handle for use by RML. #if _WIN32||_WIN64 HMODULE library_handle; #else void* library_handle; #endif /* _WIN32||_WIN64 */ //! Special marker to keep dll from being unloaded prematurely static const std::size_t c_dont_unload = 1; }; //! Typedef for callback functions to print server info typedef void (*server_info_callback_t)( void* arg, const char* server_info ); } // namespace rml #endif /* __RML_rml_base_H */ ================================================ FILE: third-party/tbb/src/tbb/rml_tbb.cpp ================================================ /* Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "oneapi/tbb/detail/_assert.h" #include "rml_tbb.h" #include "dynamic_link.h" namespace tbb { namespace detail { namespace r1 { namespace rml { #define MAKE_SERVER(x) DLD(__TBB_make_rml_server,x) #define GET_INFO(x) DLD(__TBB_call_with_my_server_info,x) #define SERVER tbb_server #define CLIENT tbb_client #define FACTORY tbb_factory #if __TBB_WEAK_SYMBOLS_PRESENT #pragma weak __TBB_make_rml_server #pragma weak __TBB_call_with_my_server_info extern "C" { ::rml::factory::status_type __TBB_make_rml_server( rml::tbb_factory& f, rml::tbb_server*& server, rml::tbb_client& client ); void __TBB_call_with_my_server_info( ::rml::server_info_callback_t cb, void* arg ); } #endif /* __TBB_WEAK_SYMBOLS_PRESENT */ #if TBB_USE_DEBUG #define DEBUG_SUFFIX "_debug" #else #define DEBUG_SUFFIX #endif /* TBB_USE_DEBUG */ // RML_SERVER_NAME is the name of the RML server library. #if _WIN32 || _WIN64 #define RML_SERVER_NAME "irml" DEBUG_SUFFIX ".dll" #elif __APPLE__ #define RML_SERVER_NAME "libirml" DEBUG_SUFFIX ".1.dylib" #elif __FreeBSD__ || __NetBSD__ || __OpenBSD__ || __sun || _AIX #define RML_SERVER_NAME "libirml" DEBUG_SUFFIX ".so" #elif __unix__ #define RML_SERVER_NAME "libirml" DEBUG_SUFFIX ".so.1" #else #error Unknown OS #endif const ::rml::versioned_object::version_type CLIENT_VERSION = 2; #if __TBB_WEAK_SYMBOLS_PRESENT #pragma weak __RML_open_factory #pragma weak __RML_close_factory extern "C" { ::rml::factory::status_type __RML_open_factory ( ::rml::factory&, ::rml::versioned_object::version_type&, ::rml::versioned_object::version_type ); void __RML_close_factory( ::rml::factory& f ); } #endif /* __TBB_WEAK_SYMBOLS_PRESENT */ ::rml::factory::status_type FACTORY::open() { // Failure of following assertion indicates that factory is already open, or not zero-inited. 
__TBB_ASSERT_EX( !library_handle, nullptr); status_type (*open_factory_routine)( factory&, version_type&, version_type ); dynamic_link_descriptor server_link_table[4] = { DLD(__RML_open_factory,open_factory_routine), MAKE_SERVER(my_make_server_routine), DLD(__RML_close_factory,my_wait_to_close_routine), GET_INFO(my_call_with_server_info_routine), }; status_type result; if ( dynamic_link( RML_SERVER_NAME, server_link_table, 4, &library_handle ) ) { version_type server_version; result = (*open_factory_routine)( *this, server_version, CLIENT_VERSION ); // server_version can be checked here for incompatibility if necessary. } else { library_handle = nullptr; result = st_not_found; } return result; } void FACTORY::close() { if ( library_handle ) (*my_wait_to_close_routine)(*this); if ( (size_t)library_handle>FACTORY::c_dont_unload ) { dynamic_unlink(library_handle); library_handle = nullptr; } } ::rml::factory::status_type FACTORY::make_server( SERVER*& s, CLIENT& c) { // Failure of following assertion means that factory was not successfully opened. __TBB_ASSERT_EX( my_make_server_routine, nullptr); return (*my_make_server_routine)(*this,s,c); } } // namespace rml } // namespace r1 } // namespace detail } // namespace tbb ================================================ FILE: third-party/tbb/src/tbb/rml_tbb.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/

// Header guard and namespace names follow TBB conventions.

#ifndef __TBB_rml_tbb_H
#define __TBB_rml_tbb_H

#include "oneapi/tbb/version.h"
#include "rml_base.h"

namespace tbb {
namespace detail {
namespace r1 {
namespace rml {

//------------------------------------------------------------------------
// Classes instantiated by the server
//------------------------------------------------------------------------

//! Represents a set of oneTBB worker threads provided by the server.
/** Extends ::rml::server with the calls declared below. */
class tbb_server: public ::rml::server {
public:
    //! Inform server of adjustments in the number of workers that the client can profitably use.
    virtual void adjust_job_count_estimate( int delta ) = 0;

#if _WIN32 || _WIN64
    //! Inform server of a oneTBB external thread.
    virtual void register_external_thread( execution_resource_t& v ) = 0;

    //! Inform server that the oneTBB external thread is done with its work.
    virtual void unregister_external_thread( execution_resource_t v ) = 0;
#endif /* _WIN32||_WIN64 */
};

//------------------------------------------------------------------------
// Classes instantiated by the client
//------------------------------------------------------------------------

//! Client interface implemented by TBB; adds process() to ::rml::client.
class tbb_client: public ::rml::client {
public:
    //! Defined by TBB to steal a task and execute it.
    /** Called by server when it wants an execution context to do some TBB work.
        The method should return when it is okay for the thread to yield indefinitely. */
    virtual void process( job& ) RML_PURE(void)
};

//! Factory through which a client obtains a tbb_server.
/** Client must ensure that instance is zero-inited, typically by being a file-scope object. */
class tbb_factory: public ::rml::factory {
    //! Pointer to routine that creates an RML server.
    status_type (*my_make_server_routine)( tbb_factory&, tbb_server*&, tbb_client& );

    //! Pointer to routine that calls callback function with server version info.
    void (*my_call_with_server_info_routine)( ::rml::server_info_callback_t cb, void* arg );

public:
    // Convenience aliases for the version, client and server types used by this factory.
    typedef ::rml::versioned_object::version_type version_type;
    typedef tbb_client client_type;
    typedef tbb_server server_type;

    //! Open factory.
    /** Dynamically links against RML library.
        Returns st_success, st_incompatible, or st_not_found. */
    status_type open();

    //! Factory method to be called by client to create a server object.
    /** Factory must be open.
        Returns st_success, or st_incompatible. */
    status_type make_server( server_type*&, client_type& );

    //! Close factory
    void close();
};

} // namespace rml
} // namespace r1
} // namespace detail
} // namespace tbb

#endif /*__TBB_rml_tbb_H */

================================================
FILE: third-party/tbb/src/tbb/rml_thread_monitor.h
================================================
/*
    Copyright (c) 2005-2022 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

// All platform-specific threading support is encapsulated here.

#ifndef __RML_thread_monitor_H
#define __RML_thread_monitor_H

// NOTE(review): the header names after the bare `#include` directives below are
// missing in this copy of the file — restore them from the upstream source.
#if __TBB_USE_WINAPI
#include
#include
#include //_alloca
#include "misc.h" // support for processor groups
#if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00)
#include
#endif
#elif __TBB_USE_POSIX
#include
#include
#include
#include
#else
#error Unsupported platform
#endif
#include
#include "oneapi/tbb/detail/_template_helpers.h"

#include "itt_notify.h"
#include "semaphore.h"

// All platform-specific threading support is in this header.
#if (_WIN32||_WIN64)&&!__TBB_ipf // Deal with 64K aliasing. The formula for "offset" is a Fibonacci hash function, // which has the desirable feature of spreading out the offsets fairly evenly // without knowing the total number of offsets, and furthermore unlikely to // accidentally cancel out other 64K aliasing schemes that Microsoft might implement later. // See Knuth Vol 3. "Theorem S" for details on Fibonacci hashing. // The second statement is really does need "volatile", otherwise the compiler might remove the _alloca. #define AVOID_64K_ALIASING(idx) \ std::size_t offset = (idx+1) * 40503U % (1U<<16); \ void* volatile sink_for_alloca = _alloca(offset); \ __TBB_ASSERT_EX(sink_for_alloca, "_alloca failed"); #else // Linux thread allocators avoid 64K aliasing. #define AVOID_64K_ALIASING(idx) tbb::detail::suppress_unused_warning(idx) #endif /* _WIN32||_WIN64 */ namespace tbb { namespace detail { namespace r1 { // Forward declaration: throws std::runtime_error with what() returning error_code description prefixed with aux_info void handle_perror(int error_code, const char* aux_info); namespace rml { namespace internal { #if __TBB_USE_ITT_NOTIFY static const ::tbb::detail::r1::tchar *SyncType_RML = _T("%Constant"); static const ::tbb::detail::r1::tchar *SyncObj_ThreadMonitor = _T("RML Thr Monitor"); #endif /* __TBB_USE_ITT_NOTIFY */ //! Monitor with limited two-phase commit form of wait. /** At most one thread should wait on an instance at a time. */ class thread_monitor { public: thread_monitor() { ITT_SYNC_CREATE(&my_sema, SyncType_RML, SyncObj_ThreadMonitor); } ~thread_monitor() {} //! Notify waiting thread /** Can be called by any thread. */ void notify(); //! Wait for notification void wait(); #if __TBB_USE_WINAPI typedef HANDLE handle_type; #define __RML_DECL_THREAD_ROUTINE unsigned WINAPI typedef unsigned (WINAPI *thread_routine_type)(void*); //! 
Launch a thread static handle_type launch( thread_routine_type thread_routine, void* arg, std::size_t stack_size, const size_t* worker_index = nullptr ); #elif __TBB_USE_POSIX typedef pthread_t handle_type; #define __RML_DECL_THREAD_ROUTINE void* typedef void*(*thread_routine_type)(void*); //! Launch a thread static handle_type launch( thread_routine_type thread_routine, void* arg, std::size_t stack_size ); #endif /* __TBB_USE_POSIX */ //! Join thread static void join(handle_type handle); //! Detach thread static void detach_thread(handle_type handle); private: // The protection from double notification of the binary semaphore std::atomic my_notified{ false }; binary_semaphore my_sema; #if __TBB_USE_POSIX static void check( int error_code, const char* routine ); #endif }; #if __TBB_USE_WINAPI #ifndef STACK_SIZE_PARAM_IS_A_RESERVATION #define STACK_SIZE_PARAM_IS_A_RESERVATION 0x00010000 #endif // _beginthreadex API is not available in Windows 8 Store* applications, so use std::thread instead #if __TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00) inline thread_monitor::handle_type thread_monitor::launch( thread_routine_type thread_function, void* arg, std::size_t, const std::size_t*) { //TODO: check that exception thrown from std::thread is not swallowed silently std::thread* thread_tmp=new std::thread(thread_function, arg); return thread_tmp->native_handle(); } #else inline thread_monitor::handle_type thread_monitor::launch( thread_routine_type thread_routine, void* arg, std::size_t stack_size, const std::size_t* worker_index ) { unsigned thread_id; int number_of_processor_groups = ( worker_index ) ? NumberOfProcessorGroups() : 0; unsigned create_flags = ( number_of_processor_groups > 1 ) ? 
CREATE_SUSPENDED : 0; HANDLE h = (HANDLE)_beginthreadex( nullptr, unsigned(stack_size), thread_routine, arg, STACK_SIZE_PARAM_IS_A_RESERVATION | create_flags, &thread_id ); if( !h ) { handle_perror(0, "thread_monitor::launch: _beginthreadex failed\n"); } if ( number_of_processor_groups > 1 ) { MoveThreadIntoProcessorGroup( h, FindProcessorGroupIndex( static_cast(*worker_index) ) ); ResumeThread( h ); } return h; } #endif //__TBB_WIN8UI_SUPPORT && (_WIN32_WINNT < 0x0A00) void thread_monitor::join(handle_type handle) { #if TBB_USE_ASSERT DWORD res = #endif WaitForSingleObjectEx(handle, INFINITE, FALSE); __TBB_ASSERT( res==WAIT_OBJECT_0, nullptr); #if TBB_USE_ASSERT BOOL val = #endif CloseHandle(handle); __TBB_ASSERT( val, nullptr); } void thread_monitor::detach_thread(handle_type handle) { #if TBB_USE_ASSERT BOOL val = #endif CloseHandle(handle); __TBB_ASSERT( val, nullptr); } #endif /* __TBB_USE_WINAPI */ #if __TBB_USE_POSIX inline void thread_monitor::check( int error_code, const char* routine ) { if( error_code ) { handle_perror(error_code, routine); } } inline thread_monitor::handle_type thread_monitor::launch( void* (*thread_routine)(void*), void* arg, std::size_t stack_size ) { // FIXME - consider more graceful recovery than just exiting if a thread cannot be launched. // Note that there are some tricky situations to deal with, such that the thread is already // grabbed as part of an OpenMP team. pthread_attr_t s; check(pthread_attr_init( &s ), "pthread_attr_init has failed"); if( stack_size>0 ) check(pthread_attr_setstacksize( &s, stack_size ), "pthread_attr_setstack_size has failed" ); // pthread_create(2) can spuriously fail with EAGAIN. We retry // max_num_tries times with progressively longer wait times. 
pthread_t handle; const int max_num_tries = 20; int error = EAGAIN; for (int i = 0; i < max_num_tries && error == EAGAIN; i++) { if (i != 0) { // Wait i milliseconds struct timespec ts = {0, i * 1000 * 1000}; nanosleep(&ts, NULL); } error = pthread_create(&handle, &s, thread_routine, arg); } if (error) handle_perror(error, "pthread_create has failed"); check( pthread_attr_destroy( &s ), "pthread_attr_destroy has failed" ); return handle; } void thread_monitor::join(handle_type handle) { check(pthread_join(handle, nullptr), "pthread_join has failed"); } void thread_monitor::detach_thread(handle_type handle) { check(pthread_detach(handle), "pthread_detach has failed"); } #endif /* __TBB_USE_POSIX */ inline void thread_monitor::notify() { // Check that the semaphore is not notified twice if (!my_notified.exchange(true, std::memory_order_release)) { my_sema.V(); } } inline void thread_monitor::wait() { my_sema.P(); // memory_order_seq_cst is required here to be ordered with // further load checking shutdown state my_notified.store(false, std::memory_order_seq_cst); } } // namespace internal } // namespace rml } // namespace r1 } // namespace detail } // namespace tbb #endif /* __RML_thread_monitor_H */ ================================================ FILE: third-party/tbb/src/tbb/rtm_mutex.cpp ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "oneapi/tbb/detail/_assert.h" #include "oneapi/tbb/detail/_rtm_mutex.h" #include "itt_notify.h" #include "governor.h" #include "misc.h" #include namespace tbb { namespace detail { namespace r1 { struct rtm_mutex_impl { // maximum number of times to retry // TODO: experiment on retry values. static constexpr int retry_threshold = 10; using transaction_result_type = decltype(begin_transaction()); //! Release speculative mutex static void release(d1::rtm_mutex::scoped_lock& s) { switch(s.m_transaction_state) { case d1::rtm_mutex::rtm_state::rtm_transacting: __TBB_ASSERT(is_in_transaction(), "m_transaction_state && not speculating"); end_transaction(); s.m_mutex = nullptr; break; case d1::rtm_mutex::rtm_state::rtm_real: s.m_mutex->unlock(); s.m_mutex = nullptr; break; case d1::rtm_mutex::rtm_state::rtm_none: __TBB_ASSERT(false, "mutex is not locked, but in release"); break; default: __TBB_ASSERT(false, "invalid m_transaction_state"); } s.m_transaction_state = d1::rtm_mutex::rtm_state::rtm_none; } //! Acquire lock on the given mutex. static void acquire(d1::rtm_mutex& m, d1::rtm_mutex::scoped_lock& s, bool only_speculate) { __TBB_ASSERT(s.m_transaction_state == d1::rtm_mutex::rtm_state::rtm_none, "scoped_lock already in transaction"); if(governor::speculation_enabled()) { int num_retries = 0; transaction_result_type abort_code = 0; do { if(m.m_flag.load(std::memory_order_acquire)) { if(only_speculate) return; spin_wait_while_eq(m.m_flag, true); } // _xbegin returns -1 on success or the abort code, so capture it if((abort_code = begin_transaction()) == transaction_result_type(speculation_successful_begin)) { // started speculation if(m.m_flag.load(std::memory_order_relaxed)) { abort_transaction(); } s.m_transaction_state = d1::rtm_mutex::rtm_state::rtm_transacting; // Don not wrap the following assignment to a function, // because it can abort the transaction in debug. Need mutex for release(). 
s.m_mutex = &m; return; // successfully started speculation } ++num_retries; } while((abort_code & speculation_retry) != 0 && (num_retries < retry_threshold)); } if(only_speculate) return; s.m_mutex = &m; s.m_mutex->lock(); s.m_transaction_state = d1::rtm_mutex::rtm_state::rtm_real; } //! Try to acquire lock on the given mutex. static bool try_acquire(d1::rtm_mutex& m, d1::rtm_mutex::scoped_lock& s) { acquire(m, s, /*only_speculate=*/true); if (s.m_transaction_state == d1::rtm_mutex::rtm_state::rtm_transacting) { return true; } __TBB_ASSERT(s.m_transaction_state == d1::rtm_mutex::rtm_state::rtm_none, nullptr); // transacting acquire failed. try_lock the real mutex if (m.try_lock()) { s.m_mutex = &m; s.m_transaction_state = d1::rtm_mutex::rtm_state::rtm_real; return true; } return false; } }; void __TBB_EXPORTED_FUNC acquire(d1::rtm_mutex& m, d1::rtm_mutex::scoped_lock& s, bool only_speculate) { rtm_mutex_impl::acquire(m, s, only_speculate); } bool __TBB_EXPORTED_FUNC try_acquire(d1::rtm_mutex& m, d1::rtm_mutex::scoped_lock& s) { return rtm_mutex_impl::try_acquire(m, s); } void __TBB_EXPORTED_FUNC release(d1::rtm_mutex::scoped_lock& s) { rtm_mutex_impl::release(s); } } // namespace r1 } // namespace detail } // namespace tbb ================================================ FILE: third-party/tbb/src/tbb/rtm_rw_mutex.cpp ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "oneapi/tbb/detail/_assert.h" #include "oneapi/tbb/detail/_rtm_rw_mutex.h" #include "itt_notify.h" #include "governor.h" #include "misc.h" #include namespace tbb { namespace detail { namespace r1 { struct rtm_rw_mutex_impl { // maximum number of times to retry // TODO: experiment on retry values. static constexpr int retry_threshold_read = 10; static constexpr int retry_threshold_write = 10; using transaction_result_type = decltype(begin_transaction()); //! Release speculative mutex static void release(d1::rtm_rw_mutex::scoped_lock& s) { switch(s.m_transaction_state) { case d1::rtm_rw_mutex::rtm_type::rtm_transacting_writer: case d1::rtm_rw_mutex::rtm_type::rtm_transacting_reader: __TBB_ASSERT(is_in_transaction(), "m_transaction_state && not speculating"); end_transaction(); s.m_mutex = nullptr; break; case d1::rtm_rw_mutex::rtm_type::rtm_real_reader: __TBB_ASSERT(!s.m_mutex->write_flag.load(std::memory_order_relaxed), "write_flag set but read lock acquired"); s.m_mutex->unlock_shared(); s.m_mutex = nullptr; break; case d1::rtm_rw_mutex::rtm_type::rtm_real_writer: __TBB_ASSERT(s.m_mutex->write_flag.load(std::memory_order_relaxed), "write_flag unset but write lock acquired"); s.m_mutex->write_flag.store(false, std::memory_order_relaxed); s.m_mutex->unlock(); s.m_mutex = nullptr; break; case d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex: __TBB_ASSERT(false, "rtm_not_in_mutex, but in release"); break; default: __TBB_ASSERT(false, "invalid m_transaction_state"); } s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex; } //! Acquire write lock on the given mutex. 
static void acquire_writer(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s, bool only_speculate) { __TBB_ASSERT(s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex, "scoped_lock already in transaction"); if(governor::speculation_enabled()) { int num_retries = 0; transaction_result_type abort_code = 0; do { if(m.m_state.load(std::memory_order_acquire)) { if(only_speculate) return; spin_wait_until_eq(m.m_state, d1::rtm_rw_mutex::state_type(0)); } // _xbegin returns -1 on success or the abort code, so capture it if((abort_code = begin_transaction()) == transaction_result_type(speculation_successful_begin)) { // started speculation if(m.m_state.load(std::memory_order_relaxed)) { // add spin_rw_mutex to read-set. // reader or writer grabbed the lock, so abort. abort_transaction(); } s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_transacting_writer; // Don not wrap the following assignment to a function, // because it can abort the transaction in debug. Need mutex for release(). s.m_mutex = &m; return; // successfully started speculation } ++num_retries; } while((abort_code & speculation_retry) != 0 && (num_retries < retry_threshold_write)); } if(only_speculate) return; s.m_mutex = &m; // should apply a real try_lock... s.m_mutex->lock(); // kill transactional writers __TBB_ASSERT(!m.write_flag.load(std::memory_order_relaxed), "After acquire for write, write_flag already true"); m.write_flag.store(true, std::memory_order_relaxed); // kill transactional readers s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_writer; } //! Acquire read lock on given mutex. // only_speculate : true if we are doing a try_acquire. If true and we fail to speculate, don't // really acquire the lock, return and do a try_acquire on the contained spin_rw_mutex. If // the lock is already held by a writer, just return. 
// Parameters:
//   m              - mutex to acquire for reading.
//   s              - scoped_lock that receives the result; asserted to be in the
//                    rtm_not_in_mutex state on entry.
//   only_speculate - true for a try-style call: never spin-waits on write_flag and never takes
//                    the real lock. If speculation cannot be started, returns with s unchanged.
// Behaviour:
//   * When governor::speculation_enabled() is true, a transaction is attempted; attempts are
//     repeated while the abort code has a speculation_retry bit set and fewer than
//     retry_threshold_read attempts have been made. Only write_flag (not m_state) is checked
//     before and inside the transaction. On success s becomes rtm_transacting_reader and
//     s.m_mutex points at m.
//   * Otherwise, or once speculation is given up (and only_speculate is false), the real lock
//     is taken with lock_shared() and s becomes rtm_real_reader; write_flag is not modified.
static void acquire_reader(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s, bool only_speculate) { __TBB_ASSERT(s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex, "scoped_lock already in transaction"); if(governor::speculation_enabled()) { int num_retries = 0; transaction_result_type abort_code = 0; do { // if in try_acquire, and lock is held as writer, don't attempt to speculate. if(m.write_flag.load(std::memory_order_acquire)) { if(only_speculate) return; spin_wait_while_eq(m.write_flag, true); } // _xbegin returns -1 on success or the abort code, so capture it if((abort_code = begin_transaction()) == transaction_result_type(speculation_successful_begin)) { // started speculation if(m.write_flag.load(std::memory_order_relaxed)) { // add write_flag to read-set. abort_transaction(); // writer grabbed the lock, so abort. } s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_transacting_reader; // Do not wrap the following assignment in a function, // because it can abort the transaction in debug. Need mutex for release(). s.m_mutex = &m; return; // successfully started speculation } // fallback path // retry only if there is any hope of getting into a transaction soon // Retry in the following cases (from Section 8.3.5 of // Intel(R) Architecture Instruction Set Extensions Programming Reference): // 1. abort caused by XABORT instruction (bit 0 of EAX register is set) // 2. the transaction may succeed on a retry (bit 1 of EAX register is set) // 3. if another logical processor conflicted with a memory address // that was part of the transaction that aborted (bit 2 of EAX register is set) // That is, retry if (abort_code & 0x7) is non-zero ++num_retries; } while((abort_code & speculation_retry) != 0 && (num_retries < retry_threshold_read)); } if(only_speculate) return; s.m_mutex = &m; s.m_mutex->lock_shared(); s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_reader; } //! Upgrade reader to become a writer.
/** Returns whether the upgrade happened without releasing and re-acquiring the lock.
    Behaviour depends on s.m_transaction_state:
    - rtm_real_reader: s is marked rtm_real_writer, s.m_mutex->upgrade() is called,
      write_flag is set to true, and the result of upgrade() is returned.
    - rtm_transacting_reader: if m_state of the mutex is non-zero, the speculative read is
      released and the mutex is re-acquired through acquire_writer(m, s, false); returns false.
      Otherwise s is simply relabelled rtm_transacting_writer; returns true.
    - any other state: asserts ("Invalid state for upgrade") and returns false. */
static bool upgrade(d1::rtm_rw_mutex::scoped_lock& s) { switch(s.m_transaction_state) { case d1::rtm_rw_mutex::rtm_type::rtm_real_reader: { s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_writer; bool no_release = s.m_mutex->upgrade(); __TBB_ASSERT(!s.m_mutex->write_flag.load(std::memory_order_relaxed), "After upgrade, write_flag already true"); s.m_mutex->write_flag.store(true, std::memory_order_relaxed); return no_release; } case d1::rtm_rw_mutex::rtm_type::rtm_transacting_reader: { d1::rtm_rw_mutex& m = *s.m_mutex; if(m.m_state.load(std::memory_order_acquire)) { // add spin_rw_mutex to read-set. // Real reader or writer holds the lock; so commit the read and re-acquire for write. release(s); acquire_writer(m, s, false); return false; } else { s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_transacting_writer; return true; } } default: __TBB_ASSERT(false, "Invalid state for upgrade"); return false; } } //! Downgrade writer to a reader. static bool downgrade(d1::rtm_rw_mutex::scoped_lock& s) { switch (s.m_transaction_state) { case d1::rtm_rw_mutex::rtm_type::rtm_real_writer: s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_reader; __TBB_ASSERT(s.m_mutex->write_flag.load(std::memory_order_relaxed), "Before downgrade write_flag not true"); s.m_mutex->write_flag.store(false, std::memory_order_relaxed); s.m_mutex->downgrade(); return true; case d1::rtm_rw_mutex::rtm_type::rtm_transacting_writer: s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_transacting_reader; return true; default: __TBB_ASSERT(false, "Invalid state for downgrade"); return false; } } //! Try to acquire write lock on the given mutex. // There may be reader(s) which acquired the spin_rw_mutex, as well as possibly // transactional reader(s). If this is the case, the acquire will fail, and assigning // write_flag will kill the transactors.
So we only assign write_flag if we have successfully // acquired the lock. static bool try_acquire_writer(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s) { acquire_writer(m, s, /*only_speculate=*/true); if (s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_transacting_writer) { return true; } __TBB_ASSERT(s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex, nullptr); // transacting write acquire failed. try_lock the real mutex if (m.try_lock()) { s.m_mutex = &m; // only shoot down readers if we're not transacting ourselves __TBB_ASSERT(!m.write_flag.load(std::memory_order_relaxed), "After try_acquire_writer, write_flag already true"); m.write_flag.store(true, std::memory_order_relaxed); s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_writer; return true; } return false; } //! Try to acquire read lock on the given mutex. static bool try_acquire_reader(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s) { // speculatively acquire the lock. If this fails, do try_lock_shared on the spin_rw_mutex. acquire_reader(m, s, /*only_speculate=*/true); if (s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_transacting_reader) { return true; } __TBB_ASSERT(s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex, nullptr); // transacting read acquire failed. try_lock_shared the real mutex if (m.try_lock_shared()) { s.m_mutex = &m; s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_reader; return true; } return false; } }; void __TBB_EXPORTED_FUNC acquire_writer(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s, bool only_speculate) { rtm_rw_mutex_impl::acquire_writer(m, s, only_speculate); } //! Internal acquire read lock. // only_speculate == true if we're doing a try_lock, else false. void __TBB_EXPORTED_FUNC acquire_reader(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s, bool only_speculate) { rtm_rw_mutex_impl::acquire_reader(m, s, only_speculate); } //! 
Internal upgrade reader to become a writer. bool __TBB_EXPORTED_FUNC upgrade(d1::rtm_rw_mutex::scoped_lock& s) { return rtm_rw_mutex_impl::upgrade(s); } //! Internal downgrade writer to become a reader. bool __TBB_EXPORTED_FUNC downgrade(d1::rtm_rw_mutex::scoped_lock& s) { return rtm_rw_mutex_impl::downgrade(s); } //! Internal try_acquire write lock. bool __TBB_EXPORTED_FUNC try_acquire_writer(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s) { return rtm_rw_mutex_impl::try_acquire_writer(m, s); } //! Internal try_acquire read lock. bool __TBB_EXPORTED_FUNC try_acquire_reader(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s) { return rtm_rw_mutex_impl::try_acquire_reader(m, s); } //! Internal release lock. void __TBB_EXPORTED_FUNC release(d1::rtm_rw_mutex::scoped_lock& s) { rtm_rw_mutex_impl::release(s); } } // namespace r1 } // namespace detail } // namespace tbb ================================================ FILE: third-party/tbb/src/tbb/scheduler_common.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef _TBB_scheduler_common_H #define _TBB_scheduler_common_H #include "oneapi/tbb/detail/_utils.h" #include "oneapi/tbb/detail/_template_helpers.h" #include "oneapi/tbb/detail/_task.h" #include "oneapi/tbb/detail/_machine.h" #include "oneapi/tbb/task_group.h" #include "oneapi/tbb/cache_aligned_allocator.h" #include "oneapi/tbb/tbb_allocator.h" #include "itt_notify.h" #include "co_context.h" #include "misc.h" #include "governor.h" #ifndef __TBB_SCHEDULER_MUTEX_TYPE #define __TBB_SCHEDULER_MUTEX_TYPE tbb::spin_mutex #endif // TODO: add conditional inclusion based on specified type #include "oneapi/tbb/spin_mutex.h" #include "oneapi/tbb/mutex.h" #if TBB_USE_ASSERT #include #endif #include #include #include // unique_ptr #include //! Mutex type for global locks in the scheduler using scheduler_mutex_type = __TBB_SCHEDULER_MUTEX_TYPE; #if _MSC_VER && !defined(__INTEL_COMPILER) // Workaround for overzealous compiler warnings // These particular warnings are so ubiquitous that no attempt is made to narrow // the scope of the warnings. #pragma warning (disable: 4100 4127 4312 4244 4267 4706) #endif namespace tbb { namespace detail { namespace r1 { class arena; class mail_inbox; class mail_outbox; class market; class observer_proxy; enum task_stream_accessor_type { front_accessor = 0, back_nonnull_accessor }; template class task_stream; using isolation_type = std::intptr_t; constexpr isolation_type no_isolation = 0; struct cache_aligned_deleter { template void operator() (T* ptr) const { ptr->~T(); cache_aligned_deallocate(ptr); } }; template using cache_aligned_unique_ptr = std::unique_ptr; template cache_aligned_unique_ptr make_cache_aligned_unique(Args&& ...args) { return cache_aligned_unique_ptr(new (cache_aligned_allocate(sizeof(T))) T(std::forward(args)...)); } //------------------------------------------------------------------------ // Extended execute data //------------------------------------------------------------------------ //! 
Execute data used on a task dispatcher side, reflects a current execution state struct execution_data_ext : d1::execution_data { task_dispatcher* task_disp{}; isolation_type isolation{}; d1::wait_context* wait_ctx{}; }; //------------------------------------------------------------------------ // Task accessor //------------------------------------------------------------------------ //! Interpretation of reserved task fields inside a task dispatcher struct task_accessor { static constexpr std::uint64_t proxy_task_trait = 1; static constexpr std::uint64_t resume_task_trait = 2; static d1::task_group_context*& context(d1::task& t) { task_group_context** tgc = reinterpret_cast(&t.m_reserved[0]); return *tgc; } static isolation_type& isolation(d1::task& t) { isolation_type* tag = reinterpret_cast(&t.m_reserved[2]); return *tag; } static void set_proxy_trait(d1::task& t) { // TODO: refactor proxy tasks not to work on uninitialized memory. //__TBB_ASSERT((t.m_version_and_traits & proxy_task_trait) == 0, nullptr); t.m_version_and_traits |= proxy_task_trait; } static bool is_proxy_task(d1::task& t) { return (t.m_version_and_traits & proxy_task_trait) != 0; } static void set_resume_trait(d1::task& t) { __TBB_ASSERT((t.m_version_and_traits & resume_task_trait) == 0, nullptr); t.m_version_and_traits |= resume_task_trait; } static bool is_resume_task(d1::task& t) { return (t.m_version_and_traits & resume_task_trait) != 0; } }; //------------------------------------------------------------------------ //! Extended variant of the standard offsetof macro /** The standard offsetof macro is not sufficient for TBB as it can be used for POD-types only. The constant 0x1000 (not nullptr) is necessary to appease GCC. **/ #define __TBB_offsetof(class_name, member_name) \ ((ptrdiff_t)&(reinterpret_cast(0x1000)->member_name) - 0x1000) //! 
Returns address of the object containing a member with the given name and address #define __TBB_get_object_ref(class_name, member_name, member_addr) \ (*reinterpret_cast((char*)member_addr - __TBB_offsetof(class_name, member_name))) //! Helper class for tracking floating point context and task group context switches /** Assuming presence of an itt collector, in addition to keeping track of floating point context, this class emits itt events to indicate begin and end of task group context execution **/ template class context_guard_helper { const d1::task_group_context* curr_ctx; d1::cpu_ctl_env guard_cpu_ctl_env; d1::cpu_ctl_env curr_cpu_ctl_env; public: context_guard_helper() : curr_ctx(nullptr) { guard_cpu_ctl_env.get_env(); curr_cpu_ctl_env = guard_cpu_ctl_env; } ~context_guard_helper() { if (curr_cpu_ctl_env != guard_cpu_ctl_env) guard_cpu_ctl_env.set_env(); if (report_tasks && curr_ctx) ITT_TASK_END; } // The function is called from bypass dispatch loop on the hot path. // Consider performance issues when refactoring. void set_ctx(const d1::task_group_context* ctx) { if (!ctx) return; const d1::cpu_ctl_env* ctl = reinterpret_cast(&ctx->my_cpu_ctl_env); // Compare the FPU settings directly because the context can be reused between parallel algorithms. if (*ctl != curr_cpu_ctl_env) { curr_cpu_ctl_env = *ctl; curr_cpu_ctl_env.set_env(); } if (report_tasks && ctx != curr_ctx) { // if task group context was active, report end of current execution frame. if (curr_ctx) ITT_TASK_END; // reporting begin of new task group context execution frame. // using address of task group context object to group tasks (parent). // id of task execution frame is nullptr and reserved for future use. 
ITT_TASK_BEGIN(ctx, ctx->my_name, nullptr); curr_ctx = ctx; } } #if _WIN64 void restore_default() { if (curr_cpu_ctl_env != guard_cpu_ctl_env) { guard_cpu_ctl_env.set_env(); curr_cpu_ctl_env = guard_cpu_ctl_env; } } #endif // _WIN64 }; #if (_WIN32 || _WIN64 || __unix__ || __APPLE__) && (__TBB_x86_32 || __TBB_x86_64) #if _MSC_VER #pragma intrinsic(__rdtsc) #endif inline std::uint64_t machine_time_stamp() { #if __INTEL_COMPILER return _rdtsc(); #elif _MSC_VER return __rdtsc(); #else std::uint32_t hi, lo; __asm__ __volatile__("rdtsc" : "=d"(hi), "=a"(lo)); return (std::uint64_t(hi) << 32) | lo; #endif } inline void prolonged_pause_impl() { // Assumption based on practice: 1000-2000 ticks seems to be a suitable invariant for the // majority of platforms. Currently, skip platforms that define __TBB_STEALING_PAUSE // because these platforms require very careful tuning. std::uint64_t prev = machine_time_stamp(); const std::uint64_t finish = prev + 1000; atomic_backoff backoff; do { backoff.bounded_pause(); std::uint64_t curr = machine_time_stamp(); if (curr <= prev) // Possibly, the current logical thread was moved to another hardware thread or an overflow has occurred. break; prev = curr; } while (prev < finish); } #else inline void prolonged_pause_impl() { #ifdef __TBB_ipf static const long PauseTime = 1500; #else static const long PauseTime = 80; #endif // TODO IDEA: Update PauseTime adaptively? machine_pause(PauseTime); } #endif inline void prolonged_pause() { #if __TBB_WAITPKG_INTRINSICS_PRESENT if (governor::wait_package_enabled()) { std::uint64_t time_stamp = machine_time_stamp(); // _tpause function directs the processor to enter an implementation-dependent optimized state // until the Time Stamp Counter reaches or exceeds the value specified in the second parameter. // Constant "1000" is ticks to wait for. // TODO : Modify this parameter based on empirical study of benchmarks.
// First parameter 0 selects between a lower power (cleared) or faster wakeup (set) optimized state. _tpause(0, time_stamp + 1000); } else #endif prolonged_pause_impl(); } // TODO: investigate possibility to work with number of CPU cycles // because for different configurations this number of pauses + yields // will be calculated in different amount of CPU cycles // for example use rdtsc for it class stealing_loop_backoff { const int my_pause_threshold; const int my_yield_threshold; int my_pause_count; int my_yield_count; public: // my_yield_threshold = 100 is an experimental value. Ideally, once we start calling __TBB_Yield(), // the time spent spinning before calling out_of_work() should be approximately // the time it takes for a thread to be woken up. Doing so would guarantee that we do // no worse than 2x the optimal spin time. Or perhaps a time-slice quantum is the right amount. stealing_loop_backoff(int num_workers, int yields_multiplier) : my_pause_threshold{ 2 * (num_workers + 1) } , my_yield_threshold{100 * yields_multiplier} , my_pause_count{} , my_yield_count{} {} bool pause() { prolonged_pause(); if (my_pause_count++ >= my_pause_threshold) { my_pause_count = my_pause_threshold; d0::yield(); if (my_yield_count++ >= my_yield_threshold) { my_yield_count = my_yield_threshold; return true; } } return false; } void reset_wait() { my_pause_count = my_yield_count = 0; } }; //------------------------------------------------------------------------ // Exception support //------------------------------------------------------------------------ //! Task group state change propagation global epoch /** Together with generic_scheduler::my_context_state_propagation_epoch forms cross-thread signaling mechanism that allows to avoid locking at the hot path of normal execution flow. When a descendant task group context is registered or unregistered, the global and local epochs are compared. 
If they differ, a state change is being propagated, and thus registration/deregistration routines take slower branch that may block (at most one thread of the pool can be blocked at any moment). Otherwise the control path is lock-free and fast. **/ extern std::atomic the_context_state_propagation_epoch; //! Mutex guarding state change propagation across task groups forest. /** Also protects modification of related data structures. **/ typedef scheduler_mutex_type context_state_propagation_mutex_type; extern context_state_propagation_mutex_type the_context_state_propagation_mutex; class tbb_exception_ptr { std::exception_ptr my_ptr; public: static tbb_exception_ptr* allocate() noexcept; //! Destroys this object /** Note that objects of this type can be created only by the allocate() method. **/ void destroy() noexcept; //! Throws the contained exception. void throw_self(); private: tbb_exception_ptr(const std::exception_ptr& src) : my_ptr(src) {} }; // class tbb_exception_ptr //------------------------------------------------------------------------ // Debugging support //------------------------------------------------------------------------ #if TBB_USE_ASSERT static const std::uintptr_t venom = tbb::detail::select_size_t_constant<0xDEADBEEFU, 0xDDEEAADDDEADBEEFULL>::value; inline void poison_value(std::uintptr_t& val) { val = venom; } inline void poison_value(std::atomic& val) { val.store(venom, std::memory_order_relaxed); } /** Expected to be used in assertions only, thus no empty form is defined. **/ inline bool is_alive(std::uintptr_t v) { return v != venom; } /** Logically, this method should be a member of class task. But we do not want to publish it, so it is here instead. */ inline void assert_task_valid(const d1::task* t) { assert_pointer_valid(t); } #else /* !TBB_USE_ASSERT */ /** In contrast to debug version poison_value() is a macro here because the variable used as its argument may be undefined in release builds.
**/ #define poison_value(g) ((void)0) inline void assert_task_valid(const d1::task*) {} #endif /* !TBB_USE_ASSERT */ struct suspend_point_type { #if __TBB_RESUMABLE_TASKS //! The arena related to this task_dispatcher arena* m_arena{ nullptr }; //! The random for the resume task FastRandom m_random; //! The flag is raised when the original owner should return to this task dispatcher. std::atomic m_is_owner_recalled{ false }; //! Indicates if the resume task should be placed to the critical task stream. bool m_is_critical{ false }; //! Associated coroutine co_context m_co_context; //! Suspend point before resume suspend_point_type* m_prev_suspend_point{nullptr}; // Possible state transitions: // A -> S -> N -> A // A -> N -> S -> N -> A enum class stack_state { active, // some thread is working with this stack suspended, // no thread is working with this stack notified // some thread tried to resume this stack }; //! The flag required to protect suspend finish and resume call std::atomic m_stack_state{stack_state::active}; void resume(suspend_point_type* sp) { __TBB_ASSERT(m_stack_state.load(std::memory_order_relaxed) != stack_state::suspended, "The stack is expected to be active"); sp->m_prev_suspend_point = this; // Do not access sp after resume m_co_context.resume(sp->m_co_context); __TBB_ASSERT(m_stack_state.load(std::memory_order_relaxed) != stack_state::active, nullptr); finilize_resume(); } void finilize_resume() { m_stack_state.store(stack_state::active, std::memory_order_relaxed); // Set the suspended state for the stack that we left. If the state is already notified, it means that // someone already tried to resume our previous stack but failed. So, we need to resume it.
// m_prev_suspend_point might be nullptr when destroying co_context based on threads if (m_prev_suspend_point && m_prev_suspend_point->m_stack_state.exchange(stack_state::suspended) == stack_state::notified) { r1::resume(m_prev_suspend_point); } m_prev_suspend_point = nullptr; } bool try_notify_resume() { // Check that stack is already suspended. Return false if not yet. return m_stack_state.exchange(stack_state::notified) == stack_state::suspended; } void recall_owner() { __TBB_ASSERT(m_stack_state.load(std::memory_order_relaxed) == stack_state::suspended, nullptr); m_stack_state.store(stack_state::notified, std::memory_order_relaxed); m_is_owner_recalled.store(true, std::memory_order_release); } struct resume_task final : public d1::task { task_dispatcher& m_target; explicit resume_task(task_dispatcher& target) : m_target(target) { task_accessor::set_resume_trait(*this); } d1::task* execute(d1::execution_data& ed) override; d1::task* cancel(d1::execution_data&) override { __TBB_ASSERT(false, "The resume task cannot be canceled"); return nullptr; } } m_resume_task; suspend_point_type(arena* a, std::size_t stack_size, task_dispatcher& target); #endif /*__TBB_RESUMABLE_TASKS */ }; #if _MSC_VER && !defined(__INTEL_COMPILER) // structure was padded due to alignment specifier #pragma warning( push ) #pragma warning( disable: 4324 ) #endif class alignas (max_nfs_size) task_dispatcher { public: // TODO: reconsider low level design to better organize dependencies and files. friend class thread_data; friend class arena_slot; friend class nested_arena_context; friend class delegated_task; friend struct base_waiter; //! The list of possible post resume actions. enum class post_resume_action { invalid, register_waiter, cleanup, notify, none }; //! The data of the current thread attached to this task_dispatcher thread_data* m_thread_data{ nullptr }; //! The current execution data execution_data_ext m_execute_data_ext; //! 
Properties struct properties { bool outermost{ true }; bool fifo_tasks_allowed{ true }; bool critical_task_allowed{ true }; } m_properties; //! Position in the call stack when stealing is still allowed. std::uintptr_t m_stealing_threshold{}; //! Suspend point (null if this task dispatcher has never been suspended) suspend_point_type* m_suspend_point{ nullptr }; //! Used to improve scalability of d1::wait_context by using per thread reference_counter std::unordered_map, std::equal_to, tbb_allocator> > m_reference_vertex_map; //! Attempt to get a task from the mailbox. /** Gets a task only if it has not been executed by its sender or a thief that has stolen it from the sender's task pool. Otherwise returns nullptr. This method is intended to be used only by the thread extracting the proxy from its mailbox. (In contrast to local task pool, mailbox can be read only by its owner). **/ d1::task* get_mailbox_task(mail_inbox& my_inbox, execution_data_ext& ed, isolation_type isolation); d1::task* get_critical_task(d1::task*, execution_data_ext&, isolation_type, bool); template d1::task* receive_or_steal_task(thread_data& tls, execution_data_ext& ed, Waiter& waiter, isolation_type isolation, bool outermost, bool criticality_absence); template d1::task* local_wait_for_all(d1::task * t, Waiter& waiter); task_dispatcher(const task_dispatcher&) = delete; bool can_steal(); public: task_dispatcher(arena* a); ~task_dispatcher() { if (m_suspend_point) { m_suspend_point->~suspend_point_type(); cache_aligned_deallocate(m_suspend_point); } for (auto& elem : m_reference_vertex_map) { d1::reference_vertex*& node = elem.second; node->~reference_vertex(); cache_aligned_deallocate(node); poison_pointer(node); } poison_pointer(m_thread_data); poison_pointer(m_suspend_point); } template d1::task* local_wait_for_all(d1::task* t, Waiter& waiter); bool allow_fifo_task(bool new_state) { bool old_state = m_properties.fifo_tasks_allowed; m_properties.fifo_tasks_allowed = new_state; return
old_state; } isolation_type set_isolation(isolation_type isolation) { isolation_type prev = m_execute_data_ext.isolation; m_execute_data_ext.isolation = isolation; return prev; } thread_data& get_thread_data() { __TBB_ASSERT(m_thread_data, nullptr); return *m_thread_data; } static void execute_and_wait(d1::task* t, d1::wait_context& wait_ctx, d1::task_group_context& w_ctx); void set_stealing_threshold(std::uintptr_t stealing_threshold) { bool assert_condition = (stealing_threshold == 0 && m_stealing_threshold != 0) || (stealing_threshold != 0 && m_stealing_threshold == 0); __TBB_ASSERT_EX( assert_condition, nullptr ); m_stealing_threshold = stealing_threshold; } d1::task* get_inbox_or_critical_task(execution_data_ext&, mail_inbox&, isolation_type, bool); d1::task* get_stream_or_critical_task(execution_data_ext&, arena&, task_stream&, unsigned& /*hint_for_stream*/, isolation_type, bool /*critical_allowed*/); d1::task* steal_or_get_critical(execution_data_ext&, arena&, unsigned /*arena_index*/, FastRandom&, isolation_type, bool /*critical_allowed*/); #if __TBB_RESUMABLE_TASKS /* [[noreturn]] */ void co_local_wait_for_all() noexcept; void suspend(suspend_callback_type suspend_callback, void* user_callback); void internal_suspend(); void do_post_resume_action(); bool resume(task_dispatcher& target); suspend_point_type* get_suspend_point(); void init_suspend_point(arena* a, std::size_t stack_size); friend void internal_resume(suspend_point_type*); void recall_point(); #endif /* __TBB_RESUMABLE_TASKS */ }; #if _MSC_VER && !defined(__INTEL_COMPILER) #pragma warning( pop ) #endif inline std::uintptr_t calculate_stealing_threshold(std::uintptr_t base, std::size_t stack_size) { __TBB_ASSERT(stack_size != 0, "Stack size cannot be zero"); __TBB_ASSERT(base > stack_size / 2, "Stack anchor calculation overflow"); return base - stack_size / 2; } struct task_group_context_impl { static void destroy(d1::task_group_context&); static void initialize(d1::task_group_context&); static 
void register_with(d1::task_group_context&, thread_data*); static void bind_to_impl(d1::task_group_context&, thread_data*); static void bind_to(d1::task_group_context&, thread_data*); static void propagate_task_group_state(d1::task_group_context&, std::atomic d1::task_group_context::*, d1::task_group_context&, uint32_t); static bool cancel_group_execution(d1::task_group_context&); static bool is_group_execution_cancelled(const d1::task_group_context&); static void reset(d1::task_group_context&); static void capture_fp_settings(d1::task_group_context&); static void copy_fp_settings(d1::task_group_context& ctx, const d1::task_group_context& src); }; //! Forward declaration for scheduler entities bool gcc_rethrow_exception_broken(); void fix_broken_rethrow(); //! Forward declaration: throws std::runtime_error with what() returning error_code description prefixed with aux_info void handle_perror(int error_code, const char* aux_info); } // namespace r1 } // namespace detail } // namespace tbb #endif /* _TBB_scheduler_common_H */ ================================================ FILE: third-party/tbb/src/tbb/semaphore.cpp ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "semaphore.h" #if __TBB_USE_SRWLOCK #include "dynamic_link.h" // Refers to src/tbb, not include/tbb #include "tbb_misc.h" #endif namespace tbb { namespace detail { namespace r1 { // TODO: For new win UI port, we can use SRWLock API without dynamic_link etc. #if __TBB_USE_SRWLOCK static std::atomic concmon_module_inited; void WINAPI init_binsem_using_event( SRWLOCK* h_ ) { srwl_or_handle* shptr = (srwl_or_handle*) h_; shptr->h = CreateEventEx( nullptr, nullptr, 0, EVENT_ALL_ACCESS|SEMAPHORE_ALL_ACCESS ); } void WINAPI acquire_binsem_using_event( SRWLOCK* h_ ) { srwl_or_handle* shptr = (srwl_or_handle*) h_; WaitForSingleObjectEx( shptr->h, INFINITE, FALSE ); } void WINAPI release_binsem_using_event( SRWLOCK* h_ ) { srwl_or_handle* shptr = (srwl_or_handle*) h_; SetEvent( shptr->h ); } static void (WINAPI *__TBB_init_binsem)( SRWLOCK* ) = (void (WINAPI *)(SRWLOCK*))&init_binsem_using_event; static void (WINAPI *__TBB_acquire_binsem)( SRWLOCK* ) = (void (WINAPI *)(SRWLOCK*))&acquire_binsem_using_event; static void (WINAPI *__TBB_release_binsem)( SRWLOCK* ) = (void (WINAPI *)(SRWLOCK*))&release_binsem_using_event; //! Table describing how to link the handlers.
static const dynamic_link_descriptor SRWLLinkTable[] = { DLD(InitializeSRWLock, __TBB_init_binsem), DLD(AcquireSRWLockExclusive, __TBB_acquire_binsem), DLD(ReleaseSRWLockExclusive, __TBB_release_binsem) }; inline void init_concmon_module() { __TBB_ASSERT( (uintptr_t)__TBB_init_binsem==(uintptr_t)&init_binsem_using_event, nullptr); if( dynamic_link( "Kernel32.dll", SRWLLinkTable, sizeof(SRWLLinkTable)/sizeof(dynamic_link_descriptor) ) ) { __TBB_ASSERT( (uintptr_t)__TBB_init_binsem!=(uintptr_t)&init_binsem_using_event, nullptr); __TBB_ASSERT( (uintptr_t)__TBB_acquire_binsem!=(uintptr_t)&acquire_binsem_using_event, nullptr); __TBB_ASSERT( (uintptr_t)__TBB_release_binsem!=(uintptr_t)&release_binsem_using_event, nullptr); } } binary_semaphore::binary_semaphore() { atomic_do_once( &init_concmon_module, concmon_module_inited ); __TBB_init_binsem( &my_sem.lock ); if( (uintptr_t)__TBB_init_binsem!=(uintptr_t)&init_binsem_using_event ) P(); } binary_semaphore::~binary_semaphore() { if( (uintptr_t)__TBB_init_binsem==(uintptr_t)&init_binsem_using_event ) CloseHandle( my_sem.h ); } void binary_semaphore::P() { __TBB_acquire_binsem( &my_sem.lock ); } void binary_semaphore::V() { __TBB_release_binsem( &my_sem.lock ); } #endif /* __TBB_USE_SRWLOCK */ } // namespace r1 } // namespace detail } // namespace tbb ================================================ FILE: third-party/tbb/src/tbb/semaphore.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_semaphore_H #define __TBB_semaphore_H #include "oneapi/tbb/detail/_utils.h" #if _WIN32||_WIN64 #include #elif __APPLE__ #include #else #include #ifdef TBB_USE_DEBUG #include #endif #endif /*_WIN32||_WIN64*/ #include #if __unix__ #if defined(__has_include) #define __TBB_has_include __has_include #else #define __TBB_has_include(x) 0 #endif /* Futex definitions */ #include #if defined(__linux__) || __TBB_has_include() #include #endif #if defined(SYS_futex) /* This section is included for Linux and some other systems that may support futexes.*/ #define __TBB_USE_FUTEX 1 /* If available, use typical headers where futex API is defined. While Linux and OpenBSD are known to provide such headers, other systems might have them as well. */ #if defined(__linux__) || __TBB_has_include() #include #elif defined(__OpenBSD__) || __TBB_has_include() #include #endif #include #include /* Some systems might not define the macros or use different names. In such case we expect the actual parameter values to match Linux: 0 for wait, 1 for wake. 
*/ #if defined(FUTEX_WAIT_PRIVATE) #define __TBB_FUTEX_WAIT FUTEX_WAIT_PRIVATE #elif defined(FUTEX_WAIT) #define __TBB_FUTEX_WAIT FUTEX_WAIT #else #define __TBB_FUTEX_WAIT 0 #endif #if defined(FUTEX_WAKE_PRIVATE) #define __TBB_FUTEX_WAKE FUTEX_WAKE_PRIVATE #elif defined(FUTEX_WAKE) #define __TBB_FUTEX_WAKE FUTEX_WAKE #else #define __TBB_FUTEX_WAKE 1 #endif #endif // SYS_futex #endif // __unix__ namespace tbb { namespace detail { namespace r1 { //////////////////////////////////////////////////////////////////////////////////////////////////// // Futex implementation //////////////////////////////////////////////////////////////////////////////////////////////////// #if __TBB_USE_FUTEX static inline int futex_wait( void *futex, int comparand ) { #ifdef __OpenBSD__ int r = ::futex((volatile uint32_t *)futex, __TBB_FUTEX_WAIT, comparand, nullptr, nullptr); #else int r = ::syscall(SYS_futex, futex, __TBB_FUTEX_WAIT, comparand, nullptr, nullptr, 0); #endif #if TBB_USE_ASSERT int e = errno; __TBB_ASSERT(r == 0 || r == EWOULDBLOCK || (r == -1 && (e == EAGAIN || e == EINTR)), "futex_wait failed."); #endif /* TBB_USE_ASSERT */ return r; } static inline int futex_wakeup_one( void *futex ) { #ifdef __OpenBSD__ int r = ::futex((volatile uint32_t *)futex, __TBB_FUTEX_WAKE, 1 , nullptr, nullptr); #else int r = ::syscall(SYS_futex, futex, __TBB_FUTEX_WAKE, 1, nullptr, nullptr, 0); #endif __TBB_ASSERT(r == 0 || r == 1, "futex_wakeup_one: more than one thread woken up?"); return r; } // Additional possible methods that are not required right now // static inline int futex_wakeup_all( void *futex ) { // int r = ::syscall( SYS_futex,futex,__TBB_FUTEX_WAKE,INT_MAX,nullptr,nullptr,0 ); // __TBB_ASSERT( r>=0, "futex_wakeup_all: error in waking up threads" ); // return r; // } #endif // __TBB_USE_FUTEX //////////////////////////////////////////////////////////////////////////////////////////////////// #if _WIN32||_WIN64 typedef LONG sem_count_t; //! 
Edsger Dijkstra's counting semaphore class semaphore : no_copy { static const int max_semaphore_cnt = MAXLONG; public: //! ctor semaphore(size_t start_cnt_ = 0) {init_semaphore(start_cnt_);} //! dtor ~semaphore() {CloseHandle( sem );} //! wait/acquire void P() {WaitForSingleObjectEx( sem, INFINITE, FALSE );} //! post/release void V() {ReleaseSemaphore( sem, 1, nullptr);} private: HANDLE sem; void init_semaphore(size_t start_cnt_) { sem = CreateSemaphoreEx( nullptr, LONG(start_cnt_), max_semaphore_cnt, nullptr, 0, SEMAPHORE_ALL_ACCESS ); } }; #elif __APPLE__ //! Edsger Dijkstra's counting semaphore class semaphore : no_copy { public: //! ctor semaphore(int start_cnt_ = 0) { my_sem = dispatch_semaphore_create(start_cnt_); } //! dtor ~semaphore() { dispatch_release(my_sem); } //! wait/acquire void P() { std::intptr_t ret = dispatch_semaphore_wait(my_sem, DISPATCH_TIME_FOREVER); __TBB_ASSERT_EX(ret == 0, "dispatch_semaphore_wait() failed"); } //! post/release void V() { dispatch_semaphore_signal(my_sem); } private: dispatch_semaphore_t my_sem; }; #else /* Linux/Unix */ typedef uint32_t sem_count_t; //! Edsger Dijkstra's counting semaphore class semaphore : no_copy { public: //! ctor semaphore(int start_cnt_ = 0 ) { init_semaphore( start_cnt_ ); } //! dtor ~semaphore() { int ret = sem_destroy( &sem ); __TBB_ASSERT_EX( !ret, nullptr); } //! wait/acquire void P() { while( sem_wait( &sem )!=0 ) __TBB_ASSERT( errno==EINTR, nullptr); } //! post/release void V() { sem_post( &sem ); } private: sem_t sem; void init_semaphore(int start_cnt_) { int ret = sem_init( &sem, /*shared among threads*/ 0, start_cnt_ ); __TBB_ASSERT_EX( !ret, nullptr); } }; #endif /* _WIN32||_WIN64 */ //! for performance reasons, we want specialized binary_semaphore #if _WIN32||_WIN64 #if !__TBB_USE_SRWLOCK //! binary_semaphore for concurrent_monitor class binary_semaphore : no_copy { public: //! ctor binary_semaphore() { my_sem = CreateEventEx( nullptr, nullptr, 0, EVENT_ALL_ACCESS ); } //! 
dtor ~binary_semaphore() { CloseHandle( my_sem ); } //! wait/acquire void P() { WaitForSingleObjectEx( my_sem, INFINITE, FALSE ); } //! post/release void V() { SetEvent( my_sem ); } private: HANDLE my_sem; }; #else /* __TBB_USE_SRWLOCK */ union srwl_or_handle { SRWLOCK lock; HANDLE h; }; //! binary_semaphore for concurrent_monitor class binary_semaphore : no_copy { public: //! ctor binary_semaphore(); //! dtor ~binary_semaphore(); //! wait/acquire void P(); //! post/release void V(); private: srwl_or_handle my_sem; }; #endif /* !__TBB_USE_SRWLOCK */ #elif __APPLE__ //! binary_semaphore for concurrent monitor using binary_semaphore = semaphore; #else /* Linux/Unix */ #if __TBB_USE_FUTEX class binary_semaphore : no_copy { // The implementation is equivalent to the "Mutex, Take 3" one // in the paper "Futexes Are Tricky" by Ulrich Drepper public: //! ctor binary_semaphore() { my_sem = 1; } //! dtor ~binary_semaphore() {} //! wait/acquire void P() { int s = 0; if( !my_sem.compare_exchange_strong( s, 1 ) ) { if( s!=2 ) s = my_sem.exchange( 2 ); while( s!=0 ) { // This loop deals with spurious wakeup futex_wait( &my_sem, 2 ); s = my_sem.exchange( 2 ); } } } //! post/release void V() { __TBB_ASSERT( my_sem.load(std::memory_order_relaxed)>=1, "multiple V()'s in a row?" ); if( my_sem.exchange( 0 )==2 ) futex_wakeup_one( &my_sem ); } private: std::atomic my_sem; // 0 - open; 1 - closed, no waits; 2 - closed, possible waits }; #else typedef uint32_t sem_count_t; //! binary_semaphore for concurrent monitor class binary_semaphore : no_copy { public: //! ctor binary_semaphore() { int ret = sem_init( &my_sem, /*shared among threads*/ 0, 0 ); __TBB_ASSERT_EX( !ret, nullptr); } //! dtor ~binary_semaphore() { int ret = sem_destroy( &my_sem ); __TBB_ASSERT_EX( !ret, nullptr); } //! wait/acquire void P() { while( sem_wait( &my_sem )!=0 ) __TBB_ASSERT( errno==EINTR, nullptr); } //! 
post/release void V() { sem_post( &my_sem ); } private: sem_t my_sem; }; #endif /* __TBB_USE_FUTEX */ #endif /* _WIN32||_WIN64 */ } // namespace r1 } // namespace detail } // namespace tbb #endif /* __TBB_semaphore_H */ ================================================ FILE: third-party/tbb/src/tbb/small_object_pool.cpp ================================================ /* Copyright (c) 2020-2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "oneapi/tbb/cache_aligned_allocator.h" #include "oneapi/tbb/detail/_small_object_pool.h" #include "oneapi/tbb/detail/_task.h" #include "governor.h" #include "thread_data.h" #include "task_dispatcher.h" #include namespace tbb { namespace detail { namespace r1 { small_object_pool_impl::small_object* const small_object_pool_impl::dead_public_list = reinterpret_cast(1); void* __TBB_EXPORTED_FUNC allocate(d1::small_object_pool*& allocator, std::size_t number_of_bytes, const d1::execution_data& ed) { auto& tls = static_cast(ed).task_disp->get_thread_data(); auto pool = tls.my_small_object_pool; return pool->allocate_impl(allocator, number_of_bytes); } void* __TBB_EXPORTED_FUNC allocate(d1::small_object_pool*& allocator, std::size_t number_of_bytes) { // TODO: optimize if the allocator contains a valid pool. 
auto tls = governor::get_thread_data(); auto pool = tls->my_small_object_pool; return pool->allocate_impl(allocator, number_of_bytes); } void* small_object_pool_impl::allocate_impl(d1::small_object_pool*& allocator, std::size_t number_of_bytes) { __TBB_ASSERT(allocator == nullptr || allocator == this, "An attempt was made to allocate using another thread's small memory pool"); small_object* obj{nullptr}; if (number_of_bytes <= small_object_size) { if (m_private_list) { obj = m_private_list; m_private_list = m_private_list->next; } else if (m_public_list.load(std::memory_order_relaxed)) { // No fence required for read of my_public_list above, because std::atomic::exchange() has a fence. obj = m_public_list.exchange(nullptr); __TBB_ASSERT( obj, "another thread emptied the my_public_list" ); m_private_list = obj->next; } else { obj = new (cache_aligned_allocate(small_object_size)) small_object{nullptr}; ++m_private_counter; } } else { obj = new (cache_aligned_allocate(number_of_bytes)) small_object{nullptr}; } allocator = this; // Return uninitialized memory for further construction on user side. 
obj->~small_object(); return obj; } void __TBB_EXPORTED_FUNC deallocate(d1::small_object_pool& allocator, void* ptr, std::size_t number_of_bytes) { auto pool = static_cast(&allocator); auto tls = governor::get_thread_data(); pool->deallocate_impl(ptr, number_of_bytes, *tls); } void __TBB_EXPORTED_FUNC deallocate(d1::small_object_pool& allocator, void* ptr, std::size_t number_of_bytes, const d1::execution_data& ed) { auto& tls = static_cast(ed).task_disp->get_thread_data(); auto pool = static_cast(&allocator); pool->deallocate_impl(ptr, number_of_bytes, tls); } void small_object_pool_impl::deallocate_impl(void* ptr, std::size_t number_of_bytes, thread_data& td) { __TBB_ASSERT(ptr != nullptr, "pointer to deallocate should not be null"); __TBB_ASSERT(number_of_bytes >= sizeof(small_object), "number of bytes should be at least sizeof(small_object)"); if (number_of_bytes <= small_object_size) { auto obj = new (ptr) small_object{nullptr}; if (td.my_small_object_pool == this) { obj->next = m_private_list; m_private_list = obj; } else { auto old_public_list = m_public_list.load(std::memory_order_relaxed); for (;;) { if (old_public_list == dead_public_list) { obj->~small_object(); cache_aligned_deallocate(obj); if (++m_public_counter == 0) { this->~small_object_pool_impl(); cache_aligned_deallocate(this); } break; } obj->next = old_public_list; if (m_public_list.compare_exchange_strong(old_public_list, obj)) { break; } } } } else { cache_aligned_deallocate(ptr); } } std::int64_t small_object_pool_impl::cleanup_list(small_object* list) { std::int64_t removed_count{}; while (list) { small_object* current = list; list = list->next; current->~small_object(); cache_aligned_deallocate(current); ++removed_count; } return removed_count; } void small_object_pool_impl::destroy() { // clean up private list and subtract the removed count from private counter m_private_counter -= cleanup_list(m_private_list); // Grab public list and place dead mark small_object* public_list = 
m_public_list.exchange(dead_public_list); // clean up public list and subtract from private (intentionally) counter m_private_counter -= cleanup_list(public_list); __TBB_ASSERT(m_private_counter >= 0, "Private counter may not be less than 0"); // Equivalent to fetch_sub(m_private_counter) - m_private_counter. But we need to do it // atomically with operator-= not to access m_private_counter after the subtraction. auto new_value = m_public_counter -= m_private_counter; // check if this method is responsible to clean up the resources if (new_value == 0) { this->~small_object_pool_impl(); cache_aligned_deallocate(this); } } } // namespace r1 } // namespace detail } // namespace tbb ================================================ FILE: third-party/tbb/src/tbb/small_object_pool_impl.h ================================================ /* Copyright (c) 2020-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_small_object_pool_impl_H #define __TBB_small_object_pool_impl_H #include "oneapi/tbb/detail/_small_object_pool.h" #include "oneapi/tbb/detail/_utils.h" #include #include #include namespace tbb { namespace detail { namespace r1 { class thread_data; class small_object_pool_impl : public d1::small_object_pool { static constexpr std::size_t small_object_size = 256; struct small_object { small_object* next; }; static small_object* const dead_public_list; public: void* allocate_impl(small_object_pool*& allocator, std::size_t number_of_bytes); void deallocate_impl(void* ptr, std::size_t number_of_bytes, thread_data& td); void destroy(); private: static std::int64_t cleanup_list(small_object* list); ~small_object_pool_impl() = default; private: alignas(max_nfs_size) small_object* m_private_list; std::int64_t m_private_counter{}; alignas(max_nfs_size) std::atomic m_public_list; std::atomic m_public_counter{}; }; } // namespace r1 } // namespace detail } // namespace tbb #endif /* __TBB_small_object_pool_impl_H */ ================================================ FILE: third-party/tbb/src/tbb/task.cpp ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ // Do not include task.h directly. 
Use scheduler_common.h instead #include "scheduler_common.h" #include "governor.h" #include "arena.h" #include "thread_data.h" #include "task_dispatcher.h" #include "waiters.h" #include "itt_notify.h" #include "oneapi/tbb/detail/_task.h" #include "oneapi/tbb/partitioner.h" #include "oneapi/tbb/task.h" #include namespace tbb { namespace detail { namespace r1 { //------------------------------------------------------------------------ // resumable tasks //------------------------------------------------------------------------ #if __TBB_RESUMABLE_TASKS void suspend(suspend_callback_type suspend_callback, void* user_callback) { thread_data& td = *governor::get_thread_data(); td.my_task_dispatcher->suspend(suspend_callback, user_callback); // Do not access td after suspend. } void resume(suspend_point_type* sp) { assert_pointers_valid(sp, sp->m_arena); task_dispatcher& task_disp = sp->m_resume_task.m_target; if (sp->try_notify_resume()) { // TODO: remove this work-around // Prolong the arena's lifetime while all coroutines are alive // (otherwise the arena can be destroyed while some tasks are suspended). arena& a = *sp->m_arena; a.my_references += arena::ref_worker; if (task_disp.m_properties.critical_task_allowed) { // The target is not in the process of executing critical task, so the resume task is not critical. a.my_resume_task_stream.push(&sp->m_resume_task, random_lane_selector(sp->m_random)); } else { #if __TBB_PREVIEW_CRITICAL_TASKS // The target is in the process of executing critical task, so the resume task is critical. a.my_critical_task_stream.push(&sp->m_resume_task, random_lane_selector(sp->m_random)); #endif } // Do not access target after that point. a.advertise_new_work(); // Release our reference to my_arena. 
a.on_thread_leaving(arena::ref_worker); } } suspend_point_type* current_suspend_point() { thread_data& td = *governor::get_thread_data(); return td.my_task_dispatcher->get_suspend_point(); } task_dispatcher& create_coroutine(thread_data& td) { // We may have some task dispatchers cached task_dispatcher* task_disp = td.my_arena->my_co_cache.pop(); if (!task_disp) { void* ptr = cache_aligned_allocate(sizeof(task_dispatcher)); task_disp = new(ptr) task_dispatcher(td.my_arena); task_disp->init_suspend_point(td.my_arena, td.my_arena->my_threading_control->worker_stack_size()); } // Prolong the arena's lifetime until all coroutines is alive // (otherwise the arena can be destroyed while some tasks are suspended). // TODO: consider behavior if there are more than 4K external references. td.my_arena->my_references += arena::ref_external; return *task_disp; } void task_dispatcher::internal_suspend() { __TBB_ASSERT(m_thread_data != nullptr, nullptr); arena_slot* slot = m_thread_data->my_arena_slot; __TBB_ASSERT(slot != nullptr, nullptr); task_dispatcher& default_task_disp = slot->default_task_dispatcher(); // TODO: simplify the next line, e.g. is_task_dispatcher_recalled( task_dispatcher& ) bool is_recalled = default_task_disp.get_suspend_point()->m_is_owner_recalled.load(std::memory_order_acquire); task_dispatcher& target = is_recalled ? 
default_task_disp : create_coroutine(*m_thread_data); resume(target); if (m_properties.outermost) { recall_point(); } } void task_dispatcher::suspend(suspend_callback_type suspend_callback, void* user_callback) { __TBB_ASSERT(suspend_callback != nullptr, nullptr); __TBB_ASSERT(user_callback != nullptr, nullptr); suspend_callback(user_callback, get_suspend_point()); __TBB_ASSERT(m_thread_data != nullptr, nullptr); __TBB_ASSERT(m_thread_data->my_post_resume_action == post_resume_action::none, nullptr); __TBB_ASSERT(m_thread_data->my_post_resume_arg == nullptr, nullptr); internal_suspend(); } bool task_dispatcher::resume(task_dispatcher& target) { // Do not create non-trivial objects on the stack of this function. They might never be destroyed { thread_data* td = m_thread_data; __TBB_ASSERT(&target != this, "We cannot resume to ourself"); __TBB_ASSERT(td != nullptr, "This task dispatcher must be attach to a thread data"); __TBB_ASSERT(td->my_task_dispatcher == this, "Thread data must be attached to this task dispatcher"); // Change the task dispatcher td->detach_task_dispatcher(); td->attach_task_dispatcher(target); } __TBB_ASSERT(m_suspend_point != nullptr, "Suspend point must be created"); __TBB_ASSERT(target.m_suspend_point != nullptr, "Suspend point must be created"); // Swap to the target coroutine. 
m_suspend_point->resume(target.m_suspend_point); // Pay attention that m_thread_data can be changed after resume if (m_thread_data) { thread_data* td = m_thread_data; __TBB_ASSERT(td != nullptr, "This task dispatcher must be attach to a thread data"); __TBB_ASSERT(td->my_task_dispatcher == this, "Thread data must be attached to this task dispatcher"); do_post_resume_action(); // Remove the recall flag if the thread in its original task dispatcher arena_slot* slot = td->my_arena_slot; __TBB_ASSERT(slot != nullptr, nullptr); if (this == slot->my_default_task_dispatcher) { __TBB_ASSERT(m_suspend_point != nullptr, nullptr); m_suspend_point->m_is_owner_recalled.store(false, std::memory_order_relaxed); } return true; } return false; } void task_dispatcher::do_post_resume_action() { thread_data* td = m_thread_data; switch (td->my_post_resume_action) { case post_resume_action::register_waiter: { __TBB_ASSERT(td->my_post_resume_arg, "The post resume action must have an argument"); static_cast(td->my_post_resume_arg)->notify(); break; } case post_resume_action::cleanup: { __TBB_ASSERT(td->my_post_resume_arg, "The post resume action must have an argument"); task_dispatcher* to_cleanup = static_cast(td->my_post_resume_arg); // Release coroutine's reference to my_arena td->my_arena->on_thread_leaving(arena::ref_external); // Cache the coroutine for possible later re-usage td->my_arena->my_co_cache.push(to_cleanup); break; } case post_resume_action::notify: { __TBB_ASSERT(td->my_post_resume_arg, "The post resume action must have an argument"); suspend_point_type* sp = static_cast(td->my_post_resume_arg); sp->recall_owner(); // Do not access sp because it can be destroyed after recall auto is_our_suspend_point = [sp] (market_context ctx) { return std::uintptr_t(sp) == ctx.my_uniq_addr; }; td->my_arena->get_waiting_threads_monitor().notify(is_our_suspend_point); break; } default: __TBB_ASSERT(td->my_post_resume_action == post_resume_action::none, "Unknown post resume action"); 
__TBB_ASSERT(td->my_post_resume_arg == nullptr, "The post resume argument should not be set"); } td->clear_post_resume_action(); } #else void suspend(suspend_callback_type, void*) { __TBB_ASSERT_RELEASE(false, "Resumable tasks are unsupported on this platform"); } void resume(suspend_point_type*) { __TBB_ASSERT_RELEASE(false, "Resumable tasks are unsupported on this platform"); } suspend_point_type* current_suspend_point() { __TBB_ASSERT_RELEASE(false, "Resumable tasks are unsupported on this platform"); return nullptr; } #endif /* __TBB_RESUMABLE_TASKS */ void notify_waiters(std::uintptr_t wait_ctx_addr) { auto is_related_wait_ctx = [&] (market_context context) { return wait_ctx_addr == context.my_uniq_addr; }; governor::get_thread_data()->my_arena->get_waiting_threads_monitor().notify(is_related_wait_ctx); } d1::wait_tree_vertex_interface* get_thread_reference_vertex(d1::wait_tree_vertex_interface* top_wait_context) { __TBB_ASSERT(top_wait_context, nullptr); auto& dispatcher = *governor::get_thread_data()->my_task_dispatcher; d1::reference_vertex* ref_counter{nullptr}; auto& reference_map = dispatcher.m_reference_vertex_map; auto pos = reference_map.find(top_wait_context); if (pos != reference_map.end()) { ref_counter = pos->second; } else { constexpr std::size_t max_reference_vertex_map_size = 1000; if (reference_map.size() > max_reference_vertex_map_size) { // TODO: Research the possibility of using better approach for a clean-up for (auto it = reference_map.begin(); it != reference_map.end();) { if (it->second->get_num_child() == 0) { it->second->~reference_vertex(); cache_aligned_deallocate(it->second); it = reference_map.erase(it); } else { ++it; } } } reference_map[top_wait_context] = ref_counter = new (cache_aligned_allocate(sizeof(d1::reference_vertex))) d1::reference_vertex(top_wait_context, 0); } return ref_counter; } } // namespace r1 } // namespace detail } // namespace tbb ================================================ FILE: 
third-party/tbb/src/tbb/task_dispatcher.cpp ================================================ /* Copyright (c) 2020-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "task_dispatcher.h" #include "waiters.h" namespace tbb { namespace detail { namespace r1 { static inline void spawn_and_notify(d1::task& t, arena_slot* slot, arena* a) { slot->spawn(t); a->advertise_new_work(); // TODO: TBB_REVAMP_TODO slot->assert_task_pool_valid(); } void __TBB_EXPORTED_FUNC spawn(d1::task& t, d1::task_group_context& ctx) { thread_data* tls = governor::get_thread_data(); task_group_context_impl::bind_to(ctx, tls); arena* a = tls->my_arena; arena_slot* slot = tls->my_arena_slot; // Capture current context task_accessor::context(t) = &ctx; // Mark isolation task_accessor::isolation(t) = tls->my_task_dispatcher->m_execute_data_ext.isolation; spawn_and_notify(t, slot, a); } void __TBB_EXPORTED_FUNC spawn(d1::task& t, d1::task_group_context& ctx, d1::slot_id id) { thread_data* tls = governor::get_thread_data(); task_group_context_impl::bind_to(ctx, tls); arena* a = tls->my_arena; arena_slot* slot = tls->my_arena_slot; execution_data_ext& ed = tls->my_task_dispatcher->m_execute_data_ext; // Capture context task_accessor::context(t) = &ctx; // Mark isolation task_accessor::isolation(t) = ed.isolation; if ( id != d1::no_slot && id != tls->my_arena_index && id < a->my_num_slots) { // Allocate proxy task d1::small_object_allocator alloc{}; auto proxy = alloc.new_object(static_cast(ed)); // Mark as a 
proxy task_accessor::set_proxy_trait(*proxy); // Mark isolation for the proxy task task_accessor::isolation(*proxy) = ed.isolation; // Deallocation hint (tls) from the task allocator proxy->allocator = alloc; proxy->slot = id; proxy->outbox = &a->mailbox(id); // Mark proxy as present in both locations (sender's task pool and destination mailbox) proxy->task_and_tag = intptr_t(&t) | task_proxy::location_mask; // Mail the proxy - after this point t may be destroyed by another thread at any moment. proxy->outbox->push(proxy); // Spawn proxy to the local task pool spawn_and_notify(*proxy, slot, a); } else { spawn_and_notify(t, slot, a); } } void __TBB_EXPORTED_FUNC submit(d1::task& t, d1::task_group_context& ctx, arena* a, std::uintptr_t as_critical) { suppress_unused_warning(as_critical); assert_pointer_valid(a); thread_data& tls = *governor::get_thread_data(); // TODO revamp: for each use case investigate neccesity to make this call task_group_context_impl::bind_to(ctx, &tls); task_accessor::context(t) = &ctx; // TODO revamp: consider respecting task isolation if this call is being made by external thread task_accessor::isolation(t) = tls.my_task_dispatcher->m_execute_data_ext.isolation; // TODO: consider code refactoring when lane selection mechanism is unified. if ( tls.is_attached_to(a) ) { arena_slot* slot = tls.my_arena_slot; #if __TBB_PREVIEW_CRITICAL_TASKS if( as_critical ) { a->my_critical_task_stream.push( &t, subsequent_lane_selector(slot->critical_hint()) ); } else #endif { slot->spawn(t); } } else { random_lane_selector lane_selector{tls.my_random}; #if !__TBB_PREVIEW_CRITICAL_TASKS suppress_unused_warning(as_critical); #else if ( as_critical ) { a->my_critical_task_stream.push( &t, lane_selector ); } else #endif { // Avoid joining the arena the thread is not currently in. a->my_fifo_task_stream.push( &t, lane_selector ); } } // It is assumed that some thread will explicitly wait in the arena the task is submitted // into. 
Therefore, no need to utilize mandatory concurrency here.
    a->advertise_new_work();
}

// NOTE(review): this copy of the file has lost text between angle brackets
// (e.g. `static_cast(ed)`, `reinterpret_cast(addr)` below have no target type).
// Restore the template arguments from the upstream sources before compiling.

// Exported entry point: attaches t_ctx to task t and then runs the dispatch
// loop on the calling thread through task_dispatcher::execute_and_wait.
void __TBB_EXPORTED_FUNC execute_and_wait(d1::task& t, d1::task_group_context& t_ctx, d1::wait_context& wait_ctx, d1::task_group_context& w_ctx) {
    task_accessor::context(t) = &t_ctx;
    task_dispatcher::execute_and_wait(&t, wait_ctx, w_ctx);
}

// Exported entry point: same as execute_and_wait above, but no initial task is supplied.
void __TBB_EXPORTED_FUNC wait(d1::wait_context& wait_ctx, d1::task_group_context& w_ctx) {
    // Enter the task dispatch loop without a task
    task_dispatcher::execute_and_wait(nullptr, wait_ctx, w_ctx);
}

// Returns the arena index recorded in the thread data that `ed` points to.
// When `ed` is null, falls back to the calling thread's data; if that is not
// initialized either, returns d1::slot_id(-1).
d1::slot_id __TBB_EXPORTED_FUNC execution_slot(const d1::execution_data* ed) {
    if (ed) {
        const execution_data_ext* ed_ext = static_cast(ed);
        assert_pointers_valid(ed_ext->task_disp, ed_ext->task_disp->m_thread_data);
        return ed_ext->task_disp->m_thread_data->my_arena_index;
    } else {
        thread_data* td = governor::get_thread_data_if_initialized();
        return td ? td->my_arena_index : d1::slot_id(-1);
    }
}

// Returns the context stored in the calling thread's task dispatcher, or
// nullptr when the dispatcher is flagged as being at the outermost level.
d1::task_group_context* __TBB_EXPORTED_FUNC current_context() {
    thread_data* td = governor::get_thread_data();
    assert_pointers_valid(td, td->my_task_dispatcher);

    task_dispatcher* task_disp = td->my_task_dispatcher;
    if (task_disp->m_properties.outermost) {
        // No one task is executed, so no execute_data.
        return nullptr;
    } else {
        return td->my_task_dispatcher->m_execute_data_ext.context;
    }
}

// Runs the calling thread's dispatch loop with an external_waiter built from
// wait_ctx. `t` may be null. If `t` is given, its context is bound to this
// thread and it inherits the dispatcher's current isolation. After the loop
// returns, an exception stored in w_ctx (if any) is rethrown to the caller.
void task_dispatcher::execute_and_wait(d1::task* t, d1::wait_context& wait_ctx, d1::task_group_context& w_ctx) {
    // Get an associated task dispatcher
    thread_data* tls = governor::get_thread_data();
    __TBB_ASSERT(tls->my_task_dispatcher != nullptr, nullptr);
    task_dispatcher& local_td = *tls->my_task_dispatcher;

    // TODO: factor out the binding to execute_and_wait_impl
    if (t) {
        task_group_context_impl::bind_to(*task_accessor::context(*t), tls);
        // Propagate the isolation to the task executed without spawn.
        task_accessor::isolation(*t) = tls->my_task_dispatcher->m_execute_data_ext.isolation;
    }

    // Waiting on special object tied to a waiting thread.
    external_waiter waiter{ *tls->my_arena, wait_ctx };
    t = local_td.local_wait_for_all(t, waiter);
    __TBB_ASSERT_EX(t == nullptr, "External waiter must not leave dispatch loop with a task");

    // The external thread couldn't exit the dispatch loop in an idle state
    if (local_td.m_thread_data->my_inbox.is_idle_state(true)) {
        local_td.m_thread_data->my_inbox.set_is_idle(false);
    }

    // Acquire load of the exception pointer stored in w_ctx.
    auto exception = w_ctx.my_exception.load(std::memory_order_acquire);
    if (exception) {
        __TBB_ASSERT(w_ctx.is_group_execution_cancelled(), "The task group context with an exception should be canceled.");
        exception->throw_self();
    }
}

#if __TBB_RESUMABLE_TASKS

// Free-function entry that forwards to task_dispatcher::co_local_wait_for_all.
// On non-Windows builds the dispatcher address arrives split into two
// `unsigned` halves (hi, lo) and is reassembled here; on Windows it arrives
// as a single void*.
#if _WIN32
/* [[noreturn]] */ void __stdcall co_local_wait_for_all(void* addr) noexcept
#else
/* [[noreturn]] */ void co_local_wait_for_all(unsigned hi, unsigned lo) noexcept
#endif
{
#if !_WIN32
    std::uintptr_t addr = lo;
    // `hi` may only be non-zero when uintptr_t is 8 bytes wide.
    __TBB_ASSERT(sizeof(addr) == 8 || hi == 0, nullptr);
    addr += std::uintptr_t(std::uint64_t(hi) << 32);
#endif
    task_dispatcher& task_disp = *reinterpret_cast(addr);
    assert_pointers_valid(task_disp.m_thread_data, task_disp.m_thread_data->my_arena);
    // The stealing threshold is computed by the arena and must be set before can_steal() is queried.
    task_disp.set_stealing_threshold(task_disp.m_thread_data->my_arena->calculate_stealing_threshold());
    __TBB_ASSERT(task_disp.can_steal(), nullptr);
    task_disp.co_local_wait_for_all();
    // This code is unreachable
}

// Dispatch loop body for a coroutine-backed dispatcher: finishes the resume,
// runs the pending post-resume action, then repeatedly waits for a resume
// task and switches to its target for as long as resume() returns true.
/* [[noreturn]] */ void task_dispatcher::co_local_wait_for_all() noexcept {
    // Do not create non-trivial objects on the stack of this function. They will never be destroyed.
    assert_pointer_valid(m_thread_data);

    m_suspend_point->finilize_resume();
    // Basically calls the user callback passed to the tbb::task::suspend function
    do_post_resume_action();

    // Endless loop here because coroutine could be reused
    d1::task* resume_task{};
    do {
        arena* a = m_thread_data->my_arena;
        coroutine_waiter waiter(*a);
        resume_task = local_wait_for_all(nullptr, waiter);
        assert_task_valid(resume_task);
        __TBB_ASSERT(this == m_thread_data->my_task_dispatcher, nullptr);

        m_thread_data->set_post_resume_action(post_resume_action::cleanup, this);

    } while (resume(static_cast(resume_task)->m_target));
    // This code might be unreachable
}

// Returns this dispatcher's suspend point, creating it on first use.
d1::suspend_point task_dispatcher::get_suspend_point() {
    if (m_suspend_point == nullptr) {
        assert_pointer_valid(m_thread_data);
        // 0 means that we attach this task dispatcher to the current stack
        init_suspend_point(m_thread_data->my_arena, 0);
    }
    assert_pointer_valid(m_suspend_point);
    return m_suspend_point;
}

// Constructs the suspend point in storage obtained from cache_aligned_allocate.
// Must be called only while m_suspend_point is still null.
void task_dispatcher::init_suspend_point(arena* a, std::size_t stack_size) {
    __TBB_ASSERT(m_suspend_point == nullptr, nullptr);
    m_suspend_point = new(cache_aligned_allocate(sizeof(suspend_point_type)))
        suspend_point_type(a, stack_size, *this);
}
#endif /* __TBB_RESUMABLE_TASKS */
} // namespace r1
} // namespace detail
} // namespace tbb

================================================
FILE: third-party/tbb/src/tbb/task_dispatcher.h
================================================
/*
    Copyright (c) 2020-2024 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef _TBB_task_dispatcher_H
#define _TBB_task_dispatcher_H

#include "oneapi/tbb/detail/_utils.h"
#include "oneapi/tbb/detail/_task.h"
#include "oneapi/tbb/global_control.h"

#include "scheduler_common.h"
#include "waiters.h"
#include "arena_slot.h"
#include "arena.h"
#include "thread_data.h"
#include "mailbox.h"
#include "itt_notify.h"
#include "concurrent_monitor.h"
#include "threading_control.h"

// NOTE(review): text between angle brackets is missing throughout this copy:
// the two system #include directives below have no header name, `template`
// headers have no parameter list, and casts such as `static_cast(ed)` have no
// target type. Restore them from the upstream sources before compiling.
#include
#if !__TBB_CPU_CTL_ENV_PRESENT
#include //
#endif

namespace tbb {
namespace detail {
namespace r1 {

// Returns the resume task of the slot's default dispatcher when its suspend
// point exists and is flagged as "owner recalled"; otherwise returns nullptr.
// Always returns nullptr when __TBB_RESUMABLE_TASKS is disabled.
inline d1::task* get_self_recall_task(arena_slot& slot) {
    suppress_unused_warning(slot);
    d1::task* t = nullptr;
#if __TBB_RESUMABLE_TASKS
    suspend_point_type* sp = slot.default_task_dispatcher().m_suspend_point;
    if (sp && sp->m_is_owner_recalled.load(std::memory_order_acquire)) {
        t = &sp->m_resume_task;
        __TBB_ASSERT(sp->m_resume_task.m_target.m_thread_data == nullptr, nullptr);
    }
#endif /* __TBB_RESUMABLE_TASKS */
    return t;
}

// Defined in exception.cpp
/*[[noreturn]]*/void do_throw_noexcept(void (*throw_exception)()) noexcept;

//------------------------------------------------------------------------
// Suspend point
//------------------------------------------------------------------------
#if __TBB_RESUMABLE_TASKS

// Executes the resume task: switches the current dispatcher to m_target.
// With a wait context present, the current stack is first registered on the
// arena's waiting-threads monitor; if that wait() call returns true the
// function returns without resuming. Always returns nullptr.
inline d1::task* suspend_point_type::resume_task::execute(d1::execution_data& ed) {
    execution_data_ext& ed_ext = static_cast(ed);

    if (ed_ext.wait_ctx) {
        thread_control_monitor::resume_context monitor_node{{std::uintptr_t(ed_ext.wait_ctx), nullptr}, ed_ext, m_target};
        // The wait_ctx is present only in external_waiter. In that case we leave the current stack
        // in the abandoned state to resume when waiting completes.
        thread_data* td = ed_ext.task_disp->m_thread_data;
        td->set_post_resume_action(task_dispatcher::post_resume_action::register_waiter, &monitor_node);

        thread_control_monitor& wait_list = td->my_arena->get_waiting_threads_monitor();

        if (wait_list.wait([&] { return !ed_ext.wait_ctx->continue_execution(); }, monitor_node)) {
            return nullptr;
        }

        td->clear_post_resume_action();
        r1::resume(ed_ext.task_disp->get_suspend_point());
    } else {
        // If wait_ctx is null, it can be only a worker thread on outermost level because
        // coroutine_waiter interrupts bypass loop before the resume_task execution.
        ed_ext.task_disp->m_thread_data->set_post_resume_action(task_dispatcher::post_resume_action::notify,
            ed_ext.task_disp->get_suspend_point());
    }
    // Do not access this task because it might be destroyed
    ed_ext.task_disp->resume(m_target);
    return nullptr;
}

// Builds a suspend point for `task_disp` in arena `a`. The embedded resume
// task is given the arena's default context and no isolation, and that
// context is bound to the dispatcher's thread.
inline suspend_point_type::suspend_point_type(arena* a, size_t stack_size, task_dispatcher& task_disp)
    : m_arena(a)
    , m_random(this)
    , m_co_context(stack_size, &task_disp)
    , m_resume_task(task_disp)
{
    assert_pointer_valid(m_arena);
    assert_pointer_valid(m_arena->my_default_ctx);
    task_accessor::context(m_resume_task) = m_arena->my_default_ctx;
    task_accessor::isolation(m_resume_task) = no_isolation;
    // Initialize the itt_caller for the context of the resume task.
    // It will be bound to the stack of the first suspend call.
    task_group_context_impl::bind_to(*task_accessor::context(m_resume_task), task_disp.m_thread_data);
}

#endif /* __TBB_RESUMABLE_TASKS */

//------------------------------------------------------------------------
// Task Dispatcher
//------------------------------------------------------------------------
// Initializes the execution data with the arena's default context and a
// back-pointer to this dispatcher.
inline task_dispatcher::task_dispatcher(arena* a) {
    m_execute_data_ext.context = a->my_default_ctx;
    m_execute_data_ext.task_disp = this;
}

// Returns true while the address of a fresh local variable is above
// m_stealing_threshold. The threshold must have been set (non-zero).
inline bool task_dispatcher::can_steal() {
    __TBB_ASSERT(m_stealing_threshold != 0, nullptr);
    stack_anchor_type anchor{};
    return reinterpret_cast(&anchor) > m_stealing_threshold;
}

// When the inbox is non-empty: returns a critical task if one is available,
// otherwise a task taken from the mailbox. Returns nullptr when the inbox is
// empty or nothing suitable was obtained.
inline d1::task* task_dispatcher::get_inbox_or_critical_task(
    execution_data_ext& ed, mail_inbox& inbox, isolation_type isolation, bool critical_allowed)
{
    if (inbox.empty())
        return nullptr;
    d1::task* result = get_critical_task(nullptr, ed, isolation, critical_allowed);
    if (result)
        return result;
    // Check if there are tasks mailed to this thread via task-to-thread affinity mechanism.
    result = get_mailbox_task(inbox, ed, isolation);
    // There is a race with a thread adding a new task (possibly with suitable isolation)
    // to our mailbox, so the below conditions might result in a false positive.
    // Then set_is_idle(false) allows that task to be stolen; it's OK.
    if (isolation != no_isolation && !result && !inbox.empty() && inbox.is_idle_state(true)) {
        // We have proxy tasks in our mailbox but the isolation blocks their execution.
        // So publish the proxy tasks in mailbox to be available for stealing from owner's task pool.
        inbox.set_is_idle( false );
    }
    return result;
}

// When `stream` is non-empty: returns a critical task if one is available,
// otherwise a task from the stream. Returns nullptr for an empty stream.
inline d1::task* task_dispatcher::get_stream_or_critical_task(
    execution_data_ext& ed, arena& a, task_stream& stream, unsigned& hint,
    isolation_type isolation, bool critical_allowed)
{
    if (stream.empty())
        return nullptr;
    d1::task* result = get_critical_task(nullptr, ed, isolation, critical_allowed);
    if (result)
        return result;
    return a.get_stream_task(stream, hint);
}

// Tries to steal a task from the arena. On success, records the stolen task's
// context and isolation in `ed` and passes the task through
// get_critical_task. Returns nullptr if nothing was stolen.
inline d1::task* task_dispatcher::steal_or_get_critical(
    execution_data_ext& ed, arena& a, unsigned arena_index, FastRandom& random,
    isolation_type isolation, bool critical_allowed)
{
    if (d1::task* t = a.steal_task(arena_index, random, ed, isolation)) {
        ed.context = task_accessor::context(*t);
        ed.isolation = task_accessor::isolation(*t);
        return get_critical_task(t, ed, isolation, critical_allowed);
    }
    return nullptr;
}

// Looks for work outside the local task pool. Each round tries, in order:
// the mailbox, the resume stream, the FIFO stream (only if fifo_allowed and
// there is no isolation), stealing (only if can_steal() held on entry), and
// finally a plain critical-task check. Between unsuccessful rounds it calls
// waiter.pause(). Returns the task found, or nullptr once
// waiter.continue_execution() tells the loop to stop.
template
d1::task* task_dispatcher::receive_or_steal_task(
    thread_data& tls, execution_data_ext& ed, Waiter& waiter, isolation_type isolation,
    bool fifo_allowed, bool critical_allowed)
{
    __TBB_ASSERT(governor::is_thread_data_set(&tls), nullptr);
    // Task to return
    d1::task* t = nullptr;
    // Get tls data (again)
    arena& a = *tls.my_arena;
    arena_slot& slot = *tls.my_arena_slot;
    unsigned arena_index = tls.my_arena_index;
    mail_inbox& inbox = tls.my_inbox;
    task_stream& resume_stream = a.my_resume_task_stream;
    unsigned& resume_hint = slot.hint_for_resume_stream;
    task_stream& fifo_stream = a.my_fifo_task_stream;
    unsigned& fifo_hint = slot.hint_for_fifo_stream;

    waiter.reset_wait();
    // Thread is in idle state now
    inbox.set_is_idle(true);

    // Evaluated once, before the loop.
    bool stealing_is_allowed = can_steal();

    // Stealing loop mailbox/enqueue/other_slots
    for (;;) {
        __TBB_ASSERT(t == nullptr, nullptr);
        // Check if the resource manager requires our arena to relinquish some threads
        // For the external thread restore idle state to true after dispatch loop
        if (!waiter.continue_execution(slot, t)) {
            __TBB_ASSERT(t == nullptr, nullptr);
            break;
        }
        // Start searching
        if (t != nullptr) {
            // continue_execution returned a task
        }
        else if ((t = get_inbox_or_critical_task(ed, inbox, isolation, critical_allowed))) {
            // Successfully got the task from mailbox or critical task
        }
        else if ((t = get_stream_or_critical_task(ed, a, resume_stream, resume_hint, isolation, critical_allowed))) {
            // Successfully got the resume or critical task
        }
        else if (fifo_allowed && isolation == no_isolation
                 && (t = get_stream_or_critical_task(ed, a, fifo_stream, fifo_hint, isolation, critical_allowed))) {
            // Checked if there are tasks in starvation-resistant stream. Only allowed at the outermost dispatch level without isolation.
        }
        else if (stealing_is_allowed
                 && (t = steal_or_get_critical(ed, a, arena_index, tls.my_random, isolation, critical_allowed))) {
            // Stole a task from a random arena slot
        }
        else {
            t = get_critical_task(t, ed, isolation, critical_allowed);
        }

        if (t != nullptr) {
            ed.context = task_accessor::context(*t);
            ed.isolation = task_accessor::isolation(*t);
            a.my_observers.notify_entry_observers(tls.my_last_observer, tls.my_is_worker);
            break; // Stealing success, end of stealing attempt
        }
        // Nothing to do, pause a little.
        waiter.pause(slot);
    } // end of nonlocal task retrieval loop

    __TBB_ASSERT(is_alive(a.my_guard), nullptr);
    // Leave with the inbox marked as not idle.
    if (inbox.is_idle_state(true)) {
        inbox.set_is_idle(false);
    }
    return t;
}

// The dispatch loop. Executes `t` (may be null) and every task it can obtain
// afterwards until the waiter stops the loop or no task is found. Exceptions
// escaping a task cancel the task's group and are stored in its context, then
// the loop continues. Returns nullptr, except when
// Waiter::postpone_execution(*t) is true, in which case `t` is returned.
// NOTE(review): `ITTPossible` is used in the body but the template parameter
// list is missing in this copy.
template
d1::task* task_dispatcher::local_wait_for_all(d1::task* t, Waiter& waiter ) {
    assert_pointer_valid(m_thread_data);
    __TBB_ASSERT(m_thread_data->my_task_dispatcher == this, nullptr);

    // Guard an outer/default execution state
    struct dispatch_loop_guard {
        task_dispatcher& task_disp;
        execution_data_ext old_execute_data_ext;
        properties old_properties;
        bool is_initially_registered;

        // Restores the saved execution data and properties, and unregisters
        // the thread if this invocation was the one that registered it.
        ~dispatch_loop_guard() {
            task_disp.m_execute_data_ext = old_execute_data_ext;
            task_disp.m_properties = old_properties;

            if (!is_initially_registered) {
                task_disp.m_thread_data->my_arena->my_tc_client.get_pm_client()->unregister_thread();
                task_disp.m_thread_data->my_is_registered = false;
            }

            __TBB_ASSERT(task_disp.m_thread_data && governor::is_thread_data_set(task_disp.m_thread_data), nullptr);
            __TBB_ASSERT(task_disp.m_thread_data->my_task_dispatcher == &task_disp, nullptr);
        }
    } dl_guard{ *this, m_execute_data_ext, m_properties, m_thread_data->my_is_registered };

    // The context guard to track fp setting and itt tasks.
    context_guard_helper context_guard;

    // Current isolation context
    const isolation_type isolation = dl_guard.old_execute_data_ext.isolation;

    // Critical work inflection point. Once turned false current execution context has taken
    // critical task on the previous stack frame and cannot take more until that critical path is
    // finished.
    bool critical_allowed = dl_guard.old_properties.critical_task_allowed;

    // Extended execution data that is used for dispatching.
    // Base version is passed to the task::execute method.
    execution_data_ext& ed = m_execute_data_ext;
    ed.context = t ? task_accessor::context(*t) : nullptr;
    ed.original_slot = m_thread_data->my_arena_index;
    ed.affinity_slot = d1::no_slot;
    ed.task_disp = this;
    ed.wait_ctx = waiter.wait_ctx();

    m_properties.outermost = false;
    m_properties.fifo_tasks_allowed = false;

    if (!dl_guard.is_initially_registered) {
        m_thread_data->my_arena->my_tc_client.get_pm_client()->register_thread();
        m_thread_data->my_is_registered = true;
    }

    t = get_critical_task(t, ed, isolation, critical_allowed);
    if (t && m_thread_data->my_inbox.is_idle_state(true)) {
        // The thread has a work to do. Therefore, marking its inbox as not idle so that
        // affinitized tasks can be stolen from it.
        m_thread_data->my_inbox.set_is_idle(false);
    }

    // Infinite exception loop
    for (;;) {
        try {
            // Main execution loop
            do {
                // We assume that bypass tasks are from the same task group.
                context_guard.set_ctx(ed.context);
                // Inner level evaluates tasks coming from nesting loops and those returned
                // by just executed tasks (bypassing spawn or enqueue calls).
                while (t != nullptr) {
                    assert_task_valid(t);
                    assert_pointer_valid(ed.context);
                    __TBB_ASSERT(ed.context->my_state == d1::task_group_context::state::bound ||
                                 ed.context->my_state == d1::task_group_context::state::isolated, nullptr);
                    __TBB_ASSERT(m_thread_data->my_inbox.is_idle_state(false), nullptr);
                    __TBB_ASSERT(task_accessor::is_resume_task(*t) || isolation == no_isolation || isolation == ed.isolation, nullptr);
                    // Check premature leave
                    if (Waiter::postpone_execution(*t)) {
                        __TBB_ASSERT(task_accessor::is_resume_task(*t) && dl_guard.old_properties.outermost,
                            "Currently, the bypass loop can be interrupted only for resume task on outermost level");
                        return t;
                    }
                    // Copy itt_caller to a stack because the context might be destroyed after t->execute.
                    void* itt_caller = ed.context->my_itt_caller;
                    suppress_unused_warning(itt_caller);

                    ITT_CALLEE_ENTER(ITTPossible, t, itt_caller);

                    // A task in a cancelled group is cancelled instead of executed;
                    // either call may hand back the next task to run.
                    if (ed.context->is_group_execution_cancelled()) {
                        t = t->cancel(ed);
                    } else {
                        t = t->execute(ed);
                    }

                    ITT_CALLEE_LEAVE(ITTPossible, itt_caller);

                    // The task affinity in execution data is set for affinitized tasks.
                    // So drop it after the task execution.
                    ed.affinity_slot = d1::no_slot;
                    // Reset task owner id for bypassed task
                    ed.original_slot = m_thread_data->my_arena_index;
                    t = get_critical_task(t, ed, isolation, critical_allowed);
                }

                __TBB_ASSERT(m_thread_data && governor::is_thread_data_set(m_thread_data), nullptr);
                __TBB_ASSERT(m_thread_data->my_task_dispatcher == this, nullptr);
                // When refactoring, pay attention that m_thread_data can be changed after t->execute()
                __TBB_ASSERT(m_thread_data->my_arena_slot != nullptr, nullptr);
                arena_slot& slot = *m_thread_data->my_arena_slot;
                if (!waiter.continue_execution(slot, t)) {
                    break;
                }
                // Retrieve the task from local task pool
                if (t || (slot.is_task_pool_published() && (t = slot.get_task(ed, isolation)))) {
                    __TBB_ASSERT(ed.original_slot == m_thread_data->my_arena_index, nullptr);
                    ed.context = task_accessor::context(*t);
                    ed.isolation = task_accessor::isolation(*t);
                    continue;
                }
                // Retrieve the task from global sources
                t = receive_or_steal_task(
                    *m_thread_data, ed, waiter, isolation, dl_guard.old_properties.fifo_tasks_allowed,
                    critical_allowed
                );
            } while (t != nullptr); // main dispatch loop
            break; // Exit exception loop;
        } catch (...) {
            // With terminate_on_exception active, the exception is rethrown through a noexcept function.
            if (global_control::active_value(global_control::terminate_on_exception) == 1) {
                do_throw_noexcept([] { throw; });
            }
            if (ed.context->cancel_group_execution()) {
                /* We are the first to signal cancellation, so store the exception that caused it. */
                ed.context->my_exception.store(tbb_exception_ptr::allocate(), std::memory_order_release);
            }
        }
    } // Infinite exception loop
    __TBB_ASSERT(t == nullptr, nullptr);

#if __TBB_RESUMABLE_TASKS
    if (dl_guard.old_properties.outermost) {
        recall_point();
    }
#endif /* __TBB_RESUMABLE_TASKS */

    return nullptr;
}

#if __TBB_RESUMABLE_TASKS
// If this dispatcher is not the slot's default one, schedules a `notify`
// post-resume action and suspends; after resumption the inbox is marked as
// not idle. Does nothing for the default dispatcher.
inline void task_dispatcher::recall_point() {
    if (this != &m_thread_data->my_arena_slot->default_task_dispatcher()) {
        __TBB_ASSERT(m_suspend_point != nullptr, nullptr);
        __TBB_ASSERT(m_suspend_point->m_is_owner_recalled.load(std::memory_order_relaxed) == false, nullptr);

        m_thread_data->set_post_resume_action(post_resume_action::notify, get_suspend_point());
        internal_suspend();

        if (m_thread_data->my_inbox.is_idle_state(true)) {
            m_thread_data->my_inbox.set_is_idle(false);
        }
    }
}
#endif /* __TBB_RESUMABLE_TASKS */

#if __TBB_PREVIEW_CRITICAL_TASKS
// Tries to replace `t` with a critical task from the arena. If one is found,
// a non-null `t` is re-spawned, `ed` is switched to the critical task's
// context and isolation, and further critical tasks are disallowed on this
// stack. Returns the critical task if found, otherwise `t` unchanged.
inline d1::task* task_dispatcher::get_critical_task(d1::task* t, execution_data_ext& ed, isolation_type isolation, bool critical_allowed) {
    __TBB_ASSERT( critical_allowed || !m_properties.critical_task_allowed, nullptr );

    if (!critical_allowed) {
        // The stack is already in the process of critical path execution. Cannot take another
        // critical work until finish with the current one.
        __TBB_ASSERT(!m_properties.critical_task_allowed, nullptr);
        return t;
    }

    assert_pointers_valid(m_thread_data, m_thread_data->my_arena, m_thread_data->my_arena_slot);
    thread_data& td = *m_thread_data;
    arena& a = *td.my_arena;
    arena_slot& slot = *td.my_arena_slot;

    d1::task* crit_t = a.get_critical_task(slot.hint_for_critical_stream, isolation);
    if (crit_t != nullptr) {
        assert_task_valid(crit_t);
        if (t != nullptr) {
            assert_pointer_valid(ed.context);
            r1::spawn(*t, *ed.context);
        }
        ed.context = task_accessor::context(*crit_t);
        ed.isolation = task_accessor::isolation(*crit_t);

        // We cannot execute more than one critical task on the same stack.
        // In other words, we prevent nested critical tasks.
        m_properties.critical_task_allowed = false;

        // TODO: add a test that the observer is called when critical task is taken.
        a.my_observers.notify_entry_observers(td.my_last_observer, td.my_is_worker);
        t = crit_t;
    } else {
        // Was unable to find critical work in the queue. Allow inspecting the queue in nested
        // invocations. Handles the case when critical task has been just completed.
        m_properties.critical_task_allowed = true;
    }
    return t;
}
#else
// Critical tasks are compiled out: `t` is returned unchanged.
inline d1::task* task_dispatcher::get_critical_task(d1::task* t, execution_data_ext&, isolation_type, bool /*critical_allowed*/) {
    return t;
}
#endif

// Pops proxies from the inbox until one yields a task; proxies that yield no
// task are destroyed. On success, marks the task's origin in `ed` and returns
// it. Returns nullptr when the inbox has nothing more to pop.
inline d1::task* task_dispatcher::get_mailbox_task(mail_inbox& my_inbox, execution_data_ext& ed, isolation_type isolation) {
    while (task_proxy* const tp = my_inbox.pop(isolation)) {
        if (d1::task* result = tp->extract_task()) {
            ed.original_slot = (unsigned short)(-2);
            ed.affinity_slot = ed.task_disp->m_thread_data->my_arena_index;
            return result;
        }
        // We have exclusive access to the proxy, and can destroy it.
        tp->allocator.delete_object(tp, ed);
    }
    return nullptr;
}

// Selects the dispatch loop variant depending on governor::is_itt_present().
// NOTE(review): as written here both branches are identical self-calls; the
// explicit template arguments that distinguish them are presumably missing
// from this copy -- verify against upstream.
template
d1::task* task_dispatcher::local_wait_for_all(d1::task* t, Waiter& waiter) {
    if (governor::is_itt_present()) {
        return local_wait_for_all(t, waiter);
    } else {
        return local_wait_for_all(t, waiter);
    }
}

} // namespace r1
} // namespace detail
} // namespace tbb

#endif // _TBB_task_dispatcher_H

================================================
FILE: third-party/tbb/src/tbb/task_group_context.cpp
================================================
/*
    Copyright (c) 2005-2023 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include "oneapi/tbb/detail/_config.h"
#include "oneapi/tbb/tbb_allocator.h"
#include "oneapi/tbb/task_group.h"
#include "governor.h"
#include "thread_data.h"
#include "scheduler_common.h"
#include "itt_notify.h"
#include "task_dispatcher.h"

// NOTE(review): text between angle brackets is missing throughout this copy
// (the #include below has no header name; `reinterpret_cast(...)` and
// `std::atomic` below have no type arguments). Restore from upstream before compiling.
#include

namespace tbb {
namespace detail {
namespace r1 {

//------------------------------------------------------------------------
// tbb_exception_ptr
//------------------------------------------------------------------------
// Captures std::current_exception() into a newly allocated tbb_exception_ptr.
// Returns nullptr if allocate_memory returned null.
tbb_exception_ptr* tbb_exception_ptr::allocate() noexcept {
    tbb_exception_ptr* eptr = (tbb_exception_ptr*)allocate_memory(sizeof(tbb_exception_ptr));
    return eptr ? new (eptr) tbb_exception_ptr(std::current_exception()) : nullptr;
}

// Runs the destructor and releases the storage; `this` is invalid afterwards.
void tbb_exception_ptr::destroy() noexcept {
    this->~tbb_exception_ptr();
    deallocate_memory(this);
}

// Rethrows the captured exception.
void tbb_exception_ptr::throw_self() {
    if (governor::rethrow_exception_broken()) fix_broken_rethrow();
    std::rethrow_exception(my_ptr);
}

//------------------------------------------------------------------------
// task_group_context
//------------------------------------------------------------------------

// Tears down `ctx`: removes it from its context list (if any), destroys the
// stored FPU settings and exception, poisons the pointer members and finally
// publishes the `dead` state.
void task_group_context_impl::destroy(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr);

    if (ctx.my_context_list != nullptr) {
        __TBB_ASSERT(ctx.my_state.load(std::memory_order_relaxed) == d1::task_group_context::state::bound, nullptr);
        // The owner can be destroyed at any moment. Access the associate data with caution.
        ctx.my_context_list->remove(ctx.my_node);
    }
    d1::cpu_ctl_env* ctl = reinterpret_cast(&ctx.my_cpu_ctl_env);
#if _MSC_VER && _MSC_VER <= 1900 && !__INTEL_COMPILER
    suppress_unused_warning(ctl);
#endif
    ctl->~cpu_ctl_env();

    auto exception = ctx.my_exception.load(std::memory_order_relaxed);
    if (exception) {
        exception->destroy();
    }
    ITT_STACK_DESTROY(ctx.my_itt_caller);

    poison_pointer(ctx.my_parent);
    poison_pointer(ctx.my_context_list);
    poison_pointer(ctx.my_node.my_next_node);
    poison_pointer(ctx.my_node.my_prev_node);
    poison_pointer(ctx.my_exception);
    poison_pointer(ctx.my_itt_caller);

    ctx.my_state.store(d1::task_group_context::state::dead, std::memory_order_release);
}

// Puts `ctx` into its initial `created` state: self-linked list node, no
// parent, no list, no exception, no cancellation. A cpu_ctl_env object is
// constructed in ctx.my_cpu_ctl_env; the current environment is captured into
// it only when the fp_settings trait is set.
void task_group_context_impl::initialize(d1::task_group_context& ctx) {
    ITT_TASK_GROUP(&ctx, ctx.my_name, nullptr);

    ctx.my_node.my_next_node = &ctx.my_node;
    ctx.my_node.my_prev_node = &ctx.my_node;
    ctx.my_cpu_ctl_env = 0;
    ctx.my_cancellation_requested = 0;
    ctx.my_may_have_children.store(0, std::memory_order_relaxed);
    // Set the created state to bound at the first usage.
    ctx.my_state.store(d1::task_group_context::state::created, std::memory_order_relaxed);
    ctx.my_parent = nullptr;
    ctx.my_context_list = nullptr;
    ctx.my_exception.store(nullptr, std::memory_order_relaxed);
    ctx.my_itt_caller = nullptr;

    static_assert(sizeof(d1::cpu_ctl_env) <= sizeof(ctx.my_cpu_ctl_env), "FPU settings storage does not fit to uint64_t");
    d1::cpu_ctl_env* ctl = new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env;
    if (ctx.my_traits.fp_settings)
        ctl->get_env();
}

// Links `ctx` at the front of the context list owned by thread `td`.
void task_group_context_impl::register_with(d1::task_group_context& ctx, thread_data* td) {
    __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr);
    __TBB_ASSERT(td, nullptr);
    ctx.my_context_list = td->my_context_list;

    ctx.my_context_list->push_front(ctx.my_node);
}

// Binds `ctx` to the context currently held by td's dispatcher (its parent),
// registers it in td's context list and copies the parent's cancellation
// state. Must be called while `ctx` is in the `locked` state.
void task_group_context_impl::bind_to_impl(d1::task_group_context& ctx, thread_data* td) {
    __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr);
    __TBB_ASSERT(ctx.my_state.load(std::memory_order_relaxed) == d1::task_group_context::state::locked, "The context can be bound only under the lock.");
    __TBB_ASSERT(!ctx.my_parent, "Parent is set before initial binding");

    ctx.my_parent = td->my_task_dispatcher->m_execute_data_ext.context;
    __TBB_ASSERT(ctx.my_parent, nullptr);

    // Inherit FPU settings only if the context has not captured FPU settings yet.
    if (!ctx.my_traits.fp_settings)
        copy_fp_settings(ctx, *ctx.my_parent);

    // Condition below prevents unnecessary thrashing parent context's cache line
    if (ctx.my_parent->my_may_have_children.load(std::memory_order_relaxed) != d1::task_group_context::may_have_children) {
        ctx.my_parent->my_may_have_children.store(d1::task_group_context::may_have_children, std::memory_order_relaxed); // full fence is below
    }
    if (ctx.my_parent->my_parent) {
        // Even if this context were made accessible for state change propagation
        // (by placing store_with_release(td->my_context_list_state.head.my_next, &ctx.my_node)
        // above), it still could be missed if state propagation from a grand-ancestor
        // was underway concurrently with binding.
        // Speculative propagation from the parent together with epoch counters
        // detecting possibility of such a race allow to avoid taking locks when
        // there is no contention.

        // Acquire fence is necessary to prevent reordering subsequent speculative
        // loads of parent state data out of the scope where epoch counters comparison
        // can reliably validate it.
        uintptr_t local_count_snapshot = ctx.my_parent->my_context_list->epoch.load(std::memory_order_acquire);
        // Speculative propagation of parent's state. The speculation will be
        // validated by the epoch counters check further on.
        ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
        register_with(ctx, td); // Issues full fence

        // If no state propagation was detected by the following condition, the above
        // full fence guarantees that the parent had correct state during speculative
        // propagation before the fence. Otherwise the propagation from parent is
        // repeated under the lock.
        if (local_count_snapshot != the_context_state_propagation_epoch.load(std::memory_order_relaxed)) {
            // Another thread may be propagating state change right now. So resort to lock.
            context_state_propagation_mutex_type::scoped_lock lock(the_context_state_propagation_mutex);
            ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
        }
    } else {
        register_with(ctx, td); // Issues full fence
        // As we do not have grand-ancestors, concurrent state propagation (if any)
        // may originate only from the parent context, and thus it is safe to directly
        // copy the state from it.
        ctx.my_cancellation_requested.store(ctx.my_parent->my_cancellation_requested.load(std::memory_order_relaxed), std::memory_order_relaxed);
    }
}

// Moves `ctx` out of the `created` state on first use. The thread that wins
// the created -> locked exchange decides between `isolated` and `bound` and
// publishes the result; every caller then spins until the state is no longer
// `locked`. Contexts already past `locked` are left untouched.
void task_group_context_impl::bind_to(d1::task_group_context& ctx, thread_data* td) {
    d1::task_group_context::state state = ctx.my_state.load(std::memory_order_acquire);
    if (state <= d1::task_group_context::state::locked) {
        if (state == d1::task_group_context::state::created &&
#if defined(__INTEL_COMPILER) && __INTEL_COMPILER <= 1910
            ((std::atomic::type>&)ctx.my_state).compare_exchange_strong(
            (typename std::underlying_type::type&)state,
            (typename std::underlying_type::type)d1::task_group_context::state::locked)
#else
            ctx.my_state.compare_exchange_strong(state, d1::task_group_context::state::locked)
#endif
            ) {
            // If we are in the outermost task dispatch loop of an external thread, then
            // there is nothing to bind this context to, and we skip the binding part
            // treating the context as isolated.
            __TBB_ASSERT(td->my_task_dispatcher->m_execute_data_ext.context != nullptr, nullptr);
            d1::task_group_context::state release_state{};
            if (td->my_task_dispatcher->m_execute_data_ext.context == td->my_arena->my_default_ctx || !ctx.my_traits.bound) {
                if (!ctx.my_traits.fp_settings) {
                    copy_fp_settings(ctx, *td->my_arena->my_default_ctx);
                }
                release_state = d1::task_group_context::state::isolated;
            } else {
                bind_to_impl(ctx, td);
                release_state = d1::task_group_context::state::bound;
            }
            ITT_STACK_CREATE(ctx.my_itt_caller);
            ctx.my_state.store(release_state, std::memory_order_release);
        }
        spin_wait_while_eq(ctx.my_state, d1::task_group_context::state::locked);
    }
    __TBB_ASSERT(ctx.my_state.load(std::memory_order_relaxed) != d1::task_group_context::state::created, nullptr);
    __TBB_ASSERT(ctx.my_state.load(std::memory_order_relaxed) != d1::task_group_context::state::locked, nullptr);
}

// If `src` is an ancestor of `ctx`, stores new_state into the member selected
// by mptr_state for `ctx` and every context on the parent chain between `ctx`
// and `src` (`src` itself excluded). Does nothing when `ctx` already holds
// new_state or when `ctx` is `src`.
void task_group_context_impl::propagate_task_group_state(d1::task_group_context& ctx, std::atomic d1::task_group_context::* mptr_state, d1::task_group_context& src, std::uint32_t new_state) {
    __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr);
    /*  1. if ((ctx.*mptr_state).load(std::memory_order_relaxed) == new_state):
            Nothing to do, whether descending from "src" or not, so no need to scan.
            Hopefully this happens often thanks to earlier invocations.
            This optimization is enabled by LIFO order in the context lists:
                - new contexts are bound to the beginning of lists;
                - descendants are newer than ancestors;
                - earlier invocations are therefore likely to "paint" long chains.
        2. if (&ctx != &src):
            This clause is disjunct from the traversal below, which skips src entirely.
            Note that src.*mptr_state is not necessarily still equal to new_state (another thread may have changed it again).
            Such interference is probably not frequent enough to aim for optimisation by writing new_state again (to make the other thread back down).
            Letting the other thread prevail may also be fairer.
    */
    if ((ctx.*mptr_state).load(std::memory_order_relaxed) != new_state && &ctx != &src) {
        for (d1::task_group_context* ancestor = ctx.my_parent; ancestor != nullptr; ancestor = ancestor->my_parent) {
            if (ancestor == &src) {
                for (d1::task_group_context* c = &ctx; c != ancestor; c = c->my_parent)
                    (c->*mptr_state).store(new_state, std::memory_order_relaxed);
                break;
            }
        }
    }
}

// Requests cancellation of `ctx`. Returns false if cancellation had already
// been requested; otherwise asks the threading control to propagate the
// cancellation state and returns true.
bool task_group_context_impl::cancel_group_execution(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr);
    __TBB_ASSERT(ctx.my_cancellation_requested.load(std::memory_order_relaxed) <= 1, "The cancellation state can be either 0 or 1");
    if (ctx.my_cancellation_requested.load(std::memory_order_relaxed) || ctx.my_cancellation_requested.exchange(1)) {
        // This task group and any descendants have already been canceled.
        // (A newly added descendant would inherit its parent's ctx.my_cancellation_requested,
        // not missing out on any cancellation still being propagated, and a context cannot be uncanceled.)
        return false;
    }
    governor::get_thread_data()->my_arena->my_threading_control->propagate_task_group_state(&d1::task_group_context::my_cancellation_requested, ctx, uint32_t(1));
    return true;
}

// Returns true when the cancellation flag of `ctx` is non-zero.
bool task_group_context_impl::is_group_execution_cancelled(const d1::task_group_context& ctx) {
    return ctx.my_cancellation_requested.load(std::memory_order_relaxed) != 0;
}

// Destroys any stored exception and clears the cancellation flag.
// IMPORTANT: It is assumed that this method is not used concurrently!
void task_group_context_impl::reset(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr);
    //! TODO: Add assertion that this context does not have children
    // No fences are necessary since this context can be accessed from another thread
    // only after stealing happened (which means necessary fences were used).

    auto exception = ctx.my_exception.load(std::memory_order_relaxed);
    if (exception) {
        exception->destroy();
        ctx.my_exception.store(nullptr, std::memory_order_relaxed);
    }
    ctx.my_cancellation_requested = 0;
}

// Captures the current environment (via cpu_ctl_env::get_env) into `ctx`,
// constructing the cpu_ctl_env object first if the fp_settings trait was not
// yet set.
// IMPORTANT: It is assumed that this method is not used concurrently!
void task_group_context_impl::capture_fp_settings(d1::task_group_context& ctx) {
    __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr);
    //! TODO: Add assertion that this context does not have children
    // No fences are necessary since this context can be accessed from another thread
    // only after stealing happened (which means necessary fences were used).
    d1::cpu_ctl_env* ctl = reinterpret_cast(&ctx.my_cpu_ctl_env);
    if (!ctx.my_traits.fp_settings) {
        ctl = new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env;
        ctx.my_traits.fp_settings = true;
    }
    ctl->get_env();
}

// Copy-constructs the FPU settings of `src` into `ctx` and sets ctx's
// fp_settings trait. `ctx` must not have settings yet; `src` must have them.
void task_group_context_impl::copy_fp_settings(d1::task_group_context& ctx, const d1::task_group_context& src) {
    __TBB_ASSERT(!is_poisoned(ctx.my_context_list), nullptr);
    __TBB_ASSERT(!ctx.my_traits.fp_settings, "The context already has FPU settings.");
    __TBB_ASSERT(src.my_traits.fp_settings, "The source context does not have FPU settings.");

    const d1::cpu_ctl_env* src_ctl = reinterpret_cast(&src.my_cpu_ctl_env);
    new (&ctx.my_cpu_ctl_env) d1::cpu_ctl_env(*src_ctl);
    ctx.my_traits.fp_settings = true;
}

/*
    Comments:

1.  The premise of the cancellation support implementation is that cancellations are
    not part of the hot path of the program execution. Therefore all changes in its
    implementation in order to reduce the overhead of the cancellation control flow
    should be done only in ways that do not increase overhead of the normal execution.

    In general, contexts are used by all threads and their descendants are created in
    different threads as well. In order to minimize impact of the cross-thread tree
    maintenance (first of all because of the synchronization), the tree of contexts
    is split into pieces, each of which is handled by a single thread.
    Such pieces are represented as lists of contexts, members of which are contexts
    that were bound to their parents in the given thread.

    The context tree maintenance and cancellation propagation algorithms are designed
    in such a manner that cross-thread access to a context list will take place only
    when a cancellation signal is sent (by the user or when an exception happens), and
    synchronization is necessary only then. Thus the normal execution flow (without
    exceptions and cancellation) remains free from any synchronization done on
    behalf of exception handling and cancellation support.

2.  Consider parallel cancellations at the different levels of the context tree:

        Ctx1 <- Cancelled by Thread1            |- Thread2 started processing
         |                                      |
        Ctx2                                    |- Thread1 started processing
         |                                   T1 |- Thread2 finishes and syncs up local counters
        Ctx3 <- Cancelled by Thread2            |
         |                                      |- Ctx5 is bound to Ctx2
        Ctx4                                    |
                                             T2 |- Thread1 reaches Ctx2

    The thread propagating each cancellation increments the global counter. However, the
    thread propagating the cancellation from the outermost context (Thread1) may be the
    last to finish. This means that the local counters may be synchronized earlier (by
    Thread2, at time T1) than Thread1 propagates the cancellation into Ctx2 (at time T2).
    If a new context (Ctx5) is created and bound to Ctx2 between T1 and T2, checking only
    its parent (Ctx2) may result in the cancellation request being lost.

    This issue is solved by doing the whole propagation under the lock.

    If we need more concurrency while processing parallel cancellations, we could try
    the following modification of the propagation algorithm:

    advance global counter and remember it
    for each thread:
        scan thread's list of contexts
    for each thread:
        sync up its local counter only if the global counter has not been changed

    However, this version of the algorithm requires more analysis and verification.
*/

// Exported entry points. Each one forwards to the task_group_context_impl
// static member function of the same name.

void __TBB_EXPORTED_FUNC initialize(d1::task_group_context& ctx) {
    task_group_context_impl::initialize(ctx);
}
void __TBB_EXPORTED_FUNC destroy(d1::task_group_context& ctx) {
    task_group_context_impl::destroy(ctx);
}
void __TBB_EXPORTED_FUNC reset(d1::task_group_context& ctx) {
    task_group_context_impl::reset(ctx);
}
// Returns the result of task_group_context_impl::cancel_group_execution.
bool __TBB_EXPORTED_FUNC cancel_group_execution(d1::task_group_context& ctx) {
    return task_group_context_impl::cancel_group_execution(ctx);
}
// Returns the result of task_group_context_impl::is_group_execution_cancelled.
bool __TBB_EXPORTED_FUNC is_group_execution_cancelled(d1::task_group_context& ctx) {
    return task_group_context_impl::is_group_execution_cancelled(ctx);
}
void __TBB_EXPORTED_FUNC capture_fp_settings(d1::task_group_context& ctx) {
    task_group_context_impl::capture_fp_settings(ctx);
}

} // namespace r1
} // namespace detail
} // namespace tbb

================================================
FILE: third-party/tbb/src/tbb/task_stream.h
================================================
/*
    Copyright (c) 2005-2022 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef _TBB_task_stream_H
#define _TBB_task_stream_H

//! This file is a possible future replacement for the task_stream class implemented in
//! task_stream.h. It refactors the code and extends task_stream capabilities by moving lane
//! management during operations on caller side. Despite the fact that new implementation should not
//! affect performance of the original task stream, analysis on this subject was not made at the
//! time it was developed.
In addition, it is not clearly seen at the moment that this container //! would be suitable for critical tasks due to linear time complexity on its operations. #include "oneapi/tbb/detail/_utils.h" #include "oneapi/tbb/cache_aligned_allocator.h" #include "oneapi/tbb/mutex.h" #include "scheduler_common.h" #include "misc.h" // for FastRandom #include #include #include namespace tbb { namespace detail { namespace r1 { //! Essentially, this is just a pair of a queue and a mutex to protect the queue. /** The reason std::pair is not used is that the code would look less clean if field names were replaced with 'first' and 'second'. **/ template< typename T, typename mutex_t > struct alignas(max_nfs_size) queue_and_mutex { typedef std::deque< T, cache_aligned_allocator > queue_base_t; queue_base_t my_queue{}; mutex_t my_mutex{}; }; using population_t = uintptr_t; const population_t one = 1; inline void set_one_bit( std::atomic& dest, int pos ) { __TBB_ASSERT( pos>=0, nullptr); __TBB_ASSERT( pos& dest, int pos ) { __TBB_ASSERT( pos>=0, nullptr); __TBB_ASSERT( pos=0, nullptr); __TBB_ASSERT( pos class task_stream_accessor : no_copy { protected: using lane_t = queue_and_mutex ; d1::task* get_item( lane_t::queue_base_t& queue ) { d1::task* result = queue.front(); queue.pop_front(); return result; } }; template<> class task_stream_accessor< back_nonnull_accessor > : no_copy { protected: using lane_t = queue_and_mutex ; d1::task* get_item( lane_t::queue_base_t& queue ) { d1::task* result = nullptr; __TBB_ASSERT(!queue.empty(), nullptr); // Isolated task can put zeros in queue see look_specific do { result = queue.back(); queue.pop_back(); } while ( !result && !queue.empty() ); return result; } }; //! The container for "fairness-oriented" aka "enqueued" tasks. 
template class task_stream : public task_stream_accessor< accessor > { using lane_t = typename task_stream_accessor::lane_t; std::atomic population{}; lane_t* lanes{nullptr}; unsigned N{}; public: task_stream() = default; void initialize( unsigned n_lanes ) { const unsigned max_lanes = sizeof(population_t) * CHAR_BIT; N = n_lanes >= max_lanes ? max_lanes : n_lanes > 2 ? 1 << (tbb::detail::log2(n_lanes - 1) + 1) : 2; __TBB_ASSERT( N == max_lanes || (N >= n_lanes && ((N - 1) & N) == 0), "number of lanes miscalculated" ); __TBB_ASSERT( N <= sizeof(population_t) * CHAR_BIT, nullptr); lanes = static_cast(cache_aligned_allocate(sizeof(lane_t) * N)); for (unsigned i = 0; i < N; ++i) { new (lanes + i) lane_t; } __TBB_ASSERT( !population.load(std::memory_order_relaxed), nullptr); } ~task_stream() { if (lanes) { for (unsigned i = 0; i < N; ++i) { lanes[i].~lane_t(); } cache_aligned_deallocate(lanes); } } //! Push a task into a lane. Lane selection is performed by passed functor. template void push(d1::task* source, const lane_selector_t& next_lane ) { bool succeed = false; unsigned lane = 0; do { lane = next_lane( /*out_of=*/N ); __TBB_ASSERT( lane < N, "Incorrect lane index." ); } while( ! (succeed = try_push( source, lane )) ); } //! Try finding and popping a task using passed functor for lane selection. Last used lane is //! updated inside lane selector. template d1::task* pop( const lane_selector_t& next_lane ) { d1::task* popped = nullptr; unsigned lane = 0; for (atomic_backoff b; !empty() && !popped; b.pause()) { lane = next_lane( /*out_of=*/N); __TBB_ASSERT(lane < N, "Incorrect lane index."); popped = try_pop(lane); } return popped; } //! Try finding and popping a related task. d1::task* pop_specific( unsigned& last_used_lane, isolation_type isolation ) { d1::task* result = nullptr; // Lane selection is round-robin in backward direction. 
unsigned idx = last_used_lane & (N-1); do { if( is_bit_set( population.load(std::memory_order_relaxed), idx ) ) { lane_t& lane = lanes[idx]; mutex::scoped_lock lock; if( lock.try_acquire(lane.my_mutex) && !lane.my_queue.empty() ) { result = look_specific( lane.my_queue, isolation ); if( lane.my_queue.empty() ) clear_one_bit( population, idx ); if( result ) break; } } idx=(idx-1)&(N-1); } while( !empty() && idx != last_used_lane ); last_used_lane = idx; return result; } //! Checks existence of a task. bool empty() { return !population.load(std::memory_order_relaxed); } private: //! Returns true on successful push, otherwise - false. bool try_push(d1::task* source, unsigned lane_idx ) { mutex::scoped_lock lock; if( lock.try_acquire( lanes[lane_idx].my_mutex ) ) { lanes[lane_idx].my_queue.push_back( source ); set_one_bit( population, lane_idx ); // TODO: avoid atomic op if the bit is already set return true; } return false; } //! Returns pointer to task on successful pop, otherwise - nullptr. d1::task* try_pop( unsigned lane_idx ) { if( !is_bit_set( population.load(std::memory_order_relaxed), lane_idx ) ) return nullptr; d1::task* result = nullptr; lane_t& lane = lanes[lane_idx]; mutex::scoped_lock lock; if( lock.try_acquire( lane.my_mutex ) && !lane.my_queue.empty() ) { result = this->get_item( lane.my_queue ); if( lane.my_queue.empty() ) clear_one_bit( population, lane_idx ); } return result; } // TODO: unify '*_specific' logic with 'pop' methods above d1::task* look_specific( typename lane_t::queue_base_t& queue, isolation_type isolation ) { __TBB_ASSERT( !queue.empty(), nullptr); // TODO: add a worst-case performance test and consider an alternative container with better // performance for isolation search. typename lane_t::queue_base_t::iterator curr = queue.end(); do { // TODO: consider logic from get_task to simplify the code. 
d1::task* result = *--curr; if( result && task_accessor::isolation(*result) == isolation ) { if( queue.end() - curr == 1 ) queue.pop_back(); // a little of housekeeping along the way else *curr = nullptr; // grabbing task with the same isolation // TODO: move one of the container's ends instead if the task has been found there return result; } } while( curr != queue.begin() ); return nullptr; } }; // task_stream } // namespace r1 } // namespace detail } // namespace tbb #endif /* _TBB_task_stream_H */ ================================================ FILE: third-party/tbb/src/tbb/tbb.rc ================================================ // Copyright (c) 2005-2025 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
///////////////////////////////////////////////////////////////////////////// // // Includes // #include #include "../../include/oneapi/tbb/version.h" ///////////////////////////////////////////////////////////////////////////// // Neutral resources #ifdef _WIN32 LANGUAGE LANG_NEUTRAL, SUBLANG_NEUTRAL #pragma code_page(1252) #endif //_WIN32 ///////////////////////////////////////////////////////////////////////////// // // Version // #define TBB_VERNUMBERS TBB_VERSION_MAJOR,TBB_VERSION_MINOR,TBB_VERSION_PATCH #define TBB_VERSION TBB_VERSION_STRING VS_VERSION_INFO VERSIONINFO FILEVERSION TBB_VERNUMBERS PRODUCTVERSION TBB_VERNUMBERS FILEFLAGSMASK 0x17L #ifdef _DEBUG FILEFLAGS 0x1L #else FILEFLAGS 0x0L #endif FILEOS 0x40004L FILETYPE 0x2L FILESUBTYPE 0x0L BEGIN BLOCK "StringFileInfo" BEGIN BLOCK "000004b0" BEGIN VALUE "CompanyName", "Intel Corporation\0" VALUE "FileDescription", "oneAPI Threading Building Blocks (oneTBB) library\0" VALUE "FileVersion", TBB_VERSION "\0" VALUE "LegalCopyright", "Copyright 2005-2025 Intel Corporation. All Rights Reserved.\0" VALUE "LegalTrademarks", "\0" #ifndef TBB_USE_DEBUG VALUE "OriginalFilename", "tbb12.dll\0" #else VALUE "OriginalFilename", "tbb12_debug.dll\0" #endif VALUE "ProductName", "oneAPI Threading Building Blocks (oneTBB)\0" VALUE "ProductVersion", TBB_VERSION "\0" VALUE "PrivateBuild", "\0" VALUE "SpecialBuild", "\0" END END BLOCK "VarFileInfo" BEGIN VALUE "Translation", 0x0, 1200 END END ================================================ FILE: third-party/tbb/src/tbb/tcm.h ================================================ /* Copyright (c) 2023-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef _TBB_tcm_H #define _TBB_tcm_H #include #include #ifdef __cplusplus extern "C" { #endif // Support for the TCM API return value typedef enum _tcm_result_t { TCM_RESULT_SUCCESS = 0x0, TCM_RESULT_ERROR_INVALID_ARGUMENT = 0x78000004, TCM_RESULT_ERROR_UNKNOWN = 0x7ffffffe } tcm_result_t; // Support for permit states enum tcm_permit_states_t { TCM_PERMIT_STATE_VOID, TCM_PERMIT_STATE_INACTIVE, TCM_PERMIT_STATE_PENDING, TCM_PERMIT_STATE_IDLE, TCM_PERMIT_STATE_ACTIVE }; typedef uint8_t tcm_permit_state_t; // Support for permit flags typedef struct _tcm_permit_flags_t { uint32_t stale : 1; uint32_t rigid_concurrency : 1; uint32_t exclusive : 1; uint32_t request_as_inactive : 1; uint32_t reserved : 28; } tcm_permit_flags_t; typedef struct _tcm_callback_flags_t { uint32_t new_concurrency : 1; uint32_t new_state : 1; uint32_t reserved : 30; } tcm_callback_flags_t; // Support for cpu masks struct hwloc_bitmap_s; typedef struct hwloc_bitmap_s* hwloc_bitmap_t; typedef hwloc_bitmap_t tcm_cpu_mask_t; // Support for ids typedef uint64_t tcm_client_id_t; // Support for permits typedef struct _tcm_permit_t { uint32_t* concurrencies; tcm_cpu_mask_t* cpu_masks; uint32_t size; tcm_permit_state_t state; tcm_permit_flags_t flags; } tcm_permit_t; // Support for permit handle typedef struct tcm_permit_rep_t* tcm_permit_handle_t; // Support for constraints typedef int32_t tcm_numa_node_t; typedef int32_t tcm_core_type_t; const int8_t tcm_automatic = -1; const int8_t tcm_any = -2; #define TCM_PERMIT_REQUEST_CONSTRAINTS_INITIALIZER {tcm_automatic, tcm_automatic, NULL, \ tcm_automatic, tcm_automatic, 
tcm_automatic} typedef struct _tcm_cpu_constraints_t { int32_t min_concurrency; int32_t max_concurrency; tcm_cpu_mask_t mask; tcm_numa_node_t numa_id; tcm_core_type_t core_type_id; int32_t threads_per_core; } tcm_cpu_constraints_t; // Support for priorities enum tcm_request_priorities_t { TCM_REQUEST_PRIORITY_LOW = (INT32_MAX / 4) * 1, TCM_REQUEST_PRIORITY_NORMAL = (INT32_MAX / 4) * 2, TCM_REQUEST_PRIORITY_HIGH = (INT32_MAX / 4) * 3 }; typedef int32_t tcm_request_priority_t; // Support for requests #define TCM_PERMIT_REQUEST_INITIALIZER {tcm_automatic, tcm_automatic, \ NULL, 0, TCM_REQUEST_PRIORITY_NORMAL, {}, {}} typedef struct _tcm_permit_request_t { int32_t min_sw_threads; int32_t max_sw_threads; tcm_cpu_constraints_t* cpu_constraints; uint32_t constraints_size; tcm_request_priority_t priority; tcm_permit_flags_t flags; char reserved[4]; } tcm_permit_request_t; // Support for client callback typedef tcm_result_t (*tcm_callback_t)(tcm_permit_handle_t p, void* callback_arg, tcm_callback_flags_t); #if _WIN32 #define __TCM_EXPORT __declspec(dllexport) #else #define __TCM_EXPORT #endif __TCM_EXPORT tcm_result_t tcmConnect(tcm_callback_t callback, tcm_client_id_t *client_id); __TCM_EXPORT tcm_result_t tcmDisconnect(tcm_client_id_t client_id); __TCM_EXPORT tcm_result_t tcmRequestPermit(tcm_client_id_t client_id, tcm_permit_request_t request, void* callback_arg, tcm_permit_handle_t* permit_handle, tcm_permit_t* permit); __TCM_EXPORT tcm_result_t tcmGetPermitData(tcm_permit_handle_t permit_handle, tcm_permit_t* permit); __TCM_EXPORT tcm_result_t tcmReleasePermit(tcm_permit_handle_t permit); __TCM_EXPORT tcm_result_t tcmIdlePermit(tcm_permit_handle_t permit_handle); __TCM_EXPORT tcm_result_t tcmDeactivatePermit(tcm_permit_handle_t permit_handle); __TCM_EXPORT tcm_result_t tcmActivatePermit(tcm_permit_handle_t permit_handle); __TCM_EXPORT tcm_result_t tcmRegisterThread(tcm_permit_handle_t permit_handle); __TCM_EXPORT tcm_result_t tcmUnregisterThread(); __TCM_EXPORT 
tcm_result_t tcmGetVersionInfo(char* buffer, uint32_t buffer_size); #ifdef __cplusplus } // extern "C" #endif #endif /* _TBB_tcm_H */ ================================================ FILE: third-party/tbb/src/tbb/tcm_adaptor.cpp ================================================ /* Copyright (c) 2023-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "oneapi/tbb/detail/_intrusive_list_node.h" #include "oneapi/tbb/detail/_template_helpers.h" #include "oneapi/tbb/task_arena.h" #include "pm_client.h" #include "dynamic_link.h" #include "misc.h" #include "tcm.h" #include "tcm_adaptor.h" #include namespace tbb { namespace detail { namespace r1 { namespace { #if __TBB_WEAK_SYMBOLS_PRESENT #pragma weak tcmConnect #pragma weak tcmDisconnect #pragma weak tcmRequestPermit #pragma weak tcmGetPermitData #pragma weak tcmReleasePermit #pragma weak tcmIdlePermit #pragma weak tcmDeactivatePermit #pragma weak tcmActivatePermit #pragma weak tcmRegisterThread #pragma weak tcmUnregisterThread #pragma weak tcmGetVersionInfo #endif /* __TBB_WEAK_SYMBOLS_PRESENT */ tcm_result_t(*tcm_connect)(tcm_callback_t callback, tcm_client_id_t* client_id){nullptr}; tcm_result_t(*tcm_disconnect)(tcm_client_id_t client_id){ nullptr }; tcm_result_t(*tcm_request_permit)(tcm_client_id_t client_id, tcm_permit_request_t request, void* callback_arg, tcm_permit_handle_t* permit_handle, tcm_permit_t* permit){nullptr}; tcm_result_t(*tcm_get_permit_data)(tcm_permit_handle_t permit_handle, tcm_permit_t* permit){nullptr}; 
tcm_result_t(*tcm_release_permit)(tcm_permit_handle_t permit){nullptr}; tcm_result_t(*tcm_idle_permit)(tcm_permit_handle_t permit_handle){nullptr}; tcm_result_t(*tcm_deactivate_permit)(tcm_permit_handle_t permit_handle){nullptr}; tcm_result_t(*tcm_activate_permit)(tcm_permit_handle_t permit_handle){nullptr}; tcm_result_t(*tcm_register_thread)(tcm_permit_handle_t permit_handle){nullptr}; tcm_result_t(*tcm_unregister_thread)(){nullptr}; tcm_result_t (*tcm_get_version_info)(char* buffer, uint32_t buffer_size){nullptr}; static const dynamic_link_descriptor tcm_link_table[] = { DLD(tcmConnect, tcm_connect), DLD(tcmDisconnect, tcm_disconnect), DLD(tcmRequestPermit, tcm_request_permit), DLD(tcmGetPermitData, tcm_get_permit_data), DLD(tcmReleasePermit, tcm_release_permit), DLD(tcmIdlePermit, tcm_idle_permit), DLD(tcmDeactivatePermit, tcm_deactivate_permit), DLD(tcmActivatePermit, tcm_activate_permit), DLD(tcmRegisterThread, tcm_register_thread), DLD(tcmUnregisterThread, tcm_unregister_thread), DLD(tcmGetVersionInfo, tcm_get_version_info) }; #if TBB_USE_DEBUG #define DEBUG_SUFFIX "_debug" #else #define DEBUG_SUFFIX #endif /* TBB_USE_DEBUG */ #if _WIN32 || _WIN64 #define LIBRARY_EXTENSION ".dll" #define LIBRARY_PREFIX #elif __unix__ #define LIBRARY_EXTENSION ".so.1" #define LIBRARY_PREFIX "lib" #else #define LIBRARY_EXTENSION #define LIBRARY_PREFIX #endif /* __unix__ */ #define TCMLIB_NAME LIBRARY_PREFIX "tcm" DEBUG_SUFFIX LIBRARY_EXTENSION static bool tcm_functions_loaded{ false }; } class tcm_client : public pm_client { using tcm_client_mutex_type = d1::mutex; public: tcm_client(tcm_adaptor& adaptor, arena& a) : pm_client(a), my_tcm_adaptor(adaptor) {} ~tcm_client() { if (my_permit_handle) { __TBB_ASSERT(tcm_release_permit, nullptr); auto res = tcm_release_permit(my_permit_handle); __TBB_ASSERT_EX(res == TCM_RESULT_SUCCESS, nullptr); } } int update_concurrency(uint32_t concurrency) { return my_arena.update_concurrency(concurrency); } unsigned priority_level() { return 
my_arena.priority_level(); } tcm_permit_request_t& permit_request() { return my_permit_request; } tcm_permit_handle_t& permit_handle() { return my_permit_handle; } void actualize_permit() { __TBB_ASSERT(tcm_get_permit_data, nullptr); int delta{}; { tcm_client_mutex_type::scoped_lock lock(my_permit_mutex); uint32_t new_concurrency{}; tcm_permit_t new_permit{ &new_concurrency, nullptr, 1, TCM_PERMIT_STATE_VOID, {} }; auto res = tcm_get_permit_data(my_permit_handle, &new_permit); __TBB_ASSERT_EX(res == TCM_RESULT_SUCCESS, nullptr); // The permit has changed during the reading, so the callback will be invoked soon one more time and // we can just skip this renegotiation iteration. if (!new_permit.flags.stale) { // If there is no other demand in TCM, the permit may still have granted concurrency but // be in the deactivated state thus we enforce 0 allotment to preserve arena invariants. delta = update_concurrency(new_permit.state != TCM_PERMIT_STATE_INACTIVE ? new_concurrency : 0); } } if (delta) { my_tcm_adaptor.notify_thread_request(delta); } } void request_permit(tcm_client_id_t client_id) { __TBB_ASSERT(tcm_request_permit, nullptr); my_permit_request.max_sw_threads = max_workers(); my_permit_request.min_sw_threads = my_permit_request.max_sw_threads == 0 ? 
0 : min_workers(); if (my_permit_request.constraints_size > 0) { my_permit_request.cpu_constraints->min_concurrency = my_permit_request.min_sw_threads; my_permit_request.cpu_constraints->max_concurrency = my_permit_request.max_sw_threads; } __TBB_ASSERT(my_permit_request.max_sw_threads >= my_permit_request.min_sw_threads, nullptr); tcm_result_t res = tcm_request_permit(client_id, my_permit_request, this, &my_permit_handle, nullptr); __TBB_ASSERT_EX(res == TCM_RESULT_SUCCESS, nullptr); } void deactivate_permit() { __TBB_ASSERT(tcm_deactivate_permit, nullptr); tcm_result_t res = tcm_deactivate_permit(my_permit_handle); __TBB_ASSERT_EX(res == TCM_RESULT_SUCCESS, nullptr); } void init(tcm_client_id_t client_id, d1::constraints& constraints) { __TBB_ASSERT(tcm_request_permit, nullptr); __TBB_ASSERT(tcm_deactivate_permit, nullptr); if (constraints.core_type != d1::task_arena::automatic || constraints.numa_id != d1::task_arena::automatic || constraints.max_threads_per_core != d1::task_arena::automatic) { my_permit_constraints.max_concurrency = constraints.max_concurrency; my_permit_constraints.min_concurrency = 0; my_permit_constraints.core_type_id = constraints.core_type; my_permit_constraints.numa_id = constraints.numa_id; my_permit_constraints.threads_per_core = constraints.max_threads_per_core; my_permit_request.cpu_constraints = &my_permit_constraints; my_permit_request.constraints_size = 1; } my_permit_request.min_sw_threads = 0; my_permit_request.max_sw_threads = 0; my_permit_request.flags.request_as_inactive = 1; tcm_result_t res = tcm_request_permit(client_id, my_permit_request, this, &my_permit_handle, nullptr); __TBB_ASSERT_EX(res == TCM_RESULT_SUCCESS, nullptr); my_permit_request.flags.request_as_inactive = 0; } void register_thread() override { __TBB_ASSERT(tcm_register_thread, nullptr); auto return_code = tcm_register_thread(my_permit_handle); __TBB_ASSERT_EX(return_code == TCM_RESULT_SUCCESS, nullptr); } void unregister_thread() override { 
__TBB_ASSERT(tcm_unregister_thread, nullptr); auto return_code = tcm_unregister_thread(); __TBB_ASSERT_EX(return_code == TCM_RESULT_SUCCESS, nullptr); } private: tcm_cpu_constraints_t my_permit_constraints = TCM_PERMIT_REQUEST_CONSTRAINTS_INITIALIZER; tcm_permit_request_t my_permit_request = TCM_PERMIT_REQUEST_INITIALIZER; tcm_permit_handle_t my_permit_handle{}; tcm_client_mutex_type my_permit_mutex; tcm_adaptor& my_tcm_adaptor; }; //------------------------------------------------------------------------ // tcm_adaptor_impl //------------------------------------------------------------------------ struct tcm_adaptor_impl { using demand_mutex_type = d1::mutex; demand_mutex_type my_demand_mutex; tcm_client_id_t client_id{}; tcm_adaptor_impl(tcm_client_id_t id) : client_id(id) {} }; //------------------------------------------------------------------------ // tcm_adaptor //------------------------------------------------------------------------ tcm_result_t renegotiation_callback(tcm_permit_handle_t, void* client_ptr, tcm_callback_flags_t) { __TBB_ASSERT(client_ptr, nullptr); static_cast(client_ptr)->actualize_permit(); return TCM_RESULT_SUCCESS; } void tcm_adaptor::initialize() { tcm_functions_loaded = dynamic_link(TCMLIB_NAME, tcm_link_table, /* tcm_link_table size = */ 11); } bool tcm_adaptor::is_initialized() { return tcm_functions_loaded; } void tcm_adaptor::print_version() { if (is_initialized()) { __TBB_ASSERT(tcm_get_version_info, nullptr); char buffer[1024]; tcm_get_version_info(buffer, 1024); std::fprintf(stderr, "%.*s", 1024, buffer); } } tcm_adaptor::tcm_adaptor() { __TBB_ASSERT(tcm_connect, nullptr); tcm_client_id_t client_id{}; auto return_code = tcm_connect(renegotiation_callback, &client_id); if (return_code == TCM_RESULT_SUCCESS) { my_impl = make_cache_aligned_unique(client_id); } } tcm_adaptor::~tcm_adaptor() { if (my_impl) { __TBB_ASSERT(tcm_disconnect, nullptr); auto return_code = tcm_disconnect(my_impl->client_id); __TBB_ASSERT_EX(return_code == 
TCM_RESULT_SUCCESS, nullptr); my_impl = nullptr; } } bool tcm_adaptor::is_connected() { return my_impl != nullptr; } pm_client* tcm_adaptor::create_client(arena& a) { return new (cache_aligned_allocate(sizeof(tcm_client))) tcm_client(*this, a); } void tcm_adaptor::register_client(pm_client* c, d1::constraints& constraints) { static_cast(c)->init(my_impl->client_id, constraints); } void tcm_adaptor::unregister_and_destroy_client(pm_client& c) { auto& client = static_cast(c); { tcm_adaptor_impl::demand_mutex_type::scoped_lock lock(my_impl->my_demand_mutex); client.~tcm_client(); } cache_aligned_deallocate(&client); } void tcm_adaptor::set_active_num_workers(int) {} void tcm_adaptor::adjust_demand(pm_client& c, int mandatory_delta, int workers_delta) { __TBB_ASSERT(-1 <= mandatory_delta && mandatory_delta <= 1, nullptr); auto& client = static_cast(c); { tcm_adaptor_impl::demand_mutex_type::scoped_lock lock(my_impl->my_demand_mutex); // Update client's state workers_delta = client.update_request(mandatory_delta, workers_delta); if (workers_delta == 0) return; if (client.max_workers() == 0) { client.deactivate_permit(); } else { client.request_permit(my_impl->client_id); } } client.actualize_permit(); } } // namespace r1 } // namespace detail } // namespace tbb ================================================ FILE: third-party/tbb/src/tbb/tcm_adaptor.h ================================================ /* Copyright (c) 2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef _TBB_tcm_adaptor_H #define _TBB_tcm_adaptor_H #include "scheduler_common.h" #include "permit_manager.h" #include "pm_client.h" namespace tbb { namespace detail { namespace r1 { struct tcm_adaptor_impl; //------------------------------------------------------------------------ // Class tcm_adaptor //------------------------------------------------------------------------ class tcm_adaptor : public permit_manager { public: tcm_adaptor(); ~tcm_adaptor(); pm_client* create_client(arena& a) override; void register_client(pm_client* client, d1::constraints& constraints) override; void unregister_and_destroy_client(pm_client& c) override; void set_active_num_workers(int soft_limit) override; void adjust_demand(pm_client& c, int mandatory_delta, int workers_delta) override; bool is_connected(); static void initialize(); static bool is_initialized(); static void print_version(); private: cache_aligned_unique_ptr my_impl; friend class tcm_client; }; // class tcm_adaptor } // namespace r1 } // namespace detail } // namespace tbb #endif /* _TBB_tcm_adaptor_H */ ================================================ FILE: third-party/tbb/src/tbb/thread_control_monitor.h ================================================ /* Copyright (c) 2021-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_thread_control_monitor_H #define __TBB_thread_control_monitor_H #include "concurrent_monitor.h" #include "scheduler_common.h" #include namespace tbb { namespace detail { namespace r1 { struct market_context { market_context() = default; market_context(std::uintptr_t first_addr, arena* a) : my_uniq_addr(first_addr), my_arena_addr(a) {} std::uintptr_t my_uniq_addr{0}; arena* my_arena_addr{nullptr}; }; #if __TBB_RESUMABLE_TASKS class resume_node : public wait_node { using base_type = wait_node; public: resume_node(market_context ctx, execution_data_ext& ed_ext, task_dispatcher& target) : base_type(ctx), my_curr_dispatcher(ed_ext.task_disp), my_target_dispatcher(&target) , my_suspend_point(my_curr_dispatcher->get_suspend_point()) {} ~resume_node() override { if (this->my_skipped_wakeup) { spin_wait_until_eq(this->my_notify_calls, 1); } poison_pointer(my_curr_dispatcher); poison_pointer(my_target_dispatcher); poison_pointer(my_suspend_point); } void init() override { base_type::init(); } void wait() override { my_curr_dispatcher->resume(*my_target_dispatcher); __TBB_ASSERT(!this->my_is_in_list.load(std::memory_order_relaxed), "Still in the queue?"); } void reset() override { base_type::reset(); spin_wait_until_eq(this->my_notify_calls, 1); my_notify_calls.store(0, std::memory_order_relaxed); } // notify is called (perhaps, concurrently) twice from: // - concurrent_monitor::notify // - post_resume_action::register_waiter // The second notify is called after thread switches the stack // (Because we can not call resume while the stack is occupied) // We need calling resume only when both notifications are performed. 
void notify() override { if (++my_notify_calls == 2) { r1::resume(my_suspend_point); } } private: friend class thread_data; friend struct suspend_point_type::resume_task; task_dispatcher* my_curr_dispatcher; task_dispatcher* my_target_dispatcher; suspend_point_type* my_suspend_point; std::atomic my_notify_calls{0}; }; #endif // __TBB_RESUMABLE_TASKS class thread_control_monitor : public concurrent_monitor_base { using base_type = concurrent_monitor_base; public: using base_type::base_type; ~thread_control_monitor() { destroy(); } /** per-thread descriptor for concurrent_monitor */ using thread_context = sleep_node; #if __TBB_RESUMABLE_TASKS using resume_context = resume_node; #endif }; } // namespace r1 } // namespace detail } // namespace tbb #endif // __TBB_thread_control_monitor_H ================================================ FILE: third-party/tbb/src/tbb/thread_data.h ================================================ /* Copyright (c) 2020-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_thread_data_H #define __TBB_thread_data_H #include "oneapi/tbb/detail/_task.h" #include "oneapi/tbb/task.h" #include "rml_base.h" // rml::job #include "scheduler_common.h" #include "arena.h" #include "concurrent_monitor.h" #include "mailbox.h" #include "misc.h" // FastRandom #include "small_object_pool_impl.h" #include "intrusive_list.h" #include namespace tbb { namespace detail { namespace r1 { class task; class arena_slot; class task_group_context; class task_dispatcher; class thread_dispatcher_client; class context_list : public intrusive_list { public: bool orphaned{false}; //! Last state propagation epoch known to this thread /** Together with the_context_state_propagation_epoch constitute synchronization protocol that keeps hot path of task group context construction destruction mostly lock-free. When local epoch equals the global one, the state of task group contexts registered with this thread is consistent with that of the task group trees they belong to. **/ std::atomic epoch{}; //! Mutex protecting access to the list of task group contexts. 
d1::mutex m_mutex{}; void destroy() { this->~context_list(); cache_aligned_deallocate(this); } void remove(d1::intrusive_list_node& val) { mutex::scoped_lock lock(m_mutex); intrusive_list::remove(val); if (orphaned && empty()) { lock.release(); destroy(); } } void push_front(d1::intrusive_list_node& val) { mutex::scoped_lock lock(m_mutex); intrusive_list::push_front(val); } void orphan() { mutex::scoped_lock lock(m_mutex); orphaned = true; if (empty()) { lock.release(); destroy(); } } }; //------------------------------------------------------------------------ // Thread Data //------------------------------------------------------------------------ class thread_data : public ::rml::job , public d1::intrusive_list_node , no_copy { public: thread_data(unsigned short index, bool is_worker) : my_arena_index{ index } , my_is_worker{ is_worker } , my_is_registered { false } , my_task_dispatcher{ nullptr } , my_arena{ nullptr } , my_last_client{ nullptr } , my_arena_slot{} , my_random{ this } , my_last_observer{ nullptr } , my_small_object_pool{new (cache_aligned_allocate(sizeof(small_object_pool_impl))) small_object_pool_impl{}} , my_context_list(new (cache_aligned_allocate(sizeof(context_list))) context_list{}) #if __TBB_RESUMABLE_TASKS , my_post_resume_action{ task_dispatcher::post_resume_action::none } , my_post_resume_arg{nullptr} #endif /* __TBB_RESUMABLE_TASKS */ { ITT_SYNC_CREATE(&my_context_list->m_mutex, SyncType_Scheduler, SyncObj_ContextsList); } ~thread_data() { my_context_list->orphan(); my_small_object_pool->destroy(); poison_pointer(my_task_dispatcher); poison_pointer(my_arena); poison_pointer(my_arena_slot); poison_pointer(my_last_observer); poison_pointer(my_small_object_pool); poison_pointer(my_context_list); #if __TBB_RESUMABLE_TASKS poison_pointer(my_post_resume_arg); #endif /* __TBB_RESUMABLE_TASKS */ } void attach_arena(arena& a, std::size_t index); bool is_attached_to(arena*); void attach_task_dispatcher(task_dispatcher&); void 
detach_task_dispatcher(); void enter_task_dispatcher(task_dispatcher& task_disp, std::uintptr_t stealing_threshold); void leave_task_dispatcher(); void propagate_task_group_state(std::atomic d1::task_group_context::* mptr_state, d1::task_group_context& src, uint32_t new_state); //! Index of the arena slot the scheduler occupies now, or occupied last time unsigned short my_arena_index; //! Indicates if the thread is created by RML const bool my_is_worker; bool my_is_registered; //! The current task dipsatcher task_dispatcher* my_task_dispatcher; //! The arena that I own (if external thread) or am servicing at the moment (if worker) arena* my_arena; thread_dispatcher_client* my_last_client; //! Pointer to the slot in the arena we own at the moment arena_slot* my_arena_slot; //! The mailbox (affinity mechanism) the current thread attached to mail_inbox my_inbox; //! The random generator FastRandom my_random; //! Last observer in the observers list processed on this slot observer_proxy* my_last_observer; //! Pool of small object for fast task allocation small_object_pool_impl* my_small_object_pool; context_list* my_context_list; #if __TBB_RESUMABLE_TASKS //! Suspends the current coroutine (task_dispatcher). void suspend(void* suspend_callback, void* user_callback); //! Resumes the target task_dispatcher. void resume(task_dispatcher& target); //! Set post resume action to perform after resume. void set_post_resume_action(task_dispatcher::post_resume_action pra, void* arg) { __TBB_ASSERT(my_post_resume_action == task_dispatcher::post_resume_action::none, "The Post resume action must not be set"); __TBB_ASSERT(!my_post_resume_arg, "The post resume action must not have an argument"); my_post_resume_action = pra; my_post_resume_arg = arg; } void clear_post_resume_action() { my_post_resume_action = task_dispatcher::post_resume_action::none; my_post_resume_arg = nullptr; } //! The post resume action requested after the swap contexts. 
task_dispatcher::post_resume_action my_post_resume_action; //! The post resume action argument. void* my_post_resume_arg; #endif /* __TBB_RESUMABLE_TASKS */ //! The default context // TODO: consider using common default context because it is used only to simplify // cancellation check. d1::task_group_context my_default_context; }; inline void thread_data::attach_arena(arena& a, std::size_t index) { my_arena = &a; my_arena_index = static_cast(index); my_arena_slot = a.my_slots + index; // Read the current slot mail_outbox and attach it to the mail_inbox (remove inbox later maybe) my_inbox.attach(my_arena->mailbox(index)); } inline bool thread_data::is_attached_to(arena* a) { return my_arena == a; } inline void thread_data::attach_task_dispatcher(task_dispatcher& task_disp) { __TBB_ASSERT(my_task_dispatcher == nullptr, nullptr); __TBB_ASSERT(task_disp.m_thread_data == nullptr, nullptr); task_disp.m_thread_data = this; my_task_dispatcher = &task_disp; } inline void thread_data::detach_task_dispatcher() { __TBB_ASSERT(my_task_dispatcher != nullptr, nullptr); __TBB_ASSERT(my_task_dispatcher->m_thread_data == this, nullptr); my_task_dispatcher->m_thread_data = nullptr; my_task_dispatcher = nullptr; } inline void thread_data::enter_task_dispatcher(task_dispatcher& task_disp, std::uintptr_t stealing_threshold) { task_disp.set_stealing_threshold(stealing_threshold); attach_task_dispatcher(task_disp); } inline void thread_data::leave_task_dispatcher() { my_task_dispatcher->set_stealing_threshold(0); detach_task_dispatcher(); } inline void thread_data::propagate_task_group_state(std::atomic d1::task_group_context::* mptr_state, d1::task_group_context& src, std::uint32_t new_state) { mutex::scoped_lock lock(my_context_list->m_mutex); // Acquire fence is necessary to ensure that the subsequent node->my_next load // returned the correct value in case it was just inserted in another thread. // The fence also ensures visibility of the correct ctx.my_parent value. 
for (context_list::iterator it = my_context_list->begin(); it != my_context_list->end(); ++it) { d1::task_group_context& ctx = __TBB_get_object_ref(d1::task_group_context, my_node, &(*it)); if ((ctx.*mptr_state).load(std::memory_order_relaxed) != new_state) task_group_context_impl::propagate_task_group_state(ctx, mptr_state, src, new_state); } // Sync up local propagation epoch with the global one. Release fence prevents // reordering of possible store to *mptr_state after the sync point. my_context_list->epoch.store(the_context_state_propagation_epoch.load(std::memory_order_relaxed), std::memory_order_release); } } // namespace r1 } // namespace detail } // namespace tbb #endif // __TBB_thread_data_H ================================================ FILE: third-party/tbb/src/tbb/thread_dispatcher.cpp ================================================ /* Copyright (c) 2022-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "thread_dispatcher.h" #include "threading_control.h" namespace tbb { namespace detail { namespace r1 { thread_dispatcher::thread_dispatcher(threading_control& tc, unsigned hard_limit, std::size_t stack_size) : my_threading_control(tc) , my_num_workers_hard_limit(hard_limit) , my_stack_size(stack_size) { my_server = governor::create_rml_server( *this ); __TBB_ASSERT( my_server, "Failed to create RML server" ); } thread_dispatcher::~thread_dispatcher() { poison_pointer(my_server); } thread_dispatcher_client* thread_dispatcher::select_next_client(thread_dispatcher_client* hint) { unsigned next_client_priority_level = num_priority_levels; if (hint) { next_client_priority_level = hint->priority_level(); } for (unsigned idx = 0; idx < next_client_priority_level; ++idx) { if (!my_client_list[idx].empty()) { return &*my_client_list[idx].begin(); } } return hint; } thread_dispatcher_client* thread_dispatcher::create_client(arena& a) { return new (cache_aligned_allocate(sizeof(thread_dispatcher_client))) thread_dispatcher_client(a, my_clients_aba_epoch); } void thread_dispatcher::register_client(thread_dispatcher_client* client) { client_list_mutex_type::scoped_lock lock(my_list_mutex); insert_client(*client); } bool thread_dispatcher::try_unregister_client(thread_dispatcher_client* client, std::uint64_t aba_epoch, unsigned priority) { __TBB_ASSERT(client, nullptr); // we hold reference to the server, so market cannot be destroyed at any moment here __TBB_ASSERT(!is_poisoned(my_server), nullptr); my_list_mutex.lock(); for (auto& it : my_client_list[priority]) { if (client == &it) { if (it.get_aba_epoch() == aba_epoch) { // Client is alive // Acquire my_references to sync with threads that just left the arena // Pay attention that references should be read before workers_requested because // if references is no zero some other thread might call adjust_demand and lead to // a race over workers_requested if (!client->references() && !client->has_request()) { // 
Client is abandoned. Destroy it. remove_client(*client); ++my_clients_aba_epoch; my_list_mutex.unlock(); destroy_client(client); return true; } } break; } } my_list_mutex.unlock(); return false; } void thread_dispatcher::destroy_client(thread_dispatcher_client* client) { client->~thread_dispatcher_client(); cache_aligned_deallocate(client); } // Should be called under lock void thread_dispatcher::insert_client(thread_dispatcher_client& client) { __TBB_ASSERT(client.priority_level() < num_priority_levels, nullptr); my_client_list[client.priority_level()].push_front(client); __TBB_ASSERT(!my_next_client || my_next_client->priority_level() < num_priority_levels, nullptr); my_next_client = select_next_client(my_next_client); } // Should be called under lock void thread_dispatcher::remove_client(thread_dispatcher_client& client) { __TBB_ASSERT(client.priority_level() < num_priority_levels, nullptr); my_client_list[client.priority_level()].remove(client); if (my_next_client == &client) { my_next_client = nullptr; } my_next_client = select_next_client(my_next_client); } bool thread_dispatcher::is_client_alive(thread_dispatcher_client* client) { if (!client) { return false; } // Still cannot access internals of the client since the object itself might be destroyed. for (auto& priority_list : my_client_list) { for (auto& c : priority_list) { if (client == &c) { return true; } } } return false; } thread_dispatcher_client* thread_dispatcher::client_in_need(client_list_type* clients, thread_dispatcher_client* hint) { // TODO: make sure client with higher priority returned only if there are available slots in it. 
hint = select_next_client(hint); if (!hint) { return nullptr; } client_list_type::iterator it = hint; unsigned curr_priority_level = hint->priority_level(); __TBB_ASSERT(it != clients[curr_priority_level].end(), nullptr); do { thread_dispatcher_client& t = *it; if (++it == clients[curr_priority_level].end()) { do { ++curr_priority_level %= num_priority_levels; } while (clients[curr_priority_level].empty()); it = clients[curr_priority_level].begin(); } if (t.try_join()) { return &t; } } while (it != hint); return nullptr; } thread_dispatcher_client* thread_dispatcher::client_in_need(thread_dispatcher_client* prev) { client_list_mutex_type::scoped_lock lock(my_list_mutex, /*is_writer=*/false); if (is_client_alive(prev)) { return client_in_need(my_client_list, prev); } return client_in_need(my_client_list, my_next_client); } bool thread_dispatcher::is_any_client_in_need() { client_list_mutex_type::scoped_lock lock(my_list_mutex, /*is_writer=*/false); for (auto& priority_list : my_client_list) { for (auto& client : priority_list) { if (client.is_joinable()) { return true; } } } return false; } void thread_dispatcher::adjust_job_count_estimate(int delta) { my_server->adjust_job_count_estimate(delta); } void thread_dispatcher::release(bool blocking_terminate) { my_join_workers = blocking_terminate; my_server->request_close_connection(); } void thread_dispatcher::process(job& j) { thread_data& td = static_cast(j); // td.my_last_client can be dead. Don't access it until client_in_need is called thread_dispatcher_client* client = td.my_last_client; for (int i = 0; i < 2; ++i) { while ((client = client_in_need(client)) ) { td.my_last_client = client; client->process(td); } // Workers leave thread_dispatcher because there is no client in need. It can happen earlier than // adjust_job_count_estimate() decreases my_slack and RML can put this thread to sleep. // It might result in a busy-loop checking for my_slack<0 and calling this method instantly. 
// the yield refines this spinning. if ( !i ) { yield(); } } } //! Used when RML asks for join mode during workers termination. bool thread_dispatcher::must_join_workers() const { return my_join_workers; } //! Returns the requested stack size of worker threads. std::size_t thread_dispatcher::worker_stack_size() const { return my_stack_size; } void thread_dispatcher::acknowledge_close_connection() { my_threading_control.destroy(); } ::rml::job* thread_dispatcher::create_one_job() { unsigned short index = ++my_first_unused_worker_idx; __TBB_ASSERT(index > 0, nullptr); ITT_THREAD_SET_NAME(_T("TBB Worker Thread")); // index serves as a hint decreasing conflicts between workers when they migrate between arenas thread_data* td = new (cache_aligned_allocate(sizeof(thread_data))) thread_data{ index, true }; __TBB_ASSERT(index <= my_num_workers_hard_limit, nullptr); my_threading_control.register_thread(*td); return td; } void thread_dispatcher::cleanup(job& j) { my_threading_control.unregister_thread(static_cast(j)); governor::auto_terminate(&j); } } // namespace r1 } // namespace detail } // namespace tbb ================================================ FILE: third-party/tbb/src/tbb/thread_dispatcher.h ================================================ /* Copyright (c) 2022-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef _TBB_thread_dispatcher_H #define _TBB_thread_dispatcher_H #include "oneapi/tbb/detail/_config.h" #include "oneapi/tbb/detail/_utils.h" #include "oneapi/tbb/rw_mutex.h" #include "oneapi/tbb/task_arena.h" #include "arena.h" #include "governor.h" #include "thread_data.h" #include "rml_tbb.h" #include "thread_dispatcher_client.h" namespace tbb { namespace detail { namespace r1 { class threading_control_impl; class thread_dispatcher : no_copy, rml::tbb_client { using client_list_type = intrusive_list; using client_list_mutex_type = d1::rw_mutex; public: thread_dispatcher(threading_control& tc, unsigned hard_limit, std::size_t stack_size); ~thread_dispatcher(); thread_dispatcher_client* create_client(arena& a); void register_client(thread_dispatcher_client* client); bool try_unregister_client(thread_dispatcher_client* client, std::uint64_t aba_epoch, unsigned priority); bool is_any_client_in_need(); void adjust_job_count_estimate(int delta); void release(bool blocking_terminate); void process(job& j) override; //! Used when RML asks for join mode during workers termination. bool must_join_workers() const; //! Returns the requested stack size of worker threads. 
std::size_t worker_stack_size() const; private: version_type version () const override { return 0; } unsigned max_job_count () const override { return my_num_workers_hard_limit; } std::size_t min_stack_size () const override { return worker_stack_size(); } void cleanup(job& j) override; void acknowledge_close_connection() override; ::rml::job* create_one_job() override; thread_dispatcher_client* select_next_client(thread_dispatcher_client* hint); void destroy_client(thread_dispatcher_client* client); void insert_client(thread_dispatcher_client& client); void remove_client(thread_dispatcher_client& client); bool is_client_alive(thread_dispatcher_client* client); thread_dispatcher_client* client_in_need(client_list_type* clients, thread_dispatcher_client* hint); thread_dispatcher_client* client_in_need(thread_dispatcher_client* prev); friend class threading_control_impl; static constexpr unsigned num_priority_levels = d1::num_priority_levels; client_list_mutex_type my_list_mutex; client_list_type my_client_list[num_priority_levels]; thread_dispatcher_client* my_next_client{nullptr}; //! Shutdown mode bool my_join_workers{false}; threading_control& my_threading_control; //! ABA prevention marker to assign to newly created clients std::atomic my_clients_aba_epoch{0}; //! Maximal number of workers allowed for use by the underlying resource manager /** It can't be changed after thread_dispatcher creation. **/ unsigned my_num_workers_hard_limit{0}; //! Stack size of worker threads std::size_t my_stack_size{0}; //! First unused index of worker /** Used to assign indices to the new workers coming from RML **/ std::atomic my_first_unused_worker_idx{0}; //! Pointer to the RML server object that services this TBB instance. 
rml::tbb_server* my_server{nullptr}; }; } // namespace r1 } // namespace detail } // namespace tbb #endif // _TBB_thread_dispatcher_H ================================================ FILE: third-party/tbb/src/tbb/thread_dispatcher_client.h ================================================ /* Copyright (c) 2022-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef _TBB_thread_dispatcher_client_H #define _TBB_thread_dispatcher_client_H #include "oneapi/tbb/detail/_intrusive_list_node.h" #include "arena.h" namespace tbb { namespace detail { namespace r1 { class thread_dispatcher_client : public d1::intrusive_list_node /* Need for list in thread pool */ { public: thread_dispatcher_client(arena& a, std::uint64_t aba_epoch) : my_arena(a), my_aba_epoch(aba_epoch) {} // Interface of communication with thread pool bool try_join() { return my_arena.try_join(); } bool is_joinable() { return my_arena.is_joinable(); } void process(thread_data& td) { my_arena.process(td); } unsigned priority_level() { return my_arena.priority_level(); } std::uint64_t get_aba_epoch() { return my_aba_epoch; } unsigned references() { return my_arena.references(); } bool has_request() { return my_arena.has_request(); } private: arena& my_arena; std::uint64_t my_aba_epoch; }; } // namespace r1 } // namespace detail } // namespace tbb #endif // _TBB_thread_dispatcher_client_H ================================================ FILE: third-party/tbb/src/tbb/thread_request_serializer.cpp 
================================================
/*
    Copyright (c) 2022-2024 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include "misc.h"
#include "thread_request_serializer.h"

namespace tbb {
namespace detail {
namespace r1 {

thread_request_serializer::thread_request_serializer(thread_dispatcher& td, int soft_limit)
    : my_thread_dispatcher(td)
    , my_soft_limit(soft_limit)
{}

//! Accumulates a change in the requested number of workers and, on one thread
//! at a time, forwards the limited result to the thread dispatcher.
void thread_request_serializer::update(int delta) {
    // The low bits of my_pending_delta hold the accumulated delta, biased by
    // pending_delta_base; every call also adds counter_value, which lies just
    // above the bits covered by delta_mask.
    constexpr std::uint64_t delta_mask = (pending_delta_base << 1) - 1;
    constexpr std::uint64_t counter_value = delta_mask + 1;

    int prev_pending_delta = my_pending_delta.fetch_add(counter_value + delta);

    // This is a pseudo request aggregator: only the thread that observed the idle
    // value (pending_delta_base) in my_pending_delta enters the critical section
    // and calls adjust_job_count_estimate.
    if (prev_pending_delta == pending_delta_base) {
        // Take everything accumulated so far and reset the accumulator to idle.
        delta = int(my_pending_delta.exchange(pending_delta_base) & delta_mask) - int(pending_delta_base);
        mutex_type::scoped_lock lock(my_mutex);
        my_total_request.store(my_total_request.load(std::memory_order_relaxed) + delta, std::memory_order_relaxed);
        delta = limit_delta(delta, my_soft_limit, my_total_request.load(std::memory_order_relaxed));
        my_thread_dispatcher.adjust_job_count_estimate(delta);
    }
}

//! Changes the soft limit and forwards the resulting change in the effective
//! request (total request capped by the limit) to the thread dispatcher.
void thread_request_serializer::set_active_num_workers(int soft_limit) {
    mutex_type::scoped_lock lock(my_mutex);
    int delta = soft_limit - my_soft_limit;
    // Here the roles are swapped relative to update(): the total request acts as
    // the cap and the soft limit is the value that changes.
    delta = limit_delta(delta, my_total_request.load(std::memory_order_relaxed), soft_limit);
    my_thread_dispatcher.adjust_job_count_estimate(delta);
    my_soft_limit = soft_limit;
}

//! Returns the change of min(limit, value) when value moves from
//! (new_value - delta) to new_value.
int thread_request_serializer::limit_delta(int delta, int limit, int new_value) {
    // This method can be described with the following pseudocode:
    // bool above_limit = prev_value >= limit && new_value >= limit;
    // bool below_limit = prev_value <= limit && new_value <= limit;
    // enum request_type { ABOVE_LIMIT, CROSS_LIMIT, BELOW_LIMIT };
    // request = above_limit ? ABOVE_LIMIT : below_limit ? BELOW_LIMIT : CROSS_LIMIT;

    // switch (request) {
    // case ABOVE_LIMIT:
    //     delta = 0;
    // case CROSS_LIMIT:
    //     delta = delta > 0 ? limit - prev_value : new_value - limit;
    // case BELOW_LIMIT:
    //     // No changes to delta
    // }

    int prev_value = new_value - delta;

    // actual new_value and prev_value cannot exceed the limit
    new_value = min(limit, new_value);
    prev_value = min(limit, prev_value);
    return new_value - prev_value;
}


thread_request_serializer_proxy::thread_request_serializer_proxy(thread_dispatcher& td, int soft_limit) : my_serializer(td, soft_limit)
{}

//! Adjusts the count of mandatory requests and toggles mandatory concurrency
//! when the count leaves zero or goes from one to a lower value.
void thread_request_serializer_proxy::register_mandatory_request(int mandatory_delta) {
    if (mandatory_delta != 0) {
        // Taken as a reader; the enable/disable helpers upgrade it to a writer.
        mutex_type::scoped_lock lock(my_mutex, /* is_write = */ false);
        int prev_value = my_num_mandatory_requests.fetch_add(mandatory_delta);

        const bool should_try_enable = mandatory_delta > 0 && prev_value == 0;
        const bool should_try_disable = mandatory_delta < 0 && prev_value == 1;

        if (should_try_enable) {
            enable_mandatory_concurrency(lock);
        } else if (should_try_disable) {
            disable_mandatory_concurrency(lock);
        }
    }
}

//! Applies a new soft limit; a zero limit is replaced by one while mandatory
//! requests are outstanding.
void thread_request_serializer_proxy::set_active_num_workers(int soft_limit) {
    mutex_type::scoped_lock lock(my_mutex, /* is_write = */ true);

    if (soft_limit != 0) {
        my_is_mandatory_concurrency_enabled = false;
    } else if (my_num_mandatory_requests > 0) {
        my_is_mandatory_concurrency_enabled = true;
        soft_limit = 1;
    }

    my_serializer.set_active_num_workers(soft_limit);
}

int thread_request_serializer_proxy::num_workers_requested() { return my_serializer.num_workers_requested(); }

void thread_request_serializer_proxy::update(int delta) { my_serializer.update(delta); }

//! Called with the reader lock held; upgrades it and re-checks the conditions
//! before setting the serializer's limit to one.
void thread_request_serializer_proxy::enable_mandatory_concurrency(mutex_type::scoped_lock& lock) {
    lock.upgrade_to_writer();
    bool still_should_enable = my_num_mandatory_requests.load(std::memory_order_relaxed) > 0 &&
            !my_is_mandatory_concurrency_enabled && my_serializer.is_no_workers_avaliable();

    if (still_should_enable) {
        my_is_mandatory_concurrency_enabled = true;
        my_serializer.set_active_num_workers(1);
    }
}

//! Called with the reader lock held; upgrades it and re-checks the conditions
//! before setting the serializer's limit back to zero.
void thread_request_serializer_proxy::disable_mandatory_concurrency(mutex_type::scoped_lock& lock) {
    lock.upgrade_to_writer();
    bool still_should_disable = my_num_mandatory_requests.load(std::memory_order_relaxed) <= 0 &&
            my_is_mandatory_concurrency_enabled && !my_serializer.is_no_workers_avaliable();

    if (still_should_disable) {
        my_is_mandatory_concurrency_enabled = false;
        my_serializer.set_active_num_workers(0);
    }
}

} // r1
} // detail
} // tbb

================================================
FILE: third-party/tbb/src/tbb/thread_request_serializer.h
================================================
/*
    Copyright (c) 2022-2024 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/ #ifndef _TBB_thread_serializer_handlers_H #define _TBB_thread_serializer_handlers_H #include "oneapi/tbb/mutex.h" #include "oneapi/tbb/rw_mutex.h" #include "thread_dispatcher.h" namespace tbb { namespace detail { namespace r1 { class thread_request_observer { protected: virtual ~thread_request_observer() {} public: virtual void update(int delta) = 0; }; class thread_request_serializer : public thread_request_observer { using mutex_type = d1::mutex; public: thread_request_serializer(thread_dispatcher& td, int soft_limit); void set_active_num_workers(int soft_limit); int num_workers_requested() { return my_total_request.load(std::memory_order_relaxed); } bool is_no_workers_avaliable() { return my_soft_limit == 0; } private: friend class thread_request_serializer_proxy; void update(int delta) override; static int limit_delta(int delta, int limit, int new_value); thread_dispatcher& my_thread_dispatcher; int my_soft_limit{ 0 }; std::atomic my_total_request{ 0 }; // my_pending_delta is set to pending_delta_base to have ability to hold negative values // consider increase base since thead number will be bigger than 1 << 15 static constexpr std::uint64_t pending_delta_base = 1 << 15; std::atomic my_pending_delta{ pending_delta_base }; mutex_type my_mutex; }; // Handles mandatory concurrency i.e. 
enables worker threads for enqueue tasks class thread_request_serializer_proxy : public thread_request_observer { using mutex_type = d1::rw_mutex; public: thread_request_serializer_proxy(thread_dispatcher& td, int soft_limit); void register_mandatory_request(int mandatory_delta); void set_active_num_workers(int soft_limit); int num_workers_requested(); private: void update(int delta) override; void enable_mandatory_concurrency(mutex_type::scoped_lock& lock); void disable_mandatory_concurrency(mutex_type::scoped_lock& lock); std::atomic my_num_mandatory_requests{0}; bool my_is_mandatory_concurrency_enabled{false}; thread_request_serializer my_serializer; mutex_type my_mutex; }; } // namespace r1 } // namespace detail } // namespace tbb #endif // _TBB_thread_serializer_handlers_H ================================================ FILE: third-party/tbb/src/tbb/threading_control.cpp ================================================ /* Copyright (c) 2022-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/

// NOTE(review): in this copy of the file the text between angle brackets appears to have been
// lost (e.g. a bare `std::pair`, `std::atomic`, `cache_aligned_unique_ptr` and
// `make_cache_aligned_unique(...)` with no template arguments). The tokens below are kept exactly
// as found; restore the template arguments from upstream oneTBB before compiling.

#include "threading_control.h"
#include "permit_manager.h"
#include "market.h"
#include "tcm_adaptor.h"
#include "thread_dispatcher.h"
#include "governor.h"
#include "thread_dispatcher_client.h"

namespace tbb {
namespace detail {
namespace r1 {

// ---------------------------------------- threading_control_impl --------------------------------------------------------------

std::size_t global_control_active_value_unsafe(d1::global_control::parameter);

//! Computes the worker limits used to set up the implementation.
/** Returns a pair (soft limit, hard limit). The hard limit is the largest of
    factor * governor::default_num_threads(), 256, and the currently active
    max_allowed_parallelism value; the soft limit is derived from it by
    calc_workers_soft_limit(). */
std::pair threading_control_impl::calculate_workers_limits() {
    // Expecting that 4P is suitable for most applications.
    // Limit to 2P for large thread number.
    // TODO: ask RML for max concurrency and possibly correct hard_limit
    unsigned factor = governor::default_num_threads() <= 128 ? 4 : 2;

    // The requested number of threads is intentionally not considered in
    // computation of the hard limit, in order to separate responsibilities
    // and avoid complicated interactions between global_control and task_scheduler_init.
    // The threading control guarantees that at least 256 threads might be created.
    unsigned workers_app_limit = global_control_active_value_unsafe(global_control::max_allowed_parallelism);
    unsigned workers_hard_limit = max(max(factor * governor::default_num_threads(), 256u), workers_app_limit);
    unsigned workers_soft_limit = calc_workers_soft_limit(workers_hard_limit);

    return std::make_pair(workers_soft_limit, workers_hard_limit);
}

//! Computes the soft limit on the number of workers.
/** Uses the active max_allowed_parallelism value minus one when that value is non-zero, otherwise
    governor::default_num_threads() minus one. The result is capped at workers_hard_limit - 1. */
unsigned threading_control_impl::calc_workers_soft_limit(unsigned workers_hard_limit) {
    unsigned workers_soft_limit{};
    unsigned soft_limit = global_control_active_value_unsafe(global_control::max_allowed_parallelism);

    // if user set no limits (yet), use default value
    workers_soft_limit = soft_limit != 0 ? soft_limit - 1 : governor::default_num_threads() - 1;

    // Keep the soft limit strictly below the hard limit.
    if (workers_soft_limit >= workers_hard_limit) {
        workers_soft_limit = workers_hard_limit - 1;
    }

    return workers_soft_limit;
}

//! Creates the permit manager.
/** When tcm_adaptor::is_initialized() is true, an object is created first and returned if it
    reports is_connected(); otherwise a fallback object is constructed from workers_soft_limit.
    NOTE(review): the concrete types (presumably tcm_adaptor and market) are not visible in this
    copy because the template arguments are missing - confirm against upstream. */
cache_aligned_unique_ptr threading_control_impl::make_permit_manager(unsigned workers_soft_limit) {
    if (tcm_adaptor::is_initialized()) {
        auto tcm = make_cache_aligned_unique();
        if (tcm->is_connected()) {
            return tcm;
        }
    }
    return make_cache_aligned_unique(workers_soft_limit);
}

//! Creates the thread dispatcher for the given threading control.
/** The dispatcher is constructed from tc, workers_hard_limit and the active thread_stack_size
    value. workers_soft_limit is used only for the diagnostic below. */
cache_aligned_unique_ptr threading_control_impl::make_thread_dispatcher(threading_control& tc,
                                                                        unsigned workers_soft_limit,
                                                                        unsigned workers_hard_limit)
{
    stack_size_type stack_size = global_control_active_value_unsafe(global_control::thread_stack_size);

    cache_aligned_unique_ptr td = make_cache_aligned_unique(tc, workers_hard_limit, stack_size);
    // This check relies on the fact that for shared RML default_concurrency == max_concurrency
    if (!governor::UsePrivateRML && td->my_server->default_concurrency() < workers_soft_limit) {
        runtime_warning("RML might limit the number of workers to %u while %u is requested.\n",
            td->my_server->default_concurrency(), workers_soft_limit);
    }

    return td;
}

//! Builds all sub-components for the threading control tc.
/** Order matters: the thread request serializer is constructed from the thread dispatcher, and
    it is then installed as the permit manager's thread request observer. */
threading_control_impl::threading_control_impl(threading_control* tc) {
    unsigned workers_soft_limit{}, workers_hard_limit{};
    std::tie(workers_soft_limit, workers_hard_limit) = calculate_workers_limits();

    my_permit_manager = make_permit_manager(workers_soft_limit);
    my_thread_dispatcher = make_thread_dispatcher(*tc, workers_soft_limit, workers_hard_limit);
    my_thread_request_serializer = make_cache_aligned_unique(*my_thread_dispatcher, workers_soft_limit);
    my_permit_manager->set_thread_request_observer(*my_thread_request_serializer);

    my_cancellation_disseminator = make_cache_aligned_unique();
    my_waiting_threads_monitor = make_cache_aligned_unique();
}

//! Forwards the release request (and the blocking_terminate flag) to the thread dispatcher.
void threading_control_impl::release(bool blocking_terminate) {
    my_thread_dispatcher->release(blocking_terminate);
}

//! Applies a new soft limit to the thread request serializer and then to the permit manager.
/** soft_limit must not exceed the dispatcher's hard limit (checked by assertion only). */
void threading_control_impl::set_active_num_workers(unsigned soft_limit) {
    __TBB_ASSERT(soft_limit <= my_thread_dispatcher->my_num_workers_hard_limit, nullptr);
    my_thread_request_serializer->set_active_num_workers(soft_limit);
    my_permit_manager->set_active_num_workers(soft_limit);
}

//! Creates a permit-manager client and a thread-dispatcher client for arena a and bundles them.
threading_control_client threading_control_impl::create_client(arena& a) {
    pm_client* pm_client = my_permit_manager->create_client(a);
    thread_dispatcher_client* td_client = my_thread_dispatcher->create_client(a);

    return threading_control_client{pm_client, td_client};
}

//! Captures the data later needed by try_destroy_client().
/** The snapshot holds the dispatcher client's ABA epoch and priority level together with both
    client pointers. */
threading_control_impl::client_snapshot threading_control_impl::prepare_client_destruction(threading_control_client client) {
    auto td_client = client.get_thread_dispatcher_client();
    return {td_client->get_aba_epoch(), td_client->priority_level(), td_client, client.get_pm_client()};
}

//! Tries to unregister the dispatcher client described by snapshot.
/** Only when the dispatcher reports success is the permit-manager client unregistered and
    destroyed. Returns whether the client was destroyed. */
bool threading_control_impl::try_destroy_client(threading_control_impl::client_snapshot snapshot) {
    if (my_thread_dispatcher->try_unregister_client(snapshot.my_td_client, snapshot.aba_epoch, snapshot.priority_level)) {
        my_permit_manager->unregister_and_destroy_client(*snapshot.my_pm_client);
        return true;
    }
    return false;
}

//! Registers the client with the permit manager (with constraints) and then with the thread dispatcher.
void threading_control_impl::publish_client(threading_control_client tc_client, d1::constraints& constraints) {
    my_permit_manager->register_client(tc_client.get_pm_client(), constraints);
    my_thread_dispatcher->register_client(tc_client.get_thread_dispatcher_client());
}

//! Registers td with the cancellation disseminator.
void threading_control_impl::register_thread(thread_data& td) {
    my_cancellation_disseminator->register_thread(td);
}

//! Unregisters td from the cancellation disseminator.
void threading_control_impl::unregister_thread(thread_data& td) {
    my_cancellation_disseminator->unregister_thread(td);
}

//! Forwards task group state propagation to the cancellation disseminator.
void threading_control_impl::propagate_task_group_state(std::atomic d1::task_group_context::*mptr_state,
                                                        d1::task_group_context& src, uint32_t new_state)
{
    my_cancellation_disseminator->propagate_task_group_state(mptr_state, src, new_state);
}

//! Returns the worker stack size reported by the thread dispatcher.
std::size_t threading_control_impl::worker_stack_size() {
    return my_thread_dispatcher->worker_stack_size();
}

//! Returns the thread dispatcher's hard limit on the number of workers.
unsigned threading_control_impl::max_num_workers() {
    return my_thread_dispatcher->my_num_workers_hard_limit;
}

//! Records the mandatory-request delta in the serializer, then forwards both deltas to the permit manager.
void threading_control_impl::adjust_demand(threading_control_client tc_client, int mandatory_delta, int workers_delta) {
    auto& c = *tc_client.get_pm_client();
    my_thread_request_serializer->register_mandatory_request(mandatory_delta);
    my_permit_manager->adjust_demand(c, mandatory_delta, workers_delta);
}

//! Returns false when no workers are requested; otherwise asks the dispatcher whether any client is in need.
bool threading_control_impl::is_any_other_client_active() {
    return my_thread_request_serializer->num_workers_requested() > 0 ? my_thread_dispatcher->is_any_client_in_need() : false;
}

//! Returns the monitor object created in the constructor.
thread_control_monitor& threading_control_impl::get_waiting_threads_monitor() {
    return *my_waiting_threads_monitor;
}

// ---------------------------------------- threading_control -------------------------------------------------------------------

// Defined in global_control.cpp
void global_control_lock();
void global_control_unlock();

//! Increments the overall reference count and, for a public reference, the public count as well.
void threading_control::add_ref(bool is_public) {
    ++my_ref_count;
    if (is_public) {
        my_public_ref_count++;
    }
}

//! Drops one reference (and one public reference when is_public is set).
/** Returns true when the overall count reached zero; in that case g_threading_control is reset
    to nullptr. The object itself is not destroyed here. */
bool threading_control::remove_ref(bool is_public) {
    if (is_public) {
        __TBB_ASSERT(g_threading_control == this, "Global threading control instance was destroyed prematurely?");
        __TBB_ASSERT(my_public_ref_count.load(std::memory_order_relaxed), nullptr);
        --my_public_ref_count;
    }

    bool is_last_ref = --my_ref_count == 0;
    if (is_last_ref) {
        __TBB_ASSERT(!my_public_ref_count.load(std::memory_order_relaxed), nullptr);
        g_threading_control = nullptr;
    }

    return is_last_ref;
}

//! Returns g_threading_control after adding a reference to it, or nullptr when none exists.
/** Every caller visible in this file holds g_threading_control_mutex around this call. */
threading_control* threading_control::get_threading_control(bool is_public) {
    threading_control* control = g_threading_control;
    if (control) {
        control->add_ref(is_public);
    }

    return control;
}

//! Returns the global threading control with a public reference added, creating it if absent.
/** A newly created instance starts with one public and one overall reference; both counts are
    incremented once more when the scheduler_handle global control value is non-zero. */
threading_control* threading_control::create_threading_control() {
    // Global control should be locked before threading_control_impl
    global_control_lock();

    threading_control* thr_control{ nullptr };
    try_call([&] {
        global_mutex_type::scoped_lock lock(g_threading_control_mutex);

        // Another thread may have created the instance in the meantime; reuse it if so.
        thr_control = get_threading_control(/*public = */ true);
        if (thr_control == nullptr) {
            thr_control = new (cache_aligned_allocate(sizeof(threading_control))) threading_control(/*public_ref = */ 1, /*private_ref = */ 1);
            thr_control->my_pimpl = make_cache_aligned_unique(thr_control);

            __TBB_InitOnce::add_ref();

            if (global_control_active_value_unsafe(global_control::scheduler_handle)) {
                ++thr_control->my_public_ref_count;
                ++thr_control->my_ref_count;
            }

            g_threading_control = thr_control;
        }
    }).on_exception([&] {
        // NOTE(review): presumably invoked only if the body above throws - confirm against try_call.
        global_control_unlock();

        cache_aligned_deleter deleter{};
        deleter(thr_control);
    });

    global_control_unlock();
    return thr_control;
}

//! Deletes this object through cache_aligned_deleter and drops the __TBB_InitOnce reference.
void threading_control::destroy () {
    cache_aligned_deleter deleter;
    deleter(this);
    __TBB_InitOnce::remove_ref();
}

//! Waits while this is the only public reference but other references remain.
/** lock is released for the duration of the wait and re-acquired on g_threading_control_mutex
    before the outer condition is re-checked. */
void threading_control::wait_last_reference(global_mutex_type::scoped_lock& lock) {
    while (my_public_ref_count.load(std::memory_order_relaxed) == 1 && my_ref_count.load(std::memory_order_relaxed) > 1) {
        lock.release();
        // To guarantee that request_close_connection() is called by the last external thread, we need to wait till all
        // references are released. Re-read my_public_ref_count to limit waiting if new external threads are created.
        // Theoretically, new private references to the threading control can be added during waiting making it potentially
        // endless.
        // TODO: revise why the weak scheduler needs threading control's pointer and try to remove this wait.
        // Note that the threading control should know about its schedulers for cancellation/exception/priority propagation,
        // see e.g. task_group_context::cancel_group_execution()
        while (my_public_ref_count.load(std::memory_order_acquire) == 1 && my_ref_count.load(std::memory_order_acquire) > 1) {
            yield();
        }
        lock.acquire(g_threading_control_mutex);
    }
}

//! Drops one reference under g_threading_control_mutex.
/** With blocking_terminate set (public references only) the call first waits in
    wait_last_reference(). If this was the last reference, the implementation is released outside
    the lock and blocking_terminate is returned; otherwise the result is false. */
bool threading_control::release(bool is_public, bool blocking_terminate) {
    bool do_release = false;
    {
        global_mutex_type::scoped_lock lock(g_threading_control_mutex);
        if (blocking_terminate) {
            __TBB_ASSERT(is_public, "Only an object with a public reference can request the blocking terminate");
            wait_last_reference(lock);
        }
        do_release = remove_ref(is_public);
    }

    if (do_release) {
        __TBB_ASSERT(!my_public_ref_count.load(std::memory_order_relaxed), "No public references must remain if we remove the threading control.");
        // inform RML that blocking termination is required
        my_pimpl->release(blocking_terminate);
        return blocking_terminate;
    }
    return false;
}

//! Initializes the public and overall reference counts.
threading_control::threading_control(unsigned public_ref, unsigned ref) : my_public_ref_count(public_ref), my_ref_count(ref)
{}

//! Adds a public reference to the global threading control, creating it when it does not exist.
threading_control* threading_control::register_public_reference() {
    threading_control* control{nullptr};
    global_mutex_type::scoped_lock lock(g_threading_control_mutex);
    control = get_threading_control(/*public = */ true);
    if (!control) {
        // We are going to create threading_control_impl, we should acquire mutexes in right order
        lock.release();
        control = create_threading_control();
    }

    return control;
}

//! Releases one public reference on the global threading control; returns the result of release().
bool threading_control::unregister_public_reference(bool blocking_terminate) {
    __TBB_ASSERT(g_threading_control, "Threading control should exist until last public reference");
    __TBB_ASSERT(g_threading_control->my_public_ref_count.load(std::memory_order_relaxed), nullptr);
    return g_threading_control->release(/*public = */ true, /*blocking_terminate = */ blocking_terminate);
}

//! Adds a private reference under the global mutex and creates the clients for arena a.
/** The matching release happens in try_destroy_client() when destruction succeeds. */
threading_control_client threading_control::create_client(arena& a) {
    {
        global_mutex_type::scoped_lock lock(g_threading_control_mutex);
        add_ref(/*public = */ false);
    }

    return my_pimpl->create_client(a);
}

//! Forwards to threading_control_impl::publish_client().
void threading_control::publish_client(threading_control_client client, d1::constraints& constraints) {
    return my_pimpl->publish_client(client, constraints);
}

//! Forwards to threading_control_impl::prepare_client_destruction().
threading_control::client_snapshot threading_control::prepare_client_destruction(threading_control_client client) {
    return my_pimpl->prepare_client_destruction(client);
}

//! Tries to destroy the client described by the snapshot; on success drops one private reference.
bool threading_control::try_destroy_client(threading_control::client_snapshot deleter) {
    bool res = my_pimpl->try_destroy_client(deleter);
    if (res) {
        release(/*public = */ false, /*blocking_terminate = */ false);
    }
    return res;
}

//! Applies soft_limit to the global threading control, if one exists.
/** A private reference is taken under the mutex and released again after the update, so the
    update itself runs without holding g_threading_control_mutex. */
void threading_control::set_active_num_workers(unsigned soft_limit) {
    threading_control* thr_control{nullptr};
    {
        global_mutex_type::scoped_lock lock(g_threading_control_mutex);
        thr_control = get_threading_control(/*public = */ false);
    }

    if (thr_control != nullptr) {
        thr_control->my_pimpl->set_active_num_workers(soft_limit);
        thr_control->release(/*is_public=*/false, /*blocking_terminate=*/false);
    }
}

//! Returns whether a global threading control currently exists.
bool threading_control::is_present() {
    global_mutex_type::scoped_lock lock(g_threading_control_mutex);
    return g_threading_control != nullptr;
}

//! Adds a public reference to the existing global threading control; returns false when there is none.
bool threading_control::register_lifetime_control() {
    global_mutex_type::scoped_lock lock(g_threading_control_mutex);
    return get_threading_control(/*public = */ true) != nullptr;
}

//! Releases one public reference on the global threading control.
/** Returns true when no threading control exists; otherwise returns the result of release(). */
bool threading_control::unregister_lifetime_control(bool blocking_terminate) {
    threading_control* thr_control{nullptr};
    {
        global_mutex_type::scoped_lock lock(g_threading_control_mutex);
        thr_control = g_threading_control;
    }

    bool released{true};
    if (thr_control) {
        released = thr_control->release(/*public = */ true, /*blocking_terminate = */ blocking_terminate);
    }

    return released;
}

//! Forwards to threading_control_impl::register_thread().
void threading_control::register_thread(thread_data& td) {
    my_pimpl->register_thread(td);
}

//! Forwards to threading_control_impl::unregister_thread().
void threading_control::unregister_thread(thread_data& td) {
    my_pimpl->unregister_thread(td);
}

//! Forwards to threading_control_impl::propagate_task_group_state().
void threading_control::propagate_task_group_state(std::atomic d1::task_group_context::*mptr_state,
                                                   d1::task_group_context& src, uint32_t new_state)
{
    my_pimpl->propagate_task_group_state(mptr_state, src, new_state);
}

//! Forwards to threading_control_impl::worker_stack_size().
std::size_t threading_control::worker_stack_size() {
    return my_pimpl->worker_stack_size();
}

//! Returns the hard worker limit of the global threading control, or 0 when none exists.
unsigned threading_control::max_num_workers() {
    global_mutex_type::scoped_lock lock(g_threading_control_mutex);
    return g_threading_control ? g_threading_control->my_pimpl->max_num_workers() : 0;
}

//! Forwards to threading_control_impl::adjust_demand().
void threading_control::adjust_demand(threading_control_client client, int mandatory_delta, int workers_delta) {
    my_pimpl->adjust_demand(client, mandatory_delta, workers_delta);
}

//! Forwards to threading_control_impl::is_any_other_client_active().
bool threading_control::is_any_other_client_active() {
    return my_pimpl->is_any_other_client_active();
}

//! Forwards to threading_control_impl::get_waiting_threads_monitor().
thread_control_monitor& threading_control::get_waiting_threads_monitor() {
    return my_pimpl->get_waiting_threads_monitor();
}

} // r1
} // detail
} // tbb

================================================
FILE: third-party/tbb/src/tbb/threading_control.h
================================================
/*
    Copyright (c) 2022-2024 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef _TBB_threading_control_H
#define _TBB_threading_control_H

#include "oneapi/tbb/mutex.h"
#include "oneapi/tbb/global_control.h"

#include "threading_control_client.h"
#include "intrusive_list.h"
#include "main.h"
#include "permit_manager.h"
#include "pm_client.h"
#include "thread_dispatcher.h"
#include "cancellation_disseminator.h"
#include "thread_request_serializer.h"
#include "scheduler_common.h"

namespace tbb {
namespace detail {
namespace r1 {

class arena;
class thread_data;

class threading_control;

//! Implementation object owned by threading_control.
/** Owns the permit manager, thread dispatcher, thread request serializer, cancellation
    disseminator and waiting-threads monitor (see the members below); the public methods forward
    to those components.
    NOTE(review): template argument lists (e.g. of cache_aligned_unique_ptr, std::pair and
    std::atomic) appear to be missing in this copy of the file; the tokens are kept as found. */
class threading_control_impl {
public:
    //! Creates all components for the given threading control.
    threading_control_impl(threading_control*);

public:
    //! Forwards the release request to the thread dispatcher.
    void release(bool blocking_terminate);

    //! Creates the permit-manager and thread-dispatcher clients for arena a.
    threading_control_client create_client(arena& a);
    //! Registers both clients with their components.
    void publish_client(threading_control_client client, d1::constraints& constraints);

    //! Data captured by prepare_client_destruction() and consumed by try_destroy_client().
    struct client_snapshot {
        std::uint64_t aba_epoch;
        unsigned priority_level;
        thread_dispatcher_client* my_td_client;
        pm_client* my_pm_client;
    };

    client_snapshot prepare_client_destruction(threading_control_client client);
    //! Returns true when the client was unregistered and destroyed.
    bool try_destroy_client(client_snapshot deleter);

    void register_thread(thread_data& td);
    void unregister_thread(thread_data& td);
    void propagate_task_group_state(std::atomic d1::task_group_context::*mptr_state,
                                    d1::task_group_context& src, uint32_t new_state);

    //! Applies a new soft limit to the thread request serializer and the permit manager.
    void set_active_num_workers(unsigned soft_limit);
    std::size_t worker_stack_size();
    //! Returns the thread dispatcher's hard limit on the number of workers.
    unsigned max_num_workers();

    void adjust_demand(threading_control_client, int mandatory_delta, int workers_delta);
    bool is_any_other_client_active();

    thread_control_monitor& get_waiting_threads_monitor();

private:
    static unsigned calc_workers_soft_limit(unsigned workers_hard_limit);
    //! Returns the pair (soft limit, hard limit).
    static std::pair calculate_workers_limits();
    static cache_aligned_unique_ptr make_permit_manager(unsigned workers_soft_limit);
    static cache_aligned_unique_ptr make_thread_dispatcher(threading_control& control,
                                                           unsigned workers_soft_limit,
                                                           unsigned workers_hard_limit);

    // TODO: Consider allocating one chunk of memory and constructing the objects in it
    cache_aligned_unique_ptr my_permit_manager{nullptr};
    cache_aligned_unique_ptr my_thread_dispatcher{nullptr};
    cache_aligned_unique_ptr my_thread_request_serializer{nullptr};
    cache_aligned_unique_ptr my_cancellation_disseminator{nullptr};
    cache_aligned_unique_ptr my_waiting_threads_monitor{nullptr};
};


//! Reference-counted front end to threading_control_impl.
/** The single instance is reachable through the static g_threading_control pointer, whose
    creation and destruction are guarded by g_threading_control_mutex. Two counters are kept:
    my_public_ref_count and my_ref_count (documented on the members below). */
class threading_control {
    using global_mutex_type = d1::mutex;
public:
    using client_snapshot = threading_control_impl::client_snapshot;

    //! Adds a public reference, creating the global instance when it does not exist.
    static threading_control* register_public_reference();
    //! Releases a public reference on the global instance.
    static bool unregister_public_reference(bool blocking_terminate);

    //! Returns whether the global instance currently exists.
    static bool is_present();
    //! Applies a new soft limit to the global instance, if any.
    static void set_active_num_workers(unsigned soft_limit);
    //! Adds a public reference to an existing global instance; returns false when there is none.
    static bool register_lifetime_control();
    //! Releases a public reference; returns true when no global instance exists.
    static bool unregister_lifetime_control(bool blocking_terminate);

    //! Adds a private reference and creates the clients for arena a.
    threading_control_client create_client(arena& a);
    void publish_client(threading_control_client client, d1::constraints& constraints);
    client_snapshot prepare_client_destruction(threading_control_client client);
    //! On success also drops the private reference taken by create_client().
    bool try_destroy_client(client_snapshot deleter);

    void register_thread(thread_data& td);
    void unregister_thread(thread_data& td);
    void propagate_task_group_state(std::atomic d1::task_group_context::*mptr_state,
                                    d1::task_group_context& src, uint32_t new_state);

    std::size_t worker_stack_size();
    //! Returns the hard worker limit of the global instance, or 0 when none exists.
    static unsigned max_num_workers();

    void adjust_demand(threading_control_client client, int mandatory_delta, int workers_delta);
    bool is_any_other_client_active();

    thread_control_monitor& get_waiting_threads_monitor();

private:
    threading_control(unsigned public_ref, unsigned ref);
    void add_ref(bool is_public);
    //! Returns true when the last reference was removed.
    bool remove_ref(bool is_public);

    static threading_control* get_threading_control(bool is_public);
    static threading_control* create_threading_control();

    bool release(bool is_public, bool blocking_terminate);
    void wait_last_reference(global_mutex_type::scoped_lock& lock);
    void destroy();

    friend class thread_dispatcher;

    static threading_control* g_threading_control;
    //! Mutex guarding creation/destruction of g_threading_control, insertions/deletions in my_arenas, and cancellation propagation
    static global_mutex_type g_threading_control_mutex;

    cache_aligned_unique_ptr my_pimpl{nullptr};
    //! Count of external threads attached
    std::atomic my_public_ref_count{0};
    //! Reference count controlling threading_control object lifetime
    std::atomic my_ref_count{0};
};

} // r1
} // detail
} // tbb


#endif // _TBB_threading_control_H

================================================
FILE: third-party/tbb/src/tbb/threading_control_client.h
================================================
/*
    Copyright (c) 2022-2023 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef _TBB_threading_control_client_H
#define _TBB_threading_control_client_H

#include "oneapi/tbb/detail/_assert.h"

namespace tbb {
namespace detail {
namespace r1 {

class pm_client;
class thread_dispatcher_client;

//! Copyable pair of client handles: a pm_client pointer and a thread_dispatcher_client pointer.
/** The object stores the two pointers only; nothing in this class deletes them.
    A default-constructed object holds two null pointers. */
class threading_control_client {
public:
    threading_control_client() = default;
    threading_control_client(const threading_control_client&) = default;
    threading_control_client& operator=(const threading_control_client&) = default;

    //! Bundles the two client pointers; both are asserted to be non-null.
    threading_control_client(pm_client* p, thread_dispatcher_client* t) : my_pm_client(p), my_thread_dispatcher_client(t) {
        __TBB_ASSERT(my_pm_client, nullptr);
        __TBB_ASSERT(my_thread_dispatcher_client, nullptr);
    }

    //! Returns the stored pm_client pointer (null for a default-constructed object).
    pm_client* get_pm_client() {
        return my_pm_client;
    }

    //! Returns the stored thread_dispatcher_client pointer (null for a default-constructed object).
    thread_dispatcher_client* get_thread_dispatcher_client() {
        return my_thread_dispatcher_client;
    }

private:
    pm_client* my_pm_client{nullptr};
    thread_dispatcher_client* my_thread_dispatcher_client{nullptr};
};


}
}
}

#endif // _TBB_threading_control_client_H

================================================
FILE: third-party/tbb/src/tbb/tls.h
================================================
/*
    Copyright (c) 2005-2022 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef _TBB_tls_H
#define _TBB_tls_H

#include "oneapi/tbb/detail/_config.h"

// NOTE(review): the header names of the two system includes below appear to have been lost in
// this copy of the file (presumably the pthread and Windows headers) - restore from upstream.
#if __TBB_USE_POSIX
#include
#else /* assume __TBB_USE_WINAPI */
#include
#endif

namespace tbb {
namespace detail {
namespace r1 {

//! Signature of the destructor callback accepted by the POSIX variant of basic_tls::create().
typedef void (*tls_dtor_t)(void*);

//! Basic cross-platform wrapper class for TLS operations.
// NOTE(review): the template parameter list appears to have been lost in this copy of the file;
// the member functions below use a type named T, so presumably this was a single type parameter.
template class basic_tls {
#if __TBB_USE_POSIX
    typedef pthread_key_t tls_key_t;
public:
    //! Creates the key via pthread_key_create() with an optional destructor; returns its result.
    int  create( tls_dtor_t dtor = nullptr ) {
        return pthread_key_create(&my_key, dtor);
    }
    //! Deletes the key via pthread_key_delete(); returns its result.
    int  destroy()      { return pthread_key_delete(my_key); }
    //! Stores value for the calling thread; the result of pthread_setspecific() is ignored.
    void set( T value ) { pthread_setspecific(my_key, (void*)value); }
    //! Returns the value stored for the calling thread, cast back to T.
    T    get()          { return (T)pthread_getspecific(my_key); }
#else /* __TBB_USE_WINAPI */
    typedef DWORD tls_key_t;
public:
#if !__TBB_WIN8UI_SUPPORT
    //! Allocates an index via TlsAlloc(); returns 0 on success, TLS_OUT_OF_INDEXES on failure.
    /** Unlike the POSIX variant, this overload takes no destructor argument. */
    int create() {
        tls_key_t tmp = TlsAlloc();
        if( tmp==TLS_OUT_OF_INDEXES )
            return TLS_OUT_OF_INDEXES;
        my_key = tmp;
        return 0;
    }
    //! Frees the index via TlsFree(), resets the stored key to 0 and always returns 0.
    int  destroy()      { TlsFree(my_key); my_key=0; return 0; }
    //! Stores value via TlsSetValue(); the result is ignored.
    void set( T value ) { TlsSetValue(my_key, (LPVOID)value); }
    //! Returns the value obtained from TlsGetValue(), cast back to T.
    T    get()          { return (T)TlsGetValue(my_key); }
#else /*!__TBB_WIN8UI_SUPPORT*/
    //! Allocates an index via FlsAlloc() with no callback; returns 0 on success, 0xFFFFFFFF on failure.
    int create() {
        tls_key_t tmp = FlsAlloc(nullptr);
        if( tmp== (DWORD)0xFFFFFFFF )
            return (DWORD)0xFFFFFFFF;
        my_key = tmp;
        return 0;
    }
    //! Frees the index via FlsFree(), resets the stored key to 0 and always returns 0.
    int  destroy()      { FlsFree(my_key); my_key=0; return 0; }
    //! Stores value via FlsSetValue(); the result is ignored.
    void set( T value ) { FlsSetValue(my_key, (LPVOID)value); }
    //! Returns the value obtained from FlsGetValue(), cast back to T.
    T    get()          { return (T)FlsGetValue(my_key); }
#endif /* !__TBB_WIN8UI_SUPPORT */
#endif /* __TBB_USE_WINAPI */
private:
    // Platform key/index; it has no initializer here, so it is meaningful only after create().
    tls_key_t my_key;
};

} // namespace r1
} // namespace detail
} // namespace tbb

#endif /* _TBB_tls_H */

================================================
FILE: third-party/tbb/src/tbb/tools_api/disable_warnings.h
================================================
/*
    Copyright (c) 2005-2022 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/ #include "ittnotify_config.h" #if ITT_PLATFORM==ITT_PLATFORM_WIN #if defined _MSC_VER #pragma warning (disable: 593) /* parameter "XXXX" was set but never used */ #pragma warning (disable: 344) /* typedef name has already been declared (with same type) */ #pragma warning (disable: 174) /* expression has no effect */ #pragma warning (disable: 4127) /* conditional expression is constant */ #pragma warning (disable: 4306) /* conversion from '?' to '?' of greater size */ #endif #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if defined __INTEL_COMPILER #pragma warning (disable: 869) /* parameter "XXXXX" was never referenced */ #pragma warning (disable: 1418) /* external function definition with no prior declaration */ #pragma warning (disable: 1419) /* external declaration in primary source file */ #endif /* __INTEL_COMPILER */ ================================================ FILE: third-party/tbb/src/tbb/tools_api/ittnotify.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef _ITTNOTIFY_H_ #define _ITTNOTIFY_H_ /** @file @brief Public User API functions and types @mainpage The Instrumentation and Tracing Technology API (ITT API) is used to annotate a user's program with additional information that can be used by correctness and performance tools. The user inserts calls in their program. Those calls generate information that is collected at runtime, and used by Intel(R) Threading Tools. 
@section API Concepts The following general concepts are used throughout the API. @subsection Unicode Support Many API functions take character string arguments. On Windows, there are two versions of each such function. The function name is suffixed by W if Unicode support is enabled, and by A otherwise. Any API function that takes a character string argument adheres to this convention. @subsection Conditional Compilation Many users prefer having an option to modify ITT API code when linking it inside their runtimes. The ITT API header file provides a mechanism to replace ITT API function names inside your code with empty strings. To do this, define the macro INTEL_NO_ITTNOTIFY_API during compilation and remove the static library from the linker script. @subsection Domains [see domains] Domains provide a way to separate notifications for different modules or libraries in a program. Domains are specified by dotted character strings, e.g. TBB.Internal.Control. A mechanism (to be specified) is provided to enable and disable domains. By default, all domains are enabled. @subsection Named Entities and Instances Named entities (frames, regions, tasks, and markers) communicate information about the program to the analysis tools. A named entity often refers to a section of program code, or to some set of logical concepts that the programmer wants to group together. Named entities relate to the programmer's static view of the program. When the program actually executes, many instances of a given named entity may be created. The API annotations denote instances of named entities. The actual named entities are displayed using the analysis tools. In other words, the named entities come into existence when instances are created. Instances of named entities may have instance identifiers (IDs). Some API calls use instance identifiers to create relationships between different instances of named entities. Other API calls associate data with instances of named entities.
Some named entities must always have instance IDs. In particular, regions
and frames always have IDs. Tasks and markers need IDs only if the ID is
needed in another API call (such as adding a relation or metadata).

The lifetime of instance IDs is distinct from the lifetime of
instances. This allows various relationships to be specified separately
from the actual execution of instances. This flexibility comes at the
expense of extra API calls.

The same ID may not be reused for different instances, unless a previous
[ref] __itt_id_destroy call for that ID has been issued.
*/

/** @cond exclude_from_documentation */
/* Numeric identifiers for the operating systems distinguished by this header. */
#ifndef ITT_OS_WIN
#  define ITT_OS_WIN   1
#endif /* ITT_OS_WIN */

#ifndef ITT_OS_LINUX
#  define ITT_OS_LINUX 2
#endif /* ITT_OS_LINUX */

#ifndef ITT_OS_MAC
#  define ITT_OS_MAC   3
#endif /* ITT_OS_MAC */

#ifndef ITT_OS_FREEBSD
#  define ITT_OS_FREEBSD   4
#endif /* ITT_OS_FREEBSD */

#ifndef ITT_OS_OPENBSD
#  define ITT_OS_OPENBSD   5
#endif /* ITT_OS_OPENBSD */

/* Select ITT_OS from compiler-defined macros unless the build already set it;
   anything not recognised below falls back to ITT_OS_LINUX. */
#ifndef ITT_OS
#  if defined WIN32 || defined _WIN32
#    define ITT_OS ITT_OS_WIN
#  elif defined( __APPLE__ ) && defined( __MACH__ )
#    define ITT_OS ITT_OS_MAC
#  elif defined( __FreeBSD__ )
#    define ITT_OS ITT_OS_FREEBSD
#  elif defined( __OpenBSD__ )
#    define ITT_OS ITT_OS_OPENBSD
#  else
#    define ITT_OS ITT_OS_LINUX
#  endif
#endif /* ITT_OS */

/* Numeric identifiers for the platform families. */
#ifndef ITT_PLATFORM_WIN
#  define ITT_PLATFORM_WIN 1
#endif /* ITT_PLATFORM_WIN */

#ifndef ITT_PLATFORM_POSIX
#  define ITT_PLATFORM_POSIX 2
#endif /* ITT_PLATFORM_POSIX */

#ifndef ITT_PLATFORM_MAC
#  define ITT_PLATFORM_MAC 3
#endif /* ITT_PLATFORM_MAC */

#ifndef ITT_PLATFORM_FREEBSD
#  define ITT_PLATFORM_FREEBSD 4
#endif /* ITT_PLATFORM_FREEBSD */

#ifndef ITT_PLATFORM_OPENBSD
#  define ITT_PLATFORM_OPENBSD 5
#endif /* ITT_PLATFORM_OPENBSD */

/* Derive ITT_PLATFORM from ITT_OS unless the build already set it;
   every OS without a dedicated branch maps to ITT_PLATFORM_POSIX. */
#ifndef ITT_PLATFORM
#  if ITT_OS==ITT_OS_WIN
#    define ITT_PLATFORM ITT_PLATFORM_WIN
#  elif ITT_OS==ITT_OS_MAC
#    define ITT_PLATFORM ITT_PLATFORM_MAC
#  elif ITT_OS==ITT_OS_FREEBSD
#    define ITT_PLATFORM ITT_PLATFORM_FREEBSD
#  elif ITT_OS==ITT_OS_OPENBSD
#    define ITT_PLATFORM ITT_PLATFORM_OPENBSD
#  else
#    define ITT_PLATFORM ITT_PLATFORM_POSIX
#  endif
#endif /* ITT_PLATFORM */

/* Treat _UNICODE as implying UNICODE. */
#if defined(_UNICODE) && !defined(UNICODE)
#define UNICODE
#endif

/* NOTE(review): the header names of the #include directives below appear to have been lost in
   this copy of the file; restore them from upstream before compiling. */
#include
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#include
#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#include
#if defined(UNICODE) || defined(_UNICODE)
#include
#endif /* UNICODE || _UNICODE */
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */

/* Calling-convention macro used for the API (see ITTAPI below); empty on non-Windows,
   non-x86 targets. */
#ifndef ITTAPI_CDECL
#  if ITT_PLATFORM==ITT_PLATFORM_WIN
#    define ITTAPI_CDECL __cdecl
#  else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#    if defined _M_IX86 || defined __i386__
#      define ITTAPI_CDECL __attribute__ ((cdecl))
#    else  /* _M_IX86 || __i386__ */
#      define ITTAPI_CDECL /* actual only on x86 platform */
#    endif /* _M_IX86 || __i386__ */
#  endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* ITTAPI_CDECL */

/* stdcall counterpart of ITTAPI_CDECL; empty on non-Windows, non-x86 targets. */
#ifndef STDCALL
#  if ITT_PLATFORM==ITT_PLATFORM_WIN
#    define STDCALL __stdcall
#  else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#    if defined _M_IX86 || defined __i386__
#      define STDCALL __attribute__ ((stdcall))
#    else  /* _M_IX86 || __i386__ */
#      define STDCALL /* supported only on x86 platform */
#    endif /* _M_IX86 || __i386__ */
#  endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* STDCALL */

/* All four API decoration macros resolve to the cdecl convention. */
#define ITTAPI    ITTAPI_CDECL
#define LIBITTAPI ITTAPI_CDECL

/* TODO: Temporary for compatibility! */
#define ITTAPI_CALL    ITTAPI_CDECL
#define LIBITTAPI_CALL ITTAPI_CDECL

#if ITT_PLATFORM==ITT_PLATFORM_WIN
/* use __forceinline (VC++ specific) */
#if defined(__MINGW32__) && !defined(__cplusplus)
#define ITT_INLINE           static __inline__ __attribute__((__always_inline__,__gnu_inline__))
#else
#define ITT_INLINE           static __forceinline
#endif /* __MINGW32__ */

#define ITT_INLINE_ATTRIBUTE /* nothing */
#else  /* ITT_PLATFORM==ITT_PLATFORM_WIN */
/*
 * Generally, functions are not inlined unless optimization is specified.
 * For functions declared inline, this attribute inlines the function even
 * if no optimization level was specified.
*/ #ifdef __STRICT_ANSI__ #define ITT_INLINE static #define ITT_INLINE_ATTRIBUTE __attribute__((unused)) #else /* __STRICT_ANSI__ */ #define ITT_INLINE static inline #define ITT_INLINE_ATTRIBUTE __attribute__((always_inline, unused)) #endif /* __STRICT_ANSI__ */ #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ /** @endcond */ #ifdef INTEL_ITTNOTIFY_ENABLE_LEGACY # if ITT_PLATFORM==ITT_PLATFORM_WIN # pragma message("WARNING!!! Deprecated API is used. Please undefine INTEL_ITTNOTIFY_ENABLE_LEGACY macro") # else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ # warning "Deprecated API is used. Please undefine INTEL_ITTNOTIFY_ENABLE_LEGACY macro" # endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ # include "legacy/ittnotify.h" #endif /* INTEL_ITTNOTIFY_ENABLE_LEGACY */ /** @cond exclude_from_documentation */ /* Helper macro for joining tokens */ #define ITT_JOIN_AUX(p,n) p##n #define ITT_JOIN(p,n) ITT_JOIN_AUX(p,n) #ifdef ITT_MAJOR #undef ITT_MAJOR #endif #ifdef ITT_MINOR #undef ITT_MINOR #endif #define ITT_MAJOR 3 #define ITT_MINOR 0 /* Standard versioning of a token with major and minor version numbers */ #define ITT_VERSIONIZE(x) \ ITT_JOIN(x, \ ITT_JOIN(_, \ ITT_JOIN(ITT_MAJOR, \ ITT_JOIN(_, ITT_MINOR)))) #ifndef INTEL_ITTNOTIFY_PREFIX # define INTEL_ITTNOTIFY_PREFIX __itt_ #endif /* INTEL_ITTNOTIFY_PREFIX */ #ifndef INTEL_ITTNOTIFY_POSTFIX # define INTEL_ITTNOTIFY_POSTFIX _ptr_ #endif /* INTEL_ITTNOTIFY_POSTFIX */ #define ITTNOTIFY_NAME_AUX(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX,n) #define ITTNOTIFY_NAME(n) ITT_VERSIONIZE(ITTNOTIFY_NAME_AUX(ITT_JOIN(n,INTEL_ITTNOTIFY_POSTFIX))) #define ITTNOTIFY_VOID(n) (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n) #define ITTNOTIFY_DATA(n) (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n) #define ITTNOTIFY_VOID_D0(n,d) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d) #define ITTNOTIFY_VOID_D1(n,d,x) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? 
(void)0 : ITTNOTIFY_NAME(n)(d,x) #define ITTNOTIFY_VOID_D2(n,d,x,y) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y) #define ITTNOTIFY_VOID_D3(n,d,x,y,z) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z) #define ITTNOTIFY_VOID_D4(n,d,x,y,z,a) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a) #define ITTNOTIFY_VOID_D5(n,d,x,y,z,a,b) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b) #define ITTNOTIFY_VOID_D6(n,d,x,y,z,a,b,c) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c) #define ITTNOTIFY_DATA_D0(n,d) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d) #define ITTNOTIFY_DATA_D1(n,d,x) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x) #define ITTNOTIFY_DATA_D2(n,d,x,y) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y) #define ITTNOTIFY_DATA_D3(n,d,x,y,z) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z) #define ITTNOTIFY_DATA_D4(n,d,x,y,z,a) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a) #define ITTNOTIFY_DATA_D5(n,d,x,y,z,a,b) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b) #define ITTNOTIFY_DATA_D6(n,d,x,y,z,a,b,c) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 
0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c) #ifdef ITT_STUB #undef ITT_STUB #endif #ifdef ITT_STUBV #undef ITT_STUBV #endif #define ITT_STUBV(api,type,name,args) \ typedef type (api* ITT_JOIN(ITTNOTIFY_NAME(name),_t)) args; \ extern ITT_JOIN(ITTNOTIFY_NAME(name),_t) ITTNOTIFY_NAME(name); #define ITT_STUB ITT_STUBV /** @endcond */ #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ /** @cond exclude_from_gpa_documentation */ /** * @defgroup public Public API * @{ * @} */ /** * @defgroup control Collection Control * @ingroup public * General behavior: application continues to run, but no profiling information is being collected * * Pausing occurs not only for the current thread but for the whole process as well as spawned processes * - Intel(R) Parallel Inspector and Intel(R) Inspector XE: * - Does not analyze or report errors that involve memory access. * - Other errors are reported as usual. Pausing data collection in * Intel(R) Parallel Inspector and Intel(R) Inspector XE * only pauses tracing and analyzing memory access. * It does not pause tracing or analyzing threading APIs. * . * Intel(R) VTune(TM) Profiler: * - Does continue to record when new threads are started. * . * - Other effects: * - Possible reduction of runtime overhead. * . 
* @{ */ /** @brief Pause collection */ void ITTAPI __itt_pause(void); /** @brief Resume collection */ void ITTAPI __itt_resume(void); /** @brief Detach collection */ void ITTAPI __itt_detach(void); /** * @enum __itt_collection_scope * @brief Enumerator for collection scopes */ typedef enum { __itt_collection_scope_host = 1 << 0, __itt_collection_scope_offload = 1 << 1, __itt_collection_scope_all = 0x7FFFFFFF } __itt_collection_scope; /** @brief Pause scoped collection */ void ITTAPI __itt_pause_scoped(__itt_collection_scope); /** @brief Resume scoped collection */ void ITTAPI __itt_resume_scoped(__itt_collection_scope); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, pause, (void)) ITT_STUBV(ITTAPI, void, pause_scoped, (__itt_collection_scope)) ITT_STUBV(ITTAPI, void, resume, (void)) ITT_STUBV(ITTAPI, void, resume_scoped, (__itt_collection_scope)) ITT_STUBV(ITTAPI, void, detach, (void)) #define __itt_pause ITTNOTIFY_VOID(pause) #define __itt_pause_ptr ITTNOTIFY_NAME(pause) #define __itt_pause_scoped ITTNOTIFY_VOID(pause_scoped) #define __itt_pause_scoped_ptr ITTNOTIFY_NAME(pause_scoped) #define __itt_resume ITTNOTIFY_VOID(resume) #define __itt_resume_ptr ITTNOTIFY_NAME(resume) #define __itt_resume_scoped ITTNOTIFY_VOID(resume_scoped) #define __itt_resume_scoped_ptr ITTNOTIFY_NAME(resume_scoped) #define __itt_detach ITTNOTIFY_VOID(detach) #define __itt_detach_ptr ITTNOTIFY_NAME(detach) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_pause() #define __itt_pause_ptr 0 #define __itt_pause_scoped(scope) #define __itt_pause_scoped_ptr 0 #define __itt_resume() #define __itt_resume_ptr 0 #define __itt_resume_scoped(scope) #define __itt_resume_scoped_ptr 0 #define __itt_detach() #define __itt_detach_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_pause_ptr 0 #define __itt_pause_scoped_ptr 0 #define __itt_resume_ptr 0 #define __itt_resume_scoped_ptr 0 #define 
__itt_detach_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @} control group */ /** @endcond */ /** * @defgroup Intel Processor Trace control * API from this group provides control over collection and analysis of Intel Processor Trace (Intel PT) data * Information about Intel Processor Trace technology can be found here (Volume 3 chapter 35): * https://github.com/tpn/pdfs/blob/master/Intel%2064%20and%20IA-32%20Architectures%20Software%20Developer's%20Manual%20-%20Combined%20Volumes%201-4%20-%20May%202018%20(325462-sdm-vol-1-2abcd-3abcd).pdf * Use this API to mark particular code regions for loading detailed performance statistics. * This mode makes your analysis faster and more accurate. * @{ */ typedef unsigned char __itt_pt_region; /** * @brief function saves a region name marked with Intel PT API and returns a region id. * Only 7 names can be registered. Attempts to register more names will be ignored and a region id with auto names will be returned. * For automatic naming of regions pass NULL as function parameter */ #if ITT_PLATFORM==ITT_PLATFORM_WIN __itt_pt_region ITTAPI __itt_pt_region_createA(const char *name); __itt_pt_region ITTAPI __itt_pt_region_createW(const wchar_t *name); #if defined(UNICODE) || defined(_UNICODE) # define __itt_pt_region_create __itt_pt_region_createW #else /* UNICODE */ # define __itt_pt_region_create __itt_pt_region_createA #endif /* UNICODE */ #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ __itt_pt_region ITTAPI __itt_pt_region_create(const char *name); #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(ITTAPI, __itt_pt_region, pt_region_createA, (const char *name)) ITT_STUB(ITTAPI, __itt_pt_region, pt_region_createW, (const wchar_t *name)) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUB(ITTAPI, __itt_pt_region, pt_region_create, (const char *name)) #endif /* 
ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_pt_region_createA ITTNOTIFY_DATA(pt_region_createA) #define __itt_pt_region_createA_ptr ITTNOTIFY_NAME(pt_region_createA) #define __itt_pt_region_createW ITTNOTIFY_DATA(pt_region_createW) #define __itt_pt_region_createW_ptr ITTNOTIFY_NAME(pt_region_createW) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_pt_region_create ITTNOTIFY_DATA(pt_region_create) #define __itt_pt_region_create_ptr ITTNOTIFY_NAME(pt_region_create) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #else /* INTEL_NO_ITTNOTIFY_API */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_pt_region_createA(name) (__itt_pt_region)0 #define __itt_pt_region_createA_ptr 0 #define __itt_pt_region_createW(name) (__itt_pt_region)0 #define __itt_pt_region_createW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_pt_region_create(name) (__itt_pt_region)0 #define __itt_pt_region_create_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_pt_region_createA_ptr 0 #define __itt_pt_region_createW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_pt_region_create_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief function contains a special code pattern identified on the post-processing stage and * marks the beginning of a code region targeted for Intel PT analysis * @param[in] region - region id, 0 <= region < 8 */ void __itt_mark_pt_region_begin(__itt_pt_region region); /** * @brief function contains a special code pattern identified on the post-processing stage and * marks the end of a code region targeted for Intel PT analysis * @param[in] region - region id, 0 <= region < 8 */ void __itt_mark_pt_region_end(__itt_pt_region region); /** @} Intel PT control group*/ /** * @defgroup threads Threads * @ingroup public * Give names to 
threads * @{ */ /** * @brief Sets thread name of calling thread * @param[in] name - name of thread */ #if ITT_PLATFORM==ITT_PLATFORM_WIN void ITTAPI __itt_thread_set_nameA(const char *name); void ITTAPI __itt_thread_set_nameW(const wchar_t *name); #if defined(UNICODE) || defined(_UNICODE) # define __itt_thread_set_name __itt_thread_set_nameW # define __itt_thread_set_name_ptr __itt_thread_set_nameW_ptr #else /* UNICODE */ # define __itt_thread_set_name __itt_thread_set_nameA # define __itt_thread_set_name_ptr __itt_thread_set_nameA_ptr #endif /* UNICODE */ #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ void ITTAPI __itt_thread_set_name(const char *name); #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUBV(ITTAPI, void, thread_set_nameA, (const char *name)) ITT_STUBV(ITTAPI, void, thread_set_nameW, (const wchar_t *name)) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUBV(ITTAPI, void, thread_set_name, (const char *name)) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_thread_set_nameA ITTNOTIFY_VOID(thread_set_nameA) #define __itt_thread_set_nameA_ptr ITTNOTIFY_NAME(thread_set_nameA) #define __itt_thread_set_nameW ITTNOTIFY_VOID(thread_set_nameW) #define __itt_thread_set_nameW_ptr ITTNOTIFY_NAME(thread_set_nameW) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_thread_set_name ITTNOTIFY_VOID(thread_set_name) #define __itt_thread_set_name_ptr ITTNOTIFY_NAME(thread_set_name) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #else /* INTEL_NO_ITTNOTIFY_API */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_thread_set_nameA(name) #define __itt_thread_set_nameA_ptr 0 #define __itt_thread_set_nameW(name) #define __itt_thread_set_nameW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_thread_set_name(name) #define __itt_thread_set_name_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN 
*/ #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_thread_set_nameA_ptr 0 #define __itt_thread_set_nameW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_thread_set_name_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @cond exclude_from_gpa_documentation */ /** * @brief Mark current thread as ignored from this point on, for the duration of its existence. */ void ITTAPI __itt_thread_ignore(void); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, thread_ignore, (void)) #define __itt_thread_ignore ITTNOTIFY_VOID(thread_ignore) #define __itt_thread_ignore_ptr ITTNOTIFY_NAME(thread_ignore) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_thread_ignore() #define __itt_thread_ignore_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_thread_ignore_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @} threads group */ /** * @defgroup suppress Error suppression * @ingroup public * General behavior: application continues to run, but errors are suppressed * * @{ */ /*****************************************************************//** * @name group of functions used for error suppression in correctness tools *********************************************************************/ /** @{ */ /** * @hideinitializer * @brief possible value for suppression mask */ #define __itt_suppress_all_errors 0x7fffffff /** * @hideinitializer * @brief possible value for suppression mask (suppresses errors from threading analysis) */ #define __itt_suppress_threading_errors 0x000000ff /** * @hideinitializer * @brief possible value for suppression mask (suppresses errors from memory analysis) */ #define __itt_suppress_memory_errors 0x0000ff00 /** * @brief Start suppressing errors identified in mask on this thread */ void ITTAPI 
__itt_suppress_push(unsigned int mask); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, suppress_push, (unsigned int mask)) #define __itt_suppress_push ITTNOTIFY_VOID(suppress_push) #define __itt_suppress_push_ptr ITTNOTIFY_NAME(suppress_push) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_suppress_push(mask) #define __itt_suppress_push_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_suppress_push_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief Undo the effects of the matching call to __itt_suppress_push */ void ITTAPI __itt_suppress_pop(void); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, suppress_pop, (void)) #define __itt_suppress_pop ITTNOTIFY_VOID(suppress_pop) #define __itt_suppress_pop_ptr ITTNOTIFY_NAME(suppress_pop) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_suppress_pop() #define __itt_suppress_pop_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_suppress_pop_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @enum __itt_suppress_mode * @brief Enumerator for the suppressing modes */ typedef enum __itt_suppress_mode { __itt_unsuppress_range, __itt_suppress_range } __itt_suppress_mode_t; /** * @enum __itt_collection_state * @brief Enumerator for collection state. 
*/ typedef enum { __itt_collection_uninitialized = 0, /* uninitialized */ __itt_collection_init_fail = 1, /* failed to init */ __itt_collection_collector_absent = 2, /* non work state collector is absent */ __itt_collection_collector_exists = 3, /* work state collector exists */ __itt_collection_init_successful = 4 /* success to init */ } __itt_collection_state; /** * @brief Mark a range of memory for error suppression or unsuppression for error types included in mask */ void ITTAPI __itt_suppress_mark_range(__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, suppress_mark_range, (__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size)) #define __itt_suppress_mark_range ITTNOTIFY_VOID(suppress_mark_range) #define __itt_suppress_mark_range_ptr ITTNOTIFY_NAME(suppress_mark_range) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_suppress_mark_range(mask) #define __itt_suppress_mark_range_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_suppress_mark_range_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief Undo the effect of a matching call to __itt_suppress_mark_range. If not matching * call is found, nothing is changed. 
*/ void ITTAPI __itt_suppress_clear_range(__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, suppress_clear_range, (__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size)) #define __itt_suppress_clear_range ITTNOTIFY_VOID(suppress_clear_range) #define __itt_suppress_clear_range_ptr ITTNOTIFY_NAME(suppress_clear_range) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_suppress_clear_range(mask) #define __itt_suppress_clear_range_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_suppress_clear_range_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @} */ /** @} suppress group */ /** * @defgroup sync Synchronization * @ingroup public * Indicate user-written synchronization code * @{ */ /** * @hideinitializer * @brief possible value of attribute argument for sync object type */ #define __itt_attr_barrier 1 /** * @hideinitializer * @brief possible value of attribute argument for sync object type */ #define __itt_attr_mutex 2 /** @brief Name a synchronization object @param[in] addr Handle for the synchronization object. You should use a real address to uniquely identify the synchronization object. @param[in] objtype null-terminated object type string. If NULL is passed, the name will be "User Synchronization". @param[in] objname null-terminated object name string. If NULL, no name will be assigned to the object. 
@param[in] attribute one of [#__itt_attr_barrier, #__itt_attr_mutex] */ #if ITT_PLATFORM==ITT_PLATFORM_WIN void ITTAPI __itt_sync_createA(void *addr, const char *objtype, const char *objname, int attribute); void ITTAPI __itt_sync_createW(void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute); #if defined(UNICODE) || defined(_UNICODE) # define __itt_sync_create __itt_sync_createW # define __itt_sync_create_ptr __itt_sync_createW_ptr #else /* UNICODE */ # define __itt_sync_create __itt_sync_createA # define __itt_sync_create_ptr __itt_sync_createA_ptr #endif /* UNICODE */ #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ void ITTAPI __itt_sync_create (void *addr, const char *objtype, const char *objname, int attribute); #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUBV(ITTAPI, void, sync_createA, (void *addr, const char *objtype, const char *objname, int attribute)) ITT_STUBV(ITTAPI, void, sync_createW, (void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute)) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUBV(ITTAPI, void, sync_create, (void *addr, const char* objtype, const char* objname, int attribute)) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_sync_createA ITTNOTIFY_VOID(sync_createA) #define __itt_sync_createA_ptr ITTNOTIFY_NAME(sync_createA) #define __itt_sync_createW ITTNOTIFY_VOID(sync_createW) #define __itt_sync_createW_ptr ITTNOTIFY_NAME(sync_createW) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_sync_create ITTNOTIFY_VOID(sync_create) #define __itt_sync_create_ptr ITTNOTIFY_NAME(sync_create) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #else /* INTEL_NO_ITTNOTIFY_API */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_sync_createA(addr, objtype, objname, attribute) #define __itt_sync_createA_ptr 0 #define 
__itt_sync_createW(addr, objtype, objname, attribute) #define __itt_sync_createW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_sync_create(addr, objtype, objname, attribute) #define __itt_sync_create_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_sync_createA_ptr 0 #define __itt_sync_createW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_sync_create_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @brief Rename a synchronization object You can use the rename call to assign or reassign a name to a given synchronization object. @param[in] addr handle for the synchronization object. @param[in] name null-terminated object name string. */ #if ITT_PLATFORM==ITT_PLATFORM_WIN void ITTAPI __itt_sync_renameA(void *addr, const char *name); void ITTAPI __itt_sync_renameW(void *addr, const wchar_t *name); #if defined(UNICODE) || defined(_UNICODE) # define __itt_sync_rename __itt_sync_renameW # define __itt_sync_rename_ptr __itt_sync_renameW_ptr #else /* UNICODE */ # define __itt_sync_rename __itt_sync_renameA # define __itt_sync_rename_ptr __itt_sync_renameA_ptr #endif /* UNICODE */ #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ void ITTAPI __itt_sync_rename(void *addr, const char *name); #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUBV(ITTAPI, void, sync_renameA, (void *addr, const char *name)) ITT_STUBV(ITTAPI, void, sync_renameW, (void *addr, const wchar_t *name)) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUBV(ITTAPI, void, sync_rename, (void *addr, const char *name)) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_sync_renameA ITTNOTIFY_VOID(sync_renameA) #define __itt_sync_renameA_ptr 
ITTNOTIFY_NAME(sync_renameA) #define __itt_sync_renameW ITTNOTIFY_VOID(sync_renameW) #define __itt_sync_renameW_ptr ITTNOTIFY_NAME(sync_renameW) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_sync_rename ITTNOTIFY_VOID(sync_rename) #define __itt_sync_rename_ptr ITTNOTIFY_NAME(sync_rename) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #else /* INTEL_NO_ITTNOTIFY_API */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_sync_renameA(addr, name) #define __itt_sync_renameA_ptr 0 #define __itt_sync_renameW(addr, name) #define __itt_sync_renameW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_sync_rename(addr, name) #define __itt_sync_rename_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_sync_renameA_ptr 0 #define __itt_sync_renameW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_sync_rename_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @brief Destroy a synchronization object. @param addr Handle for the synchronization object. 
*/ void ITTAPI __itt_sync_destroy(void *addr); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, sync_destroy, (void *addr)) #define __itt_sync_destroy ITTNOTIFY_VOID(sync_destroy) #define __itt_sync_destroy_ptr ITTNOTIFY_NAME(sync_destroy) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_sync_destroy(addr) #define __itt_sync_destroy_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_sync_destroy_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /*****************************************************************//** * @name group of functions is used for performance measurement tools *********************************************************************/ /** @{ */ /** * @brief Enter spin loop on user-defined sync object */ void ITTAPI __itt_sync_prepare(void* addr); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, sync_prepare, (void *addr)) #define __itt_sync_prepare ITTNOTIFY_VOID(sync_prepare) #define __itt_sync_prepare_ptr ITTNOTIFY_NAME(sync_prepare) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_sync_prepare(addr) #define __itt_sync_prepare_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_sync_prepare_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief Quit spin loop without acquiring spin object */ void ITTAPI __itt_sync_cancel(void *addr); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, sync_cancel, (void *addr)) #define __itt_sync_cancel ITTNOTIFY_VOID(sync_cancel) #define __itt_sync_cancel_ptr ITTNOTIFY_NAME(sync_cancel) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_sync_cancel(addr) #define __itt_sync_cancel_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_sync_cancel_ptr 0 #endif /* 
INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief Successful spin loop completion (sync object acquired) */ void ITTAPI __itt_sync_acquired(void *addr); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, sync_acquired, (void *addr)) #define __itt_sync_acquired ITTNOTIFY_VOID(sync_acquired) #define __itt_sync_acquired_ptr ITTNOTIFY_NAME(sync_acquired) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_sync_acquired(addr) #define __itt_sync_acquired_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_sync_acquired_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief Start sync object releasing code. Is called before the lock release call. */ void ITTAPI __itt_sync_releasing(void* addr); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, sync_releasing, (void *addr)) #define __itt_sync_releasing ITTNOTIFY_VOID(sync_releasing) #define __itt_sync_releasing_ptr ITTNOTIFY_NAME(sync_releasing) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_sync_releasing(addr) #define __itt_sync_releasing_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_sync_releasing_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @} */ /** @} sync group */ /**************************************************************//** * @name group of functions is used for correctness checking tools ******************************************************************/ /** @{ */ /** * @ingroup legacy * @deprecated Legacy API * @brief Fast synchronization which does not require spinning. * - This special function is to be used by TBB and OpenMP libraries only when they know * there is no spin but they need to suppress TC warnings about shared variable modifications. 
* - It only has corresponding pointers in static library and does not have corresponding function * in dynamic library. * @see void __itt_sync_prepare(void* addr); */ void ITTAPI __itt_fsync_prepare(void* addr); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, fsync_prepare, (void *addr)) #define __itt_fsync_prepare ITTNOTIFY_VOID(fsync_prepare) #define __itt_fsync_prepare_ptr ITTNOTIFY_NAME(fsync_prepare) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_fsync_prepare(addr) #define __itt_fsync_prepare_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_fsync_prepare_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @ingroup legacy * @deprecated Legacy API * @brief Fast synchronization which does not require spinning. * - This special function is to be used by TBB and OpenMP libraries only when they know * there is no spin but they need to suppress TC warnings about shared variable modifications. * - It only has corresponding pointers in static library and does not have corresponding function * in dynamic library. * @see void __itt_sync_cancel(void *addr); */ void ITTAPI __itt_fsync_cancel(void *addr); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, fsync_cancel, (void *addr)) #define __itt_fsync_cancel ITTNOTIFY_VOID(fsync_cancel) #define __itt_fsync_cancel_ptr ITTNOTIFY_NAME(fsync_cancel) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_fsync_cancel(addr) #define __itt_fsync_cancel_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_fsync_cancel_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @ingroup legacy * @deprecated Legacy API * @brief Fast synchronization which does not require spinning. 
* - This special function is to be used by TBB and OpenMP libraries only when they know * there is no spin but they need to suppress TC warnings about shared variable modifications. * - It only has corresponding pointers in static library and does not have corresponding function * in dynamic library. * @see void __itt_sync_acquired(void *addr); */ void ITTAPI __itt_fsync_acquired(void *addr); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, fsync_acquired, (void *addr)) #define __itt_fsync_acquired ITTNOTIFY_VOID(fsync_acquired) #define __itt_fsync_acquired_ptr ITTNOTIFY_NAME(fsync_acquired) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_fsync_acquired(addr) #define __itt_fsync_acquired_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_fsync_acquired_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @ingroup legacy * @deprecated Legacy API * @brief Fast synchronization which does not require spinning. * - This special function is to be used by TBB and OpenMP libraries only when they know * there is no spin but they need to suppress TC warnings about shared variable modifications. * - It only has corresponding pointers in static library and does not have corresponding function * in dynamic library. 
* @see void __itt_sync_releasing(void* addr); */ void ITTAPI __itt_fsync_releasing(void* addr); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, fsync_releasing, (void *addr)) #define __itt_fsync_releasing ITTNOTIFY_VOID(fsync_releasing) #define __itt_fsync_releasing_ptr ITTNOTIFY_NAME(fsync_releasing) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_fsync_releasing(addr) #define __itt_fsync_releasing_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_fsync_releasing_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @} */ /** * @defgroup model Modeling by Intel(R) Parallel Advisor * @ingroup public * This is the subset of itt used for modeling by Intel(R) Parallel Advisor. * This API is called ONLY using annotate.h, by "Annotation" macros * the user places in their sources during the parallelism modeling steps. * * site_begin/end and task_begin/end take the address of handle variables, * which are writeable by the API. Handles must be 0 initialized prior * to the first call to begin, or may cause a run-time failure. * The handles are initialized in a multi-thread safe way by the API if * the handle is 0. The commonly expected idiom is one static handle to * identify a site or task. If a site or task of the same name has already * been started during this collection, the same handle MAY be returned, * but is not required to be - it is unspecified if data merging is done * based on name. These routines also take an instance variable. Like * the lexical instance, these must be 0 initialized. Unlike the lexical * instance, this is used to track a single dynamic instance. * * API used by the Intel(R) Parallel Advisor to describe potential concurrency * and related activities. User-added source annotations expand to calls * to these procedures to enable modeling of a hypothetical concurrent * execution serially.
* @{ */ #if !defined(_ADVISOR_ANNOTATE_H_) || defined(ANNOTATE_EXPAND_NULL) typedef void* __itt_model_site; /*!< @brief handle for lexical site */ typedef void* __itt_model_site_instance; /*!< @brief handle for dynamic instance */ typedef void* __itt_model_task; /*!< @brief handle for lexical task */ typedef void* __itt_model_task_instance; /*!< @brief handle for dynamic instance */ /** * @enum __itt_model_disable * @brief Enumerator for the disable methods */ typedef enum { __itt_model_disable_observation, __itt_model_disable_collection } __itt_model_disable; #endif /* !_ADVISOR_ANNOTATE_H_ || ANNOTATE_EXPAND_NULL */ /** * @brief ANNOTATE_SITE_BEGIN/ANNOTATE_SITE_END support. * * site_begin/end model a potential concurrency site. * site instances may be recursively nested with themselves. * site_end exits the most recently started but unended site for the current * thread. The handle passed to end may be used to validate structure. * Instances of a site encountered on different threads concurrently * are considered completely distinct. If the site name for two different * lexical sites match, it is unspecified whether they are treated as the * same or different for data presentation.
*/
void ITTAPI __itt_model_site_begin(__itt_model_site *site, __itt_model_site_instance *instance, const char *name);
#if ITT_PLATFORM==ITT_PLATFORM_WIN
/* The wchar_t variant is declared only for the Windows platform. */
void ITTAPI __itt_model_site_beginW(const wchar_t *name);
#endif
void ITTAPI __itt_model_site_beginA(const char *name);
void ITTAPI __itt_model_site_beginAL(const char *name, size_t siteNameLen);
void ITTAPI __itt_model_site_end (__itt_model_site *site, __itt_model_site_instance *instance);
void ITTAPI __itt_model_site_end_2(void);

/** @cond exclude_from_documentation */
/*
 * Macro layer for the site API. By default each API name is mapped to
 * ITTNOTIFY_VOID(...) and its _ptr companion to ITTNOTIFY_NAME(...).
 * With INTEL_NO_ITTNOTIFY_API defined the call macros expand to nothing and
 * the _ptr names to 0. With INTEL_NO_MACRO_BODY defined only the _ptr names
 * are defined (as 0).
 */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, model_site_begin, (__itt_model_site *site, __itt_model_site_instance *instance, const char *name))
#if ITT_PLATFORM==ITT_PLATFORM_WIN
ITT_STUBV(ITTAPI, void, model_site_beginW, (const wchar_t *name))
#endif
ITT_STUBV(ITTAPI, void, model_site_beginA, (const char *name))
ITT_STUBV(ITTAPI, void, model_site_beginAL, (const char *name, size_t siteNameLen))
ITT_STUBV(ITTAPI, void, model_site_end, (__itt_model_site *site, __itt_model_site_instance *instance))
ITT_STUBV(ITTAPI, void, model_site_end_2, (void))
#define __itt_model_site_begin ITTNOTIFY_VOID(model_site_begin)
#define __itt_model_site_begin_ptr ITTNOTIFY_NAME(model_site_begin)
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_model_site_beginW ITTNOTIFY_VOID(model_site_beginW)
#define __itt_model_site_beginW_ptr ITTNOTIFY_NAME(model_site_beginW)
#endif
#define __itt_model_site_beginA ITTNOTIFY_VOID(model_site_beginA)
#define __itt_model_site_beginA_ptr ITTNOTIFY_NAME(model_site_beginA)
#define __itt_model_site_beginAL ITTNOTIFY_VOID(model_site_beginAL)
#define __itt_model_site_beginAL_ptr ITTNOTIFY_NAME(model_site_beginAL)
#define __itt_model_site_end ITTNOTIFY_VOID(model_site_end)
#define __itt_model_site_end_ptr ITTNOTIFY_NAME(model_site_end)
#define __itt_model_site_end_2 ITTNOTIFY_VOID(model_site_end_2)
#define __itt_model_site_end_2_ptr ITTNOTIFY_NAME(model_site_end_2)
#else /* INTEL_NO_ITTNOTIFY_API */
/* Fallbacks: calls vanish, pointer names become 0. */
#define __itt_model_site_begin(site, instance, name)
#define __itt_model_site_begin_ptr 0
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_model_site_beginW(name)
#define __itt_model_site_beginW_ptr 0
#endif
#define __itt_model_site_beginA(name)
#define __itt_model_site_beginA_ptr 0
#define __itt_model_site_beginAL(name, siteNameLen)
#define __itt_model_site_beginAL_ptr 0
#define __itt_model_site_end(site, instance)
#define __itt_model_site_end_ptr 0
#define __itt_model_site_end_2()
#define __itt_model_site_end_2_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_model_site_begin_ptr 0
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_model_site_beginW_ptr 0
#endif
#define __itt_model_site_beginA_ptr 0
#define __itt_model_site_beginAL_ptr 0
#define __itt_model_site_end_ptr 0
#define __itt_model_site_end_2_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @brief ANNOTATE_TASK_BEGIN/ANNOTATE_TASK_END support
 *
 * task_begin/end model a potential task, which is contained within the most
 * closely enclosing dynamic site. task_end exits the most recently started
 * but unended task. The handle passed to end may be used to validate
 * structure. It is unspecified if bad dynamic nesting is detected. If it
 * is, it should be encoded in the resulting data collection. The collector
 * should not fail due to construct nesting issues, nor attempt to directly
 * indicate the problem.
*/ void ITTAPI __itt_model_task_begin(__itt_model_task *task, __itt_model_task_instance *instance, const char *name); #if ITT_PLATFORM==ITT_PLATFORM_WIN void ITTAPI __itt_model_task_beginW(const wchar_t *name); void ITTAPI __itt_model_iteration_taskW(const wchar_t *name); #endif void ITTAPI __itt_model_task_beginA(const char *name); void ITTAPI __itt_model_task_beginAL(const char *name, size_t taskNameLen); void ITTAPI __itt_model_iteration_taskA(const char *name); void ITTAPI __itt_model_iteration_taskAL(const char *name, size_t taskNameLen); void ITTAPI __itt_model_task_end (__itt_model_task *task, __itt_model_task_instance *instance); void ITTAPI __itt_model_task_end_2(void); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, model_task_begin, (__itt_model_task *task, __itt_model_task_instance *instance, const char *name)) #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUBV(ITTAPI, void, model_task_beginW, (const wchar_t *name)) ITT_STUBV(ITTAPI, void, model_iteration_taskW, (const wchar_t *name)) #endif ITT_STUBV(ITTAPI, void, model_task_beginA, (const char *name)) ITT_STUBV(ITTAPI, void, model_task_beginAL, (const char *name, size_t taskNameLen)) ITT_STUBV(ITTAPI, void, model_iteration_taskA, (const char *name)) ITT_STUBV(ITTAPI, void, model_iteration_taskAL, (const char *name, size_t taskNameLen)) ITT_STUBV(ITTAPI, void, model_task_end, (__itt_model_task *task, __itt_model_task_instance *instance)) ITT_STUBV(ITTAPI, void, model_task_end_2, (void)) #define __itt_model_task_begin ITTNOTIFY_VOID(model_task_begin) #define __itt_model_task_begin_ptr ITTNOTIFY_NAME(model_task_begin) #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_model_task_beginW ITTNOTIFY_VOID(model_task_beginW) #define __itt_model_task_beginW_ptr ITTNOTIFY_NAME(model_task_beginW) #define __itt_model_iteration_taskW ITTNOTIFY_VOID(model_iteration_taskW) #define __itt_model_iteration_taskW_ptr ITTNOTIFY_NAME(model_iteration_taskW) 
#endif #define __itt_model_task_beginA ITTNOTIFY_VOID(model_task_beginA) #define __itt_model_task_beginA_ptr ITTNOTIFY_NAME(model_task_beginA) #define __itt_model_task_beginAL ITTNOTIFY_VOID(model_task_beginAL) #define __itt_model_task_beginAL_ptr ITTNOTIFY_NAME(model_task_beginAL) #define __itt_model_iteration_taskA ITTNOTIFY_VOID(model_iteration_taskA) #define __itt_model_iteration_taskA_ptr ITTNOTIFY_NAME(model_iteration_taskA) #define __itt_model_iteration_taskAL ITTNOTIFY_VOID(model_iteration_taskAL) #define __itt_model_iteration_taskAL_ptr ITTNOTIFY_NAME(model_iteration_taskAL) #define __itt_model_task_end ITTNOTIFY_VOID(model_task_end) #define __itt_model_task_end_ptr ITTNOTIFY_NAME(model_task_end) #define __itt_model_task_end_2 ITTNOTIFY_VOID(model_task_end_2) #define __itt_model_task_end_2_ptr ITTNOTIFY_NAME(model_task_end_2) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_model_task_begin(task, instance, name) #define __itt_model_task_begin_ptr 0 #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_model_task_beginW(name) #define __itt_model_task_beginW_ptr 0 #endif #define __itt_model_task_beginA(name) #define __itt_model_task_beginA_ptr 0 #define __itt_model_task_beginAL(name, siteNameLen) #define __itt_model_task_beginAL_ptr 0 #define __itt_model_iteration_taskA(name) #define __itt_model_iteration_taskA_ptr 0 #define __itt_model_iteration_taskAL(name, siteNameLen) #define __itt_model_iteration_taskAL_ptr 0 #define __itt_model_task_end(task, instance) #define __itt_model_task_end_ptr 0 #define __itt_model_task_end_2() #define __itt_model_task_end_2_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_model_task_begin_ptr 0 #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_model_task_beginW_ptr 0 #endif #define __itt_model_task_beginA_ptr 0 #define __itt_model_task_beginAL_ptr 0 #define __itt_model_iteration_taskA_ptr 0 #define __itt_model_iteration_taskAL_ptr 0 #define __itt_model_task_end_ptr 0 #define 
__itt_model_task_end_2_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief ANNOTATE_LOCK_ACQUIRE/ANNOTATE_LOCK_RELEASE support * * lock_acquire/release model a potential lock for both lockset and * performance modeling. Each unique address is modeled as a separate * lock, with invalid addresses being valid lock IDs. Specifically: * no storage is accessed by the API at the specified address - it is only * used for lock identification. Lock acquires may be self-nested and are * unlocked by a corresponding number of releases. * (These closely correspond to __itt_sync_acquired/__itt_sync_releasing, * but may not have identical semantics.) */ void ITTAPI __itt_model_lock_acquire(void *lock); void ITTAPI __itt_model_lock_acquire_2(void *lock); void ITTAPI __itt_model_lock_release(void *lock); void ITTAPI __itt_model_lock_release_2(void *lock); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, model_lock_acquire, (void *lock)) ITT_STUBV(ITTAPI, void, model_lock_acquire_2, (void *lock)) ITT_STUBV(ITTAPI, void, model_lock_release, (void *lock)) ITT_STUBV(ITTAPI, void, model_lock_release_2, (void *lock)) #define __itt_model_lock_acquire ITTNOTIFY_VOID(model_lock_acquire) #define __itt_model_lock_acquire_ptr ITTNOTIFY_NAME(model_lock_acquire) #define __itt_model_lock_acquire_2 ITTNOTIFY_VOID(model_lock_acquire_2) #define __itt_model_lock_acquire_2_ptr ITTNOTIFY_NAME(model_lock_acquire_2) #define __itt_model_lock_release ITTNOTIFY_VOID(model_lock_release) #define __itt_model_lock_release_ptr ITTNOTIFY_NAME(model_lock_release) #define __itt_model_lock_release_2 ITTNOTIFY_VOID(model_lock_release_2) #define __itt_model_lock_release_2_ptr ITTNOTIFY_NAME(model_lock_release_2) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_model_lock_acquire(lock) #define __itt_model_lock_acquire_ptr 0 #define __itt_model_lock_acquire_2(lock) #define __itt_model_lock_acquire_2_ptr 0 #define 
__itt_model_lock_release(lock) #define __itt_model_lock_release_ptr 0 #define __itt_model_lock_release_2(lock) #define __itt_model_lock_release_2_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_model_lock_acquire_ptr 0 #define __itt_model_lock_acquire_2_ptr 0 #define __itt_model_lock_release_ptr 0 #define __itt_model_lock_release_2_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief ANNOTATE_RECORD_ALLOCATION/ANNOTATE_RECORD_DEALLOCATION support * * record_allocation/deallocation describe user-defined memory allocator * behavior, which may be required for correctness modeling to understand * when storage is not expected to be actually reused across threads. */ void ITTAPI __itt_model_record_allocation (void *addr, size_t size); void ITTAPI __itt_model_record_deallocation(void *addr); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, model_record_allocation, (void *addr, size_t size)) ITT_STUBV(ITTAPI, void, model_record_deallocation, (void *addr)) #define __itt_model_record_allocation ITTNOTIFY_VOID(model_record_allocation) #define __itt_model_record_allocation_ptr ITTNOTIFY_NAME(model_record_allocation) #define __itt_model_record_deallocation ITTNOTIFY_VOID(model_record_deallocation) #define __itt_model_record_deallocation_ptr ITTNOTIFY_NAME(model_record_deallocation) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_model_record_allocation(addr, size) #define __itt_model_record_allocation_ptr 0 #define __itt_model_record_deallocation(addr) #define __itt_model_record_deallocation_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_model_record_allocation_ptr 0 #define __itt_model_record_deallocation_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief ANNOTATE_INDUCTION_USES support * * Note particular storage is inductive through the end of the current site */ void ITTAPI 
__itt_model_induction_uses(void* addr, size_t size);

/** @cond exclude_from_documentation */
/*
 * The four *_uses entry points below all follow one macro pattern:
 * ITTNOTIFY_VOID/ITTNOTIFY_NAME mappings by default, empty call macros and
 * 0-valued _ptr names under INTEL_NO_ITTNOTIFY_API, and only 0-valued _ptr
 * names under INTEL_NO_MACRO_BODY.
 */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, model_induction_uses, (void *addr, size_t size))
#define __itt_model_induction_uses ITTNOTIFY_VOID(model_induction_uses)
#define __itt_model_induction_uses_ptr ITTNOTIFY_NAME(model_induction_uses)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_model_induction_uses(addr, size)
#define __itt_model_induction_uses_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_model_induction_uses_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @brief ANNOTATE_REDUCTION_USES support
 *
 * Note particular storage is used for reduction through the end
 * of the current site
 */
void ITTAPI __itt_model_reduction_uses(void* addr, size_t size);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, model_reduction_uses, (void *addr, size_t size))
#define __itt_model_reduction_uses ITTNOTIFY_VOID(model_reduction_uses)
#define __itt_model_reduction_uses_ptr ITTNOTIFY_NAME(model_reduction_uses)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_model_reduction_uses(addr, size)
#define __itt_model_reduction_uses_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_model_reduction_uses_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @brief ANNOTATE_OBSERVE_USES support
 *
 * Have correctness modeling record observations about uses of storage
 * through the end of the current site
 */
void ITTAPI __itt_model_observe_uses(void* addr, size_t size);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, model_observe_uses, (void *addr, size_t size))
#define __itt_model_observe_uses ITTNOTIFY_VOID(model_observe_uses)
#define __itt_model_observe_uses_ptr ITTNOTIFY_NAME(model_observe_uses)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_model_observe_uses(addr, size)
#define __itt_model_observe_uses_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_model_observe_uses_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @brief ANNOTATE_CLEAR_USES support
 *
 * Clear the special handling of a piece of storage related to induction,
 * reduction or observe_uses
 */
void ITTAPI __itt_model_clear_uses(void* addr);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, model_clear_uses, (void *addr))
#define __itt_model_clear_uses ITTNOTIFY_VOID(model_clear_uses)
#define __itt_model_clear_uses_ptr ITTNOTIFY_NAME(model_clear_uses)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_model_clear_uses(addr)
#define __itt_model_clear_uses_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_model_clear_uses_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @brief ANNOTATE_DISABLE_*_PUSH/ANNOTATE_DISABLE_*_POP support
 *
 * disable_push/disable_pop push and pop disabling based on a parameter.
 * Disabling observations stops processing of memory references during
 * correctness modeling, and all annotations that occur in the disabled
 * region. This allows description of code that is expected to be handled
 * specially during conversion to parallelism or that is not recognized
 * by tools (e.g. some kinds of synchronization operations.)
 * This mechanism causes all annotations in the disabled region, other
 * than disable_push and disable_pop, to be ignored. (For example, this
 * might validly be used to disable an entire parallel site and the contained
 * tasks and locking in it for data collection purposes.)
 * The disable for collection is a more expensive operation, but reduces
 * collector overhead significantly. This applies to BOTH correctness data
 * collection and performance data collection.
For example, a site
 * containing a task might only enable data collection for the first 10
 * iterations. Both performance and correctness data should reflect this,
 * and the program should run as close to full speed as possible when
 * collection is disabled.
 */
void ITTAPI __itt_model_disable_push(__itt_model_disable x);
void ITTAPI __itt_model_disable_pop(void);
void ITTAPI __itt_model_aggregate_task(size_t x);

/** @cond exclude_from_documentation */
/*
 * Macro layer: ITTNOTIFY_VOID/ITTNOTIFY_NAME mappings by default, empty call
 * macros and 0-valued _ptr names under INTEL_NO_ITTNOTIFY_API, and only
 * 0-valued _ptr names under INTEL_NO_MACRO_BODY.
 */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, model_disable_push, (__itt_model_disable x))
ITT_STUBV(ITTAPI, void, model_disable_pop, (void))
ITT_STUBV(ITTAPI, void, model_aggregate_task, (size_t x))
#define __itt_model_disable_push ITTNOTIFY_VOID(model_disable_push)
#define __itt_model_disable_push_ptr ITTNOTIFY_NAME(model_disable_push)
#define __itt_model_disable_pop ITTNOTIFY_VOID(model_disable_pop)
#define __itt_model_disable_pop_ptr ITTNOTIFY_NAME(model_disable_pop)
#define __itt_model_aggregate_task ITTNOTIFY_VOID(model_aggregate_task)
#define __itt_model_aggregate_task_ptr ITTNOTIFY_NAME(model_aggregate_task)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_model_disable_push(x)
#define __itt_model_disable_push_ptr 0
#define __itt_model_disable_pop()
#define __itt_model_disable_pop_ptr 0
#define __itt_model_aggregate_task(x)
#define __itt_model_aggregate_task_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_model_disable_push_ptr 0
#define __itt_model_disable_pop_ptr 0
#define __itt_model_aggregate_task_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/** @} model group */

/**
 * @defgroup heap Heap
 * @ingroup public
 * Heap group
 * @{
 */

/* Handle type returned by the __itt_heap_function_create* functions and
 * passed as the first argument to the heap allocate/free/reallocate calls. */
typedef void* __itt_heap_function;

/**
 * @brief Create an identification for heap function
 * @return non-zero identifier or NULL
 */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
__itt_heap_function ITTAPI __itt_heap_function_createA(const char* name, const char* domain);
__itt_heap_function ITTAPI
__itt_heap_function_createW(const wchar_t* name, const wchar_t* domain);
/* On Windows the unsuffixed name selects the W or A variant depending on
 * whether UNICODE/_UNICODE is defined. */
#if defined(UNICODE) || defined(_UNICODE)
# define __itt_heap_function_create __itt_heap_function_createW
# define __itt_heap_function_create_ptr __itt_heap_function_createW_ptr
#else
# define __itt_heap_function_create __itt_heap_function_createA
# define __itt_heap_function_create_ptr __itt_heap_function_createA_ptr
#endif /* UNICODE */
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
__itt_heap_function ITTAPI __itt_heap_function_create(const char* name, const char* domain);
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */

/** @cond exclude_from_documentation */
/*
 * Macro layer: value-returning entry points use ITT_STUB and ITTNOTIFY_DATA.
 * Under INTEL_NO_ITTNOTIFY_API the call macros expand to a null
 * __itt_heap_function and the _ptr names to 0; under INTEL_NO_MACRO_BODY only
 * the 0-valued _ptr names are defined.
 */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
#if ITT_PLATFORM==ITT_PLATFORM_WIN
ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createA, (const char* name, const char* domain))
ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createW, (const wchar_t* name, const wchar_t* domain))
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
ITT_STUB(ITTAPI, __itt_heap_function, heap_function_create, (const char* name, const char* domain))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_heap_function_createA ITTNOTIFY_DATA(heap_function_createA)
#define __itt_heap_function_createA_ptr ITTNOTIFY_NAME(heap_function_createA)
#define __itt_heap_function_createW ITTNOTIFY_DATA(heap_function_createW)
#define __itt_heap_function_createW_ptr ITTNOTIFY_NAME(heap_function_createW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_heap_function_create ITTNOTIFY_DATA(heap_function_create)
#define __itt_heap_function_create_ptr ITTNOTIFY_NAME(heap_function_create)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#else /* INTEL_NO_ITTNOTIFY_API */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_heap_function_createA(name, domain) (__itt_heap_function)0
#define __itt_heap_function_createA_ptr 0
#define __itt_heap_function_createW(name, domain) (__itt_heap_function)0
#define __itt_heap_function_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_heap_function_create(name, domain) (__itt_heap_function)0
#define __itt_heap_function_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_heap_function_createA_ptr 0
#define __itt_heap_function_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_heap_function_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @brief Record an allocation begin occurrence.
 */
void ITTAPI __itt_heap_allocate_begin(__itt_heap_function h, size_t size, int initialized);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, heap_allocate_begin, (__itt_heap_function h, size_t size, int initialized))
#define __itt_heap_allocate_begin ITTNOTIFY_VOID(heap_allocate_begin)
#define __itt_heap_allocate_begin_ptr ITTNOTIFY_NAME(heap_allocate_begin)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_heap_allocate_begin(h, size, initialized)
#define __itt_heap_allocate_begin_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_heap_allocate_begin_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @brief Record an allocation end occurrence.
*/
void ITTAPI __itt_heap_allocate_end(__itt_heap_function h, void** addr, size_t size, int initialized);

/** @cond exclude_from_documentation */
/*
 * Macro layer: ITTNOTIFY_VOID/ITTNOTIFY_NAME mappings by default, empty call
 * macros and 0-valued _ptr names under INTEL_NO_ITTNOTIFY_API, and only
 * 0-valued _ptr names under INTEL_NO_MACRO_BODY.
 */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, heap_allocate_end, (__itt_heap_function h, void** addr, size_t size, int initialized))
#define __itt_heap_allocate_end ITTNOTIFY_VOID(heap_allocate_end)
#define __itt_heap_allocate_end_ptr ITTNOTIFY_NAME(heap_allocate_end)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_heap_allocate_end(h, addr, size, initialized)
#define __itt_heap_allocate_end_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_heap_allocate_end_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @brief Record a free begin occurrence.
 */
void ITTAPI __itt_heap_free_begin(__itt_heap_function h, void* addr);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, heap_free_begin, (__itt_heap_function h, void* addr))
#define __itt_heap_free_begin ITTNOTIFY_VOID(heap_free_begin)
#define __itt_heap_free_begin_ptr ITTNOTIFY_NAME(heap_free_begin)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_heap_free_begin(h, addr)
#define __itt_heap_free_begin_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_heap_free_begin_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @brief Record a free end occurrence.
*/
void ITTAPI __itt_heap_free_end(__itt_heap_function h, void* addr);

/** @cond exclude_from_documentation */
/*
 * Macro layer: ITTNOTIFY_VOID/ITTNOTIFY_NAME mappings by default, empty call
 * macros and 0-valued _ptr names under INTEL_NO_ITTNOTIFY_API, and only
 * 0-valued _ptr names under INTEL_NO_MACRO_BODY.
 */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, heap_free_end, (__itt_heap_function h, void* addr))
#define __itt_heap_free_end ITTNOTIFY_VOID(heap_free_end)
#define __itt_heap_free_end_ptr ITTNOTIFY_NAME(heap_free_end)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_heap_free_end(h, addr)
#define __itt_heap_free_end_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_heap_free_end_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @brief Record a reallocation begin occurrence.
 */
void ITTAPI __itt_heap_reallocate_begin(__itt_heap_function h, void* addr, size_t new_size, int initialized);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, heap_reallocate_begin, (__itt_heap_function h, void* addr, size_t new_size, int initialized))
#define __itt_heap_reallocate_begin ITTNOTIFY_VOID(heap_reallocate_begin)
#define __itt_heap_reallocate_begin_ptr ITTNOTIFY_NAME(heap_reallocate_begin)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_heap_reallocate_begin(h, addr, new_size, initialized)
#define __itt_heap_reallocate_begin_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_heap_reallocate_begin_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @brief Record a reallocation end occurrence.
*/
void ITTAPI __itt_heap_reallocate_end(__itt_heap_function h, void* addr, void** new_addr, size_t new_size, int initialized);

/** @cond exclude_from_documentation */
/*
 * Every entry point in this stretch uses the same macro pattern:
 * ITTNOTIFY_VOID/ITTNOTIFY_NAME mappings by default, empty call macros and
 * 0-valued _ptr names under INTEL_NO_ITTNOTIFY_API, and only 0-valued _ptr
 * names under INTEL_NO_MACRO_BODY.
 */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, heap_reallocate_end, (__itt_heap_function h, void* addr, void** new_addr, size_t new_size, int initialized))
#define __itt_heap_reallocate_end ITTNOTIFY_VOID(heap_reallocate_end)
#define __itt_heap_reallocate_end_ptr ITTNOTIFY_NAME(heap_reallocate_end)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_heap_reallocate_end(h, addr, new_addr, new_size, initialized)
#define __itt_heap_reallocate_end_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_heap_reallocate_end_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/** @brief internal access begin */
void ITTAPI __itt_heap_internal_access_begin(void);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, heap_internal_access_begin, (void))
#define __itt_heap_internal_access_begin ITTNOTIFY_VOID(heap_internal_access_begin)
#define __itt_heap_internal_access_begin_ptr ITTNOTIFY_NAME(heap_internal_access_begin)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_heap_internal_access_begin()
#define __itt_heap_internal_access_begin_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_heap_internal_access_begin_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/** @brief internal access end */
void ITTAPI __itt_heap_internal_access_end(void);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, heap_internal_access_end, (void))
#define __itt_heap_internal_access_end ITTNOTIFY_VOID(heap_internal_access_end)
#define __itt_heap_internal_access_end_ptr ITTNOTIFY_NAME(heap_internal_access_end)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_heap_internal_access_end()
#define __itt_heap_internal_access_end_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_heap_internal_access_end_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/** @brief record memory growth begin */
void ITTAPI __itt_heap_record_memory_growth_begin(void);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, heap_record_memory_growth_begin, (void))
#define __itt_heap_record_memory_growth_begin ITTNOTIFY_VOID(heap_record_memory_growth_begin)
#define __itt_heap_record_memory_growth_begin_ptr ITTNOTIFY_NAME(heap_record_memory_growth_begin)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_heap_record_memory_growth_begin()
#define __itt_heap_record_memory_growth_begin_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_heap_record_memory_growth_begin_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/** @brief record memory growth end */
void ITTAPI __itt_heap_record_memory_growth_end(void);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, heap_record_memory_growth_end, (void))
#define __itt_heap_record_memory_growth_end ITTNOTIFY_VOID(heap_record_memory_growth_end)
#define __itt_heap_record_memory_growth_end_ptr ITTNOTIFY_NAME(heap_record_memory_growth_end)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_heap_record_memory_growth_end()
#define __itt_heap_record_memory_growth_end_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_heap_record_memory_growth_end_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @brief Specify the type of heap detection/reporting to modify.
 */
/**
 * @hideinitializer
 * @brief Report on memory leaks.
 */
#define __itt_heap_leaks 0x00000001

/**
 * @hideinitializer
 * @brief Report on memory growth.
*/ #define __itt_heap_growth 0x00000002 /** @brief heap reset detection */ void ITTAPI __itt_heap_reset_detection(unsigned int reset_mask); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, heap_reset_detection, (unsigned int reset_mask)) #define __itt_heap_reset_detection ITTNOTIFY_VOID(heap_reset_detection) #define __itt_heap_reset_detection_ptr ITTNOTIFY_NAME(heap_reset_detection) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_heap_reset_detection() #define __itt_heap_reset_detection_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_heap_reset_detection_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @brief report */ void ITTAPI __itt_heap_record(unsigned int record_mask); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, heap_record, (unsigned int record_mask)) #define __itt_heap_record ITTNOTIFY_VOID(heap_record) #define __itt_heap_record_ptr ITTNOTIFY_NAME(heap_record) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_heap_record() #define __itt_heap_record_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_heap_record_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @} heap group */ /** @endcond */ /* ========================================================================== */ /** * @defgroup domains Domains * @ingroup public * Domains group * @{ */ /** @cond exclude_from_documentation */ #pragma pack(push, 8) typedef struct ___itt_domain { volatile int flags; /*!< Zero if disabled, non-zero if enabled. The meaning of different non-zero values is reserved to the runtime */ const char* nameA; /*!< Copy of original name in ASCII. */ #if defined(UNICODE) || defined(_UNICODE) const wchar_t* nameW; /*!< Copy of original name in UNICODE. 
*/
#else /* UNICODE || _UNICODE */
    void* nameW;
#endif /* UNICODE || _UNICODE */
    int extra1; /*!< Reserved to the runtime */
    void* extra2; /*!< Reserved to the runtime */
    struct ___itt_domain* next;
} __itt_domain;

#pragma pack(pop)
/** @endcond */

/**
 * @ingroup domains
 * @brief Create a domain.
 * Create domain using some domain name: the URI naming style is recommended.
 * Because the set of domains is expected to be static over the application's
 * execution time, there is no mechanism to destroy a domain.
 * Any domain can be accessed by any thread in the process, regardless of
 * which thread created the domain. This call is thread-safe.
 * @param[in] name name of domain
 * @return Pointer to the domain. The compiled-out stub macros further down
 *         evaluate to a null pointer instead.
 */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
__itt_domain* ITTAPI __itt_domain_createA(const char *name);
__itt_domain* ITTAPI __itt_domain_createW(const wchar_t *name);
/* The unsuffixed name maps to the W variant when UNICODE/_UNICODE is defined, otherwise to the A variant. */
#if defined(UNICODE) || defined(_UNICODE)
# define __itt_domain_create __itt_domain_createW
# define __itt_domain_create_ptr __itt_domain_createW_ptr
#else /* UNICODE */
# define __itt_domain_create __itt_domain_createA
# define __itt_domain_create_ptr __itt_domain_createA_ptr
#endif /* UNICODE */
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
__itt_domain* ITTAPI __itt_domain_create(const char *name);
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
#if ITT_PLATFORM==ITT_PLATFORM_WIN
ITT_STUB(ITTAPI, __itt_domain*, domain_createA, (const char *name))
ITT_STUB(ITTAPI, __itt_domain*, domain_createW, (const wchar_t *name))
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
ITT_STUB(ITTAPI, __itt_domain*, domain_create, (const char *name))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
/* Route the public names through the ITTNOTIFY_DATA / ITTNOTIFY_NAME wrappers. */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_domain_createA ITTNOTIFY_DATA(domain_createA)
#define __itt_domain_createA_ptr ITTNOTIFY_NAME(domain_createA)
#define __itt_domain_createW ITTNOTIFY_DATA(domain_createW)
#define __itt_domain_createW_ptr ITTNOTIFY_NAME(domain_createW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_domain_create ITTNOTIFY_DATA(domain_create)
#define __itt_domain_create_ptr ITTNOTIFY_NAME(domain_create)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#else /* INTEL_NO_ITTNOTIFY_API */
/* INTEL_NO_ITTNOTIFY_API defined: creation evaluates to a null domain pointer and the *_ptr names are 0. */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_domain_createA(name) (__itt_domain*)0
#define __itt_domain_createA_ptr 0
#define __itt_domain_createW(name) (__itt_domain*)0
#define __itt_domain_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_domain_create(name) (__itt_domain*)0
#define __itt_domain_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
/* INTEL_NO_MACRO_BODY defined: only the *_ptr names are defined here (as 0). */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_domain_createA_ptr 0
#define __itt_domain_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_domain_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/** @} domains group */

/**
 * @defgroup ids IDs
 * @ingroup public
 * IDs group
 * @{
 */
/** @cond exclude_from_documentation */
#pragma pack(push, 8)

/* Identifier value made of three 64-bit words. */
typedef struct ___itt_id
{
    unsigned long long d1, d2, d3;
} __itt_id;

#pragma pack(pop)
/** @endcond */

/* The all-zero identifier. */
static const __itt_id __itt_null = { 0, 0, 0 };

/**
 * @ingroup ids
 * @brief A convenience function is provided to create an ID without domain control.
 * @brief This is a convenience function to initialize an __itt_id structure. This function
 * does not affect the collector runtime in any way. After you make the ID with this
 * function, you still must create it with the __itt_id_create function before using the ID
 * to identify a named entity.
 * @param[in] addr The address of object; high QWORD of the ID value.
 * @param[in] extra The extra data to unique identify object; low QWORD of the ID value.
*/ ITT_INLINE __itt_id ITTAPI __itt_id_make(void* addr, unsigned long long extra) ITT_INLINE_ATTRIBUTE; ITT_INLINE __itt_id ITTAPI __itt_id_make(void* addr, unsigned long long extra) { __itt_id id = __itt_null; id.d1 = (unsigned long long)((uintptr_t)addr); id.d2 = (unsigned long long)extra; id.d3 = (unsigned long long)0; /* Reserved. Must be zero */ return id; } /** * @ingroup ids * @brief Create an instance of identifier. * This establishes the beginning of the lifetime of an instance of * the given ID in the trace. Once this lifetime starts, the ID * can be used to tag named entity instances in calls such as * __itt_task_begin, and to specify relationships among * identified named entity instances, using the \ref relations APIs. * Instance IDs are not domain specific! * @param[in] domain The domain controlling the execution of this call. * @param[in] id The ID to create. */ void ITTAPI __itt_id_create(const __itt_domain *domain, __itt_id id); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, id_create, (const __itt_domain *domain, __itt_id id)) #define __itt_id_create(d,x) ITTNOTIFY_VOID_D1(id_create,d,x) #define __itt_id_create_ptr ITTNOTIFY_NAME(id_create) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_id_create(domain,id) #define __itt_id_create_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_id_create_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @ingroup ids * @brief Destroy an instance of identifier. * This ends the lifetime of the current instance of the given ID value in the trace. * Any relationships that are established after this lifetime ends are invalid. * This call must be performed before the given ID value can be reused for a different * named entity instance. * @param[in] domain The domain controlling the execution of this call. * @param[in] id The ID to destroy. 
*/
void ITTAPI __itt_id_destroy(const __itt_domain *domain, __itt_id id);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, id_destroy, (const __itt_domain *domain, __itt_id id))
#define __itt_id_destroy(d,x) ITTNOTIFY_VOID_D1(id_destroy,d,x)
#define __itt_id_destroy_ptr ITTNOTIFY_NAME(id_destroy)
#else /* INTEL_NO_ITTNOTIFY_API */
/* INTEL_NO_ITTNOTIFY_API defined: the call expands to nothing. */
#define __itt_id_destroy(domain,id)
#define __itt_id_destroy_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
/* INTEL_NO_MACRO_BODY defined: only the *_ptr name is defined here (as 0). */
#define __itt_id_destroy_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/** @} ids group */

/**
 * @defgroup handles String Handles
 * @ingroup public
 * String Handles group
 * @{
 */
/** @cond exclude_from_documentation */
#pragma pack(push, 8)

typedef struct ___itt_string_handle
{
    const char* strA; /*!< Copy of original string in ASCII. */
#if defined(UNICODE) || defined(_UNICODE)
    const wchar_t* strW; /*!< Copy of original string in UNICODE. */
#else /* UNICODE || _UNICODE */
    void* strW; /*!< Untyped slot used in place of the wchar_t pointer when UNICODE/_UNICODE is not defined */
#endif /* UNICODE || _UNICODE */
    int extra1; /*!< Reserved. Must be zero */
    void* extra2; /*!< Reserved. Must be zero */
    struct ___itt_string_handle* next; /*!< Pointer to another handle; presumably the runtime's list link - confirm */
} __itt_string_handle;

#pragma pack(pop)
/** @endcond */

/**
 * @ingroup handles
 * @brief Create a string handle.
 * Create and return handle value that can be associated with a string.
 * Consecutive calls to __itt_string_handle_create with the same name
 * return the same value. Because the set of string handles is expected to remain
 * static during the application's execution time, there is no mechanism to destroy a string handle.
 * Any string handle can be accessed by any thread in the process, regardless of which thread created
 * the string handle. This call is thread-safe.
* @param[in] name The input string */ #if ITT_PLATFORM==ITT_PLATFORM_WIN __itt_string_handle* ITTAPI __itt_string_handle_createA(const char *name); __itt_string_handle* ITTAPI __itt_string_handle_createW(const wchar_t *name); #if defined(UNICODE) || defined(_UNICODE) # define __itt_string_handle_create __itt_string_handle_createW # define __itt_string_handle_create_ptr __itt_string_handle_createW_ptr #else /* UNICODE */ # define __itt_string_handle_create __itt_string_handle_createA # define __itt_string_handle_create_ptr __itt_string_handle_createA_ptr #endif /* UNICODE */ #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ __itt_string_handle* ITTAPI __itt_string_handle_create(const char *name); #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createA, (const char *name)) ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createW, (const wchar_t *name)) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_create, (const char *name)) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_string_handle_createA ITTNOTIFY_DATA(string_handle_createA) #define __itt_string_handle_createA_ptr ITTNOTIFY_NAME(string_handle_createA) #define __itt_string_handle_createW ITTNOTIFY_DATA(string_handle_createW) #define __itt_string_handle_createW_ptr ITTNOTIFY_NAME(string_handle_createW) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_string_handle_create ITTNOTIFY_DATA(string_handle_create) #define __itt_string_handle_create_ptr ITTNOTIFY_NAME(string_handle_create) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #else /* INTEL_NO_ITTNOTIFY_API */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_string_handle_createA(name) (__itt_string_handle*)0 #define __itt_string_handle_createA_ptr 0 #define 
__itt_string_handle_createW(name) (__itt_string_handle*)0 #define __itt_string_handle_createW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_string_handle_create(name) (__itt_string_handle*)0 #define __itt_string_handle_create_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_string_handle_createA_ptr 0 #define __itt_string_handle_createW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_string_handle_create_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @} handles group */ /** @cond exclude_from_documentation */ typedef unsigned long long __itt_timestamp; /** @endcond */ #define __itt_timestamp_none ((__itt_timestamp)-1LL) /** @cond exclude_from_gpa_documentation */ /** * @ingroup timestamps * @brief Return timestamp corresponding to the current moment. * This returns the timestamp in the format that is the most relevant for the current * host or platform (RDTSC, QPC, and others). You can use the "<" operator to * compare __itt_timestamp values. */ __itt_timestamp ITTAPI __itt_get_timestamp(void); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUB(ITTAPI, __itt_timestamp, get_timestamp, (void)) #define __itt_get_timestamp ITTNOTIFY_DATA(get_timestamp) #define __itt_get_timestamp_ptr ITTNOTIFY_NAME(get_timestamp) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_get_timestamp() #define __itt_get_timestamp_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_get_timestamp_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @} timestamps */ /** @endcond */ /** @cond exclude_from_gpa_documentation */ /** * @defgroup regions Regions * @ingroup public * Regions group * @{ */ /** * @ingroup regions * @brief Begin of region instance. 
* Successive calls to __itt_region_begin with the same ID are ignored
 * until a call to __itt_region_end with the same ID
 * @param[in] domain The domain for this region instance
 * @param[in] id The instance ID for this region instance. Must not be __itt_null
 * @param[in] parentid The instance ID for the parent of this region instance, or __itt_null
 * @param[in] name The name of this region
 */
void ITTAPI __itt_region_begin(const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name);

/**
 * @ingroup regions
 * @brief End of region instance.
 * The first call to __itt_region_end with a given ID ends the
 * region. Successive calls with the same ID are ignored, as are
 * calls that do not have a matching __itt_region_begin call.
 * @param[in] domain The domain for this region instance
 * @param[in] id The instance ID for this region instance
 */
void ITTAPI __itt_region_end(const __itt_domain *domain, __itt_id id);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, region_begin, (const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name))
ITT_STUBV(ITTAPI, void, region_end, (const __itt_domain *domain, __itt_id id))
/* The first macro argument (d) is the domain; the remaining ones are forwarded in order. */
#define __itt_region_begin(d,x,y,z) ITTNOTIFY_VOID_D3(region_begin,d,x,y,z)
#define __itt_region_begin_ptr ITTNOTIFY_NAME(region_begin)
#define __itt_region_end(d,x) ITTNOTIFY_VOID_D1(region_end,d,x)
#define __itt_region_end_ptr ITTNOTIFY_NAME(region_end)
#else /* INTEL_NO_ITTNOTIFY_API */
/* INTEL_NO_ITTNOTIFY_API defined: both calls expand to nothing and the *_ptr names are 0. */
#define __itt_region_begin(d,x,y,z)
#define __itt_region_begin_ptr 0
#define __itt_region_end(d,x)
#define __itt_region_end_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
/* INTEL_NO_MACRO_BODY defined: only the *_ptr names are defined here (as 0). */
#define __itt_region_begin_ptr 0
#define __itt_region_end_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/** @} regions group */

/**
 * @defgroup frames Frames
 * @ingroup public
 * Frames are similar to regions, but are intended to be easier to use and
to implement.
 * In particular:
 * - Frames always represent periods of elapsed time
 * - By default, frames have no nesting relationships
 * @{
 */

/**
 * @ingroup frames
 * @brief Begin a frame instance.
 * Successive calls to __itt_frame_begin_v3 with the
 * same ID are ignored until a call to __itt_frame_end_v3 with the same ID.
 * @param[in] domain The domain for this frame instance
 * @param[in] id The instance ID for this frame instance or NULL
 */
void ITTAPI __itt_frame_begin_v3(const __itt_domain *domain, __itt_id *id);

/**
 * @ingroup frames
 * @brief End a frame instance.
 * The first call to __itt_frame_end_v3 with a given ID
 * ends the frame. Successive calls with the same ID are ignored, as are
 * calls that do not have a matching __itt_frame_begin_v3 call.
 * @param[in] domain The domain for this frame instance
 * @param[in] id The instance ID for this frame instance or NULL for current
 */
void ITTAPI __itt_frame_end_v3(const __itt_domain *domain, __itt_id *id);

/**
 * @ingroup frames
 * @brief Submits a frame instance.
 * Successive calls to __itt_frame_begin_v3 or __itt_frame_submit_v3 with the
 * same ID are ignored until a call to __itt_frame_end_v3 or __itt_frame_submit_v3
 * with the same ID.
 * Passing special __itt_timestamp_none value as "end" argument means
 * take the current timestamp as the end timestamp.
 * @param[in] domain The domain for this frame instance
 * @param[in] id The instance ID for this frame instance or NULL
 * @param[in] begin Timestamp of the beginning of the frame
 * @param[in] end Timestamp of the end of the frame
 */
void ITTAPI __itt_frame_submit_v3(const __itt_domain *domain, __itt_id *id, __itt_timestamp begin, __itt_timestamp end);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, frame_begin_v3, (const __itt_domain *domain, __itt_id *id))
ITT_STUBV(ITTAPI, void, frame_end_v3, (const __itt_domain *domain, __itt_id *id))
ITT_STUBV(ITTAPI, void, frame_submit_v3, (const __itt_domain *domain, __itt_id *id, __itt_timestamp begin, __itt_timestamp end))
/* The first macro argument (d) is the domain; the remaining ones are forwarded in order. */
#define __itt_frame_begin_v3(d,x) ITTNOTIFY_VOID_D1(frame_begin_v3,d,x)
#define __itt_frame_begin_v3_ptr ITTNOTIFY_NAME(frame_begin_v3)
#define __itt_frame_end_v3(d,x) ITTNOTIFY_VOID_D1(frame_end_v3,d,x)
#define __itt_frame_end_v3_ptr ITTNOTIFY_NAME(frame_end_v3)
#define __itt_frame_submit_v3(d,x,b,e) ITTNOTIFY_VOID_D3(frame_submit_v3,d,x,b,e)
#define __itt_frame_submit_v3_ptr ITTNOTIFY_NAME(frame_submit_v3)
#else /* INTEL_NO_ITTNOTIFY_API */
/* INTEL_NO_ITTNOTIFY_API defined: the calls expand to nothing and the *_ptr names are 0. */
#define __itt_frame_begin_v3(domain,id)
#define __itt_frame_begin_v3_ptr 0
#define __itt_frame_end_v3(domain,id)
#define __itt_frame_end_v3_ptr 0
#define __itt_frame_submit_v3(domain,id,begin,end)
#define __itt_frame_submit_v3_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
/* INTEL_NO_MACRO_BODY defined: only the *_ptr names are defined here (as 0). */
#define __itt_frame_begin_v3_ptr 0
#define __itt_frame_end_v3_ptr 0
#define __itt_frame_submit_v3_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/** @} frames group */
/** @endcond */

/**
 * @defgroup taskgroup Task Group
 * @ingroup public
 * Task Group
 * @{
 */
/**
 * @ingroup task_groups
 * @brief Denotes a task_group instance.
 * Successive calls to __itt_task_group with the same ID are ignored.
* @param[in] domain The domain for this task_group instance
 * @param[in] id The instance ID for this task_group instance. Must not be __itt_null.
 * @param[in] parentid The instance ID for the parent of this task_group instance, or __itt_null.
 * @param[in] name The name of this task_group
 */
void ITTAPI __itt_task_group(const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, task_group, (const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name))
/* Macro arguments map positionally: d = domain, x = id, y = parentid, z = name. */
#define __itt_task_group(d,x,y,z) ITTNOTIFY_VOID_D3(task_group,d,x,y,z)
#define __itt_task_group_ptr ITTNOTIFY_NAME(task_group)
#else /* INTEL_NO_ITTNOTIFY_API */
/* INTEL_NO_ITTNOTIFY_API defined: the call expands to nothing and the *_ptr name is 0. */
#define __itt_task_group(d,x,y,z)
#define __itt_task_group_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
/* INTEL_NO_MACRO_BODY defined: only the *_ptr name is defined here (as 0). */
#define __itt_task_group_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/** @} taskgroup group */

/**
 * @defgroup tasks Tasks
 * @ingroup public
 * A task instance represents a piece of work performed by a particular
 * thread for a period of time. A call to __itt_task_begin creates a
 * task instance. This becomes the current instance for that task on that
 * thread. A following call to __itt_task_end on the same thread ends the
 * instance. There may be multiple simultaneous instances of tasks with the
 * same name on different threads. If an ID is specified, the task instance
 * receives that ID. Nested tasks are allowed.
 *
 * Note: The task is defined by the bracketing of __itt_task_begin and
 * __itt_task_end on the same thread. If some scheduling mechanism causes
 * task switching (the thread executes a different user task) or task
 * switching (the user task switches to a different thread) then this breaks
 * the notion of current instance. Additional API calls are required to
 * deal with that possibility.
* @{
 */

/**
 * @ingroup tasks
 * @brief Begin a task instance.
 * @param[in] domain The domain for this task
 * @param[in] taskid The instance ID for this task instance, or __itt_null
 * @param[in] parentid The parent instance to which this task instance belongs, or __itt_null
 * @param[in] name The name of this task
 */
void ITTAPI __itt_task_begin(const __itt_domain *domain, __itt_id taskid, __itt_id parentid, __itt_string_handle *name);

/**
 * @ingroup tasks
 * @brief Begin a task instance that is described by a function pointer instead of a name.
 * @param[in] domain The domain for this task
 * @param[in] taskid The identifier for this task instance (may be 0)
 * @param[in] parentid The parent of this task (may be 0)
 * @param[in] fn The pointer to the function you are tracing
 */
void ITTAPI __itt_task_begin_fn(const __itt_domain *domain, __itt_id taskid, __itt_id parentid, void* fn);

/**
 * @ingroup tasks
 * @brief End the current task instance.
 * @param[in] domain The domain for this task
 */
void ITTAPI __itt_task_end(const __itt_domain *domain);

/**
 * @ingroup tasks
 * @brief Begin an overlapped task instance.
 * @param[in] domain The domain for this task.
 * @param[in] taskid The identifier for this task instance, *cannot* be __itt_null.
 * @param[in] parentid The parent of this task, or __itt_null.
 * @param[in] name The name of this task.
 */
void ITTAPI __itt_task_begin_overlapped(const __itt_domain* domain, __itt_id taskid, __itt_id parentid, __itt_string_handle* name);

/**
 * @ingroup tasks
 * @brief End an overlapped task instance.
 * @param[in] domain The domain for this task
 * @param[in] taskid Explicit ID of finished task
 */
void ITTAPI __itt_task_end_overlapped(const __itt_domain *domain, __itt_id taskid);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, task_begin, (const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name))
ITT_STUBV(ITTAPI, void, task_begin_fn, (const __itt_domain *domain, __itt_id id, __itt_id parentid, void* fn))
ITT_STUBV(ITTAPI, void, task_end, (const __itt_domain *domain))
ITT_STUBV(ITTAPI, void, task_begin_overlapped, (const __itt_domain *domain, __itt_id taskid, __itt_id parentid, __itt_string_handle *name))
ITT_STUBV(ITTAPI, void, task_end_overlapped, (const __itt_domain *domain, __itt_id taskid))
/* The first macro argument (d) is the domain; the remaining ones are forwarded in order. */
#define __itt_task_begin(d,x,y,z) ITTNOTIFY_VOID_D3(task_begin,d,x,y,z)
#define __itt_task_begin_ptr ITTNOTIFY_NAME(task_begin)
#define __itt_task_begin_fn(d,x,y,z) ITTNOTIFY_VOID_D3(task_begin_fn,d,x,y,z)
#define __itt_task_begin_fn_ptr ITTNOTIFY_NAME(task_begin_fn)
#define __itt_task_end(d) ITTNOTIFY_VOID_D0(task_end,d)
#define __itt_task_end_ptr ITTNOTIFY_NAME(task_end)
#define __itt_task_begin_overlapped(d,x,y,z) ITTNOTIFY_VOID_D3(task_begin_overlapped,d,x,y,z)
#define __itt_task_begin_overlapped_ptr ITTNOTIFY_NAME(task_begin_overlapped)
#define __itt_task_end_overlapped(d,x) ITTNOTIFY_VOID_D1(task_end_overlapped,d,x)
#define __itt_task_end_overlapped_ptr ITTNOTIFY_NAME(task_end_overlapped)
#else /* INTEL_NO_ITTNOTIFY_API */
/* INTEL_NO_ITTNOTIFY_API defined: the calls expand to nothing and the *_ptr names are 0. */
#define __itt_task_begin(domain,id,parentid,name)
#define __itt_task_begin_ptr 0
#define __itt_task_begin_fn(domain,id,parentid,fn)
#define __itt_task_begin_fn_ptr 0
#define __itt_task_end(domain)
#define __itt_task_end_ptr 0
#define __itt_task_begin_overlapped(domain,taskid,parentid,name)
#define __itt_task_begin_overlapped_ptr 0
#define __itt_task_end_overlapped(domain,taskid)
#define __itt_task_end_overlapped_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
/* INTEL_NO_MACRO_BODY defined: only the *_ptr names are defined here (as 0). */
#define __itt_task_begin_ptr 0
#define __itt_task_begin_fn_ptr 0
#define __itt_task_end_ptr 0
#define __itt_task_begin_overlapped_ptr 0
#define __itt_task_end_overlapped_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/** @} tasks group */

/**
 * @defgroup markers Markers
 * Markers represent a single discrete event in time. Markers have a scope,
 * described by an enumerated type __itt_scope. Markers are created by
 * the API call __itt_marker. A marker instance can be given an ID for use in
 * adding metadata.
 * @{
 */

/**
 * @brief Describes the scope of an event object in the trace.
 */
typedef enum
{
    __itt_scope_unknown = 0,
    __itt_scope_global,
    __itt_scope_track_group,
    __itt_scope_track,
    __itt_scope_task,
    __itt_scope_marker
} __itt_scope;

/** @cond exclude_from_documentation */
/* Marker-scope names defined as aliases of the __itt_scope enumerators (process -> track_group, thread -> track). */
#define __itt_marker_scope_unknown __itt_scope_unknown
#define __itt_marker_scope_global __itt_scope_global
#define __itt_marker_scope_process __itt_scope_track_group
#define __itt_marker_scope_thread __itt_scope_track
#define __itt_marker_scope_task __itt_scope_task
/** @endcond */

/**
 * @ingroup markers
 * @brief Create a marker instance
 * @param[in] domain The domain for this marker
 * @param[in] id The instance ID for this marker or __itt_null
 * @param[in] name The name for this marker
 * @param[in] scope The scope for this marker
 */
void ITTAPI __itt_marker(const __itt_domain *domain, __itt_id id, __itt_string_handle *name, __itt_scope scope);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, marker, (const __itt_domain *domain, __itt_id id, __itt_string_handle *name, __itt_scope scope))
#define __itt_marker(d,x,y,z) ITTNOTIFY_VOID_D3(marker,d,x,y,z)
#define __itt_marker_ptr ITTNOTIFY_NAME(marker)
#else /* INTEL_NO_ITTNOTIFY_API */
/* INTEL_NO_ITTNOTIFY_API defined: the call expands to nothing and the *_ptr name is 0. */
#define __itt_marker(domain,id,name,scope)
#define __itt_marker_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_marker_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/** @} markers group */

/**
 * @defgroup metadata Metadata
 * The metadata API is used to attach extra information to named
 * entities. Metadata can be attached to an identified named entity by ID,
 * or to the current entity (which is always a task).
 *
 * Conceptually metadata has a type (what kind of metadata), a key (the
 * name of the metadata), and a value (the actual data). The encoding of
 * the value depends on the type of the metadata.
 *
 * The type of metadata is specified by an enumerated type __itt_metadata_type.
 * @{
 */

/**
 * @ingroup parameters
 * @brief Describes the type of metadata
 */
typedef enum {
    __itt_metadata_unknown = 0,
    __itt_metadata_u64, /**< Unsigned 64-bit integer */
    __itt_metadata_s64, /**< Signed 64-bit integer */
    __itt_metadata_u32, /**< Unsigned 32-bit integer */
    __itt_metadata_s32, /**< Signed 32-bit integer */
    __itt_metadata_u16, /**< Unsigned 16-bit integer */
    __itt_metadata_s16, /**< Signed 16-bit integer */
    __itt_metadata_float, /**< Signed 32-bit floating-point */
    __itt_metadata_double /**< Signed 64-bit floating-point */
} __itt_metadata_type;

/**
 * @ingroup parameters
 * @brief Add metadata to an instance of a named entity.
 * @param[in] domain The domain controlling the call
 * @param[in] id The identifier of the instance to which the metadata is to be added, or __itt_null to add to the current task
 * @param[in] key The name of the metadata
 * @param[in] type The type of the metadata
 * @param[in] count The number of elements of the given type. If count == 0, no metadata will be added.
* @param[in] data The metadata itself
*/
void ITTAPI __itt_metadata_add(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, metadata_add, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data))
/* Macro arguments map positionally: d = domain, x = id, y = key, z = type, a = count, b = data. */
#define __itt_metadata_add(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(metadata_add,d,x,y,z,a,b)
#define __itt_metadata_add_ptr ITTNOTIFY_NAME(metadata_add)
#else /* INTEL_NO_ITTNOTIFY_API */
/* INTEL_NO_ITTNOTIFY_API defined: the call expands to nothing and the *_ptr name is 0. */
#define __itt_metadata_add(d,x,y,z,a,b)
#define __itt_metadata_add_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_metadata_add_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @ingroup parameters
 * @brief Add string metadata to an instance of a named entity.
 * @param[in] domain The domain controlling the call
 * @param[in] id The identifier of the instance to which the metadata is to be added, or __itt_null to add to the current task
 * @param[in] key The name of the metadata
 * @param[in] data The metadata itself
 * @param[in] length The number of characters in the string, or -1 if the length is unknown but the string is null-terminated
 */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
void ITTAPI __itt_metadata_str_addA(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length);
void ITTAPI __itt_metadata_str_addW(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const wchar_t *data, size_t length);
/* The unsuffixed name maps to the W variant when UNICODE/_UNICODE is defined, otherwise to the A variant. */
#if defined(UNICODE) || defined(_UNICODE)
# define __itt_metadata_str_add __itt_metadata_str_addW
# define __itt_metadata_str_add_ptr __itt_metadata_str_addW_ptr
#else /* UNICODE */
# define __itt_metadata_str_add __itt_metadata_str_addA
# define __itt_metadata_str_add_ptr __itt_metadata_str_addA_ptr
#endif /* UNICODE */
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
void ITTAPI __itt_metadata_str_add(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length);
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
#if ITT_PLATFORM==ITT_PLATFORM_WIN
ITT_STUBV(ITTAPI, void, metadata_str_addA, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length))
ITT_STUBV(ITTAPI, void, metadata_str_addW, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const wchar_t *data, size_t length))
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
ITT_STUBV(ITTAPI, void, metadata_str_add, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_metadata_str_addA(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_addA,d,x,y,z,a)
#define __itt_metadata_str_addA_ptr ITTNOTIFY_NAME(metadata_str_addA)
#define __itt_metadata_str_addW(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_addW,d,x,y,z,a)
#define __itt_metadata_str_addW_ptr ITTNOTIFY_NAME(metadata_str_addW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_metadata_str_add(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add,d,x,y,z,a)
#define __itt_metadata_str_add_ptr ITTNOTIFY_NAME(metadata_str_add)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#else /* INTEL_NO_ITTNOTIFY_API */
/* INTEL_NO_ITTNOTIFY_API defined: the calls expand to nothing and the *_ptr names are 0. */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_metadata_str_addA(d,x,y,z,a)
#define __itt_metadata_str_addA_ptr 0
#define __itt_metadata_str_addW(d,x,y,z,a)
#define __itt_metadata_str_addW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_metadata_str_add(d,x,y,z,a)
#define __itt_metadata_str_add_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
/* INTEL_NO_MACRO_BODY defined: only the *_ptr names are defined here (as 0). */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_metadata_str_addA_ptr 0
#define __itt_metadata_str_addW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_metadata_str_add_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @ingroup parameters
 * @brief Add metadata to the entity selected by a scope.
 * Unlike __itt_metadata_add, this call takes a scope instead of an instance ID.
 * @param[in] domain The domain controlling the call
 * @param[in] scope The scope of the instance to which the metadata is to be added
 * @param[in] key The name of the metadata
 * @param[in] type The type of the metadata
 * @param[in] count The number of elements of the given type. If count == 0, no metadata will be added.
 * @param[in] data The metadata itself
 */
void ITTAPI __itt_metadata_add_with_scope(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, metadata_add_with_scope, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data))
/* Macro arguments map positionally: d = domain, x = scope, y = key, z = type, a = count, b = data. */
#define __itt_metadata_add_with_scope(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(metadata_add_with_scope,d,x,y,z,a,b)
#define __itt_metadata_add_with_scope_ptr ITTNOTIFY_NAME(metadata_add_with_scope)
#else /* INTEL_NO_ITTNOTIFY_API */
/* INTEL_NO_ITTNOTIFY_API defined: the call expands to nothing and the *_ptr name is 0. */
#define __itt_metadata_add_with_scope(d,x,y,z,a,b)
#define __itt_metadata_add_with_scope_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_metadata_add_with_scope_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @ingroup parameters
 * @brief Add string metadata to an instance of a named entity.
* @param[in] domain The domain controlling the call
 * @param[in] scope The scope of the instance to which the metadata is to be added
 * @param[in] id The identifier of the instance to which the metadata is to be added, or __itt_null to add to the current task
 * @param[in] key The name of the metadata
 * @param[in] data The metadata itself
 * @param[in] length The number of characters in the string, or -1 if the length is unknown but the string is null-terminated
 */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
void ITTAPI __itt_metadata_str_add_with_scopeA(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length);
void ITTAPI __itt_metadata_str_add_with_scopeW(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const wchar_t *data, size_t length);
/* The unsuffixed name maps to the W variant when UNICODE/_UNICODE is defined, otherwise to the A variant. */
#if defined(UNICODE) || defined(_UNICODE)
# define __itt_metadata_str_add_with_scope __itt_metadata_str_add_with_scopeW
# define __itt_metadata_str_add_with_scope_ptr __itt_metadata_str_add_with_scopeW_ptr
#else /* UNICODE */
# define __itt_metadata_str_add_with_scope __itt_metadata_str_add_with_scopeA
# define __itt_metadata_str_add_with_scope_ptr __itt_metadata_str_add_with_scopeA_ptr
#endif /* UNICODE */
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
void ITTAPI __itt_metadata_str_add_with_scope(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length);
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
#if ITT_PLATFORM==ITT_PLATFORM_WIN
ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeA, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length))
ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeW, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const wchar_t *data, size_t length))
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
ITT_STUBV(ITTAPI, void, metadata_str_add_with_scope, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
/* Macro arguments map positionally: d = domain, x = scope, y = key, z = data, a = length. */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_metadata_str_add_with_scopeA(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add_with_scopeA,d,x,y,z,a)
#define __itt_metadata_str_add_with_scopeA_ptr ITTNOTIFY_NAME(metadata_str_add_with_scopeA)
#define __itt_metadata_str_add_with_scopeW(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add_with_scopeW,d,x,y,z,a)
#define __itt_metadata_str_add_with_scopeW_ptr ITTNOTIFY_NAME(metadata_str_add_with_scopeW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_metadata_str_add_with_scope(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add_with_scope,d,x,y,z,a)
#define __itt_metadata_str_add_with_scope_ptr ITTNOTIFY_NAME(metadata_str_add_with_scope)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#else /* INTEL_NO_ITTNOTIFY_API */
/* INTEL_NO_ITTNOTIFY_API defined: the calls expand to nothing and the *_ptr names are 0. */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_metadata_str_add_with_scopeA(d,x,y,z,a)
#define __itt_metadata_str_add_with_scopeA_ptr 0
#define __itt_metadata_str_add_with_scopeW(d,x,y,z,a)
#define __itt_metadata_str_add_with_scopeW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_metadata_str_add_with_scope(d,x,y,z,a)
#define __itt_metadata_str_add_with_scope_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
/* INTEL_NO_MACRO_BODY defined: only the *_ptr names are defined here (as 0). */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_metadata_str_add_with_scopeA_ptr 0
#define __itt_metadata_str_add_with_scopeW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_metadata_str_add_with_scope_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/** @} metadata group */

/**
 * @defgroup relations Relations
 * Instances of named entities can be explicitly associated with other
 * instances using instance IDs and the relationship API calls.
*
 * @{
 */
/**
 * @ingroup relations
 * @brief The kind of relation between two instances is specified by the enumerated type __itt_relation.
 * Relations between instances can be added with an API call. The relation
 * API uses instance IDs. Relations can be added before or after the actual
 * instances are created and persist independently of the instances. This
 * is the motivation for having different lifetimes for instance IDs and
 * the actual instances.
 */
/* Only the first enumerator has an explicit value; the rest are numbered
 * implicitly in declaration order, so reordering them changes their values. */
typedef enum
{
    __itt_relation_is_unknown = 0, /**< Relation kind is unknown (explicitly 0) */
    __itt_relation_is_dependent_on, /**< "A is dependent on B" means that A cannot start until B completes */
    __itt_relation_is_sibling_of, /**< "A is sibling of B" means that A and B were created as a group */
    __itt_relation_is_parent_of, /**< "A is parent of B" means that A created B */
    __itt_relation_is_continuation_of, /**< "A is continuation of B" means that A assumes the dependencies of B */
    __itt_relation_is_child_of, /**< "A is child of B" means that A was created by B (inverse of is_parent_of) */
    __itt_relation_is_continued_by, /**< "A is continued by B" means that B assumes the dependencies of A (inverse of is_continuation_of) */
    __itt_relation_is_predecessor_to /**< "A is predecessor to B" means that B cannot start until A completes (inverse of is_dependent_on) */
} __itt_relation;

/**
 * @ingroup relations
 * @brief Add a relation to the current task instance.
 * The current task instance is the head of the relation.
 * @param[in] domain The domain controlling this call
 * @param[in] relation The kind of relation
 * @param[in] tail The ID for the tail of the relation
 */
void ITTAPI __itt_relation_add_to_current(const __itt_domain *domain, __itt_relation relation, __itt_id tail);

/**
 * @ingroup relations
 * @brief Add a relation between two instance identifiers.
* @param[in] domain The domain controlling this call
 * @param[in] head The ID for the head of the relation
 * @param[in] relation The kind of relation
 * @param[in] tail The ID for the tail of the relation
 */
void ITTAPI __itt_relation_add(const __itt_domain *domain, __itt_id head, __itt_relation relation, __itt_id tail);

/** @cond exclude_from_documentation */
/* Macro layer for both relation calls: with INTEL_NO_MACRO_BODY only the
 * *_ptr names are defined (as 0); with INTEL_NO_ITTNOTIFY_API the call macros
 * expand to nothing; otherwise they forward to ITTNOTIFY_VOID_D2/D3. */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, relation_add_to_current, (const __itt_domain *domain, __itt_relation relation, __itt_id tail))
ITT_STUBV(ITTAPI, void, relation_add, (const __itt_domain *domain, __itt_id head, __itt_relation relation, __itt_id tail))
#define __itt_relation_add_to_current(d,x,y) ITTNOTIFY_VOID_D2(relation_add_to_current,d,x,y)
#define __itt_relation_add_to_current_ptr ITTNOTIFY_NAME(relation_add_to_current)
#define __itt_relation_add(d,x,y,z) ITTNOTIFY_VOID_D3(relation_add,d,x,y,z)
#define __itt_relation_add_ptr ITTNOTIFY_NAME(relation_add)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_relation_add_to_current(d,x,y)
#define __itt_relation_add_to_current_ptr 0
#define __itt_relation_add(d,x,y,z)
#define __itt_relation_add_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_relation_add_to_current_ptr 0
#define __itt_relation_add_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/** @} relations group */

/** @cond exclude_from_documentation */
/* Structure packing is pinned to 8 bytes for the clock-domain records below
 * and restored after each one. */
#pragma pack(push, 8)

typedef struct ___itt_clock_info
{
    unsigned long long clock_freq; /*!< Clock domain frequency */
    unsigned long long clock_base; /*!< Clock domain base timestamp */
} __itt_clock_info;

#pragma pack(pop)
/** @endcond */

/** @cond exclude_from_documentation */
/* Signature of the clock-info callback passed to __itt_clock_domain_create();
 * `data` is presumably the fn_data registered with it -- confirm. */
typedef void (ITTAPI *__itt_get_clock_info_fn)(__itt_clock_info* clock_info, void* data);
/** @endcond */

/** @cond exclude_from_documentation */
#pragma pack(push, 8)

typedef struct ___itt_clock_domain
{
    __itt_clock_info info; /*!< Most recent clock domain info */
    __itt_get_clock_info_fn fn; /*!< Callback function pointer */
    void* fn_data; /*!< Input argument for the callback function */
    int extra1; /*!< Reserved. Must be zero */
    void* extra2; /*!< Reserved. Must be zero */
    struct ___itt_clock_domain* next; /* link to another clock domain record; presumably a singly linked list -- confirm */
} __itt_clock_domain;

#pragma pack(pop)
/** @endcond */

/**
 * @ingroup clockdomains
 * @brief Create a clock domain.
 * Certain applications require the capability to trace their application using
 * a clock domain different than the CPU, for instance the instrumentation of events
 * that occur on a GPU.
 * Because the set of domains is expected to be static over the application's execution time,
 * there is no mechanism to destroy a domain.
 * Any domain can be accessed by any thread in the process, regardless of which thread created
 * the domain. This call is thread-safe.
 * @param[in] fn A pointer to a callback function which retrieves alternative CPU timestamps
 * @param[in] fn_data Argument for a callback function; may be NULL
 */
__itt_clock_domain* ITTAPI __itt_clock_domain_create(__itt_get_clock_info_fn fn, void* fn_data);

/** @cond exclude_from_documentation */
/* With INTEL_NO_ITTNOTIFY_API the call macro evaluates to a null
 * __itt_clock_domain*, so code that stores the result still compiles. */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUB(ITTAPI, __itt_clock_domain*, clock_domain_create, (__itt_get_clock_info_fn fn, void* fn_data))
#define __itt_clock_domain_create ITTNOTIFY_DATA(clock_domain_create)
#define __itt_clock_domain_create_ptr ITTNOTIFY_NAME(clock_domain_create)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_clock_domain_create(fn,fn_data) (__itt_clock_domain*)0
#define __itt_clock_domain_create_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_clock_domain_create_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @ingroup clockdomains
 * @brief Recalculate clock domains frequencies and clock base timestamps.
*/
void ITTAPI __itt_clock_domain_reset(void);

/** @cond exclude_from_documentation */
/* Macro layer: with INTEL_NO_MACRO_BODY only the *_ptr name is defined (as 0);
 * with INTEL_NO_ITTNOTIFY_API the call macro expands to nothing; otherwise it
 * forwards to ITTNOTIFY_VOID. */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, clock_domain_reset, (void))
#define __itt_clock_domain_reset ITTNOTIFY_VOID(clock_domain_reset)
#define __itt_clock_domain_reset_ptr ITTNOTIFY_NAME(clock_domain_reset)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_clock_domain_reset()
#define __itt_clock_domain_reset_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_clock_domain_reset_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @ingroup clockdomain
 * @brief Create an instance of identifier. This establishes the beginning of the lifetime of
 * an instance of the given ID in the trace. Once this lifetime starts, the ID can be used to
 * tag named entity instances in calls such as __itt_task_begin, and to specify relationships among
 * identified named entity instances, using the \ref relations APIs.
 * @param[in] domain The domain controlling the execution of this call.
 * @param[in] clock_domain The clock domain controlling the execution of this call.
 * @param[in] timestamp The user defined timestamp.
 * @param[in] id The ID to create.
 */
void ITTAPI __itt_id_create_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id);

/**
 * @ingroup clockdomain
 * @brief Destroy an instance of identifier. This ends the lifetime of the current instance of the
 * given ID value in the trace. Any relationships that are established after this lifetime ends are
 * invalid. This call must be performed before the given ID value can be reused for a different
 * named entity instance.
 * @param[in] domain The domain controlling the execution of this call.
 * @param[in] clock_domain The clock domain controlling the execution of this call.
 * @param[in] timestamp The user defined timestamp.
 * @param[in] id The ID to destroy.
*/
void ITTAPI __itt_id_destroy_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id);

/** @cond exclude_from_documentation */
/* Macro layer for __itt_id_create_ex / __itt_id_destroy_ex: with
 * INTEL_NO_MACRO_BODY only the *_ptr names are defined (as 0); with
 * INTEL_NO_ITTNOTIFY_API the call macros expand to nothing; otherwise they
 * forward to ITTNOTIFY_VOID_D3. */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, id_create_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id))
ITT_STUBV(ITTAPI, void, id_destroy_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id))
#define __itt_id_create_ex(d,x,y,z) ITTNOTIFY_VOID_D3(id_create_ex,d,x,y,z)
#define __itt_id_create_ex_ptr ITTNOTIFY_NAME(id_create_ex)
#define __itt_id_destroy_ex(d,x,y,z) ITTNOTIFY_VOID_D3(id_destroy_ex,d,x,y,z)
#define __itt_id_destroy_ex_ptr ITTNOTIFY_NAME(id_destroy_ex)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_id_create_ex(domain,clock_domain,timestamp,id)
#define __itt_id_create_ex_ptr 0
#define __itt_id_destroy_ex(domain,clock_domain,timestamp,id)
#define __itt_id_destroy_ex_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_id_create_ex_ptr 0
#define __itt_id_destroy_ex_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @ingroup clockdomain
 * @brief Begin a task instance.
 * @param[in] domain The domain for this task
 * @param[in] clock_domain The clock domain controlling the execution of this call.
 * @param[in] timestamp The user defined timestamp.
 * @param[in] taskid The instance ID for this task instance, or __itt_null
 * @param[in] parentid The parent instance to which this task instance belongs, or __itt_null
 * @param[in] name The name of this task
 */
void ITTAPI __itt_task_begin_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, __itt_string_handle* name);

/**
 * @ingroup clockdomain
 * @brief Begin a task instance.
* @param[in] domain The domain for this task * @param[in] clock_domain The clock domain controlling the execution of this call. * @param[in] timestamp The user defined timestamp. * @param[in] taskid The identifier for this task instance, or __itt_null * @param[in] parentid The parent of this task, or __itt_null * @param[in] fn The pointer to the function you are tracing */ void ITTAPI __itt_task_begin_fn_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, void* fn); /** * @ingroup clockdomain * @brief End the current task instance. * @param[in] domain The domain for this task * @param[in] clock_domain The clock domain controlling the execution of this call. * @param[in] timestamp The user defined timestamp. */ void ITTAPI __itt_task_end_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, task_begin_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, __itt_string_handle *name)) ITT_STUBV(ITTAPI, void, task_begin_fn_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, void* fn)) ITT_STUBV(ITTAPI, void, task_end_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp)) #define __itt_task_begin_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(task_begin_ex,d,x,y,z,a,b) #define __itt_task_begin_ex_ptr ITTNOTIFY_NAME(task_begin_ex) #define __itt_task_begin_fn_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(task_begin_fn_ex,d,x,y,z,a,b) #define __itt_task_begin_fn_ex_ptr ITTNOTIFY_NAME(task_begin_fn_ex) #define __itt_task_end_ex(d,x,y) ITTNOTIFY_VOID_D2(task_end_ex,d,x,y) #define __itt_task_end_ex_ptr ITTNOTIFY_NAME(task_end_ex) #else /* INTEL_NO_ITTNOTIFY_API */ 
#define __itt_task_begin_ex(domain,clock_domain,timestamp,id,parentid,name) #define __itt_task_begin_ex_ptr 0 #define __itt_task_begin_fn_ex(domain,clock_domain,timestamp,id,parentid,fn) #define __itt_task_begin_fn_ex_ptr 0 #define __itt_task_end_ex(domain,clock_domain,timestamp) #define __itt_task_end_ex_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_task_begin_ex_ptr 0 #define __itt_task_begin_fn_ex_ptr 0 #define __itt_task_end_ex_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @defgroup counters Counters * @ingroup public * Counters are user-defined objects with a monotonically increasing * value. Counter values are 64-bit unsigned integers. * Counters have names that can be displayed in * the tools. * @{ */ /** * @brief opaque structure for counter identification */ /** @cond exclude_from_documentation */ typedef struct ___itt_counter* __itt_counter; /** * @brief Create an unsigned 64 bits integer counter with given name/domain * * After __itt_counter_create() is called, __itt_counter_inc(id), __itt_counter_inc_delta(id, delta), * __itt_counter_set_value(id, value_ptr) or __itt_counter_set_value_ex(id, clock_domain, timestamp, value_ptr) * can be used to change the value of the counter, where value_ptr is a pointer to an unsigned 64 bits integer * * The call is equal to __itt_counter_create_typed(name, domain, __itt_metadata_u64) */ #if ITT_PLATFORM==ITT_PLATFORM_WIN __itt_counter ITTAPI __itt_counter_createA(const char *name, const char *domain); __itt_counter ITTAPI __itt_counter_createW(const wchar_t *name, const wchar_t *domain); #if defined(UNICODE) || defined(_UNICODE) # define __itt_counter_create __itt_counter_createW # define __itt_counter_create_ptr __itt_counter_createW_ptr #else /* UNICODE */ # define __itt_counter_create __itt_counter_createA # define __itt_counter_create_ptr __itt_counter_createA_ptr #endif /* UNICODE */ #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ __itt_counter ITTAPI 
__itt_counter_create(const char *name, const char *domain); #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(ITTAPI, __itt_counter, counter_createA, (const char *name, const char *domain)) ITT_STUB(ITTAPI, __itt_counter, counter_createW, (const wchar_t *name, const wchar_t *domain)) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUB(ITTAPI, __itt_counter, counter_create, (const char *name, const char *domain)) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_counter_createA ITTNOTIFY_DATA(counter_createA) #define __itt_counter_createA_ptr ITTNOTIFY_NAME(counter_createA) #define __itt_counter_createW ITTNOTIFY_DATA(counter_createW) #define __itt_counter_createW_ptr ITTNOTIFY_NAME(counter_createW) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_counter_create ITTNOTIFY_DATA(counter_create) #define __itt_counter_create_ptr ITTNOTIFY_NAME(counter_create) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #else /* INTEL_NO_ITTNOTIFY_API */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_counter_createA(name, domain) #define __itt_counter_createA_ptr 0 #define __itt_counter_createW(name, domain) #define __itt_counter_createW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_counter_create(name, domain) #define __itt_counter_create_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_counter_createA_ptr 0 #define __itt_counter_createW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_counter_create_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief Increment the unsigned 64 bits integer counter value * * Calling this function to non-unsigned 64 bits integer counters has no effect */ void ITTAPI __itt_counter_inc(__itt_counter id); 
/** @cond exclude_from_documentation */
/* Macro layer for __itt_counter_inc: with INTEL_NO_MACRO_BODY only the *_ptr
 * name is defined (as 0); with INTEL_NO_ITTNOTIFY_API the call macro expands
 * to nothing; otherwise it forwards to ITTNOTIFY_VOID. The two blocks that
 * follow use the same layout for __itt_counter_inc_delta and __itt_counter_dec. */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, counter_inc, (__itt_counter id))
#define __itt_counter_inc ITTNOTIFY_VOID(counter_inc)
#define __itt_counter_inc_ptr ITTNOTIFY_NAME(counter_inc)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_counter_inc(id)
#define __itt_counter_inc_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_counter_inc_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @brief Increment the unsigned 64 bits integer counter value by @p value
 *
 * Calling this function to non-unsigned 64 bits integer counters has no effect
 */
void ITTAPI __itt_counter_inc_delta(__itt_counter id, unsigned long long value);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, counter_inc_delta, (__itt_counter id, unsigned long long value))
#define __itt_counter_inc_delta ITTNOTIFY_VOID(counter_inc_delta)
#define __itt_counter_inc_delta_ptr ITTNOTIFY_NAME(counter_inc_delta)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_counter_inc_delta(id, value)
#define __itt_counter_inc_delta_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_counter_inc_delta_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @brief Decrement the unsigned 64 bits integer counter value
 *
 * Calling this function to non-unsigned 64 bits integer counters has no effect
 */
void ITTAPI __itt_counter_dec(__itt_counter id);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, counter_dec, (__itt_counter id))
#define __itt_counter_dec ITTNOTIFY_VOID(counter_dec)
#define __itt_counter_dec_ptr ITTNOTIFY_NAME(counter_dec)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_counter_dec(id)
#define __itt_counter_dec_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_counter_dec_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @brief Decrement the unsigned 64 bits integer counter
value with x
 *
 * Calling this function to non-unsigned 64 bits integer counters has no effect
 */
void ITTAPI __itt_counter_dec_delta(__itt_counter id, unsigned long long value);

/** @cond exclude_from_documentation */
/* Macro layer: with INTEL_NO_MACRO_BODY only the *_ptr name is defined (as 0);
 * with INTEL_NO_ITTNOTIFY_API the call macro expands to nothing; otherwise it
 * forwards to ITTNOTIFY_VOID. */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, counter_dec_delta, (__itt_counter id, unsigned long long value))
#define __itt_counter_dec_delta ITTNOTIFY_VOID(counter_dec_delta)
#define __itt_counter_dec_delta_ptr ITTNOTIFY_NAME(counter_dec_delta)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_counter_dec_delta(id, value)
#define __itt_counter_dec_delta_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_counter_dec_delta_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @ingroup counters
 * @brief Increment a counter by one.
 * The first call with a given name creates a counter by that name and sets its
 * value to zero. Successive calls increment the counter value.
 * @param[in] domain The domain controlling the call. Counter names are not domain specific.
 * The domain argument is used only to enable or disable the API calls.
 * @param[in] name The name of the counter
 */
void ITTAPI __itt_counter_inc_v3(const __itt_domain *domain, __itt_string_handle *name);

/**
 * @ingroup counters
 * @brief Increment a counter by the value specified in delta.
 * @param[in] domain The domain controlling the call. Counter names are not domain specific.
 * The domain argument is used only to enable or disable the API calls.
* @param[in] name The name of the counter
 * @param[in] delta The amount by which to increment the counter
 */
void ITTAPI __itt_counter_inc_delta_v3(const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta);

/** @cond exclude_from_documentation */
/* Macro layer for the two v3 increment calls: with INTEL_NO_MACRO_BODY only
 * the *_ptr names are defined (as 0); with INTEL_NO_ITTNOTIFY_API the call
 * macros expand to nothing; otherwise they forward to ITTNOTIFY_VOID_D1/D2. */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, counter_inc_v3, (const __itt_domain *domain, __itt_string_handle *name))
ITT_STUBV(ITTAPI, void, counter_inc_delta_v3, (const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta))
#define __itt_counter_inc_v3(d,x) ITTNOTIFY_VOID_D1(counter_inc_v3,d,x)
#define __itt_counter_inc_v3_ptr ITTNOTIFY_NAME(counter_inc_v3)
#define __itt_counter_inc_delta_v3(d,x,y) ITTNOTIFY_VOID_D2(counter_inc_delta_v3,d,x,y)
#define __itt_counter_inc_delta_v3_ptr ITTNOTIFY_NAME(counter_inc_delta_v3)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_counter_inc_v3(domain,name)
#define __itt_counter_inc_v3_ptr 0
#define __itt_counter_inc_delta_v3(domain,name,delta)
#define __itt_counter_inc_delta_v3_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_counter_inc_v3_ptr 0
#define __itt_counter_inc_delta_v3_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @ingroup counters
 * @brief Decrement a counter by one.
 * The first call with a given name creates a counter by that name and sets its
 * value to zero. Successive calls decrement the counter value.
 * @param[in] domain The domain controlling the call. Counter names are not domain specific.
 * The domain argument is used only to enable or disable the API calls.
 * @param[in] name The name of the counter
 */
void ITTAPI __itt_counter_dec_v3(const __itt_domain *domain, __itt_string_handle *name);

/**
 * @ingroup counters
 * @brief Decrement a counter by the value specified in delta.
 * @param[in] domain The domain controlling the call. Counter names are not domain specific.
 * The domain argument is used only to enable or disable the API calls.
* @param[in] name The name of the counter * @param[in] delta The amount by which to decrement the counter */ void ITTAPI __itt_counter_dec_delta_v3(const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta); #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, counter_dec_v3, (const __itt_domain *domain, __itt_string_handle *name)) ITT_STUBV(ITTAPI, void, counter_dec_delta_v3, (const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta)) #define __itt_counter_dec_v3(d,x) ITTNOTIFY_VOID_D1(counter_dec_v3,d,x) #define __itt_counter_dec_v3_ptr ITTNOTIFY_NAME(counter_dec_v3) #define __itt_counter_dec_delta_v3(d,x,y) ITTNOTIFY_VOID_D2(counter_dec_delta_v3,d,x,y) #define __itt_counter_dec_delta_v3_ptr ITTNOTIFY_NAME(counter_dec_delta_v3) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_counter_dec_v3(domain,name) #define __itt_counter_dec_v3_ptr 0 #define __itt_counter_dec_delta_v3(domain,name,delta) #define __itt_counter_dec_delta_v3_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_counter_dec_v3_ptr 0 #define __itt_counter_dec_delta_v3_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @} counters group */ /** * @brief Set the counter value */ void ITTAPI __itt_counter_set_value(__itt_counter id, void *value_ptr); #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, counter_set_value, (__itt_counter id, void *value_ptr)) #define __itt_counter_set_value ITTNOTIFY_VOID(counter_set_value) #define __itt_counter_set_value_ptr ITTNOTIFY_NAME(counter_set_value) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_counter_set_value(id, value_ptr) #define __itt_counter_set_value_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_counter_set_value_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief Set the counter value */ void ITTAPI __itt_counter_set_value_ex(__itt_counter id, 
__itt_clock_domain *clock_domain, unsigned long long timestamp, void *value_ptr); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, counter_set_value_ex, (__itt_counter id, __itt_clock_domain *clock_domain, unsigned long long timestamp, void *value_ptr)) #define __itt_counter_set_value_ex ITTNOTIFY_VOID(counter_set_value_ex) #define __itt_counter_set_value_ex_ptr ITTNOTIFY_NAME(counter_set_value_ex) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_counter_set_value_ex(id, clock_domain, timestamp, value_ptr) #define __itt_counter_set_value_ex_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_counter_set_value_ex_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief Create a typed counter with given name/domain * * After __itt_counter_create_typed() is called, __itt_counter_inc(id), __itt_counter_inc_delta(id, delta), * __itt_counter_set_value(id, value_ptr) or __itt_counter_set_value_ex(id, clock_domain, timestamp, value_ptr) * can be used to change the value of the counter */ #if ITT_PLATFORM==ITT_PLATFORM_WIN __itt_counter ITTAPI __itt_counter_create_typedA(const char *name, const char *domain, __itt_metadata_type type); __itt_counter ITTAPI __itt_counter_create_typedW(const wchar_t *name, const wchar_t *domain, __itt_metadata_type type); #if defined(UNICODE) || defined(_UNICODE) # define __itt_counter_create_typed __itt_counter_create_typedW # define __itt_counter_create_typed_ptr __itt_counter_create_typedW_ptr #else /* UNICODE */ # define __itt_counter_create_typed __itt_counter_create_typedA # define __itt_counter_create_typed_ptr __itt_counter_create_typedA_ptr #endif /* UNICODE */ #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ __itt_counter ITTAPI __itt_counter_create_typed(const char *name, const char *domain, __itt_metadata_type type); #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API 
#if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(ITTAPI, __itt_counter, counter_create_typedA, (const char *name, const char *domain, __itt_metadata_type type)) ITT_STUB(ITTAPI, __itt_counter, counter_create_typedW, (const wchar_t *name, const wchar_t *domain, __itt_metadata_type type)) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUB(ITTAPI, __itt_counter, counter_create_typed, (const char *name, const char *domain, __itt_metadata_type type)) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_counter_create_typedA ITTNOTIFY_DATA(counter_create_typedA) #define __itt_counter_create_typedA_ptr ITTNOTIFY_NAME(counter_create_typedA) #define __itt_counter_create_typedW ITTNOTIFY_DATA(counter_create_typedW) #define __itt_counter_create_typedW_ptr ITTNOTIFY_NAME(counter_create_typedW) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_counter_create_typed ITTNOTIFY_DATA(counter_create_typed) #define __itt_counter_create_typed_ptr ITTNOTIFY_NAME(counter_create_typed) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #else /* INTEL_NO_ITTNOTIFY_API */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_counter_create_typedA(name, domain, type) #define __itt_counter_create_typedA_ptr 0 #define __itt_counter_create_typedW(name, domain, type) #define __itt_counter_create_typedW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_counter_create_typed(name, domain, type) #define __itt_counter_create_typed_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_counter_create_typedA_ptr 0 #define __itt_counter_create_typedW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_counter_create_typed_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief Destroy the counter identified by the pointer previously returned by __itt_counter_create() or * 
__itt_counter_create_typed()
 */
void ITTAPI __itt_counter_destroy(__itt_counter id);

/** @cond exclude_from_documentation */
/* Macro layer: with INTEL_NO_MACRO_BODY only the *_ptr name is defined (as 0);
 * with INTEL_NO_ITTNOTIFY_API the call macro expands to nothing; otherwise it
 * forwards to ITTNOTIFY_VOID. */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, counter_destroy, (__itt_counter id))
#define __itt_counter_destroy ITTNOTIFY_VOID(counter_destroy)
#define __itt_counter_destroy_ptr ITTNOTIFY_NAME(counter_destroy)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_counter_destroy(id)
#define __itt_counter_destroy_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_counter_destroy_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */
/** @} counters group */

/**
 * @ingroup markers
 * @brief Create a marker instance.
 * @param[in] domain The domain for this marker
 * @param[in] clock_domain The clock domain controlling the execution of this call.
 * @param[in] timestamp The user defined timestamp.
 * @param[in] id The instance ID for this marker, or __itt_null
 * @param[in] name The name for this marker
 * @param[in] scope The scope for this marker
 */
void ITTAPI __itt_marker_ex(const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_string_handle *name, __itt_scope scope);

/** @cond exclude_from_documentation */
/* Same three-way layout as above; the enabled form forwards to
 * ITTNOTIFY_VOID_D5. */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, marker_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_string_handle *name, __itt_scope scope))
#define __itt_marker_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(marker_ex,d,x,y,z,a,b)
#define __itt_marker_ex_ptr ITTNOTIFY_NAME(marker_ex)
#else /* INTEL_NO_ITTNOTIFY_API */
#define __itt_marker_ex(domain,clock_domain,timestamp,id,name,scope)
#define __itt_marker_ex_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_marker_ex_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @ingroup clockdomain
 * @brief Add a relation to the current task instance.
* The current task instance is the head of the relation. * @param[in] domain The domain controlling this call * @param[in] clock_domain The clock domain controlling the execution of this call. * @param[in] timestamp The user defined timestamp. * @param[in] relation The kind of relation * @param[in] tail The ID for the tail of the relation */ void ITTAPI __itt_relation_add_to_current_ex(const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_relation relation, __itt_id tail); /** * @ingroup clockdomain * @brief Add a relation between two instance identifiers. * @param[in] domain The domain controlling this call * @param[in] clock_domain The clock domain controlling the execution of this call. * @param[in] timestamp The user defined timestamp. * @param[in] head The ID for the head of the relation * @param[in] relation The kind of relation * @param[in] tail The ID for the tail of the relation */ void ITTAPI __itt_relation_add_ex(const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id head, __itt_relation relation, __itt_id tail); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, relation_add_to_current_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_relation relation, __itt_id tail)) ITT_STUBV(ITTAPI, void, relation_add_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id head, __itt_relation relation, __itt_id tail)) #define __itt_relation_add_to_current_ex(d,x,y,z,a) ITTNOTIFY_VOID_D4(relation_add_to_current_ex,d,x,y,z,a) #define __itt_relation_add_to_current_ex_ptr ITTNOTIFY_NAME(relation_add_to_current_ex) #define __itt_relation_add_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(relation_add_ex,d,x,y,z,a,b) #define __itt_relation_add_ex_ptr ITTNOTIFY_NAME(relation_add_ex) #else /* INTEL_NO_ITTNOTIFY_API */ #define 
__itt_relation_add_to_current_ex(domain,clock_domain,timestame,relation,tail) #define __itt_relation_add_to_current_ex_ptr 0 #define __itt_relation_add_ex(domain,clock_domain,timestamp,head,relation,tail) #define __itt_relation_add_ex_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_relation_add_to_current_ex_ptr 0 #define __itt_relation_add_ex_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @cond exclude_from_documentation */ typedef enum ___itt_track_group_type { __itt_track_group_type_normal = 0 } __itt_track_group_type; /** @endcond */ /** @cond exclude_from_documentation */ #pragma pack(push, 8) typedef struct ___itt_track_group { __itt_string_handle* name; /*!< Name of the track group */ struct ___itt_track* track; /*!< List of child tracks */ __itt_track_group_type tgtype; /*!< Type of the track group */ int extra1; /*!< Reserved. Must be zero */ void* extra2; /*!< Reserved. Must be zero */ struct ___itt_track_group* next; } __itt_track_group; #pragma pack(pop) /** @endcond */ /** * @brief Placeholder for custom track types. Currently, "normal" custom track * is the only available track type. */ typedef enum ___itt_track_type { __itt_track_type_normal = 0 #ifdef INTEL_ITTNOTIFY_API_PRIVATE , __itt_track_type_queue #endif /* INTEL_ITTNOTIFY_API_PRIVATE */ } __itt_track_type; /** @cond exclude_from_documentation */ #pragma pack(push, 8) typedef struct ___itt_track { __itt_string_handle* name; /*!< Name of the track group */ __itt_track_group* group; /*!< Parent group to a track */ __itt_track_type ttype; /*!< Type of the track */ int extra1; /*!< Reserved. Must be zero */ void* extra2; /*!< Reserved. Must be zero */ struct ___itt_track* next; } __itt_track; #pragma pack(pop) /** @endcond */ /** * @brief Create logical track group. 
*/ __itt_track_group* ITTAPI __itt_track_group_create(__itt_string_handle* name, __itt_track_group_type track_group_type); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUB(ITTAPI, __itt_track_group*, track_group_create, (__itt_string_handle* name, __itt_track_group_type track_group_type)) #define __itt_track_group_create ITTNOTIFY_DATA(track_group_create) #define __itt_track_group_create_ptr ITTNOTIFY_NAME(track_group_create) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_track_group_create(name) (__itt_track_group*)0 #define __itt_track_group_create_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_track_group_create_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief Create logical track. */ __itt_track* ITTAPI __itt_track_create(__itt_track_group* track_group, __itt_string_handle* name, __itt_track_type track_type); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUB(ITTAPI, __itt_track*, track_create, (__itt_track_group* track_group,__itt_string_handle* name, __itt_track_type track_type)) #define __itt_track_create ITTNOTIFY_DATA(track_create) #define __itt_track_create_ptr ITTNOTIFY_NAME(track_create) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_track_create(track_group,name,track_type) (__itt_track*)0 #define __itt_track_create_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_track_create_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief Set the logical track. 
*/
void ITTAPI __itt_set_track(__itt_track* track);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, set_track, (__itt_track *track))
#define __itt_set_track ITTNOTIFY_VOID(set_track)
#define __itt_set_track_ptr ITTNOTIFY_NAME(set_track)
#else /* INTEL_NO_ITTNOTIFY_API */
/* API compiled out: the call expands to nothing and the *_ptr name is 0. */
#define __itt_set_track(track)
#define __itt_set_track_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
/* Macro bodies suppressed: only the *_ptr name is provided, as 0. */
#define __itt_set_track_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/* ========================================================================== */
/** @cond exclude_from_gpa_documentation */
/**
 * @defgroup events Events
 * @ingroup public
 * Events group
 * @{
 */
/** @brief user event type */
typedef int __itt_event;

/**
 * @brief Create an event notification
 * @note Creation fails if name or namelen is null, if name and namelen do not
 * match, or if the user event feature is not enabled.
 * @return non-zero event identifier upon success and __itt_err otherwise
 */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
/* Windows: separate narrow (A) and wide (W) entry points; the unsuffixed
 * name selects one of them depending on UNICODE / _UNICODE. */
__itt_event LIBITTAPI __itt_event_createA(const char *name, int namelen);
__itt_event LIBITTAPI __itt_event_createW(const wchar_t *name, int namelen);
#if defined(UNICODE) || defined(_UNICODE)
# define __itt_event_create __itt_event_createW
# define __itt_event_create_ptr __itt_event_createW_ptr
#else
# define __itt_event_create __itt_event_createA
# define __itt_event_create_ptr __itt_event_createA_ptr
#endif /* UNICODE */
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
__itt_event LIBITTAPI __itt_event_create(const char *name, int namelen);
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
#if ITT_PLATFORM==ITT_PLATFORM_WIN
ITT_STUB(LIBITTAPI, __itt_event, event_createA, (const char *name, int namelen))
ITT_STUB(LIBITTAPI, __itt_event, event_createW, (const wchar_t *name, int namelen))
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
ITT_STUB(LIBITTAPI, __itt_event, event_create, (const char *name, int namelen))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_event_createA ITTNOTIFY_DATA(event_createA)
#define __itt_event_createA_ptr ITTNOTIFY_NAME(event_createA)
#define __itt_event_createW ITTNOTIFY_DATA(event_createW)
#define __itt_event_createW_ptr ITTNOTIFY_NAME(event_createW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_event_create ITTNOTIFY_DATA(event_create)
#define __itt_event_create_ptr ITTNOTIFY_NAME(event_create)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#else /* INTEL_NO_ITTNOTIFY_API */
/* API compiled out: creation evaluates to event id 0 and the *_ptr names are 0. */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_event_createA(name, namelen) (__itt_event)0
#define __itt_event_createA_ptr 0
#define __itt_event_createW(name, namelen) (__itt_event)0
#define __itt_event_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_event_create(name, namelen) (__itt_event)0
#define __itt_event_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
/* Macro bodies suppressed: only the *_ptr names are provided, as 0. */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_event_createA_ptr 0
#define __itt_event_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_event_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @brief Record an event occurrence.
* @return __itt_err upon failure (invalid event id/user event feature not enabled) */ int LIBITTAPI __itt_event_start(__itt_event event); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUB(LIBITTAPI, int, event_start, (__itt_event event)) #define __itt_event_start ITTNOTIFY_DATA(event_start) #define __itt_event_start_ptr ITTNOTIFY_NAME(event_start) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_event_start(event) (int)0 #define __itt_event_start_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_event_start_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief Record an event end occurrence. * @note It is optional if events do not have durations. * @return __itt_err upon failure (invalid event id/user event feature not enabled) */ int LIBITTAPI __itt_event_end(__itt_event event); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUB(LIBITTAPI, int, event_end, (__itt_event event)) #define __itt_event_end ITTNOTIFY_DATA(event_end) #define __itt_event_end_ptr ITTNOTIFY_NAME(event_end) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_event_end(event) (int)0 #define __itt_event_end_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_event_end_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @} events group */ /** * @defgroup arrays Arrays Visualizer * @ingroup public * Visualize arrays * @{ */ /** * @enum __itt_av_data_type * @brief Defines types of arrays data (for C/C++ intrinsic types) */ typedef enum { __itt_e_first = 0, __itt_e_char = 0, /* 1-byte integer */ __itt_e_uchar, /* 1-byte unsigned integer */ __itt_e_int16, /* 2-byte integer */ __itt_e_uint16, /* 2-byte unsigned integer */ __itt_e_int32, /* 4-byte integer */ __itt_e_uint32, /* 4-byte unsigned integer */ __itt_e_int64, /* 8-byte integer */ __itt_e_uint64, /* 8-byte unsigned integer */ __itt_e_float, 
/* 4-byte floating */ __itt_e_double, /* 8-byte floating */ __itt_e_last = __itt_e_double } __itt_av_data_type; /** * @brief Save an array data to a file. * Output format is defined by the file extension. The csv and bmp formats are supported (bmp - for 2-dimensional array only). * @param[in] data - pointer to the array data * @param[in] rank - the rank of the array * @param[in] dimensions - pointer to an array of integers, which specifies the array dimensions. * The size of dimensions must be equal to the rank * @param[in] type - the type of the array, specified as one of the __itt_av_data_type values (for intrinsic types) * @param[in] filePath - the file path; the output format is defined by the file extension * @param[in] columnOrder - defines how the array is stored in the linear memory. * It should be 1 for column-major order (e.g. in FORTRAN) or 0 - for row-major order (e.g. in C). */ #if ITT_PLATFORM==ITT_PLATFORM_WIN int ITTAPI __itt_av_saveA(void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder); int ITTAPI __itt_av_saveW(void *data, int rank, const int *dimensions, int type, const wchar_t *filePath, int columnOrder); #if defined(UNICODE) || defined(_UNICODE) # define __itt_av_save __itt_av_saveW # define __itt_av_save_ptr __itt_av_saveW_ptr #else /* UNICODE */ # define __itt_av_save __itt_av_saveA # define __itt_av_save_ptr __itt_av_saveA_ptr #endif /* UNICODE */ #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ int ITTAPI __itt_av_save(void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder); #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(ITTAPI, int, av_saveA, (void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder)) ITT_STUB(ITTAPI, int, av_saveW, (void *data, int rank, const int *dimensions, int type, const 
wchar_t *filePath, int columnOrder)) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUB(ITTAPI, int, av_save, (void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder)) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_av_saveA ITTNOTIFY_DATA(av_saveA) #define __itt_av_saveA_ptr ITTNOTIFY_NAME(av_saveA) #define __itt_av_saveW ITTNOTIFY_DATA(av_saveW) #define __itt_av_saveW_ptr ITTNOTIFY_NAME(av_saveW) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_av_save ITTNOTIFY_DATA(av_save) #define __itt_av_save_ptr ITTNOTIFY_NAME(av_save) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #else /* INTEL_NO_ITTNOTIFY_API */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_av_saveA(name) #define __itt_av_saveA_ptr 0 #define __itt_av_saveW(name) #define __itt_av_saveW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_av_save(name) #define __itt_av_save_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_av_saveA_ptr 0 #define __itt_av_saveW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_av_save_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ void ITTAPI __itt_enable_attach(void); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, enable_attach, (void)) #define __itt_enable_attach ITTNOTIFY_VOID(enable_attach) #define __itt_enable_attach_ptr ITTNOTIFY_NAME(enable_attach) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_enable_attach() #define __itt_enable_attach_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_enable_attach_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @cond exclude_from_gpa_documentation */ /** @} arrays group */ /** @endcond */ /** * @brief Module load notification 
* This API is used to report necessary information in case of bypassing default system loader.
 * Notification should be done immediately after this module is loaded to process memory.
 * @param[in] start_addr - module start address
 * @param[in] end_addr - module end address
 * @param[in] path - file system full path to the module
 */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
/* Windows: separate narrow (A) and wide (W) entry points; the unsuffixed
 * name selects one of them depending on UNICODE / _UNICODE. */
void ITTAPI __itt_module_loadA(void *start_addr, void *end_addr, const char *path);
void ITTAPI __itt_module_loadW(void *start_addr, void *end_addr, const wchar_t *path);
#if defined(UNICODE) || defined(_UNICODE)
# define __itt_module_load __itt_module_loadW
# define __itt_module_load_ptr __itt_module_loadW_ptr
#else /* UNICODE */
# define __itt_module_load __itt_module_loadA
# define __itt_module_load_ptr __itt_module_loadA_ptr
#endif /* UNICODE */
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
void ITTAPI __itt_module_load(void *start_addr, void *end_addr, const char *path);
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
/* NOTE(review): these void-returning stubs use ITT_STUB while the call macros
 * below use ITTNOTIFY_VOID and other void APIs here use ITT_STUBV -- confirm
 * that ITT_STUB and ITT_STUBV are equivalent in this header. */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
ITT_STUB(ITTAPI, void, module_loadA, (void *start_addr, void *end_addr, const char *path))
ITT_STUB(ITTAPI, void, module_loadW, (void *start_addr, void *end_addr, const wchar_t *path))
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
ITT_STUB(ITTAPI, void, module_load, (void *start_addr, void *end_addr, const char *path))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_module_loadA ITTNOTIFY_VOID(module_loadA)
#define __itt_module_loadA_ptr ITTNOTIFY_NAME(module_loadA)
#define __itt_module_loadW ITTNOTIFY_VOID(module_loadW)
#define __itt_module_loadW_ptr ITTNOTIFY_NAME(module_loadW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_module_load ITTNOTIFY_VOID(module_load)
#define __itt_module_load_ptr ITTNOTIFY_NAME(module_load)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#else /* INTEL_NO_ITTNOTIFY_API */
/* API compiled out: the calls expand to nothing and the *_ptr names are 0. */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_module_loadA(start_addr, end_addr, path)
#define __itt_module_loadA_ptr 0
#define __itt_module_loadW(start_addr, end_addr, path)
#define __itt_module_loadW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_module_load(start_addr, end_addr, path)
#define __itt_module_load_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
/* Macro bodies suppressed: only the *_ptr names are provided, as 0. */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_module_loadA_ptr 0
#define __itt_module_loadW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_module_load_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @brief Report module unload
 * This API is used to report necessary information in case of bypassing default system loader.
 * Notification should be done just before the module is unloaded from process memory.
 * @param[in] addr - base address of loaded module
 */
void ITTAPI __itt_module_unload(void *addr);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, module_unload, (void *addr))
#define __itt_module_unload ITTNOTIFY_VOID(module_unload)
#define __itt_module_unload_ptr ITTNOTIFY_NAME(module_unload)
#else /* INTEL_NO_ITTNOTIFY_API */
/* API compiled out: the call expands to nothing and the *_ptr name is 0. */
#define __itt_module_unload(addr)
#define __itt_module_unload_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_module_unload_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/** @cond exclude_from_documentation */
/** Binary format of a reported module. */
typedef enum
{
    __itt_module_type_unknown = 0,
    __itt_module_type_elf,
    __itt_module_type_coff
} __itt_module_type;
/** @endcond */

/** @cond exclude_from_documentation */
/** Content kind of a module section. */
typedef enum
{
    itt_section_type_unknown,
    itt_section_type_bss, /* notifies that the section contains uninitialized data. These are the relevant section types and the modules that contain them:
                           * ELF module: SHT_NOBITS section type
                           * COFF module: IMAGE_SCN_CNT_UNINITIALIZED_DATA section type
                           */
    itt_section_type_data, /* notifies that section contains initialized data. These are the relevant section types and the modules that contain them:
                            * ELF module: SHT_PROGBITS section type
                            * COFF module: IMAGE_SCN_CNT_INITIALIZED_DATA section type
                            */
    itt_section_type_text /* notifies that the section contains executable code. These are the relevant section types and the modules that contain them:
                           * ELF module: SHT_PROGBITS section type
                           * COFF module: IMAGE_SCN_CNT_CODE section type
                           */
} __itt_section_type;
/** @endcond */

/**
 * @hideinitializer
 * @brief bit-mask, detects a section attribute that indicates whether a section can be executed as code:
 * These are the relevant section attributes and the modules that contain them:
 * ELF module: PF_X section attribute
 * COFF module: IMAGE_SCN_MEM_EXECUTE attribute
 */
#define __itt_section_exec 0x20000000

/**
 * @hideinitializer
 * @brief bit-mask, detects a section attribute that indicates whether a section can be read.
 * These are the relevant section attributes and the modules that contain them:
 * ELF module: PF_R attribute
 * COFF module: IMAGE_SCN_MEM_READ attribute
 */
#define __itt_section_read 0x40000000

/**
 * @hideinitializer
 * @brief bit-mask, detects a section attribute that indicates whether a section can be written to.
* These are the relevant section attributes and the modules that contain them:
 * ELF module: PF_W attribute
 * COFF module: IMAGE_SCN_MEM_WRITE attribute
 */
#define __itt_section_write 0x80000000

/** @cond exclude_from_documentation */
#pragma pack(push, 8)

/** Description of one loaded section of a module (laid out with 8-byte packing). */
typedef struct ___itt_section_info
{
    const char* name;        /*!< Section name in UTF8 */
    __itt_section_type type; /*!< Section content and semantics description */
    size_t flags;            /*!< Section bit flags that describe attributes using bit mask
                              * Zero if disabled, non-zero if enabled
                              */
    void* start_addr;        /*!< Section load(relocated) start address */
    size_t size;             /*!< Section size */
    size_t file_offset;      /*!< Section file offset */
} __itt_section_info;

#pragma pack(pop)
/** @endcond */

/** @cond exclude_from_documentation */
#pragma pack(push, 8)

/** Description of a module and its loaded sections (laid out with 8-byte packing). */
typedef struct ___itt_module_object
{
    unsigned int version;           /*!< API version */
    __itt_id module_id;             /*!< Unique identifier. This is unchanged for sections that belong to the same module */
    __itt_module_type module_type;  /*!< Binary module format */
    const char* module_name;        /*!< Unique module name or path to module in UTF8
                                     * Contains module name when module_buffer and module_size exist
                                     * Contains module path when module_buffer and module_size are absent
                                     * module_name remains the same for a given module_id
                                     */
    void* module_buffer;            /*!< Module buffer content */
    size_t module_size;             /*!< Module buffer size */
                                    /*!< If module_buffer and module_size exist, the binary module is dumped onto the system.
                                     * If module_buffer and module_size do not exist,
                                     * the binary module exists on the system already.
                                     * The module_name parameter contains the path to the module.
                                     */
    __itt_section_info* section_array; /*!< Reference to section information */
    size_t section_number;          /*!< Presumably the number of entries in section_array -- TODO confirm */
} __itt_module_object;

#pragma pack(pop)
/** @endcond */

/**
 * @brief Load module content and its loaded(relocated) sections.
* This API is useful to save a module, or specify its location on the system and report information about loaded sections.
 * The target module is saved on the system if module buffer content and size are available.
 * If module buffer content and size are unavailable, the module name contains the path to the existing binary module.
 * @param[in] module_obj - provides module and section information, along with unique module identifiers (name,module ID)
 * which bind the binary module to particular sections.
 */
void ITTAPI __itt_module_load_with_sections(__itt_module_object* module_obj);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, module_load_with_sections, (__itt_module_object* module_obj))
#define __itt_module_load_with_sections ITTNOTIFY_VOID(module_load_with_sections)
#define __itt_module_load_with_sections_ptr ITTNOTIFY_NAME(module_load_with_sections)
#else /* INTEL_NO_ITTNOTIFY_API */
/* API compiled out: the call expands to nothing and the *_ptr name is 0. */
#define __itt_module_load_with_sections(module_obj)
#define __itt_module_load_with_sections_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
/* Macro bodies suppressed: only the *_ptr name is provided, as 0. */
#define __itt_module_load_with_sections_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @brief Unload a module and its loaded(relocated) sections.
 * This API notifies that the module and its sections were unloaded.
 * @param[in] module_obj - provides module and sections information, along with unique module identifiers (name,module ID)
 * which bind the binary module to particular sections.
*/
void ITTAPI __itt_module_unload_with_sections(__itt_module_object* module_obj);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, module_unload_with_sections, (__itt_module_object* module_obj))
#define __itt_module_unload_with_sections ITTNOTIFY_VOID(module_unload_with_sections)
#define __itt_module_unload_with_sections_ptr ITTNOTIFY_NAME(module_unload_with_sections)
#else /* INTEL_NO_ITTNOTIFY_API */
/* API compiled out: the call expands to nothing and the *_ptr name is 0. */
#define __itt_module_unload_with_sections(module_obj)
#define __itt_module_unload_with_sections_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
/* Macro bodies suppressed: only the *_ptr name is provided, as 0. */
#define __itt_module_unload_with_sections_ptr 0
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/** @cond exclude_from_documentation */
#pragma pack(push, 8)

/** Descriptor of a histogram instance (laid out with 8-byte packing). */
typedef struct ___itt_histogram
{
    const __itt_domain* domain;     /*!< Domain of the histogram */
    const char* nameA;              /*!< Name of the histogram */
    /* nameW is a wide-character string only in UNICODE builds; otherwise it
     * is declared as an untyped pointer in the same position. */
#if defined(UNICODE) || defined(_UNICODE)
    const wchar_t* nameW;
#else /* UNICODE || _UNICODE */
    void* nameW;
#endif /* UNICODE || _UNICODE */
    __itt_metadata_type x_type;     /*!< Type of the histogram X axis */
    __itt_metadata_type y_type;     /*!< Type of the histogram Y axis */
    int extra1;                     /*!< Reserved to the runtime */
    void* extra2;                   /*!< Reserved to the runtime */
    struct ___itt_histogram* next;  /*!< Link to another histogram (presumably the next one in a list -- TODO confirm) */
} __itt_histogram;

#pragma pack(pop)
/** @endcond */

/**
 * @brief Create a typed histogram instance with given name/domain.
 * @param[in] domain The domain controlling the call.
 * @param[in] name The name of the histogram.
 * @param[in] x_type The type of the X axis in histogram (may be 0 to calculate batch statistics).
 * @param[in] y_type The type of the Y axis in histogram.
*/
#if ITT_PLATFORM==ITT_PLATFORM_WIN
/* Windows: separate narrow (A) and wide (W) entry points; the unsuffixed
 * name selects one of them depending on UNICODE / _UNICODE. */
__itt_histogram* ITTAPI __itt_histogram_createA(const __itt_domain* domain, const char* name, __itt_metadata_type x_type, __itt_metadata_type y_type);
__itt_histogram* ITTAPI __itt_histogram_createW(const __itt_domain* domain, const wchar_t* name, __itt_metadata_type x_type, __itt_metadata_type y_type);
#if defined(UNICODE) || defined(_UNICODE)
# define __itt_histogram_create __itt_histogram_createW
# define __itt_histogram_create_ptr __itt_histogram_createW_ptr
#else /* UNICODE */
# define __itt_histogram_create __itt_histogram_createA
# define __itt_histogram_create_ptr __itt_histogram_createA_ptr
#endif /* UNICODE */
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
__itt_histogram* ITTAPI __itt_histogram_create(const __itt_domain* domain, const char* name, __itt_metadata_type x_type, __itt_metadata_type y_type);
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
#if ITT_PLATFORM==ITT_PLATFORM_WIN
ITT_STUB(ITTAPI, __itt_histogram*, histogram_createA, (const __itt_domain* domain, const char* name, __itt_metadata_type x_type, __itt_metadata_type y_type))
ITT_STUB(ITTAPI, __itt_histogram*, histogram_createW, (const __itt_domain* domain, const wchar_t* name, __itt_metadata_type x_type, __itt_metadata_type y_type))
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
ITT_STUB(ITTAPI, __itt_histogram*, histogram_create, (const __itt_domain* domain, const char* name, __itt_metadata_type x_type, __itt_metadata_type y_type))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_histogram_createA ITTNOTIFY_DATA(histogram_createA)
#define __itt_histogram_createA_ptr ITTNOTIFY_NAME(histogram_createA)
#define __itt_histogram_createW ITTNOTIFY_DATA(histogram_createW)
#define __itt_histogram_createW_ptr ITTNOTIFY_NAME(histogram_createW)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_histogram_create ITTNOTIFY_DATA(histogram_create)
#define __itt_histogram_create_ptr ITTNOTIFY_NAME(histogram_create)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#else /* INTEL_NO_ITTNOTIFY_API */
/* API compiled out: creation evaluates to a null histogram pointer and the
 * *_ptr names are 0. */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_histogram_createA(domain, name, x_type, y_type) (__itt_histogram*)0
#define __itt_histogram_createA_ptr 0
#define __itt_histogram_createW(domain, name, x_type, y_type) (__itt_histogram*)0
#define __itt_histogram_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_histogram_create(domain, name, x_type, y_type) (__itt_histogram*)0
#define __itt_histogram_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
/* Macro bodies suppressed: only the *_ptr names are provided, as 0. */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_histogram_createA_ptr 0
#define __itt_histogram_createW_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_histogram_create_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @brief Submit statistics for a histogram instance.
 * @param[in] histogram Pointer to the histogram instance to which the histogram statistic is to be dumped.
 * @param[in] length The number of elements in dumped axis data array.
 * @param[in] x_data The X axis dumped data itself (may be NULL to calculate batch statistics).
 * @param[in] y_data The Y axis dumped data itself.
*/
void ITTAPI __itt_histogram_submit(__itt_histogram* histogram, size_t length, void* x_data, void* y_data);

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
ITT_STUBV(ITTAPI, void, histogram_submit, (__itt_histogram* histogram, size_t length, void* x_data, void* y_data))
#define __itt_histogram_submit ITTNOTIFY_VOID(histogram_submit)
#define __itt_histogram_submit_ptr ITTNOTIFY_NAME(histogram_submit)
#else /* INTEL_NO_ITTNOTIFY_API */
/* API compiled out: the call expands to nothing and the *_ptr name is 0. */
#define __itt_histogram_submit(histogram, length, x_data, y_data)
#define __itt_histogram_submit_ptr 0
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
#define __itt_histogram_submit_ptr 0
#endif /* INTEL_NO_MACRO_BODY */

/**
 * @brief Obtain the current collection state.
 * @return collection state as an enum __itt_collection_state
 */
__itt_collection_state __itt_get_collection_state(void);

/**
 * @brief Release resources allocated by the static part of the ITT API.
 * This API should be called from the library destructor.
 * @return void
 */
void __itt_release_resources(void);
/** @endcond */

/**
 * @brief Create a typed counter with given domain pointer, string name and counter type
 */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
/* Windows: separate narrow (A) and wide (W) entry points; the unsuffixed
 * name selects one of them depending on UNICODE / _UNICODE. */
__itt_counter ITTAPI __itt_counter_createA_v3(const __itt_domain* domain, const char* name, __itt_metadata_type type);
__itt_counter ITTAPI __itt_counter_createW_v3(const __itt_domain* domain, const wchar_t* name, __itt_metadata_type type);
#if defined(UNICODE) || defined(_UNICODE)
# define __itt_counter_create_v3 __itt_counter_createW_v3
# define __itt_counter_create_v3_ptr __itt_counter_createW_v3_ptr
#else /* UNICODE */
# define __itt_counter_create_v3 __itt_counter_createA_v3
# define __itt_counter_create_v3_ptr __itt_counter_createA_v3_ptr
#endif /* UNICODE */
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
__itt_counter ITTAPI __itt_counter_create_v3(const __itt_domain* domain, const char* name, __itt_metadata_type type);
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */

/** @cond exclude_from_documentation */
#ifndef INTEL_NO_MACRO_BODY
#ifndef INTEL_NO_ITTNOTIFY_API
#if ITT_PLATFORM==ITT_PLATFORM_WIN
ITT_STUB(ITTAPI, __itt_counter, counter_createA_v3, (const __itt_domain* domain, const char* name, __itt_metadata_type type))
ITT_STUB(ITTAPI, __itt_counter, counter_createW_v3, (const __itt_domain* domain, const wchar_t* name, __itt_metadata_type type))
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
ITT_STUB(ITTAPI, __itt_counter, counter_create_v3, (const __itt_domain* domain, const char* name, __itt_metadata_type type))
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_counter_createA_v3 ITTNOTIFY_DATA(counter_createA_v3)
#define __itt_counter_createA_v3_ptr ITTNOTIFY_NAME(counter_createA_v3)
#define __itt_counter_createW_v3 ITTNOTIFY_DATA(counter_createW_v3)
#define __itt_counter_createW_v3_ptr ITTNOTIFY_NAME(counter_createW_v3)
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_counter_create_v3 ITTNOTIFY_DATA(counter_create_v3)
#define __itt_counter_create_v3_ptr ITTNOTIFY_NAME(counter_create_v3)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#else /* INTEL_NO_ITTNOTIFY_API */
/* API compiled out: creation evaluates to a null counter and the *_ptr names
 * are 0. */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_counter_createA_v3(domain, name, type) (__itt_counter)0
#define __itt_counter_createA_v3_ptr 0
#define __itt_counter_createW_v3(domain, name, type) (__itt_counter)0
/* Pointer name must pair with the W_v3 call macro above; it is what
 * __itt_counter_create_v3_ptr resolves to in UNICODE builds. */
#define __itt_counter_createW_v3_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_counter_create_v3(domain, name, type) (__itt_counter)0
#define __itt_counter_create_v3_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_ITTNOTIFY_API */
#else /* INTEL_NO_MACRO_BODY */
/* Macro bodies suppressed: only the *_ptr names are provided, as 0. */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
#define __itt_counter_createA_v3_ptr 0
#define __itt_counter_createW_v3_ptr 0
#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#define __itt_counter_create_v3_ptr 0
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
#endif /* INTEL_NO_MACRO_BODY */
/** @endcond */

/**
 * @brief Set
the counter value api */ void ITTAPI __itt_counter_set_value_v3(__itt_counter counter, void *value_ptr); #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, counter_set_value_v3, (__itt_counter counter, void *value_ptr)) #define __itt_counter_set_value_v3 ITTNOTIFY_VOID(counter_set_value_v3) #define __itt_counter_set_value_v3_ptr ITTNOTIFY_NAME(counter_set_value_v3) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_counter_set_value_v3(counter, value_ptr) #define __itt_counter_set_value_v3_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_counter_set_value_v3_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief describes the type of context metadata */ typedef enum { __itt_context_unknown = 0, /*!< Undefined type */ __itt_context_nameA, /*!< ASCII string char* type */ __itt_context_nameW, /*!< Unicode string wchar_t* type */ __itt_context_deviceA, /*!< ASCII string char* type */ __itt_context_deviceW, /*!< Unicode string wchar_t* type */ __itt_context_unitsA, /*!< ASCII string char* type */ __itt_context_unitsW, /*!< Unicode string wchar_t* type */ __itt_context_pci_addrA, /*!< ASCII string char* type */ __itt_context_pci_addrW, /*!< Unicode string wchar_t* type */ __itt_context_tid, /*!< Unsigned 64-bit integer type */ __itt_context_max_val, /*!< Unsigned 64-bit integer type */ __itt_context_bandwidth_flag, /*!< Unsigned 64-bit integer type */ __itt_context_latency_flag, /*!< Unsigned 64-bit integer type */ __itt_context_occupancy_flag, /*!< Unsigned 64-bit integer type */ __itt_context_on_thread_flag, /*!< Unsigned 64-bit integer type */ __itt_context_is_abs_val_flag, /*!< Unsigned 64-bit integer type */ __itt_context_cpu_instructions_flag, /*!< Unsigned 64-bit integer type */ __itt_context_cpu_cycles_flag /*!< Unsigned 64-bit integer type */ } __itt_context_type; #if defined(UNICODE) || defined(_UNICODE) # define __itt_context_name __itt_context_nameW # define __itt_context_device 
__itt_context_deviceW # define __itt_context_units __itt_context_unitsW # define __itt_context_pci_addr __itt_context_pci_addrW #else /* UNICODE || _UNICODE */ # define __itt_context_name __itt_context_nameA # define __itt_context_device __itt_context_deviceA # define __itt_context_units __itt_context_unitsA # define __itt_context_pci_addr __itt_context_pci_addrA #endif /* UNICODE || _UNICODE */ /** @cond exclude_from_documentation */ #pragma pack(push, 8) typedef struct ___itt_context_metadata { __itt_context_type type; /*!< Type of the context metadata value */ void* value; /*!< Pointer to context metadata value itself */ } __itt_context_metadata; #pragma pack(pop) /** @endcond */ /** @cond exclude_from_documentation */ #pragma pack(push, 8) typedef struct ___itt_counter_metadata { __itt_counter counter; /*!< Associated context metadata counter */ __itt_context_type type; /*!< Type of the context metadata value */ const char* str_valueA; /*!< String context metadata value */ #if defined(UNICODE) || defined(_UNICODE) const wchar_t* str_valueW; #else /* UNICODE || _UNICODE */ void* str_valueW; #endif /* UNICODE || _UNICODE */ unsigned long long value; /*!< Numeric context metadata value */ int extra1; /*!< Reserved to the runtime */ void* extra2; /*!< Reserved to the runtime */ struct ___itt_counter_metadata* next; } __itt_counter_metadata; #pragma pack(pop) /** @endcond */ /** * @brief Bind context metadata to counter instance * @param[in] counter Pointer to the counter instance to which the context metadata is to be associated. * @param[in] length The number of elements in context metadata array. * @param[in] metadata The context metadata itself. 
*/ void ITTAPI __itt_bind_context_metadata_to_counter(__itt_counter counter, size_t length, __itt_context_metadata* metadata); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, bind_context_metadata_to_counter, (__itt_counter counter, size_t length, __itt_context_metadata* metadata)) #define __itt_bind_context_metadata_to_counter ITTNOTIFY_VOID(bind_context_metadata_to_counter) #define __itt_bind_context_metadata_to_counter_ptr ITTNOTIFY_NAME(bind_context_metadata_to_counter) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_bind_context_metadata_to_counter(counter, length, metadata) #define __itt_bind_context_metadata_to_counter_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_bind_context_metadata_to_counter_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ #ifdef __cplusplus } #endif /* __cplusplus */ #endif /* _ITTNOTIFY_H_ */ #ifdef INTEL_ITTNOTIFY_API_PRIVATE #ifndef _ITTNOTIFY_PRIVATE_ #define _ITTNOTIFY_PRIVATE_ #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ /** * @ingroup clockdomain * @brief Begin an overlapped task instance. * @param[in] domain The domain for this task * @param[in] clock_domain The clock domain controlling the execution of this call. * @param[in] timestamp The user defined timestamp. * @param[in] taskid The identifier for this task instance, *cannot* be __itt_null. * @param[in] parentid The parent of this task, or __itt_null. * @param[in] name The name of this task. */ void ITTAPI __itt_task_begin_overlapped_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, __itt_string_handle* name); /** * @ingroup clockdomain * @brief End an overlapped task instance. * @param[in] domain The domain for this task * @param[in] clock_domain The clock domain controlling the execution of this call. * @param[in] timestamp The user defined timestamp. 
* @param[in] taskid Explicit ID of finished task */ void ITTAPI __itt_task_end_overlapped_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, task_begin_overlapped_ex, (const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, __itt_string_handle* name)) ITT_STUBV(ITTAPI, void, task_end_overlapped_ex, (const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid)) #define __itt_task_begin_overlapped_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(task_begin_overlapped_ex,d,x,y,z,a,b) #define __itt_task_begin_overlapped_ex_ptr ITTNOTIFY_NAME(task_begin_overlapped_ex) #define __itt_task_end_overlapped_ex(d,x,y,z) ITTNOTIFY_VOID_D3(task_end_overlapped_ex,d,x,y,z) #define __itt_task_end_overlapped_ex_ptr ITTNOTIFY_NAME(task_end_overlapped_ex) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_task_begin_overlapped_ex(domain,clock_domain,timestamp,taskid,parentid,name) #define __itt_task_begin_overlapped_ex_ptr 0 #define __itt_task_end_overlapped_ex(domain,clock_domain,timestamp,taskid) #define __itt_task_end_overlapped_ex_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_task_begin_overlapped_ex_ptr 0 #define __itt_task_end_overlapped_ptr 0 /* NOTE(review): lacks the _ex suffix and has no counterpart in the two branches above - looks like a leftover; confirm before removing */ #define __itt_task_end_overlapped_ex_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @defgroup marks_internal Marks * @ingroup internal * Marks group * @warning Internal API: * - It is not shipped outside of Intel * - It is delivered to internal Intel teams using e-mail or SVN access only * @{ */ /** @brief user mark type */ typedef int __itt_mark_type; /** * @brief Creates a user mark type with the specified name using char or Unicode string.
* @param[in] name - name of mark to create * @return Returns a handle to the mark type */ #if ITT_PLATFORM==ITT_PLATFORM_WIN __itt_mark_type ITTAPI __itt_mark_createA(const char *name); __itt_mark_type ITTAPI __itt_mark_createW(const wchar_t *name); #if defined(UNICODE) || defined(_UNICODE) # define __itt_mark_create __itt_mark_createW # define __itt_mark_create_ptr __itt_mark_createW_ptr #else /* UNICODE */ # define __itt_mark_create __itt_mark_createA # define __itt_mark_create_ptr __itt_mark_createA_ptr #endif /* UNICODE */ #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ __itt_mark_type ITTAPI __itt_mark_create(const char *name); #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(ITTAPI, __itt_mark_type, mark_createA, (const char *name)) ITT_STUB(ITTAPI, __itt_mark_type, mark_createW, (const wchar_t *name)) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUB(ITTAPI, __itt_mark_type, mark_create, (const char *name)) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_mark_createA ITTNOTIFY_DATA(mark_createA) #define __itt_mark_createA_ptr ITTNOTIFY_NAME(mark_createA) #define __itt_mark_createW ITTNOTIFY_DATA(mark_createW) #define __itt_mark_createW_ptr ITTNOTIFY_NAME(mark_createW) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_mark_create ITTNOTIFY_DATA(mark_create) #define __itt_mark_create_ptr ITTNOTIFY_NAME(mark_create) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #else /* INTEL_NO_ITTNOTIFY_API */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_mark_createA(name) (__itt_mark_type)0 #define __itt_mark_createA_ptr 0 #define __itt_mark_createW(name) (__itt_mark_type)0 #define __itt_mark_createW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_mark_create(name) (__itt_mark_type)0 #define __itt_mark_create_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* 
INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_mark_createA_ptr 0 #define __itt_mark_createW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_mark_create_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief Creates a "discrete" user mark of the specified type with an optional parameter using char or Unicode string. * * - The mark of "discrete" type is placed into the collection results in case of success. It appears in overtime view(s) as a special tick sign. * - The call is "synchronous" - the function returns after the mark is actually added to the results. * - This function is useful, for example, to mark different phases of an application * (the beginning of the next mark automatically means the end of the current region). * - Can be used together with "continuous" marks (see below) in the same collection session * @param[in] mt - mark, created by __itt_mark_create(const char* name) function * @param[in] parameter - string parameter of mark * @return Returns zero value in case of success, non-zero value otherwise.
*/ #if ITT_PLATFORM==ITT_PLATFORM_WIN int ITTAPI __itt_markA(__itt_mark_type mt, const char *parameter); int ITTAPI __itt_markW(__itt_mark_type mt, const wchar_t *parameter); #if defined(UNICODE) || defined(_UNICODE) # define __itt_mark __itt_markW # define __itt_mark_ptr __itt_markW_ptr #else /* UNICODE */ # define __itt_mark __itt_markA # define __itt_mark_ptr __itt_markA_ptr #endif /* UNICODE */ #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ int ITTAPI __itt_mark(__itt_mark_type mt, const char *parameter); #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(ITTAPI, int, markA, (__itt_mark_type mt, const char *parameter)) ITT_STUB(ITTAPI, int, markW, (__itt_mark_type mt, const wchar_t *parameter)) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUB(ITTAPI, int, mark, (__itt_mark_type mt, const char *parameter)) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_markA ITTNOTIFY_DATA(markA) #define __itt_markA_ptr ITTNOTIFY_NAME(markA) #define __itt_markW ITTNOTIFY_DATA(markW) #define __itt_markW_ptr ITTNOTIFY_NAME(markW) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_mark ITTNOTIFY_DATA(mark) #define __itt_mark_ptr ITTNOTIFY_NAME(mark) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #else /* INTEL_NO_ITTNOTIFY_API */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_markA(mt, parameter) (int)0 #define __itt_markA_ptr 0 #define __itt_markW(mt, parameter) (int)0 #define __itt_markW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_mark(mt, parameter) (int)0 #define __itt_mark_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_markA_ptr 0 #define __itt_markW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_mark_ptr 0 #endif /* 
ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief Use this if necessary to create a "discrete" user event type (mark) for the whole process * rather than for one thread * @see int __itt_mark(__itt_mark_type mt, const char* parameter); */ #if ITT_PLATFORM==ITT_PLATFORM_WIN int ITTAPI __itt_mark_globalA(__itt_mark_type mt, const char *parameter); int ITTAPI __itt_mark_globalW(__itt_mark_type mt, const wchar_t *parameter); #if defined(UNICODE) || defined(_UNICODE) # define __itt_mark_global __itt_mark_globalW # define __itt_mark_global_ptr __itt_mark_globalW_ptr #else /* UNICODE */ # define __itt_mark_global __itt_mark_globalA # define __itt_mark_global_ptr __itt_mark_globalA_ptr #endif /* UNICODE */ #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ int ITTAPI __itt_mark_global(__itt_mark_type mt, const char *parameter); #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(ITTAPI, int, mark_globalA, (__itt_mark_type mt, const char *parameter)) ITT_STUB(ITTAPI, int, mark_globalW, (__itt_mark_type mt, const wchar_t *parameter)) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUB(ITTAPI, int, mark_global, (__itt_mark_type mt, const char *parameter)) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_mark_globalA ITTNOTIFY_DATA(mark_globalA) #define __itt_mark_globalA_ptr ITTNOTIFY_NAME(mark_globalA) #define __itt_mark_globalW ITTNOTIFY_DATA(mark_globalW) #define __itt_mark_globalW_ptr ITTNOTIFY_NAME(mark_globalW) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_mark_global ITTNOTIFY_DATA(mark_global) #define __itt_mark_global_ptr ITTNOTIFY_NAME(mark_global) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #else /* INTEL_NO_ITTNOTIFY_API */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_mark_globalA(mt, parameter) (int)0 #define __itt_mark_globalA_ptr 0 #define
__itt_mark_globalW(mt, parameter) (int)0 #define __itt_mark_globalW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_mark_global(mt, parameter) (int)0 #define __itt_mark_global_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_mark_globalA_ptr 0 #define __itt_mark_globalW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_mark_global_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief Creates an "end" point for "continuous" mark with specified name. * * - Returns zero value in case of success, non-zero value otherwise. * Also returns non-zero value when preceding "begin" point for the * mark with the same name failed to be created or not created. * - The mark of "continuous" type is placed to collection results in * case of success. It appears in overtime view(s) as a special tick * sign (different from "discrete" mark) together with line from * corresponding "begin" mark to "end" mark. * @note Continuous marks can overlap and be nested inside each other. * Discrete mark can be nested inside marked region * @param[in] mt - mark, created by __itt_mark_create(const char* name) function * @return Returns zero value in case of success, non-zero value otherwise. 
*/ int ITTAPI __itt_mark_off(__itt_mark_type mt); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUB(ITTAPI, int, mark_off, (__itt_mark_type mt)) #define __itt_mark_off ITTNOTIFY_DATA(mark_off) #define __itt_mark_off_ptr ITTNOTIFY_NAME(mark_off) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_mark_off(mt) (int)0 #define __itt_mark_off_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_mark_off_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief Use this if necessary to create an "end" point for a process-wide mark * @see int __itt_mark_off(__itt_mark_type mt); */ int ITTAPI __itt_mark_global_off(__itt_mark_type mt); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUB(ITTAPI, int, mark_global_off, (__itt_mark_type mt)) #define __itt_mark_global_off ITTNOTIFY_DATA(mark_global_off) #define __itt_mark_global_off_ptr ITTNOTIFY_NAME(mark_global_off) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_mark_global_off(mt) (int)0 #define __itt_mark_global_off_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_mark_global_off_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @} marks group */ /** * @defgroup counters_internal Counters * @ingroup internal * Counters group * @{ */ /* NOTE(review): no declarations follow before the next group opens, and no matching close for counters_internal is visible nearby - confirm the group nesting is intended. */ /** * @defgroup stitch Stack Stitching * @ingroup internal * Stack Stitching group * @{ */ /** * @brief opaque handle identifying a stitch point (the value returned by __itt_stack_caller_create()) */ typedef struct ___itt_caller *__itt_caller; /** * @brief Create the stitch point, i.e. a point in the call stack where other stacks should be stitched to. * The function returns a unique identifier which is used to match the cut points with corresponding stitch points.
*/ __itt_caller ITTAPI __itt_stack_caller_create(void); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUB(ITTAPI, __itt_caller, stack_caller_create, (void)) #define __itt_stack_caller_create ITTNOTIFY_DATA(stack_caller_create) #define __itt_stack_caller_create_ptr ITTNOTIFY_NAME(stack_caller_create) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_stack_caller_create() (__itt_caller)0 #define __itt_stack_caller_create_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_stack_caller_create_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief Destroy the information about stitch point identified by the pointer previously returned by __itt_stack_caller_create() */ void ITTAPI __itt_stack_caller_destroy(__itt_caller id); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, stack_caller_destroy, (__itt_caller id)) #define __itt_stack_caller_destroy ITTNOTIFY_VOID(stack_caller_destroy) #define __itt_stack_caller_destroy_ptr ITTNOTIFY_NAME(stack_caller_destroy) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_stack_caller_destroy(id) #define __itt_stack_caller_destroy_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_stack_caller_destroy_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief Sets the cut point. Stack from each event which occurs after this call will be cut * at the same stack level the function was called and stitched to the corresponding stitch point. 
*/ void ITTAPI __itt_stack_callee_enter(__itt_caller id); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, stack_callee_enter, (__itt_caller id)) #define __itt_stack_callee_enter ITTNOTIFY_VOID(stack_callee_enter) #define __itt_stack_callee_enter_ptr ITTNOTIFY_NAME(stack_callee_enter) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_stack_callee_enter(id) #define __itt_stack_callee_enter_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_stack_callee_enter_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief This function eliminates the cut point which was set by latest __itt_stack_callee_enter(). */ void ITTAPI __itt_stack_callee_leave(__itt_caller id); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, stack_callee_leave, (__itt_caller id)) #define __itt_stack_callee_leave ITTNOTIFY_VOID(stack_callee_leave) #define __itt_stack_callee_leave_ptr ITTNOTIFY_NAME(stack_callee_leave) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_stack_callee_leave(id) #define __itt_stack_callee_leave_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_stack_callee_leave_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @} stitch group */ /* ***************************************************************************************************************************** */ #include /** @cond exclude_from_documentation */ typedef enum __itt_error_code { __itt_error_success = 0, /*!< no error */ __itt_error_no_module = 1, /*!< module can't be loaded */ /* %1$s -- library name; win: %2$d -- system error code; unix: %2$s -- system error message. */ __itt_error_no_symbol = 2, /*!< symbol not found */ /* %1$s -- library name, %2$s -- symbol name. */ __itt_error_unknown_group = 3, /*!< unknown group specified */ /* %1$s -- env var name, %2$s -- group name. 
*/ __itt_error_cant_read_env = 4, /*!< GetEnvironmentVariable() failed */ /* %1$s -- env var name, %2$d -- system error. */ __itt_error_env_too_long = 5, /*!< variable value too long */ /* %1$s -- env var name, %2$d -- actual length of the var, %3$d -- max allowed length. */ __itt_error_system = 6 /*!< pthread_mutexattr_init or pthread_mutex_init failed */ /* %1$s -- function name, %2$d -- errno. */ } __itt_error_code; typedef void (__itt_error_handler_t)(__itt_error_code code, va_list); __itt_error_handler_t* __itt_set_error_handler(__itt_error_handler_t*); const char* ITTAPI __itt_api_version(void); /** @endcond */ /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API #define __itt_error_handler ITT_JOIN(INTEL_ITTNOTIFY_PREFIX, error_handler) void __itt_error_handler(__itt_error_code code, va_list args); extern const int ITTNOTIFY_NAME(err); #define __itt_err ITTNOTIFY_NAME(err) ITT_STUB(ITTAPI, const char*, api_version, (void)) #define __itt_api_version ITTNOTIFY_DATA(api_version) #define __itt_api_version_ptr ITTNOTIFY_NAME(api_version) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_api_version() (const char*)0 #define __itt_api_version_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_api_version_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ #ifdef __cplusplus } #endif /* __cplusplus */ #endif /* _ITTNOTIFY_PRIVATE_ */ #endif /* INTEL_ITTNOTIFY_API_PRIVATE */ ================================================ FILE: third-party/tbb/src/tbb/tools_api/ittnotify_config.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef _ITTNOTIFY_CONFIG_H_ #define _ITTNOTIFY_CONFIG_H_ /** @cond exclude_from_documentation */ #ifndef ITT_OS_WIN # define ITT_OS_WIN 1 #endif /* ITT_OS_WIN */ #ifndef ITT_OS_LINUX # define ITT_OS_LINUX 2 #endif /* ITT_OS_LINUX */ #ifndef ITT_OS_MAC # define ITT_OS_MAC 3 #endif /* ITT_OS_MAC */ #ifndef ITT_OS_FREEBSD # define ITT_OS_FREEBSD 4 #endif /* ITT_OS_FREEBSD */ #ifndef ITT_OS_OPENBSD # define ITT_OS_OPENBSD 5 #endif /* ITT_OS_OPENBSD */ #ifndef ITT_OS # if defined WIN32 || defined _WIN32 # define ITT_OS ITT_OS_WIN # elif defined( __APPLE__ ) && defined( __MACH__ ) # define ITT_OS ITT_OS_MAC # elif defined( __FreeBSD__ ) # define ITT_OS ITT_OS_FREEBSD # elif defined( __OpenBSD__ ) # define ITT_OS ITT_OS_OPENBSD # else # define ITT_OS ITT_OS_LINUX # endif #endif /* ITT_OS */ #ifndef ITT_PLATFORM_WIN # define ITT_PLATFORM_WIN 1 #endif /* ITT_PLATFORM_WIN */ #ifndef ITT_PLATFORM_POSIX # define ITT_PLATFORM_POSIX 2 #endif /* ITT_PLATFORM_POSIX */ #ifndef ITT_PLATFORM_MAC # define ITT_PLATFORM_MAC 3 #endif /* ITT_PLATFORM_MAC */ #ifndef ITT_PLATFORM_FREEBSD # define ITT_PLATFORM_FREEBSD 4 #endif /* ITT_PLATFORM_FREEBSD */ #ifndef ITT_PLATFORM_OPENBSD # define ITT_PLATFORM_OPENBSD 5 #endif /* ITT_PLATFORM_OPENBSD */ #ifndef ITT_PLATFORM # if ITT_OS==ITT_OS_WIN # define ITT_PLATFORM ITT_PLATFORM_WIN # elif ITT_OS==ITT_OS_MAC # define ITT_PLATFORM ITT_PLATFORM_MAC # elif ITT_OS==ITT_OS_FREEBSD # define ITT_PLATFORM ITT_PLATFORM_FREEBSD # elif ITT_OS==ITT_OS_OPENBSD # define ITT_PLATFORM ITT_PLATFORM_OPENBSD # else # define ITT_PLATFORM ITT_PLATFORM_POSIX # endif #endif /* 
ITT_PLATFORM */ #if defined(_UNICODE) && !defined(UNICODE) #define UNICODE #endif #include #if ITT_PLATFORM==ITT_PLATFORM_WIN #include #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #include #if defined(UNICODE) || defined(_UNICODE) #include #endif /* UNICODE || _UNICODE */ #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #ifndef ITTAPI_CDECL # if ITT_PLATFORM==ITT_PLATFORM_WIN # define ITTAPI_CDECL __cdecl # else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ # if defined _M_IX86 || defined __i386__ # define ITTAPI_CDECL __attribute__ ((cdecl)) # else /* _M_IX86 || __i386__ */ # define ITTAPI_CDECL /* actual only on x86 platform */ # endif /* _M_IX86 || __i386__ */ # endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* ITTAPI_CDECL */ #ifndef STDCALL # if ITT_PLATFORM==ITT_PLATFORM_WIN # define STDCALL __stdcall # else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ # if defined _M_IX86 || defined __i386__ # define STDCALL __attribute__ ((stdcall)) # else /* _M_IX86 || __i386__ */ # define STDCALL /* supported only on x86 platform */ # endif /* _M_IX86 || __i386__ */ # endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* STDCALL */ #define ITTAPI ITTAPI_CDECL #define LIBITTAPI ITTAPI_CDECL /* TODO: Temporary for compatibility! */ #define ITTAPI_CALL ITTAPI_CDECL #define LIBITTAPI_CALL ITTAPI_CDECL #if ITT_PLATFORM==ITT_PLATFORM_WIN /* use __forceinline (VC++ specific) */ #if defined(__MINGW32__) && !defined(__cplusplus) #define ITT_INLINE static __inline__ __attribute__((__always_inline__,__gnu_inline__)) #else #define ITT_INLINE static __forceinline #endif /* __MINGW32__ */ #define ITT_INLINE_ATTRIBUTE /* nothing */ #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ /* * Generally, functions are not inlined unless optimization is specified. * For functions declared inline, this attribute inlines the function even * if no optimization level was specified. 
*/ #ifdef __STRICT_ANSI__ #define ITT_INLINE static #define ITT_INLINE_ATTRIBUTE __attribute__((unused)) #else /* __STRICT_ANSI__ */ #define ITT_INLINE static inline #define ITT_INLINE_ATTRIBUTE __attribute__((always_inline, unused)) #endif /* __STRICT_ANSI__ */ #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ /** @endcond */ #ifndef ITT_ARCH_IA32 # define ITT_ARCH_IA32 1 #endif /* ITT_ARCH_IA32 */ #ifndef ITT_ARCH_IA32E # define ITT_ARCH_IA32E 2 #endif /* ITT_ARCH_IA32E */ #ifndef ITT_ARCH_IA64 # define ITT_ARCH_IA64 3 #endif /* ITT_ARCH_IA64 */ #ifndef ITT_ARCH_ARM # define ITT_ARCH_ARM 4 #endif /* ITT_ARCH_ARM */ #ifndef ITT_ARCH_PPC64 # define ITT_ARCH_PPC64 5 #endif /* ITT_ARCH_PPC64 */ #ifndef ITT_ARCH_ARM64 # define ITT_ARCH_ARM64 6 #endif /* ITT_ARCH_ARM64 */ #ifndef ITT_ARCH_LOONGARCH64 # define ITT_ARCH_LOONGARCH64 7 #endif /* ITT_ARCH_LOONGARCH64 */ #ifndef ITT_ARCH_S390X # define ITT_ARCH_S390X 8 #endif /* ITT_ARCH_S390X */ #ifndef ITT_ARCH_HPPA # define ITT_ARCH_HPPA 9 #endif /* ITT_ARCH_HPPA */ #ifndef ITT_ARCH_RISCV64 # define ITT_ARCH_RISCV64 10 #endif /* ITT_ARCH_RISCV64 */ #ifndef ITT_ARCH # if defined _M_IX86 || defined __i386__ # define ITT_ARCH ITT_ARCH_IA32 # elif defined _M_X64 || defined _M_AMD64 || defined __x86_64__ # define ITT_ARCH ITT_ARCH_IA32E # elif defined _M_IA64 || defined __ia64__ # define ITT_ARCH ITT_ARCH_IA64 # elif defined _M_ARM || defined __arm__ # define ITT_ARCH ITT_ARCH_ARM # elif defined __aarch64__ # define ITT_ARCH ITT_ARCH_ARM64 # elif defined __powerpc64__ # define ITT_ARCH ITT_ARCH_PPC64 # elif defined __loongarch__ # define ITT_ARCH ITT_ARCH_LOONGARCH64 # elif defined __s390__ || defined __s390x__ # define ITT_ARCH ITT_ARCH_S390X # elif defined __hppa__ # define ITT_ARCH ITT_ARCH_HPPA # elif defined __riscv && __riscv_xlen == 64 # define ITT_ARCH ITT_ARCH_RISCV64 # endif #endif #ifdef __cplusplus # define ITT_EXTERN_C extern "C" # define ITT_EXTERN_C_BEGIN extern "C" { # define ITT_EXTERN_C_END } #else # define 
ITT_EXTERN_C /* nothing */ # define ITT_EXTERN_C_BEGIN /* nothing */ # define ITT_EXTERN_C_END /* nothing */ #endif /* __cplusplus */ #define ITT_TO_STR_AUX(x) #x #define ITT_TO_STR(x) ITT_TO_STR_AUX(x) #define __ITT_BUILD_ASSERT(expr, suffix) do { \ static char __itt_build_check_##suffix[(expr) ? 1 : -1]; \ __itt_build_check_##suffix[0] = 0; \ } while(0) #define _ITT_BUILD_ASSERT(expr, suffix) __ITT_BUILD_ASSERT((expr), suffix) #define ITT_BUILD_ASSERT(expr) _ITT_BUILD_ASSERT((expr), __LINE__) #define ITT_MAGIC { 0xED, 0xAB, 0xAB, 0xEC, 0x0D, 0xEE, 0xDA, 0x30 } /* Replace with snapshot date YYYYMMDD for promotion build. */ #define API_VERSION_BUILD 20230630 #ifndef API_VERSION_NUM #define API_VERSION_NUM 3.24.4 #endif /* API_VERSION_NUM */ #define API_VERSION "ITT-API-Version " ITT_TO_STR(API_VERSION_NUM) \ " (" ITT_TO_STR(API_VERSION_BUILD) ")" /* OS communication functions */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #include typedef HMODULE lib_t; typedef DWORD TIDT; typedef CRITICAL_SECTION mutex_t; #ifdef __cplusplus #define MUTEX_INITIALIZER {} #else #define MUTEX_INITIALIZER { 0 } #endif #define strong_alias(name, aliasname) /* empty for Windows */ #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #include #if defined(UNICODE) || defined(_UNICODE) #include #endif /* UNICODE */ #ifndef _GNU_SOURCE #define _GNU_SOURCE 1 /* need for PTHREAD_MUTEX_RECURSIVE */ #endif /* _GNU_SOURCE */ #ifndef __USE_UNIX98 #define __USE_UNIX98 1 /* need for PTHREAD_MUTEX_RECURSIVE, on SLES11.1 with gcc 4.3.4 wherein pthread.h missing dependency on __USE_XOPEN2K8 */ #endif /*__USE_UNIX98*/ #include typedef void* lib_t; typedef pthread_t TIDT; typedef pthread_mutex_t mutex_t; #define MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER #define _strong_alias(name, aliasname) \ extern __typeof (name) aliasname __attribute__ ((alias (#name))); #define strong_alias(name, aliasname) _strong_alias(name, aliasname) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define 
__itt_get_proc(lib, name) GetProcAddress(lib, name) #define __itt_mutex_init(mutex) InitializeCriticalSection(mutex) #define __itt_mutex_lock(mutex) EnterCriticalSection(mutex) #define __itt_mutex_unlock(mutex) LeaveCriticalSection(mutex) #define __itt_mutex_destroy(mutex) DeleteCriticalSection(mutex) #define __itt_load_lib(name) LoadLibraryA(name) #define __itt_unload_lib(handle) FreeLibrary(handle) #define __itt_system_error() (int)GetLastError() #define __itt_fstrcmp(s1, s2) lstrcmpA(s1, s2) #define __itt_fstrnlen(s, l) strnlen_s(s, l) #define __itt_fstrcpyn(s1, b, s2, l) strncpy_s(s1, b, s2, l) #define __itt_thread_id() GetCurrentThreadId() #define __itt_thread_yield() SwitchToThread() #ifndef ITT_SIMPLE_INIT ITT_INLINE long __itt_interlocked_increment(volatile long* ptr) ITT_INLINE_ATTRIBUTE; ITT_INLINE long __itt_interlocked_increment(volatile long* ptr) { return InterlockedIncrement(ptr); } ITT_INLINE long __itt_interlocked_compare_exchange(volatile long* ptr, long exchange, long comperand) ITT_INLINE_ATTRIBUTE; ITT_INLINE long __itt_interlocked_compare_exchange(volatile long* ptr, long exchange, long comperand) { return InterlockedCompareExchange(ptr, exchange, comperand); } #endif /* ITT_SIMPLE_INIT */ #define DL_SYMBOLS (1) #define PTHREAD_SYMBOLS (1) #else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ #define __itt_get_proc(lib, name) dlsym(lib, name) #define __itt_mutex_init(mutex) {\ pthread_mutexattr_t mutex_attr; \ int error_code = pthread_mutexattr_init(&mutex_attr); \ if (error_code) \ __itt_report_error(__itt_error_system, "pthread_mutexattr_init", \ error_code); \ error_code = pthread_mutexattr_settype(&mutex_attr, \ PTHREAD_MUTEX_RECURSIVE); \ if (error_code) \ __itt_report_error(__itt_error_system, "pthread_mutexattr_settype", \ error_code); \ error_code = pthread_mutex_init(mutex, &mutex_attr); \ if (error_code) \ __itt_report_error(__itt_error_system, "pthread_mutex_init", \ error_code); \ error_code = pthread_mutexattr_destroy(&mutex_attr); \ if 
(error_code) \ __itt_report_error(__itt_error_system, "pthread_mutexattr_destroy", \ error_code); \ } #define __itt_mutex_lock(mutex) pthread_mutex_lock(mutex) #define __itt_mutex_unlock(mutex) pthread_mutex_unlock(mutex) #define __itt_mutex_destroy(mutex) pthread_mutex_destroy(mutex) #define __itt_load_lib(name) dlopen(name, RTLD_LAZY) #define __itt_unload_lib(handle) dlclose(handle) #define __itt_system_error() errno #define __itt_fstrcmp(s1, s2) strcmp(s1, s2) /* makes customer code define safe APIs for SDL_STRNLEN_S and SDL_STRNCPY_S */ #ifdef SDL_STRNLEN_S #define __itt_fstrnlen(s, l) SDL_STRNLEN_S(s, l) #else #define __itt_fstrnlen(s, l) strlen(s) #endif /* SDL_STRNLEN_S */ #ifdef SDL_STRNCPY_S #define __itt_fstrcpyn(s1, b, s2, l) SDL_STRNCPY_S(s1, b, s2, l) #else #define __itt_fstrcpyn(s1, b, s2, l) { \ if (b > 0) { \ /* 'volatile' is used to suppress the warning that a destination */ \ /* bound depends on the length of the source. */ \ volatile size_t num_to_copy = (size_t)(b - 1) < (size_t)(l) ? 
\ (size_t)(b - 1) : (size_t)(l); \ strncpy(s1, s2, num_to_copy); \ s1[num_to_copy] = 0; \ } \ } #endif /* SDL_STRNCPY_S */ #define __itt_thread_id() pthread_self() #define __itt_thread_yield() sched_yield() #if ITT_ARCH==ITT_ARCH_IA64 #ifdef __INTEL_COMPILER #define __TBB_machine_fetchadd4(addr, val) __fetchadd4_acq((void *)addr, val) #else /* __INTEL_COMPILER */ #define __TBB_machine_fetchadd4(addr, val) __sync_fetch_and_add(addr, val) #endif /* __INTEL_COMPILER */ #elif ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_IA32E /* ITT_ARCH!=ITT_ARCH_IA64 */ ITT_INLINE long __TBB_machine_fetchadd4(volatile void* ptr, long addend) ITT_INLINE_ATTRIBUTE; ITT_INLINE long __TBB_machine_fetchadd4(volatile void* ptr, long addend) { long result; __asm__ __volatile__("lock\nxadd %0,%1" : "=r"(result),"=m"(*(volatile int*)ptr) : "0"(addend), "m"(*(volatile int*)ptr) : "memory"); return result; } #else #define __TBB_machine_fetchadd4(addr, val) __sync_fetch_and_add(addr, val) #endif /* ITT_ARCH==ITT_ARCH_IA64 */ #ifndef ITT_SIMPLE_INIT ITT_INLINE long __itt_interlocked_increment(volatile long* ptr) ITT_INLINE_ATTRIBUTE; ITT_INLINE long __itt_interlocked_increment(volatile long* ptr) { return __TBB_machine_fetchadd4(ptr, 1) + 1L; } ITT_INLINE long __itt_interlocked_compare_exchange(volatile long* ptr, long exchange, long comperand) ITT_INLINE_ATTRIBUTE; ITT_INLINE long __itt_interlocked_compare_exchange(volatile long* ptr, long exchange, long comperand) { return __sync_val_compare_and_swap(ptr, exchange, comperand); } #endif /* ITT_SIMPLE_INIT */ void* dlopen(const char*, int) __attribute__((weak)); void* dlsym(void*, const char*) __attribute__((weak)); int dlclose(void*) __attribute__((weak)); #define DL_SYMBOLS (dlopen && dlsym && dlclose) int pthread_mutex_init(pthread_mutex_t*, const pthread_mutexattr_t*) __attribute__((weak)); int pthread_mutex_lock(pthread_mutex_t*) __attribute__((weak)); int pthread_mutex_unlock(pthread_mutex_t*) __attribute__((weak)); int 
pthread_mutex_destroy(pthread_mutex_t*) __attribute__((weak)); int pthread_mutexattr_init(pthread_mutexattr_t*) __attribute__((weak)); int pthread_mutexattr_settype(pthread_mutexattr_t*, int) __attribute__((weak)); int pthread_mutexattr_destroy(pthread_mutexattr_t*) __attribute__((weak)); pthread_t pthread_self(void) __attribute__((weak)); #define PTHREAD_SYMBOLS (pthread_mutex_init && pthread_mutex_lock && pthread_mutex_unlock && pthread_mutex_destroy && pthread_mutexattr_init && pthread_mutexattr_settype && pthread_mutexattr_destroy && pthread_self) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ /* strdup() is not included into C99 which results in a compiler warning about * implicitly declared symbol. To avoid the issue strdup is implemented * manually. */ #define ITT_STRDUP_MAX_STRING_SIZE 4096 #define __itt_fstrdup(s, new_s) do { \ if (s != NULL) { \ size_t s_len = __itt_fstrnlen(s, ITT_STRDUP_MAX_STRING_SIZE); \ new_s = (char *)malloc(s_len + 1); \ if (new_s != NULL) { \ __itt_fstrcpyn(new_s, s_len + 1, s, s_len); \ } \ } \ } while(0) typedef enum { __itt_thread_normal = 0, __itt_thread_ignored = 1 } __itt_thread_state; #pragma pack(push, 8) typedef struct ___itt_thread_info { const char* nameA; /*!< Copy of original name in ASCII. */ #if defined(UNICODE) || defined(_UNICODE) const wchar_t* nameW; /*!< Copy of original name in UNICODE. 
*/ #else /* UNICODE || _UNICODE */ void* nameW; #endif /* UNICODE || _UNICODE */ TIDT tid; __itt_thread_state state; /*!< Thread state (paused or normal) */ int extra1; /*!< Reserved to the runtime */ void* extra2; /*!< Reserved to the runtime */ struct ___itt_thread_info* next; } __itt_thread_info; #include "ittnotify_types.h" /* For __itt_group_id definition */ typedef struct ___itt_api_info_20101001 { const char* name; void** func_ptr; void* init_func; __itt_group_id group; } __itt_api_info_20101001; typedef struct ___itt_api_info { const char* name; void** func_ptr; void* init_func; void* null_func; __itt_group_id group; } __itt_api_info; typedef struct __itt_counter_info { const char* nameA; /*!< Copy of original name in ASCII. */ #if defined(UNICODE) || defined(_UNICODE) const wchar_t* nameW; /*!< Copy of original name in UNICODE. */ #else /* UNICODE || _UNICODE */ void* nameW; #endif /* UNICODE || _UNICODE */ const char* domainA; /*!< Copy of original name in ASCII. */ #if defined(UNICODE) || defined(_UNICODE) const wchar_t* domainW; /*!< Copy of original name in UNICODE. 
*/ #else /* UNICODE || _UNICODE */ void* domainW; #endif /* UNICODE || _UNICODE */ int type; long index; int extra1; /*!< Reserved to the runtime */ void* extra2; /*!< Reserved to the runtime */ struct __itt_counter_info* next; } __itt_counter_info_t; struct ___itt_domain; struct ___itt_string_handle; struct ___itt_histogram; struct ___itt_counter_metadata; #include "ittnotify.h" typedef struct ___itt_global { unsigned char magic[8]; unsigned long version_major; unsigned long version_minor; unsigned long version_build; volatile long api_initialized; volatile long mutex_initialized; volatile long atomic_counter; mutex_t mutex; lib_t lib; void* error_handler; const char** dll_path_ptr; __itt_api_info* api_list_ptr; struct ___itt_global* next; /* Joinable structures below */ __itt_thread_info* thread_list; struct ___itt_domain* domain_list; struct ___itt_string_handle* string_list; __itt_collection_state state; __itt_counter_info_t* counter_list; unsigned int ipt_collect_events; struct ___itt_histogram* histogram_list; struct ___itt_counter_metadata* counter_metadata_list; } __itt_global; #pragma pack(pop) #define NEW_THREAD_INFO_W(gptr,h,h_tail,t,s,n) { \ h = (__itt_thread_info*)malloc(sizeof(__itt_thread_info)); \ if (h != NULL) { \ h->tid = t; \ h->nameA = NULL; \ h->nameW = n ? 
_wcsdup(n) : NULL; \ h->state = s; \ h->extra1 = 0; /* reserved */ \ h->extra2 = NULL; /* reserved */ \ h->next = NULL; \ if (h_tail == NULL) \ (gptr)->thread_list = h; \ else \ h_tail->next = h; \ } \ } #define NEW_THREAD_INFO_A(gptr,h,h_tail,t,s,n) { \ h = (__itt_thread_info*)malloc(sizeof(__itt_thread_info)); \ if (h != NULL) { \ h->tid = t; \ char *n_copy = NULL; \ __itt_fstrdup(n, n_copy); \ h->nameA = n_copy; \ h->nameW = NULL; \ h->state = s; \ h->extra1 = 0; /* reserved */ \ h->extra2 = NULL; /* reserved */ \ h->next = NULL; \ if (h_tail == NULL) \ (gptr)->thread_list = h; \ else \ h_tail->next = h; \ } \ } #define NEW_DOMAIN_W(gptr,h,h_tail,name) { \ h = (__itt_domain*)malloc(sizeof(__itt_domain)); \ if (h != NULL) { \ h->flags = 1; /* domain is enabled by default */ \ h->nameA = NULL; \ h->nameW = name ? _wcsdup(name) : NULL; \ h->extra1 = 0; /* reserved */ \ h->extra2 = NULL; /* reserved */ \ h->next = NULL; \ if (h_tail == NULL) \ (gptr)->domain_list = h; \ else \ h_tail->next = h; \ } \ } #define NEW_DOMAIN_A(gptr,h,h_tail,name) { \ h = (__itt_domain*)malloc(sizeof(__itt_domain)); \ if (h != NULL) { \ h->flags = 1; /* domain is enabled by default */ \ char *name_copy = NULL; \ __itt_fstrdup(name, name_copy); \ h->nameA = name_copy; \ h->nameW = NULL; \ h->extra1 = 0; /* reserved */ \ h->extra2 = NULL; /* reserved */ \ h->next = NULL; \ if (h_tail == NULL) \ (gptr)->domain_list = h; \ else \ h_tail->next = h; \ } \ } #define NEW_STRING_HANDLE_W(gptr,h,h_tail,name) { \ h = (__itt_string_handle*)malloc(sizeof(__itt_string_handle)); \ if (h != NULL) { \ h->strA = NULL; \ h->strW = name ? 
_wcsdup(name) : NULL; \ h->extra1 = 0; /* reserved */ \ h->extra2 = NULL; /* reserved */ \ h->next = NULL; \ if (h_tail == NULL) \ (gptr)->string_list = h; \ else \ h_tail->next = h; \ } \ } #define NEW_STRING_HANDLE_A(gptr,h,h_tail,name) { \ h = (__itt_string_handle*)malloc(sizeof(__itt_string_handle)); \ if (h != NULL) { \ char *name_copy = NULL; \ __itt_fstrdup(name, name_copy); \ h->strA = name_copy; \ h->strW = NULL; \ h->extra1 = 0; /* reserved */ \ h->extra2 = NULL; /* reserved */ \ h->next = NULL; \ if (h_tail == NULL) \ (gptr)->string_list = h; \ else \ h_tail->next = h; \ } \ } #define NEW_COUNTER_W(gptr,h,h_tail,name,domain,type) { \ h = (__itt_counter_info_t*)malloc(sizeof(__itt_counter_info_t)); \ if (h != NULL) { \ h->nameA = NULL; \ h->nameW = name ? _wcsdup(name) : NULL; \ h->domainA = NULL; \ h->domainW = domain ? _wcsdup(domain) : NULL; \ h->type = type; \ h->index = 0; \ h->next = NULL; \ if (h_tail == NULL) \ (gptr)->counter_list = h; \ else \ h_tail->next = h; \ } \ } #define NEW_COUNTER_A(gptr,h,h_tail,name,domain,type) { \ h = (__itt_counter_info_t*)malloc(sizeof(__itt_counter_info_t)); \ if (h != NULL) { \ char *name_copy = NULL; \ __itt_fstrdup(name, name_copy); \ h->nameA = name_copy; \ h->nameW = NULL; \ char *domain_copy = NULL; \ __itt_fstrdup(domain, domain_copy); \ h->domainA = domain_copy; \ h->domainW = NULL; \ h->type = type; \ h->index = 0; \ h->next = NULL; \ if (h_tail == NULL) \ (gptr)->counter_list = h; \ else \ h_tail->next = h; \ } \ } #define NEW_HISTOGRAM_W(gptr,h,h_tail,domain,name,x_type,y_type) { \ h = (__itt_histogram*)malloc(sizeof(__itt_histogram)); \ if (h != NULL) { \ h->domain = domain; \ h->nameA = NULL; \ h->nameW = name ? 
_wcsdup(name) : NULL; \ h->x_type = x_type; \ h->y_type = y_type; \ h->extra1 = 0; \ h->extra2 = NULL; \ h->next = NULL; \ if (h_tail == NULL) \ (gptr)->histogram_list = h; \ else \ h_tail->next = h; \ } \ } #define NEW_HISTOGRAM_A(gptr,h,h_tail,domain,name,x_type,y_type) { \ h = (__itt_histogram*)malloc(sizeof(__itt_histogram)); \ if (h != NULL) { \ h->domain = domain; \ char *name_copy = NULL; \ __itt_fstrdup(name, name_copy); \ h->nameA = name_copy; \ h->nameW = NULL; \ h->x_type = x_type; \ h->y_type = y_type; \ h->extra1 = 0; \ h->extra2 = NULL; \ h->next = NULL; \ if (h_tail == NULL) \ (gptr)->histogram_list = h; \ else \ h_tail->next = h; \ } \ } #define NEW_COUNTER_METADATA_NUM(gptr,h,h_tail,counter,type,value) { \ h = (__itt_counter_metadata*)malloc(sizeof(__itt_counter_metadata)); \ if (h != NULL) { \ h->counter = counter; \ h->type = type; \ h->str_valueA = NULL; \ h->str_valueW = NULL; \ h->value = value; \ h->extra1 = 0; \ h->extra2 = NULL; \ h->next = NULL; \ if (h_tail == NULL) \ (gptr)->counter_metadata_list = h; \ else \ h_tail->next = h; \ } \ } #define NEW_COUNTER_METADATA_STR_A(gptr,h,h_tail,counter,type,str_valueA) { \ h = (__itt_counter_metadata*)malloc(sizeof(__itt_counter_metadata)); \ if (h != NULL) { \ h->counter = counter; \ h->type = type; \ char *str_value_copy = NULL; \ __itt_fstrdup(str_valueA, str_value_copy); \ h->str_valueA = str_value_copy; \ h->str_valueW = NULL; \ h->value = 0; \ h->extra1 = 0; \ h->extra2 = NULL; \ h->next = NULL; \ if (h_tail == NULL) \ (gptr)->counter_metadata_list = h; \ else \ h_tail->next = h; \ } \ } #define NEW_COUNTER_METADATA_STR_W(gptr,h,h_tail,counter,type,str_valueW) { \ h = (__itt_counter_metadata*)malloc(sizeof(__itt_counter_metadata)); \ if (h != NULL) { \ h->counter = counter; \ h->type = type; \ h->str_valueA = NULL; \ h->str_valueW = str_valueW ? 
_wcsdup(str_valueW) : NULL; \ h->value = 0; \ h->extra1 = 0; \ h->extra2 = NULL; \ h->next = NULL; \ if (h_tail == NULL) \ (gptr)->counter_metadata_list = h; \ else \ h_tail->next = h; \ } \ } #endif /* _ITTNOTIFY_CONFIG_H_ */ ================================================ FILE: third-party/tbb/src/tbb/tools_api/ittnotify_static.c ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #define INTEL_NO_MACRO_BODY #define INTEL_ITTNOTIFY_API_PRIVATE #include "ittnotify_config.h" #if ITT_PLATFORM==ITT_PLATFORM_WIN #if !defined(PATH_MAX) #define PATH_MAX 512 #endif #else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ #include #include #include #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #include #include #include #include #include "ittnotify.h" #include "legacy/ittnotify.h" #include "disable_warnings.h" static const char api_version[] = API_VERSION "\0\n@(#) $Revision$\n"; #define _N_(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX,n) #ifndef HAS_CPP_ATTR #if defined(__cplusplus) && defined(__has_cpp_attribute) #define HAS_CPP_ATTR(X) __has_cpp_attribute(X) #else #define HAS_CPP_ATTR(X) 0 #endif #endif #ifndef HAS_C_ATTR #if defined(__STDC__) && defined(__has_c_attribute) #define HAS_C_ATTR(X) __has_c_attribute(X) #else #define HAS_C_ATTR(X) 0 #endif #endif #ifndef HAS_GNU_ATTR #if defined(__has_attribute) #define HAS_GNU_ATTR(X) __has_attribute(X) #else #define HAS_GNU_ATTR(X) 0 #endif #endif #ifndef ITT_ATTRIBUTE_FALLTHROUGH #if 
(HAS_CPP_ATTR(fallthrough) || HAS_C_ATTR(fallthrough)) && (__cplusplus >= 201703L || _MSVC_LANG >= 201703L) #define ITT_ATTRIBUTE_FALLTHROUGH [[fallthrough]] #elif HAS_CPP_ATTR(gnu::fallthrough) #define ITT_ATTRIBUTE_FALLTHROUGH [[gnu::fallthrough]] #elif HAS_CPP_ATTR(clang::fallthrough) #define ITT_ATTRIBUTE_FALLTHROUGH [[clang::fallthrough]] #elif HAS_GNU_ATTR(fallthrough) && !__INTEL_COMPILER #define ITT_ATTRIBUTE_FALLTHROUGH __attribute__((fallthrough)) #else #define ITT_ATTRIBUTE_FALLTHROUGH #endif #endif #if ITT_OS==ITT_OS_WIN static const char* ittnotify_lib_name = "libittnotify.dll"; #elif ITT_OS==ITT_OS_LINUX || ITT_OS==ITT_OS_FREEBSD|| ITT_OS==ITT_OS_OPENBSD static const char* ittnotify_lib_name = "libittnotify.so"; #elif ITT_OS==ITT_OS_MAC static const char* ittnotify_lib_name = "libittnotify.dylib"; #else #error Unsupported or unknown OS. #endif #ifdef __ANDROID__ #include #include #include #include #include #include #include #ifdef ITT_ANDROID_LOG #define ITT_ANDROID_LOG_TAG "INTEL_VTUNE_USERAPI" #define ITT_ANDROID_LOGI(...) ((void)__android_log_print(ANDROID_LOG_INFO, ITT_ANDROID_LOG_TAG, __VA_ARGS__)) #define ITT_ANDROID_LOGW(...) ((void)__android_log_print(ANDROID_LOG_WARN, ITT_ANDROID_LOG_TAG, __VA_ARGS__)) #define ITT_ANDROID_LOGE(...) ((void)__android_log_print(ANDROID_LOG_ERROR,ITT_ANDROID_LOG_TAG, __VA_ARGS__)) #define ITT_ANDROID_LOGD(...) ((void)__android_log_print(ANDROID_LOG_DEBUG,ITT_ANDROID_LOG_TAG, __VA_ARGS__)) #else #define ITT_ANDROID_LOGI(...) #define ITT_ANDROID_LOGW(...) #define ITT_ANDROID_LOGE(...) #define ITT_ANDROID_LOGD(...) 
#endif /* default location of userapi collector on Android */ #define ANDROID_ITTNOTIFY_DEFAULT_PATH_MASK(x) "/data/data/com.intel.vtune/perfrun/lib" \ #x "/runtime/libittnotify.so" #if ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_ARM #define ANDROID_ITTNOTIFY_DEFAULT_PATH ANDROID_ITTNOTIFY_DEFAULT_PATH_MASK(32) #else #define ANDROID_ITTNOTIFY_DEFAULT_PATH ANDROID_ITTNOTIFY_DEFAULT_PATH_MASK(64) #endif #endif #ifndef LIB_VAR_NAME #if ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_ARM #define LIB_VAR_NAME INTEL_LIBITTNOTIFY32 #else #define LIB_VAR_NAME INTEL_LIBITTNOTIFY64 #endif #endif /* LIB_VAR_NAME */ #define ITT_MUTEX_INIT_AND_LOCK(p) { \ if (PTHREAD_SYMBOLS) \ { \ if (!p.mutex_initialized) \ { \ if (__itt_interlocked_compare_exchange(&p.atomic_counter, 1, 0) == 0) \ { \ __itt_mutex_init(&p.mutex); \ p.mutex_initialized = 1; \ } \ else \ while (!p.mutex_initialized) \ __itt_thread_yield(); \ } \ __itt_mutex_lock(&p.mutex); \ } \ } #define ITT_MUTEX_DESTROY(p) { \ if (PTHREAD_SYMBOLS) \ { \ if (p.mutex_initialized) \ { \ if (__itt_interlocked_compare_exchange(&p.atomic_counter, 0, 1) == 1) \ { \ __itt_mutex_destroy(&p.mutex); \ p.mutex_initialized = 0; \ } \ } \ } \ } #define ITT_MODULE_OBJECT_VERSION 1 typedef int (__itt_init_ittlib_t)(const char*, __itt_group_id); /* this define used to control initialization function name. */ #ifndef __itt_init_ittlib_name ITT_EXTERN_C int _N_(init_ittlib)(const char*, __itt_group_id); static __itt_init_ittlib_t* __itt_init_ittlib_ptr = _N_(init_ittlib); #define __itt_init_ittlib_name __itt_init_ittlib_ptr #endif /* __itt_init_ittlib_name */ typedef void (__itt_fini_ittlib_t)(void); /* this define used to control finalization function name. 
*/ #ifndef __itt_fini_ittlib_name ITT_EXTERN_C void _N_(fini_ittlib)(void); static __itt_fini_ittlib_t* __itt_fini_ittlib_ptr = _N_(fini_ittlib); #define __itt_fini_ittlib_name __itt_fini_ittlib_ptr #endif /* __itt_fini_ittlib_name */ extern __itt_global _N_(_ittapi_global); /* building pointers to imported funcs */ #undef ITT_STUBV #undef ITT_STUB #define ITT_STUB(api,type,name,args,params,ptr,group,format) \ static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\ typedef type api ITT_JOIN(_N_(name),_t) args; \ ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END \ static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args \ { \ if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) \ __itt_init_ittlib_name(NULL, __itt_group_all); \ if (ITTNOTIFY_NAME(name) && ITTNOTIFY_NAME(name) != ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init))) \ return ITTNOTIFY_NAME(name) params; \ else \ return (type)0; \ } #define ITT_STUBV(api,type,name,args,params,ptr,group,format) \ static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\ typedef type api ITT_JOIN(_N_(name),_t) args; \ ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END \ static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args \ { \ if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) \ __itt_init_ittlib_name(NULL, __itt_group_all); \ if (ITTNOTIFY_NAME(name) && ITTNOTIFY_NAME(name) != ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init))) \ ITTNOTIFY_NAME(name) params; \ else \ return; \ } #undef __ITT_INTERNAL_INIT #include "ittnotify_static.h" #undef ITT_STUB #undef ITT_STUBV #define ITT_STUB(api,type,name,args,params,ptr,group,format) \ static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\ typedef type api ITT_JOIN(_N_(name),_t) args; \ ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* 
ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END #define ITT_STUBV(api,type,name,args,params,ptr,group,format) \ static type api ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)) args;\ typedef type api ITT_JOIN(_N_(name),_t) args; \ ITT_EXTERN_C_BEGIN ITT_JOIN(_N_(name),_t)* ITTNOTIFY_NAME(name) = ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)); ITT_EXTERN_C_END #define __ITT_INTERNAL_INIT #include "ittnotify_static.h" #undef __ITT_INTERNAL_INIT ITT_GROUP_LIST(group_list); #pragma pack(push, 8) typedef struct ___itt_group_alias { const char* env_var; __itt_group_id groups; } __itt_group_alias; static __itt_group_alias group_alias[] = { { "KMP_FOR_TPROFILE", (__itt_group_id)(__itt_group_control | __itt_group_thread | __itt_group_sync | __itt_group_mark) }, { "KMP_FOR_TCHECK", (__itt_group_id)(__itt_group_control | __itt_group_thread | __itt_group_sync | __itt_group_fsync | __itt_group_mark | __itt_group_suppress) }, { NULL, (__itt_group_none) }, { api_version, (__itt_group_none) } /* !!! Just to avoid unused code elimination !!! 
*/ }; #pragma pack(pop) #if ITT_PLATFORM==ITT_PLATFORM_WIN #if _MSC_VER #pragma warning(push) #pragma warning(disable: 4054) /* warning C4054: 'type cast' : from function pointer 'XXX' to data pointer 'void *' */ #endif #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ static __itt_api_info api_list[] = { /* Define functions with static implementation */ #undef ITT_STUB #undef ITT_STUBV #define ITT_STUB(api,type,name,args,params,nameindll,group,format) { ITT_TO_STR(ITT_JOIN(__itt_,nameindll)), (void**)(void*)&ITTNOTIFY_NAME(name), (void*)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)), (void*)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)), (__itt_group_id)(group)}, #define ITT_STUBV ITT_STUB #define __ITT_INTERNAL_INIT #include "ittnotify_static.h" #undef __ITT_INTERNAL_INIT /* Define functions without static implementation */ #undef ITT_STUB #undef ITT_STUBV #define ITT_STUB(api,type,name,args,params,nameindll,group,format) {ITT_TO_STR(ITT_JOIN(__itt_,nameindll)), (void**)(void*)&ITTNOTIFY_NAME(name), (void*)(size_t)&ITT_VERSIONIZE(ITT_JOIN(_N_(name),_init)), NULL, (__itt_group_id)(group)}, #define ITT_STUBV ITT_STUB #include "ittnotify_static.h" {NULL, NULL, NULL, NULL, __itt_group_none} }; #if ITT_PLATFORM==ITT_PLATFORM_WIN #if _MSC_VER #pragma warning(pop) #endif #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ /* static part descriptor which handles. all notification api attributes. 
*/ __itt_global _N_(_ittapi_global) = { ITT_MAGIC, /* identification info */ ITT_MAJOR, ITT_MINOR, API_VERSION_BUILD, /* version info */ 0, /* api_initialized */ 0, /* mutex_initialized */ 0, /* atomic_counter */ MUTEX_INITIALIZER, /* mutex */ NULL, /* dynamic library handle */ NULL, /* error_handler */ NULL, /* dll_path_ptr */ (__itt_api_info*)&api_list, /* api_list_ptr */ NULL, /* next __itt_global */ NULL, /* thread_list */ NULL, /* domain_list */ NULL, /* string_list */ __itt_collection_uninitialized, /* collection state */ NULL, /* counter_list */ 0, /* ipt_collect_events */ NULL, /* histogram_list */ NULL /* counter_metadata_list */ }; typedef void (__itt_api_init_t)(__itt_global*, __itt_group_id); typedef void (__itt_api_fini_t)(__itt_global*); static __itt_domain dummy_domain; /* ========================================================================= */ #ifdef ITT_NOTIFY_EXT_REPORT ITT_EXTERN_C void _N_(error_handler)(__itt_error_code, va_list args); #endif /* ITT_NOTIFY_EXT_REPORT */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #if _MSC_VER #pragma warning(push) #pragma warning(disable: 4055) /* warning C4055: 'type cast' : from data pointer 'void *' to function pointer 'XXX' */ #endif #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ static void __itt_report_error(int code, ...) 
{ va_list args; va_start(args, code); if (_N_(_ittapi_global).error_handler != NULL) { __itt_error_handler_t* handler = (__itt_error_handler_t*)(size_t)_N_(_ittapi_global).error_handler; handler((__itt_error_code)code, args); } #ifdef ITT_NOTIFY_EXT_REPORT _N_(error_handler)((__itt_error_code)code, args); #endif /* ITT_NOTIFY_EXT_REPORT */ va_end(args); } static int __itt_is_collector_available(void); #if ITT_PLATFORM==ITT_PLATFORM_WIN #if _MSC_VER #pragma warning(pop) #endif #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN static __itt_domain* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createW),_init))(const wchar_t* name) { __itt_domain *h_tail = NULL, *h = NULL; if (name == NULL) { return NULL; } ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); if (_N_(_ittapi_global).api_initialized) { if (ITTNOTIFY_NAME(domain_createW) && ITTNOTIFY_NAME(domain_createW) != ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createW),_init))) { __itt_mutex_unlock(&_N_(_ittapi_global).mutex); return ITTNOTIFY_NAME(domain_createW)(name); } else { __itt_mutex_unlock(&_N_(_ittapi_global).mutex); return &dummy_domain; } } if (__itt_is_collector_available()) { for (h_tail = NULL, h = _N_(_ittapi_global).domain_list; h != NULL; h_tail = h, h = h->next) { if (h->nameW != NULL && !wcscmp(h->nameW, name)) break; } if (h == NULL) { NEW_DOMAIN_W(&_N_(_ittapi_global), h, h_tail, name); } } if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); return h; } static __itt_domain* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createA),_init))(const char* name) #else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ static __itt_domain* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(domain_create),_init))(const char* name) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ { __itt_domain *h_tail = NULL, *h = NULL; if (name == NULL) { return NULL; } ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); if (_N_(_ittapi_global).api_initialized) { #if ITT_PLATFORM==ITT_PLATFORM_WIN if (ITTNOTIFY_NAME(domain_createA) && 
ITTNOTIFY_NAME(domain_createA) != ITT_VERSIONIZE(ITT_JOIN(_N_(domain_createA),_init))) { __itt_mutex_unlock(&_N_(_ittapi_global).mutex); return ITTNOTIFY_NAME(domain_createA)(name); } #else if (ITTNOTIFY_NAME(domain_create) && ITTNOTIFY_NAME(domain_create) != ITT_VERSIONIZE(ITT_JOIN(_N_(domain_create),_init))) { if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); return ITTNOTIFY_NAME(domain_create)(name); } #endif else { #if ITT_PLATFORM==ITT_PLATFORM_WIN __itt_mutex_unlock(&_N_(_ittapi_global).mutex); #else if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); #endif return &dummy_domain; } } if (__itt_is_collector_available()) { for (h_tail = NULL, h = _N_(_ittapi_global).domain_list; h != NULL; h_tail = h, h = h->next) { if (h->nameA != NULL && !__itt_fstrcmp(h->nameA, name)) break; } if (h == NULL) { NEW_DOMAIN_A(&_N_(_ittapi_global), h, h_tail, name); } } if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); return h; } static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(module_load_with_sections),_init))(__itt_module_object* module_obj) { if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) { __itt_init_ittlib_name(NULL, __itt_group_all); } if (ITTNOTIFY_NAME(module_load_with_sections) && ITTNOTIFY_NAME(module_load_with_sections) != ITT_VERSIONIZE(ITT_JOIN(_N_(module_load_with_sections),_init))) { if(module_obj != NULL) { module_obj->version = ITT_MODULE_OBJECT_VERSION; ITTNOTIFY_NAME(module_load_with_sections)(module_obj); } } } static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(module_unload_with_sections),_init))(__itt_module_object* module_obj) { if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) { __itt_init_ittlib_name(NULL, __itt_group_all); } if (ITTNOTIFY_NAME(module_unload_with_sections) && ITTNOTIFY_NAME(module_unload_with_sections) != ITT_VERSIONIZE(ITT_JOIN(_N_(module_unload_with_sections),_init))) { if(module_obj != NULL) { 
module_obj->version = ITT_MODULE_OBJECT_VERSION; ITTNOTIFY_NAME(module_unload_with_sections)(module_obj); } } } #if ITT_PLATFORM==ITT_PLATFORM_WIN static __itt_string_handle* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createW),_init))(const wchar_t* name) { __itt_string_handle *h_tail = NULL, *h = NULL; if (name == NULL) { return NULL; } ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); if (_N_(_ittapi_global).api_initialized) { if (ITTNOTIFY_NAME(string_handle_createW) && ITTNOTIFY_NAME(string_handle_createW) != ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createW),_init))) { __itt_mutex_unlock(&_N_(_ittapi_global).mutex); return ITTNOTIFY_NAME(string_handle_createW)(name); } else { __itt_mutex_unlock(&_N_(_ittapi_global).mutex); return NULL; } } if (__itt_is_collector_available()) { for (h_tail = NULL, h = _N_(_ittapi_global).string_list; h != NULL; h_tail = h, h = h->next) { if (h->strW != NULL && !wcscmp(h->strW, name)) break; } if (h == NULL) { NEW_STRING_HANDLE_W(&_N_(_ittapi_global), h, h_tail, name); } } __itt_mutex_unlock(&_N_(_ittapi_global).mutex); return h; } static __itt_string_handle* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createA),_init))(const char* name) #else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ static __itt_string_handle* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_create),_init))(const char* name) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ { __itt_string_handle *h_tail = NULL, *h = NULL; if (name == NULL) { return NULL; } ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); if (_N_(_ittapi_global).api_initialized) { #if ITT_PLATFORM==ITT_PLATFORM_WIN if (ITTNOTIFY_NAME(string_handle_createA) && ITTNOTIFY_NAME(string_handle_createA) != ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_createA),_init))) { __itt_mutex_unlock(&_N_(_ittapi_global).mutex); return ITTNOTIFY_NAME(string_handle_createA)(name); } #else if (ITTNOTIFY_NAME(string_handle_create) && ITTNOTIFY_NAME(string_handle_create) != 
ITT_VERSIONIZE(ITT_JOIN(_N_(string_handle_create),_init))) { if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); return ITTNOTIFY_NAME(string_handle_create)(name); } #endif else { #if ITT_PLATFORM==ITT_PLATFORM_WIN __itt_mutex_unlock(&_N_(_ittapi_global).mutex); #else if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); #endif return NULL; } } if (__itt_is_collector_available()) { for (h_tail = NULL, h = _N_(_ittapi_global).string_list; h != NULL; h_tail = h, h = h->next) { if (h->strA != NULL && !__itt_fstrcmp(h->strA, name)) break; } if (h == NULL) { NEW_STRING_HANDLE_A(&_N_(_ittapi_global), h, h_tail, name); } } if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); return h; } #if ITT_PLATFORM==ITT_PLATFORM_WIN static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createW),_init))(const wchar_t *name, const wchar_t *domain) { __itt_counter_info_t *h_tail = NULL, *h = NULL; __itt_metadata_type type = __itt_metadata_u64; if (name == NULL) { return NULL; } ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); if (_N_(_ittapi_global).api_initialized) { if (ITTNOTIFY_NAME(counter_createW) && ITTNOTIFY_NAME(counter_createW) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createW),_init))) { __itt_mutex_unlock(&_N_(_ittapi_global).mutex); return ITTNOTIFY_NAME(counter_createW)(name, domain); } else { __itt_mutex_unlock(&_N_(_ittapi_global).mutex); return NULL; } } if (__itt_is_collector_available()) { for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next) { if (h->nameW != NULL && h->type == (int)type && !wcscmp(h->nameW, name) && ((h->domainW == NULL && domain == NULL) || (h->domainW != NULL && domain != NULL && !wcscmp(h->domainW, domain)))) break; } if (h == NULL) { NEW_COUNTER_W(&_N_(_ittapi_global), h, h_tail, name, domain, type); } } __itt_mutex_unlock(&_N_(_ittapi_global).mutex); return (__itt_counter)h; } static __itt_counter ITTAPI 
ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createA),_init))(const char *name, const char *domain)
#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create),_init))(const char *name, const char *domain)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
{
    /* Init-time stub for counter_create (narrow-character name and domain).
     * The counter type is fixed to __itt_metadata_u64. Returns NULL for a
     * NULL name. When the API is initialized, forwards to the real
     * implementation if one is bound (else returns NULL). Otherwise, if a
     * collector is available, reuses the counter_list entry with the same
     * type/name/domain or creates one through NEW_COUNTER_A. */
    __itt_counter_info_t *h_tail = NULL, *h = NULL;
    __itt_metadata_type type = __itt_metadata_u64;

    if (name == NULL) {
        return NULL;
    }

    ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
    if (_N_(_ittapi_global).api_initialized) {
#if ITT_PLATFORM==ITT_PLATFORM_WIN
        if (ITTNOTIFY_NAME(counter_createA) && ITTNOTIFY_NAME(counter_createA) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createA),_init))) {
            __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
            return ITTNOTIFY_NAME(counter_createA)(name, domain);
        }
#else
        if (ITTNOTIFY_NAME(counter_create) && ITTNOTIFY_NAME(counter_create) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create),_init))) {
            if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
            return ITTNOTIFY_NAME(counter_create)(name, domain);
        }
#endif
        else {
#if ITT_PLATFORM==ITT_PLATFORM_WIN
            __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
#else
            if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
#endif
            return NULL;
        }
    }
    if (__itt_is_collector_available()) {
        /* Two NULL domains are treated as equal. */
        for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next) {
            if (h->nameA != NULL && h->type == (int)type && !__itt_fstrcmp(h->nameA, name) && ((h->domainA == NULL && domain == NULL) || (h->domainA != NULL && domain != NULL && !__itt_fstrcmp(h->domainA, domain)))) break;
        }
        if (h == NULL) {
            NEW_COUNTER_A(&_N_(_ittapi_global), h, h_tail, name, domain, type);
        }
    }
    if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
    return (__itt_counter)h;
}

#if ITT_PLATFORM==ITT_PLATFORM_WIN
/* Init-time stub for counter_create_typedW: same flow as counter_createW,
 * except that the counter type is supplied by the caller and takes part in
 * the duplicate lookup. */
static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedW),_init))(const wchar_t *name, const wchar_t *domain, __itt_metadata_type type)
{
    __itt_counter_info_t *h_tail = NULL, *h = NULL;

    if (name == NULL) {
        return NULL;
    }

    ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
    if (_N_(_ittapi_global).api_initialized) {
        if (ITTNOTIFY_NAME(counter_create_typedW) && ITTNOTIFY_NAME(counter_create_typedW) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedW),_init))) {
            __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
            return ITTNOTIFY_NAME(counter_create_typedW)(name, domain, type);
        } else {
            __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
            return NULL;
        }
    }
    if (__itt_is_collector_available()) {
        for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next) {
            if (h->nameW != NULL && h->type == (int)type && !wcscmp(h->nameW, name) && ((h->domainW == NULL && domain == NULL) || (h->domainW != NULL && domain != NULL && !wcscmp(h->domainW, domain)))) break;
        }
        if (h == NULL) {
            NEW_COUNTER_W(&_N_(_ittapi_global), h, h_tail, name, domain, type);
        }
    }
    __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
    return (__itt_counter)h;
}

static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedA),_init))(const char *name, const char *domain, __itt_metadata_type type)
#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typed),_init))(const char *name, const char *domain, __itt_metadata_type type)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
{
    /* Narrow-character typed counter stub: returns NULL for a NULL name,
     * forwards to the real implementation once the API is initialized, and
     * otherwise finds or creates (NEW_COUNTER_A) the counter_list entry
     * matching type, name and domain. */
    __itt_counter_info_t *h_tail = NULL, *h = NULL;

    if (name == NULL) {
        return NULL;
    }

    ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
    if (_N_(_ittapi_global).api_initialized) {
#if ITT_PLATFORM==ITT_PLATFORM_WIN
        if (ITTNOTIFY_NAME(counter_create_typedA) && ITTNOTIFY_NAME(counter_create_typedA) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typedA),_init))) {
            __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
            return ITTNOTIFY_NAME(counter_create_typedA)(name, domain, type);
        }
#else
        if (ITTNOTIFY_NAME(counter_create_typed) && ITTNOTIFY_NAME(counter_create_typed) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_typed),_init))) {
            if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
            return ITTNOTIFY_NAME(counter_create_typed)(name, domain, type);
        }
#endif
        else {
#if ITT_PLATFORM==ITT_PLATFORM_WIN
            __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
#else
            if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
#endif
            return NULL;
        }
    }
    if (__itt_is_collector_available()) {
        for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next) {
            if (h->nameA != NULL && h->type == (int)type && !__itt_fstrcmp(h->nameA, name) && ((h->domainA == NULL && domain == NULL) || (h->domainA != NULL && domain != NULL && !__itt_fstrcmp(h->domainA, domain)))) break;
        }
        if (h == NULL) {
            NEW_COUNTER_A(&_N_(_ittapi_global), h, h_tail, name, domain, type);
        }
    }
    if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
    return (__itt_counter)h;
}

#if ITT_PLATFORM==ITT_PLATFORM_WIN
/* Init-time stub for histogram_createW.
 *
 * Returns NULL when domain or name is NULL. Forwards to the real
 * implementation once the API is initialized (NULL if none is bound).
 * Otherwise, if a collector is available, reuses the histogram_list entry
 * with the same domain pointer and wide name, or creates one through
 * NEW_HISTOGRAM_W. Entries whose domain is NULL are skipped in the search.
 */
static __itt_histogram* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(histogram_createW),_init))(const __itt_domain* domain, const wchar_t* name, __itt_metadata_type x_type, __itt_metadata_type y_type)
{
    __itt_histogram *h_tail = NULL, *h = NULL;

    if (domain == NULL || name == NULL) {
        return NULL;
    }

    ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
    if (_N_(_ittapi_global).api_initialized) {
        if (ITTNOTIFY_NAME(histogram_createW) && ITTNOTIFY_NAME(histogram_createW) != ITT_VERSIONIZE(ITT_JOIN(_N_(histogram_createW),_init))) {
            __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
            return ITTNOTIFY_NAME(histogram_createW)(domain, name, x_type, y_type);
        } else {
            __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
            return NULL;
        }
    }
    if (__itt_is_collector_available()) {
        for (h_tail = NULL, h = _N_(_ittapi_global).histogram_list; h != NULL; h_tail = h, h = h->next) {
            if (h->domain == NULL) continue;
            else if (h->domain == domain && h->nameW != NULL && !wcscmp(h->nameW, name)) break;
        }
        if (h == NULL) {
            NEW_HISTOGRAM_W(&_N_(_ittapi_global), h, h_tail, domain, name, x_type, y_type);
        }
    }
    __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
    return (__itt_histogram*)h;
}

static __itt_histogram* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(histogram_createA),_init))(const __itt_domain* domain, const char* name, __itt_metadata_type x_type, __itt_metadata_type y_type)
#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
static __itt_histogram* ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(histogram_create),_init))(const __itt_domain* domain, const char* name, __itt_metadata_type x_type, __itt_metadata_type y_type)
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
{
    /* Narrow-character histogram stub: same flow as the wide variant, but
     * matching on nameA with __itt_fstrcmp and creating entries through
     * NEW_HISTOGRAM_A. */
    __itt_histogram *h_tail = NULL, *h = NULL;

    if (domain == NULL || name == NULL) {
        return NULL;
    }

    ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global));
    if (_N_(_ittapi_global).api_initialized) {
#if ITT_PLATFORM==ITT_PLATFORM_WIN
        if (ITTNOTIFY_NAME(histogram_createA) && ITTNOTIFY_NAME(histogram_createA) != ITT_VERSIONIZE(ITT_JOIN(_N_(histogram_createA),_init))) {
            __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
            return ITTNOTIFY_NAME(histogram_createA)(domain, name, x_type, y_type);
        }
#else
        if (ITTNOTIFY_NAME(histogram_create) && ITTNOTIFY_NAME(histogram_create) != ITT_VERSIONIZE(ITT_JOIN(_N_(histogram_create),_init))) {
            if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
            return ITTNOTIFY_NAME(histogram_create)(domain, name, x_type, y_type);
        }
#endif
        else {
#if ITT_PLATFORM==ITT_PLATFORM_WIN
            __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
#else
            if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
#endif
            return NULL;
        }
    }
    if (__itt_is_collector_available()) {
        for (h_tail = NULL, h = _N_(_ittapi_global).histogram_list; h != NULL; h_tail = h, h = h->next) {
            if (h->domain == NULL) continue;
            else if (h->domain == domain && h->nameA != NULL && !__itt_fstrcmp(h->nameA, name)) break;
        }
        if (h == NULL) {
            NEW_HISTOGRAM_A(&_N_(_ittapi_global), h, h_tail, domain, name, x_type, y_type);
        }
    }
    if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
    return (__itt_histogram*)h;
}

#if ITT_PLATFORM==ITT_PLATFORM_WIN
static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createW_v3),_init))(const __itt_domain* domain, const wchar_t* name, __itt_metadata_type type) { __itt_counter_info_t *h_tail = NULL, *h = NULL; if (name == NULL || domain == NULL) { return NULL; } ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); if (_N_(_ittapi_global).api_initialized) { if (ITTNOTIFY_NAME(counter_createW_v3) && ITTNOTIFY_NAME(counter_createW_v3) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createW_v3),_init))) { __itt_mutex_unlock(&_N_(_ittapi_global).mutex); return ITTNOTIFY_NAME(counter_createW_v3)(domain, name, type); } else { __itt_mutex_unlock(&_N_(_ittapi_global).mutex); return NULL; } } if (__itt_is_collector_available()) { for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next) { if (h->nameW != NULL && h->type == (int)type && !wcscmp(h->nameW, name) && ((h->domainW == NULL && domain->nameW == NULL) || (h->domainW != NULL && domain->nameW != NULL && !wcscmp(h->domainW, domain->nameW)))) break; } if (h == NULL) { NEW_COUNTER_W(&_N_(_ittapi_global),h,h_tail,name,domain->nameW,type); } } __itt_mutex_unlock(&_N_(_ittapi_global).mutex); return (__itt_counter)h; } static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createA_v3),_init))(const __itt_domain* domain, const char* name, __itt_metadata_type type) #else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ static __itt_counter ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_v3),_init))(const __itt_domain* domain, const char* name, __itt_metadata_type type) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ { __itt_counter_info_t *h_tail = NULL, *h = NULL; if (name == NULL || domain == NULL) { return NULL; } ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); if (_N_(_ittapi_global).api_initialized) { #if ITT_PLATFORM==ITT_PLATFORM_WIN if (ITTNOTIFY_NAME(counter_createA_v3) && ITTNOTIFY_NAME(counter_createA_v3) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_createA_v3),_init))) { 
__itt_mutex_unlock(&_N_(_ittapi_global).mutex); return ITTNOTIFY_NAME(counter_createA_v3)(domain, name, type); } #else if (ITTNOTIFY_NAME(counter_create_v3) && ITTNOTIFY_NAME(counter_create_v3) != ITT_VERSIONIZE(ITT_JOIN(_N_(counter_create_v3),_init))) { if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); return ITTNOTIFY_NAME(counter_create_v3)(domain, name, type); } #endif else { #if ITT_PLATFORM==ITT_PLATFORM_WIN __itt_mutex_unlock(&_N_(_ittapi_global).mutex); #else if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); #endif return NULL; } } if (__itt_is_collector_available()) { for (h_tail = NULL, h = _N_(_ittapi_global).counter_list; h != NULL; h_tail = h, h = h->next) { if (h->nameA != NULL && h->type == (int)type && !__itt_fstrcmp(h->nameA, name) && ((h->domainA == NULL && domain->nameA == NULL) || (h->domainA != NULL && domain->nameA != NULL && !__itt_fstrcmp(h->domainA, domain->nameA)))) break; } if (h == NULL) { NEW_COUNTER_A(&_N_(_ittapi_global),h,h_tail,name,domain->nameA,type); } } if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); return (__itt_counter)h; } static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(bind_context_metadata_to_counter),_init))(__itt_counter counter, size_t length, __itt_context_metadata* metadata) { __itt_counter_metadata *h_tail = NULL, *h = NULL; if (counter == NULL || length == 0 || metadata == NULL) { return; } ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); if (_N_(_ittapi_global).api_initialized) { if (ITTNOTIFY_NAME(bind_context_metadata_to_counter) && ITTNOTIFY_NAME(bind_context_metadata_to_counter) != ITT_VERSIONIZE(ITT_JOIN(_N_(bind_context_metadata_to_counter),_init))) { if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); ITTNOTIFY_NAME(bind_context_metadata_to_counter)(counter, length, metadata); } else { #if ITT_PLATFORM==ITT_PLATFORM_WIN __itt_mutex_unlock(&_N_(_ittapi_global).mutex); #else if (PTHREAD_SYMBOLS) 
__itt_mutex_unlock(&_N_(_ittapi_global).mutex); #endif return; } } if (__itt_is_collector_available()) { size_t item; char* str_valueA = NULL; #if ITT_PLATFORM==ITT_PLATFORM_WIN wchar_t* str_valueW = NULL; #endif unsigned long long value = 0; __itt_context_type type = __itt_context_unknown; for (item = 0; item < length; item++) { type = metadata[item].type; for (h_tail = NULL, h = _N_(_ittapi_global).counter_metadata_list; h != NULL; h_tail = h, h = h->next) { if (h->counter != NULL && h->counter == counter && h->type == type) break; } if (h == NULL && counter != NULL && type != __itt_context_unknown) { if (type == __itt_context_nameA || type == __itt_context_deviceA || type == __itt_context_unitsA || type == __itt_context_pci_addrA) { str_valueA = (char*)(metadata[item].value); NEW_COUNTER_METADATA_STR_A(&_N_(_ittapi_global),h,h_tail,counter,type,str_valueA); } #if ITT_PLATFORM==ITT_PLATFORM_WIN else if (type == __itt_context_nameW || type == __itt_context_deviceW || type == __itt_context_unitsW || type == __itt_context_pci_addrW) { str_valueW = (wchar_t*)(metadata[item].value); NEW_COUNTER_METADATA_STR_W(&_N_(_ittapi_global),h,h_tail,counter,type,str_valueW); } #endif else if (type >= __itt_context_tid && type <= __itt_context_cpu_cycles_flag) { value = *(unsigned long long*)(metadata[item].value); NEW_COUNTER_METADATA_NUM(&_N_(_ittapi_global),h,h_tail,counter,type,value); } } } } if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); } /* -------------------------------------------------------------------------- */ static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(pause),_init))(void) { if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) { __itt_init_ittlib_name(NULL, __itt_group_all); } if (ITTNOTIFY_NAME(pause) && ITTNOTIFY_NAME(pause) != ITT_VERSIONIZE(ITT_JOIN(_N_(pause),_init))) { ITTNOTIFY_NAME(pause)(); } } static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(resume),_init))(void) { if 
(!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) { __itt_init_ittlib_name(NULL, __itt_group_all); } if (ITTNOTIFY_NAME(resume) && ITTNOTIFY_NAME(resume) != ITT_VERSIONIZE(ITT_JOIN(_N_(resume),_init))) { ITTNOTIFY_NAME(resume)(); } } static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(pause_scoped),_init))(__itt_collection_scope scope) { if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) { __itt_init_ittlib_name(NULL, __itt_group_all); } if (ITTNOTIFY_NAME(pause_scoped) && ITTNOTIFY_NAME(pause_scoped) != ITT_VERSIONIZE(ITT_JOIN(_N_(pause_scoped),_init))) { ITTNOTIFY_NAME(pause_scoped)(scope); } } static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(resume_scoped),_init))(__itt_collection_scope scope) { if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) { __itt_init_ittlib_name(NULL, __itt_group_all); } if (ITTNOTIFY_NAME(resume_scoped) && ITTNOTIFY_NAME(resume_scoped) != ITT_VERSIONIZE(ITT_JOIN(_N_(resume_scoped),_init))) { ITTNOTIFY_NAME(resume_scoped)(scope); } } #if ITT_PLATFORM==ITT_PLATFORM_WIN static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),_init))(const wchar_t* name) { if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) { __itt_init_ittlib_name(NULL, __itt_group_all); } if (ITTNOTIFY_NAME(thread_set_nameW) && ITTNOTIFY_NAME(thread_set_nameW) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),_init))) { ITTNOTIFY_NAME(thread_set_nameW)(name); } } static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_setW),_init))(const wchar_t* name, int namelen) { (void)namelen; ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameW),_init))(name); return 0; } static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA),_init))(const char* name) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name),_init))(const char* name) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ { if 
(!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) { __itt_init_ittlib_name(NULL, __itt_group_all); } #if ITT_PLATFORM==ITT_PLATFORM_WIN if (ITTNOTIFY_NAME(thread_set_nameA) && ITTNOTIFY_NAME(thread_set_nameA) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA),_init))) { ITTNOTIFY_NAME(thread_set_nameA)(name); } #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ if (ITTNOTIFY_NAME(thread_set_name) && ITTNOTIFY_NAME(thread_set_name) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name),_init))) { ITTNOTIFY_NAME(thread_set_name)(name); } #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ } #if ITT_PLATFORM==ITT_PLATFORM_WIN static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_setA),_init))(const char* name, int namelen) { (void)namelen; ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_nameA),_init))(name); return 0; } #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ static int ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_name_set),_init))(const char* name, int namelen) { (void)namelen; ITT_VERSIONIZE(ITT_JOIN(_N_(thread_set_name),_init))(name); return 0; } #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore),_init))(void) { if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) { __itt_init_ittlib_name(NULL, __itt_group_all); } if (ITTNOTIFY_NAME(thread_ignore) && ITTNOTIFY_NAME(thread_ignore) != ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore),_init))) { ITTNOTIFY_NAME(thread_ignore)(); } } static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(thr_ignore),_init))(void) { ITT_VERSIONIZE(ITT_JOIN(_N_(thread_ignore),_init))(); } static void ITTAPI ITT_VERSIONIZE(ITT_JOIN(_N_(enable_attach),_init))(void) { #ifdef __ANDROID__ /* * if LIB_VAR_NAME env variable were set before then stay previous value * else set default path */ setenv(ITT_TO_STR(LIB_VAR_NAME), ANDROID_ITTNOTIFY_DEFAULT_PATH, 0); #endif } /* -------------------------------------------------------------------------- */ static const char* 
__itt_fsplit(const char* s, const char* sep, const char** out, int* len) { int i; int j; if (!s || !sep || !out || !len) return NULL; for (i = 0; s[i]; i++) { int b = 0; for (j = 0; sep[j]; j++) if (s[i] == sep[j]) { b = 1; break; } if (!b) break; } if (!s[i]) return NULL; *len = 0; *out = &s[i]; for (; s[i]; i++, (*len)++) { int b = 0; for (j = 0; sep[j]; j++) if (s[i] == sep[j]) { b = 1; break; } if (b) break; } for (; s[i]; i++) { int b = 0; for (j = 0; sep[j]; j++) if (s[i] == sep[j]) { b = 1; break; } if (!b) break; } return &s[i]; } /* This function return value of env variable that placed into static buffer. * !!! The same static buffer is used for subsequent calls. !!! * This was done to avoid dynamic allocation for few calls. * Actually we need this function only four times. */ static const char* __itt_get_env_var(const char* name) { #define MAX_ENV_VALUE_SIZE 4086 static char env_buff[MAX_ENV_VALUE_SIZE]; static char* env_value = (char*)env_buff; if (name != NULL) { #if ITT_PLATFORM==ITT_PLATFORM_WIN size_t max_len = MAX_ENV_VALUE_SIZE - (size_t)(env_value - env_buff); DWORD rc = GetEnvironmentVariableA(name, env_value, (DWORD)max_len); if (rc >= max_len) __itt_report_error(__itt_error_env_too_long, name, (size_t)rc - 1, (size_t)(max_len - 1)); else if (rc > 0) { const char* ret = (const char*)env_value; env_value += rc + 1; return ret; } else { /* If environment variable is empty, GetEnvironmentVariables() * returns zero (number of characters (not including terminating null), * and GetLastError() returns ERROR_SUCCESS. 
*/ DWORD err = GetLastError(); if (err == ERROR_SUCCESS) return env_value; if (err != ERROR_ENVVAR_NOT_FOUND) __itt_report_error(__itt_error_cant_read_env, name, (int)err); } #else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ char* env = getenv(name); if (env != NULL) { size_t len = __itt_fstrnlen(env, MAX_ENV_VALUE_SIZE); size_t max_len = MAX_ENV_VALUE_SIZE - (size_t)(env_value - env_buff); if (len < max_len) { const char* ret = (const char*)env_value; __itt_fstrcpyn(env_value, max_len, env, len + 1); env_value += len + 1; return ret; } else __itt_report_error(__itt_error_env_too_long, name, (size_t)len, (size_t)(max_len - 1)); } #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ } return NULL; } static const char* __itt_get_lib_name(void) { const char* lib_name = __itt_get_env_var(ITT_TO_STR(LIB_VAR_NAME)); #ifdef __ANDROID__ if (lib_name == NULL) { #if ITT_ARCH==ITT_ARCH_IA32 || ITT_ARCH==ITT_ARCH_ARM const char* const marker_filename = "com.intel.itt.collector_lib_32"; #else const char* const marker_filename = "com.intel.itt.collector_lib_64"; #endif char system_wide_marker_filename[PATH_MAX] = {0}; int itt_marker_file_fd = -1; ssize_t res = 0; res = snprintf(system_wide_marker_filename, PATH_MAX - 1, "%s%s", "/data/local/tmp/", marker_filename); if (res < 0) { ITT_ANDROID_LOGE("Unable to concatenate marker file string."); return lib_name; } itt_marker_file_fd = open(system_wide_marker_filename, O_RDONLY); if (itt_marker_file_fd == -1) { const pid_t my_pid = getpid(); char cmdline_path[PATH_MAX] = {0}; char package_name[PATH_MAX] = {0}; char app_sandbox_file[PATH_MAX] = {0}; int cmdline_fd = 0; ITT_ANDROID_LOGI("Unable to open system-wide marker file."); res = snprintf(cmdline_path, PATH_MAX - 1, "/proc/%d/cmdline", my_pid); if (res < 0) { ITT_ANDROID_LOGE("Unable to get cmdline path string."); return lib_name; } ITT_ANDROID_LOGI("CMD file: %s\n", cmdline_path); cmdline_fd = open(cmdline_path, O_RDONLY); if (cmdline_fd == -1) { ITT_ANDROID_LOGE("Unable to open %s file!", 
cmdline_path); return lib_name; } res = read(cmdline_fd, package_name, PATH_MAX - 1); if (res == -1) { ITT_ANDROID_LOGE("Unable to read %s file!", cmdline_path); res = close(cmdline_fd); if (res == -1) { ITT_ANDROID_LOGE("Unable to close %s file!", cmdline_path); } return lib_name; } res = close(cmdline_fd); if (res == -1) { ITT_ANDROID_LOGE("Unable to close %s file!", cmdline_path); return lib_name; } ITT_ANDROID_LOGI("Package name: %s\n", package_name); res = snprintf(app_sandbox_file, PATH_MAX - 1, "/data/data/%s/%s", package_name, marker_filename); if (res < 0) { ITT_ANDROID_LOGE("Unable to concatenate marker file string."); return lib_name; } ITT_ANDROID_LOGI("Lib marker file name: %s\n", app_sandbox_file); itt_marker_file_fd = open(app_sandbox_file, O_RDONLY); if (itt_marker_file_fd == -1) { ITT_ANDROID_LOGE("Unable to open app marker file!"); return lib_name; } } { char itt_lib_name[PATH_MAX] = {0}; res = read(itt_marker_file_fd, itt_lib_name, PATH_MAX - 1); if (res == -1) { ITT_ANDROID_LOGE("Unable to read %s file!", itt_marker_file_fd); res = close(itt_marker_file_fd); if (res == -1) { ITT_ANDROID_LOGE("Unable to close %s file!", itt_marker_file_fd); } return lib_name; } ITT_ANDROID_LOGI("ITT Lib path: %s", itt_lib_name); res = close(itt_marker_file_fd); if (res == -1) { ITT_ANDROID_LOGE("Unable to close %s file!", itt_marker_file_fd); return lib_name; } ITT_ANDROID_LOGI("Set env %s to %s", ITT_TO_STR(LIB_VAR_NAME), itt_lib_name); res = setenv(ITT_TO_STR(LIB_VAR_NAME), itt_lib_name, 0); if (res == -1) { ITT_ANDROID_LOGE("Unable to set env var!"); return lib_name; } lib_name = __itt_get_env_var(ITT_TO_STR(LIB_VAR_NAME)); ITT_ANDROID_LOGI("ITT Lib path from env: %s", lib_name); } } #endif return lib_name; } /* Avoid clashes with std::min */ #define __itt_min(a,b) ((a) < (b) ? 
(a) : (b)) static __itt_group_id __itt_get_groups(void) { int i; __itt_group_id res = __itt_group_none; const char* var_name = "INTEL_ITTNOTIFY_GROUPS"; const char* group_str = __itt_get_env_var(var_name); if (group_str != NULL) { int len; char gr[255]; const char* chunk; while ((group_str = __itt_fsplit(group_str, ",; ", &chunk, &len)) != NULL) { int min_len = __itt_min(len, (int)(sizeof(gr) - 1)); __itt_fstrcpyn(gr, sizeof(gr) - 1, chunk, min_len); gr[min_len] = 0; for (i = 0; group_list[i].name != NULL; i++) { if (!__itt_fstrcmp(gr, group_list[i].name)) { res = (__itt_group_id)(res | group_list[i].id); break; } } } /* TODO: !!! Workaround for bug with warning for unknown group !!! * Should be fixed in new initialization scheme. * Now the following groups should be set always. */ for (i = 0; group_list[i].id != __itt_group_none; i++) if (group_list[i].id != __itt_group_all && group_list[i].id > __itt_group_splitter_min && group_list[i].id < __itt_group_splitter_max) res = (__itt_group_id)(res | group_list[i].id); return res; } else { for (i = 0; group_alias[i].env_var != NULL; i++) if (__itt_get_env_var(group_alias[i].env_var) != NULL) return group_alias[i].groups; } return res; } #undef __itt_min static int __itt_lib_version(lib_t lib) { if (lib == NULL) return 0; if (__itt_get_proc(lib, "__itt_api_init")) return 2; if (__itt_get_proc(lib, "__itt_api_version")) return 1; return 0; } /* It's not used right now! Comment it out to avoid warnings. 
static void __itt_reinit_all_pointers(void) { register int i; // Fill all pointers with initial stubs for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++) *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].init_func; } */ static void __itt_nullify_all_pointers(void) { int i; /* Nulify all pointers except domain_create, string_handle_create and counter_create */ for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++) *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func; } static int __itt_is_collector_available(void) { int is_available; ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); if (_N_(_ittapi_global).state == __itt_collection_uninitialized) { _N_(_ittapi_global).state = (NULL == __itt_get_lib_name()) ? __itt_collection_collector_absent : __itt_collection_collector_exists; } is_available = (_N_(_ittapi_global).state == __itt_collection_collector_exists || _N_(_ittapi_global).state == __itt_collection_init_successful); __itt_mutex_unlock(&_N_(_ittapi_global).mutex); return is_available; } #if ITT_PLATFORM==ITT_PLATFORM_WIN #if _MSC_VER #pragma warning(push) #pragma warning(disable: 4054) /* warning C4054: 'type cast' : from function pointer 'XXX' to data pointer 'void *' */ #pragma warning(disable: 4055) /* warning C4055: 'type cast' : from data pointer 'void *' to function pointer 'XXX' */ #endif #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_EXTERN_C void _N_(fini_ittlib)(void) { __itt_api_fini_t* __itt_api_fini_ptr = NULL; static volatile TIDT current_thread = 0; if (_N_(_ittapi_global).api_initialized) { ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); if (_N_(_ittapi_global).api_initialized) { if (current_thread == 0) { if (PTHREAD_SYMBOLS) current_thread = __itt_thread_id(); if (_N_(_ittapi_global).lib != NULL) { __itt_api_fini_ptr = (__itt_api_fini_t*)(size_t)__itt_get_proc(_N_(_ittapi_global).lib, "__itt_api_fini"); } if (__itt_api_fini_ptr) { 
__itt_api_fini_ptr(&_N_(_ittapi_global)); } __itt_nullify_all_pointers(); /* TODO: !!! not safe !!! don't support unload so far. * if (_N_(_ittapi_global).lib != NULL) * __itt_unload_lib(_N_(_ittapi_global).lib); * _N_(_ittapi_global).lib = NULL; */ _N_(_ittapi_global).api_initialized = 0; current_thread = 0; } } if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); } } /* !!! this function should be called under mutex lock !!! */ static void __itt_free_allocated_resources(void) { __itt_string_handle* current_string = _N_(_ittapi_global).string_list; while (current_string != NULL) { __itt_string_handle* tmp = current_string->next; free((char*)current_string->strA); #if ITT_PLATFORM==ITT_PLATFORM_WIN free((wchar_t*)current_string->strW); #endif free(current_string); current_string = tmp; } _N_(_ittapi_global).string_list = NULL; __itt_domain* current_domain = _N_(_ittapi_global).domain_list; while (current_domain != NULL) { __itt_domain* tmp = current_domain->next; free((char*)current_domain->nameA); #if ITT_PLATFORM==ITT_PLATFORM_WIN free((wchar_t*)current_domain->nameW); #endif free(current_domain); current_domain = tmp; } _N_(_ittapi_global).domain_list = NULL; __itt_counter_info_t* current_couter = _N_(_ittapi_global).counter_list; while (current_couter != NULL) { __itt_counter_info_t* tmp = current_couter->next; free((char*)current_couter->nameA); free((char*)current_couter->domainA); #if ITT_PLATFORM==ITT_PLATFORM_WIN free((wchar_t*)current_couter->nameW); free((wchar_t*)current_couter->domainW); #endif free(current_couter); current_couter = tmp; } _N_(_ittapi_global).counter_list = NULL; __itt_histogram* current_histogram = _N_(_ittapi_global).histogram_list; while (current_histogram != NULL) { __itt_histogram* tmp = current_histogram->next; free((char*)current_histogram->nameA); #if ITT_PLATFORM==ITT_PLATFORM_WIN free((wchar_t*)current_histogram->nameW); #endif free(current_histogram); current_histogram = tmp; } 
_N_(_ittapi_global).histogram_list = NULL; __itt_counter_metadata* current_counter_metadata = _N_(_ittapi_global).counter_metadata_list; while (current_counter_metadata != NULL) { __itt_counter_metadata* tmp = current_counter_metadata->next; free((char*)current_counter_metadata->str_valueA); #if ITT_PLATFORM==ITT_PLATFORM_WIN free((wchar_t*)current_counter_metadata->str_valueW); #endif free(current_counter_metadata); current_counter_metadata = tmp; } _N_(_ittapi_global).counter_metadata_list = NULL; } ITT_EXTERN_C int _N_(init_ittlib)(const char* lib_name, __itt_group_id init_groups) { int i; __itt_group_id groups; #ifdef ITT_COMPLETE_GROUP __itt_group_id zero_group = __itt_group_none; #endif /* ITT_COMPLETE_GROUP */ static volatile TIDT current_thread = 0; if (!_N_(_ittapi_global).api_initialized) { #ifndef ITT_SIMPLE_INIT ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); #endif /* ITT_SIMPLE_INIT */ if (!_N_(_ittapi_global).api_initialized) { if (current_thread == 0) { if (PTHREAD_SYMBOLS) current_thread = __itt_thread_id(); if (lib_name == NULL) { lib_name = __itt_get_lib_name(); } groups = __itt_get_groups(); if (DL_SYMBOLS && (groups != __itt_group_none || lib_name != NULL)) { _N_(_ittapi_global).lib = __itt_load_lib((lib_name == NULL) ? 
ittnotify_lib_name : lib_name); if (_N_(_ittapi_global).lib != NULL) { _N_(_ittapi_global).state = __itt_collection_init_successful; __itt_api_init_t* __itt_api_init_ptr; int lib_version = __itt_lib_version(_N_(_ittapi_global).lib); switch (lib_version) { case 0: groups = __itt_group_legacy; ITT_ATTRIBUTE_FALLTHROUGH; case 1: /* Fill all pointers from dynamic library */ for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++) { if (_N_(_ittapi_global).api_list_ptr[i].group & groups & init_groups) { *_N_(_ittapi_global).api_list_ptr[i].func_ptr = (void*)__itt_get_proc(_N_(_ittapi_global).lib, _N_(_ittapi_global).api_list_ptr[i].name); if (*_N_(_ittapi_global).api_list_ptr[i].func_ptr == NULL) { /* Restore pointers for function with static implementation */ *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func; __itt_report_error(__itt_error_no_symbol, lib_name, _N_(_ittapi_global).api_list_ptr[i].name); #ifdef ITT_COMPLETE_GROUP zero_group = (__itt_group_id)(zero_group | _N_(_ittapi_global).api_list_ptr[i].group); #endif /* ITT_COMPLETE_GROUP */ } } else *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func; } if (groups == __itt_group_legacy) { /* Compatibility with legacy tools */ ITTNOTIFY_NAME(thread_ignore) = ITTNOTIFY_NAME(thr_ignore); #if ITT_PLATFORM==ITT_PLATFORM_WIN ITTNOTIFY_NAME(sync_createA) = ITTNOTIFY_NAME(sync_set_nameA); ITTNOTIFY_NAME(sync_createW) = ITTNOTIFY_NAME(sync_set_nameW); #else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ ITTNOTIFY_NAME(sync_create) = ITTNOTIFY_NAME(sync_set_name); #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITTNOTIFY_NAME(sync_prepare) = ITTNOTIFY_NAME(notify_sync_prepare); ITTNOTIFY_NAME(sync_cancel) = ITTNOTIFY_NAME(notify_sync_cancel); ITTNOTIFY_NAME(sync_acquired) = ITTNOTIFY_NAME(notify_sync_acquired); ITTNOTIFY_NAME(sync_releasing) = ITTNOTIFY_NAME(notify_sync_releasing); } #ifdef ITT_COMPLETE_GROUP for (i = 0; 
_N_(_ittapi_global).api_list_ptr[i].name != NULL; i++) if (_N_(_ittapi_global).api_list_ptr[i].group & zero_group) *_N_(_ittapi_global).api_list_ptr[i].func_ptr = _N_(_ittapi_global).api_list_ptr[i].null_func; #endif /* ITT_COMPLETE_GROUP */ break; case 2: __itt_api_init_ptr = (__itt_api_init_t*)(size_t)__itt_get_proc(_N_(_ittapi_global).lib, "__itt_api_init"); if (__itt_api_init_ptr) __itt_api_init_ptr(&_N_(_ittapi_global), init_groups); break; } } else { _N_(_ittapi_global).state = __itt_collection_init_fail; __itt_free_allocated_resources(); __itt_nullify_all_pointers(); __itt_report_error(__itt_error_no_module, lib_name, #if ITT_PLATFORM==ITT_PLATFORM_WIN __itt_system_error() #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ dlerror() #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ); } } else { _N_(_ittapi_global).state = __itt_collection_collector_absent; __itt_nullify_all_pointers(); } _N_(_ittapi_global).api_initialized = 1; current_thread = 0; /* !!! Just to avoid unused code elimination !!! 
*/
            if (__itt_fini_ittlib_ptr == _N_(fini_ittlib)) current_thread = 0;
        }
    }

#ifndef ITT_SIMPLE_INIT
    /* Unless built with ITT_SIMPLE_INIT, unlock the mutex of _ittapi_global
     * (only when PTHREAD_SYMBOLS evaluates true). */
    if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex);
#endif /* ITT_SIMPLE_INIT */
    }

    /* Evaluating if any function ptr is non empty and it's in init_groups */
    for (i = 0; _N_(_ittapi_global).api_list_ptr[i].name != NULL; i++)
    {
        /* An entry counts when its pointer differs from its null_func and its
         * group mask intersects init_groups; the first such entry yields 1. */
        if (*_N_(_ittapi_global).api_list_ptr[i].func_ptr != _N_(_ittapi_global).api_list_ptr[i].null_func &&
            _N_(_ittapi_global).api_list_ptr[i].group & init_groups)
        {
            return 1;
        }
    }
    /* No entry of the requested groups is bound to anything but its null_func. */
    return 0;
}

/**
 * Replaces the error handler stored in _ittapi_global.
 *
 * @param handler  New handler; it is stored as void* after a round trip
 *                 through size_t.
 * @return The handler that was stored before this call.
 */
ITT_EXTERN_C __itt_error_handler_t* _N_(set_error_handler)(__itt_error_handler_t* handler)
{
    __itt_error_handler_t* prev = (__itt_error_handler_t*)(size_t)_N_(_ittapi_global).error_handler;
    _N_(_ittapi_global).error_handler = (void*)(size_t)handler;
    return prev;
}

#if ITT_PLATFORM==ITT_PLATFORM_WIN
#if _MSC_VER
/* NOTE(review): presumably pairs with a warning(push) earlier in the file,
 * outside this view - confirm. */
#pragma warning(pop)
#endif
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */

/** __itt_mark_pt_region functions marks region of interest
 * region parameter defines different regions.
* 0 <= region < 8 */ #if defined(ITT_API_IPT_SUPPORT) && (ITT_PLATFORM==ITT_PLATFORM_WIN || ITT_PLATFORM==ITT_PLATFORM_POSIX) && !defined(__ANDROID__) void __itt_pt_mark(__itt_pt_region region); void __itt_pt_mark_event(__itt_pt_region region); #endif ITT_EXTERN_C void _N_(mark_pt_region_begin)(__itt_pt_region region) { #if defined(ITT_API_IPT_SUPPORT) && (ITT_PLATFORM==ITT_PLATFORM_WIN || ITT_PLATFORM==ITT_PLATFORM_POSIX) && !defined(__ANDROID__) if (_N_(_ittapi_global).ipt_collect_events == 1) { __itt_pt_mark_event(2*region); } else { __itt_pt_mark(2*region); } #else (void)region; #endif } ITT_EXTERN_C void _N_(mark_pt_region_end)(__itt_pt_region region) { #if defined(ITT_API_IPT_SUPPORT) && (ITT_PLATFORM==ITT_PLATFORM_WIN || ITT_PLATFORM==ITT_PLATFORM_POSIX) && !defined(__ANDROID__) if (_N_(_ittapi_global).ipt_collect_events == 1) { __itt_pt_mark_event(2*region + 1); } else { __itt_pt_mark(2*region + 1); } #else (void)region; #endif } ITT_EXTERN_C __itt_collection_state (_N_(get_collection_state))(void) { if (!_N_(_ittapi_global).api_initialized && _N_(_ittapi_global).thread_list == NULL) { __itt_init_ittlib_name(NULL, __itt_group_all); } return _N_(_ittapi_global).state; } /* !!! should be called from the library destructor !!! * this function destroys the mutex and frees resources * allocated by ITT API static part */ ITT_EXTERN_C void (_N_(release_resources))(void) { ITT_MUTEX_INIT_AND_LOCK(_N_(_ittapi_global)); __itt_free_allocated_resources(); if (PTHREAD_SYMBOLS) __itt_mutex_unlock(&_N_(_ittapi_global).mutex); ITT_MUTEX_DESTROY(_N_(_ittapi_global)); } ================================================ FILE: third-party/tbb/src/tbb/tools_api/ittnotify_static.h ================================================ /* Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "ittnotify_config.h" #ifndef ITT_FORMAT_DEFINED # ifndef ITT_FORMAT # define ITT_FORMAT # endif /* ITT_FORMAT */ # ifndef ITT_NO_PARAMS # define ITT_NO_PARAMS # endif /* ITT_NO_PARAMS */ #endif /* ITT_FORMAT_DEFINED */ /* * parameters for macro expected: * ITT_STUB(api, type, func_name, arguments, params, func_name_in_dll, group, printf_fmt) */ #ifdef __ITT_INTERNAL_INIT #ifndef __ITT_INTERNAL_BODY #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(ITTAPI, __itt_domain*, domain_createA, (const char *name), (ITT_FORMAT name), domain_createA, __itt_group_structure, "\"%s\"") ITT_STUB(ITTAPI, __itt_domain*, domain_createW, (const wchar_t *name), (ITT_FORMAT name), domain_createW, __itt_group_structure, "\"%S\"") #else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ ITT_STUB(ITTAPI, __itt_domain*, domain_create, (const char *name), (ITT_FORMAT name), domain_create, __itt_group_structure, "\"%s\"") #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUBV(ITTAPI, void, module_load_with_sections, (__itt_module_object* module_obj), (ITT_FORMAT module_obj), module_load_with_sections, __itt_group_module, "%p") ITT_STUBV(ITTAPI, void, module_unload_with_sections, (__itt_module_object* module_obj), (ITT_FORMAT module_obj), module_unload_with_sections, __itt_group_module, "%p") #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createA, (const char *name), (ITT_FORMAT name), string_handle_createA, __itt_group_structure, "\"%s\"") ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createW, (const wchar_t *name), (ITT_FORMAT name), string_handle_createW, 
__itt_group_structure, "\"%S\"") #else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_create, (const char *name), (ITT_FORMAT name), string_handle_create, __itt_group_structure, "\"%s\"") #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(ITTAPI, __itt_counter, counter_createA, (const char *name, const char *domain), (ITT_FORMAT name, domain), counter_createA, __itt_group_counter, "\"%s\", \"%s\"") ITT_STUB(ITTAPI, __itt_counter, counter_createW, (const wchar_t *name, const wchar_t *domain), (ITT_FORMAT name, domain), counter_createW, __itt_group_counter, "\"%s\", \"%s\"") #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUB(ITTAPI, __itt_counter, counter_create, (const char *name, const char *domain), (ITT_FORMAT name, domain), counter_create, __itt_group_counter, "\"%s\", \"%s\"") #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(ITTAPI, __itt_counter, counter_create_typedA, (const char *name, const char *domain, __itt_metadata_type type), (ITT_FORMAT name, domain, type), counter_create_typedA, __itt_group_counter, "\"%s\", \"%s\", %d") ITT_STUB(ITTAPI, __itt_counter, counter_create_typedW, (const wchar_t *name, const wchar_t *domain, __itt_metadata_type type), (ITT_FORMAT name, domain, type), counter_create_typedW, __itt_group_counter, "\"%s\", \"%s\", %d") #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUB(ITTAPI, __itt_counter, counter_create_typed, (const char *name, const char *domain, __itt_metadata_type type), (ITT_FORMAT name, domain, type), counter_create_typed, __itt_group_counter, "\"%s\", \"%s\", %d") #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUBV(ITTAPI, void, pause, (void), (ITT_NO_PARAMS), pause, __itt_group_control | __itt_group_legacy, "no args") ITT_STUBV(ITTAPI, void, resume, (void), (ITT_NO_PARAMS), resume, __itt_group_control | __itt_group_legacy, "no args") ITT_STUBV(ITTAPI, void, pause_scoped, (__itt_collection_scope 
scope), (ITT_FORMAT scope), pause_scoped, __itt_group_control, "%d")
ITT_STUBV(ITTAPI, void, resume_scoped, (__itt_collection_scope scope), (ITT_FORMAT scope), resume_scoped, __itt_group_control, "%d")

/* Thread naming: separate char (A) and wchar_t (W) rows on Windows, a single
 * char row on other platforms. */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
ITT_STUBV(ITTAPI, void, thread_set_nameA, (const char *name), (ITT_FORMAT name), thread_set_nameA, __itt_group_thread, "\"%s\"")
ITT_STUBV(ITTAPI, void, thread_set_nameW, (const wchar_t *name), (ITT_FORMAT name), thread_set_nameW, __itt_group_thread, "\"%S\"")
#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
ITT_STUBV(ITTAPI, void, thread_set_name, (const char *name), (ITT_FORMAT name), thread_set_name, __itt_group_thread, "\"%s\"")
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
ITT_STUBV(ITTAPI, void, thread_ignore, (void), (ITT_NO_PARAMS), thread_ignore, __itt_group_thread, "no args")

/* LIBITTAPI thread rows; these also carry __itt_group_legacy. */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
ITT_STUB(LIBITTAPI, int, thr_name_setA, (const char *name, int namelen), (ITT_FORMAT name, namelen), thr_name_setA, __itt_group_thread | __itt_group_legacy, "\"%s\", %d")
ITT_STUB(LIBITTAPI, int, thr_name_setW, (const wchar_t *name, int namelen), (ITT_FORMAT name, namelen), thr_name_setW, __itt_group_thread | __itt_group_legacy, "\"%S\", %d")
#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
ITT_STUB(LIBITTAPI, int, thr_name_set, (const char *name, int namelen), (ITT_FORMAT name, namelen), thr_name_set, __itt_group_thread | __itt_group_legacy, "\"%s\", %d")
#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
ITT_STUBV(LIBITTAPI, void, thr_ignore, (void), (ITT_NO_PARAMS), thr_ignore, __itt_group_thread | __itt_group_legacy, "no args")

/* Histogram creation (A/W split on Windows). */
#if ITT_PLATFORM==ITT_PLATFORM_WIN
ITT_STUB(ITTAPI, __itt_histogram*, histogram_createA, (const __itt_domain* domain, const char* name, __itt_metadata_type x_type, __itt_metadata_type y_type), (ITT_FORMAT domain, name, x_type, y_type), histogram_createA, __itt_group_structure, "%p, \"%s\", %d, %d")
ITT_STUB(ITTAPI, __itt_histogram*, histogram_createW, (const __itt_domain* domain, const wchar_t* name,
__itt_metadata_type x_type, __itt_metadata_type y_type), (ITT_FORMAT domain, name, x_type, y_type), histogram_createW, __itt_group_structure, "%p, \"%s\", %d, %d") #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUB(ITTAPI, __itt_histogram*, histogram_create, (const __itt_domain* domain, const char* name, __itt_metadata_type x_type, __itt_metadata_type y_type), (ITT_FORMAT domain, name, x_type, y_type), histogram_create, __itt_group_structure, "%p, \"%s\", %d, %d") #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(ITTAPI, __itt_counter, counter_createA_v3, (const __itt_domain* domain, const char *name, __itt_metadata_type type), (ITT_FORMAT domain, name, type), counter_createA_v3, __itt_group_counter, "%p, \"%s\", %d") ITT_STUB(ITTAPI, __itt_counter, counter_createW_v3, (const __itt_domain* domain, const wchar_t *name, __itt_metadata_type type), (ITT_FORMAT domain, name, type), counter_createW_v3, __itt_group_counter, "%p, \"%s\", %d") #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUB(ITTAPI, __itt_counter, counter_create_v3, (const __itt_domain* domain, const char *name, __itt_metadata_type type), (ITT_FORMAT domain, name, type), counter_create_v3, __itt_group_counter, "%p, \"%s\", %d") #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUBV(ITTAPI, void, bind_context_metadata_to_counter, (__itt_counter counter, size_t length, __itt_context_metadata* metadata), (ITT_FORMAT counter, length, metadata), bind_context_metadata_to_counter, __itt_group_structure, "%p, %lu, %p") #endif /* __ITT_INTERNAL_BODY */ ITT_STUBV(ITTAPI, void, enable_attach, (void), (ITT_NO_PARAMS), enable_attach, __itt_group_all, "no args") #else /* __ITT_INTERNAL_INIT */ ITT_STUBV(ITTAPI, void, detach, (void), (ITT_NO_PARAMS), detach, __itt_group_control | __itt_group_legacy, "no args") #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUBV(ITTAPI, void, sync_createA, (void *addr, const char *objtype, const char *objname, int attribute), (ITT_FORMAT addr, objtype, 
objname, attribute), sync_createA, __itt_group_sync | __itt_group_fsync, "%p, \"%s\", \"%s\", %x") ITT_STUBV(ITTAPI, void, sync_createW, (void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_createW, __itt_group_sync | __itt_group_fsync, "%p, \"%S\", \"%S\", %x") ITT_STUBV(ITTAPI, void, sync_renameA, (void *addr, const char *name), (ITT_FORMAT addr, name), sync_renameA, __itt_group_sync | __itt_group_fsync, "%p, \"%s\"") ITT_STUBV(ITTAPI, void, sync_renameW, (void *addr, const wchar_t *name), (ITT_FORMAT addr, name), sync_renameW, __itt_group_sync | __itt_group_fsync, "%p, \"%S\"") #else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ ITT_STUBV(ITTAPI, void, sync_create, (void *addr, const char *objtype, const char *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_create, __itt_group_sync | __itt_group_fsync, "%p, \"%s\", \"%s\", %x") ITT_STUBV(ITTAPI, void, sync_rename, (void *addr, const char *name), (ITT_FORMAT addr, name), sync_rename, __itt_group_sync | __itt_group_fsync, "%p, \"%s\"") #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUBV(ITTAPI, void, sync_destroy, (void *addr), (ITT_FORMAT addr), sync_destroy, __itt_group_sync | __itt_group_fsync, "%p") ITT_STUBV(ITTAPI, void, sync_prepare, (void* addr), (ITT_FORMAT addr), sync_prepare, __itt_group_sync, "%p") ITT_STUBV(ITTAPI, void, sync_cancel, (void *addr), (ITT_FORMAT addr), sync_cancel, __itt_group_sync, "%p") ITT_STUBV(ITTAPI, void, sync_acquired, (void *addr), (ITT_FORMAT addr), sync_acquired, __itt_group_sync, "%p") ITT_STUBV(ITTAPI, void, sync_releasing, (void* addr), (ITT_FORMAT addr), sync_releasing, __itt_group_sync, "%p") ITT_STUBV(ITTAPI, void, suppress_push, (unsigned int mask), (ITT_FORMAT mask), suppress_push, __itt_group_suppress, "%p") ITT_STUBV(ITTAPI, void, suppress_pop, (void), (ITT_NO_PARAMS), suppress_pop, __itt_group_suppress, "no args") ITT_STUBV(ITTAPI, void, suppress_mark_range, 
(__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size),(ITT_FORMAT mode, mask, address, size), suppress_mark_range, __itt_group_suppress, "%d, %u, %p, %lu")
/* As for suppress_mark_range: the unsigned int mask is printed with %u and
 * the size_t size with %lu (the conversion this list uses for size_t in the
 * rows that already match their argument). */
ITT_STUBV(ITTAPI, void, suppress_clear_range,(__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size),(ITT_FORMAT mode, mask, address, size), suppress_clear_range,__itt_group_suppress, "%d, %u, %p, %lu")

/* fsync_* rows: the func_name_in_dll argument is the corresponding sync_*
 * name, while the group is __itt_group_fsync. */
ITT_STUBV(ITTAPI, void, fsync_prepare, (void* addr), (ITT_FORMAT addr), sync_prepare, __itt_group_fsync, "%p")
ITT_STUBV(ITTAPI, void, fsync_cancel, (void *addr), (ITT_FORMAT addr), sync_cancel, __itt_group_fsync, "%p")
ITT_STUBV(ITTAPI, void, fsync_acquired, (void *addr), (ITT_FORMAT addr), sync_acquired, __itt_group_fsync, "%p")
ITT_STUBV(ITTAPI, void, fsync_releasing, (void* addr), (ITT_FORMAT addr), sync_releasing, __itt_group_fsync, "%p")

/* __itt_group_model rows. */
ITT_STUBV(ITTAPI, void, model_site_begin, (__itt_model_site *site, __itt_model_site_instance *instance, const char *name), (ITT_FORMAT site, instance, name), model_site_begin, __itt_group_model, "%p, %p, \"%s\"")
ITT_STUBV(ITTAPI, void, model_site_end, (__itt_model_site *site, __itt_model_site_instance *instance), (ITT_FORMAT site, instance), model_site_end, __itt_group_model, "%p, %p")
ITT_STUBV(ITTAPI, void, model_task_begin, (__itt_model_task *task, __itt_model_task_instance *instance, const char *name), (ITT_FORMAT task, instance, name), model_task_begin, __itt_group_model, "%p, %p, \"%s\"")
ITT_STUBV(ITTAPI, void, model_task_end, (__itt_model_task *task, __itt_model_task_instance *instance), (ITT_FORMAT task, instance), model_task_end, __itt_group_model, "%p, %p")
ITT_STUBV(ITTAPI, void, model_lock_acquire, (void *lock), (ITT_FORMAT lock), model_lock_acquire, __itt_group_model, "%p")
ITT_STUBV(ITTAPI, void, model_lock_release, (void *lock), (ITT_FORMAT lock), model_lock_release, __itt_group_model, "%p")
ITT_STUBV(ITTAPI, void, model_record_allocation, (void *addr, size_t size), (ITT_FORMAT addr, size), model_record_allocation,
__itt_group_model, "%p, %d") ITT_STUBV(ITTAPI, void, model_record_deallocation, (void *addr), (ITT_FORMAT addr), model_record_deallocation, __itt_group_model, "%p") ITT_STUBV(ITTAPI, void, model_induction_uses, (void* addr, size_t size), (ITT_FORMAT addr, size), model_induction_uses, __itt_group_model, "%p, %d") ITT_STUBV(ITTAPI, void, model_reduction_uses, (void* addr, size_t size), (ITT_FORMAT addr, size), model_reduction_uses, __itt_group_model, "%p, %d") ITT_STUBV(ITTAPI, void, model_observe_uses, (void* addr, size_t size), (ITT_FORMAT addr, size), model_observe_uses, __itt_group_model, "%p, %d") ITT_STUBV(ITTAPI, void, model_clear_uses, (void* addr), (ITT_FORMAT addr), model_clear_uses, __itt_group_model, "%p") #ifndef __ITT_INTERNAL_BODY #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUBV(ITTAPI, void, model_site_beginW, (const wchar_t *name), (ITT_FORMAT name), model_site_beginW, __itt_group_model, "\"%s\"") ITT_STUBV(ITTAPI, void, model_task_beginW, (const wchar_t *name), (ITT_FORMAT name), model_task_beginW, __itt_group_model, "\"%s\"") ITT_STUBV(ITTAPI, void, model_iteration_taskW, (const wchar_t *name), (ITT_FORMAT name), model_iteration_taskW, __itt_group_model, "\"%s\"") #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUBV(ITTAPI, void, model_site_beginA, (const char *name), (ITT_FORMAT name), model_site_beginA, __itt_group_model, "\"%s\"") ITT_STUBV(ITTAPI, void, model_site_beginAL, (const char *name, size_t len), (ITT_FORMAT name, len), model_site_beginAL, __itt_group_model, "\"%s\", %d") ITT_STUBV(ITTAPI, void, model_task_beginA, (const char *name), (ITT_FORMAT name), model_task_beginA, __itt_group_model, "\"%s\"") ITT_STUBV(ITTAPI, void, model_task_beginAL, (const char *name, size_t len), (ITT_FORMAT name, len), model_task_beginAL, __itt_group_model, "\"%s\", %d") ITT_STUBV(ITTAPI, void, model_iteration_taskA, (const char *name), (ITT_FORMAT name), model_iteration_taskA, __itt_group_model, "\"%s\"") ITT_STUBV(ITTAPI, void, model_iteration_taskAL, (const 
char *name, size_t len), (ITT_FORMAT name, len), model_iteration_taskAL, __itt_group_model, "\"%s\", %d") ITT_STUBV(ITTAPI, void, model_site_end_2, (void), (ITT_NO_PARAMS), model_site_end_2, __itt_group_model, "no args") ITT_STUBV(ITTAPI, void, model_task_end_2, (void), (ITT_NO_PARAMS), model_task_end_2, __itt_group_model, "no args") ITT_STUBV(ITTAPI, void, model_lock_acquire_2, (void *lock), (ITT_FORMAT lock), model_lock_acquire_2, __itt_group_model, "%p") ITT_STUBV(ITTAPI, void, model_lock_release_2, (void *lock), (ITT_FORMAT lock), model_lock_release_2, __itt_group_model, "%p") ITT_STUBV(ITTAPI, void, model_aggregate_task, (size_t count), (ITT_FORMAT count), model_aggregate_task, __itt_group_model, "%d") ITT_STUBV(ITTAPI, void, model_disable_push, (__itt_model_disable x), (ITT_FORMAT x), model_disable_push, __itt_group_model, "%p") ITT_STUBV(ITTAPI, void, model_disable_pop, (void), (ITT_NO_PARAMS), model_disable_pop, __itt_group_model, "no args") #endif /* __ITT_INTERNAL_BODY */ #ifndef __ITT_INTERNAL_BODY #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createA, (const char *name, const char *domain), (ITT_FORMAT name, domain), heap_function_createA, __itt_group_heap, "\"%s\", \"%s\"") ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createW, (const wchar_t *name, const wchar_t *domain), (ITT_FORMAT name, domain), heap_function_createW, __itt_group_heap, "\"%s\", \"%s\"") #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUB(ITTAPI, __itt_heap_function, heap_function_create, (const char *name, const char *domain), (ITT_FORMAT name, domain), heap_function_create, __itt_group_heap, "\"%s\", \"%s\"") #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* __ITT_INTERNAL_BODY */ ITT_STUBV(ITTAPI, void, heap_allocate_begin, (__itt_heap_function h, size_t size, int initialized), (ITT_FORMAT h, size, initialized), heap_allocate_begin, __itt_group_heap, "%p, %lu, %d") ITT_STUBV(ITTAPI, void, heap_allocate_end, (__itt_heap_function 
h, void** addr, size_t size, int initialized), (ITT_FORMAT h, addr, size, initialized), heap_allocate_end, __itt_group_heap, "%p, %p, %lu, %d")
/* Remaining __itt_group_heap rows; each takes the __itt_heap_function handle
 * first unless it has no arguments or takes a mask. */
ITT_STUBV(ITTAPI, void, heap_free_begin, (__itt_heap_function h, void* addr), (ITT_FORMAT h, addr), heap_free_begin, __itt_group_heap, "%p, %p")
ITT_STUBV(ITTAPI, void, heap_free_end, (__itt_heap_function h, void* addr), (ITT_FORMAT h, addr), heap_free_end, __itt_group_heap, "%p, %p")
ITT_STUBV(ITTAPI, void, heap_reallocate_begin, (__itt_heap_function h, void* addr, size_t new_size, int initialized), (ITT_FORMAT h, addr, new_size, initialized), heap_reallocate_begin, __itt_group_heap, "%p, %p, %lu, %d")
ITT_STUBV(ITTAPI, void, heap_reallocate_end, (__itt_heap_function h, void* addr, void** new_addr, size_t new_size, int initialized), (ITT_FORMAT h, addr, new_addr, new_size, initialized), heap_reallocate_end, __itt_group_heap, "%p, %p, %p, %lu, %d")
ITT_STUBV(ITTAPI, void, heap_internal_access_begin, (void), (ITT_NO_PARAMS), heap_internal_access_begin, __itt_group_heap, "no args")
ITT_STUBV(ITTAPI, void, heap_internal_access_end, (void), (ITT_NO_PARAMS), heap_internal_access_end, __itt_group_heap, "no args")
ITT_STUBV(ITTAPI, void, heap_record_memory_growth_begin, (void), (ITT_NO_PARAMS), heap_record_memory_growth_begin, __itt_group_heap, "no args")
ITT_STUBV(ITTAPI, void, heap_record_memory_growth_end, (void), (ITT_NO_PARAMS), heap_record_memory_growth_end, __itt_group_heap, "no args")
ITT_STUBV(ITTAPI, void, heap_reset_detection, (unsigned int reset_mask), (ITT_FORMAT reset_mask), heap_reset_detection, __itt_group_heap, "%u")
ITT_STUBV(ITTAPI, void, heap_record, (unsigned int record_mask), (ITT_FORMAT record_mask), heap_record, __itt_group_heap, "%u")

/* __itt_group_structure rows taking a domain and an __itt_id.
 * NOTE(review): __itt_id is passed by value and printed with %lu; its
 * definition is outside this view - confirm the conversion is adequate. */
ITT_STUBV(ITTAPI, void, id_create, (const __itt_domain *domain, __itt_id id), (ITT_FORMAT domain, id), id_create, __itt_group_structure, "%p, %lu")
ITT_STUBV(ITTAPI, void, id_destroy, (const __itt_domain *domain, __itt_id id), (ITT_FORMAT domain, id), id_destroy,
__itt_group_structure, "%p, %lu")

ITT_STUB(ITTAPI, __itt_timestamp, get_timestamp, (void), (ITT_NO_PARAMS), get_timestamp, __itt_group_structure, "no args")

/* Region rows: begin takes the region id, its parent id and a name handle;
 * end takes only the region id. */
ITT_STUBV(ITTAPI, void, region_begin, (const __itt_domain *domain, __itt_id id, __itt_id parent, __itt_string_handle *name), (ITT_FORMAT domain, id, parent, name), region_begin, __itt_group_structure, "%p, %lu, %lu, %p")
ITT_STUBV(ITTAPI, void, region_end, (const __itt_domain *domain, __itt_id id), (ITT_FORMAT domain, id), region_end, __itt_group_structure, "%p, %lu")

#ifndef __ITT_INTERNAL_BODY
/* Frame rows (v3): the id is passed by pointer here, hence %p. */
ITT_STUBV(ITTAPI, void, frame_begin_v3, (const __itt_domain *domain, __itt_id *id), (ITT_FORMAT domain, id), frame_begin_v3, __itt_group_structure, "%p, %p")
ITT_STUBV(ITTAPI, void, frame_end_v3, (const __itt_domain *domain, __itt_id *id), (ITT_FORMAT domain, id), frame_end_v3, __itt_group_structure, "%p, %p")
/* NOTE(review): __itt_timestamp is not defined in this view; confirm %lu
 * matches its width on all supported platforms. */
ITT_STUBV(ITTAPI, void, frame_submit_v3, (const __itt_domain *domain, __itt_id *id, __itt_timestamp begin, __itt_timestamp end), (ITT_FORMAT domain, id, begin, end), frame_submit_v3, __itt_group_structure, "%p, %p, %lu, %lu")
#endif /* __ITT_INTERNAL_BODY */

/* Task rows: same argument shape as region_begin, except task_begin_fn
 * (takes a void* fn instead of a name handle) and task_end (domain only). */
ITT_STUBV(ITTAPI, void, task_group, (const __itt_domain *domain, __itt_id id, __itt_id parent, __itt_string_handle *name), (ITT_FORMAT domain, id, parent, name), task_group, __itt_group_structure, "%p, %lu, %lu, %p")
ITT_STUBV(ITTAPI, void, task_begin, (const __itt_domain *domain, __itt_id id, __itt_id parent, __itt_string_handle *name), (ITT_FORMAT domain, id, parent, name), task_begin, __itt_group_structure, "%p, %lu, %lu, %p")
ITT_STUBV(ITTAPI, void, task_begin_fn, (const __itt_domain *domain, __itt_id id, __itt_id parent, void* fn), (ITT_FORMAT domain, id, parent, fn), task_begin_fn, __itt_group_structure, "%p, %lu, %lu, %p")
ITT_STUBV(ITTAPI, void, task_end, (const __itt_domain *domain), (ITT_FORMAT domain), task_end, __itt_group_structure, "%p")

ITT_STUBV(ITTAPI, void, counter_inc_v3, (const __itt_domain *domain, __itt_string_handle *name), (ITT_FORMAT
domain, name), counter_inc_v3, __itt_group_structure, "%p, %p") ITT_STUBV(ITTAPI, void, counter_inc_delta_v3, (const __itt_domain *domain, __itt_string_handle *name, unsigned long long value), (ITT_FORMAT domain, name, value), counter_inc_delta_v3, __itt_group_structure, "%p, %p, %lu") ITT_STUBV(ITTAPI, void, counter_dec_v3, (const __itt_domain *domain, __itt_string_handle *name), (ITT_FORMAT domain, name), counter_dec_v3, __itt_group_structure, "%p, %p") ITT_STUBV(ITTAPI, void, counter_dec_delta_v3, (const __itt_domain *domain, __itt_string_handle *name, unsigned long long value), (ITT_FORMAT domain, name, value), counter_dec_delta_v3, __itt_group_structure, "%p, %p, %lu") ITT_STUBV(ITTAPI, void, marker, (const __itt_domain *domain, __itt_id id, __itt_string_handle *name, __itt_scope scope), (ITT_FORMAT domain, id, name, scope), marker, __itt_group_structure, "%p, %lu, %p, %d") ITT_STUBV(ITTAPI, void, metadata_add, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data), (ITT_FORMAT domain, id, key, type, count, data), metadata_add, __itt_group_structure, "%p, %lu, %p, %d, %lu, %p") #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUBV(ITTAPI, void, metadata_str_addA, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char* data, size_t length), (ITT_FORMAT domain, id, key, data, length), metadata_str_addA, __itt_group_structure, "%p, %lu, %p, %p, %lu") ITT_STUBV(ITTAPI, void, metadata_str_addW, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const wchar_t* data, size_t length), (ITT_FORMAT domain, id, key, data, length), metadata_str_addW, __itt_group_structure, "%p, %lu, %p, %p, %lu") #else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ ITT_STUBV(ITTAPI, void, metadata_str_add, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char* data, size_t length), (ITT_FORMAT domain, id, key, data, length), metadata_str_add, __itt_group_structure, "%p, %lu, %p, 
%p, %lu") #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUBV(ITTAPI, void, relation_add_to_current, (const __itt_domain *domain, __itt_relation relation, __itt_id tail), (ITT_FORMAT domain, relation, tail), relation_add_to_current, __itt_group_structure, "%p, %lu, %p") ITT_STUBV(ITTAPI, void, relation_add, (const __itt_domain *domain, __itt_id head, __itt_relation relation, __itt_id tail), (ITT_FORMAT domain, head, relation, tail), relation_add, __itt_group_structure, "%p, %p, %lu, %p") #ifndef __ITT_INTERNAL_BODY #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(LIBITTAPI, __itt_event, event_createA, (const char *name, int namelen), (ITT_FORMAT name, namelen), event_createA, __itt_group_mark | __itt_group_legacy, "\"%s\", %d") ITT_STUB(LIBITTAPI, __itt_event, event_createW, (const wchar_t *name, int namelen), (ITT_FORMAT name, namelen), event_createW, __itt_group_mark | __itt_group_legacy, "\"%S\", %d") #else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ ITT_STUB(LIBITTAPI, __itt_event, event_create, (const char *name, int namelen), (ITT_FORMAT name, namelen), event_create, __itt_group_mark | __itt_group_legacy, "\"%s\", %d") #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUB(LIBITTAPI, int, event_start, (__itt_event event), (ITT_FORMAT event), event_start, __itt_group_mark | __itt_group_legacy, "%d") ITT_STUB(LIBITTAPI, int, event_end, (__itt_event event), (ITT_FORMAT event), event_end, __itt_group_mark | __itt_group_legacy, "%d") #endif /* __ITT_INTERNAL_BODY */ #ifndef __ITT_INTERNAL_BODY #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUBV(ITTAPI, void, sync_set_nameA, (void *addr, const char *objtype, const char *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_set_nameA, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%s\", \"%s\", %x") ITT_STUBV(ITTAPI, void, sync_set_nameW, (void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_set_nameW, 
__itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%S\", \"%S\", %x") #else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ ITT_STUBV(ITTAPI, void, sync_set_name, (void *addr, const char *objtype, const char *objname, int attribute), (ITT_FORMAT addr, objtype, objname, attribute), sync_set_name, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "p, \"%s\", \"%s\", %x") #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(LIBITTAPI, int, notify_sync_nameA, (void *p, const char *objtype, int typelen, const char *objname, int namelen, int attribute), (ITT_FORMAT p, objtype, typelen, objname, namelen, attribute), notify_sync_nameA, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%s\", %d, \"%s\", %d, %x") ITT_STUB(LIBITTAPI, int, notify_sync_nameW, (void *p, const wchar_t *objtype, int typelen, const wchar_t *objname, int namelen, int attribute), (ITT_FORMAT p, objtype, typelen, objname, namelen, attribute), notify_sync_nameW, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%S\", %d, \"%S\", %d, %x") #else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ ITT_STUB(LIBITTAPI, int, notify_sync_name, (void *p, const char *objtype, int typelen, const char *objname, int namelen, int attribute), (ITT_FORMAT p, objtype, typelen, objname, namelen, attribute), notify_sync_name, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p, \"%s\", %d, \"%s\", %d, %x") #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUBV(LIBITTAPI, void, notify_sync_prepare, (void *p), (ITT_FORMAT p), notify_sync_prepare, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p") ITT_STUBV(LIBITTAPI, void, notify_sync_cancel, (void *p), (ITT_FORMAT p), notify_sync_cancel, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p") ITT_STUBV(LIBITTAPI, void, notify_sync_acquired, (void *p), (ITT_FORMAT p), notify_sync_acquired, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p") ITT_STUBV(LIBITTAPI, 
void, notify_sync_releasing, (void *p), (ITT_FORMAT p), notify_sync_releasing, __itt_group_sync | __itt_group_fsync | __itt_group_legacy, "%p") #endif /* __ITT_INTERNAL_BODY */ ITT_STUBV(LIBITTAPI, void, memory_read, (void *addr, size_t size), (ITT_FORMAT addr, size), memory_read, __itt_group_legacy, "%p, %lu") ITT_STUBV(LIBITTAPI, void, memory_write, (void *addr, size_t size), (ITT_FORMAT addr, size), memory_write, __itt_group_legacy, "%p, %lu") ITT_STUBV(LIBITTAPI, void, memory_update, (void *addr, size_t size), (ITT_FORMAT addr, size), memory_update, __itt_group_legacy, "%p, %lu") ITT_STUB(LIBITTAPI, __itt_state_t, state_get, (void), (ITT_NO_PARAMS), state_get, __itt_group_legacy, "no args") ITT_STUB(LIBITTAPI, __itt_state_t, state_set, (__itt_state_t s), (ITT_FORMAT s), state_set, __itt_group_legacy, "%d") ITT_STUB(LIBITTAPI, __itt_obj_state_t, obj_mode_set, (__itt_obj_prop_t p, __itt_obj_state_t s), (ITT_FORMAT p, s), obj_mode_set, __itt_group_legacy, "%d, %d") ITT_STUB(LIBITTAPI, __itt_thr_state_t, thr_mode_set, (__itt_thr_prop_t p, __itt_thr_state_t s), (ITT_FORMAT p, s), thr_mode_set, __itt_group_legacy, "%d, %d") #ifndef __ITT_INTERNAL_BODY #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(ITTAPI, __itt_frame, frame_createA, (const char *domain), (ITT_FORMAT domain), frame_createA, __itt_group_frame, "\"%s\"") ITT_STUB(ITTAPI, __itt_frame, frame_createW, (const wchar_t *domain), (ITT_FORMAT domain), frame_createW, __itt_group_frame, "\"%s\"") #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUB(ITTAPI, __itt_frame, frame_create, (const char *domain), (ITT_FORMAT domain), frame_create, __itt_group_frame, "\"%s\"") #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(ITTAPI, __itt_pt_region, pt_region_createA, (const char *name), (ITT_FORMAT name), pt_region_createA, __itt_group_structure, "\"%s\"") ITT_STUB(ITTAPI, __itt_pt_region, pt_region_createW, (const wchar_t *name), (ITT_FORMAT name), pt_region_createW, 
__itt_group_structure, "\"%S\"") #else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ ITT_STUB(ITTAPI, __itt_pt_region, pt_region_create, (const char *name), (ITT_FORMAT name), pt_region_create, __itt_group_structure, "\"%s\"") #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* __ITT_INTERNAL_BODY */ ITT_STUBV(ITTAPI, void, frame_begin, (__itt_frame frame), (ITT_FORMAT frame), frame_begin, __itt_group_frame, "%p") ITT_STUBV(ITTAPI, void, frame_end, (__itt_frame frame), (ITT_FORMAT frame), frame_end, __itt_group_frame, "%p") ITT_STUBV(ITTAPI, void, counter_destroy, (__itt_counter id), (ITT_FORMAT id), counter_destroy, __itt_group_counter, "%p") ITT_STUBV(ITTAPI, void, counter_inc, (__itt_counter id), (ITT_FORMAT id), counter_inc, __itt_group_counter, "%p") ITT_STUBV(ITTAPI, void, counter_inc_delta, (__itt_counter id, unsigned long long value), (ITT_FORMAT id, value), counter_inc_delta, __itt_group_counter, "%p, %lu") ITT_STUBV(ITTAPI, void, counter_dec, (__itt_counter id), (ITT_FORMAT id), counter_dec, __itt_group_counter, "%p") ITT_STUBV(ITTAPI, void, counter_dec_delta, (__itt_counter id, unsigned long long value), (ITT_FORMAT id, value), counter_dec_delta, __itt_group_counter, "%p, %lu") ITT_STUBV(ITTAPI, void, counter_set_value, (__itt_counter id, void *value_ptr), (ITT_FORMAT id, value_ptr), counter_set_value, __itt_group_counter, "%p, %p") ITT_STUBV(ITTAPI, void, counter_set_value_ex, (__itt_counter id, __itt_clock_domain *clock_domain, unsigned long long timestamp, void *value_ptr), (ITT_FORMAT id, clock_domain, timestamp, value_ptr), counter_set_value_ex, __itt_group_counter, "%p, %p, %llu, %p") #ifndef __ITT_INTERNAL_BODY #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(ITTAPI, __itt_mark_type, mark_createA, (const char *name), (ITT_FORMAT name), mark_createA, __itt_group_mark, "\"%s\"") ITT_STUB(ITTAPI, __itt_mark_type, mark_createW, (const wchar_t *name), (ITT_FORMAT name), mark_createW, __itt_group_mark, "\"%S\"") #else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ 
ITT_STUB(ITTAPI, __itt_mark_type, mark_create, (const char *name), (ITT_FORMAT name), mark_create, __itt_group_mark, "\"%s\"") #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* __ITT_INTERNAL_BODY */ #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(ITTAPI, int, markA, (__itt_mark_type mt, const char *parameter), (ITT_FORMAT mt, parameter), markA, __itt_group_mark, "%d, \"%s\"") ITT_STUB(ITTAPI, int, markW, (__itt_mark_type mt, const wchar_t *parameter), (ITT_FORMAT mt, parameter), markW, __itt_group_mark, "%d, \"%S\"") #else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ ITT_STUB(ITTAPI, int, mark, (__itt_mark_type mt, const char *parameter), (ITT_FORMAT mt, parameter), mark, __itt_group_mark, "%d, \"%s\"") #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUB(ITTAPI, int, mark_off, (__itt_mark_type mt), (ITT_FORMAT mt), mark_off, __itt_group_mark, "%d") #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(ITTAPI, int, mark_globalA, (__itt_mark_type mt, const char *parameter), (ITT_FORMAT mt, parameter), mark_globalA, __itt_group_mark, "%d, \"%s\"") ITT_STUB(ITTAPI, int, mark_globalW, (__itt_mark_type mt, const wchar_t *parameter), (ITT_FORMAT mt, parameter), mark_globalW, __itt_group_mark, "%d, \"%S\"") #else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ ITT_STUB(ITTAPI, int, mark_global, (__itt_mark_type mt, const char *parameter), (ITT_FORMAT mt, parameter), mark_global, __itt_group_mark, "%d, \"%S\"") #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUB(ITTAPI, int, mark_global_off, (__itt_mark_type mt), (ITT_FORMAT mt), mark_global_off, __itt_group_mark, "%d") #ifndef __ITT_INTERNAL_BODY ITT_STUB(ITTAPI, __itt_caller, stack_caller_create, (void), (ITT_NO_PARAMS), stack_caller_create, __itt_group_stitch, "no args") #endif /* __ITT_INTERNAL_BODY */ ITT_STUBV(ITTAPI, void, stack_caller_destroy, (__itt_caller id), (ITT_FORMAT id), stack_caller_destroy, __itt_group_stitch, "%p") ITT_STUBV(ITTAPI, void, stack_callee_enter, (__itt_caller id), (ITT_FORMAT id), stack_callee_enter, 
__itt_group_stitch, "%p") ITT_STUBV(ITTAPI, void, stack_callee_leave, (__itt_caller id), (ITT_FORMAT id), stack_callee_leave, __itt_group_stitch, "%p") ITT_STUB(ITTAPI, __itt_clock_domain*, clock_domain_create, (__itt_get_clock_info_fn fn, void* fn_data), (ITT_FORMAT fn, fn_data), clock_domain_create, __itt_group_structure, "%p, %p") ITT_STUBV(ITTAPI, void, clock_domain_reset, (void), (ITT_NO_PARAMS), clock_domain_reset, __itt_group_structure, "no args") ITT_STUBV(ITTAPI, void, id_create_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id), (ITT_FORMAT domain, clock_domain, timestamp, id), id_create_ex, __itt_group_structure, "%p, %p, %lu, %lu") ITT_STUBV(ITTAPI, void, id_destroy_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id), (ITT_FORMAT domain, clock_domain, timestamp, id), id_destroy_ex, __itt_group_structure, "%p, %p, %lu, %lu") ITT_STUBV(ITTAPI, void, task_begin_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, __itt_string_handle *name), (ITT_FORMAT domain, clock_domain, timestamp, id, parentid, name), task_begin_ex, __itt_group_structure, "%p, %p, %lu, %lu, %lu, %p") ITT_STUBV(ITTAPI, void, task_begin_fn_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, void* fn), (ITT_FORMAT domain, clock_domain, timestamp, id, parentid, fn), task_begin_fn_ex, __itt_group_structure, "%p, %p, %lu, %lu, %lu, %p") ITT_STUBV(ITTAPI, void, task_end_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp), (ITT_FORMAT domain, clock_domain, timestamp), task_end_ex, __itt_group_structure, "%p, %p, %lu") ITT_STUBV(ITTAPI, void, task_begin_overlapped, (const __itt_domain *domain, __itt_id id, __itt_id parent, __itt_string_handle *name), (ITT_FORMAT domain, id, parent, 
name), task_begin_overlapped, __itt_group_structure, "%p, %lu, %lu, %p") ITT_STUBV(ITTAPI, void, task_begin_overlapped_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, __itt_string_handle *name), (ITT_FORMAT domain, clock_domain, timestamp, id, parentid, name), task_begin_overlapped_ex, __itt_group_structure, "%p, %p, %lu, %lu, %lu, %p") ITT_STUBV(ITTAPI, void, task_end_overlapped, (const __itt_domain *domain, __itt_id id), (ITT_FORMAT domain, id), task_end_overlapped, __itt_group_structure, "%p, %lu") ITT_STUBV(ITTAPI, void, task_end_overlapped_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id), (ITT_FORMAT domain, clock_domain, timestamp, id), task_end_overlapped_ex, __itt_group_structure, "%p, %p, %lu, %lu") ITT_STUBV(ITTAPI, void, marker_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_string_handle *name, __itt_scope scope), (ITT_FORMAT domain, clock_domain, timestamp, id, name, scope), marker_ex, __itt_group_structure, "%p, %p, %lu, %lu, %p, %d") ITT_STUBV(ITTAPI, void, metadata_add_with_scope, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data), (ITT_FORMAT domain, scope, key, type, count, data), metadata_add_with_scope, __itt_group_structure, "%p, %d, %p, %d, %lu, %p") #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeA, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length), (ITT_FORMAT domain, scope, key, data, length), metadata_str_add_with_scopeA, __itt_group_structure, "%p, %d, %p, %p, %lu") ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeW, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const wchar_t *data, size_t length), (ITT_FORMAT domain, scope, key, data, 
length), metadata_str_add_with_scopeW, __itt_group_structure, "%p, %d, %p, %p, %lu") #else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ ITT_STUBV(ITTAPI, void, metadata_str_add_with_scope, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length), (ITT_FORMAT domain, scope, key, data, length), metadata_str_add_with_scope, __itt_group_structure, "%p, %d, %p, %p, %lu") #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUBV(ITTAPI, void, relation_add_to_current_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_relation relation, __itt_id tail), (ITT_FORMAT domain, clock_domain, timestamp, relation, tail), relation_add_to_current_ex, __itt_group_structure, "%p, %p, %lu, %d, %lu") ITT_STUBV(ITTAPI, void, relation_add_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id head, __itt_relation relation, __itt_id tail), (ITT_FORMAT domain, clock_domain, timestamp, head, relation, tail), relation_add_ex, __itt_group_structure, "%p, %p, %lu, %lu, %d, %lu") ITT_STUB(ITTAPI, __itt_track_group*, track_group_create, (__itt_string_handle* name, __itt_track_group_type track_group_type), (ITT_FORMAT name, track_group_type), track_group_create, __itt_group_structure, "%p, %d") ITT_STUB(ITTAPI, __itt_track*, track_create, (__itt_track_group* track_group,__itt_string_handle* name, __itt_track_type track_type), (ITT_FORMAT track_group, name, track_type), track_create, __itt_group_structure, "%p, %p, %d") ITT_STUBV(ITTAPI, void, set_track, (__itt_track *track), (ITT_FORMAT track), set_track, __itt_group_structure, "%p") #ifndef __ITT_INTERNAL_BODY ITT_STUB(ITTAPI, const char*, api_version, (void), (ITT_NO_PARAMS), api_version, __itt_group_all & ~__itt_group_legacy, "no args") #endif /* __ITT_INTERNAL_BODY */ #ifndef __ITT_INTERNAL_BODY #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(ITTAPI, int, av_saveA, (void *data, int rank, const int 
*dimensions, int type, const char *filePath, int columnOrder), (ITT_FORMAT data, rank, dimensions, type, filePath, columnOrder), av_saveA, __itt_group_arrays, "%p, %d, %p, %d, \"%s\", %d") ITT_STUB(ITTAPI, int, av_saveW, (void *data, int rank, const int *dimensions, int type, const wchar_t *filePath, int columnOrder), (ITT_FORMAT data, rank, dimensions, type, filePath, columnOrder), av_saveW, __itt_group_arrays, "%p, %d, %p, %d, \"%S\", %d") #else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ ITT_STUB(ITTAPI, int, av_save, (void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder), (ITT_FORMAT data, rank, dimensions, type, filePath, columnOrder), av_save, __itt_group_arrays, "%p, %d, %p, %d, \"%s\", %d") #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* __ITT_INTERNAL_BODY */ #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUBV(ITTAPI, void, module_loadA, (void *start_addr, void* end_addr, const char *path), (ITT_FORMAT start_addr, end_addr, path), module_loadA, __itt_group_module, "%p, %p, %p") ITT_STUBV(ITTAPI, void, module_loadW, (void *start_addr, void* end_addr, const wchar_t *path), (ITT_FORMAT start_addr, end_addr, path), module_loadW, __itt_group_module, "%p, %p, %p") #else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */ ITT_STUBV(ITTAPI, void, module_load, (void *start_addr, void *end_addr, const char *path), (ITT_FORMAT start_addr, end_addr, path), module_load, __itt_group_module, "%p, %p, %p") #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUBV(ITTAPI, void, module_unload, (void *start_addr), (ITT_FORMAT start_addr), module_unload, __itt_group_module, "%p") ITT_STUBV(ITTAPI, void, histogram_submit, (__itt_histogram* histogram, size_t length, void* x_data, void* y_data), (ITT_FORMAT histogram, length, x_data, y_data), histogram_submit, __itt_group_structure, "%p, %lu, %p, %p") ITT_STUBV(ITTAPI, void, counter_set_value_v3, (__itt_counter counter, void *value_ptr), (ITT_FORMAT counter, value_ptr), counter_set_value_v3, __itt_group_counter, 
"%p, %p") #endif /* __ITT_INTERNAL_INIT */ ================================================ FILE: third-party/tbb/src/tbb/tools_api/ittnotify_types.h ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef _ITTNOTIFY_TYPES_H_ #define _ITTNOTIFY_TYPES_H_ typedef enum ___itt_group_id { __itt_group_none = 0, __itt_group_legacy = 1<<0, __itt_group_control = 1<<1, __itt_group_thread = 1<<2, __itt_group_mark = 1<<3, __itt_group_sync = 1<<4, __itt_group_fsync = 1<<5, __itt_group_jit = 1<<6, __itt_group_model = 1<<7, __itt_group_splitter_min = 1<<7, __itt_group_counter = 1<<8, __itt_group_frame = 1<<9, __itt_group_stitch = 1<<10, __itt_group_heap = 1<<11, __itt_group_splitter_max = 1<<12, __itt_group_structure = 1<<12, __itt_group_suppress = 1<<13, __itt_group_arrays = 1<<14, __itt_group_module = 1<<15, __itt_group_all = -1 } __itt_group_id; #pragma pack(push, 8) typedef struct ___itt_group_list { __itt_group_id id; const char* name; } __itt_group_list; #pragma pack(pop) #define ITT_GROUP_LIST(varname) \ static __itt_group_list varname[] = { \ { __itt_group_all, "all" }, \ { __itt_group_control, "control" }, \ { __itt_group_thread, "thread" }, \ { __itt_group_mark, "mark" }, \ { __itt_group_sync, "sync" }, \ { __itt_group_fsync, "fsync" }, \ { __itt_group_jit, "jit" }, \ { __itt_group_model, "model" }, \ { __itt_group_counter, "counter" }, \ { __itt_group_frame, "frame" }, \ { __itt_group_stitch, "stitch" }, \ { 
__itt_group_heap, "heap" }, \ { __itt_group_structure, "structure" }, \ { __itt_group_suppress, "suppress" }, \ { __itt_group_arrays, "arrays" }, \ { __itt_group_module, "module" }, \ { __itt_group_none, NULL } \ } #endif /* _ITTNOTIFY_TYPES_H_ */ ================================================ FILE: third-party/tbb/src/tbb/tools_api/legacy/ittnotify.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef _LEGACY_ITTNOTIFY_H_ #define _LEGACY_ITTNOTIFY_H_ /** * @file * @brief Legacy User API functions and types */ /** @cond exclude_from_documentation */ #ifndef ITT_OS_WIN # define ITT_OS_WIN 1 #endif /* ITT_OS_WIN */ #ifndef ITT_OS_LINUX # define ITT_OS_LINUX 2 #endif /* ITT_OS_LINUX */ #ifndef ITT_OS_MAC # define ITT_OS_MAC 3 #endif /* ITT_OS_MAC */ #ifndef ITT_OS_FREEBSD # define ITT_OS_FREEBSD 4 #endif /* ITT_OS_FREEBSD */ #ifndef ITT_OS_OPENBSD # define ITT_OS_OPENBSD 5 #endif /* ITT_OS_OPENBSD */ #ifndef ITT_OS # if defined WIN32 || defined _WIN32 # define ITT_OS ITT_OS_WIN # elif defined( __APPLE__ ) && defined( __MACH__ ) # define ITT_OS ITT_OS_MAC # elif defined( __FreeBSD__ ) # define ITT_OS ITT_OS_FREEBSD # elif defined( __OpenBSD__ ) # define ITT_OS ITT_OS_OPENBSD # else # define ITT_OS ITT_OS_LINUX # endif #endif /* ITT_OS */ #ifndef ITT_PLATFORM_WIN # define ITT_PLATFORM_WIN 1 #endif /* ITT_PLATFORM_WIN */ #ifndef ITT_PLATFORM_POSIX # define ITT_PLATFORM_POSIX 2 #endif /* ITT_PLATFORM_POSIX */ 
#ifndef ITT_PLATFORM_MAC # define ITT_PLATFORM_MAC 3 #endif /* ITT_PLATFORM_MAC */ #ifndef ITT_PLATFORM_FREEBSD # define ITT_PLATFORM_FREEBSD 4 #endif /* ITT_PLATFORM_FREEBSD */ #ifndef ITT_PLATFORM_OPENBSD # define ITT_PLATFORM_OPENBSD 5 #endif /* ITT_PLATFORM_OPENBSD */ #ifndef ITT_PLATFORM # if ITT_OS==ITT_OS_WIN # define ITT_PLATFORM ITT_PLATFORM_WIN # elif ITT_OS==ITT_OS_MAC # define ITT_PLATFORM ITT_PLATFORM_MAC # elif ITT_OS==ITT_OS_FREEBSD # define ITT_PLATFORM ITT_PLATFORM_FREEBSD # elif ITT_OS==ITT_OS_OPENBSD # define ITT_PLATFORM ITT_PLATFORM_OPENBSD # else # define ITT_PLATFORM ITT_PLATFORM_POSIX # endif #endif /* ITT_PLATFORM */ #if defined(_UNICODE) && !defined(UNICODE) #define UNICODE #endif #include #if ITT_PLATFORM==ITT_PLATFORM_WIN #include #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #include #if defined(UNICODE) || defined(_UNICODE) #include #endif /* UNICODE || _UNICODE */ #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #ifndef ITTAPI_CDECL # if ITT_PLATFORM==ITT_PLATFORM_WIN # define ITTAPI_CDECL __cdecl # else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ # if defined _M_IX86 || defined __i386__ # define ITTAPI_CDECL __attribute__ ((cdecl)) # else /* _M_IX86 || __i386__ */ # define ITTAPI_CDECL /* actual only on x86 platform */ # endif /* _M_IX86 || __i386__ */ # endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* ITTAPI_CDECL */ #ifndef STDCALL # if ITT_PLATFORM==ITT_PLATFORM_WIN # define STDCALL __stdcall # else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ # if defined _M_IX86 || defined __i386__ # define STDCALL __attribute__ ((stdcall)) # else /* _M_IX86 || __i386__ */ # define STDCALL /* supported only on x86 platform */ # endif /* _M_IX86 || __i386__ */ # endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* STDCALL */ #define ITTAPI ITTAPI_CDECL #define LIBITTAPI ITTAPI_CDECL /* TODO: Temporary for compatibility! 
*/ #define ITTAPI_CALL ITTAPI_CDECL #define LIBITTAPI_CALL ITTAPI_CDECL #if ITT_PLATFORM==ITT_PLATFORM_WIN /* use __forceinline (VC++ specific) */ #if defined(__MINGW32__) && !defined(__cplusplus) #define ITT_INLINE static __inline__ __attribute__((__always_inline__,__gnu_inline__)) #else #define ITT_INLINE static __forceinline #endif /* __MINGW32__ */ #define ITT_INLINE_ATTRIBUTE /* nothing */ #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ /* * Generally, functions are not inlined unless optimization is specified. * For functions declared inline, this attribute inlines the function even * if no optimization level was specified. */ #ifdef __STRICT_ANSI__ #define ITT_INLINE static #define ITT_INLINE_ATTRIBUTE __attribute__((unused)) #else /* __STRICT_ANSI__ */ #define ITT_INLINE static inline #define ITT_INLINE_ATTRIBUTE __attribute__((always_inline, unused)) #endif /* __STRICT_ANSI__ */ #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ /** @endcond */ /** @cond exclude_from_documentation */ /* Helper macro for joining tokens */ #define ITT_JOIN_AUX(p,n) p##n #define ITT_JOIN(p,n) ITT_JOIN_AUX(p,n) #ifdef ITT_MAJOR #undef ITT_MAJOR #endif #ifdef ITT_MINOR #undef ITT_MINOR #endif #define ITT_MAJOR 3 #define ITT_MINOR 0 /* Standard versioning of a token with major and minor version numbers */ #define ITT_VERSIONIZE(x) \ ITT_JOIN(x, \ ITT_JOIN(_, \ ITT_JOIN(ITT_MAJOR, \ ITT_JOIN(_, ITT_MINOR)))) #ifndef INTEL_ITTNOTIFY_PREFIX # define INTEL_ITTNOTIFY_PREFIX __itt_ #endif /* INTEL_ITTNOTIFY_PREFIX */ #ifndef INTEL_ITTNOTIFY_POSTFIX # define INTEL_ITTNOTIFY_POSTFIX _ptr_ #endif /* INTEL_ITTNOTIFY_POSTFIX */ #define ITTNOTIFY_NAME_AUX(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX,n) #define ITTNOTIFY_NAME(n) ITT_VERSIONIZE(ITTNOTIFY_NAME_AUX(ITT_JOIN(n,INTEL_ITTNOTIFY_POSTFIX))) #define ITTNOTIFY_VOID(n) (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n) #define ITTNOTIFY_DATA(n) (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n) #define ITTNOTIFY_VOID_D0(n,d) (d == NULL) ? 
(void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d) #define ITTNOTIFY_VOID_D1(n,d,x) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x) #define ITTNOTIFY_VOID_D2(n,d,x,y) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y) #define ITTNOTIFY_VOID_D3(n,d,x,y,z) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z) #define ITTNOTIFY_VOID_D4(n,d,x,y,z,a) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a) #define ITTNOTIFY_VOID_D5(n,d,x,y,z,a,b) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b) #define ITTNOTIFY_VOID_D6(n,d,x,y,z,a,b,c) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c) #define ITTNOTIFY_DATA_D0(n,d) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d) #define ITTNOTIFY_DATA_D1(n,d,x) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x) #define ITTNOTIFY_DATA_D2(n,d,x,y) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y) #define ITTNOTIFY_DATA_D3(n,d,x,y,z) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z) #define ITTNOTIFY_DATA_D4(n,d,x,y,z,a) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a) #define ITTNOTIFY_DATA_D5(n,d,x,y,z,a,b) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b) #define ITTNOTIFY_DATA_D6(n,d,x,y,z,a,b,c) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 
0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c) #ifdef ITT_STUB #undef ITT_STUB #endif #ifdef ITT_STUBV #undef ITT_STUBV #endif #define ITT_STUBV(api,type,name,args) \ typedef type (api* ITT_JOIN(ITTNOTIFY_NAME(name),_t)) args; \ extern ITT_JOIN(ITTNOTIFY_NAME(name),_t) ITTNOTIFY_NAME(name); #define ITT_STUB ITT_STUBV /** @endcond */ #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ /** * @defgroup legacy Legacy API * @{ * @} */ /** * @defgroup legacy_control Collection Control * @ingroup legacy * General behavior: application continues to run, but no profiling information is being collected * * Pausing occurs not only for the current thread but for all process as well as spawned processes * - Intel(R) Parallel Inspector and Intel(R) Inspector XE: * - Does not analyze or report errors that involve memory access. * - Other errors are reported as usual. Pausing data collection in * Intel(R) Parallel Inspector and Intel(R) Inspector XE * only pauses tracing and analyzing memory access. * It does not pause tracing or analyzing threading APIs. * . * - Intel(R) VTune(TM) Profiler: * - Does continue to record when new threads are started. * . * - Other effects: * - Possible reduction of runtime overhead. * . 
* @{ */ #ifndef _ITTNOTIFY_H_ /** @brief Pause collection */ void ITTAPI __itt_pause(void); /** @brief Resume collection */ void ITTAPI __itt_resume(void); /** @brief Detach collection */ void ITTAPI __itt_detach(void); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, pause, (void)) ITT_STUBV(ITTAPI, void, resume, (void)) ITT_STUBV(ITTAPI, void, detach, (void)) #define __itt_pause ITTNOTIFY_VOID(pause) #define __itt_pause_ptr ITTNOTIFY_NAME(pause) #define __itt_resume ITTNOTIFY_VOID(resume) #define __itt_resume_ptr ITTNOTIFY_NAME(resume) #define __itt_detach ITTNOTIFY_VOID(detach) #define __itt_detach_ptr ITTNOTIFY_NAME(detach) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_pause() #define __itt_pause_ptr 0 #define __itt_resume() #define __itt_resume_ptr 0 #define __itt_detach() #define __itt_detach_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_pause_ptr 0 #define __itt_resume_ptr 0 #define __itt_detach_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ #endif /* _ITTNOTIFY_H_ */ /** @} legacy_control group */ /** * @defgroup legacy_threads Threads * @ingroup legacy * Threads group * @warning Legacy API * @{ */ /** * @deprecated Legacy API * @brief Set name to be associated with thread in analysis GUI. 
* @return __itt_err upon failure (name or namelen being null,name and namelen mismatched) */ #if ITT_PLATFORM==ITT_PLATFORM_WIN int LIBITTAPI __itt_thr_name_setA(const char *name, int namelen); int LIBITTAPI __itt_thr_name_setW(const wchar_t *name, int namelen); #if defined(UNICODE) || defined(_UNICODE) # define __itt_thr_name_set __itt_thr_name_setW # define __itt_thr_name_set_ptr __itt_thr_name_setW_ptr #else # define __itt_thr_name_set __itt_thr_name_setA # define __itt_thr_name_set_ptr __itt_thr_name_setA_ptr #endif /* UNICODE */ #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ int LIBITTAPI __itt_thr_name_set(const char *name, int namelen); #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(LIBITTAPI, int, thr_name_setA, (const char *name, int namelen)) ITT_STUB(LIBITTAPI, int, thr_name_setW, (const wchar_t *name, int namelen)) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUB(LIBITTAPI, int, thr_name_set, (const char *name, int namelen)) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_thr_name_setA ITTNOTIFY_DATA(thr_name_setA) #define __itt_thr_name_setA_ptr ITTNOTIFY_NAME(thr_name_setA) #define __itt_thr_name_setW ITTNOTIFY_DATA(thr_name_setW) #define __itt_thr_name_setW_ptr ITTNOTIFY_NAME(thr_name_setW) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_thr_name_set ITTNOTIFY_DATA(thr_name_set) #define __itt_thr_name_set_ptr ITTNOTIFY_NAME(thr_name_set) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #else /* INTEL_NO_ITTNOTIFY_API */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_thr_name_setA(name, namelen) #define __itt_thr_name_setA_ptr 0 #define __itt_thr_name_setW(name, namelen) #define __itt_thr_name_setW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_thr_name_set(name, namelen) #define __itt_thr_name_set_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ 
#endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_thr_name_setA_ptr 0 #define __itt_thr_name_setW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_thr_name_set_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @deprecated Legacy API * @brief Mark current thread as ignored from this point on, for the duration of its existence. */ void LIBITTAPI __itt_thr_ignore(void); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(LIBITTAPI, void, thr_ignore, (void)) #define __itt_thr_ignore ITTNOTIFY_VOID(thr_ignore) #define __itt_thr_ignore_ptr ITTNOTIFY_NAME(thr_ignore) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_thr_ignore() #define __itt_thr_ignore_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_thr_ignore_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @} legacy_threads group */ /** * @defgroup legacy_sync Synchronization * @ingroup legacy * Synchronization group * @warning Legacy API * @{ */ /** * @hideinitializer * @brief possible value of attribute argument for sync object type */ #define __itt_attr_barrier 1 /** * @hideinitializer * @brief possible value of attribute argument for sync object type */ #define __itt_attr_mutex 2 /** * @deprecated Legacy API * @brief Assign a name to a sync object using char or Unicode string * @param[in] addr - pointer to the sync object. You should use a real pointer to your object * to make sure that the values don't clash with other object addresses * @param[in] objtype - null-terminated object type string. If NULL is passed, the object will * be assumed to be of generic "User Synchronization" type * @param[in] objname - null-terminated object name string. 
If NULL, no name will be assigned * to the object -- you can use the __itt_sync_rename call later to assign * the name * @param[in] attribute - one of [#__itt_attr_barrier, #__itt_attr_mutex] values which defines the * exact semantics of how prepare/acquired/releasing calls work. */ #if ITT_PLATFORM==ITT_PLATFORM_WIN void ITTAPI __itt_sync_set_nameA(void *addr, const char *objtype, const char *objname, int attribute); void ITTAPI __itt_sync_set_nameW(void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute); #if defined(UNICODE) || defined(_UNICODE) # define __itt_sync_set_name __itt_sync_set_nameW # define __itt_sync_set_name_ptr __itt_sync_set_nameW_ptr #else /* UNICODE */ # define __itt_sync_set_name __itt_sync_set_nameA # define __itt_sync_set_name_ptr __itt_sync_set_nameA_ptr #endif /* UNICODE */ #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ void ITTAPI __itt_sync_set_name(void *addr, const char* objtype, const char* objname, int attribute); #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUBV(ITTAPI, void, sync_set_nameA, (void *addr, const char *objtype, const char *objname, int attribute)) ITT_STUBV(ITTAPI, void, sync_set_nameW, (void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute)) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUBV(ITTAPI, void, sync_set_name, (void *addr, const char *objtype, const char *objname, int attribute)) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_sync_set_nameA ITTNOTIFY_VOID(sync_set_nameA) #define __itt_sync_set_nameA_ptr ITTNOTIFY_NAME(sync_set_nameA) #define __itt_sync_set_nameW ITTNOTIFY_VOID(sync_set_nameW) #define __itt_sync_set_nameW_ptr ITTNOTIFY_NAME(sync_set_nameW) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_sync_set_name ITTNOTIFY_VOID(sync_set_name) #define __itt_sync_set_name_ptr 
ITTNOTIFY_NAME(sync_set_name) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #else /* INTEL_NO_ITTNOTIFY_API */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_sync_set_nameA(addr, objtype, objname, attribute) #define __itt_sync_set_nameA_ptr 0 #define __itt_sync_set_nameW(addr, objtype, objname, attribute) #define __itt_sync_set_nameW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_sync_set_name(addr, objtype, objname, attribute) #define __itt_sync_set_name_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_sync_set_nameA_ptr 0 #define __itt_sync_set_nameW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_sync_set_name_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @deprecated Legacy API * @brief Assign a name and type to a sync object using char or Unicode string * @param[in] addr - pointer to the sync object. You should use a real pointer to your object * to make sure that the values don't clash with other object addresses * @param[in] objtype - null-terminated object type string. If NULL is passed, the object will * be assumed to be of generic "User Synchronization" type * @param[in] objname - null-terminated object name string. If NULL, no name will be assigned * to the object -- you can use the __itt_sync_rename call later to assign * the name * @param[in] typelen, namelen - a length of string for appropriate objtype and objname parameter * @param[in] attribute - one of [#__itt_attr_barrier, #__itt_attr_mutex] values which defines the * exact semantics of how prepare/acquired/releasing calls work. 
* @return __itt_err upon failure (name or namelen being null,name and namelen mismatched) */ #if ITT_PLATFORM==ITT_PLATFORM_WIN int LIBITTAPI __itt_notify_sync_nameA(void *addr, const char *objtype, int typelen, const char *objname, int namelen, int attribute); int LIBITTAPI __itt_notify_sync_nameW(void *addr, const wchar_t *objtype, int typelen, const wchar_t *objname, int namelen, int attribute); #if defined(UNICODE) || defined(_UNICODE) # define __itt_notify_sync_name __itt_notify_sync_nameW #else # define __itt_notify_sync_name __itt_notify_sync_nameA #endif #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ int LIBITTAPI __itt_notify_sync_name(void *addr, const char *objtype, int typelen, const char *objname, int namelen, int attribute); #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(LIBITTAPI, int, notify_sync_nameA, (void *addr, const char *objtype, int typelen, const char *objname, int namelen, int attribute)) ITT_STUB(LIBITTAPI, int, notify_sync_nameW, (void *addr, const wchar_t *objtype, int typelen, const wchar_t *objname, int namelen, int attribute)) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUB(LIBITTAPI, int, notify_sync_name, (void *addr, const char *objtype, int typelen, const char *objname, int namelen, int attribute)) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_notify_sync_nameA ITTNOTIFY_DATA(notify_sync_nameA) #define __itt_notify_sync_nameA_ptr ITTNOTIFY_NAME(notify_sync_nameA) #define __itt_notify_sync_nameW ITTNOTIFY_DATA(notify_sync_nameW) #define __itt_notify_sync_nameW_ptr ITTNOTIFY_NAME(notify_sync_nameW) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_notify_sync_name ITTNOTIFY_DATA(notify_sync_name) #define __itt_notify_sync_name_ptr ITTNOTIFY_NAME(notify_sync_name) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #else /* INTEL_NO_ITTNOTIFY_API */ 
#if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_notify_sync_nameA(addr, objtype, typelen, objname, namelen, attribute) #define __itt_notify_sync_nameA_ptr 0 #define __itt_notify_sync_nameW(addr, objtype, typelen, objname, namelen, attribute) #define __itt_notify_sync_nameW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_notify_sync_name(addr, objtype, typelen, objname, namelen, attribute) #define __itt_notify_sync_name_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_notify_sync_nameA_ptr 0 #define __itt_notify_sync_nameW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_notify_sync_name_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @deprecated Legacy API * @brief Enter spin loop on user-defined sync object */ void LIBITTAPI __itt_notify_sync_prepare(void* addr); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(LIBITTAPI, void, notify_sync_prepare, (void *addr)) #define __itt_notify_sync_prepare ITTNOTIFY_VOID(notify_sync_prepare) #define __itt_notify_sync_prepare_ptr ITTNOTIFY_NAME(notify_sync_prepare) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_notify_sync_prepare(addr) #define __itt_notify_sync_prepare_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_notify_sync_prepare_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @deprecated Legacy API * @brief Quit spin loop without acquiring spin object */ void LIBITTAPI __itt_notify_sync_cancel(void *addr); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(LIBITTAPI, void, notify_sync_cancel, (void *addr)) #define __itt_notify_sync_cancel ITTNOTIFY_VOID(notify_sync_cancel) #define __itt_notify_sync_cancel_ptr ITTNOTIFY_NAME(notify_sync_cancel) 
#else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_notify_sync_cancel(addr) #define __itt_notify_sync_cancel_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_notify_sync_cancel_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @deprecated Legacy API * @brief Successful spin loop completion (sync object acquired) */ void LIBITTAPI __itt_notify_sync_acquired(void *addr); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(LIBITTAPI, void, notify_sync_acquired, (void *addr)) #define __itt_notify_sync_acquired ITTNOTIFY_VOID(notify_sync_acquired) #define __itt_notify_sync_acquired_ptr ITTNOTIFY_NAME(notify_sync_acquired) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_notify_sync_acquired(addr) #define __itt_notify_sync_acquired_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_notify_sync_acquired_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @deprecated Legacy API * @brief Start sync object releasing code. Is called before the lock release call. 
*/ void LIBITTAPI __itt_notify_sync_releasing(void* addr); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(LIBITTAPI, void, notify_sync_releasing, (void *addr)) #define __itt_notify_sync_releasing ITTNOTIFY_VOID(notify_sync_releasing) #define __itt_notify_sync_releasing_ptr ITTNOTIFY_NAME(notify_sync_releasing) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_notify_sync_releasing(addr) #define __itt_notify_sync_releasing_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_notify_sync_releasing_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @} legacy_sync group */ #ifndef _ITTNOTIFY_H_ /** * @defgroup legacy_events Events * @ingroup legacy * Events group * @{ */ /** @brief user event type */ typedef int __itt_event; /** * @brief Create an event notification * @note name or namelen being null/name and namelen not matching, user event feature not enabled * @return non-zero event identifier upon success and __itt_err otherwise */ #if ITT_PLATFORM==ITT_PLATFORM_WIN __itt_event LIBITTAPI __itt_event_createA(const char *name, int namelen); __itt_event LIBITTAPI __itt_event_createW(const wchar_t *name, int namelen); #if defined(UNICODE) || defined(_UNICODE) # define __itt_event_create __itt_event_createW # define __itt_event_create_ptr __itt_event_createW_ptr #else # define __itt_event_create __itt_event_createA # define __itt_event_create_ptr __itt_event_createA_ptr #endif /* UNICODE */ #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ __itt_event LIBITTAPI __itt_event_create(const char *name, int namelen); #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(LIBITTAPI, __itt_event, event_createA, (const char *name, int namelen)) ITT_STUB(LIBITTAPI, __itt_event, event_createW, (const wchar_t *name, int namelen)) #else /* 
ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUB(LIBITTAPI, __itt_event, event_create, (const char *name, int namelen)) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_event_createA ITTNOTIFY_DATA(event_createA) #define __itt_event_createA_ptr ITTNOTIFY_NAME(event_createA) #define __itt_event_createW ITTNOTIFY_DATA(event_createW) #define __itt_event_createW_ptr ITTNOTIFY_NAME(event_createW) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_event_create ITTNOTIFY_DATA(event_create) #define __itt_event_create_ptr ITTNOTIFY_NAME(event_create) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #else /* INTEL_NO_ITTNOTIFY_API */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_event_createA(name, namelen) (__itt_event)0 #define __itt_event_createA_ptr 0 #define __itt_event_createW(name, namelen) (__itt_event)0 #define __itt_event_createW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_event_create(name, namelen) (__itt_event)0 #define __itt_event_create_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_event_createA_ptr 0 #define __itt_event_createW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_event_create_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief Record an event occurrence. 
* @return __itt_err upon failure (invalid event id/user event feature not enabled) */ int LIBITTAPI __itt_event_start(__itt_event event); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUB(LIBITTAPI, int, event_start, (__itt_event event)) #define __itt_event_start ITTNOTIFY_DATA(event_start) #define __itt_event_start_ptr ITTNOTIFY_NAME(event_start) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_event_start(event) (int)0 #define __itt_event_start_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_event_start_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @brief Record an event end occurrence. * @note It is optional if events do not have durations. * @return __itt_err upon failure (invalid event id/user event feature not enabled) */ int LIBITTAPI __itt_event_end(__itt_event event); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUB(LIBITTAPI, int, event_end, (__itt_event event)) #define __itt_event_end ITTNOTIFY_DATA(event_end) #define __itt_event_end_ptr ITTNOTIFY_NAME(event_end) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_event_end(event) (int)0 #define __itt_event_end_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_event_end_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @} legacy_events group */ #endif /* _ITTNOTIFY_H_ */ /** * @defgroup legacy_memory Memory Accesses * @ingroup legacy */ /** * @deprecated Legacy API * @brief Inform the tool of memory accesses on reading */ void LIBITTAPI __itt_memory_read(void *addr, size_t size); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(LIBITTAPI, void, memory_read, (void *addr, size_t size)) #define __itt_memory_read ITTNOTIFY_VOID(memory_read) #define __itt_memory_read_ptr ITTNOTIFY_NAME(memory_read) #else /* INTEL_NO_ITTNOTIFY_API */ 
#define __itt_memory_read(addr, size) #define __itt_memory_read_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_memory_read_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @deprecated Legacy API * @brief Inform the tool of memory accesses on writing */ void LIBITTAPI __itt_memory_write(void *addr, size_t size); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(LIBITTAPI, void, memory_write, (void *addr, size_t size)) #define __itt_memory_write ITTNOTIFY_VOID(memory_write) #define __itt_memory_write_ptr ITTNOTIFY_NAME(memory_write) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_memory_write(addr, size) #define __itt_memory_write_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_memory_write_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @deprecated Legacy API * @brief Inform the tool of memory accesses on updating */ void LIBITTAPI __itt_memory_update(void *address, size_t size); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(LIBITTAPI, void, memory_update, (void *addr, size_t size)) #define __itt_memory_update ITTNOTIFY_VOID(memory_update) #define __itt_memory_update_ptr ITTNOTIFY_NAME(memory_update) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_memory_update(addr, size) #define __itt_memory_update_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_memory_update_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @} legacy_memory group */ /** * @defgroup legacy_state Thread and Object States * @ingroup legacy */ /** @brief state type */ typedef int __itt_state_t; /** @cond exclude_from_documentation */ typedef enum __itt_obj_state { __itt_obj_state_err = 0, __itt_obj_state_clr = 1, __itt_obj_state_set = 2, __itt_obj_state_use = 3 } __itt_obj_state_t; typedef enum __itt_thr_state { 
__itt_thr_state_err = 0, __itt_thr_state_clr = 1, __itt_thr_state_set = 2 } __itt_thr_state_t; typedef enum __itt_obj_prop { __itt_obj_prop_watch = 1, __itt_obj_prop_ignore = 2, __itt_obj_prop_sharable = 3 } __itt_obj_prop_t; typedef enum __itt_thr_prop { __itt_thr_prop_quiet = 1 } __itt_thr_prop_t; /** @endcond */ /** * @deprecated Legacy API * @brief managing thread and object states */ __itt_state_t LIBITTAPI __itt_state_get(void); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUB(ITTAPI, __itt_state_t, state_get, (void)) #define __itt_state_get ITTNOTIFY_DATA(state_get) #define __itt_state_get_ptr ITTNOTIFY_NAME(state_get) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_state_get(void) (__itt_state_t)0 #define __itt_state_get_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_state_get_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @deprecated Legacy API * @brief managing thread and object states */ __itt_state_t LIBITTAPI __itt_state_set(__itt_state_t s); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUB(ITTAPI, __itt_state_t, state_set, (__itt_state_t s)) #define __itt_state_set ITTNOTIFY_DATA(state_set) #define __itt_state_set_ptr ITTNOTIFY_NAME(state_set) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_state_set(s) (__itt_state_t)0 #define __itt_state_set_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_state_set_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @deprecated Legacy API * @brief managing thread and object modes */ __itt_thr_state_t LIBITTAPI __itt_thr_mode_set(__itt_thr_prop_t p, __itt_thr_state_t s); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUB(ITTAPI, __itt_thr_state_t, thr_mode_set, (__itt_thr_prop_t p, __itt_thr_state_t s)) #define __itt_thr_mode_set 
ITTNOTIFY_DATA(thr_mode_set) #define __itt_thr_mode_set_ptr ITTNOTIFY_NAME(thr_mode_set) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_thr_mode_set(p, s) (__itt_thr_state_t)0 #define __itt_thr_mode_set_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_thr_mode_set_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** * @deprecated Legacy API * @brief managing thread and object modes */ __itt_obj_state_t LIBITTAPI __itt_obj_mode_set(__itt_obj_prop_t p, __itt_obj_state_t s); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUB(ITTAPI, __itt_obj_state_t, obj_mode_set, (__itt_obj_prop_t p, __itt_obj_state_t s)) #define __itt_obj_mode_set ITTNOTIFY_DATA(obj_mode_set) #define __itt_obj_mode_set_ptr ITTNOTIFY_NAME(obj_mode_set) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_obj_mode_set(p, s) (__itt_obj_state_t)0 #define __itt_obj_mode_set_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_obj_mode_set_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @} legacy_state group */ /** * @defgroup frames Frames * @ingroup legacy * Frames group * @{ */ /** * @brief opaque structure for frame identification */ typedef struct __itt_frame_t *__itt_frame; /** * @brief Create a global frame with given domain */ #if ITT_PLATFORM==ITT_PLATFORM_WIN __itt_frame ITTAPI __itt_frame_createA(const char *domain); __itt_frame ITTAPI __itt_frame_createW(const wchar_t *domain); #if defined(UNICODE) || defined(_UNICODE) # define __itt_frame_create __itt_frame_createW # define __itt_frame_create_ptr __itt_frame_createW_ptr #else /* UNICODE */ # define __itt_frame_create __itt_frame_createA # define __itt_frame_create_ptr __itt_frame_createA_ptr #endif /* UNICODE */ #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ __itt_frame ITTAPI __itt_frame_create(const char *domain); #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ /** @cond exclude_from_documentation */ 
#ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API #if ITT_PLATFORM==ITT_PLATFORM_WIN ITT_STUB(ITTAPI, __itt_frame, frame_createA, (const char *domain)) ITT_STUB(ITTAPI, __itt_frame, frame_createW, (const wchar_t *domain)) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ ITT_STUB(ITTAPI, __itt_frame, frame_create, (const char *domain)) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_frame_createA ITTNOTIFY_DATA(frame_createA) #define __itt_frame_createA_ptr ITTNOTIFY_NAME(frame_createA) #define __itt_frame_createW ITTNOTIFY_DATA(frame_createW) #define __itt_frame_createW_ptr ITTNOTIFY_NAME(frame_createW) #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_frame_create ITTNOTIFY_DATA(frame_create) #define __itt_frame_create_ptr ITTNOTIFY_NAME(frame_create) #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #else /* INTEL_NO_ITTNOTIFY_API */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_frame_createA(domain) #define __itt_frame_createA_ptr 0 #define __itt_frame_createW(domain) #define __itt_frame_createW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_frame_create(domain) #define __itt_frame_create_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #if ITT_PLATFORM==ITT_PLATFORM_WIN #define __itt_frame_createA_ptr 0 #define __itt_frame_createW_ptr 0 #else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #define __itt_frame_create_ptr 0 #endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @brief Record a frame begin occurrence. */ void ITTAPI __itt_frame_begin(__itt_frame frame); /** @brief Record a frame end occurrence. 
*/ void ITTAPI __itt_frame_end (__itt_frame frame); /** @cond exclude_from_documentation */ #ifndef INTEL_NO_MACRO_BODY #ifndef INTEL_NO_ITTNOTIFY_API ITT_STUBV(ITTAPI, void, frame_begin, (__itt_frame frame)) ITT_STUBV(ITTAPI, void, frame_end, (__itt_frame frame)) #define __itt_frame_begin ITTNOTIFY_VOID(frame_begin) #define __itt_frame_begin_ptr ITTNOTIFY_NAME(frame_begin) #define __itt_frame_end ITTNOTIFY_VOID(frame_end) #define __itt_frame_end_ptr ITTNOTIFY_NAME(frame_end) #else /* INTEL_NO_ITTNOTIFY_API */ #define __itt_frame_begin(frame) #define __itt_frame_begin_ptr 0 #define __itt_frame_end(frame) #define __itt_frame_end_ptr 0 #endif /* INTEL_NO_ITTNOTIFY_API */ #else /* INTEL_NO_MACRO_BODY */ #define __itt_frame_begin_ptr 0 #define __itt_frame_end_ptr 0 #endif /* INTEL_NO_MACRO_BODY */ /** @endcond */ /** @} frames group */ #ifdef __cplusplus } #endif /* __cplusplus */ #endif /* _LEGACY_ITTNOTIFY_H_ */ ================================================ FILE: third-party/tbb/src/tbb/version.cpp ================================================ /* Copyright (c) 2020-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "oneapi/tbb/version.h" extern "C" int TBB_runtime_interface_version() { return TBB_INTERFACE_VERSION; } extern "C" const char* TBB_runtime_version() { static const char version_str[] = TBB_VERSION_STRING; return version_str; } ================================================ FILE: third-party/tbb/src/tbb/waiters.h ================================================ /* Copyright (c) 2005-2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef _TBB_waiters_H #define _TBB_waiters_H #include "oneapi/tbb/detail/_task.h" #include "scheduler_common.h" #include "arena.h" #include "threading_control.h" namespace tbb { namespace detail { namespace r1 { inline d1::task* get_self_recall_task(arena_slot& slot); class waiter_base { public: waiter_base(arena& a, int yields_multiplier = 1) : my_arena(a), my_backoff(int(a.my_num_slots), yields_multiplier) {} bool pause() { if (my_backoff.pause()) { my_arena.out_of_work(); return true; } return false; } void reset_wait() { my_backoff.reset_wait(); } protected: arena& my_arena; stealing_loop_backoff my_backoff; }; class outermost_worker_waiter : public waiter_base { public: using waiter_base::waiter_base; bool continue_execution(arena_slot& slot, d1::task*& t) { __TBB_ASSERT(t == nullptr, nullptr); if (is_worker_should_leave(slot)) { if (is_delayed_leave_enabled()) { static constexpr std::chrono::microseconds worker_wait_leave_duration(1000); static_assert(worker_wait_leave_duration > std::chrono::steady_clock::duration(1), "Clock 
resolution is not enough for measured interval."); for (auto t1 = std::chrono::steady_clock::now(), t2 = t1; std::chrono::duration_cast(t2 - t1) < worker_wait_leave_duration; t2 = std::chrono::steady_clock::now()) { if (!my_arena.is_empty() && !my_arena.is_recall_requested()) { return true; } if (!my_arena.my_thread_leave.is_retention_allowed() || my_arena.my_threading_control->is_any_other_client_active()) { break; } d0::yield(); } } // Leave dispatch loop return false; } t = get_self_recall_task(slot); return true; } void pause(arena_slot&) { waiter_base::pause(); } d1::wait_context* wait_ctx() { return nullptr; } static bool postpone_execution(d1::task&) { return false; } private: using base_type = waiter_base; bool is_delayed_leave_enabled() { #if __TBB_PREVIEW_PARALLEL_PHASE return my_arena.my_thread_leave.is_retention_allowed(); #else return !governor::hybrid_cpu(); #endif } bool is_worker_should_leave(arena_slot& slot) const { bool is_top_priority_arena = my_arena.is_top_priority(); bool is_task_pool_empty = slot.task_pool.load(std::memory_order_relaxed) == EmptyTaskPool; if (is_top_priority_arena) { // Worker in most priority arena do not leave arena, until all work in task_pool is done if (is_task_pool_empty && my_arena.is_recall_requested()) { return true; } } else { if (my_arena.is_recall_requested()) { // If worker has work in task pool, we must notify other threads, // because can appear missed wake up of other threads if (!is_task_pool_empty) { my_arena.advertise_new_work(); } return true; } } return false; } }; class sleep_waiter : public waiter_base { protected: using waiter_base::waiter_base; template void sleep(std::uintptr_t uniq_tag, Pred wakeup_condition) { my_arena.get_waiting_threads_monitor().wait(wakeup_condition, market_context{uniq_tag, &my_arena}); reset_wait(); } }; class external_waiter : public sleep_waiter { public: external_waiter(arena& a, d1::wait_context& wo) : sleep_waiter(a, /*yields_multiplier*/10), my_wait_ctx(wo) {} bool 
continue_execution(arena_slot& slot, d1::task*& t) const { __TBB_ASSERT(t == nullptr, nullptr); if (!my_wait_ctx.continue_execution()) return false; t = get_self_recall_task(slot); return true; } void pause(arena_slot&) { if (!sleep_waiter::pause()) { return; } auto wakeup_condition = [&] { return !my_arena.is_empty() || !my_wait_ctx.continue_execution(); }; sleep(std::uintptr_t(&my_wait_ctx), wakeup_condition); } d1::wait_context* wait_ctx() { return &my_wait_ctx; } static bool postpone_execution(d1::task&) { return false; } private: d1::wait_context& my_wait_ctx; }; #if __TBB_RESUMABLE_TASKS class coroutine_waiter : public sleep_waiter { public: using sleep_waiter::sleep_waiter; bool continue_execution(arena_slot& slot, d1::task*& t) const { __TBB_ASSERT(t == nullptr, nullptr); t = get_self_recall_task(slot); return true; } void pause(arena_slot& slot) { if (!sleep_waiter::pause()) { return; } suspend_point_type* sp = slot.default_task_dispatcher().m_suspend_point; auto wakeup_condition = [&] { return !my_arena.is_empty() || sp->m_is_owner_recalled.load(std::memory_order_relaxed); }; sleep(std::uintptr_t(sp), wakeup_condition); } d1::wait_context* wait_ctx() { return nullptr; } static bool postpone_execution(d1::task& t) { return task_accessor::is_resume_task(t); } }; #endif // __TBB_RESUMABLE_TASKS } // namespace r1 } // namespace detail } // namespace tbb #endif // _TBB_waiters_H ================================================ FILE: third-party/tbb/src/tbbbind/CMakeLists.txt ================================================ # Copyright (c) 2020-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set(CMAKE_SKIP_BUILD_RPATH TRUE)

# tbbbind_build(<TBBBIND_NAME> <REQUIRED_HWLOC_TARGET>)
#
# Creates library target <TBBBIND_NAME> (plus the alias TBB::<TBBBIND_NAME>) from
# tbb_bind.cpp and links it publicly against <REQUIRED_HWLOC_TARGET>.
# When <REQUIRED_HWLOC_TARGET> is not an existing target, a status message is
# printed and no target is created.
function(tbbbind_build TBBBIND_NAME REQUIRED_HWLOC_TARGET)
    if (NOT TARGET ${REQUIRED_HWLOC_TARGET})
        message(STATUS "HWLOC target ${REQUIRED_HWLOC_TARGET} doesn't exist."
                       " The ${TBBBIND_NAME} target cannot be created")
        return()
    endif()
    add_library(${TBBBIND_NAME} tbb_bind.cpp)

    if (WIN32)
        target_sources(${TBBBIND_NAME} PRIVATE tbb_bind.rc)
    endif()

    add_library(TBB::${TBBBIND_NAME} ALIAS ${TBBBIND_NAME})

    # NOTE(review): the condition of this generator expression was missing in this copy
    # ("$<$:TBB_USE_DEBUG>"); $<CONFIG:DEBUG> is assumed - confirm.
    target_compile_definitions(${TBBBIND_NAME}
                               PUBLIC
                               $<$<CONFIG:DEBUG>:TBB_USE_DEBUG>
                               PRIVATE
                               __TBBBIND_BUILD)

    # NOTE(review): the first two entries were reduced to a bare "$" in this copy. They are
    # reconstructed as a BUILD_INTERFACE/INSTALL_INTERFACE include pair; the paths are an
    # assumption - confirm before relying on them.
    target_include_directories(${TBBBIND_NAME}
        PUBLIC
        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../../include>
        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
        ${HWLOC_INCLUDE_DIRS} # pkg-config defined
    )

    target_compile_options(${TBBBIND_NAME}
        PRIVATE
        ${TBB_CXX_STD_FLAG} # TODO: consider making it PUBLIC.
        ${TBB_MMD_FLAG}
        ${TBB_DSE_FLAG}
        ${TBB_WARNING_LEVEL}
        ${TBB_LIB_COMPILE_FLAGS}
        ${TBB_COMMON_COMPILE_FLAGS}
    )

    # Avoid use of target_link_libraries here as it changes /DEF option to \DEF on Windows.
    set_target_properties(${TBBBIND_NAME} PROPERTIES
        DEFINE_SYMBOL ""
        VERSION ${TBBBIND_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION}
        SOVERSION ${TBBBIND_BINARY_VERSION}
    )

    tbb_handle_ipo(${TBBBIND_NAME})

    if (TBB_DEF_FILE_PREFIX) # If there's no prefix, assume we're using export directives
        set_target_properties(${TBBBIND_NAME} PROPERTIES
            LINK_FLAGS "${TBB_LINK_DEF_FILE_FLAG}\"${CMAKE_CURRENT_SOURCE_DIR}/def/${TBB_DEF_FILE_PREFIX}-tbbbind.def\""
            LINK_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/def/${TBB_DEF_FILE_PREFIX}-tbbbind.def"
        )
    endif()

    # Prefer using target_link_options instead of target_link_libraries to specify link options because
    # target_link_libraries may incorrectly handle some options (on Windows, for example).
    if (COMMAND target_link_options)
        target_link_options(${TBBBIND_NAME}
            PRIVATE
            ${TBB_LIB_LINK_FLAGS}
            ${TBB_COMMON_LINK_FLAGS}
        )
    else()
        target_link_libraries(${TBBBIND_NAME}
            PRIVATE
            ${TBB_LIB_LINK_FLAGS}
            ${TBB_COMMON_LINK_FLAGS}
        )
    endif()

    target_link_libraries(${TBBBIND_NAME}
        PUBLIC
        ${REQUIRED_HWLOC_TARGET}
        PRIVATE
        ${TBB_LIB_LINK_LIBS}
        ${TBB_COMMON_LINK_LIBS}
    )

    tbb_install_target(${TBBBIND_NAME})

endfunction()

# With a pkg-config provided HWLOC (and no explicitly defined HWLOC target) build the single
# library named by TBBBIND_LIBRARY_NAME; otherwise try one library per known HWLOC target.
if (NOT DEFINED HWLOC_TARGET_EXPLICITLY_DEFINED AND TARGET PkgConfig::HWLOC)
    message(STATUS "The ${TBBBIND_LIBRARY_NAME} target will be configured using the HWLOC ${HWLOC_VERSION}")
    tbbbind_build(${TBBBIND_LIBRARY_NAME} PkgConfig::HWLOC)
else()
    tbbbind_build(tbbbind     HWLOC::hwloc_1_11)
    tbbbind_build(tbbbind_2_0 HWLOC::hwloc_2   )
    tbbbind_build(tbbbind_2_5 HWLOC::hwloc_2_5 )
endif()

================================================
FILE: third-party/tbb/src/tbbbind/def/lin32-tbbbind.def
================================================
/*
    Copyright (c) 2019-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ { global: __TBB_internal_initialize_system_topology; __TBB_internal_apply_affinity; __TBB_internal_restore_affinity; __TBB_internal_allocate_binding_handler; __TBB_internal_deallocate_binding_handler; __TBB_internal_get_default_concurrency; __TBB_internal_destroy_system_topology; }; ================================================ FILE: third-party/tbb/src/tbbbind/def/lin64-tbbbind.def ================================================ /* Copyright (c) 2019-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ { global: __TBB_internal_initialize_system_topology; __TBB_internal_apply_affinity; __TBB_internal_restore_affinity; __TBB_internal_allocate_binding_handler; __TBB_internal_deallocate_binding_handler; __TBB_internal_get_default_concurrency; __TBB_internal_destroy_system_topology; }; ================================================ FILE: third-party/tbb/src/tbbbind/def/mac64-tbbbind.def ================================================ # Copyright (c) 2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ___TBB_internal_initialize_system_topology ___TBB_internal_get_default_concurrency ___TBB_internal_destroy_system_topology ================================================ FILE: third-party/tbb/src/tbbbind/def/win32-tbbbind.def ================================================ ; Copyright (c) 2019-2021 Intel Corporation ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. ; You may obtain a copy of the License at ; ; http://www.apache.org/licenses/LICENSE-2.0 ; ; Unless required by applicable law or agreed to in writing, software ; distributed under the License is distributed on an "AS IS" BASIS, ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ; See the License for the specific language governing permissions and ; limitations under the License. 
EXPORTS __TBB_internal_initialize_system_topology __TBB_internal_apply_affinity __TBB_internal_restore_affinity __TBB_internal_allocate_binding_handler __TBB_internal_deallocate_binding_handler __TBB_internal_get_default_concurrency __TBB_internal_destroy_system_topology ================================================ FILE: third-party/tbb/src/tbbbind/def/win64-tbbbind.def ================================================ ; Copyright (c) 2019-2021 Intel Corporation ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. ; You may obtain a copy of the License at ; ; http://www.apache.org/licenses/LICENSE-2.0 ; ; Unless required by applicable law or agreed to in writing, software ; distributed under the License is distributed on an "AS IS" BASIS, ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ; See the License for the specific language governing permissions and ; limitations under the License. EXPORTS __TBB_internal_initialize_system_topology __TBB_internal_apply_affinity __TBB_internal_restore_affinity __TBB_internal_allocate_binding_handler __TBB_internal_deallocate_binding_handler __TBB_internal_get_default_concurrency __TBB_internal_destroy_system_topology ================================================ FILE: third-party/tbb/src/tbbbind/tbb_bind.cpp ================================================ /* Copyright (c) 2019-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include #include "../tbb/assert_impl.h" // Out-of-line TBB assertion handling routines are instantiated here. #include "oneapi/tbb/detail/_assert.h" #include "oneapi/tbb/detail/_config.h" #if _MSC_VER && !__INTEL_COMPILER && !__clang__ #pragma warning( push ) #pragma warning( disable : 4100 ) #elif _MSC_VER && __clang__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" #endif #include #if _MSC_VER && !__INTEL_COMPILER && !__clang__ #pragma warning( pop ) #elif _MSC_VER && __clang__ #pragma GCC diagnostic pop #endif #define __TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT (HWLOC_API_VERSION >= 0x20400) #define __TBBBIND_HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT (HWLOC_API_VERSION >= 0x20500) #define __TBBBIND_HWLOC_WINDOWS_API_AVAILABLE (_WIN32 && HWLOC_API_VERSION >= 0x20500) #if __TBBBIND_HWLOC_WINDOWS_API_AVAILABLE #include #endif // Most hwloc calls return a negative exit code on error. // This macro tracks error codes that are returned from the hwloc interfaces. #define assertion_hwloc_wrapper(command, ...) 
\ __TBB_ASSERT_EX( (command(__VA_ARGS__)) >= 0, "Error occurred during call to hwloc API."); namespace tbb { namespace detail { namespace r1 { //------------------------------------------------------------------------ // Information about the machine's hardware TBB happens to work on //------------------------------------------------------------------------ class system_topology { friend class binding_handler; // Common topology members hwloc_topology_t topology{nullptr}; hwloc_cpuset_t process_cpu_affinity_mask{nullptr}; hwloc_nodeset_t process_node_affinity_mask{nullptr}; std::size_t number_of_processors_groups{1}; std::vector processor_groups_affinity_masks_list{}; // NUMA API related topology members std::vector numa_affinity_masks_list{}; std::vector numa_indexes_list{}; int numa_nodes_count{0}; // Hybrid CPUs API related topology members std::vector core_types_affinity_masks_list{}; std::vector core_types_indexes_list{}; enum init_stages { uninitialized, started, topology_allocated, topology_loaded, topology_parsed } initialization_state; // Binding threads that are located in another Windows processor group // is allowed only if the machine topology contains several Windows processor groups // and the process affinity mask wasn't limited manually (an affinity mask cannot violate // processor group boundaries). bool intergroup_binding_allowed(std::size_t groups_num) { return groups_num > 1; } private: void topology_initialization(std::size_t groups_num) { initialization_state = started; // Parse topology if ( hwloc_topology_init( &topology ) == 0 ) { initialization_state = topology_allocated; #if __TBBBIND_HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT unsigned long flags = 0; if (groups_num > 1) { // HWLOC x86 backend might interfere with process affinity mask on // Windows systems with multiple processor groups. 
flags = HWLOC_TOPOLOGY_FLAG_DONT_CHANGE_BINDING; } else { flags = HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM | HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING; } if (hwloc_topology_set_flags(topology, flags) != 0) { return; } #endif if ( hwloc_topology_load( topology ) == 0 ) { initialization_state = topology_loaded; } } if ( initialization_state != topology_loaded ) return; #if __TBB_CPUBIND_PRESENT // Getting process affinity mask if ( intergroup_binding_allowed(groups_num) ) { process_cpu_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset (topology)); process_node_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_nodeset(topology)); } else { process_cpu_affinity_mask = hwloc_bitmap_alloc(); process_node_affinity_mask = hwloc_bitmap_alloc(); assertion_hwloc_wrapper(hwloc_get_cpubind, topology, process_cpu_affinity_mask, 0); hwloc_cpuset_to_nodeset(topology, process_cpu_affinity_mask, process_node_affinity_mask); } #else process_cpu_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset (topology)); process_node_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_nodeset(topology)); #endif number_of_processors_groups = groups_num; } void numa_topology_parsing() { // Fill parameters with stubs if topology parsing is broken. if ( initialization_state != topology_loaded ) { numa_nodes_count = 1; numa_indexes_list.push_back(-1); return; } // If system contains no NUMA nodes, HWLOC 1.11 returns an infinitely filled bitmap. // hwloc_bitmap_weight() returns negative value for such bitmaps, so we use this check // to change way of topology initialization. numa_nodes_count = hwloc_bitmap_weight(process_node_affinity_mask); if (numa_nodes_count <= 0) { // numa_nodes_count may be empty if the process affinity mask is empty too (invalid case) // or if some internal HWLOC error occurred. // So we place -1 as index in this case. numa_indexes_list.push_back(numa_nodes_count == 0 ? 
-1 : 0); numa_nodes_count = 1; numa_affinity_masks_list.push_back(hwloc_bitmap_dup(process_cpu_affinity_mask)); } else { // Get NUMA logical indexes list unsigned counter = 0; int i = 0; int max_numa_index = -1; numa_indexes_list.resize(numa_nodes_count); hwloc_obj_t node_buffer; hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) { node_buffer = hwloc_get_numanode_obj_by_os_index(topology, i); numa_indexes_list[counter] = static_cast(node_buffer->logical_index); if ( numa_indexes_list[counter] > max_numa_index ) { max_numa_index = numa_indexes_list[counter]; } counter++; } hwloc_bitmap_foreach_end(); __TBB_ASSERT(max_numa_index >= 0, "Maximal NUMA index must not be negative"); // Fill concurrency and affinity masks lists numa_affinity_masks_list.resize(max_numa_index + 1); int index = 0; hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) { node_buffer = hwloc_get_numanode_obj_by_os_index(topology, i); index = static_cast(node_buffer->logical_index); hwloc_cpuset_t& current_mask = numa_affinity_masks_list[index]; current_mask = hwloc_bitmap_dup(node_buffer->cpuset); hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask); __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask), "hwloc detected unavailable NUMA node"); } hwloc_bitmap_foreach_end(); } } void core_types_topology_parsing() { // Fill parameters with stubs if topology parsing is broken. 
if ( initialization_state != topology_loaded ) { core_types_indexes_list.push_back(-1); return; } #if __TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT __TBB_ASSERT(hwloc_get_api_version() >= 0x20400, "Hybrid CPUs support interfaces required HWLOC >= 2.4"); // Parsing the hybrid CPU topology int core_types_number = hwloc_cpukinds_get_nr(topology, 0); bool core_types_parsing_broken = core_types_number <= 0; if (!core_types_parsing_broken) { core_types_affinity_masks_list.resize(core_types_number); int efficiency{-1}; for (int core_type = 0; core_type < core_types_number; ++core_type) { hwloc_cpuset_t& current_mask = core_types_affinity_masks_list[core_type]; current_mask = hwloc_bitmap_alloc(); if (!hwloc_cpukinds_get_info(topology, core_type, current_mask, &efficiency, nullptr, nullptr, 0) && efficiency >= 0 ) { hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask); if (hwloc_bitmap_weight(current_mask) > 0) { core_types_indexes_list.push_back(core_type); } __TBB_ASSERT(hwloc_bitmap_weight(current_mask) >= 0, "Infinivitely filled core type mask"); } else { core_types_parsing_broken = true; break; } } } #else /*!__TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/ bool core_types_parsing_broken{true}; #endif /*__TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/ if (core_types_parsing_broken) { for (auto& core_type_mask : core_types_affinity_masks_list) { hwloc_bitmap_free(core_type_mask); } core_types_affinity_masks_list.resize(1); core_types_indexes_list.resize(1); core_types_affinity_masks_list[0] = hwloc_bitmap_dup(process_cpu_affinity_mask); core_types_indexes_list[0] = -1; } } #if __TBBBIND_HWLOC_WINDOWS_API_AVAILABLE void processor_groups_topology_parsing() { __TBB_ASSERT(number_of_processors_groups > 1, nullptr); processor_groups_affinity_masks_list.resize(number_of_processors_groups); for (unsigned group = 0; group < number_of_processors_groups; ++group) { processor_groups_affinity_masks_list[group] = hwloc_bitmap_alloc(); 
assertion_hwloc_wrapper(hwloc_windows_get_processor_group_cpuset, topology, group, processor_groups_affinity_masks_list[group], /*flags*/0); } #if TBB_USE_ASSERT affinity_mask tmp = hwloc_bitmap_alloc(); for (auto proc_group_mask : processor_groups_affinity_masks_list) { __TBB_ASSERT(!hwloc_bitmap_intersects(tmp, proc_group_mask), "Masks of processor groups intersect."); hwloc_bitmap_or(tmp, tmp, proc_group_mask); } hwloc_bitmap_free(tmp); #endif } #endif void enforce_hwloc_2_5_runtime_linkage() { // Without the call of this function HWLOC 2.4 can be successfully loaded during the tbbbind_2_5 loading. // It is possible since tbbbind_2_5 don't use any new entry points that were introduced in HWLOC 2.5 // But tbbbind_2_5 compiles with HWLOC 2.5 header, therefore such situation requires binary forward compatibility // which are not guaranteed by the HWLOC library. To enforce linkage tbbbind_2_5 only with HWLOC >= 2.5 version // this function calls the interface that is available in the HWLOC 2.5 only. 
#if HWLOC_API_VERSION >= 0x20500 auto some_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, nullptr); hwloc_get_obj_with_same_locality(topology, some_core, HWLOC_OBJ_CORE, nullptr, nullptr, 0); #endif } void initialize( std::size_t groups_num ) { if ( initialization_state != uninitialized ) return; topology_initialization(groups_num); numa_topology_parsing(); core_types_topology_parsing(); #if __TBBBIND_HWLOC_WINDOWS_API_AVAILABLE if (intergroup_binding_allowed(groups_num)) { processor_groups_topology_parsing(); } #endif enforce_hwloc_2_5_runtime_linkage(); if (initialization_state == topology_loaded) initialization_state = topology_parsed; } static system_topology* instance_ptr; public: typedef hwloc_cpuset_t affinity_mask; typedef hwloc_const_cpuset_t const_affinity_mask; bool is_topology_parsed() { return initialization_state == topology_parsed; } static void construct( std::size_t groups_num ) { if (instance_ptr == nullptr) { instance_ptr = new system_topology(); instance_ptr->initialize(groups_num); } } static system_topology& instance() { __TBB_ASSERT(instance_ptr != nullptr, "Getting instance of non-constructed topology"); return *instance_ptr; } static void destroy() { __TBB_ASSERT(instance_ptr != nullptr, "Destroying non-constructed topology"); delete instance_ptr; } ~system_topology() { if ( is_topology_parsed() ) { for (auto& numa_node_mask : numa_affinity_masks_list) { hwloc_bitmap_free(numa_node_mask); } for (auto& core_type_mask : core_types_affinity_masks_list) { hwloc_bitmap_free(core_type_mask); } for (auto& processor_group : processor_groups_affinity_masks_list) { hwloc_bitmap_free(processor_group); } hwloc_bitmap_free(process_node_affinity_mask); hwloc_bitmap_free(process_cpu_affinity_mask); } if ( initialization_state >= topology_allocated ) { hwloc_topology_destroy(topology); } initialization_state = uninitialized; } void fill_topology_information( int& _numa_nodes_count, int*& _numa_indexes_list, int& _core_types_count, int*& 
_core_types_indexes_list ) { __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology"); _numa_nodes_count = numa_nodes_count; _numa_indexes_list = numa_indexes_list.data(); _core_types_count = (int)core_types_indexes_list.size(); _core_types_indexes_list = core_types_indexes_list.data(); } void fill_constraints_affinity_mask(affinity_mask input_mask, int numa_node_index, int core_type_index, int max_threads_per_core) { __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology"); __TBB_ASSERT(numa_node_index < (int)numa_affinity_masks_list.size(), "Wrong NUMA node id"); __TBB_ASSERT(core_type_index < (int)core_types_affinity_masks_list.size(), "Wrong core type id"); __TBB_ASSERT(max_threads_per_core == -1 || max_threads_per_core > 0, "Wrong max_threads_per_core"); hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc(); hwloc_cpuset_t core_mask = hwloc_bitmap_alloc(); hwloc_bitmap_copy(constraints_mask, process_cpu_affinity_mask); if (numa_node_index >= 0) { hwloc_bitmap_and(constraints_mask, constraints_mask, numa_affinity_masks_list[numa_node_index]); } if (core_type_index >= 0) { hwloc_bitmap_and(constraints_mask, constraints_mask, core_types_affinity_masks_list[core_type_index]); } if (max_threads_per_core > 0) { // clear input mask hwloc_bitmap_zero(input_mask); hwloc_obj_t current_core = nullptr; while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) { hwloc_bitmap_and(core_mask, constraints_mask, current_core->cpuset); // fit the core mask to required bits number int current_threads_per_core = 0; for (int id = hwloc_bitmap_first(core_mask); id != -1; id = hwloc_bitmap_next(core_mask, id)) { if (++current_threads_per_core > max_threads_per_core) { hwloc_bitmap_clr(core_mask, id); } } hwloc_bitmap_or(input_mask, input_mask, core_mask); } } else { hwloc_bitmap_copy(input_mask, constraints_mask); } hwloc_bitmap_free(core_mask); 
hwloc_bitmap_free(constraints_mask); } void fit_num_threads_per_core(affinity_mask result_mask, affinity_mask current_mask, affinity_mask constraints_mask) { hwloc_bitmap_zero(result_mask); hwloc_obj_t current_core = nullptr; while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) { if (hwloc_bitmap_intersects(current_mask, current_core->cpuset)) { hwloc_bitmap_or(result_mask, result_mask, current_core->cpuset); } } hwloc_bitmap_and(result_mask, result_mask, constraints_mask); } /** * Finds processor group for the passed slot number, which are from 0 to max concurrency - 1, by * traversing masks of processor groups one by one, intersecting them with the constrained mask. * Once total weight of processor groups united mask is greater than the slot number, the mask * of the last traversed processor group is returned, denoting the mask to apply to the thread * occupying given slot number. */ void fit_to_processor_group(affinity_mask result_mask, affinity_mask constraints_mask, std::size_t slot_num) { __TBB_ASSERT(number_of_processors_groups > 1, nullptr); hwloc_bitmap_zero(result_mask); int constraints_mask_weight = hwloc_bitmap_weight(constraints_mask); // Map slot number to a number within constraints mask if // max concurrency is greater than weight of the mask. 
slot_num %= constraints_mask_weight; std::size_t total_weight = 0; for (auto& processor_group : processor_groups_affinity_masks_list) { if (hwloc_bitmap_intersects(constraints_mask, processor_group)) { hwloc_bitmap_and(result_mask, processor_group, constraints_mask); total_weight += hwloc_bitmap_weight(result_mask); if (slot_num < total_weight) { return; // Corresponding processor group where to bind the thread is found } } } } int get_default_concurrency(int numa_node_index, int core_type_index, int max_threads_per_core) { __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology"); hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc(); fill_constraints_affinity_mask(constraints_mask, numa_node_index, core_type_index, max_threads_per_core); int default_concurrency = hwloc_bitmap_weight(constraints_mask); hwloc_bitmap_free(constraints_mask); return default_concurrency; } affinity_mask allocate_process_affinity_mask() { __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology"); return hwloc_bitmap_dup(process_cpu_affinity_mask); } void free_affinity_mask( affinity_mask mask_to_free ) { hwloc_bitmap_free(mask_to_free); // If bitmap is nullptr, no operation is performed. } void store_current_affinity_mask( affinity_mask current_mask ) { assertion_hwloc_wrapper(hwloc_get_cpubind, topology, current_mask, HWLOC_CPUBIND_THREAD); hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask); __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask), "Current affinity mask must intersects with process affinity mask"); } void set_affinity_mask( const_affinity_mask mask ) { if (hwloc_bitmap_weight(mask) > 0) { assertion_hwloc_wrapper(hwloc_set_cpubind, topology, mask, HWLOC_CPUBIND_THREAD); } } }; system_topology* system_topology::instance_ptr{nullptr}; class binding_handler { // Following vector saves thread affinity mask on scheduler entry to return it to this thread // on scheduler exit. 
typedef std::vector affinity_masks_container; affinity_masks_container affinity_backup; system_topology::affinity_mask handler_affinity_mask; #ifdef _WIN32 affinity_masks_container affinity_buffer; int my_numa_node_id; int my_core_type_id; int my_max_threads_per_core; #endif public: binding_handler( std::size_t size, int numa_node_id, int core_type_id, int max_threads_per_core ) : affinity_backup(size) #ifdef _WIN32 , affinity_buffer(size) , my_numa_node_id(numa_node_id) , my_core_type_id(core_type_id) , my_max_threads_per_core(max_threads_per_core) #endif { for (std::size_t i = 0; i < size; ++i) { affinity_backup[i] = system_topology::instance().allocate_process_affinity_mask(); #ifdef _WIN32 affinity_buffer[i] = system_topology::instance().allocate_process_affinity_mask(); #endif } handler_affinity_mask = system_topology::instance().allocate_process_affinity_mask(); system_topology::instance().fill_constraints_affinity_mask (handler_affinity_mask, numa_node_id, core_type_id, max_threads_per_core); } ~binding_handler() { for (std::size_t i = 0; i < affinity_backup.size(); ++i) { system_topology::instance().free_affinity_mask(affinity_backup[i]); #ifdef _WIN32 system_topology::instance().free_affinity_mask(affinity_buffer[i]); #endif } system_topology::instance().free_affinity_mask(handler_affinity_mask); } void apply_affinity( unsigned slot_num ) { auto& topology = system_topology::instance(); __TBB_ASSERT(slot_num < affinity_backup.size(), "The slot number is greater than the number of slots in the arena"); __TBB_ASSERT(topology.is_topology_parsed(), "Trying to get access to uninitialized system_topology"); topology.store_current_affinity_mask(affinity_backup[slot_num]); system_topology::affinity_mask thread_affinity = handler_affinity_mask; #ifdef _WIN32 // If we have a constraint based only on the max_threads_per_core setting, then the // constraints affinity mask may cross the border between several processor groups // on systems with more then 64 logical 
processors. That is why we need to use the special // function, which regulates the number of threads in the current threads mask. bool is_default_numa = my_numa_node_id == -1 || topology.numa_indexes_list.size() == 1; bool is_default_core_type = my_core_type_id == -1 || topology.core_types_indexes_list.size() == 1; if (topology.number_of_processors_groups > 1 && my_max_threads_per_core != -1 && is_default_numa && is_default_core_type ) { topology.fit_num_threads_per_core(affinity_buffer[slot_num], affinity_backup[slot_num], handler_affinity_mask); thread_affinity = affinity_buffer[slot_num]; } #if __TBBBIND_HWLOC_WINDOWS_API_AVAILABLE else if (topology.number_of_processors_groups > 1) { topology.fit_to_processor_group(affinity_buffer[slot_num], handler_affinity_mask, slot_num); thread_affinity = affinity_buffer[slot_num]; } #endif #endif topology.set_affinity_mask(thread_affinity); } void restore_previous_affinity_mask( unsigned slot_num ) { auto& topology = system_topology::instance(); __TBB_ASSERT(topology.is_topology_parsed(), "Trying to get access to uninitialized system_topology"); topology.set_affinity_mask(affinity_backup[slot_num]); }; }; extern "C" { // exported to TBB interfaces TBBBIND_EXPORT void __TBB_internal_initialize_system_topology( std::size_t groups_num, int& numa_nodes_count, int*& numa_indexes_list, int& core_types_count, int*& core_types_indexes_list ) { system_topology::construct(groups_num); system_topology::instance().fill_topology_information( numa_nodes_count, numa_indexes_list, core_types_count, core_types_indexes_list ); } TBBBIND_EXPORT binding_handler* __TBB_internal_allocate_binding_handler(int number_of_slots, int numa_id, int core_type_id, int max_threads_per_core) { __TBB_ASSERT(number_of_slots > 0, "Trying to create numa handler for 0 threads."); return new binding_handler(number_of_slots, numa_id, core_type_id, max_threads_per_core); } TBBBIND_EXPORT void __TBB_internal_deallocate_binding_handler(binding_handler* handler_ptr) 
{ __TBB_ASSERT(handler_ptr != nullptr, "Trying to deallocate nullptr pointer."); delete handler_ptr; } TBBBIND_EXPORT void __TBB_internal_apply_affinity(binding_handler* handler_ptr, int slot_num) { __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata."); handler_ptr->apply_affinity(slot_num); } TBBBIND_EXPORT void __TBB_internal_restore_affinity(binding_handler* handler_ptr, int slot_num) { __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata."); handler_ptr->restore_previous_affinity_mask(slot_num); } TBBBIND_EXPORT int __TBB_internal_get_default_concurrency(int numa_id, int core_type_id, int max_threads_per_core) { return system_topology::instance().get_default_concurrency(numa_id, core_type_id, max_threads_per_core); } TBBBIND_EXPORT void __TBB_internal_destroy_system_topology() { return system_topology::destroy(); } } // extern "C" } // namespace r1 } // namespace detail } // namespace tbb #undef assertion_hwloc_wrapper ================================================ FILE: third-party/tbb/src/tbbbind/tbb_bind.rc ================================================ // Copyright (c) 2005-2025 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
///////////////////////////////////////////////////////////////////////////// // // Includes // #include #include "../../include/oneapi/tbb/version.h" ///////////////////////////////////////////////////////////////////////////// // Neutral resources #ifdef _WIN32 LANGUAGE LANG_NEUTRAL, SUBLANG_NEUTRAL #pragma code_page(1252) #endif //_WIN32 ///////////////////////////////////////////////////////////////////////////// // // Version // #define TBB_VERNUMBERS TBB_VERSION_MAJOR,TBB_VERSION_MINOR,TBB_VERSION_PATCH #define TBB_VERSION TBB_VERSION_STRING VS_VERSION_INFO VERSIONINFO FILEVERSION TBB_VERNUMBERS PRODUCTVERSION TBB_VERNUMBERS FILEFLAGSMASK 0x17L #ifdef _DEBUG FILEFLAGS 0x1L #else FILEFLAGS 0x0L #endif FILEOS 0x40004L FILETYPE 0x2L FILESUBTYPE 0x0L BEGIN BLOCK "StringFileInfo" BEGIN BLOCK "000004b0" BEGIN VALUE "CompanyName", "Intel Corporation\0" VALUE "FileDescription", "oneAPI Threading Building Blocks (oneTBB) library\0" VALUE "FileVersion", TBB_VERSION "\0" VALUE "LegalCopyright", "Copyright 2005-2025 Intel Corporation. All Rights Reserved.\0" VALUE "LegalTrademarks", "\0" #ifndef TBB_USE_DEBUG VALUE "OriginalFilename", "tbbbind.dll\0" #else VALUE "OriginalFilename", "tbbbind_debug.dll\0" #endif VALUE "ProductName", "oneAPI Threading Building Blocks (oneTBB)\0" VALUE "ProductVersion", TBB_VERSION "\0" VALUE "PrivateBuild", "\0" VALUE "SpecialBuild", "\0" END END BLOCK "VarFileInfo" BEGIN VALUE "Translation", 0x0, 1200 END END ================================================ FILE: third-party/tbb/src/tbbmalloc/CMakeLists.txt ================================================ # Copyright (c) 2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. add_library(tbbmalloc backend.cpp backref.cpp frontend.cpp large_objects.cpp tbbmalloc.cpp ../tbb/itt_notify.cpp) if (WIN32) target_sources(tbbmalloc PRIVATE tbbmalloc.rc) endif() add_library(TBB::tbbmalloc ALIAS tbbmalloc) target_compile_definitions(tbbmalloc PUBLIC $<$:TBB_USE_DEBUG> PRIVATE __TBBMALLOC_BUILD $<$>:__TBB_DYNAMIC_LOAD_ENABLED=0> $<$>:__TBB_SOURCE_DIRECTLY_INCLUDED=1>) if (NOT ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(armv7-a|aarch64|mips|arm64|riscv)" OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64" OR WINDOWS_STORE OR TBB_WINDOWS_DRIVER OR TBB_SANITIZE MATCHES "thread")) target_compile_definitions(tbbmalloc PRIVATE __TBB_USE_ITT_NOTIFY) endif() target_include_directories(tbbmalloc PUBLIC $ $) # TODO: fix warnings if (MSVC) # signed unsigned mismatch, declaration hides class member set(TBB_WARNING_SUPPRESS ${TBB_WARNING_SUPPRESS} /wd4267 /wd4244 /wd4245 /wd4458) endif() target_compile_options(tbbmalloc PRIVATE ${TBB_CXX_STD_FLAG} # TODO: consider making it PUBLIC. ${TBB_MMD_FLAG} ${TBB_DSE_FLAG} ${TBB_WARNING_LEVEL} ${TBB_WARNING_SUPPRESS} ${TBB_LIB_COMPILE_FLAGS} ${TBBMALLOC_LIB_COMPILE_FLAGS} ${TBB_COMMON_COMPILE_FLAGS} ) enable_language(C) # Avoid use of target_link_libraries here as it changes /DEF option to \DEF on Windows. 
set_target_properties(tbbmalloc PROPERTIES DEFINE_SYMBOL "" VERSION ${TBBMALLOC_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION} SOVERSION ${TBBMALLOC_BINARY_VERSION} LINKER_LANGUAGE C ) tbb_handle_ipo(tbbmalloc) if (TBB_DEF_FILE_PREFIX) # If there's no prefix, assume we're using export directives set_target_properties(tbbmalloc PROPERTIES LINK_FLAGS "${TBB_LINK_DEF_FILE_FLAG}\"${CMAKE_CURRENT_SOURCE_DIR}/def/${TBB_DEF_FILE_PREFIX}-tbbmalloc.def\"" LINK_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/def/${TBB_DEF_FILE_PREFIX}-tbbmalloc.def" ) endif() set(CMAKE_CXX_IMPLICIT_LINK_LIBRARIES "") # Prefer using target_link_options instead of target_link_libraries to specify link options because # target_link_libraries may incorrectly handle some options (on Windows, for example). if (COMMAND target_link_options) target_link_options(tbbmalloc PRIVATE ${TBB_LIB_LINK_FLAGS} ${TBB_COMMON_LINK_FLAGS} ) else() target_link_libraries(tbbmalloc PRIVATE ${TBB_LIB_LINK_FLAGS} ${TBB_COMMON_LINK_FLAGS} ) endif() target_link_libraries(tbbmalloc PRIVATE Threads::Threads ${TBB_LIB_LINK_LIBS} ${TBB_COMMON_LINK_LIBS} ) if(TBB_BUILD_APPLE_FRAMEWORKS) set_target_properties(tbbmalloc PROPERTIES FRAMEWORK TRUE FRAMEWORK_VERSION ${TBBMALLOC_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION} XCODE_ATTRIBUTE_PRODUCT_BUNDLE_IDENTIFIER com.intel.tbbmalloc MACOSX_FRAMEWORK_IDENTIFIER com.intel.tbbmalloc MACOSX_FRAMEWORK_BUNDLE_VERSION ${TBBMALLOC_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION} MACOSX_FRAMEWORK_SHORT_VERSION_STRING ${TBBMALLOC_BINARY_VERSION} ) endif() tbb_install_target(tbbmalloc) ================================================ FILE: third-party/tbb/src/tbbmalloc/Customize.h ================================================ /* Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef _TBB_malloc_Customize_H_ #define _TBB_malloc_Customize_H_ // customizing MALLOC_ASSERT macro #define MALLOC_ASSERT(assertion, message) __TBB_ASSERT(assertion, message) #define MALLOC_ASSERT_EX(assertion, message) __TBB_ASSERT_EX(assertion, message) #ifndef MALLOC_DEBUG #define MALLOC_DEBUG TBB_USE_DEBUG #endif #include "oneapi/tbb/detail/_utils.h" #include "oneapi/tbb/detail/_assert.h" #include "Synchronize.h" #if __TBB_USE_ITT_NOTIFY #include "../tbb/itt_notify.h" #define MALLOC_ITT_SYNC_PREPARE(pointer) ITT_NOTIFY(sync_prepare, (pointer)) #define MALLOC_ITT_SYNC_ACQUIRED(pointer) ITT_NOTIFY(sync_acquired, (pointer)) #define MALLOC_ITT_SYNC_RELEASING(pointer) ITT_NOTIFY(sync_releasing, (pointer)) #define MALLOC_ITT_SYNC_CANCEL(pointer) ITT_NOTIFY(sync_cancel, (pointer)) #define MALLOC_ITT_FINI_ITTLIB() ITT_FINI_ITTLIB() #define MALLOC_ITT_RELEASE_RESOURCES() ITT_RELEASE_RESOURCES() #else #define MALLOC_ITT_SYNC_PREPARE(pointer) ((void)0) #define MALLOC_ITT_SYNC_ACQUIRED(pointer) ((void)0) #define MALLOC_ITT_SYNC_RELEASING(pointer) ((void)0) #define MALLOC_ITT_SYNC_CANCEL(pointer) ((void)0) #define MALLOC_ITT_FINI_ITTLIB() ((void)0) #define MALLOC_ITT_RELEASE_RESOURCES() ((void)0) #endif inline intptr_t BitScanRev(uintptr_t x) { return x == 0 ? 
-1 : static_cast(tbb::detail::log2(x)); } template static inline bool isAligned(T* arg, uintptr_t alignment) { return tbb::detail::is_aligned(arg,alignment); } static inline bool isPowerOfTwo(uintptr_t arg) { return tbb::detail::is_power_of_two(arg); } static inline bool isPowerOfTwoAtLeast(uintptr_t arg, uintptr_t power2) { return arg && tbb::detail::is_power_of_two_at_least(arg,power2); } inline void do_yield() { tbb::detail::yield(); } #define USE_DEFAULT_MEMORY_MAPPING 1 // To support malloc replacement #include "../tbbmalloc_proxy/proxy.h" #if MALLOC_UNIXLIKE_OVERLOAD_ENABLED #define malloc_proxy __TBB_malloc_proxy extern "C" void * __TBB_malloc_proxy(size_t) __attribute__ ((weak)); #elif MALLOC_ZONE_OVERLOAD_ENABLED // as there is no significant overhead, always suppose that proxy can be present const bool malloc_proxy = true; #else const bool malloc_proxy = false; #endif namespace rml { namespace internal { void init_tbbmalloc(); } } // namespaces #define MALLOC_EXTRA_INITIALIZATION rml::internal::init_tbbmalloc() // Need these to work regardless of tools support. namespace tbb { namespace detail { namespace d1 { enum notify_type {prepare=0, cancel, acquired, releasing}; #if TBB_USE_PROFILING_TOOLS inline void call_itt_notify(notify_type t, void *ptr) { // unreferenced formal parameter warning detail::suppress_unused_warning(ptr); switch ( t ) { case prepare: MALLOC_ITT_SYNC_PREPARE( ptr ); break; case cancel: MALLOC_ITT_SYNC_CANCEL( ptr ); break; case acquired: MALLOC_ITT_SYNC_ACQUIRED( ptr ); break; case releasing: MALLOC_ITT_SYNC_RELEASING( ptr ); break; } } #else inline void call_itt_notify(notify_type /*t*/, void * /*ptr*/) {} #endif // TBB_USE_PROFILING_TOOLS } // namespace d1 } // namespace detail } // namespace tbb #include "oneapi/tbb/detail/_aggregator.h" template struct MallocAggregator { typedef tbb::detail::d1::aggregator_generic type; }; //! 
aggregated_operation base class template struct MallocAggregatedOperation { typedef tbb::detail::d1::aggregated_operation type; }; #endif /* _TBB_malloc_Customize_H_ */ ================================================ FILE: third-party/tbb/src/tbbmalloc/MapMemory.h ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef _itt_shared_malloc_MapMemory_H #define _itt_shared_malloc_MapMemory_H #include #if __unix__ || __APPLE__ || __sun || __FreeBSD__ #if __sun && !defined(_XPG4_2) // To have void* as mmap's 1st argument #define _XPG4_2 1 #define XPG4_WAS_DEFINED 1 #endif #include #if __unix__ /* __TBB_MAP_HUGETLB is MAP_HUGETLB from system header linux/mman.h. The header is not included here, as on some Linux flavors inclusion of linux/mman.h leads to compilation error, while changing of MAP_HUGETLB is highly unexpected. */ #define __TBB_MAP_HUGETLB 0x40000 #else #define __TBB_MAP_HUGETLB 0 #endif #if XPG4_WAS_DEFINED #undef _XPG4_2 #undef XPG4_WAS_DEFINED #endif inline void* mmap_impl(size_t map_size, void* map_hint = nullptr, int map_flags = 0) { #ifndef MAP_ANONYMOUS // macOS* defines MAP_ANON, which is deprecated in Linux*. 
#define MAP_ANONYMOUS MAP_ANON #endif /* MAP_ANONYMOUS */ return mmap(map_hint, map_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | map_flags, -1, 0); } inline void* mmapTHP(size_t bytes) { // Initializes in zero-initialized data section static void* hint; // Optimistically try to use a last huge page aligned region end // as a hint for mmap. hint = hint ? (void*)((uintptr_t)hint - bytes) : hint; void* result = mmap_impl(bytes, hint); // Something went wrong if (result == MAP_FAILED) { hint = nullptr; return MAP_FAILED; } // Otherwise, fall back to the slow path - map oversized region // and trim excess parts. if (!isAligned(result, HUGE_PAGE_SIZE)) { // Undo previous try munmap(result, bytes); // Map oversized on huge page size region result = mmap_impl(bytes + HUGE_PAGE_SIZE); // Something went wrong if (result == MAP_FAILED) { hint = nullptr; return MAP_FAILED; } // Misalignment offset uintptr_t offset = 0; if (!isAligned(result, HUGE_PAGE_SIZE)) { // Trim excess head of a region if it is no aligned offset = HUGE_PAGE_SIZE - ((uintptr_t)result & (HUGE_PAGE_SIZE - 1)); munmap(result, offset); // New region beginning result = (void*)((uintptr_t)result + offset); } // Trim excess tail of a region munmap((void*)((uintptr_t)result + bytes), HUGE_PAGE_SIZE - offset); } // Assume, that mmap virtual addresses grow down by default // So, set a hint as a result of a last successful allocation // and then use it minus requested size as a new mapping point. // TODO: Atomic store is meant here, fence not needed, but // currently we don't have such function. 
hint = result; MALLOC_ASSERT(isAligned(result, HUGE_PAGE_SIZE), "Mapped address is not aligned on huge page size."); return result; } #define MEMORY_MAPPING_USES_MALLOC 0 void* MapMemory (size_t bytes, PageType pageType) { void* result = nullptr; int prevErrno = errno; switch (pageType) { case REGULAR: { result = mmap_impl(bytes); break; } case PREALLOCATED_HUGE_PAGE: { MALLOC_ASSERT((bytes % HUGE_PAGE_SIZE) == 0, "Mapping size should be divisible by huge page size"); result = mmap_impl(bytes, nullptr, __TBB_MAP_HUGETLB); break; } case TRANSPARENT_HUGE_PAGE: { MALLOC_ASSERT((bytes % HUGE_PAGE_SIZE) == 0, "Mapping size should be divisible by huge page size"); result = mmapTHP(bytes); break; } default: { MALLOC_ASSERT(false, "Unknown page type"); } } if (result == MAP_FAILED) { errno = prevErrno; return nullptr; } return result; } int UnmapMemory(void *area, size_t bytes) { int prevErrno = errno; int ret = munmap(area, bytes); if (-1 == ret) errno = prevErrno; return ret; } #elif (_WIN32 || _WIN64) && !__TBB_WIN8UI_SUPPORT #include #define MEMORY_MAPPING_USES_MALLOC 0 void* MapMemory (size_t bytes, PageType) { /* Is VirtualAlloc thread safe? */ return VirtualAlloc(nullptr, bytes, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); } int UnmapMemory(void *area, size_t /*bytes*/) { BOOL result = VirtualFree(area, 0, MEM_RELEASE); return !result; } #else void *ErrnoPreservingMalloc(size_t bytes) { int prevErrno = errno; void *ret = malloc( bytes ); if (!ret) errno = prevErrno; return ret; } #define MEMORY_MAPPING_USES_MALLOC 1 void* MapMemory (size_t bytes, PageType) { return ErrnoPreservingMalloc( bytes ); } int UnmapMemory(void *area, size_t /*bytes*/) { free( area ); return 0; } #endif /* OS dependent */ #if MALLOC_CHECK_RECURSION && MEMORY_MAPPING_USES_MALLOC #error Impossible to protect against malloc recursion when memory mapping uses malloc. 
#endif #endif /* _itt_shared_malloc_MapMemory_H */ ================================================ FILE: third-party/tbb/src/tbbmalloc/Statistics.h ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #define MAX_THREADS 1024 #define NUM_OF_BINS 30 #define ThreadCommonCounters NUM_OF_BINS enum counter_type { allocBlockNew = 0, allocBlockPublic, allocBumpPtrUsed, allocFreeListUsed, allocPrivatized, examineEmptyEnough, examineNotEmpty, freeRestoreBumpPtr, freeByOtherThread, freeToActiveBlock, freeToInactiveBlock, freeBlockPublic, freeBlockBack, MaxCounters }; enum common_counter_type { allocNewLargeObj = 0, allocCachedLargeObj, cacheLargeObj, freeLargeObj, lockPublicFreeList, freeToOtherThread }; #if COLLECT_STATISTICS /* Statistics reporting callback registered via a static object dtor on Posix or DLL_PROCESS_DETACH on Windows. */ static bool reportAllocationStatistics; struct bin_counters { int counter[MaxCounters]; }; static bin_counters statistic[MAX_THREADS][NUM_OF_BINS+1]; //zero-initialized; static inline int STAT_increment(int thread, int bin, int ctr) { return reportAllocationStatistics && thread < MAX_THREADS ? 
++(statistic[thread][bin].counter[ctr]) : 0; } static inline void initStatisticsCollection() { #if defined(MALLOCENV_COLLECT_STATISTICS) if (nullptr != getenv(MALLOCENV_COLLECT_STATISTICS)) reportAllocationStatistics = true; #endif } #else #define STAT_increment(a,b,c) ((void)0) #endif /* COLLECT_STATISTICS */ #if COLLECT_STATISTICS static inline void STAT_print(int thread) { if (!reportAllocationStatistics) return; char filename[100]; #if USE_PTHREAD sprintf(filename, "stat_ScalableMalloc_proc%04d_thr%04d.log", getpid(), thread); #else sprintf(filename, "stat_ScalableMalloc_thr%04d.log", thread); #endif FILE* outfile = fopen(filename, "w"); for(int i=0; i //! Stripped down version of spin_mutex. /** Instances of MallocMutex must be declared in memory that is zero-initialized. There are no constructors. This is a feature that lets it be used in situations where the mutex might be used while file-scope constructors are running. There are no methods "acquire" or "release". The scoped_lock must be used in a strict block-scoped locking pattern. Omitting these methods permitted further simplification. 
*/ class MallocMutex : tbb::detail::no_copy { std::atomic_flag m_flag = ATOMIC_FLAG_INIT; void lock() { tbb::detail::atomic_backoff backoff; while (m_flag.test_and_set()) backoff.pause(); } bool try_lock() { return !m_flag.test_and_set(); } void unlock() { m_flag.clear(std::memory_order_release); } public: class scoped_lock : tbb::detail::no_copy { MallocMutex& m_mutex; bool m_taken; public: scoped_lock(MallocMutex& m) : m_mutex(m), m_taken(true) { m.lock(); } scoped_lock(MallocMutex& m, bool block, bool *locked) : m_mutex(m), m_taken(false) { if (block) { m.lock(); m_taken = true; } else { m_taken = m.try_lock(); } if (locked) *locked = m_taken; } scoped_lock(scoped_lock& other) = delete; scoped_lock& operator=(scoped_lock&) = delete; ~scoped_lock() { if (m_taken) { m_mutex.unlock(); } } }; friend class scoped_lock; }; inline void SpinWaitWhileEq(const std::atomic& location, const intptr_t value) { tbb::detail::spin_wait_while_eq(location, value); } #if USE_PTHREAD && __TBB_SOURCE_DIRECTLY_INCLUDED inline void SpinWaitUntilEq(const std::atomic& location, const intptr_t value) { tbb::detail::spin_wait_until_eq(location, value); } #endif class AtomicBackoff { tbb::detail::atomic_backoff backoff; public: AtomicBackoff() {} void pause() { backoff.pause(); } }; #endif /* __TBB_malloc_Synchronize_H_ */ ================================================ FILE: third-party/tbb/src/tbbmalloc/TypeDefinitions.h ================================================ /* Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ #ifndef _itt_shared_malloc_TypeDefinitions_H_ #define _itt_shared_malloc_TypeDefinitions_H_ // Define preprocessor symbols used to determine architecture #if _WIN32||_WIN64 # if defined(_M_X64)||defined(__x86_64__) // the latter for MinGW support # define __ARCH_x86_64 1 # elif defined(_M_IA64) # define __ARCH_ipf 1 # elif defined(_M_IX86)||defined(__i386__) // the latter for MinGW support # define __ARCH_x86_32 1 # elif defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) // the latter for MinGW support # define __ARCH_other 1 # else # error Unknown processor architecture for Windows # endif # define USE_WINTHREAD 1 #else /* Assume generic Unix */ # if __x86_64__ # define __ARCH_x86_64 1 # elif __ia64__ # define __ARCH_ipf 1 # elif __i386__ || __i386 # define __ARCH_x86_32 1 # else # define __ARCH_other 1 # endif # define USE_PTHREAD 1 #endif // According to C99 standard INTPTR_MIN defined for C++ // iff __STDC_LIMIT_MACROS pre-defined #ifndef __STDC_LIMIT_MACROS #define __STDC_LIMIT_MACROS 1 #endif //! PROVIDE YOUR OWN Customize.h IF YOU FEEL NECESSARY #include "Customize.h" #include "shared_utils.h" #endif /* _itt_shared_malloc_TypeDefinitions_H_ */ ================================================ FILE: third-party/tbb/src/tbbmalloc/backend.cpp ================================================ /* Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include /* for memset */ #include #include "tbbmalloc_internal.h" namespace rml { namespace internal { /*********** Code to acquire memory from the OS or other executive ****************/ /* syscall/malloc can set non-zero errno in case of failure, but later allocator might be able to find memory to fulfill the request. And we do not want changing of errno by successful scalable_malloc call. To support this, restore old errno in (get|free)RawMemory, and set errno in frontend just before returning to user code. Please note: every syscall/libc call used inside scalable_malloc that sets errno must be protected this way, not just memory allocation per se. */ #if USE_DEFAULT_MEMORY_MAPPING #include "MapMemory.h" #else /* assume MapMemory and UnmapMemory are customized */ #endif void* getRawMemory (size_t size, PageType pageType) { return MapMemory(size, pageType); } int freeRawMemory (void *object, size_t size) { return UnmapMemory(object, size); } #if CHECK_ALLOCATION_RANGE void Backend::UsedAddressRange::registerAlloc(uintptr_t left, uintptr_t right) { MallocMutex::scoped_lock lock(mutex); if (left < leftBound.load(std::memory_order_relaxed)) leftBound.store(left, std::memory_order_relaxed); if (right > rightBound.load(std::memory_order_relaxed)) rightBound.store(right, std::memory_order_relaxed); MALLOC_ASSERT(leftBound.load(std::memory_order_relaxed), ASSERT_TEXT); MALLOC_ASSERT(leftBound.load(std::memory_order_relaxed) < rightBound.load(std::memory_order_relaxed), ASSERT_TEXT); MALLOC_ASSERT(leftBound.load(std::memory_order_relaxed) <= left && right <= rightBound.load(std::memory_order_relaxed), ASSERT_TEXT); } void Backend::UsedAddressRange::registerFree(uintptr_t left, uintptr_t right) { MallocMutex::scoped_lock lock(mutex); if (leftBound.load(std::memory_order_relaxed) == left) { if (rightBound.load(std::memory_order_relaxed) == right) { leftBound.store(ADDRESS_UPPER_BOUND, std::memory_order_relaxed); rightBound.store(0, std::memory_order_relaxed); } else 
leftBound.store(right, std::memory_order_relaxed); } else if (rightBound.load(std::memory_order_relaxed) == right) rightBound.store(left, std::memory_order_relaxed); MALLOC_ASSERT((!rightBound.load(std::memory_order_relaxed) && leftBound.load(std::memory_order_relaxed) == ADDRESS_UPPER_BOUND) || leftBound.load(std::memory_order_relaxed) < rightBound.load(std::memory_order_relaxed), ASSERT_TEXT); } #endif // CHECK_ALLOCATION_RANGE // Initialized in frontend inside defaultMemPool extern HugePagesStatus hugePages; void *Backend::allocRawMem(size_t &size) { void *res = nullptr; size_t allocSize = 0; if (extMemPool->userPool()) { if (extMemPool->fixedPool && bootsrapMemDone == bootsrapMemStatus.load(std::memory_order_acquire)) return nullptr; MALLOC_ASSERT(bootsrapMemStatus != bootsrapMemNotDone, "Backend::allocRawMem() called prematurely?"); // TODO: support for raw mem not aligned at sizeof(uintptr_t) // memory from fixed pool is asked once and only once allocSize = alignUpGeneric(size, extMemPool->granularity); res = (*extMemPool->rawAlloc)(extMemPool->poolId, allocSize); } else { // Align allocation on page size size_t pageSize = hugePages.isEnabled ? hugePages.getGranularity() : extMemPool->granularity; MALLOC_ASSERT(pageSize, "Page size cannot be zero."); allocSize = alignUpGeneric(size, pageSize); // If user requested huge pages and they are available, try to use preallocated ones firstly. // If there are none, lets check transparent huge pages support and use them instead. 
if (hugePages.isEnabled) { if (hugePages.isHPAvailable) { res = getRawMemory(allocSize, PREALLOCATED_HUGE_PAGE); } if (!res && hugePages.isTHPAvailable) { res = getRawMemory(allocSize, TRANSPARENT_HUGE_PAGE); } } if (!res) { res = getRawMemory(allocSize, REGULAR); } } if (res) { MALLOC_ASSERT(allocSize > 0, "Invalid size of an allocated region."); size = allocSize; if (!extMemPool->userPool()) usedAddrRange.registerAlloc((uintptr_t)res, (uintptr_t)res+size); #if MALLOC_DEBUG volatile size_t curTotalSize = totalMemSize; // to read global value once MALLOC_ASSERT(curTotalSize+size > curTotalSize, "Overflow allocation size."); #endif totalMemSize.fetch_add(size); } return res; } bool Backend::freeRawMem(void *object, size_t size) { bool fail; #if MALLOC_DEBUG volatile size_t curTotalSize = totalMemSize; // to read global value once MALLOC_ASSERT(curTotalSize-size < curTotalSize, "Negative allocation size."); #endif totalMemSize.fetch_sub(size); if (extMemPool->userPool()) { MALLOC_ASSERT(!extMemPool->fixedPool, "No free for fixed-size pools."); fail = (*extMemPool->rawFree)(extMemPool->poolId, object, size); } else { usedAddrRange.registerFree((uintptr_t)object, (uintptr_t)object + size); fail = freeRawMemory(object, size); } // TODO: use result in all freeRawMem() callers return !fail; } /********* End memory acquisition code ********************************/ // Protected object size. After successful locking returns size of locked block, // and releasing requires setting block size. 
class GuardedSize : tbb::detail::no_copy { std::atomic value; public: enum State { LOCKED, COAL_BLOCK, // block is coalescing now MAX_LOCKED_VAL = COAL_BLOCK, LAST_REGION_BLOCK, // used to mark last block in region // values after this are "normal" block sizes MAX_SPEC_VAL = LAST_REGION_BLOCK }; void initLocked() { value.store(LOCKED, std::memory_order_release); } // TBB_REVAMP_TODO: was relaxed void makeCoalscing() { MALLOC_ASSERT(value.load(std::memory_order_relaxed) == LOCKED, ASSERT_TEXT); value.store(COAL_BLOCK, std::memory_order_release); // TBB_REVAMP_TODO: was relaxed } size_t tryLock(State state) { MALLOC_ASSERT(state <= MAX_LOCKED_VAL, ASSERT_TEXT); size_t sz = value.load(std::memory_order_acquire); for (;;) { if (sz <= MAX_LOCKED_VAL) { break; } if (value.compare_exchange_strong(sz, state)) { break; } } return sz; } void unlock(size_t size) { MALLOC_ASSERT(value.load(std::memory_order_relaxed) <= MAX_LOCKED_VAL, "The lock is not locked"); MALLOC_ASSERT(size > MAX_LOCKED_VAL, ASSERT_TEXT); value.store(size, std::memory_order_release); } bool isLastRegionBlock() const { return value.load(std::memory_order_relaxed) == LAST_REGION_BLOCK; } friend void Backend::IndexedBins::verify(); }; struct MemRegion { MemRegion *next, // keep all regions in any pool to release all them on *prev; // pool destroying, 2-linked list to release individual // regions. size_t allocSz, // got from pool callback blockSz; // initial and maximal inner block size MemRegionType type; }; // this data must be unmodified while block is in use, so separate it class BlockMutexes { protected: GuardedSize myL, // lock for me leftL; // lock for left neighbor }; class FreeBlock : BlockMutexes { public: static const size_t minBlockSize; friend void Backend::IndexedBins::verify(); FreeBlock *prev, // in 2-linked list related to bin *next, *nextToFree; // used to form a queue during coalescing // valid only when block is in processing, i.e. 
one is not free and not size_t sizeTmp; // used outside of backend int myBin; // bin that is owner of the block bool slabAligned; bool blockInBin; // this block in myBin already FreeBlock *rightNeig(size_t sz) const { MALLOC_ASSERT(sz, ASSERT_TEXT); return (FreeBlock*)((uintptr_t)this+sz); } FreeBlock *leftNeig(size_t sz) const { MALLOC_ASSERT(sz, ASSERT_TEXT); return (FreeBlock*)((uintptr_t)this - sz); } void initHeader() { myL.initLocked(); leftL.initLocked(); } void setMeFree(size_t size) { myL.unlock(size); } size_t trySetMeUsed(GuardedSize::State s) { return myL.tryLock(s); } bool isLastRegionBlock() const { return myL.isLastRegionBlock(); } void setLeftFree(size_t sz) { leftL.unlock(sz); } size_t trySetLeftUsed(GuardedSize::State s) { return leftL.tryLock(s); } size_t tryLockBlock() { size_t rSz, sz = trySetMeUsed(GuardedSize::LOCKED); if (sz <= GuardedSize::MAX_LOCKED_VAL) return false; rSz = rightNeig(sz)->trySetLeftUsed(GuardedSize::LOCKED); if (rSz <= GuardedSize::MAX_LOCKED_VAL) { setMeFree(sz); return false; } MALLOC_ASSERT(rSz == sz, ASSERT_TEXT); return sz; } void markCoalescing(size_t blockSz) { myL.makeCoalscing(); rightNeig(blockSz)->leftL.makeCoalscing(); sizeTmp = blockSz; nextToFree = nullptr; } void markUsed() { myL.initLocked(); rightNeig(sizeTmp)->leftL.initLocked(); nextToFree = nullptr; } static void markBlocks(FreeBlock *fBlock, int num, size_t size) { for (int i=1; iinitHeader(); } } }; // Last block in any region. 
// Its "size" field is GuardedSize::LAST_REGION_BLOCK,
// This kind of blocks used to find region header
// and have a possibility to return region back to OS
struct LastFreeBlock : public FreeBlock {
    MemRegion *memRegion;
};

// A block can never be smaller than its own header.
const size_t FreeBlock::minBlockSize = sizeof(FreeBlock);

// Spin until there is a reason to re-scan the bins.
// Returns true when the number of in-fly blocks (in bins or in the delayed
// coalescing queue) decreased, or when scanCoalescQ() reported changes.
// When nothing is in fly and no cache cleanup is in progress, returns
// whether the modification counter differs from startModifiedCnt.
inline bool BackendSync::waitTillBlockReleased(intptr_t startModifiedCnt)
{
    AtomicBackoff backoff;
#if __TBB_MALLOC_BACKEND_STAT
    // Brackets the whole wait with ITT prepare/acquired notifications.
    class ITT_Guard {
        void *ptr;
    public:
        ITT_Guard(void *p) : ptr(p) {
            MALLOC_ITT_SYNC_PREPARE(ptr);
        }
        ~ITT_Guard() {
            MALLOC_ITT_SYNC_ACQUIRED(ptr);
        }
    };
    ITT_Guard ittGuard(&inFlyBlocks);
#endif
    // Snapshot of both counters; each iteration compares against it.
    intptr_t myBinsInFlyBlocks = inFlyBlocks.load(std::memory_order_acquire);
    intptr_t myCoalescQInFlyBlocks = backend->blocksInCoalescing();
    while (true) {
        MALLOC_ASSERT(myBinsInFlyBlocks>=0 && myCoalescQInFlyBlocks>=0, nullptr);

        intptr_t currBinsInFlyBlocks = inFlyBlocks.load(std::memory_order_acquire);
        intptr_t currCoalescQInFlyBlocks = backend->blocksInCoalescing();
        WhiteboxTestingYield();
        // Stop waiting iff:

        // 1) blocks were removed from processing, not added
        if (myBinsInFlyBlocks > currBinsInFlyBlocks
        // 2) released during delayed coalescing queue
            || myCoalescQInFlyBlocks > currCoalescQInFlyBlocks)
            break;
        // 3) if there are blocks in coalescing, and no progress in its processing,
        // try to scan coalescing queue and stop waiting, if changes were made
        // (if there are no changes and in-fly blocks exist, we continue
        //  waiting to not increase load on coalescQ)
        if (currCoalescQInFlyBlocks > 0 && backend->scanCoalescQ(/*forceCoalescQDrop=*/false))
            break;
        // 4) when there are no blocks
        if (!currBinsInFlyBlocks && !currCoalescQInFlyBlocks) {
            // re-scan make sense only if bins were modified since scanned
            auto pool = backend->extMemPool;
            // While a hard or soft cache cleanup is running, keep spinning
            // instead of reporting a result.
            if (pool->hardCachesCleanupInProgress.load(std::memory_order_acquire) ||
                pool->softCachesCleanupInProgress.load(std::memory_order_acquire)) {
                backoff.pause();
                continue;
            }

            return startModifiedCnt != getNumOfMods();
        }
        // Counters did not decrease: adopt the new values as the baseline.
        myBinsInFlyBlocks = currBinsInFlyBlocks;
        myCoalescQInFlyBlocks = currCoalescQInFlyBlocks;
        backoff.pause();
    }
    return true;
}

// Push fBlock onto the head of the blocksToFree list.
// The block is marked used and counted in inFlyBlocks before it is linked in.
void CoalRequestQ::putBlock(FreeBlock *fBlock)
{
    MALLOC_ASSERT(fBlock->sizeTmp >= FreeBlock::minBlockSize, ASSERT_TEXT);
    fBlock->markUsed();
    // the block is in the queue, do not forget that it's here
    inFlyBlocks++;

    FreeBlock *myBlToFree = blocksToFree.load(std::memory_order_acquire);
    for (;;) {
        fBlock->nextToFree = myBlToFree;
        // A failed exchange reloads myBlToFree with the current head,
        // so the next iteration links against fresh data.
        if (blocksToFree.compare_exchange_strong(myBlToFree, fBlock)) {
            return;
        }
    }
}

// Detach the whole blocksToFree list and return its head,
// or nullptr when the list is empty.
FreeBlock *CoalRequestQ::getAll()
{
    for (;;) {
        FreeBlock *myBlToFree = blocksToFree.load(std::memory_order_acquire);

        if (!myBlToFree) {
            return nullptr;
        } else {
            // Swap the observed head for nullptr; retry from a fresh load
            // if the head changed in between.
            if (blocksToFree.compare_exchange_strong(myBlToFree, nullptr)) {
                return myBlToFree;
            } else {
                continue;
            }
        }
    }
}

// Account for one queued block having been processed: notify bkndSync that
// bins were modified, then decrement inFlyBlocks.
inline void CoalRequestQ::blockWasProcessed()
{
    bkndSync->binsModified();
    int prev = inFlyBlocks.fetch_sub(1);
    // prev is only read by the assertion below.
    tbb::detail::suppress_unused_warning(prev);
    MALLOC_ASSERT(prev > 0, ASSERT_TEXT);
}

// Try to get a block from a bin.
// If the remaining free space would stay in the same bin,
//     split the block without removing it.
// If the free space should go to other bin(s), remove the block.
// alignedBin is true, if all blocks in the bin have slab-aligned right side.
FreeBlock *Backend::IndexedBins::getFromBin(int binIdx, BackendSync *sync, size_t size, bool needAlignedRes, bool alignedBin, bool wait, int *binLocked) { Bin *b = &freeBins[binIdx]; try_next: FreeBlock *fBlock = nullptr; if (!b->empty()) { bool locked = false; MallocMutex::scoped_lock scopedLock(b->tLock, wait, &locked); if (!locked) { if (binLocked) (*binLocked)++; return nullptr; } for (FreeBlock *curr = b->head.load(std::memory_order_relaxed); curr; curr = curr->next) { size_t szBlock = curr->tryLockBlock(); if (!szBlock) { // block is locked, re-do bin lock, as there is no place to spin // while block coalescing goto try_next; } // GENERAL CASE if (alignedBin || !needAlignedRes) { size_t splitSz = szBlock - size; // If we got a block as split result, it must have a room for control structures. if (szBlock >= size && (splitSz >= FreeBlock::minBlockSize || !splitSz)) fBlock = curr; } else { // SPECIAL CASE, to get aligned block from unaligned bin we have to cut the middle of a block // and return remaining left and right part. Possible only in fixed pool scenario, assert for this // is set inside splitBlock() function. void *newB = alignUp(curr, slabSize); uintptr_t rightNew = (uintptr_t)newB + size; uintptr_t rightCurr = (uintptr_t)curr + szBlock; // Check if the block size is sufficient, // and also left and right split results are either big enough or non-existent if (rightNew <= rightCurr && (newB == curr || ((uintptr_t)newB - (uintptr_t)curr) >= FreeBlock::minBlockSize) && (rightNew == rightCurr || (rightCurr - rightNew) >= FreeBlock::minBlockSize)) fBlock = curr; } if (fBlock) { // consume must be called before result of removing from a bin is visible externally. 
sync->blockConsumed(); // TODO: think about cases when block stays in the same bin b->removeBlock(fBlock); if (freeBins[binIdx].empty()) bitMask.set(binIdx, false); fBlock->sizeTmp = szBlock; break; } else { // block size is not valid, search for next block in the bin curr->setMeFree(szBlock); curr->rightNeig(szBlock)->setLeftFree(szBlock); } } } return fBlock; } bool Backend::IndexedBins::tryReleaseRegions(int binIdx, Backend *backend) { Bin *b = &freeBins[binIdx]; FreeBlock *fBlockList = nullptr; // got all blocks from the bin and re-do coalesce on them // to release single-block regions try_next: if (!b->empty()) { MallocMutex::scoped_lock binLock(b->tLock); for (FreeBlock *curr = b->head.load(std::memory_order_relaxed); curr; ) { size_t szBlock = curr->tryLockBlock(); if (!szBlock) goto try_next; FreeBlock *next = curr->next; b->removeBlock(curr); curr->sizeTmp = szBlock; curr->nextToFree = fBlockList; fBlockList = curr; curr = next; } } return backend->coalescAndPutList(fBlockList, /*forceCoalescQDrop=*/true, /*reportBlocksProcessed=*/false); } void Backend::Bin::removeBlock(FreeBlock *fBlock) { MALLOC_ASSERT(fBlock->next||fBlock->prev||fBlock== head.load(std::memory_order_relaxed), "Detected that a block is not in the bin."); if (head.load(std::memory_order_relaxed) == fBlock) head.store(fBlock->next, std::memory_order_relaxed); if (tail == fBlock) tail = fBlock->prev; if (fBlock->prev) fBlock->prev->next = fBlock->next; if (fBlock->next) fBlock->next->prev = fBlock->prev; } void Backend::IndexedBins::addBlock(int binIdx, FreeBlock *fBlock, size_t /* blockSz */, bool addToTail) { Bin *b = &freeBins[binIdx]; fBlock->myBin = binIdx; fBlock->next = fBlock->prev = nullptr; { MallocMutex::scoped_lock scopedLock(b->tLock); if (addToTail) { fBlock->prev = b->tail; b->tail = fBlock; if (fBlock->prev) fBlock->prev->next = fBlock; if (!b->head.load(std::memory_order_relaxed)) b->head.store(fBlock, std::memory_order_relaxed); } else { fBlock->next = 
b->head.load(std::memory_order_relaxed); b->head.store(fBlock, std::memory_order_relaxed); if (fBlock->next) fBlock->next->prev = fBlock; if (!b->tail) b->tail = fBlock; } } bitMask.set(binIdx, true); } bool Backend::IndexedBins::tryAddBlock(int binIdx, FreeBlock *fBlock, bool addToTail) { bool locked = false; Bin *b = &freeBins[binIdx]; fBlock->myBin = binIdx; if (addToTail) { fBlock->next = nullptr; { MallocMutex::scoped_lock scopedLock(b->tLock, /*wait=*/false, &locked); if (!locked) return false; fBlock->prev = b->tail; b->tail = fBlock; if (fBlock->prev) fBlock->prev->next = fBlock; if (!b->head.load(std::memory_order_relaxed)) b->head.store(fBlock, std::memory_order_relaxed); } } else { fBlock->prev = nullptr; { MallocMutex::scoped_lock scopedLock(b->tLock, /*wait=*/false, &locked); if (!locked) return false; fBlock->next = b->head.load(std::memory_order_relaxed); b->head.store(fBlock, std::memory_order_relaxed); if (fBlock->next) fBlock->next->prev = fBlock; if (!b->tail) b->tail = fBlock; } } bitMask.set(binIdx, true); return true; } void Backend::IndexedBins::reset() { for (unsigned i=0; ifixedPool, "Aligned block request from unaligned bin possible only in fixed pool scenario."); // Space to use is in the middle FreeBlock *newBlock = alignUp(fBlock, slabSize); FreeBlock *rightPart = (FreeBlock*)((uintptr_t)newBlock + totalSize); uintptr_t fBlockEnd = (uintptr_t)fBlock + fBlock->sizeTmp; // Return free right part if ((uintptr_t)rightPart != fBlockEnd) { rightPart->initHeader(); // to prevent coalescing rightPart with fBlock size_t rightSize = fBlockEnd - (uintptr_t)rightPart; coalescAndPut(rightPart, rightSize, toAlignedBin(rightPart, rightSize)); } // And free left part if (newBlock != fBlock) { newBlock->initHeader(); // to prevent coalescing fBlock with newB size_t leftSize = (uintptr_t)newBlock - (uintptr_t)fBlock; coalescAndPut(fBlock, leftSize, toAlignedBin(fBlock, leftSize)); } fBlock = newBlock; } else if (size_t splitSize = fBlock->sizeTmp - 
totalSize) { // need to split the block
        // GENERAL CASE, cut the left or right part of the block
        FreeBlock *splitBlock = nullptr;
        if (needAlignedBlock) {
            // For slab aligned blocks cut the right side of the block
            // and return it to a requester, original block returns to backend
            splitBlock = fBlock;
            fBlock = (FreeBlock*)((uintptr_t)splitBlock + splitSize);
            fBlock->initHeader();
        } else {
            // For large object blocks cut original block and put free right part to backend
            splitBlock = (FreeBlock*)((uintptr_t)fBlock + totalSize);
            splitBlock->initHeader();
        }
        // Mark free block as its parent only when the requested type (needAlignedBlock)
        // and returned from Bins/OS block (isAligned) are equal (XOR operation used)
        bool markAligned = (blockIsAligned ^ needAlignedBlock) ? toAlignedBin(splitBlock, splitSize) : blockIsAligned;
        coalescAndPut(splitBlock, splitSize, markAligned);
    }
    MALLOC_ASSERT(!needAlignedBlock || isAligned(fBlock, slabSize), "Expect to get aligned block, if one was requested.");
    FreeBlock::markBlocks(fBlock, num, size);
    return fBlock;
}

// Upper size bound for binned blocks: maxBinned_HugePage when huge pages are
// enabled and the backend does not serve a user pool, else maxBinned_SmallPage.
size_t Backend::getMaxBinnedSize() const
{
    return hugePages.isEnabled && !inUserPool() ?
        maxBinned_HugePage : maxBinned_SmallPage;
}

// True when requestSize is a new maximum that is still below
// backend->getMaxBinnedSize(); used with AtomicUpdate() in genericGetBlock().
inline bool Backend::MaxRequestComparator::operator()(size_t oldMaxReq, size_t requestSize) const
{
    return requestSize > oldMaxReq && requestSize < backend->getMaxBinnedSize();
}

// last chance to get memory
// Returns the VALID_BLOCK_IN_BIN sentinel (not a real block) when the caller
// should rescan the bins, or nullptr when nothing could be released.
FreeBlock *Backend::releaseMemInCaches(intptr_t startModifiedCnt, int *lockedBinsThreshold, int numOfLockedBins)
{
    // something released from caches
    if (extMemPool->hardCachesCleanup(false))
        return (FreeBlock*)VALID_BLOCK_IN_BIN;

    if (bkndSync.waitTillBlockReleased(startModifiedCnt))
        return (FreeBlock*)VALID_BLOCK_IN_BIN;

    // OS can't give us more memory, but we have some in locked bins
    if (*lockedBinsThreshold && numOfLockedBins) {
        // drop the threshold to 0 so the caller's scan loop keeps retrying
        // while any bin is still locked
        *lockedBinsThreshold = 0;
        return (FreeBlock*)VALID_BLOCK_IN_BIN;
    }
    return nullptr; // nothing found, give up
}

// Requests a new region for a block of blockSize bytes.
// The result is either a real block, the VALID_BLOCK_IN_BIN sentinel
// (caller must rescan the bins) or nullptr; *splittableRet is set to false
// on the exact-fit (MEMREG_ONE_BLOCK) path.
FreeBlock *Backend::askMemFromOS(size_t blockSize, intptr_t startModifiedCnt, int *lockedBinsThreshold, int numOfLockedBins, bool *splittableRet, bool needSlabRegion)
{
    FreeBlock *block;
    // The block sizes can be divided into 3 groups:
    //   1. "quite small": popular object size, we are in bootstrap or something
    //      like; request several regions.
    //   2. "quite large": we want to have several such blocks in the region
    //      but not want several pre-allocated regions.
    //   3. "huge": exact fit, we allocate only one block and do not allow
    //       any other allocations to placed in a region.
    // Dividing the block sizes in these groups we are trying to balance between
    // too small regions (that leads to fragmentation) and too large ones (that
    // leads to excessive address space consumption). If a region is "too
    // large", allocate only one, to prevent fragmentation. It supposedly
    // doesn't hurt performance, because the object requested by user is large.
    // Bounds for the groups are:
    const size_t maxBinned = getMaxBinnedSize();
    const size_t quiteSmall = maxBinned / 8;
    const size_t quiteLarge = maxBinned;

    if (blockSize >= quiteLarge) {
        // Do not interact with other threads via semaphores, as for exact fit
        // we can't share regions with them, memory requesting is individual.
        block = addNewRegion(blockSize, MEMREG_ONE_BLOCK, /*addToBin=*/false);
        if (!block)
            return releaseMemInCaches(startModifiedCnt, lockedBinsThreshold, numOfLockedBins);
        *splittableRet = false;
    } else {
        const size_t regSz_sizeBased = alignUp(4*maxRequestedSize, 1024*1024);
        // Another thread is modifying backend while we can't get the block.
        // Wait while it leaves and re-do the scan
        // before trying other ways to extend the backend.
        if (bkndSync.waitTillBlockReleased(startModifiedCnt)
            // semaphore is protecting adding more memory from OS
            || memExtendingSema.wait())
            return (FreeBlock*)VALID_BLOCK_IN_BIN;

        if (startModifiedCnt != bkndSync.getNumOfMods()) {
            memExtendingSema.signal();
            return (FreeBlock*)VALID_BLOCK_IN_BIN;
        }

        if (blockSize < quiteSmall) {
            // For this size of blocks, add NUM_OF_REG "advance" regions in bin,
            // and return one as a result.
            // TODO: add to bin first, because other threads can use them right away.
            // This must be done carefully, because blocks in bins can be released
            // in releaseCachesToLimit().
            const unsigned NUM_OF_REG = 3;
            MemRegionType regType = needSlabRegion ? MEMREG_SLAB_BLOCKS : MEMREG_LARGE_BLOCKS;
            block = addNewRegion(regSz_sizeBased, regType, /*addToBin=*/false);
            if (block)
                // NOTE(review): source text is damaged here. Everything between
                // "idx" and "softCachesCleanup" is missing (presumably stripped
                // as if "<NUM_OF_REG ... ->" were a markup tag). The lost span
                // apparently holds the rest of askMemFromOS() and the start of
                // the soft-limit cleanup routine that the statements below
                // belong to. Restore it from the upstream file; do not compile as is.
                for (unsigned idx=0; idxsoftCachesCleanup() &&
                    (locTotalMemSize = totalMemSize.load(std::memory_order_acquire)) <=
                    (locMemSoftLimit = memSoftLimit.load(std::memory_order_acquire)))
                    return;
    // clean global large-object cache, if this is not enough, clean local caches
    // do this in several tries, because backend fragmentation can prevent
    // region from releasing
    for (int cleanLocal = 0; cleanLocal<2; cleanLocal++)
        while (cleanLocal ?
                 extMemPool->allLocalCaches.cleanup(/*cleanOnlyUnused=*/true) :
                 extMemPool->loc.decreasingCleanup())
            if ((locTotalMemSize = totalMemSize.load(std::memory_order_acquire)) <=
                (locMemSoftLimit = memSoftLimit.load(std::memory_order_acquire)))
                return;
    // last chance to match memSoftLimit
    extMemPool->hardCachesCleanup(true);
}

// Returns bitMask.getMinTrue(startBin), mapping its -1 result
// to Backend::freeBinsNum.
int Backend::IndexedBins::getMinNonemptyBin(unsigned startBin) const
{
    int p = bitMask.getMinTrue(startBin);
    return p == -1 ? Backend::freeBinsNum : p;
}

// Visits bins reported by getMinNonemptyBin(), starting from nativeBin, and
// returns the first block that getFromBin() yields (called with wait=false),
// or nullptr if no bin produced one.
FreeBlock *Backend::IndexedBins::findBlock(int nativeBin, BackendSync *sync, size_t size, bool needAlignedBlock, bool alignedBin, int *numOfLockedBins)
{
    for (int i=getMinNonemptyBin(nativeBin); i<(int)freeBinsNum; i=getMinNonemptyBin(i+1))
        if (FreeBlock *block = getFromBin(i, sync, size, needAlignedBlock, alignedBin, /*wait=*/false, numOfLockedBins))
            return block;

    return nullptr;
}

// Pre-requests a 2 MB slab region once. The status is checked both before
// and after taking bootsrapMemStatusMutex, so only the first caller that sees
// bootsrapMemNotDone issues the request.
void Backend::requestBootstrapMem()
{
    if (bootsrapMemDone == bootsrapMemStatus.load(std::memory_order_acquire))
        return;
    MallocMutex::scoped_lock lock( bootsrapMemStatusMutex );
    if (bootsrapMemDone == bootsrapMemStatus)
        return;
    MALLOC_ASSERT(bootsrapMemNotDone == bootsrapMemStatus, ASSERT_TEXT);
    bootsrapMemStatus = bootsrapMemInitializing;
    // request some rather big region during bootstrap in advance
    // ok to get nullptr here, as later we re-do a request with more modest size
    addNewRegion(2*1024*1024, MEMREG_SLAB_BLOCKS, /*addToBin=*/true);
    bootsrapMemStatus = bootsrapMemDone;
}

// try to allocate num*size Byte block in available bins
// needAlignedBlock is true if result must be slab-aligned
// Returns nullptr only when askMemFromOS() gives up.
FreeBlock *Backend::genericGetBlock(int num, size_t size, bool needAlignedBlock)
{
    FreeBlock *block = nullptr;
    const size_t totalReqSize = num*size;
    // no splitting after requesting new region, asks exact size
    const int nativeBin = sizeToBin(totalReqSize);

    requestBootstrapMem();
    // If we found 2 or less locked bins, it's time to ask more memory from OS.
    // But nothing can be asked from fixed pool. And we prefer wait, not ask
    // for more memory, if block is quite large.
    int lockedBinsThreshold = extMemPool->fixedPool || size>=maxBinned_SmallPage? 0 : 2;

    // Find maximal requested size limited by getMaxBinnedSize()
    AtomicUpdate(maxRequestedSize, totalReqSize, MaxRequestComparator(this));
    scanCoalescQ(/*forceCoalescQDrop=*/false);

    bool splittable = true;
    for (;;) {
        const intptr_t startModifiedCnt = bkndSync.getNumOfMods();
        int numOfLockedBins;
        intptr_t cleanCnt;

        // Rescan while too many bins were locked, or while Backend::clean()
        // is running (odd counter) or has run during the scan (counter changed).
        do {
            cleanCnt = backendCleanCnt.load(std::memory_order_acquire);
            numOfLockedBins = 0;
            if (needAlignedBlock) {
                block = freeSlabAlignedBins.findBlock(nativeBin, &bkndSync, num*size, needAlignedBlock,
                                                      /*alignedBin=*/true, &numOfLockedBins);
                // only a fixed pool falls back to the other kind of bins
                if (!block && extMemPool->fixedPool)
                    block = freeLargeBlockBins.findBlock(nativeBin, &bkndSync, num*size, needAlignedBlock,
                                                         /*alignedBin=*/false, &numOfLockedBins);
            } else {
                block = freeLargeBlockBins.findBlock(nativeBin, &bkndSync, num*size, needAlignedBlock,
                                                     /*alignedBin=*/false, &numOfLockedBins);
                if (!block && extMemPool->fixedPool)
                    block = freeSlabAlignedBins.findBlock(nativeBin, &bkndSync, num*size, needAlignedBlock,
                                                          /*alignedBin=*/true, &numOfLockedBins);
            }
        } while (!block && (numOfLockedBins>lockedBinsThreshold || cleanCnt % 2 == 1 ||
                            cleanCnt != backendCleanCnt.load(std::memory_order_acquire)));

        if (block)
            break;

        bool retScanCoalescQ = scanCoalescQ(/*forceCoalescQDrop=*/true);
        bool retSoftCachesCleanup = extMemPool->softCachesCleanup();
        if (!(retScanCoalescQ || retSoftCachesCleanup)) {
            // bins are not updated,
            // only remaining possibility is to ask for more memory
            block = askMemFromOS(totalReqSize, startModifiedCnt, &lockedBinsThreshold,
                                 numOfLockedBins, &splittable, needAlignedBlock);
            if (!block)
                return nullptr;
            if (block != (FreeBlock*)VALID_BLOCK_IN_BIN) {
                // size can be increased in askMemFromOS, that's why >=
                MALLOC_ASSERT(block->sizeTmp >= size, ASSERT_TEXT);
                break;
            }
            // valid block somewhere in bins, let's find it
            block = nullptr;
        }
    }
    MALLOC_ASSERT(block, ASSERT_TEXT);
    if (splittable) {
        // At this point we have to be sure that slabAligned attribute describes the right block state
        block = splitBlock(block, num, size, block->slabAligned, needAlignedBlock);
    }
    // matched blockConsumed() from startUseBlock()
    bkndSync.blockReleased();

    return block;
}

// Gets a block of `size` bytes without slab alignment. On success stores
// `size` in unalignedSize and, for a user pool, links the block into
// extMemPool->lmbList. Returns nullptr on failure.
LargeMemoryBlock *Backend::getLargeBlock(size_t size)
{
    LargeMemoryBlock *lmb =
        (LargeMemoryBlock*)genericGetBlock(1, size, /*needAlignedRes=*/false);
    if (lmb) {
        lmb->unalignedSize = size;
        if (extMemPool->userPool())
            extMemPool->lmbList.add(lmb);
    }
    return lmb;
}

// Gets `num` contiguous slabSize-byte blocks, slab-aligned.
BlockI *Backend::getSlabBlock(int num)
{
    BlockI *b = (BlockI*)genericGetBlock(num, slabSize, /*slabAligned=*/true);
    MALLOC_ASSERT(isAligned(b, slabSize), ASSERT_TEXT);
    return b;
}

// Returns a single slabSize-byte block to the backend.
void Backend::putSlabBlock(BlockI *block)
{
    genericPutBlock((FreeBlock *)block, slabSize, /*slabAligned=*/true);
}

// Gets `size` bytes for back references. *rawMemUsed tells the caller which
// source served the request; it is left untouched when both sources fail.
void *Backend::getBackRefSpace(size_t size, bool *rawMemUsed)
{
    // This block is released only at shutdown, so it can prevent
    // an entire region releasing when it's received from the backend,
    // so prefer getRawMemory using.
    if (void *ret = getRawMemory(size, REGULAR)) {
        *rawMemUsed = true;
        return ret;
    }
    void *ret = genericGetBlock(1, size, /*needAlignedRes=*/false);
    if (ret) *rawMemUsed = false;
    return ret;
}

// Counterpart of getBackRefSpace(): frees only memory obtained as raw memory.
void Backend::putBackRefSpace(void *b, size_t size, bool rawMemUsed)
{
    if (rawMemUsed)
        freeRawMemory(b, size);
    // ignore not raw mem, as it released on region releasing
}

// Removes fBlock from the bin set selected by its slabAligned flag;
// does nothing when the block is not in a bin (myBin == NO_BIN).
void Backend::removeBlockFromBin(FreeBlock *fBlock)
{
    if (fBlock->myBin != Backend::NO_BIN) {
        if (fBlock->slabAligned)
            freeSlabAlignedBins.lockRemoveBlock(fBlock->myBin, fBlock);
        else
            freeLargeBlockBins.lockRemoveBlock(fBlock->myBin, fBlock);
    }
}

// Returns a block to the backend; the coalescing is bracketed by
// blockConsumed()/blockReleased() on bkndSync.
void Backend::genericPutBlock(FreeBlock *fBlock, size_t blockSz, bool slabAligned)
{
    bkndSync.blockConsumed();
    coalescAndPut(fBlock, blockSz, slabAligned);
    bkndSync.blockReleased();
}

// Pushes lmb at the head of the doubly-linked list, under largeObjLock.
void AllLargeBlocksList::add(LargeMemoryBlock *lmb)
{
    MallocMutex::scoped_lock scoped_cs(largeObjLock);
    lmb->gPrev = nullptr;
    lmb->gNext = loHead;
    if (lmb->gNext)
        lmb->gNext->gPrev = lmb;
    loHead = lmb;
}

// Unlinks lmb from the doubly-linked list, under largeObjLock.
void AllLargeBlocksList::remove(LargeMemoryBlock *lmb)
{
    MallocMutex::scoped_lock scoped_cs(largeObjLock);
    if (loHead == lmb)
        loHead = lmb->gNext;
    if (lmb->gNext)
        lmb->gNext->gPrev = lmb->gPrev;
    if (lmb->gPrev)
        lmb->gPrev->gNext = lmb->gNext;
}

// Returns a large block to the backend, unlinking it from the user pool's
// lmbList first when applicable.
void Backend::putLargeBlock(LargeMemoryBlock *lmb)
{
    if (extMemPool->userPool())
        extMemPool->lmbList.remove(lmb);
    genericPutBlock((FreeBlock *)lmb, lmb->unalignedSize, false);
}

// Drops the block's back reference, returns the block to the backend and
// updates the freeLargeObj statistic.
void Backend::returnLargeObject(LargeMemoryBlock *lmb)
{
    removeBackRef(lmb->backRefIdx);
    putLargeBlock(lmb);
    STAT_increment(getThreadId(), ThreadCommonCounters, freeLargeObj);
}

#if BACKEND_HAS_MREMAP
// Tries to resize a large object in place of a copy by mremap()-ing its
// region. Only regions of type MEMREG_ONE_BLOCK are remapped. Returns the new
// object address (same offset from the region start), or nullptr when the
// object is not eligible or mremap() fails.
void *Backend::remap(void *ptr, size_t oldSize, size_t newSize, size_t alignment)
{
    // no remap for user pools and for object too small that living in bins
    // NOTE(review): the condition below is damaged; the text between
    // "min(oldSize, newSize)" and "extMemPool->granularity" appears to have
    // been stripped (a "<...>" span). Restore it from the upstream file.
    if (inUserPool() || min(oldSize, newSize)extMemPool->granularity)
        return nullptr;
    const LargeMemoryBlock* lmbOld = ((LargeObjectHdr *)ptr - 1)->memoryBlock;
    const size_t oldUnalignedSize = lmbOld->unalignedSize;
    FreeBlock *oldFBlock = (FreeBlock *)lmbOld;
    FreeBlock *right = oldFBlock->rightNeig(oldUnalignedSize);
    // in every region only one block can have LAST_REGION_BLOCK on right,
    // so no synchronization is needed
    if (!right->isLastRegionBlock())
        return nullptr;

    // NOTE(review): static_cast has lost its template argument here
    // (presumably a LastFreeBlock pointer, given the ->memRegion access).
    MemRegion *oldRegion = static_cast(right)->memRegion;
    MALLOC_ASSERT( oldRegion < ptr, ASSERT_TEXT );
    const size_t oldRegionSize = oldRegion->allocSz;
    if (oldRegion->type != MEMREG_ONE_BLOCK)
        return nullptr;  // we are not single in the region
    const size_t userOffset = (uintptr_t)ptr - (uintptr_t)oldRegion;
    const size_t alignedSize = LargeObjectCache::alignToBin(newSize + userOffset);
    const size_t requestSize =
        alignUp(sizeof(MemRegion) + alignedSize + sizeof(LastFreeBlock), extMemPool->granularity);
    if (requestSize < alignedSize) // is wrapped around?
        return nullptr;
    regionList.remove(oldRegion);

    // The deallocation should be registered in address range before mremap to
    // prevent a race condition with allocation on another thread.
    // (OS can reuse the memory and registerAlloc will be missed on another thread)
    usedAddrRange.registerFree((uintptr_t)oldRegion, (uintptr_t)oldRegion + oldRegionSize);

    void *ret = mremap(oldRegion, oldRegion->allocSz, requestSize, MREMAP_MAYMOVE);
    if (MAP_FAILED == ret) { // can't remap, revert and leave
        regionList.add(oldRegion);
        usedAddrRange.registerAlloc((uintptr_t)oldRegion, (uintptr_t)oldRegion + oldRegionSize);
        return nullptr;
    }
    MemRegion *region = (MemRegion*)ret;
    MALLOC_ASSERT(region->type == MEMREG_ONE_BLOCK, ASSERT_TEXT);
    region->allocSz = requestSize;
    region->blockSz = alignedSize;

    FreeBlock *fBlock = (FreeBlock *)alignUp((uintptr_t)region + sizeof(MemRegion),
                                             largeObjectAlignment);

    regionList.add(region);
    startUseBlock(region, fBlock, /*addToBin=*/false);
    MALLOC_ASSERT(fBlock->sizeTmp == region->blockSz, ASSERT_TEXT);
    // matched blockConsumed() in startUseBlock().
    // TODO: get rid of useless pair blockConsumed()/blockReleased()
    bkndSync.blockReleased();

    // object must start at same offset from region's start
    void *object = (void*)((uintptr_t)region + userOffset);
    MALLOC_ASSERT(isAligned(object, alignment), ASSERT_TEXT);
    LargeObjectHdr *header = (LargeObjectHdr*)object - 1;
    setBackRef(header->backRefIdx, header);

    LargeMemoryBlock *lmb = (LargeMemoryBlock*)fBlock;
    lmb->unalignedSize = region->blockSz;
    lmb->objectSize = newSize;
    lmb->backRefIdx = header->backRefIdx;
    header->memoryBlock = lmb;
    MALLOC_ASSERT((uintptr_t)lmb + lmb->unalignedSize >=
                  (uintptr_t)object + lmb->objectSize, "An object must fit to the block.");

    usedAddrRange.registerAlloc((uintptr_t)region, (uintptr_t)region + requestSize);
    totalMemSize.fetch_add(region->allocSz - oldRegionSize);

    return object;
}
#endif /* BACKEND_HAS_MREMAP */

// Unlinks the region from regionList and passes its memory to freeRawMem().
void Backend::releaseRegion(MemRegion *memRegion)
{
    regionList.remove(memRegion);
    freeRawMem(memRegion, memRegion->allocSz);
}

// coalesce fBlock with its neighborhood
// Returns the resulting (possibly merged) block with sizeTmp set to the
// merged size, or nullptr when a neighbor was busy and the block was queued
// in coalescQ instead. On a non-null return *mRegion is the owning region if
// a LAST_REGION_BLOCK marker was found on the right, else nullptr; it is not
// written on the nullptr paths.
FreeBlock *Backend::doCoalesc(FreeBlock *fBlock, MemRegion **mRegion)
{
    FreeBlock *resBlock = fBlock;
    size_t resSize = fBlock->sizeTmp;
    MemRegion *memRegion = nullptr;

    fBlock->markCoalescing(resSize);
    resBlock->blockInBin = false;

    // coalescing with left neighbor
    size_t leftSz = fBlock->trySetLeftUsed(GuardedSize::COAL_BLOCK);
    if (leftSz != GuardedSize::LOCKED) {
        if (leftSz == GuardedSize::COAL_BLOCK) {
            coalescQ.putBlock(fBlock);
            return nullptr;
        } else {
            FreeBlock *left = fBlock->leftNeig(leftSz);
            size_t lSz = left->trySetMeUsed(GuardedSize::COAL_BLOCK);
            if (lSz <= GuardedSize::MAX_LOCKED_VAL) {
                fBlock->setLeftFree(leftSz); // rollback
                coalescQ.putBlock(fBlock);
                return nullptr;
            } else {
                MALLOC_ASSERT(lSz == leftSz, "Invalid header");
                left->blockInBin = true;
                resBlock = left;
                resSize += leftSz;
                resBlock->sizeTmp = resSize;
            }
        }
    }
    // coalescing with right neighbor
    FreeBlock *right = fBlock->rightNeig(fBlock->sizeTmp);
    size_t rightSz = right->trySetMeUsed(GuardedSize::COAL_BLOCK);
    if (rightSz != GuardedSize::LOCKED) {
        // LastFreeBlock is on the right side
        if (GuardedSize::LAST_REGION_BLOCK == rightSz) {
            right->setMeFree(GuardedSize::LAST_REGION_BLOCK);
            // NOTE(review): static_cast has lost its template argument here.
            memRegion = static_cast(right)->memRegion;
        } else if (GuardedSize::COAL_BLOCK == rightSz) {
            if (resBlock->blockInBin) {
                resBlock->blockInBin = false;
                removeBlockFromBin(resBlock);
            }
            coalescQ.putBlock(resBlock);
            return nullptr;
        } else {
            size_t rSz = right->rightNeig(rightSz)->
                trySetLeftUsed(GuardedSize::COAL_BLOCK);
            if (rSz <= GuardedSize::MAX_LOCKED_VAL) {
                right->setMeFree(rightSz);  // rollback
                if (resBlock->blockInBin) {
                    resBlock->blockInBin = false;
                    removeBlockFromBin(resBlock);
                }
                coalescQ.putBlock(resBlock);
                return nullptr;
            } else {
                MALLOC_ASSERT(rSz == rightSz, "Invalid header");
                removeBlockFromBin(right);
                resSize += rightSz;

                // Is LastFreeBlock on the right side of right?
                FreeBlock *nextRight = right->rightNeig(rightSz);
                size_t nextRightSz = nextRight->
                    trySetMeUsed(GuardedSize::COAL_BLOCK);
                if (nextRightSz > GuardedSize::MAX_LOCKED_VAL) {
                    if (nextRightSz == GuardedSize::LAST_REGION_BLOCK)
                        // NOTE(review): static_cast has lost its template argument here.
                        memRegion = static_cast(nextRight)->memRegion;

                    nextRight->setMeFree(nextRightSz);
                }
            }
        }
    }
    if (memRegion) {
        MALLOC_ASSERT((uintptr_t)memRegion + memRegion->allocSz >=
                      (uintptr_t)right + sizeof(LastFreeBlock), ASSERT_TEXT);
        MALLOC_ASSERT((uintptr_t)memRegion < (uintptr_t)resBlock, ASSERT_TEXT);
        *mRegion = memRegion;
    } else
        *mRegion = nullptr;
    resBlock->sizeTmp = resSize;
    return resBlock;
}

// Coalesces every block of the nextToFree-linked `list` and puts the results
// into bins (or back into coalescQ when a bin cannot be taken and
// forceCoalescQDrop is false). Returns true if at least one region was released.
bool Backend::coalescAndPutList(FreeBlock *list, bool forceCoalescQDrop, bool reportBlocksProcessed)
{
    bool regionReleased = false;

    for (FreeBlock *helper; list;
         list = helper,
             // matches block enqueue in CoalRequestQ::putBlock()
             reportBlocksProcessed? coalescQ.blockWasProcessed() : (void)0) {
        MemRegion *memRegion;
        bool addToTail = false;

        // remember the successor first: doCoalesc() may merge or queue `list`
        helper = list->nextToFree;
        FreeBlock *toRet = doCoalesc(list, &memRegion);
        if (!toRet)
            continue;

        if (memRegion && memRegion->blockSz == toRet->sizeTmp
            && !extMemPool->fixedPool) {
            if (extMemPool->regionsAreReleaseable()) {
                // release the region, because there is no used blocks in it
                if (toRet->blockInBin)
                    removeBlockFromBin(toRet);
                releaseRegion(memRegion);
                regionReleased = true;
                continue;
            } else // add block from empty region to end of bin,
                addToTail = true; // preserving for exact fit
        }
        size_t currSz = toRet->sizeTmp;
        int bin = sizeToBin(currSz);
        bool toAligned = extMemPool->fixedPool ? toAlignedBin(toRet, currSz) : toRet->slabAligned;
        bool needAddToBin = true;

        if (toRet->blockInBin) {
            // Does it stay in same bin?
            if (toRet->myBin == bin && toRet->slabAligned == toAligned)
                needAddToBin = false;
            else {
                toRet->blockInBin = false;
                removeBlockFromBin(toRet);
            }
        }

        // Does not stay in same bin, or bin-less; add it
        if (needAddToBin) {
            toRet->prev = toRet->next = toRet->nextToFree = nullptr;
            toRet->myBin = NO_BIN;
            toRet->slabAligned = toAligned;

            // If the block is too small to fit in any bin, keep it bin-less.
            // It's not a leak because the block later can be coalesced.
            if (currSz >= minBinnedSize) {
                toRet->sizeTmp = currSz;
                IndexedBins *target = toRet->slabAligned ? &freeSlabAlignedBins : &freeLargeBlockBins;
                if (forceCoalescQDrop) {
                    target->addBlock(bin, toRet, toRet->sizeTmp, addToTail);
                } else if (!target->tryAddBlock(bin, toRet, addToTail)) {
                    coalescQ.putBlock(toRet);
                    continue;
                }
            }
            toRet->sizeTmp = 0;
        }
        // Free (possibly coalesced) free block.
        // Adding to bin must be done before this point,
        // because after a block is free it can be coalesced, and
        // using its pointer became unsafe.
        // Remember that coalescing is not done under any global lock.
        toRet->setMeFree(currSz);
        toRet->rightNeig(currSz)->setLeftFree(currSz);
    }
    return regionReleased;
}

// Coalesce fBlock and add it back to a bin;
// processing delayed coalescing requests.
void Backend::coalescAndPut(FreeBlock *fBlock, size_t blockSz, bool slabAligned)
{
    // turn the block into a one-element list for coalescAndPutList()
    fBlock->sizeTmp = blockSz;
    fBlock->nextToFree = nullptr;
    fBlock->slabAligned = slabAligned;

    coalescAndPutList(fBlock, /*forceCoalescQDrop=*/false, /*reportBlocksProcessed=*/false);
}

// Drains coalescQ and re-processes the delayed blocks.
// Returns true if the queue was non-empty.
bool Backend::scanCoalescQ(bool forceCoalescQDrop)
{
    FreeBlock *currCoalescList = coalescQ.getAll();

    if (currCoalescList)
        // reportBlocksProcessed=true informs that the blocks leave coalescQ,
        // matches blockConsumed() from CoalRequestQ::putBlock()
        coalescAndPutList(currCoalescList, forceCoalescQDrop,
                          /*reportBlocksProcessed=*/true);
    // returns status of coalescQ.getAll(), as an indication of possible changes in backend
    // TODO: coalescAndPutList() may report is some new free blocks became available or not
    return currCoalescList;
}

// Computes where the single initial free block of `region` starts and how
// large it is, stores the size in region->blockSz and returns the start.
// exactBlockSize is used only for non-slab regions. Returns nullptr when the
// region is too small.
FreeBlock *Backend::findBlockInRegion(MemRegion *region, size_t exactBlockSize)
{
    FreeBlock *fBlock;
    size_t blockSz;
    uintptr_t fBlockEnd,
        lastFreeBlock = (uintptr_t)region + region->allocSz - sizeof(LastFreeBlock);

    static_assert(sizeof(LastFreeBlock) % sizeof(uintptr_t) == 0,
        "Atomic applied on LastFreeBlock, and we put it at the end of region, that"
        " is uintptr_t-aligned, so no unaligned atomic operations are possible.");
    // right bound is slab-aligned, keep LastFreeBlock after it
    if (region->type == MEMREG_SLAB_BLOCKS) {
        fBlock = (FreeBlock *)alignUp((uintptr_t)region + sizeof(MemRegion), sizeof(uintptr_t));
        fBlockEnd = alignDown(lastFreeBlock, slabSize);
    } else {
        fBlock = (FreeBlock *)alignUp((uintptr_t)region + sizeof(MemRegion), largeObjectAlignment);
        fBlockEnd = (uintptr_t)fBlock + exactBlockSize;
        MALLOC_ASSERT(fBlockEnd <= lastFreeBlock, ASSERT_TEXT);
    }
    if (fBlockEnd <= (uintptr_t)fBlock)
        return nullptr; // allocSz is too small
    blockSz = fBlockEnd - (uintptr_t)fBlock;
    // TODO: extend getSlabBlock to support degradation, i.e. getting less blocks
    // then requested, and then relax this check
    // (now all or nothing is implemented, check according to this)
    if (blockSz < numOfSlabAllocOnMiss*slabSize)
        return nullptr;

    region->blockSz = blockSz;
    return fBlock;
}

// startUseBlock may add the free block to a bin, the block can be used and
// even released after this, so the region must be added to regionList already
void Backend::startUseBlock(MemRegion *region, FreeBlock *fBlock, bool addToBin)
{
    size_t blockSz = region->blockSz;
    fBlock->initHeader();
    fBlock->setMeFree(blockSz);

    // NOTE(review): static_cast has lost its template argument here
    // (the target is assigned to a LastFreeBlock pointer).
    LastFreeBlock *lastBl = static_cast(fBlock->rightNeig(blockSz));
    // to not get unaligned atomics during LastFreeBlock access
    MALLOC_ASSERT(isAligned(lastBl, sizeof(uintptr_t)), nullptr);
    lastBl->initHeader();
    lastBl->setMeFree(GuardedSize::LAST_REGION_BLOCK);
    lastBl->setLeftFree(blockSz);
    lastBl->myBin = NO_BIN;
    lastBl->memRegion = region;

    if (addToBin) {
        unsigned targetBin = sizeToBin(blockSz);
        // during adding advance regions, register bin for a largest block in region
        advRegBins.registerBin(targetBin);
        if (region->type == MEMREG_SLAB_BLOCKS) {
            fBlock->slabAligned = true;
            freeSlabAlignedBins.addBlock(targetBin, fBlock, blockSz, /*addToTail=*/false);
        } else {
            fBlock->slabAligned = false;
            freeLargeBlockBins.addBlock(targetBin, fBlock, blockSz, /*addToTail=*/false);
        }
    } else {
        // to match with blockReleased() in genericGetBlock
        bkndSync.blockConsumed();
        // Understand our alignment for correct splitBlock operation
        fBlock->slabAligned = region->type == MEMREG_SLAB_BLOCKS ? true : false;
        fBlock->sizeTmp = fBlock->tryLockBlock();
        MALLOC_ASSERT(fBlock->sizeTmp >= FreeBlock::minBlockSize, "Locking must be successful");
    }
}

// Pushes r at the head of the region list. r->prev is cleared before the
// lock is taken; the links to the old head are set under regionListLock.
void MemRegionList::add(MemRegion *r)
{
    r->prev = nullptr;
    MallocMutex::scoped_lock lock(regionListLock);
    r->next = head;
    head = r;
    if (head->next)
        head->next->prev = head;
}

// Unlinks r from the region list under regionListLock.
void MemRegionList::remove(MemRegion *r)
{
    MallocMutex::scoped_lock lock(regionListLock);
    if (head == r)
        head = head->next;
    if (r->next)
        r->next->prev = r->prev;
    if (r->prev)
        r->prev->next = r->next;
}

#if __TBB_MALLOC_BACKEND_STAT
// Prints one entry per region to f and returns the number of regions.
int MemRegionList::reportStat(FILE *f)
{
    int regNum = 0;
    MallocMutex::scoped_lock lock(regionListLock);
    for (MemRegion *curr = head; curr; curr = curr->next) {
        // NOTE(review): %lu presumes blockSz has the width of unsigned long;
        // blockSz is assigned from size_t values elsewhere - confirm the
        // field type (%zu would be the portable specifier for size_t).
        fprintf(f, "%p: max block %lu B, ", curr, curr->blockSz);
        regNum++;
    }
    return regNum;
}
#endif

// Allocates raw memory for a new region and sets up its initial free block.
// Returns nullptr on failure; otherwise the VALID_BLOCK_IN_BIN sentinel when
// addToBin is true, or the block itself when addToBin is false.
FreeBlock *Backend::addNewRegion(size_t size, MemRegionType memRegType, bool addToBin)
{
    static_assert(sizeof(BlockMutexes) <= sizeof(BlockI), "Header must be not overwritten in used blocks");
    MALLOC_ASSERT(FreeBlock::minBlockSize > GuardedSize::MAX_SPEC_VAL,
          "Block length must not conflict with special values of GuardedSize");
    // If the region is not "for slabs" we should reserve some space for
    // a region header, the worst case alignment and the last block mark.
    const size_t requestSize = memRegType == MEMREG_SLAB_BLOCKS ? size :
        size + sizeof(MemRegion) + largeObjectAlignment
             + FreeBlock::minBlockSize + sizeof(LastFreeBlock);

    // allocRawMem() takes the size by reference and may change it
    size_t rawSize = requestSize;
    MemRegion *region = (MemRegion*)allocRawMem(rawSize);
    if (!region) {
        MALLOC_ASSERT(rawSize==requestSize, "getRawMem has not allocated memory but changed the allocated size.");
        return nullptr;
    }
    if (rawSize < sizeof(MemRegion)) {
        if (!extMemPool->fixedPool)
            freeRawMem(region, rawSize);
        return nullptr;
    }

    region->type = memRegType;
    region->allocSz = rawSize;
    FreeBlock *fBlock = findBlockInRegion(region, size);
    if (!fBlock) {
        if (!extMemPool->fixedPool)
            freeRawMem(region, rawSize);
        return nullptr;
    }
    // the region must be in regionList before startUseBlock() publishes the block
    regionList.add(region);
    startUseBlock(region, fBlock, addToBin);
    bkndSync.binsModified();
    return addToBin? (FreeBlock*)VALID_BLOCK_IN_BIN : fBlock;
}

// Binds the backend to its memory pool and initializes the helper objects.
void Backend::init(ExtMemoryPool *extMemoryPool)
{
    extMemPool = extMemoryPool;
    usedAddrRange.init();
    coalescQ.init(&bkndSync);
    bkndSync.init(this);
}

// Empties all bins and re-adds every known region as one free block.
void Backend::reset()
{
    MALLOC_ASSERT(extMemPool->userPool(), "Only user pool can be reset.");
    // no active threads are allowed in backend while reset() called
    verify();

    freeLargeBlockBins.reset();
    freeSlabAlignedBins.reset();
    advRegBins.reset();

    for (MemRegion *curr = regionList.head; curr; curr = curr->next) {
        FreeBlock *fBlock = findBlockInRegion(curr, curr->blockSz);
        MALLOC_ASSERT(fBlock, "A memory region unexpectedly got smaller");
        startUseBlock(curr, fBlock, /*addToBin=*/true);
    }
}

// Frees every region in regionList. Returns false if any freeRawMem() call
// reported failure.
bool Backend::destroy()
{
    bool noError = true;
    // no active threads are allowed in backend while destroy() called
    verify();
    if (!inUserPool()) {
        freeLargeBlockBins.reset();
        freeSlabAlignedBins.reset();
    }
    while (regionList.head) {
        // read the successor before the region's memory is freed
        MemRegion *helper = regionList.head->next;
        noError &= freeRawMem(regionList.head, regionList.head->allocSz);
        regionList.head = helper;
    }
    return noError;
}

// Tries to release regions registered as "advance" regions.
// Returns true if any tryReleaseRegions() call reported success.
bool Backend::clean()
{
    scanCoalescQ(/*forceCoalescQDrop=*/false);
    // Backend::clean is always called under synchronization so only one thread can
    // enter to this method at once.
    // backendCleanCnt%2== 1 means that clean operation is in progress
    backendCleanCnt.fetch_add(1, std::memory_order_acq_rel);
    bool res = false;
    // We can have several blocks occupying a whole region,
    // because such regions are added in advance (see askMemFromOS() and reset()),
    // and never used. Release them all.
    for (int i = advRegBins.getMinUsedBin(0); i != -1; i = advRegBins.getMinUsedBin(i+1)) {
        if (i == freeSlabAlignedBins.getMinNonemptyBin(i))
            res |= freeSlabAlignedBins.tryReleaseRegions(i, this);
        if (i == freeLargeBlockBins.getMinNonemptyBin(i))
            res |= freeLargeBlockBins.tryReleaseRegions(i, this);
    }
    // second increment makes the counter even again: clean is finished
    backendCleanCnt.fetch_add(1, std::memory_order_acq_rel);
    return res;
}

// Debug-only consistency check of the size guards of every block in every
// bin; compiles to an empty function unless MALLOC_DEBUG is set.
void Backend::IndexedBins::verify()
{
#if MALLOC_DEBUG
    for (int i=0; i<(int)freeBinsNum; i++) {
        for (FreeBlock *fb = freeBins[i].head.load(std::memory_order_relaxed); fb; fb=fb->next) {
            uintptr_t mySz = fb->myL.value;
            MALLOC_ASSERT(mySz>GuardedSize::MAX_SPEC_VAL, ASSERT_TEXT);
            FreeBlock *right = (FreeBlock*)((uintptr_t)fb + mySz);
            suppress_unused_warning(right);
            MALLOC_ASSERT(right->myL.value<=GuardedSize::MAX_SPEC_VAL, ASSERT_TEXT);
            MALLOC_ASSERT(right->leftL.value==mySz, ASSERT_TEXT);
            MALLOC_ASSERT(fb->leftL.value<=GuardedSize::MAX_SPEC_VAL, ASSERT_TEXT);
        }
    }
#endif
}

// For correct operation, it must be called when no other thread
// is changing the backend.
void Backend::verify() { #if MALLOC_DEBUG scanCoalescQ(/*forceCoalescQDrop=*/false); #endif // MALLOC_DEBUG freeLargeBlockBins.verify(); freeSlabAlignedBins.verify(); } #if __TBB_MALLOC_BACKEND_STAT size_t Backend::Bin::countFreeBlocks() { size_t cnt = 0; { MallocMutex::scoped_lock lock(tLock); for (FreeBlock *fb = head; fb; fb = fb->next) cnt++; } return cnt; } size_t Backend::Bin::reportFreeBlocks(FILE *f) { size_t totalSz = 0; MallocMutex::scoped_lock lock(tLock); for (FreeBlock *fb = head; fb; fb = fb->next) { size_t sz = fb->tryLockBlock(); fb->setMeFree(sz); fb->rightNeig(sz)->setLeftFree(sz); fprintf(f, " [%p;%p]", fb, (void*)((uintptr_t)fb+sz)); totalSz += sz; } return totalSz; } void Backend::IndexedBins::reportStat(FILE *f) { size_t totalSize = 0; for (int i=0; i inFlyBlocks; // to another std::atomic binsModifications; // incremented on every bin modification Backend *backend; public: void init(Backend *b) { backend = b; } void blockConsumed() { inFlyBlocks++; } void binsModified() { binsModifications++; } void blockReleased() { #if __TBB_MALLOC_BACKEND_STAT MALLOC_ITT_SYNC_RELEASING(&inFlyBlocks); #endif binsModifications++; intptr_t prev = inFlyBlocks.fetch_sub(1); MALLOC_ASSERT(prev > 0, ASSERT_TEXT); suppress_unused_warning(prev); } intptr_t getNumOfMods() const { return binsModifications.load(std::memory_order_acquire); } // return true if need re-do the blocks search inline bool waitTillBlockReleased(intptr_t startModifiedCnt); }; class CoalRequestQ { // queue of free blocks that coalescing was delayed private: std::atomic blocksToFree; BackendSync *bkndSync; // counted blocks in blocksToFree and that are leaved blocksToFree // and still in active coalescing std::atomic inFlyBlocks; public: void init(BackendSync *bSync) { bkndSync = bSync; } FreeBlock *getAll(); // return current list of blocks and make queue empty void putBlock(FreeBlock *fBlock); inline void blockWasProcessed(); intptr_t blocksInFly() const { return 
inFlyBlocks.load(std::memory_order_acquire); } }; class MemExtendingSema { std::atomic active; public: bool wait() { bool rescanBins = false; // up to 3 threads can add more memory from OS simultaneously, // rest of threads have to wait intptr_t prevCnt = active.load(std::memory_order_acquire); for (;;) { if (prevCnt < 3) { if (active.compare_exchange_strong(prevCnt, prevCnt + 1)) { break; } } else { SpinWaitWhileEq(active, prevCnt); rescanBins = true; break; } } return rescanBins; } void signal() { active.fetch_sub(1); } }; enum MemRegionType { // The region holds only slabs MEMREG_SLAB_BLOCKS = 0, // The region can hold several large object blocks MEMREG_LARGE_BLOCKS, // The region holds only one block with a requested size MEMREG_ONE_BLOCK }; class MemRegionList { MallocMutex regionListLock; public: MemRegion *head; void add(MemRegion *r); void remove(MemRegion *r); int reportStat(FILE *f); }; class Backend { private: /* Blocks in range [minBinnedSize; getMaxBinnedSize()] are kept in bins, one region can contains several blocks. Larger blocks are allocated directly and one region always contains one block. */ enum { minBinnedSize = 8*1024UL, /* If huge pages are available, maxBinned_HugePage used. If not, maxBinned_SmallPage is the threshold. 
TODO: use pool's granularity for upper bound setting.*/ maxBinned_SmallPage = 1024*1024UL, // TODO: support other page sizes maxBinned_HugePage = 4*1024*1024UL }; enum { VALID_BLOCK_IN_BIN = 1 // valid block added to bin, not returned as result }; public: // Backend bins step is the same as CacheStep for large object cache static const size_t freeBinsStep = LargeObjectCache::LargeBSProps::CacheStep; static const unsigned freeBinsNum = (maxBinned_HugePage-minBinnedSize)/freeBinsStep + 1; // if previous access missed per-thread slabs pool, // allocate numOfSlabAllocOnMiss blocks in advance static const int numOfSlabAllocOnMiss = 2; enum { NO_BIN = -1, // special bin for blocks >= maxBinned_HugePage, blocks go to this bin // when pool is created with keepAllMemory policy // TODO: currently this bin is scanned using "1st fit", as it accumulates // blocks of different sizes, "best fit" is preferred in terms of fragmentation HUGE_BIN = freeBinsNum-1 }; // Bin keeps 2-linked list of free blocks. It must be 2-linked // because during coalescing a block it's removed from a middle of the list. 
struct Bin { std::atomic head; FreeBlock* tail; MallocMutex tLock; void removeBlock(FreeBlock *fBlock); void reset() { head.store(nullptr, std::memory_order_relaxed); tail = nullptr; } bool empty() const { return !head.load(std::memory_order_relaxed); } size_t countFreeBlocks(); size_t reportFreeBlocks(FILE *f); void reportStat(FILE *f); }; typedef BitMaskMin BitMaskBins; // array of bins supplemented with bitmask for fast finding of non-empty bins class IndexedBins { BitMaskBins bitMask; Bin freeBins[Backend::freeBinsNum]; FreeBlock *getFromBin(int binIdx, BackendSync *sync, size_t size, bool needAlignedBlock, bool alignedBin, bool wait, int *resLocked); public: FreeBlock *findBlock(int nativeBin, BackendSync *sync, size_t size, bool needAlignedBlock, bool alignedBin,int *numOfLockedBins); bool tryReleaseRegions(int binIdx, Backend *backend); void lockRemoveBlock(int binIdx, FreeBlock *fBlock); void addBlock(int binIdx, FreeBlock *fBlock, size_t blockSz, bool addToTail); bool tryAddBlock(int binIdx, FreeBlock *fBlock, bool addToTail); int getMinNonemptyBin(unsigned startBin) const; void verify(); void reset(); void reportStat(FILE *f); }; private: class AdvRegionsBins { BitMaskBins bins; public: void registerBin(int regBin) { bins.set(regBin, 1); } int getMinUsedBin(int start) const { return bins.getMinTrue(start); } void reset() { bins.reset(); } }; // auxiliary class to atomic maximum request finding class MaxRequestComparator { const Backend *backend; public: MaxRequestComparator(const Backend *be) : backend(be) {} inline bool operator()(size_t oldMaxReq, size_t requestSize) const; }; #if CHECK_ALLOCATION_RANGE // Keep min and max of all addresses requested from OS, // use it for checking memory possibly allocated by replaced allocators // and for debugging purposes. Valid only for default memory pool. 
class UsedAddressRange { static const uintptr_t ADDRESS_UPPER_BOUND = UINTPTR_MAX; std::atomic leftBound, rightBound; MallocMutex mutex; public: // rightBound is zero-initialized void init() { leftBound.store(ADDRESS_UPPER_BOUND, std::memory_order_relaxed); } void registerAlloc(uintptr_t left, uintptr_t right); void registerFree(uintptr_t left, uintptr_t right); // as only left and right bounds are kept, we can return true // for pointer not allocated by us, if more than single region // was requested from OS bool inRange(void *ptr) const { const uintptr_t p = (uintptr_t)ptr; return leftBound.load(std::memory_order_relaxed)<=p && p<=rightBound.load(std::memory_order_relaxed); } }; #else class UsedAddressRange { public: void init() { } void registerAlloc(uintptr_t, uintptr_t) {} void registerFree(uintptr_t, uintptr_t) {} bool inRange(void *) const { return true; } }; #endif ExtMemoryPool *extMemPool; // used for release every region on pool destroying MemRegionList regionList; CoalRequestQ coalescQ; // queue of coalescing requests BackendSync bkndSync; // semaphore protecting adding more more memory from OS MemExtendingSema memExtendingSema; //size_t totalMemSize, // memSoftLimit; std::atomic totalMemSize; std::atomic memSoftLimit; UsedAddressRange usedAddrRange; // to keep 1st allocation large than requested, keep bootstrapping status enum { bootsrapMemNotDone = 0, bootsrapMemInitializing, bootsrapMemDone }; std::atomic bootsrapMemStatus; MallocMutex bootsrapMemStatusMutex; // Using of maximal observed requested size allows decrease // memory consumption for small requests and decrease fragmentation // for workloads when small and large allocation requests are mixed. 
// TODO: decrease, not only increase it
// NOTE(review): the std::atomic members below have no template argument in
// this text; presumably lost in extraction - confirm against the original.
std::atomic maxRequestedSize;

// register bins related to advance regions
AdvRegionsBins advRegBins;
// Storage for split FreeBlocks
IndexedBins freeLargeBlockBins,
            freeSlabAlignedBins;

std::atomic backendCleanCnt;
// Our friends
friend class BackendSync;

/******************************** Backend methods ******************************/

/*--------------------------- Coalescing functions ----------------------------*/
void coalescAndPut(FreeBlock *fBlock, size_t blockSz, bool slabAligned);
bool coalescAndPutList(FreeBlock *head, bool forceCoalescQDrop, bool reportBlocksProcessed);

// Main coalescing operation
FreeBlock *doCoalesc(FreeBlock *fBlock, MemRegion **memRegion);

// Queue for conflicted blocks during coalescing
bool scanCoalescQ(bool forceCoalescQDrop);
// Forwards to coalescQ.blocksInFly().
intptr_t blocksInCoalescing() const { return coalescQ.blocksInFly(); }

/*--------------------- FreeBlock backend accessors ---------------------------*/
FreeBlock *genericGetBlock(int num, size_t size, bool slabAligned);
void genericPutBlock(FreeBlock *fBlock, size_t blockSz, bool slabAligned);

// Split the block and return remaining parts to backend if possible
FreeBlock *splitBlock(FreeBlock *fBlock, int num, size_t size, bool isAligned, bool needAlignedBlock);

void removeBlockFromBin(FreeBlock *fBlock);

// TODO: combine with returnLargeObject
void putLargeBlock(LargeMemoryBlock *lmb);

/*------------------- Starting point for OS allocation ------------------------*/
void requestBootstrapMem();
FreeBlock *askMemFromOS(size_t totalReqSize, intptr_t startModifiedCnt,
                        int *lockedBinsThreshold, int numOfLockedBins,
                        bool *splittable, bool needSlabRegion);

/*---------------------- Memory regions allocation ----------------------------*/
FreeBlock *addNewRegion(size_t size, MemRegionType type, bool addToBin);
void releaseRegion(MemRegion *region);

// TODO: combine in one initMemoryRegion function
FreeBlock *findBlockInRegion(MemRegion *region, size_t exactBlockSize);
void startUseBlock(MemRegion *region, FreeBlock *fBlock, bool addToBin);

/*------------------------- Raw memory accessors ------------------------------*/
void *allocRawMem(size_t &size);
bool freeRawMem(void *object, size_t size);

/*------------------------------ Cleanup functions ----------------------------*/
// Clean all memory from all caches (extMemPool hard cleanup)
FreeBlock *releaseMemInCaches(intptr_t startModifiedCnt, int *lockedBinsThreshold, int numOfLockedBins);
// Soft heap limit (regular cleanup, then maybe hard cleanup)
void releaseCachesToLimit();

/*---------------------------------- Utility ----------------------------------*/
// TODO: move inside IndexedBins class
// Maps a block size to a bin index:
//   size >= maxBinned_HugePage -> HUGE_BIN
//   size <  minBinnedSize      -> NO_BIN
//   otherwise                  -> (size - minBinnedSize) / freeBinsStep
static int sizeToBin(size_t size) {
    if (size >= maxBinned_HugePage)
        return HUGE_BIN;
    else if (size < minBinnedSize)
        return NO_BIN;

    // The size_t quotient is narrowed to int here; the assertion below checks
    // that the result stays below HUGE_BIN.
    int bin = (size - minBinnedSize)/freeBinsStep;

    MALLOC_ASSERT(bin < HUGE_BIN, "Invalid size.");
    return bin;
}
// True when the END of the block (block + size) is slabSize-aligned and the
// block is at least slabSize bytes long.
static bool toAlignedBin(FreeBlock *block, size_t size) {
    return isAligned((char*)block + size, slabSize) && size >= slabSize;
}

public:
/*--------------------- Init, reset, destroy, verify -------------------------*/
void init(ExtMemoryPool *extMemoryPool);
bool destroy();

void verify();
void reset();
bool clean(); // clean on caches cleanup

/*------------------------- Slab block request --------------------------------*/
BlockI *getSlabBlock(int num);
void putSlabBlock(BlockI *block);

/*-------------------------- Large object request -----------------------------*/
LargeMemoryBlock *getLargeBlock(size_t size);
// TODO: make consistent with getLargeBlock
void returnLargeObject(LargeMemoryBlock *lmb);

/*-------------------------- Backreference memory request ----------------------*/
void *getBackRefSpace(size_t size, bool *rawMemUsed);
void putBackRefSpace(void *b, size_t size, bool rawMemUsed);

/*----------------------------- Remap object ----------------------------------*/
void *remap(void *ptr, size_t oldSize, size_t newSize, size_t alignment);
/*---------------------------- Validation -------------------------------------*/
bool inUserPool() const;
// Forwards to usedAddrRange.inRange(ptr); that check is always true when
// CHECK_ALLOCATION_RANGE is off (see the stub UsedAddressRange).
bool ptrCanBeValid(void *ptr) const { return usedAddrRange.inRange(ptr); }

/*-------------------------- Configuration API --------------------------------*/
// Soft heap limit
// Stores the new limit, then immediately calls releaseCachesToLimit().
void setRecommendedMaxSize(size_t softLimit) {
    memSoftLimit = softLimit;
    releaseCachesToLimit();
}

/*------------------------------- Info ----------------------------------------*/
size_t getMaxBinnedSize() const;

/*-------------------------- Testing, statistics ------------------------------*/
#if __TBB_MALLOC_WHITEBOX_TEST
size_t getTotalMemSize() const { return totalMemSize.load(std::memory_order_relaxed); }
#endif
#if __TBB_MALLOC_BACKEND_STAT
void reportStat(FILE *f);
private:
// Smallest size that falls into `bin`: bin*freeBinsStep + minBinnedSize.
static size_t binToSize(int bin) {
    MALLOC_ASSERT(bin <= HUGE_BIN, "Invalid bin.");

    return bin*freeBinsStep + minBinnedSize;
}
#endif
};

#endif // __TBB_backend_H
================================================
FILE: third-party/tbb/src/tbbmalloc/backref.cpp
================================================
/*
    Copyright (c) 2005-2023 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include "tbbmalloc_internal.h"
// NOTE(review): the header name of the next #include is missing in this text
// (presumably an angle-bracket include lost in extraction) - confirm.
#include /* for placement new */

namespace rml {
namespace internal {

/********* backreferences ***********************/
/* Each slab block and each large memory object header contains BackRefIdx
 * that points out in some BackRefBlock which points back to this block or header.
*/
// BackRefBlock is a slab-sized block (bytes == slabSize): this header is
// followed by an array of pointer-sized slots, BR_MAX_CNT of them.
// NOTE(review): throughout this file `std::atomic` and `static_cast` appear
// without template arguments; they were presumably lost in extraction -
// confirm against the original source before building.
struct BackRefBlock : public BlockI {
    BackRefBlock *nextForUse;     // the next in the chain of blocks with free items
    FreeObject *bumpPtr;          // bump pointer moves from the end to the beginning of the block
    FreeObject *freeList;
    // list of all blocks that were allocated from raw mem (i.e., not from backend)
    BackRefBlock *nextRawMemBlock;
    std::atomic allocatedCount;   // the number of objects allocated
    BackRefIdx::main_t myNum;     // the index in the main
    MallocMutex blockMutex;
    // true if this block has been added to the listForUse chain,
    // modifications protected by mainMutex
    std::atomic addedToForUse;

    // blockToUse is the address the block is being constructed at; bumpPtr
    // starts at the last pointer-sized slot of that slab.
    BackRefBlock(const BackRefBlock *blockToUse, intptr_t num) :
        nextForUse(nullptr), bumpPtr((FreeObject*)((uintptr_t)blockToUse + slabSize - sizeof(void*))),
        freeList(nullptr), nextRawMemBlock(nullptr), allocatedCount(0), myNum(num),
        addedToForUse(false) {
        memset(static_cast(&blockMutex), 0, sizeof(MallocMutex));

        MALLOC_ASSERT(!(num >> CHAR_BIT*sizeof(BackRefIdx::main_t)),
                      "index in BackRefMain must fit to BackRefIdx::main");
    }
    // clean all but header
    void zeroSet() { memset(static_cast(this+1), 0, BackRefBlock::bytes-sizeof(BackRefBlock)); }
    static const int bytes = slabSize;
};

// max number of backreference pointers in slab block
static const int BR_MAX_CNT = (BackRefBlock::bytes-sizeof(BackRefBlock))/sizeof(void*);

// Main table of back-reference blocks: a fixed-size array of BackRefBlock
// pointers (backRefBl) plus the bookkeeping needed to hand out free slots.
struct BackRefMain {
/* On 64-bit systems a slab block can hold up to ~2K back pointers to slab blocks
 * or large objects, so it can address at least 32MB. The main array of 256KB
 * holds 32K pointers to such blocks, addressing ~1 TB.
 * On 32-bit systems there is ~4K back pointers in a slab block, so ~64MB can be addressed.
 * The main array of 8KB holds 2K pointers to leaves, so ~128 GB can addressed.
 */
    static const size_t bytes = sizeof(uintptr_t)>4? 256*1024 : 8*1024;
    static const int dataSz;
/* space is reserved for main table and 4 leaves
   taking into account VirtualAlloc allocation granularity */
    static const int leaves = 4;
    static const size_t mainSize = BackRefMain::bytes+leaves*BackRefBlock::bytes;
    // The size of memory request for a few more leaf blocks;
    // selected to match VirtualAlloc granularity
    static const size_t blockSpaceSize = 64*1024;

    Backend *backend;
    std::atomic active;           // if defined, use it for allocations
    std::atomic listForUse;       // the chain of data blocks with free items
    BackRefBlock *allRawMemBlocks;
    std::atomic lastUsed;         // index of the last used block
    bool rawMemUsed;
    MallocMutex requestNewSpaceMutex;
    BackRefBlock *backRefBl[1];   // the real size of the array is dataSz

    BackRefBlock *findFreeBlock();
    void addToForUseList(BackRefBlock *bl);
    void initEmptyBackRefBlock(BackRefBlock *newBl);
    bool requestNewSpace();
};

// Number of BackRefBlock* entries that fit in BackRefMain::bytes; the "1+"
// accounts for the backRefBl[1] element already counted in sizeof(BackRefMain).
const int BackRefMain::dataSz
    = 1+(BackRefMain::bytes-sizeof(BackRefMain))/sizeof(BackRefBlock*);

static MallocMutex mainMutex;
static std::atomic backRefMain;

// Allocates the main table (mainSize bytes) through the backend, initializes
// its fields and initial leaf blocks, then publishes it in backRefMain.
// Returns false when the backend cannot provide the space.
bool initBackRefMain(Backend *backend)
{
    bool rawMemUsed;
    BackRefMain *main =
        (BackRefMain*)backend->getBackRefSpace(BackRefMain::mainSize,
                                                 &rawMemUsed);
    if (! main)
        return false;
    main->backend = backend;
    main->listForUse.store(nullptr, std::memory_order_relaxed);
    main->allRawMemBlocks = nullptr;
    main->rawMemUsed = rawMemUsed;
    main->lastUsed = -1;
    memset(static_cast(&main->requestNewSpaceMutex), 0, sizeof(MallocMutex));
    // NOTE(review): the loop header below is truncated in this text - the
    // loop condition, the opening brace and the definition of `bl` are
    // missing (presumably lost in extraction). Restore from the original
    // source; the tokens are left exactly as found.
    for (int i=0; izeroSet();
        main->initEmptyBackRefBlock(bl);
        if (i)
            main->addToForUseList(bl);
        else // active leaf is not needed in listForUse
            main->active.store(bl, std::memory_order_relaxed);
    }
    // backRefMain is read in getBackRef, so publish it in consistent state
    backRefMain.store(main, std::memory_order_release);
    return true;
}

#if __TBB_SOURCE_DIRECTLY_INCLUDED
// Returns every block on the allRawMemBlocks list, and then the main table
// itself, to the backend. Does nothing if backRefMain was never published.
void destroyBackRefMain(Backend *backend)
{
    if (backRefMain.load(std::memory_order_acquire)) { // Is initBackRefMain() called?
        for (BackRefBlock *curr = backRefMain.load(std::memory_order_relaxed)->allRawMemBlocks; curr; ) {
            // Read the link first: curr is released on the next statement.
            BackRefBlock *next = curr->nextRawMemBlock;
            // allRawMemBlocks list is only for raw mem blocks
            backend->putBackRefSpace(curr, BackRefMain::blockSpaceSize,
                                     /*rawMemUsed=*/true);
            curr = next;
        }
        backend->putBackRefSpace(backRefMain.load(std::memory_order_relaxed), BackRefMain::mainSize,
                                 backRefMain.load(std::memory_order_relaxed)->rawMemUsed);
    }
}
#endif

// Pushes bl onto the head of listForUse and sets its addedToForUse flag.
// All accesses are relaxed; the comment on addedToForUse says modifications
// are protected by mainMutex.
void BackRefMain::addToForUseList(BackRefBlock *bl)
{
    bl->nextForUse = listForUse.load(std::memory_order_relaxed);
    listForUse.store(bl, std::memory_order_relaxed);
    bl->addedToForUse.store(true, std::memory_order_relaxed);
}

// Constructs an empty BackRefBlock in place at newBl, registers it in
// backRefBl at index lastUsed+1, and publishes the new lastUsed.
void BackRefMain::initEmptyBackRefBlock(BackRefBlock *newBl)
{
    intptr_t nextLU = lastUsed+1;
    new (newBl) BackRefBlock(newBl, nextLU);
    // NOTE(review): this bounds assertion runs after the placement-new above
    // but before the array store below.
    MALLOC_ASSERT(nextLU < dataSz, nullptr);
    backRefBl[nextLU] = newBl;
    // lastUsed is read in getBackRef, and access to backRefBl[lastUsed]
    // is possible only after checking backref against current lastUsed
    lastUsed.store(nextLU, std::memory_order_release);
}

// Requests blockSpaceSize more bytes from the backend and turns them into new
// leaf blocks. Returns true when free blocks are available afterwards (also
// when another thread already added some), false when the main table has no
// free indices or the backend allocation fails.
bool BackRefMain::requestNewSpace()
{
    bool isRawMemUsed;
    static_assert(!(blockSpaceSize % BackRefBlock::bytes),
                  "Must request space for whole number of blocks.");

    if (BackRefMain::dataSz <= lastUsed + 1) // no space in main
        return false;

    // only one thread at a time may add blocks
    MallocMutex::scoped_lock newSpaceLock(requestNewSpaceMutex);

    if (listForUse.load(std::memory_order_relaxed)) // double check that only one block is available
        return true;
    BackRefBlock *newBl = (BackRefBlock*)backend->getBackRefSpace(blockSpaceSize, &isRawMemUsed);
    if (!newBl) return false;

    // touch a page for the 1st time without taking mainMutex ...
    for (BackRefBlock *bl = newBl; (uintptr_t)bl < (uintptr_t)newBl + blockSpaceSize;
         bl = (BackRefBlock*)((uintptr_t)bl + BackRefBlock::bytes)) {
        bl->zeroSet();
    }

    MallocMutex::scoped_lock lock(mainMutex); // ... and share under lock

    // NOTE(review): numOfUnusedIdxs is a size_t, so the `<= 0` test below can
    // only be true when the value is exactly 0.
    const size_t numOfUnusedIdxs = BackRefMain::dataSz - lastUsed - 1;
    if (numOfUnusedIdxs <= 0) { // no space in main under lock, roll back
        backend->putBackRefSpace(newBl, blockSpaceSize, isRawMemUsed);
        return false;
    }
    // It's possible that only part of newBl is used, due to lack of indices in main.
    // This is OK as such underutilization is possible only once for backreferences table.
    int blocksToUse = min(numOfUnusedIdxs, blockSpaceSize / BackRefBlock::bytes);

    // use the first block in the batch to maintain the list of "raw" memory
    // to be released at shutdown
    if (isRawMemUsed) {
        newBl->nextRawMemBlock = backRefMain.load(std::memory_order_relaxed)->allRawMemBlocks;
        backRefMain.load(std::memory_order_relaxed)->allRawMemBlocks = newBl;
    }
    for (BackRefBlock *bl = newBl; blocksToUse>0; bl = (BackRefBlock*)((uintptr_t)bl + BackRefBlock::bytes), blocksToUse--) {
        initEmptyBackRefBlock(bl);
        // A full active block is replaced by the new block; otherwise the new
        // block is queued on listForUse.
        if (active.load(std::memory_order_relaxed)->allocatedCount.load(std::memory_order_relaxed) == BR_MAX_CNT) {
            active.store(bl, std::memory_order_release); // active leaf is not needed in listForUse
        } else {
            addToForUseList(bl);
        }
    }
    return true;
}

// Returns a block to allocate a back reference from: the active block while
// it is not full, else the head of listForUse (promoted to active under
// mainMutex), else whatever is active after requestNewSpace(). Returns
// nullptr when requestNewSpace() fails.
BackRefBlock *BackRefMain::findFreeBlock()
{
    BackRefBlock* active_block = active.load(std::memory_order_acquire);
    MALLOC_ASSERT(active_block, ASSERT_TEXT);

    if (active_block->allocatedCount.load(std::memory_order_relaxed) < BR_MAX_CNT)
        return active_block;

    if (listForUse.load(std::memory_order_relaxed)) { // use released list
        MallocMutex::scoped_lock lock(mainMutex);

        // Re-check under the lock before replacing the active block.
        if (active_block->allocatedCount.load(std::memory_order_relaxed) == BR_MAX_CNT) {
            active_block = listForUse.load(std::memory_order_relaxed);
            if (active_block) {
                active.store(active_block, std::memory_order_release);
                listForUse.store(active_block->nextForUse, std::memory_order_relaxed);
                MALLOC_ASSERT(active_block->addedToForUse.load(std::memory_order_relaxed), ASSERT_TEXT);
                active_block->addedToForUse.store(false, std::memory_order_relaxed);
            }
        }
    } else // allocate new data node
        if (!requestNewSpace())
            return nullptr;
    return active.load(std::memory_order_acquire); // reread because of requestNewSpace
}

// Returns the pointer stored in the slot named by backRefIdx, or nullptr when
// the table is not initialized or the index is out of range.
void *getBackRef(BackRefIdx backRefIdx)
{
    // !backRefMain means no initialization done, so it can't be valid memory
    // see initEmptyBackRefBlock for fences around lastUsed
    if (!(backRefMain.load(std::memory_order_acquire))
        || backRefIdx.getMain() > (backRefMain.load(std::memory_order_relaxed)->lastUsed.load(std::memory_order_acquire))
        || backRefIdx.getOffset() >= BR_MAX_CNT)
    {
        return nullptr;
    }
    // Slot address = block base + header size + offset * slot size.
    std::atomic& backRefEntry = *(std::atomic*)(
            (uintptr_t)backRefMain.load(std::memory_order_relaxed)->backRefBl[backRefIdx.getMain()]
            + sizeof(BackRefBlock) + backRefIdx.getOffset() * sizeof(std::atomic)
        );
    return backRefEntry.load(std::memory_order_relaxed);
}

// Stores newPtr (relaxed) into the slot named by backRefIdx.
void setBackRef(BackRefIdx backRefIdx, void *newPtr)
{
    // NOTE(review): the statement below is truncated in this text - the end
    // of the assertion and the start of the store expression are missing
    // (presumably lost in extraction). Restore from the original source; the
    // tokens are left exactly as found.
    MALLOC_ASSERT(backRefIdx.getMain()<=backRefMain.load(std::memory_order_relaxed)->lastUsed.load(std::memory_order_relaxed) && backRefIdx.getOffset()*)((uintptr_t)backRefMain.load(std::memory_order_relaxed)->backRefBl[backRefIdx.getMain()]
        + sizeof(BackRefBlock) + backRefIdx.getOffset() * sizeof(void*)))->store(newPtr, std::memory_order_relaxed);
}

// Reserves one slot and returns its index. A slot is taken from the block's
// freeList if there is one, otherwise from bumpPtr. Returns a
// default-constructed BackRefIdx when no block can be found.
BackRefIdx BackRefIdx::newBackRef(bool largeObj)
{
    BackRefBlock *blockToUse;
    void **toUse;
    BackRefIdx res;
    bool lastBlockFirstUsed = false;

    // Retry until a slot is obtained: the block returned by findFreeBlock()
    // may have no free slot left by the time its mutex is taken.
    do {
        MALLOC_ASSERT(backRefMain.load(std::memory_order_relaxed), ASSERT_TEXT);
        blockToUse = backRefMain.load(std::memory_order_relaxed)->findFreeBlock();
        if (!blockToUse)
            return BackRefIdx();
        toUse = nullptr;
        { // the block is locked to find a reference
            MallocMutex::scoped_lock lock(blockToUse->blockMutex);

            if (blockToUse->freeList) {
                toUse = (void**)blockToUse->freeList;
                blockToUse->freeList = blockToUse->freeList->next;
                MALLOC_ASSERT(!blockToUse->freeList ||
                              ((uintptr_t)blockToUse->freeList>=(uintptr_t)blockToUse
                               && (uintptr_t)blockToUse->freeList <
                               (uintptr_t)blockToUse + slabSize), ASSERT_TEXT);
            } else if (blockToUse->allocatedCount.load(std::memory_order_relaxed) < BR_MAX_CNT) {
                toUse = (void**)blockToUse->bumpPtr;
                blockToUse->bumpPtr =
                    (FreeObject*)((uintptr_t)blockToUse->bumpPtr - sizeof(void*));
                // Taking the last slot: bumpPtr has moved into the header
                // area, so it is reset to nullptr.
                if (blockToUse->allocatedCount.load(std::memory_order_relaxed) == BR_MAX_CNT-1) {
                    MALLOC_ASSERT((uintptr_t)blockToUse->bumpPtr
                                  < (uintptr_t)blockToUse+sizeof(BackRefBlock),
                                  ASSERT_TEXT);
                    blockToUse->bumpPtr = nullptr;
                }
            }
            if (toUse) {
                // First allocation from this block while listForUse is empty.
                if (!blockToUse->allocatedCount.load(std::memory_order_relaxed) &&
                    !backRefMain.load(std::memory_order_relaxed)->listForUse.load(std::memory_order_relaxed)) {
                    lastBlockFirstUsed = true;
                }
                // Plain load + store rather than an atomic increment; this
                // update is made while blockMutex is held.
                blockToUse->allocatedCount.store(blockToUse->allocatedCount.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed);
            }
        } // end of lock scope
    } while (!toUse);
    // The first thread that uses the last block requests new space in advance;
    // possible failures are ignored.
    if (lastBlockFirstUsed)
        backRefMain.load(std::memory_order_relaxed)->requestNewSpace();

    res.main = blockToUse->myNum;
    uintptr_t offset =
        ((uintptr_t)toUse - ((uintptr_t)blockToUse + sizeof(BackRefBlock)))/sizeof(void*);
    // Is offset too big?
    MALLOC_ASSERT(!(offset >> 15), ASSERT_TEXT);
    res.offset = offset;
    if (largeObj) res.largeObj = largeObj;

    return res;
}

// Releases the slot named by backRefIdx: the slot is pushed onto its block's
// freeList and the block's allocatedCount is decremented. A block that is
// neither active nor already queued is then added to listForUse.
void removeBackRef(BackRefIdx backRefIdx)
{
    MALLOC_ASSERT(!backRefIdx.isInvalid(), ASSERT_TEXT);
    // NOTE(review): the statement below is truncated in this text - the end
    // of the assertion and the declaration of `currBlock` are missing
    // (presumably lost in extraction). Restore from the original source; the
    // tokens are left exactly as found.
    MALLOC_ASSERT(backRefIdx.getMain()<=backRefMain.load(std::memory_order_relaxed)->lastUsed.load(std::memory_order_relaxed)
                  && backRefIdx.getOffset()backRefBl[backRefIdx.getMain()];
    std::atomic& backRefEntry = *(std::atomic*)((uintptr_t)currBlock + sizeof(BackRefBlock)
                                        + backRefIdx.getOffset()*sizeof(std::atomic));
    MALLOC_ASSERT(((uintptr_t)&backRefEntry >(uintptr_t)currBlock &&
                   (uintptr_t)&backRefEntry <(uintptr_t)currBlock + slabSize), ASSERT_TEXT);
    {
        MallocMutex::scoped_lock lock(currBlock->blockMutex);

        // The freed slot itself stores the link to the previous free-list head.
        backRefEntry.store(currBlock->freeList, std::memory_order_relaxed);
#if MALLOC_DEBUG
        uintptr_t backRefEntryValue = (uintptr_t)backRefEntry.load(std::memory_order_relaxed);
        MALLOC_ASSERT(!backRefEntryValue ||
                      (backRefEntryValue > (uintptr_t)currBlock
                       && backRefEntryValue < (uintptr_t)currBlock + slabSize), ASSERT_TEXT);
#endif
        currBlock->freeList = (FreeObject*)&backRefEntry;
        currBlock->allocatedCount.store(currBlock->allocatedCount.load(std::memory_order_relaxed)-1, std::memory_order_relaxed);
    }
    // TODO: do we need double-check here?
    // The condition is tested once without mainMutex and again with it held.
    if (!currBlock->addedToForUse.load(std::memory_order_relaxed) &&
        currBlock!=backRefMain.load(std::memory_order_relaxed)->active.load(std::memory_order_relaxed)) {
        MallocMutex::scoped_lock lock(mainMutex);

        if (!currBlock->addedToForUse.load(std::memory_order_relaxed) &&
            currBlock!=backRefMain.load(std::memory_order_relaxed)->active.load(std::memory_order_relaxed))
            backRefMain.load(std::memory_order_relaxed)->addToForUseList(currBlock);
    }
}

/********* End of backreferences ***********************/

} // namespace internal
} // namespace rml

================================================
FILE: third-party/tbb/src/tbbmalloc/def/lin32-tbbmalloc.def
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/ { global: scalable_calloc; scalable_free; scalable_malloc; scalable_realloc; scalable_posix_memalign; scalable_aligned_malloc; scalable_aligned_realloc; scalable_aligned_free; scalable_msize; scalable_allocation_mode; scalable_allocation_command; __TBB_malloc_safer_aligned_msize; __TBB_malloc_safer_aligned_realloc; __TBB_malloc_safer_free; __TBB_malloc_safer_msize; __TBB_malloc_safer_realloc; /* memory pool stuff */ _ZN3rml10pool_resetEPNS_10MemoryPoolE; _ZN3rml11pool_createEiPKNS_13MemPoolPolicyE; _ZN3rml14pool_create_v1EiPKNS_13MemPoolPolicyEPPNS_10MemoryPoolE; _ZN3rml11pool_mallocEPNS_10MemoryPoolEj; _ZN3rml12pool_destroyEPNS_10MemoryPoolE; _ZN3rml9pool_freeEPNS_10MemoryPoolEPv; _ZN3rml12pool_reallocEPNS_10MemoryPoolEPvj; _ZN3rml20pool_aligned_reallocEPNS_10MemoryPoolEPvjj; _ZN3rml19pool_aligned_mallocEPNS_10MemoryPoolEjj; _ZN3rml13pool_identifyEPv; _ZN3rml10pool_msizeEPNS_10MemoryPoolEPv; local: /* TBB symbols */ *3rml*; *3tbb*; *__TBB*; __itt_*; ITT_DoOneTimeInitialization; TBB_runtime_interface_version; /* Intel Compiler (libirc) symbols */ __intel_*; _intel_*; get_memcpy_largest_cachelinesize; get_memcpy_largest_cache_size; get_mem_ops_method; init_mem_ops_method; irc__get_msg; irc__print; override_mem_ops_method; set_memcpy_largest_cachelinesize; set_memcpy_largest_cache_size; }; ================================================ FILE: third-party/tbb/src/tbbmalloc/def/lin64-tbbmalloc.def ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ { global: scalable_calloc; scalable_free; scalable_malloc; scalable_realloc; scalable_posix_memalign; scalable_aligned_malloc; scalable_aligned_realloc; scalable_aligned_free; scalable_msize; scalable_allocation_mode; scalable_allocation_command; __TBB_malloc_safer_aligned_msize; __TBB_malloc_safer_aligned_realloc; __TBB_malloc_safer_free; __TBB_malloc_safer_msize; __TBB_malloc_safer_realloc; /* memory pool stuff */ _ZN3rml11pool_createElPKNS_13MemPoolPolicyE; _ZN3rml14pool_create_v1ElPKNS_13MemPoolPolicyEPPNS_10MemoryPoolE; _ZN3rml10pool_resetEPNS_10MemoryPoolE; _ZN3rml11pool_mallocEPNS_10MemoryPoolEm; _ZN3rml12pool_destroyEPNS_10MemoryPoolE; _ZN3rml9pool_freeEPNS_10MemoryPoolEPv; _ZN3rml12pool_reallocEPNS_10MemoryPoolEPvm; _ZN3rml20pool_aligned_reallocEPNS_10MemoryPoolEPvmm; _ZN3rml19pool_aligned_mallocEPNS_10MemoryPoolEmm; _ZN3rml13pool_identifyEPv; _ZN3rml10pool_msizeEPNS_10MemoryPoolEPv; local: /* TBB symbols */ *3rml*; *3tbb*; *__TBB*; __itt_*; ITT_DoOneTimeInitialization; TBB_runtime_interface_version; /* Intel Compiler (libirc) symbols */ __intel_*; _intel_*; get_memcpy_largest_cachelinesize; get_memcpy_largest_cache_size; get_mem_ops_method; init_mem_ops_method; irc__get_msg; irc__print; override_mem_ops_method; set_memcpy_largest_cachelinesize; set_memcpy_largest_cache_size; }; ================================================ FILE: third-party/tbb/src/tbbmalloc/def/mac64-tbbmalloc.def ================================================ # Copyright (c) 2005-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. _scalable_calloc _scalable_free _scalable_malloc _scalable_realloc _scalable_posix_memalign _scalable_aligned_malloc _scalable_aligned_realloc _scalable_aligned_free _scalable_msize _scalable_allocation_mode _scalable_allocation_command ___TBB_malloc_safer_aligned_msize ___TBB_malloc_safer_aligned_realloc ___TBB_malloc_safer_free ___TBB_malloc_safer_msize ___TBB_malloc_safer_realloc ___TBB_malloc_free_definite_size /* memory pool stuff */ __ZN3rml11pool_createElPKNS_13MemPoolPolicyE __ZN3rml14pool_create_v1ElPKNS_13MemPoolPolicyEPPNS_10MemoryPoolE __ZN3rml10pool_resetEPNS_10MemoryPoolE __ZN3rml12pool_destroyEPNS_10MemoryPoolE __ZN3rml11pool_mallocEPNS_10MemoryPoolEm __ZN3rml9pool_freeEPNS_10MemoryPoolEPv __ZN3rml12pool_reallocEPNS_10MemoryPoolEPvm __ZN3rml20pool_aligned_reallocEPNS_10MemoryPoolEPvmm __ZN3rml19pool_aligned_mallocEPNS_10MemoryPoolEmm __ZN3rml13pool_identifyEPv __ZN3rml10pool_msizeEPNS_10MemoryPoolEPv ================================================ FILE: third-party/tbb/src/tbbmalloc/def/win32-tbbmalloc.def ================================================ ; Copyright (c) 2005-2021 Intel Corporation ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. ; You may obtain a copy of the License at ; ; http://www.apache.org/licenses/LICENSE-2.0 ; ; Unless required by applicable law or agreed to in writing, software ; distributed under the License is distributed on an "AS IS" BASIS, ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
; See the License for the specific language governing permissions and ; limitations under the License. EXPORTS ; frontend.cpp scalable_calloc scalable_free scalable_malloc scalable_realloc scalable_posix_memalign scalable_aligned_malloc scalable_aligned_realloc scalable_aligned_free scalable_msize scalable_allocation_mode scalable_allocation_command __TBB_malloc_safer_free __TBB_malloc_safer_realloc __TBB_malloc_safer_msize __TBB_malloc_safer_aligned_msize __TBB_malloc_safer_aligned_realloc ; memory pool stuff ?pool_create@rml@@YAPAVMemoryPool@1@HPBUMemPoolPolicy@1@@Z ?pool_create_v1@rml@@YA?AW4MemPoolError@1@HPBUMemPoolPolicy@1@PAPAVMemoryPool@1@@Z ?pool_destroy@rml@@YA_NPAVMemoryPool@1@@Z ?pool_malloc@rml@@YAPAXPAVMemoryPool@1@I@Z ?pool_free@rml@@YA_NPAVMemoryPool@1@PAX@Z ?pool_reset@rml@@YA_NPAVMemoryPool@1@@Z ?pool_realloc@rml@@YAPAXPAVMemoryPool@1@PAXI@Z ?pool_aligned_realloc@rml@@YAPAXPAVMemoryPool@1@PAXII@Z ?pool_aligned_malloc@rml@@YAPAXPAVMemoryPool@1@II@Z ?pool_identify@rml@@YAPAVMemoryPool@1@PAX@Z ?pool_msize@rml@@YAIPAVMemoryPool@1@PAX@Z ================================================ FILE: third-party/tbb/src/tbbmalloc/def/win64-tbbmalloc.def ================================================ ; Copyright (c) 2005-2021 Intel Corporation ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. ; You may obtain a copy of the License at ; ; http://www.apache.org/licenses/LICENSE-2.0 ; ; Unless required by applicable law or agreed to in writing, software ; distributed under the License is distributed on an "AS IS" BASIS, ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ; See the License for the specific language governing permissions and ; limitations under the License. 
EXPORTS ; frontend.cpp scalable_calloc scalable_free scalable_malloc scalable_realloc scalable_posix_memalign scalable_aligned_malloc scalable_aligned_realloc scalable_aligned_free scalable_msize scalable_allocation_mode scalable_allocation_command __TBB_malloc_safer_free __TBB_malloc_safer_realloc __TBB_malloc_safer_msize __TBB_malloc_safer_aligned_msize __TBB_malloc_safer_aligned_realloc ; memory pool stuff ?pool_create@rml@@YAPEAVMemoryPool@1@_JPEBUMemPoolPolicy@1@@Z ?pool_create_v1@rml@@YA?AW4MemPoolError@1@_JPEBUMemPoolPolicy@1@PEAPEAVMemoryPool@1@@Z ?pool_destroy@rml@@YA_NPEAVMemoryPool@1@@Z ?pool_malloc@rml@@YAPEAXPEAVMemoryPool@1@_K@Z ?pool_free@rml@@YA_NPEAVMemoryPool@1@PEAX@Z ?pool_reset@rml@@YA_NPEAVMemoryPool@1@@Z ?pool_realloc@rml@@YAPEAXPEAVMemoryPool@1@PEAX_K@Z ?pool_aligned_realloc@rml@@YAPEAXPEAVMemoryPool@1@PEAX_K2@Z ?pool_aligned_malloc@rml@@YAPEAXPEAVMemoryPool@1@_K1@Z ?pool_identify@rml@@YAPEAVMemoryPool@1@PEAX@Z ?pool_msize@rml@@YA_KPEAVMemoryPool@1@PEAX@Z ================================================ FILE: third-party/tbb/src/tbbmalloc/frontend.cpp ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/

// Front end of the allocator: platform TLS glue, thread identification and
// the TLSKey helpers.
// NOTE(review): several #include lines below have no header name in this
// text (presumably angle-bracket includes lost in extraction); the same
// applies to `std::atomic` and `reinterpret_cast` without template arguments.
// Confirm against the original source.
#include "tbbmalloc_internal.h"
#include
#include /* for placement new */
#include /* for memset */
#include "oneapi/tbb/version.h"
#include "../tbb/environment.h"
#include "../tbb/itt_notify.h" // for __TBB_load_ittnotify()

#if USE_PTHREAD
    #define TlsSetValue_func pthread_setspecific
    #define TlsGetValue_func pthread_getspecific
    #define GetMyTID() pthread_self()
    #include
    extern "C" { static void mallocThreadShutdownNotification(void*); }
    #if __sun || __SUNPRO_CC
    #define __asm__ asm
    #endif
    #include // sysconf(_SC_PAGESIZE)
#elif USE_WINTHREAD
    #define GetMyTID() GetCurrentThreadId()
#if __TBB_WIN8UI_SUPPORT
    #include
    #define TlsSetValue_func FlsSetValue
    #define TlsGetValue_func FlsGetValue
    #define TlsAlloc() FlsAlloc(nullptr)
    #define TLS_ALLOC_FAILURE FLS_OUT_OF_INDEXES
    #define TlsFree FlsFree
#else
    #define TlsSetValue_func TlsSetValue
    #define TlsGetValue_func TlsGetValue
    #define TLS_ALLOC_FAILURE TLS_OUT_OF_INDEXES
#endif
#else
    #error Must define USE_PTHREAD or USE_WINTHREAD
#endif

#define FREELIST_NONBLOCKING 1

namespace rml {
class MemoryPool;
namespace internal {

class Block;
class MemoryPool;

#if MALLOC_CHECK_RECURSION

inline bool isMallocInitialized();

#endif // MALLOC_CHECK_RECURSION

/** Support for handling the special UNUSABLE pointer state **/
const intptr_t UNUSABLE = 0x1;
// True when ptr is neither null nor UNUSABLE, i.e. it has a bit set other
// than bit 0.
inline bool isSolidPtr( void* ptr ) {
    return (UNUSABLE|(intptr_t)ptr)!=UNUSABLE;
}
// True when ptr is exactly the UNUSABLE marker.
inline bool isNotForUse( void* ptr ) {
    return (intptr_t)ptr==UNUSABLE;
}

/*
 * Block::objectSize value used to mark blocks allocated by startupAlloc
 */
const uint16_t startupAllocObjSizeMark = ~(uint16_t)0;

/*
 * The following constant is used to define the size of struct Block, the block header.
 * The intent is to have the size of a Block multiple of the cache line size, this allows us to
 * get good alignment at the cost of some overhead equal to the amount of padding included in the Block.
 */
const int blockHeaderAlignment = estimatedCacheLineSize;

/********* The data structures and global objects        **************/

/*
 * The malloc routines themselves need to be able to occasionally malloc some space,
 * in order to set up the structures used by the thread local structures. This
 * routine performs that functions.
 */
class BootStrapBlocks {
    MallocMutex bootStrapLock;
    Block *bootStrapBlock;
    Block *bootStrapBlockUsed;
    FreeObject *bootStrapObjectList;
public:
    void *allocate(MemoryPool *memPool, size_t size);
    void free(void* ptr);
    void reset();
};

#if USE_INTERNAL_TID
// Thread identity based on a per-thread number kept in a TLS slot.
// NOTE(review): ThreadCount is declared here without `static`, yet it is
// used from the static tlsNumber()/getMaxThreadId() and defined out of class
// as `intptr_t ThreadId::ThreadCount;` - this branch looks inconsistent;
// confirm whether it is ever compiled.
class ThreadId {
    static tls_key_t Tid_key;
    std::atomic ThreadCount;

    unsigned int id;

    // Returns the calling thread's number, assigning the next counter value
    // on the first call from that thread.
    static unsigned int tlsNumber() {
        unsigned int result = reinterpret_cast(TlsGetValue_func(Tid_key));
        if( !result ) {
            RecursiveMallocCallProtector scoped;
            // Thread-local value is zero -> first call from this thread,
            // need to initialize with next ID value (IDs start from 1)
            result = ++ThreadCount; // returned new value!
            TlsSetValue_func( Tid_key, reinterpret_cast(result) );
        }
        return result;
    }
public:
    // Creates the TLS key; returns false when the platform call fails.
    static bool init() {
#if USE_WINTHREAD
        Tid_key = TlsAlloc();
        if (Tid_key == TLS_ALLOC_FAILURE)
            return false;
#else
        int status = pthread_key_create( &Tid_key, nullptr );
        if ( status ) {
            fprintf (stderr, "The memory manager cannot create tls key during initialization\n");
            return false;
        }
#endif /* USE_WINTHREAD */
        return true;
    }
#if __TBB_SOURCE_DIRECTLY_INCLUDED
    // Deletes the TLS key if one is set; a failure is only reported on stderr.
    static void destroy() {
        if( Tid_key ) {
#if USE_WINTHREAD
            BOOL status = !(TlsFree( Tid_key )); // fail is zero
#else
            int status = pthread_key_delete( Tid_key );
#endif /* USE_WINTHREAD */
            if ( status )
                fprintf (stderr, "The memory manager cannot delete tls key\n");
            Tid_key = 0;
        }
    }
#endif

    ThreadId() : id(ThreadId::tlsNumber()) {}
    bool isCurrentThreadId() const { return id == ThreadId::tlsNumber(); }

#if COLLECT_STATISTICS || MALLOC_TRACE
    friend unsigned int getThreadId() { return ThreadId::tlsNumber(); }
#endif
#if COLLECT_STATISTICS
    static unsigned getMaxThreadId() { return ThreadCount.load(std::memory_order_relaxed); }

    friend int STAT_increment(ThreadId tid, int bin, int ctr);
#endif
};

tls_key_t ThreadId::Tid_key;
intptr_t ThreadId::ThreadCount;

#if COLLECT_STATISTICS
int STAT_increment(ThreadId tid, int bin, int ctr)
{
    return ::STAT_increment(tid.id, bin, ctr);
}
#endif

#else // USE_INTERNAL_TID

// Thread identity based on the native thread id, GetMyTID(), captured at
// construction and held in an atomic.
class ThreadId {
#if USE_PTHREAD
    std::atomic tid;
#else
    std::atomic tid;
#endif
public:
    ThreadId() : tid(GetMyTID()) {}
    ThreadId(ThreadId &other) = delete;
    ~ThreadId() = default;

#if USE_PTHREAD
    bool isCurrentThreadId() const { return pthread_equal(pthread_self(), tid.load(std::memory_order_relaxed)); }
#else
    bool isCurrentThreadId() const { return GetCurrentThreadId() == tid.load(std::memory_order_relaxed); }
#endif
    // Copies the stored id with a relaxed load and store.
    ThreadId& operator=(const ThreadId& other) {
        tid.store(other.tid.load(std::memory_order_relaxed), std::memory_order_relaxed);
        return *this;
    }
    static bool init() { return true; }
#if __TBB_SOURCE_DIRECTLY_INCLUDED
    static void destroy() {}
#endif
};

#endif // USE_INTERNAL_TID

/*********** Code to provide thread ID and a thread-local void pointer **********/

// Creates the TLS key. On the pthread branch,
// mallocThreadShutdownNotification is registered as the key's destructor.
// Returns false on failure.
bool TLSKey::init()
{
#if USE_WINTHREAD
    TLS_pointer_key = TlsAlloc();
    if (TLS_pointer_key == TLS_ALLOC_FAILURE)
        return false;
#else
    int status = pthread_key_create( &TLS_pointer_key, mallocThreadShutdownNotification );
    if ( status )
        return false;
#endif /* USE_WINTHREAD */
    return true;
}

// Deletes the TLS key; returns true on success. status1 is nonzero on failure
// on both platforms (the Windows result is negated to match).
bool TLSKey::destroy()
{
#if USE_WINTHREAD
    BOOL status1 = !(TlsFree(TLS_pointer_key)); // fail is zero
#else
    int status1 = pthread_key_delete(TLS_pointer_key);
#endif /* USE_WINTHREAD */
    MALLOC_ASSERT(!status1, "The memory manager cannot delete tls key.");
    return status1==0;
}

// Reads the calling thread's TLSData pointer from the TLS slot.
inline TLSData* TLSKey::getThreadMallocTLS() const
{
    return (TLSData *)TlsGetValue_func( TLS_pointer_key );
}

// Stores newvalue in the TLS slot; a RecursiveMallocCallProtector is held for
// the duration of the platform call.
inline void TLSKey::setThreadMallocTLS( TLSData * newvalue ) {
    RecursiveMallocCallProtector scoped;
    TlsSetValue_func( TLS_pointer_key, newvalue );
}

/* The 'next' field in the block
header has to maintain some invariants: * it needs to be on a 16K boundary and the first field in the block. * Any value stored there needs to have the lower 14 bits set to 0 * so that various assert work. This means that if you want to smash this memory * for debugging purposes you will need to obey this invariant. * The total size of the header needs to be a power of 2 to simplify * the alignment requirements. For now it is a 128 byte structure. * To avoid false sharing, the fields changed only locally are separated * from the fields changed by foreign threads. * Changing the size of the block header would require to change * some bin allocation sizes, in particular "fitting" sizes (see above). */ class Bin; class StartupBlock; class MemoryPool { // if no explicit grainsize, expect to see malloc in user's pAlloc // and set reasonable low granularity static const size_t defaultGranularity = estimatedCacheLineSize; MemoryPool() = delete; // deny public: static MallocMutex memPoolListLock; // list of all active pools is used to release // all TLS data on thread termination or library unload MemoryPool *next, *prev; ExtMemoryPool extMemPool; BootStrapBlocks bootStrapBlocks; static void initDefaultPool(); bool init(intptr_t poolId, const MemPoolPolicy* memPoolPolicy); bool reset(); bool destroy(); void onThreadShutdown(TLSData *tlsData); inline TLSData *getTLS(bool create); void clearTLS() { extMemPool.tlsPointerKey.setThreadMallocTLS(nullptr); } Block *getEmptyBlock(size_t size); void returnEmptyBlock(Block *block, bool poolTheBlock); // get/put large object to/from local large object cache void *getFromLLOCache(TLSData *tls, size_t size, size_t alignment); void putToLLOCache(TLSData *tls, void *object); }; static intptr_t defaultMemPool_space[sizeof(MemoryPool)/sizeof(intptr_t) + (sizeof(MemoryPool)%sizeof(intptr_t)? 
1 : 0)]; static MemoryPool *defaultMemPool = (MemoryPool*)defaultMemPool_space; const size_t MemoryPool::defaultGranularity; // zero-initialized MallocMutex MemoryPool::memPoolListLock; // TODO: move huge page status to default pool, because that's its states HugePagesStatus hugePages; static bool usedBySrcIncluded = false; // Padding helpers template struct PaddingImpl { size_t __padding[padd]; }; template<> struct PaddingImpl<0> {}; template struct Padding : PaddingImpl {}; // Slab block is 16KB-aligned. To prevent false sharing, separate locally-accessed // fields and fields commonly accessed by not owner threads. class GlobalBlockFields : public BlockI { protected: std::atomic publicFreeList; std::atomic nextPrivatizable; MemoryPool *poolPtr; }; class LocalBlockFields : public GlobalBlockFields, Padding { protected: Block *next; Block *previous; /* Use double linked list to speed up removal */ FreeObject *bumpPtr; /* Bump pointer moves from the end to the beginning of a block */ FreeObject *freeList; /* Pointer to local data for the owner thread. Used for fast finding tls when releasing object from a block that current thread owned. nullptr for orphaned blocks. */ std::atomic tlsPtr; ThreadId ownerTid; /* the ID of the thread that owns or last owned the block */ BackRefIdx backRefIdx; uint16_t allocatedCount; /* Number of objects allocated (obviously by the owning thread) */ uint16_t objectSize; bool isFull; friend class FreeBlockPool; friend class StartupBlock; friend class LifoList; friend void *BootStrapBlocks::allocate(MemoryPool *, size_t); friend bool OrphanedBlocks::cleanup(Backend*); friend Block *MemoryPool::getEmptyBlock(size_t); }; // Use inheritance to guarantee that a user data start on next cache line. // Can't use member for it, because when LocalBlockFields already on cache line, // we must have no additional memory consumption for all compilers. 
class Block : public LocalBlockFields, Padding<2*blockHeaderAlignment - sizeof(LocalBlockFields)> { public: bool empty() const { if (allocatedCount > 0) return false; MALLOC_ASSERT(!isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT); return true; } inline FreeObject* allocate(); inline FreeObject *allocateFromFreeList(); inline bool adjustFullness(); void adjustPositionInBin(Bin* bin = nullptr); #if MALLOC_DEBUG bool freeListNonNull() { return freeList; } #endif void freePublicObject(FreeObject *objectToFree); inline void freeOwnObject(void *object); void reset(); void privatizePublicFreeList( bool reset = true ); void restoreBumpPtr(); void privatizeOrphaned(TLSData *tls, unsigned index); bool readyToShare(); void shareOrphaned(intptr_t binTag, unsigned index); unsigned int getSize() const { MALLOC_ASSERT(isStartupAllocObject() || objectSize0, msg); #endif // must not point to slab's header MALLOC_ASSERT((uintptr_t)object - (uintptr_t)this >= sizeof(Block), msg); if (startupAllocObjSizeMark == objectSize) // startup block MALLOC_ASSERT(object<=bumpPtr, msg); else { // non-startup objects are 8 Byte aligned MALLOC_ASSERT(isAligned(object, 8), "Try to free invalid small object"); FreeObject *toFree = findObjectToFree(object); #if !__TBB_USE_THREAD_SANITIZER MALLOC_ASSERT(allocatedCount <= (slabSize-sizeof(Block))/objectSize && (!bumpPtr || object>bumpPtr), msg); // check against head of freeList, as this is mostly // expected after double free MALLOC_ASSERT(toFree != freeList, msg); #endif // check against head of publicFreeList, to detect double free // involving foreign thread MALLOC_ASSERT(toFree != publicFreeList.load(std::memory_order_relaxed), msg); } #else suppress_unused_warning(object); #endif } void initEmptyBlock(TLSData *tls, size_t size); size_t findObjectSize(void *object) const; MemoryPool *getMemPool() const { return poolPtr; } // do not use on the hot path! 
protected: void cleanBlockHeader(); private: static const float emptyEnoughRatio; /* Threshold on free space needed to "reactivate" a block */ inline FreeObject *allocateFromBumpPtr(); inline FreeObject *findAllocatedObject(const void *address) const; #if MALLOC_DEBUG inline bool isProperlyPlaced(const void *object) const; #endif inline void markOwned(TLSData *tls) { MALLOC_ASSERT(!tlsPtr.load(std::memory_order_relaxed), ASSERT_TEXT); ownerTid = ThreadId(); /* save the ID of the current thread */ tlsPtr.store(tls, std::memory_order_relaxed); } inline void markOrphaned() { MALLOC_ASSERT(tlsPtr.load(std::memory_order_relaxed), ASSERT_TEXT); tlsPtr.store(nullptr, std::memory_order_relaxed); } friend class Bin; friend class TLSData; friend bool MemoryPool::destroy(); }; const float Block::emptyEnoughRatio = 1.0 / 4.0; static_assert(sizeof(Block) <= 2*estimatedCacheLineSize, "The class Block does not fit into 2 cache lines on this platform. " "Defining USE_INTERNAL_TID may help to fix it."); class Bin { private: public: Block *activeBlk; std::atomic mailbox; MallocMutex mailLock; public: inline Block* getActiveBlock() const { return activeBlk; } void resetActiveBlock() { activeBlk = nullptr; } inline void setActiveBlock(Block *block); inline Block* setPreviousBlockActive(); Block* getPrivatizedFreeListBlock(); void moveBlockToFront(Block *block); bool cleanPublicFreeLists(); void processEmptyBlock(Block *block, bool poolTheBlock); void addPublicFreeListBlock(Block* block); void outofTLSBin(Block* block); void verifyTLSBin(size_t size) const; void pushTLSBin(Block* block); #if MALLOC_DEBUG void verifyInitState() const { MALLOC_ASSERT( !activeBlk, ASSERT_TEXT ); MALLOC_ASSERT( !mailbox.load(std::memory_order_relaxed), ASSERT_TEXT ); } #endif friend void Block::freePublicObject (FreeObject *objectToFree); }; /********* End of the data structures **************/ /* * There are bins for all 8 byte aligned objects less than this segregated size; 8 bins in total */ const 
uint32_t minSmallObjectIndex = 0; const uint32_t numSmallObjectBins = 8; const uint32_t maxSmallObjectSize = 64; /* * There are 4 bins between each couple of powers of 2 [64-128-256-...] * from maxSmallObjectSize till this size; 16 bins in total */ const uint32_t minSegregatedObjectIndex = minSmallObjectIndex+numSmallObjectBins; const uint32_t numSegregatedObjectBins = 16; const uint32_t maxSegregatedObjectSize = 1024; /* * And there are 5 bins with allocation sizes that are multiples of estimatedCacheLineSize * and selected to fit 9, 6, 4, 3, and 2 allocations in a block. */ const uint32_t minFittingIndex = minSegregatedObjectIndex+numSegregatedObjectBins; const uint32_t numFittingBins = 5; const uint32_t fittingAlignment = estimatedCacheLineSize; #define SET_FITTING_SIZE(N) ( (slabSize-sizeof(Block))/N ) & ~(fittingAlignment-1) // For blockSize=16*1024, sizeof(Block)=2*estimatedCacheLineSize and fittingAlignment=estimatedCacheLineSize, // the comments show the fitting sizes and the amounts left unused for estimatedCacheLineSize=64/128: const uint32_t fittingSize1 = SET_FITTING_SIZE(9); // 1792/1792 128/000 const uint32_t fittingSize2 = SET_FITTING_SIZE(6); // 2688/2688 128/000 const uint32_t fittingSize3 = SET_FITTING_SIZE(4); // 4032/3968 128/256 const uint32_t fittingSize4 = SET_FITTING_SIZE(3); // 5376/5376 128/000 const uint32_t fittingSize5 = SET_FITTING_SIZE(2); // 8128/8064 000/000 #undef SET_FITTING_SIZE /* * The total number of thread-specific Block-based bins */ const uint32_t numBlockBins = minFittingIndex+numFittingBins; /* * Objects of this size and larger are considered large objects. */ const uint32_t minLargeObjectSize = fittingSize5 + 1; /* * Per-thread pool of slab blocks. Idea behind it is to not share with other * threads memory that are likely in local cache(s) of our CPU. 
*/ class FreeBlockPool { private: std::atomic head; int size; Backend *backend; public: static const int POOL_HIGH_MARK = 32; static const int POOL_LOW_MARK = 8; class ResOfGet { ResOfGet() = delete; public: Block* block; bool lastAccMiss; ResOfGet(Block *b, bool lastMiss) : block(b), lastAccMiss(lastMiss) {} }; // allocated in zero-initialized memory FreeBlockPool(Backend *bknd) : backend(bknd) {} ResOfGet getBlock(); void returnBlock(Block *block); bool externalCleanup(); // can be called by another thread }; template class LocalLOCImpl { private: static const size_t MAX_TOTAL_SIZE = 4*1024*1024; // TODO: can single-linked list be faster here? LargeMemoryBlock *tail; // need it when do releasing on overflow std::atomic head; size_t totalSize; int numOfBlocks; public: bool put(LargeMemoryBlock *object, ExtMemoryPool *extMemPool); LargeMemoryBlock *get(size_t size); bool externalCleanup(ExtMemoryPool *extMemPool); #if __TBB_MALLOC_WHITEBOX_TEST LocalLOCImpl() : tail(nullptr), head(nullptr), totalSize(0), numOfBlocks(0) {} static size_t getMaxSize() { return MAX_TOTAL_SIZE; } static const int LOC_HIGH_MARK = HIGH_MARK; #else // no ctor, object must be created in zero-initialized memory #endif }; typedef LocalLOCImpl<8,32> LocalLOC; // set production code parameters class TLSData : public TLSRemote { MemoryPool *memPool; public: Bin bin[numBlockBinLimit]; FreeBlockPool freeSlabBlocks; LocalLOC lloc; unsigned currCacheIdx; private: std::atomic unused; public: TLSData(MemoryPool *mPool, Backend *bknd) : memPool(mPool), freeSlabBlocks(bknd), currCacheIdx(0) {} MemoryPool *getMemPool() const { return memPool; } Bin* getAllocationBin(size_t size); void release(); bool externalCleanup(bool cleanOnlyUnused, bool cleanBins) { if (!unused.load(std::memory_order_relaxed) && cleanOnlyUnused) return false; // Heavy operation in terms of synchronization complexity, // should be called only for the current thread bool released = cleanBins ? 
cleanupBlockBins() : false; // both cleanups to be called, and the order is not important bool lloc_cleaned = lloc.externalCleanup(&memPool->extMemPool); bool free_slab_blocks_cleaned = freeSlabBlocks.externalCleanup(); return released || lloc_cleaned || free_slab_blocks_cleaned; } bool cleanupBlockBins(); void markUsed() { unused.store(false, std::memory_order_relaxed); } // called by owner when TLS touched void markUnused() { unused.store(true, std::memory_order_relaxed); } // can be called by not owner thread }; TLSData *TLSKey::createTLS(MemoryPool *memPool, Backend *backend) { MALLOC_ASSERT( sizeof(TLSData) >= sizeof(Bin) * numBlockBins + sizeof(FreeBlockPool), ASSERT_TEXT ); TLSData* tls = (TLSData*) memPool->bootStrapBlocks.allocate(memPool, sizeof(TLSData)); if ( !tls ) return nullptr; new(tls) TLSData(memPool, backend); /* the block contains zeroes after bootStrapMalloc, so bins are initialized */ #if MALLOC_DEBUG for (uint32_t i = 0; i < numBlockBinLimit; i++) tls->bin[i].verifyInitState(); #endif setThreadMallocTLS(tls); memPool->extMemPool.allLocalCaches.registerThread(tls); return tls; } bool TLSData::cleanupBlockBins() { bool released = false; for (uint32_t i = 0; i < numBlockBinLimit; i++) { released |= bin[i].cleanPublicFreeLists(); // After cleaning public free lists, only the active block might be empty. // Do not use processEmptyBlock because it will just restore bumpPtr. 
Block *block = bin[i].getActiveBlock(); if (block && block->empty()) { bin[i].outofTLSBin(block); memPool->returnEmptyBlock(block, /*poolTheBlock=*/false); released = true; } } return released; } bool ExtMemoryPool::releaseAllLocalCaches() { // Iterate all registered TLS data and clean LLOC and Slab pools bool released = allLocalCaches.cleanup(/*cleanOnlyUnused=*/false); // Bins privatization is done only for the current thread if (TLSData *tlsData = tlsPointerKey.getThreadMallocTLS()) released |= tlsData->cleanupBlockBins(); return released; } void AllLocalCaches::registerThread(TLSRemote *tls) { tls->prev = nullptr; MallocMutex::scoped_lock lock(listLock); MALLOC_ASSERT(head!=tls, ASSERT_TEXT); tls->next = head; if (head) head->prev = tls; head = tls; MALLOC_ASSERT(head->next!=head, ASSERT_TEXT); } void AllLocalCaches::unregisterThread(TLSRemote *tls) { MallocMutex::scoped_lock lock(listLock); MALLOC_ASSERT(head, "Can't unregister thread: no threads are registered."); if (head == tls) head = tls->next; if (tls->next) tls->next->prev = tls->prev; if (tls->prev) tls->prev->next = tls->next; MALLOC_ASSERT(!tls->next || tls->next->next!=tls->next, ASSERT_TEXT); } bool AllLocalCaches::cleanup(bool cleanOnlyUnused) { bool released = false; { MallocMutex::scoped_lock lock(listLock); for (TLSRemote *curr=head; curr; curr=curr->next) released |= static_cast(curr)->externalCleanup(cleanOnlyUnused, /*cleanBins=*/false); } return released; } void AllLocalCaches::markUnused() { bool locked = false; MallocMutex::scoped_lock lock(listLock, /*block=*/false, &locked); if (!locked) // not wait for marking if someone doing something with it return; for (TLSRemote *curr=head; curr; curr=curr->next) static_cast(curr)->markUnused(); } #if MALLOC_CHECK_RECURSION MallocMutex RecursiveMallocCallProtector::rmc_mutex; std::atomic RecursiveMallocCallProtector::owner_thread; std::atomic RecursiveMallocCallProtector::autoObjPtr; bool RecursiveMallocCallProtector::mallocRecursionDetected; #if 
__FreeBSD__ bool RecursiveMallocCallProtector::canUsePthread; #endif #endif /*********** End code to provide thread ID and a TLS pointer **********/ // Parameter for isLargeObject, keeps our expectations on memory origin. // Assertions must use unknownMem to reliably report object invalidity. enum MemoryOrigin { ourMem, // allocated by TBB allocator unknownMem // can be allocated by system allocator or TBB allocator }; template #if __TBB_USE_THREAD_SANITIZER // We have a real race when accessing the large object header for // non large objects (e.g. small or foreign objects). // Therefore, we need to hide this access from the thread sanitizer __attribute__((no_sanitize("thread"))) #endif bool isLargeObject(void *object); static void *internalMalloc(size_t size); static void internalFree(void *object); static void *internalPoolMalloc(MemoryPool* mPool, size_t size); static bool internalPoolFree(MemoryPool *mPool, void *object, size_t size); #if !MALLOC_DEBUG #if __INTEL_COMPILER || _MSC_VER #define NOINLINE(decl) __declspec(noinline) decl #define ALWAYSINLINE(decl) __forceinline decl #elif __GNUC__ #define NOINLINE(decl) decl __attribute__ ((noinline)) #define ALWAYSINLINE(decl) decl __attribute__ ((always_inline)) #else #define NOINLINE(decl) decl #define ALWAYSINLINE(decl) decl #endif static NOINLINE( bool doInitialization() ); ALWAYSINLINE( bool isMallocInitialized() ); #undef ALWAYSINLINE #undef NOINLINE #endif /* !MALLOC_DEBUG */ /********* Now some rough utility code to deal with indexing the size bins. **************/ /* * Given a number return the highest non-zero bit in it. It is intended to work with 32-bit values only. * Moreover, on some platforms, for sake of simplicity and performance, it is narrowed to only serve for 64 to 1023. * This is enough for current algorithm of distribution of sizes among bins. * __TBB_Log2 is not used here to minimize dependencies on TBB specific sources. 
*/ #if _WIN64 && _MSC_VER>=1400 && !__INTEL_COMPILER extern "C" unsigned char _BitScanReverse( unsigned long* i, unsigned long w ); #pragma intrinsic(_BitScanReverse) #endif static inline unsigned int highestBitPos(unsigned int n) { MALLOC_ASSERT( n>=64 && n<1024, ASSERT_TEXT ); // only needed for bsr array lookup, but always true unsigned int pos; #if __ARCH_x86_32||__ARCH_x86_64 # if __unix__||__APPLE__||__MINGW32__ __asm__ ("bsr %1,%0" : "=r"(pos) : "r"(n)); # elif (_WIN32 && (!_WIN64 || __INTEL_COMPILER)) __asm { bsr eax, n mov pos, eax } # elif _WIN64 && _MSC_VER>=1400 _BitScanReverse((unsigned long*)&pos, (unsigned long)n); # else # error highestBitPos() not implemented for this platform # endif #elif __arm__ __asm__ __volatile__ ( "clz %0, %1\n" "rsb %0, %0, %2\n" :"=r" (pos) :"r" (n), "I" (31) ); #else static unsigned int bsr[16] = {0/*N/A*/,6,7,7,8,8,8,8,9,9,9,9,9,9,9,9}; pos = bsr[ n>>6 ]; #endif /* __ARCH_* */ return pos; } unsigned int getSmallObjectIndex(unsigned int size) { unsigned int result = (size-1)>>3; constexpr bool is_64bit = (8 == sizeof(void*)); if (is_64bit) { // For 64-bit malloc, 16 byte alignment is needed except for bin 0. if (result) result |= 1; // 0,1,3,5,7; bins 2,4,6 are not aligned to 16 bytes } return result; } /* * Depending on indexRequest, for a given size return either the index into the bin * for objects of this size, or the actual size of objects in this bin. * TODO: Change return type to unsigned short. */ template static unsigned int getIndexOrObjectSize (unsigned int size) { if (size <= maxSmallObjectSize) { // selection from 8/16/24/32/40/48/56/64 unsigned int index = getSmallObjectIndex( size ); /* Bin 0 is for 8 bytes, bin 1 is for 16, and so forth */ return indexRequest ? index : (index+1)<<3; } else if (size <= maxSegregatedObjectSize ) { // 80/96/112/128 / 160/192/224/256 / 320/384/448/512 / 640/768/896/1024 unsigned int order = highestBitPos(size-1); // which group of bin sizes? 
MALLOC_ASSERT( 6<=order && order<=9, ASSERT_TEXT ); if (indexRequest) return minSegregatedObjectIndex - (4*6) - 4 + (4*order) + ((size-1)>>(order-2)); else { unsigned int alignment = 128 >> (9-order); // alignment in the group MALLOC_ASSERT( alignment==16 || alignment==32 || alignment==64 || alignment==128, ASSERT_TEXT ); return alignUp(size,alignment); } } else { if( size <= fittingSize3 ) { if( size <= fittingSize2 ) { if( size <= fittingSize1 ) return indexRequest ? minFittingIndex : fittingSize1; else return indexRequest ? minFittingIndex+1 : fittingSize2; } else return indexRequest ? minFittingIndex+2 : fittingSize3; } else { if( size <= fittingSize5 ) { if( size <= fittingSize4 ) return indexRequest ? minFittingIndex+3 : fittingSize4; else return indexRequest ? minFittingIndex+4 : fittingSize5; } else { MALLOC_ASSERT( 0,ASSERT_TEXT ); // this should not happen return ~0U; } } } } static unsigned int getIndex (unsigned int size) { return getIndexOrObjectSize(size); } static unsigned int getObjectSize (unsigned int size) { return getIndexOrObjectSize(size); } void *BootStrapBlocks::allocate(MemoryPool *memPool, size_t size) { FreeObject *result; MALLOC_ASSERT( size == sizeof(TLSData), ASSERT_TEXT ); { // Lock with acquire MallocMutex::scoped_lock scoped_cs(bootStrapLock); if( bootStrapObjectList) { result = bootStrapObjectList; bootStrapObjectList = bootStrapObjectList->next; } else { if (!bootStrapBlock) { bootStrapBlock = memPool->getEmptyBlock(size); if (!bootStrapBlock) return nullptr; } result = bootStrapBlock->bumpPtr; bootStrapBlock->bumpPtr = (FreeObject *)((uintptr_t)bootStrapBlock->bumpPtr - bootStrapBlock->objectSize); if ((uintptr_t)bootStrapBlock->bumpPtr < (uintptr_t)bootStrapBlock+sizeof(Block)) { bootStrapBlock->bumpPtr = nullptr; bootStrapBlock->next = bootStrapBlockUsed; bootStrapBlockUsed = bootStrapBlock; bootStrapBlock = nullptr; } } } // Unlock with release memset (result, 0, size); return (void*)result; } void BootStrapBlocks::free(void* 
ptr) { MALLOC_ASSERT( ptr, ASSERT_TEXT ); { // Lock with acquire MallocMutex::scoped_lock scoped_cs(bootStrapLock); ((FreeObject*)ptr)->next = bootStrapObjectList; bootStrapObjectList = (FreeObject*)ptr; } // Unlock with release } void BootStrapBlocks::reset() { bootStrapBlock = bootStrapBlockUsed = nullptr; bootStrapObjectList = nullptr; } #if !(FREELIST_NONBLOCKING) static MallocMutex publicFreeListLock; // lock for changes of publicFreeList #endif /********* End rough utility code **************/ /* LifoList assumes zero initialization so a vector of it can be created * by just allocating some space with no call to constructor. * On Linux, it seems to be necessary to avoid linking with C++ libraries. * * By usage convention there is no race on the initialization. */ LifoList::LifoList( ) : top(nullptr) { // MallocMutex assumes zero initialization memset(static_cast(&lock), 0, sizeof(MallocMutex)); } void LifoList::push(Block *block) { MallocMutex::scoped_lock scoped_cs(lock); block->next = top.load(std::memory_order_relaxed); top.store(block, std::memory_order_relaxed); } Block *LifoList::pop() { Block* block = nullptr; if (top.load(std::memory_order_relaxed)) { MallocMutex::scoped_lock scoped_cs(lock); block = top.load(std::memory_order_relaxed); if (block) { top.store(block->next, std::memory_order_relaxed); } } return block; } Block *LifoList::grab() { Block *block = nullptr; if (top.load(std::memory_order_relaxed)) { MallocMutex::scoped_lock scoped_cs(lock); block = top.load(std::memory_order_relaxed); top.store(nullptr, std::memory_order_relaxed); } return block; } /********* Thread and block related code *************/ template void AllLargeBlocksList::releaseAll(Backend *backend) { LargeMemoryBlock *next, *lmb = loHead; loHead = nullptr; for (; lmb; lmb = next) { next = lmb->gNext; if (poolDestroy) { // as it's pool destruction, no need to return object to backend, // only remove backrefs, as they are global removeBackRef(lmb->backRefIdx); } else { // 
clean g(Next|Prev) to prevent removing lmb // from AllLargeBlocksList inside returnLargeObject lmb->gNext = lmb->gPrev = nullptr; backend->returnLargeObject(lmb); } } } TLSData* MemoryPool::getTLS(bool create) { TLSData* tls = extMemPool.tlsPointerKey.getThreadMallocTLS(); if (create && !tls) tls = extMemPool.tlsPointerKey.createTLS(this, &extMemPool.backend); return tls; } /* * Return the bin for the given size. */ inline Bin* TLSData::getAllocationBin(size_t size) { return bin + getIndex(size); } /* Return an empty uninitialized block in a non-blocking fashion. */ Block *MemoryPool::getEmptyBlock(size_t size) { TLSData* tls = getTLS(/*create=*/false); // try to use per-thread cache, if TLS available FreeBlockPool::ResOfGet resOfGet = tls? tls->freeSlabBlocks.getBlock() : FreeBlockPool::ResOfGet(nullptr, false); Block *result = resOfGet.block; if (!result) { // not found in local cache, asks backend for slabs int num = resOfGet.lastAccMiss? Backend::numOfSlabAllocOnMiss : 1; BackRefIdx backRefIdx[Backend::numOfSlabAllocOnMiss]; result = static_cast(extMemPool.backend.getSlabBlock(num)); if (!result) return nullptr; if (!extMemPool.userPool()) for (int i=0; ibackRefIdx) BackRefIdx(); } else { setBackRef(backRefIdx[i], b); b->backRefIdx = backRefIdx[i]; } b->tlsPtr.store(tls, std::memory_order_relaxed); b->poolPtr = this; // all but first one go to per-thread pool if (i > 0) { MALLOC_ASSERT(tls, ASSERT_TEXT); tls->freeSlabBlocks.returnBlock(b); } } } MALLOC_ASSERT(result, ASSERT_TEXT); result->initEmptyBlock(tls, size); STAT_increment(getThreadId(), getIndex(result->objectSize), allocBlockNew); return result; } void MemoryPool::returnEmptyBlock(Block *block, bool poolTheBlock) { block->reset(); if (poolTheBlock) { getTLS(/*create=*/false)->freeSlabBlocks.returnBlock(block); } else { // slab blocks in user's pools do not have valid backRefIdx if (!extMemPool.userPool()) removeBackRef(*(block->getBackRefIdx())); extMemPool.backend.putSlabBlock(block); } } bool 
ExtMemoryPool::init(intptr_t poolId, rawAllocType rawAlloc, rawFreeType rawFree, size_t granularity, bool keepAllMemory, bool fixedPool) { this->poolId = poolId; this->rawAlloc = rawAlloc; this->rawFree = rawFree; this->granularity = granularity; this->keepAllMemory = keepAllMemory; this->fixedPool = fixedPool; this->delayRegsReleasing = false; if (!initTLS()) return false; loc.init(this); backend.init(this); MALLOC_ASSERT(isPoolValid(), nullptr); return true; } bool ExtMemoryPool::initTLS() { return tlsPointerKey.init(); } bool MemoryPool::init(intptr_t poolId, const MemPoolPolicy *policy) { if (!extMemPool.init(poolId, policy->pAlloc, policy->pFree, policy->granularity? policy->granularity : defaultGranularity, policy->keepAllMemory, policy->fixedPool)) return false; { MallocMutex::scoped_lock lock(memPoolListLock); next = defaultMemPool->next; defaultMemPool->next = this; prev = defaultMemPool; if (next) next->prev = this; } return true; } bool MemoryPool::reset() { MALLOC_ASSERT(extMemPool.userPool(), "No reset for the system pool."); // memory is not releasing during pool reset // TODO: mark regions to release unused on next reset() extMemPool.delayRegionsReleasing(true); bootStrapBlocks.reset(); extMemPool.lmbList.releaseAll(&extMemPool.backend); if (!extMemPool.reset()) return false; if (!extMemPool.initTLS()) return false; extMemPool.delayRegionsReleasing(false); return true; } bool MemoryPool::destroy() { #if __TBB_MALLOC_LOCACHE_STAT extMemPool.loc.reportStat(stdout); #endif #if __TBB_MALLOC_BACKEND_STAT extMemPool.backend.reportStat(stdout); #endif { MallocMutex::scoped_lock lock(memPoolListLock); // remove itself from global pool list if (prev) prev->next = next; if (next) next->prev = prev; } // slab blocks in non-default pool do not have backreferences, // only large objects do if (extMemPool.userPool()) extMemPool.lmbList.releaseAll(&extMemPool.backend); else { // only one non-userPool() is supported now MALLOC_ASSERT(this==defaultMemPool, nullptr); 
// There and below in extMemPool.destroy(), do not restore initial state // for user pool, because it's just about to be released. But for system // pool restoring, we do not want to do zeroing of it on subsequent reload. bootStrapBlocks.reset(); extMemPool.orphanedBlocks.reset(); } return extMemPool.destroy(); } void MemoryPool::onThreadShutdown(TLSData *tlsData) { if (tlsData) { // might be called for "empty" TLS tlsData->release(); bootStrapBlocks.free(tlsData); clearTLS(); } } #if MALLOC_DEBUG void Bin::verifyTLSBin (size_t size) const { /* The debug version verifies the TLSBin as needed */ uint32_t objSize = getObjectSize(size); if (activeBlk) { MALLOC_ASSERT( activeBlk->isOwnedByCurrentThread(), ASSERT_TEXT ); MALLOC_ASSERT( activeBlk->objectSize == objSize, ASSERT_TEXT ); #if MALLOC_DEBUG>1 for (Block* temp = activeBlk->next; temp; temp=temp->next) { MALLOC_ASSERT( temp!=activeBlk, ASSERT_TEXT ); MALLOC_ASSERT( temp->isOwnedByCurrentThread(), ASSERT_TEXT ); MALLOC_ASSERT( temp->objectSize == objSize, ASSERT_TEXT ); MALLOC_ASSERT( temp->previous->next == temp, ASSERT_TEXT ); if (temp->next) { MALLOC_ASSERT( temp->next->previous == temp, ASSERT_TEXT ); } } for (Block* temp = activeBlk->previous; temp; temp=temp->previous) { MALLOC_ASSERT( temp!=activeBlk, ASSERT_TEXT ); MALLOC_ASSERT( temp->isOwnedByCurrentThread(), ASSERT_TEXT ); MALLOC_ASSERT( temp->objectSize == objSize, ASSERT_TEXT ); MALLOC_ASSERT( temp->next->previous == temp, ASSERT_TEXT ); if (temp->previous) { MALLOC_ASSERT( temp->previous->next == temp, ASSERT_TEXT ); } } #endif /* MALLOC_DEBUG>1 */ } } #else /* MALLOC_DEBUG */ inline void Bin::verifyTLSBin (size_t) const { } #endif /* MALLOC_DEBUG */ /* * Add a block to the start of this tls bin list. 
*/ void Bin::pushTLSBin(Block* block) { /* The objectSize should be defined and not a parameter because the function is applied to partially filled blocks as well */ unsigned int size = block->objectSize; MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT ); MALLOC_ASSERT( block->objectSize != 0, ASSERT_TEXT ); MALLOC_ASSERT( block->next == nullptr, ASSERT_TEXT ); MALLOC_ASSERT( block->previous == nullptr, ASSERT_TEXT ); MALLOC_ASSERT( this, ASSERT_TEXT ); verifyTLSBin(size); block->next = activeBlk; if( activeBlk ) { block->previous = activeBlk->previous; activeBlk->previous = block; if( block->previous ) block->previous->next = block; } else { activeBlk = block; } verifyTLSBin(size); } /* * Take a block out of its tls bin (e.g. before removal). */ void Bin::outofTLSBin(Block* block) { unsigned int size = block->objectSize; MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT ); MALLOC_ASSERT( block->objectSize != 0, ASSERT_TEXT ); MALLOC_ASSERT( this, ASSERT_TEXT ); verifyTLSBin(size); if (block == activeBlk) { activeBlk = block->previous? 
block->previous : block->next; } /* Unlink the block */ if (block->previous) { MALLOC_ASSERT( block->previous->next == block, ASSERT_TEXT ); block->previous->next = block->next; } if (block->next) { MALLOC_ASSERT( block->next->previous == block, ASSERT_TEXT ); block->next->previous = block->previous; } block->next = nullptr; block->previous = nullptr; verifyTLSBin(size); } Block* Bin::getPrivatizedFreeListBlock() { Block* block; MALLOC_ASSERT( this, ASSERT_TEXT ); // if this method is called, active block usage must be unsuccessful MALLOC_ASSERT( (!activeBlk && !mailbox.load(std::memory_order_relaxed)) || (activeBlk && activeBlk->isFull), ASSERT_TEXT ); // the counter should be changed STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList); if (!mailbox.load(std::memory_order_acquire)) // hotpath is empty mailbox return nullptr; else { // mailbox is not empty, take lock and inspect it MallocMutex::scoped_lock scoped_cs(mailLock); block = mailbox.load(std::memory_order_relaxed); if( block ) { MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT ); MALLOC_ASSERT( !isNotForUse(block->nextPrivatizable.load(std::memory_order_relaxed)), ASSERT_TEXT ); mailbox.store(block->nextPrivatizable.load(std::memory_order_relaxed), std::memory_order_relaxed); block->nextPrivatizable.store((Block*)this, std::memory_order_relaxed); } } if( block ) { MALLOC_ASSERT( isSolidPtr(block->publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT ); block->privatizePublicFreeList(); block->adjustPositionInBin(this); } return block; } void Bin::addPublicFreeListBlock(Block* block) { MallocMutex::scoped_lock scoped_cs(mailLock); block->nextPrivatizable.store(mailbox.load(std::memory_order_relaxed), std::memory_order_relaxed); mailbox.store(block, std::memory_order_relaxed); } // Process publicly freed objects in all blocks and return empty blocks // to the backend in order to reduce overall footprint. 
bool Bin::cleanPublicFreeLists() { Block* block; if (!mailbox.load(std::memory_order_acquire)) return false; else { // Grab all the blocks in the mailbox MallocMutex::scoped_lock scoped_cs(mailLock); block = mailbox.load(std::memory_order_relaxed); mailbox.store(nullptr, std::memory_order_relaxed); } bool released = false; while (block) { MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT ); Block* tmp = block->nextPrivatizable.load(std::memory_order_relaxed); block->nextPrivatizable.store((Block*)this, std::memory_order_relaxed); block->privatizePublicFreeList(); if (block->empty()) { processEmptyBlock(block, /*poolTheBlock=*/false); released = true; } else block->adjustPositionInBin(this); block = tmp; } return released; } bool Block::adjustFullness() { if (bumpPtr) { /* If we are still using a bump ptr for this block it is empty enough to use. */ STAT_increment(getThreadId(), getIndex(objectSize), examineEmptyEnough); isFull = false; } else { const float threshold = (slabSize - sizeof(Block)) * (1 - emptyEnoughRatio); /* allocatedCount shows how many objects in the block are in use; however it still counts * blocks freed by other threads; so prior call to privatizePublicFreeList() is recommended */ isFull = (allocatedCount*objectSize > threshold) ? 
true : false; #if COLLECT_STATISTICS if (isFull) STAT_increment(getThreadId(), getIndex(objectSize), examineNotEmpty); else STAT_increment(getThreadId(), getIndex(objectSize), examineEmptyEnough); #endif } return isFull; } // This method resides in class Block, and not in class Bin, in order to avoid // calling getAllocationBin on a reasonably hot path in Block::freeOwnObject void Block::adjustPositionInBin(Bin* bin/*=nullptr*/) { // If the block were full, but became empty enough to use, // move it to the front of the list if (isFull && !adjustFullness()) { if (!bin) bin = tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize); bin->moveBlockToFront(this); } } /* Restore the bump pointer for an empty block that is planned to use */ void Block::restoreBumpPtr() { MALLOC_ASSERT( allocatedCount == 0, ASSERT_TEXT ); MALLOC_ASSERT( !isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT ); STAT_increment(getThreadId(), getIndex(objectSize), freeRestoreBumpPtr); bumpPtr = (FreeObject *)((uintptr_t)this + slabSize - objectSize); freeList = nullptr; isFull = false; } void Block::freeOwnObject(void *object) { tlsPtr.load(std::memory_order_relaxed)->markUsed(); allocatedCount--; MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT ); #if COLLECT_STATISTICS // Note that getAllocationBin is not called on the hottest path with statistics off. 
if (tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize)->getActiveBlock() != this) STAT_increment(getThreadId(), getIndex(objectSize), freeToInactiveBlock); else STAT_increment(getThreadId(), getIndex(objectSize), freeToActiveBlock); #endif if (empty()) { // If the last object of a slab is freed, the slab cannot be marked full MALLOC_ASSERT(!isFull, ASSERT_TEXT); tlsPtr.load(std::memory_order_relaxed)->getAllocationBin(objectSize)->processEmptyBlock(this, /*poolTheBlock=*/true); } else { // hot path FreeObject *objectToFree = findObjectToFree(object); objectToFree->next = freeList; freeList = objectToFree; adjustPositionInBin(); } } void Block::freePublicObject (FreeObject *objectToFree) { FreeObject* localPublicFreeList{}; MALLOC_ITT_SYNC_RELEASING(&publicFreeList); #if FREELIST_NONBLOCKING // TBB_REVAMP_TODO: make it non atomic in non-blocking scenario localPublicFreeList = publicFreeList.load(std::memory_order_relaxed); do { objectToFree->next = localPublicFreeList; // no backoff necessary because trying to make change, not waiting for a change } while( !publicFreeList.compare_exchange_strong(localPublicFreeList, objectToFree) ); #else STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList); { MallocMutex::scoped_lock scoped_cs(publicFreeListLock); localPublicFreeList = objectToFree->next = publicFreeList; publicFreeList = objectToFree; } #endif if( localPublicFreeList==nullptr ) { // if the block is abandoned, its nextPrivatizable pointer should be UNUSABLE // otherwise, it should point to the bin the block belongs to. 
// reading nextPrivatizable is thread-safe below, because: // 1) the executing thread atomically got publicFreeList==nullptr and changed it to non-nullptr; // 2) only owning thread can change it back to nullptr, // 3) but it can not be done until the block is put to the mailbox // So the executing thread is now the only one that can change nextPrivatizable Block* next = nextPrivatizable.load(std::memory_order_acquire); if( !isNotForUse(next) ) { MALLOC_ASSERT( next!=nullptr, ASSERT_TEXT ); Bin* theBin = (Bin*) next; #if MALLOC_DEBUG && TBB_REVAMP_TODO // FIXME: The thread that returns the block is not the block's owner. // The below assertion compares 'theBin' against the caller's local bin, thus, it always fails. // Need to find a way to get the correct remote bin for comparison. { // check that nextPrivatizable points to the bin the block belongs to uint32_t index = getIndex( objectSize ); TLSData* tls = getThreadMallocTLS(); MALLOC_ASSERT( theBin==tls->bin+index, ASSERT_TEXT ); } #endif // MALLOC_DEBUG theBin->addPublicFreeListBlock(this); } } STAT_increment(getThreadId(), ThreadCommonCounters, freeToOtherThread); STAT_increment(ownerTid.load(std::memory_order_relaxed), getIndex(objectSize), freeByOtherThread); } // Make objects freed by other threads available for use again void Block::privatizePublicFreeList( bool reset ) { FreeObject *localPublicFreeList; // If reset is false, publicFreeList should not be zeroed but set to UNUSABLE // to properly synchronize with other threads freeing objects to this slab. const intptr_t endMarker = reset ? 
0 : UNUSABLE; // Only the owner thread may reset the pointer to nullptr MALLOC_ASSERT( isOwnedByCurrentThread() || !reset, ASSERT_TEXT ); #if FREELIST_NONBLOCKING localPublicFreeList = publicFreeList.exchange((FreeObject*)endMarker); #else STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList); { MallocMutex::scoped_lock scoped_cs(publicFreeListLock); localPublicFreeList = publicFreeList; publicFreeList = endMarker; } #endif MALLOC_ITT_SYNC_ACQUIRED(&publicFreeList); MALLOC_ASSERT( !(reset && isNotForUse(publicFreeList)), ASSERT_TEXT ); // publicFreeList must have been UNUSABLE or valid, but not nullptr MALLOC_ASSERT( localPublicFreeList!=nullptr, ASSERT_TEXT ); if( isSolidPtr(localPublicFreeList) ) { MALLOC_ASSERT( allocatedCount <= (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT ); /* other threads did not change the counter freeing our blocks */ allocatedCount--; FreeObject *temp = localPublicFreeList; while( isSolidPtr(temp->next) ){ // the list will end with either nullptr or UNUSABLE temp = temp->next; allocatedCount--; MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT ); } /* merge with local freeList */ temp->next = freeList; freeList = localPublicFreeList; STAT_increment(getThreadId(), getIndex(objectSize), allocPrivatized); } } void Block::privatizeOrphaned(TLSData *tls, unsigned index) { Bin* bin = tls->bin + index; STAT_increment(getThreadId(), index, allocBlockPublic); next = nullptr; previous = nullptr; MALLOC_ASSERT( publicFreeList.load(std::memory_order_relaxed) != nullptr, ASSERT_TEXT ); /* There is not a race here since no other thread owns this block */ markOwned(tls); // It is safe to change nextPrivatizable, as publicFreeList is not null MALLOC_ASSERT( isNotForUse(nextPrivatizable.load(std::memory_order_relaxed)), ASSERT_TEXT ); nextPrivatizable.store((Block*)bin, std::memory_order_relaxed); // the next call is required to change publicFreeList to 0 privatizePublicFreeList(); if( empty() ) { 
restoreBumpPtr(); } else { adjustFullness(); // check the block fullness and set isFull } MALLOC_ASSERT( !isNotForUse(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT ); } bool Block::readyToShare() { FreeObject* oldVal = nullptr; #if FREELIST_NONBLOCKING publicFreeList.compare_exchange_strong(oldVal, (FreeObject*)UNUSABLE); #else STAT_increment(getThreadId(), ThreadCommonCounters, lockPublicFreeList); { MallocMutex::scoped_lock scoped_cs(publicFreeListLock); if ( (oldVal=publicFreeList)==nullptr ) publicFreeList = reinterpret_cast(UNUSABLE); } #endif return oldVal==nullptr; } void Block::shareOrphaned(intptr_t binTag, unsigned index) { MALLOC_ASSERT( binTag, ASSERT_TEXT ); // unreferenced formal parameter warning tbb::detail::suppress_unused_warning(index); STAT_increment(getThreadId(), index, freeBlockPublic); markOrphaned(); if ((intptr_t)nextPrivatizable.load(std::memory_order_relaxed) == binTag) { // First check passed: the block is not in mailbox yet. // Need to set publicFreeList to non-zero, so other threads // will not change nextPrivatizable and it can be zeroed. if ( !readyToShare() ) { // another thread freed an object; we need to wait until it finishes. // There is no need for exponential backoff, as the wait here is not for a lock; // but need to yield, so the thread we wait has a chance to run. // TODO: add a pause to also be friendly to hyperthreads int count = 256; while ((intptr_t)nextPrivatizable.load(std::memory_order_relaxed) == binTag) { if (--count==0) { do_yield(); count = 256; } } } } MALLOC_ASSERT( publicFreeList.load(std::memory_order_relaxed) !=nullptr, ASSERT_TEXT ); // now it is safe to change our data previous = nullptr; // it is caller responsibility to ensure that the list of blocks // formed by nextPrivatizable pointers is kept consistent if required. // if only called from thread shutdown code, it does not matter. 
nextPrivatizable.store((Block*)UNUSABLE, std::memory_order_relaxed); } void Block::cleanBlockHeader() { next = nullptr; previous = nullptr; freeList = nullptr; allocatedCount = 0; isFull = false; tlsPtr.store(nullptr, std::memory_order_relaxed); publicFreeList.store(nullptr, std::memory_order_relaxed); } void Block::initEmptyBlock(TLSData *tls, size_t size) { // Having getIndex and getObjectSize called next to each other // allows better compiler optimization as they basically share the code. unsigned int index = getIndex(size); unsigned int objSz = getObjectSize(size); cleanBlockHeader(); MALLOC_ASSERT(objSz <= USHRT_MAX, "objSz must not be less 2^16-1"); objectSize = objSz; markOwned(tls); // bump pointer should be prepared for first allocation - thus mode it down to objectSize bumpPtr = (FreeObject *)((uintptr_t)this + slabSize - objectSize); // each block should have the address where the head of the list of "privatizable" blocks is kept // the only exception is a block for boot strap which is initialized when TLS is yet nullptr nextPrivatizable.store( tls? (Block*)(tls->bin + index) : nullptr, std::memory_order_relaxed); TRACEF(( "[ScalableMalloc trace] Empty block %p is initialized, owner is %ld, objectSize is %d, bumpPtr is %p\n", this, tlsPtr.load(std::memory_order_relaxed) ? 
getThreadId() : -1, objectSize, bumpPtr )); } Block *OrphanedBlocks::get(TLSData *tls, unsigned int size) { // TODO: try to use index from getAllocationBin unsigned int index = getIndex(size); Block *block = bins[index].pop(); if (block) { MALLOC_ITT_SYNC_ACQUIRED(bins+index); block->privatizeOrphaned(tls, index); } return block; } void OrphanedBlocks::put(intptr_t binTag, Block *block) { unsigned int index = getIndex(block->getSize()); block->shareOrphaned(binTag, index); MALLOC_ITT_SYNC_RELEASING(bins+index); bins[index].push(block); } void OrphanedBlocks::reset() { for (uint32_t i=0; inext; block->privatizePublicFreeList( /*reset=*/false ); // do not set publicFreeList to nullptr if (block->empty()) { block->reset(); // slab blocks in user's pools do not have valid backRefIdx if (!backend->inUserPool()) removeBackRef(*(block->getBackRefIdx())); backend->putSlabBlock(block); released = true; } else { MALLOC_ITT_SYNC_RELEASING(bins+i); bins[i].push(block); } block = next; } } return released; } FreeBlockPool::ResOfGet FreeBlockPool::getBlock() { Block *b = head.exchange(nullptr); bool lastAccessMiss; if (b) { size--; Block *newHead = b->next; lastAccessMiss = false; head.store(newHead, std::memory_order_release); } else { lastAccessMiss = true; } return ResOfGet(b, lastAccessMiss); } void FreeBlockPool::returnBlock(Block *block) { MALLOC_ASSERT( size <= POOL_HIGH_MARK, ASSERT_TEXT ); Block *localHead = head.exchange(nullptr); if (!localHead) { size = 0; // head was stolen by externalClean, correct size accordingly } else if (size == POOL_HIGH_MARK) { // release cold blocks and add hot one, // so keep POOL_LOW_MARK-1 blocks and add new block to head Block *headToFree = localHead, *helper; for (int i=0; inext; Block *last = headToFree; headToFree = headToFree->next; last->next = nullptr; size = POOL_LOW_MARK-1; for (Block *currBl = headToFree; currBl; currBl = helper) { helper = currBl->next; // slab blocks in user's pools do not have valid backRefIdx if 
(!backend->inUserPool()) removeBackRef(currBl->backRefIdx); backend->putSlabBlock(currBl); } } size++; block->next = localHead; head.store(block, std::memory_order_release); } bool FreeBlockPool::externalCleanup() { Block *helper; bool released = false; for (Block *currBl=head.exchange(nullptr); currBl; currBl=helper) { helper = currBl->next; // slab blocks in user's pools do not have valid backRefIdx if (!backend->inUserPool()) removeBackRef(currBl->backRefIdx); backend->putSlabBlock(currBl); released = true; } return released; } /* Prepare the block for returning to FreeBlockPool */ void Block::reset() { // it is caller's responsibility to ensure no data is lost before calling this MALLOC_ASSERT( allocatedCount==0, ASSERT_TEXT ); MALLOC_ASSERT( !isSolidPtr(publicFreeList.load(std::memory_order_relaxed)), ASSERT_TEXT ); if (!isStartupAllocObject()) STAT_increment(getThreadId(), getIndex(objectSize), freeBlockBack); cleanBlockHeader(); nextPrivatizable.store(nullptr, std::memory_order_relaxed); objectSize = 0; // for an empty block, bump pointer should point right after the end of the block bumpPtr = (FreeObject *)((uintptr_t)this + slabSize); } inline void Bin::setActiveBlock (Block *block) { // MALLOC_ASSERT( bin, ASSERT_TEXT ); MALLOC_ASSERT( block->isOwnedByCurrentThread(), ASSERT_TEXT ); // it is the caller responsibility to keep bin consistence (i.e. 
ensure this block is in the bin list) activeBlk = block; } inline Block* Bin::setPreviousBlockActive() { MALLOC_ASSERT( activeBlk, ASSERT_TEXT ); Block* temp = activeBlk->previous; if( temp ) { MALLOC_ASSERT( !(temp->isFull), ASSERT_TEXT ); activeBlk = temp; } return temp; } inline bool Block::isOwnedByCurrentThread() const { return tlsPtr.load(std::memory_order_relaxed) && ownerTid.isCurrentThreadId(); } FreeObject *Block::findObjectToFree(const void *object) const { FreeObject *objectToFree; // Due to aligned allocations, a pointer passed to scalable_free // might differ from the address of internally allocated object. // Small objects however should always be fine. if (objectSize <= maxSegregatedObjectSize) objectToFree = (FreeObject*)object; // "Fitting size" allocations are suspicious if aligned higher than naturally else { if ( ! isAligned(object,2*fittingAlignment) ) // TODO: the above check is questionable - it gives false negatives in ~50% cases, // so might even be slower in average than unconditional use of findAllocatedObject. 
// here it should be a "real" object objectToFree = (FreeObject*)object; else // here object can be an aligned address, so applying additional checks objectToFree = findAllocatedObject(object); MALLOC_ASSERT( isAligned(objectToFree,fittingAlignment), ASSERT_TEXT ); } MALLOC_ASSERT( isProperlyPlaced(objectToFree), ASSERT_TEXT ); return objectToFree; } void TLSData::release() { memPool->extMemPool.allLocalCaches.unregisterThread(this); externalCleanup(/*cleanOnlyUnused=*/false, /*cleanBins=*/false); for (unsigned index = 0; index < numBlockBins; index++) { Block *activeBlk = bin[index].getActiveBlock(); if (!activeBlk) continue; Block *threadlessBlock = activeBlk->previous; bool syncOnMailbox = false; while (threadlessBlock) { Block *threadBlock = threadlessBlock->previous; if (threadlessBlock->empty()) { /* we destroy the thread, so not use its block pool */ memPool->returnEmptyBlock(threadlessBlock, /*poolTheBlock=*/false); } else { memPool->extMemPool.orphanedBlocks.put(intptr_t(bin+index), threadlessBlock); syncOnMailbox = true; } threadlessBlock = threadBlock; } threadlessBlock = activeBlk; while (threadlessBlock) { Block *threadBlock = threadlessBlock->next; if (threadlessBlock->empty()) { /* we destroy the thread, so not use its block pool */ memPool->returnEmptyBlock(threadlessBlock, /*poolTheBlock=*/false); } else { memPool->extMemPool.orphanedBlocks.put(intptr_t(bin+index), threadlessBlock); syncOnMailbox = true; } threadlessBlock = threadBlock; } bin[index].resetActiveBlock(); if (syncOnMailbox) { // Although, we synchronized on nextPrivatizable inside a block, we still need to // synchronize on the bin lifetime because the thread releasing an object into the public // free list is touching the bin (mailbox and mailLock) MallocMutex::scoped_lock scoped_cs(bin[index].mailLock); } } } #if MALLOC_CHECK_RECURSION // TODO: Use dedicated heap for this /* * It's a special kind of allocation that can be used when malloc is * not available (either during startup or 
when malloc was already called and * we are, say, inside pthread_setspecific's call). * Block can contain objects of different sizes, * allocations are performed by moving bump pointer and increasing of object counter, * releasing is done via counter of objects allocated in the block * or moving bump pointer if releasing object is on a bound. * TODO: make bump pointer to grow to the same backward direction as all the others. */ class StartupBlock : public Block { size_t availableSize() const { return slabSize - ((uintptr_t)bumpPtr - (uintptr_t)this); } static StartupBlock *getBlock(); public: static FreeObject *allocate(size_t size); static size_t msize(void *ptr) { return *((size_t*)ptr - 1); } void free(void *ptr); }; static MallocMutex startupMallocLock; static StartupBlock *firstStartupBlock; StartupBlock *StartupBlock::getBlock() { BackRefIdx backRefIdx = BackRefIdx::newBackRef(/*largeObj=*/false); if (backRefIdx.isInvalid()) return nullptr; StartupBlock *block = static_cast( defaultMemPool->extMemPool.backend.getSlabBlock(1)); if (!block) return nullptr; block->cleanBlockHeader(); setBackRef(backRefIdx, block); block->backRefIdx = backRefIdx; // use startupAllocObjSizeMark to mark objects from startup block marker block->objectSize = startupAllocObjSizeMark; block->bumpPtr = (FreeObject *)((uintptr_t)block + sizeof(StartupBlock)); return block; } FreeObject *StartupBlock::allocate(size_t size) { FreeObject *result; StartupBlock *newBlock = nullptr; /* Objects must be aligned on their natural bounds, and objects bigger than word on word's bound. */ size = alignUp(size, sizeof(size_t)); // We need size of an object to implement msize. 
size_t reqSize = size + sizeof(size_t); { MallocMutex::scoped_lock scoped_cs(startupMallocLock); // Re-check whether we need a new block (conditions might have changed) if (!firstStartupBlock || firstStartupBlock->availableSize() < reqSize) { if (!newBlock) { newBlock = StartupBlock::getBlock(); if (!newBlock) return nullptr; } newBlock->next = (Block*)firstStartupBlock; if (firstStartupBlock) firstStartupBlock->previous = (Block*)newBlock; firstStartupBlock = newBlock; } result = firstStartupBlock->bumpPtr; firstStartupBlock->allocatedCount++; firstStartupBlock->bumpPtr = (FreeObject *)((uintptr_t)firstStartupBlock->bumpPtr + reqSize); } // keep object size at the negative offset *((size_t*)result) = size; return (FreeObject*)((size_t*)result+1); } void StartupBlock::free(void *ptr) { Block* blockToRelease = nullptr; { MallocMutex::scoped_lock scoped_cs(startupMallocLock); MALLOC_ASSERT(firstStartupBlock, ASSERT_TEXT); MALLOC_ASSERT(startupAllocObjSizeMark==objectSize && allocatedCount>0, ASSERT_TEXT); MALLOC_ASSERT((uintptr_t)ptr>=(uintptr_t)this+sizeof(StartupBlock) && (uintptr_t)ptr+StartupBlock::msize(ptr)<=(uintptr_t)this+slabSize, ASSERT_TEXT); if (0 == --allocatedCount) { if (this == firstStartupBlock) firstStartupBlock = (StartupBlock*)firstStartupBlock->next; if (previous) previous->next = next; if (next) next->previous = previous; blockToRelease = this; } else if ((uintptr_t)ptr + StartupBlock::msize(ptr) == (uintptr_t)bumpPtr) { // last object in the block released FreeObject *newBump = (FreeObject*)((size_t*)ptr - 1); MALLOC_ASSERT((uintptr_t)newBump>(uintptr_t)this+sizeof(StartupBlock), ASSERT_TEXT); bumpPtr = newBump; } } if (blockToRelease) { blockToRelease->previous = blockToRelease->next = nullptr; defaultMemPool->returnEmptyBlock(blockToRelease, /*poolTheBlock=*/false); } } #endif /* MALLOC_CHECK_RECURSION */ /********* End thread related code *************/ /********* Library initialization *************/ //! 
Value indicating the state of initialization. /* 0 = initialization not started. * 1 = initialization started but not finished. * 2 = initialization finished. * In theory, we only need values 0 and 2. But value 1 is nonetheless * useful for detecting errors in the double-check pattern. */ static std::atomic mallocInitialized{0}; // implicitly initialized to 0 static MallocMutex initMutex; /** The leading "\0" is here so that applying "strings" to the binary delivers a clean result. */ static char VersionString[] = "\0" TBBMALLOC_VERSION_STRINGS; #if USE_PTHREAD && __TBB_SOURCE_DIRECTLY_INCLUDED /* Decrease race interval between dynamic library unloading and pthread key destructor. Protect only Pthreads with supported unloading. */ class ShutdownSync { /* flag is the number of threads in pthread key dtor body (i.e., between threadDtorStart() and threadDtorDone()) or the signal to skip dtor, if flag < 0 */ std::atomic flag; static const intptr_t skipDtor = INTPTR_MIN/2; public: void init() { flag.store(0, std::memory_order_release); } /* Suppose that 2*abs(skipDtor) or more threads never call threadDtorStart() simultaneously, so flag never becomes negative because of that. */ bool threadDtorStart() { if (flag.load(std::memory_order_acquire) < 0) return false; if (++flag <= 0) { // note that new value returned flag.fetch_sub(1); // flag is spoiled by us, restore it return false; } return true; } void threadDtorDone() { flag.fetch_sub(1); } void processExit() { if (flag.fetch_add(skipDtor) != 0) { SpinWaitUntilEq(flag, skipDtor); } } }; #else class ShutdownSync { public: void init() { } bool threadDtorStart() { return true; } void threadDtorDone() { } void processExit() { } }; #endif // USE_PTHREAD && __TBB_SOURCE_DIRECTLY_INCLUDED static ShutdownSync shutdownSync; inline bool isMallocInitialized() { // Load must have acquire fence; otherwise thread taking "initialized" path // might perform textually later loads *before* mallocInitialized becomes 2. 
return 2 == mallocInitialized.load(std::memory_order_acquire); } /* Caller is responsible for ensuring this routine is called exactly once. */ extern "C" void MallocInitializeITT() { #if __TBB_USE_ITT_NOTIFY if (!usedBySrcIncluded) tbb::detail::r1::__TBB_load_ittnotify(); #endif } void MemoryPool::initDefaultPool() { hugePages.init(); } /* * Allocator initialization routine; * it is called lazily on the very first scalable_malloc call. */ static bool initMemoryManager() { TRACEF(( "[ScalableMalloc trace] sizeof(Block) is %d (expected 128); sizeof(uintptr_t) is %d\n", sizeof(Block), sizeof(uintptr_t) )); MALLOC_ASSERT( 2*blockHeaderAlignment == sizeof(Block), ASSERT_TEXT ); MALLOC_ASSERT( sizeof(FreeObject) == sizeof(void*), ASSERT_TEXT ); MALLOC_ASSERT( isAligned(defaultMemPool, sizeof(intptr_t)), "Memory pool must be void*-aligned for atomic to work over aligned arguments."); #if USE_WINTHREAD const size_t granularity = 64*1024; // granulatity of VirtualAlloc #else // POSIX.1-2001-compliant way to get page size const size_t granularity = sysconf(_SC_PAGESIZE); #endif if (!defaultMemPool) { // Do not rely on static constructors and do the assignment in case // of library static section not initialized at this call yet. defaultMemPool = (MemoryPool*)defaultMemPool_space; } bool initOk = defaultMemPool-> extMemPool.init(0, nullptr, nullptr, granularity, /*keepAllMemory=*/false, /*fixedPool=*/false); // TODO: extMemPool.init() to not allocate memory if (!initOk || !initBackRefMain(&defaultMemPool->extMemPool.backend) || !ThreadId::init()) return false; MemoryPool::initDefaultPool(); // init() is required iff initMemoryManager() is called // after mallocProcessShutdownNotification() shutdownSync.init(); #if COLLECT_STATISTICS initStatisticsCollection(); #endif return true; } static bool GetBoolEnvironmentVariable(const char* name) { return tbb::detail::r1::GetBoolEnvironmentVariable(name); } //! Ensures that initMemoryManager() is called once and only once. 
/** Does not return until initMemoryManager() has been completed by a thread. There is no need to call this routine if mallocInitialized==2 . */ static bool doInitialization() { MallocMutex::scoped_lock lock( initMutex ); if (mallocInitialized.load(std::memory_order_relaxed)!=2) { MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==0, ASSERT_TEXT ); mallocInitialized.store(1, std::memory_order_relaxed); RecursiveMallocCallProtector scoped; if (!initMemoryManager()) { mallocInitialized.store(0, std::memory_order_relaxed); // restore and out return false; } #ifdef MALLOC_EXTRA_INITIALIZATION MALLOC_EXTRA_INITIALIZATION; #endif #if MALLOC_CHECK_RECURSION RecursiveMallocCallProtector::detectNaiveOverload(); #endif MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==1, ASSERT_TEXT ); // Store must have release fence, otherwise mallocInitialized==2 // might become remotely visible before side effects of // initMemoryManager() become remotely visible. mallocInitialized.store(2, std::memory_order_release); if( GetBoolEnvironmentVariable("TBB_VERSION") ) { fputs(VersionString+1,stderr); hugePages.printStatus(); } } /* It can't be 0 or I would have initialized it */ MALLOC_ASSERT( mallocInitialized.load(std::memory_order_relaxed)==2, ASSERT_TEXT ); return true; } /********* End library initialization *************/ /********* The malloc show begins *************/ FreeObject *Block::allocateFromFreeList() { FreeObject *result; if (!freeList) return nullptr; result = freeList; MALLOC_ASSERT( result, ASSERT_TEXT ); freeList = result->next; MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT ); allocatedCount++; STAT_increment(getThreadId(), getIndex(objectSize), allocFreeListUsed); return result; } FreeObject *Block::allocateFromBumpPtr() { FreeObject *result = bumpPtr; if (result) { bumpPtr = (FreeObject *) ((uintptr_t) bumpPtr - objectSize); if ( (uintptr_t)bumpPtr < (uintptr_t)this+sizeof(Block) ) { bumpPtr = nullptr; 
} MALLOC_ASSERT( allocatedCount < (slabSize-sizeof(Block))/objectSize, ASSERT_TEXT ); allocatedCount++; STAT_increment(getThreadId(), getIndex(objectSize), allocBumpPtrUsed); } return result; } inline FreeObject* Block::allocate() { MALLOC_ASSERT( isOwnedByCurrentThread(), ASSERT_TEXT ); /* for better cache locality, first looking in the free list. */ if ( FreeObject *result = allocateFromFreeList() ) { return result; } MALLOC_ASSERT( !freeList, ASSERT_TEXT ); /* if free list is empty, try thread local bump pointer allocation. */ if ( FreeObject *result = allocateFromBumpPtr() ) { return result; } MALLOC_ASSERT( !bumpPtr, ASSERT_TEXT ); /* the block is considered full. */ isFull = true; return nullptr; } size_t Block::findObjectSize(void *object) const { size_t blSize = getSize(); #if MALLOC_CHECK_RECURSION // Currently, there is no aligned allocations from startup blocks, // so we can return just StartupBlock::msize(). // TODO: This must be extended if we add aligned allocation from startup blocks. 
if (!blSize) return StartupBlock::msize(object); #endif // object can be aligned, so real size can be less than block's size_t size = blSize - ((uintptr_t)object - (uintptr_t)findObjectToFree(object)); MALLOC_ASSERT(size>0 && sizegetMemPool()->returnEmptyBlock(block, poolTheBlock); } else { /* all objects are free - let's restore the bump pointer */ block->restoreBumpPtr(); } } template bool LocalLOCImpl::put(LargeMemoryBlock *object, ExtMemoryPool *extMemPool) { const size_t size = object->unalignedSize; // not spoil cache with too large object, that can cause its total cleanup if (size > MAX_TOTAL_SIZE) return false; LargeMemoryBlock *localHead = head.exchange(nullptr); object->prev = nullptr; object->next = localHead; if (localHead) localHead->prev = object; else { // those might not be cleaned during local cache stealing, correct them totalSize = 0; numOfBlocks = 0; tail = object; } localHead = object; totalSize += size; numOfBlocks++; // must meet both size and number of cached objects constrains if (totalSize > MAX_TOTAL_SIZE || numOfBlocks >= HIGH_MARK) { // scanning from tail until meet conditions while (totalSize > MAX_TOTAL_SIZE || numOfBlocks > LOW_MARK) { totalSize -= tail->unalignedSize; numOfBlocks--; tail = tail->prev; } LargeMemoryBlock *headToRelease = tail->next; tail->next = nullptr; extMemPool->freeLargeObjectList(headToRelease); } head.store(localHead, std::memory_order_release); return true; } template LargeMemoryBlock *LocalLOCImpl::get(size_t size) { LargeMemoryBlock *localHead, *res = nullptr; if (size > MAX_TOTAL_SIZE) return nullptr; // TBB_REVAMP_TODO: review this line if (!head.load(std::memory_order_acquire) || (localHead = head.exchange(nullptr)) == nullptr) { // do not restore totalSize, numOfBlocks and tail at this point, // as they are used only in put(), where they must be restored return nullptr; } for (LargeMemoryBlock *curr = localHead; curr; curr=curr->next) { if (curr->unalignedSize == size) { res = curr; if (curr->next) 
curr->next->prev = curr->prev; else tail = curr->prev; if (curr != localHead) curr->prev->next = curr->next; else localHead = curr->next; totalSize -= size; numOfBlocks--; break; } } head.store(localHead, std::memory_order_release); return res; } template bool LocalLOCImpl::externalCleanup(ExtMemoryPool *extMemPool) { if (LargeMemoryBlock *localHead = head.exchange(nullptr)) { extMemPool->freeLargeObjectList(localHead); return true; } return false; } void *MemoryPool::getFromLLOCache(TLSData* tls, size_t size, size_t alignment) { LargeMemoryBlock *lmb = nullptr; size_t headersSize = sizeof(LargeMemoryBlock)+sizeof(LargeObjectHdr); size_t allocationSize = LargeObjectCache::alignToBin(size+headersSize+alignment); if (allocationSize < size) // allocationSize is wrapped around after alignToBin return nullptr; MALLOC_ASSERT(allocationSize >= alignment, "Overflow must be checked before."); if (tls) { tls->markUsed(); lmb = tls->lloc.get(allocationSize); } if (!lmb) lmb = extMemPool.mallocLargeObject(this, allocationSize); if (lmb) { // doing shuffle we suppose that alignment offset guarantees // that different cache lines are in use MALLOC_ASSERT(alignment >= estimatedCacheLineSize, ASSERT_TEXT); void *alignedArea = (void*)alignUp((uintptr_t)lmb+headersSize, alignment); uintptr_t alignedRight = alignDown((uintptr_t)lmb+lmb->unalignedSize - size, alignment); // Has some room to shuffle object between cache lines? // Note that alignedRight and alignedArea are aligned at alignment. unsigned ptrDelta = alignedRight - (uintptr_t)alignedArea; if (ptrDelta && tls) { // !tls is cold path // for the hot path of alignment==estimatedCacheLineSize, // allow compilers to use shift for division // (since estimatedCacheLineSize is a power-of-2 constant) unsigned numOfPossibleOffsets = alignment == estimatedCacheLineSize? 
ptrDelta / estimatedCacheLineSize : ptrDelta / alignment; unsigned myCacheIdx = ++tls->currCacheIdx; unsigned offset = myCacheIdx % numOfPossibleOffsets; // Move object to a cache line with an offset that is different from // previous allocation. This supposedly allows us to use cache // associativity more efficiently. alignedArea = (void*)((uintptr_t)alignedArea + offset*alignment); } MALLOC_ASSERT((uintptr_t)lmb+lmb->unalignedSize >= (uintptr_t)alignedArea+size, "Object doesn't fit the block."); LargeObjectHdr *header = (LargeObjectHdr*)alignedArea-1; header->memoryBlock = lmb; header->backRefIdx = lmb->backRefIdx; setBackRef(header->backRefIdx, header); lmb->objectSize = size; MALLOC_ASSERT( isLargeObject(alignedArea), ASSERT_TEXT ); MALLOC_ASSERT( isAligned(alignedArea, alignment), ASSERT_TEXT ); return alignedArea; } return nullptr; } void MemoryPool::putToLLOCache(TLSData *tls, void *object) { LargeObjectHdr *header = (LargeObjectHdr*)object - 1; // overwrite backRefIdx to simplify double free detection header->backRefIdx = BackRefIdx(); if (tls) { tls->markUsed(); if (tls->lloc.put(header->memoryBlock, &extMemPool)) return; } extMemPool.freeLargeObject(header->memoryBlock); } /* * All aligned allocations fall into one of the following categories: * 1. if both request size and alignment are <= maxSegregatedObjectSize, * we just align the size up, and request this amount, because for every size * aligned to some power of 2, the allocated object is at least that aligned. * 2. for sizegetTLS(/*create=*/true); // take into account only alignment that are higher then natural result = memPool->getFromLLOCache(tls, size, largeObjectAlignment>alignment? 
largeObjectAlignment: alignment); } MALLOC_ASSERT( isAligned(result, alignment), ASSERT_TEXT ); return result; } static void *reallocAligned(MemoryPool *memPool, void *ptr, size_t newSize, size_t alignment = 0) { void *result; size_t copySize; if (isLargeObject(ptr)) { LargeMemoryBlock* lmb = ((LargeObjectHdr *)ptr - 1)->memoryBlock; copySize = lmb->unalignedSize-((uintptr_t)ptr-(uintptr_t)lmb); // Apply different strategies if size decreases if (newSize <= copySize && (0 == alignment || isAligned(ptr, alignment))) { // For huge objects (that do not fit in backend cache), keep the same space unless // the new size is at least twice smaller bool isMemoryBlockHuge = copySize > memPool->extMemPool.backend.getMaxBinnedSize(); size_t threshold = isMemoryBlockHuge ? copySize / 2 : 0; if (newSize > threshold) { lmb->objectSize = newSize; return ptr; } // TODO: For large objects suitable for the backend cache, // split out the excessive part and put it to the backend. } // Reallocate for real copySize = lmb->objectSize; #if BACKEND_HAS_MREMAP if (void *r = memPool->extMemPool.remap(ptr, copySize, newSize, alignment < largeObjectAlignment ? largeObjectAlignment : alignment)) return r; #endif result = alignment ? allocateAligned(memPool, newSize, alignment) : internalPoolMalloc(memPool, newSize); } else { Block* block = (Block *)alignDown(ptr, slabSize); copySize = block->findObjectSize(ptr); // TODO: Move object to another bin if size decreases and the current bin is "empty enough". // Currently, in case of size decreasing, old pointer is returned if (newSize <= copySize && (0==alignment || isAligned(ptr, alignment))) { return ptr; } else { result = alignment ? allocateAligned(memPool, newSize, alignment) : internalPoolMalloc(memPool, newSize); } } if (result) { memcpy(result, ptr, copySize < newSize ? 
copySize : newSize); internalPoolFree(memPool, ptr, 0); } return result; } #if MALLOC_DEBUG /* A predicate checks if an object is properly placed inside its block */ inline bool Block::isProperlyPlaced(const void *object) const { return 0 == ((uintptr_t)this + slabSize - (uintptr_t)object) % objectSize; } #endif /* Finds the real object inside the block */ FreeObject *Block::findAllocatedObject(const void *address) const { // calculate offset from the end of the block space uint16_t offset = (uintptr_t)this + slabSize - (uintptr_t)address; MALLOC_ASSERT( offset<=slabSize-sizeof(Block), ASSERT_TEXT ); // find offset difference from a multiple of allocation size offset %= objectSize; // and move the address down to where the real object starts. return (FreeObject*)((uintptr_t)address - (offset? objectSize-offset: 0)); } /* * Bad dereference caused by a foreign pointer is possible only here, not earlier in call chain. * Separate function isolates SEH code, as it has bad influence on compiler optimization. */ static inline BackRefIdx safer_dereference (const BackRefIdx *ptr) { BackRefIdx id; #if _MSC_VER __try { #endif id = dereference(ptr); #if _MSC_VER } __except( GetExceptionCode() == EXCEPTION_ACCESS_VIOLATION? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH ) { id = BackRefIdx(); } #endif return id; } template bool isLargeObject(void *object) { if (!isAligned(object, largeObjectAlignment)) return false; LargeObjectHdr *header = (LargeObjectHdr*)object - 1; BackRefIdx idx = (memOrigin == unknownMem) ? 
safer_dereference(&header->backRefIdx) : dereference(&header->backRefIdx); return idx.isLargeObject() // in valid LargeObjectHdr memoryBlock is not nullptr && header->memoryBlock // in valid LargeObjectHdr memoryBlock points somewhere before header // TODO: more strict check && (uintptr_t)header->memoryBlock < (uintptr_t)header && getBackRef(idx) == header; } static inline bool isSmallObject (void *ptr) { Block* expectedBlock = (Block*)alignDown(ptr, slabSize); const BackRefIdx* idx = expectedBlock->getBackRefIdx(); bool isSmall = expectedBlock == getBackRef(safer_dereference(idx)); if (isSmall) expectedBlock->checkFreePrecond(ptr); return isSmall; } /**** Check if an object was allocated by scalable_malloc ****/ static inline bool isRecognized (void* ptr) { return defaultMemPool->extMemPool.backend.ptrCanBeValid(ptr) && (isLargeObject(ptr) || isSmallObject(ptr)); } static inline void freeSmallObject(void *object) { /* mask low bits to get the block */ Block *block = (Block *)alignDown(object, slabSize); block->checkFreePrecond(object); #if MALLOC_CHECK_RECURSION if (block->isStartupAllocObject()) { ((StartupBlock *)block)->free(object); return; } #endif if (block->isOwnedByCurrentThread()) { block->freeOwnObject(object); } else { /* Slower path to add to the shared list, the allocatedCount is updated by the owner thread in malloc. 
*/ FreeObject *objectToFree = block->findObjectToFree(object); block->freePublicObject(objectToFree); } } static void *internalPoolMalloc(MemoryPool* memPool, size_t size) { Bin* bin; Block * mallocBlock; if (!memPool) return nullptr; if (!size) size = sizeof(size_t); TLSData *tls = memPool->getTLS(/*create=*/true); /* Allocate a large object */ if (size >= minLargeObjectSize) return memPool->getFromLLOCache(tls, size, largeObjectAlignment); if (!tls) return nullptr; tls->markUsed(); /* * Get an element in thread-local array corresponding to the given size; * It keeps ptr to the active block for allocations of this size */ bin = tls->getAllocationBin(size); if ( !bin ) return nullptr; /* Get a block to try to allocate in. */ for( mallocBlock = bin->getActiveBlock(); mallocBlock; mallocBlock = bin->setPreviousBlockActive() ) // the previous block should be empty enough { if( FreeObject *result = mallocBlock->allocate() ) return result; } /* * else privatize publicly freed objects in some block and allocate from it */ mallocBlock = bin->getPrivatizedFreeListBlock(); if (mallocBlock) { MALLOC_ASSERT( mallocBlock->freeListNonNull(), ASSERT_TEXT ); if ( FreeObject *result = mallocBlock->allocateFromFreeList() ) return result; /* Else something strange happened, need to retry from the beginning; */ TRACEF(( "[ScalableMalloc trace] Something is wrong: no objects in public free list; reentering.\n" )); return internalPoolMalloc(memPool, size); } /* * no suitable own blocks, try to get a partial block that some other thread has discarded. */ mallocBlock = memPool->extMemPool.orphanedBlocks.get(tls, size); while (mallocBlock) { bin->pushTLSBin(mallocBlock); bin->setActiveBlock(mallocBlock); // TODO: move under the below condition? 
if( FreeObject *result = mallocBlock->allocate() ) return result; mallocBlock = memPool->extMemPool.orphanedBlocks.get(tls, size); } /* * else try to get a new empty block */ mallocBlock = memPool->getEmptyBlock(size); if (mallocBlock) { bin->pushTLSBin(mallocBlock); bin->setActiveBlock(mallocBlock); if( FreeObject *result = mallocBlock->allocate() ) return result; /* Else something strange happened, need to retry from the beginning; */ TRACEF(( "[ScalableMalloc trace] Something is wrong: no objects in empty block; reentering.\n" )); return internalPoolMalloc(memPool, size); } /* * else nothing works so return nullptr */ TRACEF(( "[ScalableMalloc trace] No memory found, returning nullptr.\n" )); return nullptr; } // When size==0 (i.e. unknown), detect here whether the object is large. // For size is known and < minLargeObjectSize, we still need to check // if the actual object is large, because large objects might be used // for aligned small allocations. static bool internalPoolFree(MemoryPool *memPool, void *object, size_t size) { if (!memPool || !object) return false; // The library is initialized at allocation call, so releasing while // not initialized means foreign object is releasing. 
MALLOC_ASSERT(isMallocInitialized(), ASSERT_TEXT); MALLOC_ASSERT(memPool->extMemPool.userPool() || isRecognized(object), "Invalid pointer during object releasing is detected."); if (size >= minLargeObjectSize || isLargeObject(object)) memPool->putToLLOCache(memPool->getTLS(/*create=*/false), object); else freeSmallObject(object); return true; } static void *internalMalloc(size_t size) { if (!size) size = sizeof(size_t); #if MALLOC_CHECK_RECURSION if (RecursiveMallocCallProtector::sameThreadActive()) return sizegetFromLLOCache(nullptr, size, slabSize); #endif if (!isMallocInitialized()) if (!doInitialization()) return nullptr; return internalPoolMalloc(defaultMemPool, size); } static void internalFree(void *object) { internalPoolFree(defaultMemPool, object, 0); } static size_t internalMsize(void* ptr) { MALLOC_ASSERT(ptr, "Invalid pointer passed to internalMsize"); if (isLargeObject(ptr)) { // TODO: return the maximum memory size, that can be written to this object LargeMemoryBlock* lmb = ((LargeObjectHdr*)ptr - 1)->memoryBlock; return lmb->objectSize; } else { Block *block = (Block*)alignDown(ptr, slabSize); return block->findObjectSize(ptr); } } } // namespace internal using namespace rml::internal; // legacy entry point saved for compatibility with binaries complied // with pre-6003 versions of TBB TBBMALLOC_EXPORT rml::MemoryPool *pool_create(intptr_t pool_id, const MemPoolPolicy *policy) { rml::MemoryPool *pool; MemPoolPolicy pol(policy->pAlloc, policy->pFree, policy->granularity); pool_create_v1(pool_id, &pol, &pool); return pool; } rml::MemPoolError pool_create_v1(intptr_t pool_id, const MemPoolPolicy *policy, rml::MemoryPool **pool) { if ( !policy->pAlloc || policy->versionfixedPool || policy->pFree)) { *pool = nullptr; return INVALID_POLICY; } if ( policy->version>MemPoolPolicy::TBBMALLOC_POOL_VERSION // future versions are not supported // new flags can be added in place of reserved, but default // behaviour must be supported by this version || 
policy->reserved ) { *pool = nullptr; return UNSUPPORTED_POLICY; } if (!isMallocInitialized()) if (!doInitialization()) { *pool = nullptr; return NO_MEMORY; } rml::internal::MemoryPool *memPool = (rml::internal::MemoryPool*)internalMalloc((sizeof(rml::internal::MemoryPool))); if (!memPool) { *pool = nullptr; return NO_MEMORY; } memset(static_cast(memPool), 0, sizeof(rml::internal::MemoryPool)); if (!memPool->init(pool_id, policy)) { internalFree(memPool); *pool = nullptr; return NO_MEMORY; } *pool = (rml::MemoryPool*)memPool; return POOL_OK; } bool pool_destroy(rml::MemoryPool* memPool) { if (!memPool) return false; bool ret = ((rml::internal::MemoryPool*)memPool)->destroy(); internalFree(memPool); return ret; } bool pool_reset(rml::MemoryPool* memPool) { if (!memPool) return false; return ((rml::internal::MemoryPool*)memPool)->reset(); } void *pool_malloc(rml::MemoryPool* mPool, size_t size) { return internalPoolMalloc((rml::internal::MemoryPool*)mPool, size); } void *pool_realloc(rml::MemoryPool* mPool, void *object, size_t size) { if (!object) return internalPoolMalloc((rml::internal::MemoryPool*)mPool, size); if (!size) { internalPoolFree((rml::internal::MemoryPool*)mPool, object, 0); return nullptr; } return reallocAligned((rml::internal::MemoryPool*)mPool, object, size, 0); } void *pool_aligned_malloc(rml::MemoryPool* mPool, size_t size, size_t alignment) { if (!isPowerOfTwo(alignment) || 0==size) return nullptr; return allocateAligned((rml::internal::MemoryPool*)mPool, size, alignment); } void *pool_aligned_realloc(rml::MemoryPool* memPool, void *ptr, size_t size, size_t alignment) { if (!isPowerOfTwo(alignment)) return nullptr; rml::internal::MemoryPool *mPool = (rml::internal::MemoryPool*)memPool; void *tmp; if (!ptr) tmp = allocateAligned(mPool, size, alignment); else if (!size) { internalPoolFree(mPool, ptr, 0); return nullptr; } else tmp = reallocAligned(mPool, ptr, size, alignment); return tmp; } bool pool_free(rml::MemoryPool *mPool, void *object) { 
return internalPoolFree((rml::internal::MemoryPool*)mPool, object, 0); } rml::MemoryPool *pool_identify(void *object) { rml::internal::MemoryPool *pool; if (isLargeObject(object)) { LargeObjectHdr *header = (LargeObjectHdr*)object - 1; pool = header->memoryBlock->pool; } else { Block *block = (Block*)alignDown(object, slabSize); pool = block->getMemPool(); } // do not return defaultMemPool, as it can't be used in pool_free() etc __TBB_ASSERT_RELEASE(pool!=defaultMemPool, "rml::pool_identify() can't be used for scalable_malloc() etc results."); return (rml::MemoryPool*)pool; } size_t pool_msize(rml::MemoryPool *mPool, void* object) { if (object) { // No assert for object recognition, cause objects allocated from non-default // memory pool do not participate in range checking and do not have valid backreferences for // small objects. Instead, check that an object belong to the certain memory pool. MALLOC_ASSERT_EX(mPool == pool_identify(object), "Object does not belong to the specified pool"); return internalMsize(object); } errno = EINVAL; // Unlike _msize, return 0 in case of parameter error. // Returning size_t(-1) looks more like the way to troubles. return 0; } } // namespace rml using namespace rml::internal; #if MALLOC_TRACE static unsigned int threadGoingDownCount = 0; #endif /* * When a thread is shutting down this routine should be called to remove all the thread ids * from the malloc blocks and replace them with a nullptr thread id. * * For pthreads, the function is set as a callback in pthread_key_create for TLS bin. * It will be automatically called at thread exit with the key value as the argument, * unless that value is nullptr. * For Windows, it is called from DllMain( DLL_THREAD_DETACH ). * * However neither of the above is called for the main process thread, so the routine * also needs to be called during the process shutdown. * */ // TODO: Consider making this function part of class MemoryPool. 
void doThreadShutdownNotification(TLSData* tls, bool main_thread) { TRACEF(( "[ScalableMalloc trace] Thread id %d blocks return start %d\n", getThreadId(), threadGoingDownCount++ )); #if USE_PTHREAD if (tls) { if (!shutdownSync.threadDtorStart()) return; tls->getMemPool()->onThreadShutdown(tls); shutdownSync.threadDtorDone(); } else #endif { suppress_unused_warning(tls); // not used on Windows // The default pool is safe to use at this point: // on Linux, only the main thread can go here before destroying defaultMemPool; // on Windows, shutdown is synchronized via loader lock and isMallocInitialized(). // See also __TBB_mallocProcessShutdownNotification() defaultMemPool->onThreadShutdown(defaultMemPool->getTLS(/*create=*/false)); // Take lock to walk through other pools; but waiting might be dangerous at this point // (e.g. on Windows the main thread might deadlock) bool locked = false; MallocMutex::scoped_lock lock(MemoryPool::memPoolListLock, /*wait=*/!main_thread, &locked); if (locked) { // the list is safe to process for (MemoryPool *memPool = defaultMemPool->next; memPool; memPool = memPool->next) memPool->onThreadShutdown(memPool->getTLS(/*create=*/false)); } } TRACEF(( "[ScalableMalloc trace] Thread id %d blocks return end\n", getThreadId() )); } #if USE_PTHREAD void mallocThreadShutdownNotification(void* arg) { // The routine is called for each pool (as TLS dtor) on each thread, except for the main thread if (!isMallocInitialized()) return; doThreadShutdownNotification((TLSData*)arg, false); } #else extern "C" void __TBB_mallocThreadShutdownNotification() { // The routine is called once per thread on Windows if (!isMallocInitialized()) return; doThreadShutdownNotification(nullptr, false); } #endif extern "C" void __TBB_mallocProcessShutdownNotification(bool windows_process_dying) { if (!isMallocInitialized()) return; // Don't clean allocator internals if the entire process is exiting if (!windows_process_dying) { doThreadShutdownNotification(nullptr, 
/*main_thread=*/true); } #if __TBB_MALLOC_LOCACHE_STAT printf("cache hit ratio %f, size hit %f\n", 1.*cacheHits/mallocCalls, 1.*memHitKB/memAllocKB); defaultMemPool->extMemPool.loc.reportStat(stdout); #endif shutdownSync.processExit(); #if __TBB_SOURCE_DIRECTLY_INCLUDED /* Pthread keys must be deleted as soon as possible to not call key dtor on thread termination when then the tbbmalloc code can be already unloaded. */ defaultMemPool->destroy(); destroyBackRefMain(&defaultMemPool->extMemPool.backend); ThreadId::destroy(); // Delete key for thread id hugePages.reset(); // new total malloc initialization is possible after this point mallocInitialized.store(0, std::memory_order_release); #endif // __TBB_SOURCE_DIRECTLY_INCLUDED #if COLLECT_STATISTICS unsigned nThreads = ThreadId::getMaxThreadId(); for( int i=1; i<=nThreads && iextMemPool.backend.ptrCanBeValid(object)) { if (isLargeObject(object)) { // must check 1st for large object, because small object check touches 4 pages on left, // and it can be inaccessible TLSData *tls = defaultMemPool->getTLS(/*create=*/false); defaultMemPool->putToLLOCache(tls, object); return; } else if (isSmallObject(object)) { freeSmallObject(object); return; } } if (original_free) original_free(object); } /********* End the free code *************/ /********* Code for scalable_realloc ***********/ /* * From K&R * "realloc changes the size of the object pointed to by p to size. The contents will * be unchanged up to the minimum of the old and the new sizes. If the new size is larger, * the new space is uninitialized. realloc returns a pointer to the new space, or * nullptr if the request cannot be satisfied, in which case *p is unchanged." 
* */ extern "C" void* scalable_realloc(void* ptr, size_t size) { void *tmp; if (!ptr) tmp = internalMalloc(size); else if (!size) { internalFree(ptr); return nullptr; } else tmp = reallocAligned(defaultMemPool, ptr, size, 0); if (!tmp) errno = ENOMEM; return tmp; } /* * A variant that provides additional memory safety, by checking whether the given address * was obtained with this allocator, and if not redirecting to the provided alternative call. */ extern "C" TBBMALLOC_EXPORT void* __TBB_malloc_safer_realloc(void* ptr, size_t sz, void* original_realloc) { void *tmp; // TODO: fix warnings about uninitialized use of tmp if (!ptr) { tmp = internalMalloc(sz); } else if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(ptr)) { if (!sz) { internalFree(ptr); return nullptr; } else { tmp = reallocAligned(defaultMemPool, ptr, sz, 0); } } #if USE_WINTHREAD else if (original_realloc && sz) { orig_ptrs *original_ptrs = static_cast(original_realloc); if ( original_ptrs->msize ){ size_t oldSize = original_ptrs->msize(ptr); tmp = internalMalloc(sz); if (tmp) { memcpy(tmp, ptr, szfree ){ original_ptrs->free( ptr ); } } } else tmp = nullptr; } #else else if (original_realloc) { typedef void* (*realloc_ptr_t)(void*,size_t); realloc_ptr_t original_realloc_ptr; (void *&)original_realloc_ptr = original_realloc; tmp = original_realloc_ptr(ptr,sz); } #endif else tmp = nullptr; if (!tmp) errno = ENOMEM; return tmp; } /********* End code for scalable_realloc ***********/ /********* Code for scalable_calloc ***********/ /* * From K&R * calloc returns a pointer to space for an array of nobj objects, * each of size size, or nullptr if the request cannot be satisfied. * The space is initialized to zero bytes. 
* */ extern "C" void * scalable_calloc(size_t nobj, size_t size) { // it's square root of maximal size_t value const size_t mult_not_overflow = size_t(1) << (sizeof(size_t)*CHAR_BIT/2); const size_t arraySize = nobj * size; // check for overflow during multiplication: if (nobj>=mult_not_overflow || size>=mult_not_overflow) // 1) heuristic check if (nobj && arraySize / nobj != size) { // 2) exact check errno = ENOMEM; return nullptr; } void* result = internalMalloc(arraySize); if (result) memset(result, 0, arraySize); else errno = ENOMEM; return result; } /********* End code for scalable_calloc ***********/ /********* Code for aligned allocation API **********/ extern "C" int scalable_posix_memalign(void **memptr, size_t alignment, size_t size) { if ( !isPowerOfTwoAtLeast(alignment, sizeof(void*)) ) return EINVAL; void *result = allocateAligned(defaultMemPool, size, alignment); if (!result) return ENOMEM; *memptr = result; return 0; } extern "C" void * scalable_aligned_malloc(size_t size, size_t alignment) { if (!isPowerOfTwo(alignment) || 0==size) { errno = EINVAL; return nullptr; } void *tmp = allocateAligned(defaultMemPool, size, alignment); if (!tmp) errno = ENOMEM; return tmp; } extern "C" void * scalable_aligned_realloc(void *ptr, size_t size, size_t alignment) { if (!isPowerOfTwo(alignment)) { errno = EINVAL; return nullptr; } void *tmp; if (!ptr) tmp = allocateAligned(defaultMemPool, size, alignment); else if (!size) { scalable_free(ptr); return nullptr; } else tmp = reallocAligned(defaultMemPool, ptr, size, alignment); if (!tmp) errno = ENOMEM; return tmp; } extern "C" TBBMALLOC_EXPORT void * __TBB_malloc_safer_aligned_realloc(void *ptr, size_t size, size_t alignment, void* orig_function) { /* corner cases left out of reallocAligned to not deal with errno there */ if (!isPowerOfTwo(alignment)) { errno = EINVAL; return nullptr; } void *tmp = nullptr; if (!ptr) { tmp = allocateAligned(defaultMemPool, size, alignment); } else if 
(mallocInitialized.load(std::memory_order_acquire) && isRecognized(ptr)) { if (!size) { internalFree(ptr); return nullptr; } else { tmp = reallocAligned(defaultMemPool, ptr, size, alignment); } } #if USE_WINTHREAD else { orig_aligned_ptrs *original_ptrs = static_cast(orig_function); if (size) { // Without orig_msize, we can't do anything with this. // Just keeping old pointer. if ( original_ptrs->aligned_msize ){ // set alignment and offset to have possibly correct oldSize size_t oldSize = original_ptrs->aligned_msize(ptr, sizeof(void*), 0); tmp = allocateAligned(defaultMemPool, size, alignment); if (tmp) { memcpy(tmp, ptr, sizealigned_free ){ original_ptrs->aligned_free( ptr ); } } } } else { if ( original_ptrs->aligned_free ){ original_ptrs->aligned_free( ptr ); } return nullptr; } } #else // As original_realloc can't align result, and there is no way to find // size of reallocating object, we are giving up. suppress_unused_warning(orig_function); #endif if (!tmp) errno = ENOMEM; return tmp; } extern "C" void scalable_aligned_free(void *ptr) { internalFree(ptr); } /********* end code for aligned allocation API **********/ /********* Code for scalable_msize ***********/ /* * Returns the size of a memory block allocated in the heap. */ extern "C" size_t scalable_msize(void* ptr) { if (ptr) { MALLOC_ASSERT(isRecognized(ptr), "Invalid pointer in scalable_msize detected."); return internalMsize(ptr); } errno = EINVAL; // Unlike _msize, return 0 in case of parameter error. // Returning size_t(-1) looks more like the way to troubles. return 0; } /* * A variant that provides additional memory safety, by checking whether the given address * was obtained with this allocator, and if not redirecting to the provided alternative call. 
*
 * object         - block whose size is queried; may be nullptr.
 * original_msize - size query used for blocks this allocator does not recognize;
 *                  may be nullptr.
 * Returns 0 when object is nullptr, or when it is not recognized and no
 * original_msize was supplied.
 */
extern "C" TBBMALLOC_EXPORT size_t __TBB_malloc_safer_msize(void *object, size_t (*original_msize)(void*))
{
    if (object) {
        // Check if the memory was allocated by scalable_malloc
        if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(object))
            return internalMsize(object);
        else if (original_msize)
            return original_msize(object);
    }
    // object is nullptr or unknown, or foreign and no original_msize
#if USE_WINTHREAD
    errno = EINVAL; // errno expected to be set only on this platform
#endif
    return 0;
}

/*
 * The same as above but for _aligned_msize case.
 * alignment and offset are only forwarded to orig_aligned_msize; they are not used
 * for blocks recognized by this allocator. On failure 0 is returned and errno is
 * set to EINVAL regardless of the platform.
 */
extern "C" TBBMALLOC_EXPORT size_t __TBB_malloc_safer_aligned_msize(void *object, size_t alignment, size_t offset, size_t (*orig_aligned_msize)(void*,size_t,size_t))
{
    if (object) {
        // Check if the memory was allocated by scalable_malloc
        if (mallocInitialized.load(std::memory_order_acquire) && isRecognized(object))
            return internalMsize(object);
        else if (orig_aligned_msize)
            return orig_aligned_msize(object,alignment,offset);
    }
    // object is nullptr or unknown
    errno = EINVAL;
    return 0;
}

/********* End code for scalable_msize   ***********/

/*
 * Adjusts an allocator-wide setting selected by param; value is interpreted per setting.
 * Returns TBBMALLOC_OK when the setting was handed to the allocator,
 * TBBMALLOC_NO_EFFECT for USE_HUGE_PAGES when __unix__ is not defined, and
 * TBBMALLOC_INVALID_PARAM for an unknown param or a value outside {0, 1} where only
 * those two are accepted.
 * NOTE: the else-if chain below is interleaved with preprocessor conditionals; the
 * braces balance only after preprocessing, so keep the directive placement intact.
 */
extern "C" int scalable_allocation_mode(int param, intptr_t value)
{
    if (param == TBBMALLOC_SET_SOFT_HEAP_LIMIT) {
        // value is passed on as size_t to the backend of the default pool
        defaultMemPool->extMemPool.backend.setRecommendedMaxSize((size_t)value);
        return TBBMALLOC_OK;
    } else if (param == USE_HUGE_PAGES) {
#if __unix__
        switch (value) {
        case 0:
        case 1:
            hugePages.setMode(value);
            return TBBMALLOC_OK;
        default:
            return TBBMALLOC_INVALID_PARAM;
        }
#else
        return TBBMALLOC_NO_EFFECT;
#endif
#if __TBB_SOURCE_DIRECTLY_INCLUDED
    } else if (param == TBBMALLOC_INTERNAL_SOURCE_INCLUDED) {
        switch (value) {
        case 0: // used by dynamic library
        case 1: // used by static library or directly included sources
            usedBySrcIncluded = value;
            return TBBMALLOC_OK;
        default:
            return TBBMALLOC_INVALID_PARAM;
        }
#endif
    } else if (param == TBBMALLOC_SET_HUGE_SIZE_THRESHOLD) {
        // value is passed on as size_t to the large object cache of the default pool
        defaultMemPool->extMemPool.loc.setHugeSizeThreshold((size_t)value);
        return TBBMALLOC_OK;
    }
    return TBBMALLOC_INVALID_PARAM;
}

/*
 * Executes a cleanup command on the default pool.
 * cmd   - TBBMALLOC_CLEAN_THREAD_BUFFERS (calls externalCleanup on the calling thread's
 *         TLS data, if that data already exists) or TBBMALLOC_CLEAN_ALL_BUFFERS
 *         (calls hardCachesCleanup on the default pool).
 * param - must be nullptr; any other value is rejected.
 * Returns TBBMALLOC_OK when the invoked cleanup reported success, TBBMALLOC_NO_EFFECT
 * when it did not (or was not run), and TBBMALLOC_INVALID_PARAM for a non-null param
 * or an unknown cmd.
 */
extern "C" int scalable_allocation_command(int cmd, void *param)
{
    if (param)
        return TBBMALLOC_INVALID_PARAM;

    bool released = false;
    switch(cmd) {
    case TBBMALLOC_CLEAN_THREAD_BUFFERS:
        // create=false: a thread that has no TLS data yet has nothing to clean
        if (TLSData *tls = defaultMemPool->getTLS(/*create=*/false))
            released = tls->externalCleanup(/*cleanOnlyUnused*/false, /*cleanBins=*/true);
        break;
    case TBBMALLOC_CLEAN_ALL_BUFFERS:
        released = defaultMemPool->extMemPool.hardCachesCleanup(true);
        break;
    default:
        return TBBMALLOC_INVALID_PARAM;
    }
    return released ? TBBMALLOC_OK : TBBMALLOC_NO_EFFECT;
}
================================================ FILE: third-party/tbb/src/tbbmalloc/large_objects.cpp ================================================
/* Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "tbbmalloc_internal.h" #include "../src/tbb/environment.h" #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) // Suppress warning: unary minus operator applied to unsigned type, result still unsigned // TBB_REVAMP_TODO: review this warning #pragma warning(push) #pragma warning(disable:4146) #endif /******************************* Allocation of large objects *********************************************/ namespace rml { namespace internal { /* ---------------------------- Large Object cache init section ---------------------------------------- */ void LargeObjectCache::init(ExtMemoryPool *memPool) { extMemPool = memPool; // scalable_allocation_mode can be called before allocator initialization, respect this manual request if (hugeSizeThreshold == 0) { // Huge size threshold initialization if environment variable was set long requestedThreshold = tbb::detail::r1::GetIntegralEnvironmentVariable("TBB_MALLOC_SET_HUGE_SIZE_THRESHOLD"); // Read valid env or initialize by default with max possible values if (requestedThreshold != -1) { setHugeSizeThreshold(requestedThreshold); } else { setHugeSizeThreshold(maxHugeSize); } } } /* ----------------------------- Huge size threshold settings ----------------------------------------- */ void LargeObjectCache::setHugeSizeThreshold(size_t value) { // Valid in the huge cache range: [MaxLargeSize, MaxHugeSize]. if (value <= maxHugeSize) { hugeSizeThreshold = value >= maxLargeSize ? 
alignToBin(value) : maxLargeSize; // Calculate local indexes for the global threshold size (for fast search inside a regular cleanup) largeCache.hugeSizeThresholdIdx = LargeCacheType::numBins; hugeCache.hugeSizeThresholdIdx = HugeCacheType::sizeToIdx(hugeSizeThreshold); } } bool LargeObjectCache::sizeInCacheRange(size_t size) { return size < maxHugeSize && (size <= defaultMaxHugeSize || size >= hugeSizeThreshold); } /* ----------------------------------------------------------------------------------------------------- */ /* The functor called by the aggregator for the operation list */ template class CacheBinFunctor { typename LargeObjectCacheImpl::CacheBin *const bin; ExtMemoryPool *const extMemPool; typename LargeObjectCacheImpl::BinBitMask *const bitMask; const int idx; LargeMemoryBlock *toRelease; bool needCleanup; uintptr_t currTime; /* Do preprocessing under the operation list. */ /* All the OP_PUT_LIST operations are merged in the one operation. All OP_GET operations are merged with the OP_PUT_LIST operations but it demands the update of the moving average value in the bin. Only the last OP_CLEAN_TO_THRESHOLD operation has sense. The OP_CLEAN_ALL operation also should be performed only once. Moreover it cancels the OP_CLEAN_TO_THRESHOLD operation. */ class OperationPreprocessor { // TODO: remove the dependency on CacheBin. typename LargeObjectCacheImpl::CacheBin *const bin; /* Contains the relative time in the operation list. It counts in the reverse order since the aggregator also provides operations in the reverse order. */ uintptr_t lclTime; /* opGet contains only OP_GET operations which cannot be merge with OP_PUT operations opClean contains all OP_CLEAN_TO_THRESHOLD and OP_CLEAN_ALL operations. */ CacheBinOperation *opGet, *opClean; /* The time of the last OP_CLEAN_TO_THRESHOLD operations */ uintptr_t cleanTime; /* lastGetOpTime - the time of the last OP_GET operation. 
lastGet - the same meaning as CacheBin::lastGet */ uintptr_t lastGetOpTime, lastGet; /* The total sum of all usedSize changes requested with CBOP_UPDATE_USED_SIZE operations. */ size_t updateUsedSize; /* The list of blocks for the OP_PUT_LIST operation. */ LargeMemoryBlock *head, *tail; int putListNum; /* if the OP_CLEAN_ALL is requested. */ bool isCleanAll; inline void commitOperation(CacheBinOperation *op) const; inline void addOpToOpList(CacheBinOperation *op, CacheBinOperation **opList) const; bool getFromPutList(CacheBinOperation* opGet, uintptr_t currTime); void addToPutList( LargeMemoryBlock *head, LargeMemoryBlock *tail, int num ); public: OperationPreprocessor(typename LargeObjectCacheImpl::CacheBin *bin) : bin(bin), lclTime(0), opGet(nullptr), opClean(nullptr), cleanTime(0), lastGetOpTime(0), lastGet(0), updateUsedSize(0), head(nullptr), tail(nullptr), putListNum(0), isCleanAll(false) {} void operator()(CacheBinOperation* opList); uintptr_t getTimeRange() const { return -lclTime; } friend class CacheBinFunctor; }; public: CacheBinFunctor(typename LargeObjectCacheImpl::CacheBin *bin, ExtMemoryPool *extMemPool, typename LargeObjectCacheImpl::BinBitMask *bitMask, int idx) : bin(bin), extMemPool(extMemPool), bitMask(bitMask), idx(idx), toRelease(nullptr), needCleanup(false), currTime(0) {} void operator()(CacheBinOperation* opList); bool isCleanupNeeded() const { return needCleanup; } LargeMemoryBlock *getToRelease() const { return toRelease; } uintptr_t getCurrTime() const { return currTime; } }; /* ---------------- Cache Bin Aggregator Operation Helpers ---------------- */ // The list of structures which describe the operation data struct OpGet { static const CacheBinOperationType type = CBOP_GET; LargeMemoryBlock **res; size_t size; uintptr_t currTime; }; struct OpPutList { static const CacheBinOperationType type = CBOP_PUT_LIST; LargeMemoryBlock *head; }; struct OpCleanToThreshold { static const CacheBinOperationType type = CBOP_CLEAN_TO_THRESHOLD; 
LargeMemoryBlock **res; uintptr_t currTime; }; struct OpCleanAll { static const CacheBinOperationType type = CBOP_CLEAN_ALL; LargeMemoryBlock **res; }; struct OpUpdateUsedSize { static const CacheBinOperationType type = CBOP_UPDATE_USED_SIZE; size_t size; }; union CacheBinOperationData { private: OpGet opGet; OpPutList opPutList; OpCleanToThreshold opCleanToThreshold; OpCleanAll opCleanAll; OpUpdateUsedSize opUpdateUsedSize; }; // Forward declarations template OpTypeData& opCast(CacheBinOperation &op); // Describes the aggregator operation struct CacheBinOperation : public MallocAggregatedOperation::type { CacheBinOperationType type; template CacheBinOperation(OpTypeData &d, CacheBinOperationStatus st = CBST_WAIT) { opCast(*this) = d; type = OpTypeData::type; MallocAggregatedOperation::type::status = st; } private: CacheBinOperationData data; template friend OpTypeData& opCast(CacheBinOperation &op); }; // The opCast function can be the member of CacheBinOperation but it will have // small stylistic ambiguity: it will look like a getter (with a cast) for the // CacheBinOperation::data data member but it should return a reference to // simplify the code from a lot of getter/setter calls. So the global cast in // the style of static_cast (or reinterpret_cast) seems to be more readable and // have more explicit semantic. 
template OpTypeData& opCast(CacheBinOperation &op) { return *reinterpret_cast(&op.data); } /* ------------------------------------------------------------------------ */ #if __TBB_MALLOC_LOCACHE_STAT //intptr_t mallocCalls, cacheHits; std::atomic mallocCalls, cacheHits; //intptr_t memAllocKB, memHitKB; std::atomic memAllocKB, memHitKB; #endif #if MALLOC_DEBUG inline bool lessThanWithOverflow(intptr_t a, intptr_t b) { return (a < b && (b - a < static_cast(UINTPTR_MAX/2))) || (a > b && (a - b > static_cast(UINTPTR_MAX/2))); } #endif /* ----------------------------------- Operation processing methods ------------------------------------ */ template void CacheBinFunctor:: OperationPreprocessor::commitOperation(CacheBinOperation *op) const { // FencedStore( (intptr_t&)(op->status), CBST_DONE ); op->status.store(CBST_DONE, std::memory_order_release); } template void CacheBinFunctor:: OperationPreprocessor::addOpToOpList(CacheBinOperation *op, CacheBinOperation **opList) const { op->next = *opList; *opList = op; } template bool CacheBinFunctor:: OperationPreprocessor::getFromPutList(CacheBinOperation *opGet, uintptr_t currTime) { if ( head ) { uintptr_t age = head->age; LargeMemoryBlock *next = head->next; *opCast(*opGet).res = head; commitOperation( opGet ); head = next; putListNum--; MALLOC_ASSERT( putListNum>=0, ASSERT_TEXT ); // use moving average with current hit interval bin->updateMeanHitRange( currTime - age ); return true; } return false; } template void CacheBinFunctor:: OperationPreprocessor::addToPutList(LargeMemoryBlock *h, LargeMemoryBlock *t, int num) { if ( head ) { MALLOC_ASSERT( tail, ASSERT_TEXT ); tail->next = h; h->prev = tail; tail = t; putListNum += num; } else { head = h; tail = t; putListNum = num; } } template void CacheBinFunctor:: OperationPreprocessor::operator()(CacheBinOperation* opList) { for ( CacheBinOperation *op = opList, *opNext; op; op = opNext ) { opNext = op->next; switch ( op->type ) { case CBOP_GET: { lclTime--; if ( 
!lastGetOpTime ) { lastGetOpTime = lclTime; lastGet = 0; } else if ( !lastGet ) lastGet = lclTime; if ( !getFromPutList(op,lclTime) ) { opCast(*op).currTime = lclTime; addOpToOpList( op, &opGet ); } } break; case CBOP_PUT_LIST: { LargeMemoryBlock *head = opCast(*op).head; LargeMemoryBlock *curr = head, *prev = nullptr; int num = 0; do { // we do not kept prev pointers during assigning blocks to bins, set them now curr->prev = prev; // Save the local times to the memory blocks. Local times are necessary // for the getFromPutList function which updates the hit range value in // CacheBin when OP_GET and OP_PUT_LIST operations are merged successfully. // The age will be updated to the correct global time after preprocessing // when global cache time is updated. curr->age = --lclTime; prev = curr; num += 1; STAT_increment(getThreadId(), ThreadCommonCounters, cacheLargeObj); } while ((curr = curr->next) != nullptr); LargeMemoryBlock *tail = prev; addToPutList(head, tail, num); while ( opGet ) { CacheBinOperation *next = opGet->next; if ( !getFromPutList(opGet, opCast(*opGet).currTime) ) break; opGet = next; } } break; case CBOP_UPDATE_USED_SIZE: updateUsedSize += opCast(*op).size; commitOperation( op ); break; case CBOP_CLEAN_ALL: isCleanAll = true; addOpToOpList( op, &opClean ); break; case CBOP_CLEAN_TO_THRESHOLD: { uintptr_t currTime = opCast(*op).currTime; // We don't worry about currTime overflow since it is a rare // occurrence and doesn't affect correctness cleanTime = cleanTime < currTime ? currTime : cleanTime; addOpToOpList( op, &opClean ); } break; default: MALLOC_ASSERT( false, "Unknown operation." ); } } MALLOC_ASSERT( !( opGet && head ), "Not all put/get pairs are processed!" ); } template void CacheBinFunctor::operator()(CacheBinOperation* opList) { MALLOC_ASSERT( opList, "Empty operation list is passed into operation handler." 
); OperationPreprocessor prep(bin); prep(opList); if ( uintptr_t timeRange = prep.getTimeRange() ) { uintptr_t startTime = extMemPool->loc.getCurrTimeRange(timeRange); // endTime is used as the current (base) time since the local time is negative. uintptr_t endTime = startTime + timeRange; if ( prep.lastGetOpTime && prep.lastGet ) bin->setLastGet(prep.lastGet+endTime); if ( CacheBinOperation *opGet = prep.opGet ) { bool isEmpty = false; do { #if __TBB_MALLOC_WHITEBOX_TEST tbbmalloc_whitebox::locGetProcessed++; #endif const OpGet &opGetData = opCast(*opGet); if ( !isEmpty ) { if ( LargeMemoryBlock *res = bin->get() ) { uintptr_t getTime = opGetData.currTime + endTime; // use moving average with current hit interval bin->updateMeanHitRange( getTime - res->age); bin->updateCachedSize( -opGetData.size ); *opGetData.res = res; } else { isEmpty = true; uintptr_t lastGetOpTime = prep.lastGetOpTime+endTime; bin->forgetOutdatedState(lastGetOpTime); bin->updateAgeThreshold(lastGetOpTime); } } CacheBinOperation *opNext = opGet->next; bin->updateUsedSize( opGetData.size, bitMask, idx ); prep.commitOperation( opGet ); opGet = opNext; } while ( opGet ); if ( prep.lastGetOpTime ) bin->setLastGet( prep.lastGetOpTime + endTime ); } else if ( LargeMemoryBlock *curr = prep.head ) { curr->prev = nullptr; while ( curr ) { // Update local times to global times curr->age += endTime; curr=curr->next; } #if __TBB_MALLOC_WHITEBOX_TEST tbbmalloc_whitebox::locPutProcessed+=prep.putListNum; #endif toRelease = bin->putList(prep.head, prep.tail, bitMask, idx, prep.putListNum, extMemPool->loc.hugeSizeThreshold); } needCleanup = extMemPool->loc.isCleanupNeededOnRange(timeRange, startTime); currTime = endTime - 1; } if ( CacheBinOperation *opClean = prep.opClean ) { if ( prep.isCleanAll ) *opCast(*opClean).res = bin->cleanAll(bitMask, idx); else *opCast(*opClean).res = bin->cleanToThreshold(prep.cleanTime, bitMask, idx); CacheBinOperation *opNext = opClean->next; prep.commitOperation( opClean ); 
while ((opClean = opNext) != nullptr) { opNext = opClean->next; prep.commitOperation(opClean); } } if ( size_t size = prep.updateUsedSize ) bin->updateUsedSize(size, bitMask, idx); } /* ----------------------------------------------------------------------------------------------------- */ /* --------------------------- Methods for creating and executing operations --------------------------- */ template void LargeObjectCacheImpl:: CacheBin::ExecuteOperation(CacheBinOperation *op, ExtMemoryPool *extMemPool, BinBitMask *bitMask, int idx, bool longLifeTime) { CacheBinFunctor func( this, extMemPool, bitMask, idx ); aggregator.execute( op, func, longLifeTime ); if ( LargeMemoryBlock *toRelease = func.getToRelease()) { extMemPool->backend.returnLargeObject(toRelease); } if ( func.isCleanupNeeded() ) { extMemPool->loc.doCleanup( func.getCurrTime(), /*doThreshDecr=*/false); } } template LargeMemoryBlock *LargeObjectCacheImpl:: CacheBin::get(ExtMemoryPool *extMemPool, size_t size, BinBitMask *bitMask, int idx) { LargeMemoryBlock *lmb=nullptr; OpGet data = {&lmb, size, static_cast(0)}; CacheBinOperation op(data); ExecuteOperation( &op, extMemPool, bitMask, idx ); return lmb; } template void LargeObjectCacheImpl:: CacheBin::putList(ExtMemoryPool *extMemPool, LargeMemoryBlock *head, BinBitMask *bitMask, int idx) { MALLOC_ASSERT(sizeof(LargeMemoryBlock)+sizeof(CacheBinOperation)<=head->unalignedSize, "CacheBinOperation is too large to be placed in LargeMemoryBlock!"); OpPutList data = {head}; CacheBinOperation *op = new (head+1) CacheBinOperation(data, CBST_NOWAIT); ExecuteOperation( op, extMemPool, bitMask, idx, false ); } template bool LargeObjectCacheImpl:: CacheBin::cleanToThreshold(ExtMemoryPool *extMemPool, BinBitMask *bitMask, uintptr_t currTime, int idx) { LargeMemoryBlock *toRelease = nullptr; /* oldest may be more recent then age, that's why cast to signed type was used. age overflow is also processed correctly. 
*/ if (last.load(std::memory_order_relaxed) && (intptr_t)(currTime - oldest.load(std::memory_order_relaxed)) > ageThreshold.load(std::memory_order_relaxed)) { OpCleanToThreshold data = {&toRelease, currTime}; CacheBinOperation op(data); ExecuteOperation( &op, extMemPool, bitMask, idx ); } bool released = toRelease; Backend *backend = &extMemPool->backend; while ( toRelease ) { LargeMemoryBlock *helper = toRelease->next; backend->returnLargeObject(toRelease); toRelease = helper; } return released; } template bool LargeObjectCacheImpl:: CacheBin::releaseAllToBackend(ExtMemoryPool *extMemPool, BinBitMask *bitMask, int idx) { LargeMemoryBlock *toRelease = nullptr; if (last.load(std::memory_order_relaxed)) { OpCleanAll data = {&toRelease}; CacheBinOperation op(data); ExecuteOperation(&op, extMemPool, bitMask, idx); } bool released = toRelease; Backend *backend = &extMemPool->backend; while ( toRelease ) { LargeMemoryBlock *helper = toRelease->next; MALLOC_ASSERT(!helper || lessThanWithOverflow(helper->age, toRelease->age), ASSERT_TEXT); backend->returnLargeObject(toRelease); toRelease = helper; } return released; } template void LargeObjectCacheImpl:: CacheBin::updateUsedSize(ExtMemoryPool *extMemPool, size_t size, BinBitMask *bitMask, int idx) { OpUpdateUsedSize data = {size}; CacheBinOperation op(data); ExecuteOperation( &op, extMemPool, bitMask, idx ); } /* ------------------------------ Unsafe methods used with the aggregator ------------------------------ */ template LargeMemoryBlock *LargeObjectCacheImpl:: CacheBin::putList(LargeMemoryBlock *head, LargeMemoryBlock *tail, BinBitMask *bitMask, int idx, int num, size_t hugeSizeThreshold) { size_t size = head->unalignedSize; usedSize.store(usedSize.load(std::memory_order_relaxed) - num * size, std::memory_order_relaxed); MALLOC_ASSERT( !last.load(std::memory_order_relaxed) || (last.load(std::memory_order_relaxed)->age != 0 && last.load(std::memory_order_relaxed)->age != -1U), ASSERT_TEXT ); MALLOC_ASSERT( (tail==head 
&& num==1) || (tail!=head && num>1), ASSERT_TEXT ); MALLOC_ASSERT( tail, ASSERT_TEXT ); LargeMemoryBlock *toRelease = nullptr; if (size < hugeSizeThreshold && !lastCleanedAge) { // 1st object of such size was released. // Not cache it, and remember when this occurs // to take into account during cache miss. lastCleanedAge = tail->age; toRelease = tail; tail = tail->prev; if (tail) tail->next = nullptr; else head = nullptr; num--; } if (num) { // add [head;tail] list to cache tail->next = first; if (first) first->prev = tail; first = head; if (!last.load(std::memory_order_relaxed)) { MALLOC_ASSERT(0 == oldest.load(std::memory_order_relaxed), ASSERT_TEXT); oldest.store(tail->age, std::memory_order_relaxed); last.store(tail, std::memory_order_relaxed); } cachedSize.store(cachedSize.load(std::memory_order_relaxed) + num * size, std::memory_order_relaxed); } // No used object, and nothing in the bin, mark the bin as empty if (!usedSize.load(std::memory_order_relaxed) && !first) bitMask->set(idx, false); return toRelease; } template LargeMemoryBlock *LargeObjectCacheImpl:: CacheBin::get() { LargeMemoryBlock *result=first; if (result) { first = result->next; if (first) first->prev = nullptr; else { last.store(nullptr, std::memory_order_relaxed); oldest.store(0, std::memory_order_relaxed); } } return result; } template void LargeObjectCacheImpl:: CacheBin::forgetOutdatedState(uintptr_t currTime) { // If the time since the last get is LongWaitFactor times more than ageThreshold // for the bin, treat the bin as rarely-used and forget everything we know // about it. // If LongWaitFactor is too small, we forget too early and // so prevents good caching, while if too high, caching blocks // with unrelated usage pattern occurs. 
const uintptr_t sinceLastGet = currTime - lastGet; bool doCleanup = false; intptr_t threshold = ageThreshold.load(std::memory_order_relaxed); if (threshold) doCleanup = sinceLastGet > static_cast(Props::LongWaitFactor * threshold); else if (lastCleanedAge) doCleanup = sinceLastGet > static_cast(Props::LongWaitFactor * (lastCleanedAge - lastGet)); if (doCleanup) { lastCleanedAge = 0; ageThreshold.store(0, std::memory_order_relaxed); } } template LargeMemoryBlock *LargeObjectCacheImpl:: CacheBin::cleanToThreshold(uintptr_t currTime, BinBitMask *bitMask, int idx) { /* oldest may be more recent then age, that's why cast to signed type was used. age overflow is also processed correctly. */ if ( !last.load(std::memory_order_relaxed) || (intptr_t)(currTime - last.load(std::memory_order_relaxed)->age) < ageThreshold.load(std::memory_order_relaxed) ) return nullptr; #if MALLOC_DEBUG uintptr_t nextAge = 0; #endif do { #if MALLOC_DEBUG // check that list ordered MALLOC_ASSERT(!nextAge || lessThanWithOverflow(nextAge, last.load(std::memory_order_relaxed)->age), ASSERT_TEXT); nextAge = last.load(std::memory_order_relaxed)->age; #endif cachedSize.store(cachedSize.load(std::memory_order_relaxed) - last.load(std::memory_order_relaxed)->unalignedSize, std::memory_order_relaxed); last.store(last.load(std::memory_order_relaxed)->prev, std::memory_order_relaxed); } while (last.load(std::memory_order_relaxed) && (intptr_t)(currTime - last.load(std::memory_order_relaxed)->age) > ageThreshold.load(std::memory_order_relaxed)); LargeMemoryBlock *toRelease = nullptr; if (last.load(std::memory_order_relaxed)) { toRelease = last.load(std::memory_order_relaxed)->next; oldest.store(last.load(std::memory_order_relaxed)->age, std::memory_order_relaxed); last.load(std::memory_order_relaxed)->next = nullptr; } else { toRelease = first; first = nullptr; oldest.store(0, std::memory_order_relaxed); if (!usedSize.load(std::memory_order_relaxed)) bitMask->set(idx, false); } MALLOC_ASSERT( toRelease, 
ASSERT_TEXT ); lastCleanedAge = toRelease->age; return toRelease; } template LargeMemoryBlock *LargeObjectCacheImpl:: CacheBin::cleanAll(BinBitMask *bitMask, int idx) { if (!last.load(std::memory_order_relaxed)) return nullptr; LargeMemoryBlock *toRelease = first; last.store(nullptr, std::memory_order_relaxed); first = nullptr; oldest.store(0, std::memory_order_relaxed); cachedSize.store(0, std::memory_order_relaxed); if (!usedSize.load(std::memory_order_relaxed)) bitMask->set(idx, false); return toRelease; } /* ----------------------------------------------------------------------------------------------------- */ #if __TBB_MALLOC_BACKEND_STAT template size_t LargeObjectCacheImpl:: CacheBin::reportStat(int num, FILE *f) { #if __TBB_MALLOC_LOCACHE_STAT if (first) printf("%d(%lu): total %lu KB thr %ld lastCln %lu oldest %lu\n", num, num*Props::CacheStep+Props::MinSize, cachedSize.load(std::memory_order_relaxed)/1024, ageThresholdageThreshold.load(std::memory_order_relaxed), lastCleanedAge, oldest.load(std::memory_order_relaxed)); #else suppress_unused_warning(num); suppress_unused_warning(f); #endif return cachedSize.load(std::memory_order_relaxed); } #endif // Release objects from cache blocks that are older than ageThreshold template bool LargeObjectCacheImpl::regularCleanup(ExtMemoryPool *extMemPool, uintptr_t currTime, bool doThreshDecr) { bool released = false; BinsSummary binsSummary; // Threshold settings is below this cache or starts from zero index if (hugeSizeThresholdIdx == 0) return false; // Starting searching for bin that is less than huge size threshold (can be cleaned-up) int startSearchIdx = hugeSizeThresholdIdx - 1; for (int i = bitMask.getMaxTrue(startSearchIdx); i >= 0; i = bitMask.getMaxTrue(i-1)) { bin[i].updateBinsSummary(&binsSummary); if (!doThreshDecr && tooLargeLOC.load(std::memory_order_relaxed) > 2 && binsSummary.isLOCTooLarge()) { // if LOC is too large for quite long time, decrease the threshold // based on bin hit statistics. 
// For this, redo cleanup from the beginning. // Note: on this iteration total usedSz can be not too large // in comparison to total cachedSz, as we calculated it only // partially. We are ok with it. i = bitMask.getMaxTrue(startSearchIdx)+1; doThreshDecr = true; binsSummary.reset(); continue; } if (doThreshDecr) bin[i].decreaseThreshold(); if (bin[i].cleanToThreshold(extMemPool, &bitMask, currTime, i)) { released = true; } } // We want to find if LOC was too large for some time continuously, // so OK with races between incrementing and zeroing, but incrementing // must be atomic. if (binsSummary.isLOCTooLarge()) { tooLargeLOC++; } else { tooLargeLOC.store(0, std::memory_order_relaxed); } return released; } template bool LargeObjectCacheImpl::cleanAll(ExtMemoryPool *extMemPool) { bool released = false; for (int i = numBins-1; i >= 0; i--) { released |= bin[i].releaseAllToBackend(extMemPool, &bitMask, i); } return released; } template void LargeObjectCacheImpl::reset() { tooLargeLOC.store(0, std::memory_order_relaxed); for (int i = numBins-1; i >= 0; i--) bin[i].init(); bitMask.reset(); } #if __TBB_MALLOC_WHITEBOX_TEST template size_t LargeObjectCacheImpl::getLOCSize() const { size_t size = 0; for (int i = numBins-1; i >= 0; i--) size += bin[i].getSize(); return size; } size_t LargeObjectCache::getLOCSize() const { return largeCache.getLOCSize() + hugeCache.getLOCSize(); } template size_t LargeObjectCacheImpl::getUsedSize() const { size_t size = 0; for (int i = numBins-1; i >= 0; i--) size += bin[i].getUsedSize(); return size; } size_t LargeObjectCache::getUsedSize() const { return largeCache.getUsedSize() + hugeCache.getUsedSize(); } #endif // __TBB_MALLOC_WHITEBOX_TEST inline bool LargeObjectCache::isCleanupNeededOnRange(uintptr_t range, uintptr_t currTime) { return range >= cacheCleanupFreq || currTime+range < currTime-1 // overflow, 0 is power of 2, do cleanup // (prev;prev+range] contains n*cacheCleanupFreq || alignUp(currTime, 
cacheCleanupFreq)allLocalCaches.markUnused(); bool large_cache_cleaned = largeCache.regularCleanup(extMemPool, currTime, doThreshDecr); bool huge_cache_cleaned = hugeCache.regularCleanup(extMemPool, currTime, doThreshDecr); return large_cache_cleaned || huge_cache_cleaned; } bool LargeObjectCache::decreasingCleanup() { return doCleanup(cacheCurrTime.load(std::memory_order_acquire), /*doThreshDecr=*/true); } bool LargeObjectCache::regularCleanup() { return doCleanup(cacheCurrTime.load(std::memory_order_acquire), /*doThreshDecr=*/false); } bool LargeObjectCache::cleanAll() { bool large_cache_cleaned = largeCache.cleanAll(extMemPool); bool huge_cache_cleaned = hugeCache.cleanAll(extMemPool); return large_cache_cleaned || huge_cache_cleaned; } void LargeObjectCache::reset() { largeCache.reset(); hugeCache.reset(); } template LargeMemoryBlock *LargeObjectCacheImpl::get(ExtMemoryPool *extMemoryPool, size_t size) { int idx = Props::sizeToIdx(size); LargeMemoryBlock *lmb = bin[idx].get(extMemoryPool, size, &bitMask, idx); if (lmb) { MALLOC_ITT_SYNC_ACQUIRED(bin+idx); STAT_increment(getThreadId(), ThreadCommonCounters, allocCachedLargeObj); } return lmb; } template void LargeObjectCacheImpl::updateCacheState(ExtMemoryPool *extMemPool, DecreaseOrIncrease op, size_t size) { int idx = Props::sizeToIdx(size); MALLOC_ASSERT(idx < static_cast(numBins), ASSERT_TEXT); bin[idx].updateUsedSize(extMemPool, op==decrease? 
-size : size, &bitMask, idx); } #if __TBB_MALLOC_LOCACHE_STAT template void LargeObjectCacheImpl::reportStat(FILE *f) { size_t cachedSize = 0; for (int i=0; i void LargeObjectCacheImpl::putList(ExtMemoryPool *extMemPool, LargeMemoryBlock *toCache) { int toBinIdx = Props::sizeToIdx(toCache->unalignedSize); MALLOC_ITT_SYNC_RELEASING(bin+toBinIdx); bin[toBinIdx].putList(extMemPool, toCache, &bitMask, toBinIdx); } void LargeObjectCache::updateCacheState(DecreaseOrIncrease op, size_t size) { if (size < maxLargeSize) largeCache.updateCacheState(extMemPool, op, size); else if (size < maxHugeSize) hugeCache.updateCacheState(extMemPool, op, size); } uintptr_t LargeObjectCache::getCurrTimeRange(uintptr_t range) { return (cacheCurrTime.fetch_add(range) + 1); } void LargeObjectCache::registerRealloc(size_t oldSize, size_t newSize) { updateCacheState(decrease, oldSize); updateCacheState(increase, alignToBin(newSize)); } size_t LargeObjectCache::alignToBin(size_t size) { return size < maxLargeSize ? LargeCacheType::alignToBin(size) : HugeCacheType::alignToBin(size); } // Used for internal purpose int LargeObjectCache::sizeToIdx(size_t size) { MALLOC_ASSERT(size <= maxHugeSize, ASSERT_TEXT); return size < maxLargeSize ? LargeCacheType::sizeToIdx(size) : LargeCacheType::numBins + HugeCacheType::sizeToIdx(size); } void LargeObjectCache::putList(LargeMemoryBlock *list) { LargeMemoryBlock *toProcess, *n; for (LargeMemoryBlock *curr = list; curr; curr = toProcess) { LargeMemoryBlock *tail = curr; toProcess = curr->next; if (!sizeInCacheRange(curr->unalignedSize)) { extMemPool->backend.returnLargeObject(curr); continue; } int currIdx = sizeToIdx(curr->unalignedSize); // Find all blocks fitting to same bin. Not use more efficient sorting // algorithm because list is short (commonly, // LocalLOC's HIGH_MARK-LOW_MARK, i.e. 24 items). 
for (LargeMemoryBlock *b = toProcess; b; b = n) { n = b->next; if (sizeToIdx(b->unalignedSize) == currIdx) { tail->next = b; tail = b; if (toProcess == b) toProcess = toProcess->next; else { b->prev->next = b->next; if (b->next) b->next->prev = b->prev; } } } tail->next = nullptr; if (curr->unalignedSize < maxLargeSize) largeCache.putList(extMemPool, curr); else hugeCache.putList(extMemPool, curr); } } void LargeObjectCache::put(LargeMemoryBlock *largeBlock) { size_t blockSize = largeBlock->unalignedSize; if (sizeInCacheRange(blockSize)) { largeBlock->next = nullptr; if (blockSize < maxLargeSize) largeCache.putList(extMemPool, largeBlock); else hugeCache.putList(extMemPool, largeBlock); } else { extMemPool->backend.returnLargeObject(largeBlock); } } LargeMemoryBlock *LargeObjectCache::get(size_t size) { MALLOC_ASSERT( size >= minLargeSize, ASSERT_TEXT ); if (sizeInCacheRange(size)) { return size < maxLargeSize ? largeCache.get(extMemPool, size) : hugeCache.get(extMemPool, size); } return nullptr; } LargeMemoryBlock *ExtMemoryPool::mallocLargeObject(MemoryPool *pool, size_t allocationSize) { #if __TBB_MALLOC_LOCACHE_STAT mallocCalls++; memAllocKB.fetch_add(allocationSize/1024); #endif LargeMemoryBlock* lmb = loc.get(allocationSize); if (!lmb) { BackRefIdx backRefIdx = BackRefIdx::newBackRef(/*largeObj=*/true); if (backRefIdx.isInvalid()) return nullptr; // unalignedSize is set in getLargeBlock lmb = backend.getLargeBlock(allocationSize); if (!lmb) { removeBackRef(backRefIdx); loc.updateCacheState(decrease, allocationSize); return nullptr; } lmb->backRefIdx = backRefIdx; lmb->pool = pool; STAT_increment(getThreadId(), ThreadCommonCounters, allocNewLargeObj); } else { #if __TBB_MALLOC_LOCACHE_STAT cacheHits++; memHitKB.fetch_add(allocationSize/1024); #endif } return lmb; } void ExtMemoryPool::freeLargeObject(LargeMemoryBlock *mBlock) { loc.put(mBlock); } void ExtMemoryPool::freeLargeObjectList(LargeMemoryBlock *head) { loc.putList(head); } bool 
ExtMemoryPool::softCachesCleanup() { bool ret = false; if (!softCachesCleanupInProgress.exchange(1, std::memory_order_acq_rel)) { ret = loc.regularCleanup(); softCachesCleanupInProgress.store(0, std::memory_order_release); } return ret; } bool ExtMemoryPool::hardCachesCleanup(bool wait) { if (hardCachesCleanupInProgress.exchange(1, std::memory_order_acq_rel)) { if (!wait) return false; AtomicBackoff backoff; while (hardCachesCleanupInProgress.exchange(1, std::memory_order_acq_rel)) backoff.pause(); } // thread-local caches must be cleaned before LOC, // because object from thread-local cache can be released to LOC bool ret = releaseAllLocalCaches(); ret |= orphanedBlocks.cleanup(&backend); ret |= loc.cleanAll(); ret |= backend.clean(); hardCachesCleanupInProgress.store(0, std::memory_order_release); return ret; } #if BACKEND_HAS_MREMAP void *ExtMemoryPool::remap(void *ptr, size_t oldSize, size_t newSize, size_t alignment) { const size_t oldUnalignedSize = ((LargeObjectHdr*)ptr - 1)->memoryBlock->unalignedSize; void *o = backend.remap(ptr, oldSize, newSize, alignment); if (o) { LargeMemoryBlock *lmb = ((LargeObjectHdr*)o - 1)->memoryBlock; loc.registerRealloc(oldUnalignedSize, lmb->unalignedSize); } return o; } #endif /* BACKEND_HAS_MREMAP */ /*********** End allocation of large objects **********/ } // namespace internal } // namespace rml #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) #pragma warning(pop) #endif ================================================ FILE: third-party/tbb/src/tbbmalloc/large_objects.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_tbbmalloc_internal_H #error tbbmalloc_internal.h must be included at this point #endif #ifndef __TBB_large_objects_H #define __TBB_large_objects_H //! The list of possible Cache Bin Aggregator operations. /* Declared here to avoid Solaris Studio* 12.2 "multiple definitions" error */ enum CacheBinOperationType { CBOP_INVALID = 0, CBOP_GET, CBOP_PUT_LIST, CBOP_CLEAN_TO_THRESHOLD, CBOP_CLEAN_ALL, CBOP_UPDATE_USED_SIZE }; // The Cache Bin Aggregator operation status list. // CBST_NOWAIT can be specified for non-blocking operations. enum CacheBinOperationStatus { CBST_WAIT = 0, CBST_NOWAIT, CBST_DONE }; /* * Bins that grow with arithmetic step */ template struct LargeBinStructureProps { public: static const size_t MinSize = MIN_SIZE, MaxSize = MAX_SIZE; static const size_t CacheStep = 8 * 1024; static const unsigned NumBins = (MaxSize - MinSize) / CacheStep; static size_t alignToBin(size_t size) { return alignUp(size, CacheStep); } static int sizeToIdx(size_t size) { MALLOC_ASSERT(MinSize <= size && size < MaxSize, ASSERT_TEXT); MALLOC_ASSERT(size % CacheStep == 0, ASSERT_TEXT); return (size - MinSize) / CacheStep; } }; /* * Bins that grow with special geometric progression. 
*/ template struct HugeBinStructureProps { private: // Sizes grow with the following formula: Size = MinSize * (2 ^ (Index / StepFactor)) // There are StepFactor bins (8 be default) between each power of 2 bin static const int MaxSizeExp = Log2::value; static const int MinSizeExp = Log2::value; static const int StepFactor = 8; static const int StepFactorExp = Log2::value; public: static const size_t MinSize = MIN_SIZE, MaxSize = MAX_SIZE; static const unsigned NumBins = (MaxSizeExp - MinSizeExp) * StepFactor; static size_t alignToBin(size_t size) { MALLOC_ASSERT(size >= StepFactor, "Size must not be less than the StepFactor"); int sizeExp = (int)BitScanRev(size); MALLOC_ASSERT(sizeExp >= 0, "BitScanRev() cannot return -1, as size >= stepfactor > 0"); MALLOC_ASSERT(sizeExp >= StepFactorExp, "sizeExp >= StepFactorExp, because size >= stepFactor"); int minorStepExp = sizeExp - StepFactorExp; return alignUp(size, 1ULL << minorStepExp); } // Sizes between the power of 2 values are approximated to StepFactor. static int sizeToIdx(size_t size) { MALLOC_ASSERT(MinSize <= size && size <= MaxSize, ASSERT_TEXT); int sizeExp = (int)BitScanRev(size); // same as __TBB_Log2 MALLOC_ASSERT(sizeExp >= 0, "BitScanRev() cannot return -1, as size >= stepfactor > 0"); MALLOC_ASSERT(sizeExp >= StepFactorExp, "sizeExp >= StepFactorExp, because size >= stepFactor"); int minorStepExp = sizeExp - StepFactorExp; size_t majorStepSize = 1ULL << sizeExp; int minorIdx = (size - majorStepSize) >> minorStepExp; MALLOC_ASSERT(size == majorStepSize + ((size_t)minorIdx << minorStepExp), "Size is not aligned on the bin"); return StepFactor * (sizeExp - MinSizeExp) + minorIdx; } }; /* * Cache properties accessor * * TooLargeFactor -- when cache size treated "too large" in comparison to user data size * OnMissFactor -- If cache miss occurred and cache was cleaned, * set ageThreshold to OnMissFactor * the difference * between current time and last time cache was cleaned. 
* LongWaitFactor -- to detect rarely-used bins and forget about their usage history */ template struct LargeObjectCacheProps : public StructureProps { static const int TooLargeFactor = TOO_LARGE, OnMissFactor = ON_MISS, LongWaitFactor = LONG_WAIT; }; template class LargeObjectCacheImpl { private: // Current sizes of used and cached objects. It's calculated while we are // traversing bins, and used for isLOCTooLarge() check at the same time. class BinsSummary { size_t usedSz; size_t cachedSz; public: BinsSummary() : usedSz(0), cachedSz(0) {} // "too large" criteria bool isLOCTooLarge() const { return cachedSz > Props::TooLargeFactor * usedSz; } void update(size_t usedSize, size_t cachedSize) { usedSz += usedSize; cachedSz += cachedSize; } void reset() { usedSz = cachedSz = 0; } }; public: // The number of bins to cache large/huge objects. static const uint32_t numBins = Props::NumBins; typedef BitMaskMax BinBitMask; // 2-linked list of same-size cached blocks ordered by age (oldest on top) // TODO: are we really want the list to be 2-linked? This allows us // reduce memory consumption and do less operations under lock. // TODO: try to switch to 32-bit logical time to save space in CacheBin // and move bins to different cache lines. class CacheBin { private: LargeMemoryBlock* first; std::atomic last; /* age of an oldest block in the list; equal to last->age, if last defined, used for quick checking it without acquiring the lock. */ std::atomic oldest; /* currAge when something was excluded out of list because of the age, not because of cache hit */ uintptr_t lastCleanedAge; /* Current threshold value for the blocks of a particular size. Set on cache miss. 
*/ std::atomic ageThreshold; /* total size of all objects corresponding to the bin and allocated by user */ std::atomic usedSize; /* total size of all objects cached in the bin */ std::atomic cachedSize; /* mean time of presence of block in the bin before successful reuse */ std::atomic meanHitRange; /* time of last get called for the bin */ uintptr_t lastGet; typename MallocAggregator::type aggregator; void ExecuteOperation(CacheBinOperation *op, ExtMemoryPool *extMemPool, BinBitMask *bitMask, int idx, bool longLifeTime = true); /* should be placed in zero-initialized memory, ctor not needed. */ CacheBin(); public: void init() { memset(static_cast(this), 0, sizeof(CacheBin)); } /* ---------- Cache accessors ---------- */ void putList(ExtMemoryPool *extMemPool, LargeMemoryBlock *head, BinBitMask *bitMask, int idx); LargeMemoryBlock *get(ExtMemoryPool *extMemPool, size_t size, BinBitMask *bitMask, int idx); /* ---------- Cleanup functions -------- */ bool cleanToThreshold(ExtMemoryPool *extMemPool, BinBitMask *bitMask, uintptr_t currTime, int idx); bool releaseAllToBackend(ExtMemoryPool *extMemPool, BinBitMask *bitMask, int idx); /* ------------------------------------- */ void updateUsedSize(ExtMemoryPool *extMemPool, size_t size, BinBitMask *bitMask, int idx); void decreaseThreshold() { intptr_t threshold = ageThreshold.load(std::memory_order_relaxed); if (threshold) ageThreshold.store((threshold + meanHitRange.load(std::memory_order_relaxed)) / 2, std::memory_order_relaxed); } void updateBinsSummary(BinsSummary *binsSummary) const { binsSummary->update(usedSize.load(std::memory_order_relaxed), cachedSize.load(std::memory_order_relaxed)); } size_t getSize() const { return cachedSize.load(std::memory_order_relaxed); } size_t getUsedSize() const { return usedSize.load(std::memory_order_relaxed); } size_t reportStat(int num, FILE *f); /* --------- Unsafe methods used with the aggregator ------- */ void forgetOutdatedState(uintptr_t currTime); LargeMemoryBlock 
*putList(LargeMemoryBlock *head, LargeMemoryBlock *tail, BinBitMask *bitMask, int idx, int num, size_t hugeObjectThreshold); LargeMemoryBlock *get(); LargeMemoryBlock *cleanToThreshold(uintptr_t currTime, BinBitMask *bitMask, int idx); LargeMemoryBlock *cleanAll(BinBitMask *bitMask, int idx); void updateUsedSize(size_t size, BinBitMask *bitMask, int idx) { if (!usedSize.load(std::memory_order_relaxed)) bitMask->set(idx, true); usedSize.store(usedSize.load(std::memory_order_relaxed) + size, std::memory_order_relaxed); if (!usedSize.load(std::memory_order_relaxed) && !first) bitMask->set(idx, false); } void updateMeanHitRange( intptr_t hitRange ) { hitRange = hitRange >= 0 ? hitRange : 0; intptr_t mean = meanHitRange.load(std::memory_order_relaxed); mean = mean ? (mean + hitRange) / 2 : hitRange; meanHitRange.store(mean, std::memory_order_relaxed); } void updateAgeThreshold( uintptr_t currTime ) { if (lastCleanedAge) ageThreshold.store(Props::OnMissFactor * (currTime - lastCleanedAge), std::memory_order_relaxed); } void updateCachedSize(size_t size) { cachedSize.store(cachedSize.load(std::memory_order_relaxed) + size, std::memory_order_relaxed); } void setLastGet( uintptr_t newLastGet ) { lastGet = newLastGet; } /* -------------------------------------------------------- */ }; // Huge bins index for fast regular cleanup searching in case of // the "huge size threshold" setting defined intptr_t hugeSizeThresholdIdx; private: // How many times LOC was "too large" std::atomic tooLargeLOC; // for fast finding of used bins and bins with non-zero usedSize; // indexed from the end, as we need largest 1st BinBitMask bitMask; // bins with lists of recently freed large blocks cached for reuse CacheBin bin[numBins]; public: /* ------------ CacheBin structure dependent stuff ------------ */ static size_t alignToBin(size_t size) { return Props::alignToBin(size); } static int sizeToIdx(size_t size) { return Props::sizeToIdx(size); } /* --------- Main cache functions (put, get 
object) ------------ */ void putList(ExtMemoryPool *extMemPool, LargeMemoryBlock *largeBlock); LargeMemoryBlock *get(ExtMemoryPool *extMemPool, size_t size); /* ------------------------ Cleanup ---------------------------- */ bool regularCleanup(ExtMemoryPool *extMemPool, uintptr_t currAge, bool doThreshDecr); bool cleanAll(ExtMemoryPool *extMemPool); /* -------------------------- Other ---------------------------- */ void updateCacheState(ExtMemoryPool *extMemPool, DecreaseOrIncrease op, size_t size); void reset(); void reportStat(FILE *f); #if __TBB_MALLOC_WHITEBOX_TEST size_t getLOCSize() const; size_t getUsedSize() const; #endif }; class LargeObjectCache { private: // Large bins [minLargeSize, maxLargeSize) // Huge bins [maxLargeSize, maxHugeSize) static const size_t minLargeSize = 8 * 1024, maxLargeSize = 8 * 1024 * 1024, // Cache memory up to 1TB (or 2GB for 32-bit arch), but sieve objects from the special threshold maxHugeSize = tbb::detail::select_size_t_constant<2147483648U, 1099511627776ULL>::value; public: // Upper bound threshold for caching size. After that size all objects sieve through cache // By default - 64MB, previous value was 129MB (needed by some Intel(R) Math Kernel Library (Intel(R) MKL) benchmarks) static const size_t defaultMaxHugeSize = 64UL * 1024UL * 1024UL; // After that size large object interpreted as huge and does not participate in regular cleanup. // Can be changed during the program execution. 
size_t hugeSizeThreshold; private: // Large objects cache properties typedef LargeBinStructureProps LargeBSProps; typedef LargeObjectCacheProps LargeCacheTypeProps; // Huge objects cache properties typedef HugeBinStructureProps HugeBSProps; typedef LargeObjectCacheProps HugeCacheTypeProps; // Cache implementation type with properties typedef LargeObjectCacheImpl< LargeCacheTypeProps > LargeCacheType; typedef LargeObjectCacheImpl< HugeCacheTypeProps > HugeCacheType; // Beginning of largeCache is more actively used and smaller than hugeCache, // so put hugeCache first to prevent false sharing // with LargeObjectCache's predecessor HugeCacheType hugeCache; LargeCacheType largeCache; /* logical time, incremented on each put/get operation To prevent starvation between pools, keep separately for each pool. Overflow is OK, as we only want difference between its current value and some recent. Both malloc and free should increment logical time, as in a different case multiple cached blocks would have same age, and accuracy of predictors suffers. */ std::atomic cacheCurrTime; // Memory pool that owns this LargeObjectCache. 
// strict 1:1 relation, never changed ExtMemoryPool *extMemPool; // Returns artificial bin index, // it's used only during sorting and never saved static int sizeToIdx(size_t size); // Our friends friend class Backend; public: void init(ExtMemoryPool *memPool); // Item accessors void put(LargeMemoryBlock *largeBlock); void putList(LargeMemoryBlock *head); LargeMemoryBlock *get(size_t size); void updateCacheState(DecreaseOrIncrease op, size_t size); bool isCleanupNeededOnRange(uintptr_t range, uintptr_t currTime); // Cleanup operations bool doCleanup(uintptr_t currTime, bool doThreshDecr); bool decreasingCleanup(); bool regularCleanup(); bool cleanAll(); void reset(); void reportStat(FILE *f); #if __TBB_MALLOC_WHITEBOX_TEST size_t getLOCSize() const; size_t getUsedSize() const; #endif // Cache deals with exact-fit sizes, so need to align each size // to the specific bin when put object to cache static size_t alignToBin(size_t size); void setHugeSizeThreshold(size_t value); // Check if we should cache or sieve this size bool sizeInCacheRange(size_t size); uintptr_t getCurrTimeRange(uintptr_t range); void registerRealloc(size_t oldSize, size_t newSize); }; #endif // __TBB_large_objects_H ================================================ FILE: third-party/tbb/src/tbbmalloc/shared_utils.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifndef __TBB_shared_utils_H #define __TBB_shared_utils_H // Include files containing declarations of intptr_t and uintptr_t #include // size_t #if _MSC_VER typedef unsigned __int16 uint16_t; typedef unsigned __int32 uint32_t; typedef unsigned __int64 uint64_t; #if !UINTPTR_MAX #define UINTPTR_MAX SIZE_MAX #endif #else // _MSC_VER #include #endif /* * Functions to align an integer down or up to the given power of two, * and test for such an alignment, and for power of two. */ template static inline T alignDown(T arg, uintptr_t alignment) { return T( (uintptr_t)arg & ~(alignment-1)); } template static inline T alignUp (T arg, uintptr_t alignment) { return T(((uintptr_t)arg+(alignment-1)) & ~(alignment-1)); // /*is this better?*/ return (((uintptr_t)arg-1) | (alignment-1)) + 1; } template // works for not power-of-2 alignments static inline T alignUpGeneric(T arg, uintptr_t alignment) { if (size_t rem = arg % alignment) { arg += alignment - rem; } return arg; } /* * Compile time Log2 calculation */ template struct Log2 { static const int value = 1 + Log2<(NUM >> 1)>::value; }; template <> struct Log2<1> { static const int value = 0; }; #if defined(min) #undef min #endif template T min ( const T& val1, const T& val2 ) { return val1 < val2 ? val1 : val2; } /* * Functions to parse files information (system files for example) */ #include #if defined(_MSC_VER) && (_MSC_VER<1900) && !defined(__INTEL_COMPILER) // Suppress overzealous compiler warnings that default ctor and assignment // operator cannot be generated and object 'class' can never be instantiated. 
#pragma warning(push) #pragma warning(disable:4510 4512 4610) #endif #if __SUNPRO_CC // Suppress overzealous compiler warnings that a class with a reference member // lacks a user-defined constructor, which can lead to errors #pragma error_messages (off, refmemnoconstr) #endif // TODO: add a constructor to remove warnings suppression struct parseFileItem { const char* format; long long& value; }; #if defined(_MSC_VER) && (_MSC_VER<1900) && !defined(__INTEL_COMPILER) #pragma warning(pop) #endif #if __SUNPRO_CC #pragma error_messages (on, refmemnoconstr) #endif template void parseFile(const char* file, const parseFileItem (&items)[N]) { // Tries to find all items in each line int found[N] = { 0 }; // If all items found, stop forward file reading int numFound = 0; // Line storage char buf[BUF_LINE_SIZE]; if (FILE *f = fopen(file, "r")) { while (numFound < N && fgets(buf, BUF_LINE_SIZE, f)) { for (int i = 0; i < N; ++i) { if (!found[i] && 1 == sscanf(buf, items[i].format, &items[i].value)) { ++numFound; found[i] = 1; } } } fclose(f); } } namespace rml { namespace internal { /* * Best estimate of cache line size, for the purpose of avoiding false sharing. * Too high causes memory overhead, too low causes false-sharing overhead. * Because, e.g., 32-bit code might run on a 64-bit system with a larger cache line size, * it would probably be better to probe at runtime where possible and/or allow for an environment variable override, * but currently this is still used for compile-time layout of class Block, so the change is not entirely trivial. 
*/ #if __powerpc64__ || __ppc64__ || __bgp__ const uint32_t estimatedCacheLineSize = 128; #else const uint32_t estimatedCacheLineSize = 64; #endif } // namespace internal } // namespace rml #endif /* __TBB_shared_utils_H */ ================================================ FILE: third-party/tbb/src/tbbmalloc/tbbmalloc.cpp ================================================ /* Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "TypeDefinitions.h" // Customize.h and proxy.h get included #include "tbbmalloc_internal_api.h" #include "../tbb/assert_impl.h" // Out-of-line TBB assertion handling routines are instantiated here. #include "oneapi/tbb/version.h" #include "oneapi/tbb/scalable_allocator.h" #undef UNICODE #if USE_PTHREAD #include // dlopen #elif USE_WINTHREAD #include #endif namespace rml { namespace internal { #if TBB_USE_DEBUG #define DEBUG_SUFFIX "_debug" #else #define DEBUG_SUFFIX #endif /* TBB_USE_DEBUG */ // MALLOCLIB_NAME is the name of the oneTBB memory allocator library. 
#if _WIN32||_WIN64 #define MALLOCLIB_NAME "tbbmalloc" DEBUG_SUFFIX ".dll" #elif __APPLE__ #define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX ".2.dylib" #elif __FreeBSD__ || __NetBSD__ || __OpenBSD__ || __sun || _AIX || __ANDROID__ #define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX ".so" #elif __unix__ #define MALLOCLIB_NAME "libtbbmalloc" DEBUG_SUFFIX __TBB_STRING(.so.2) #else #error Unknown OS #endif void init_tbbmalloc() { #if __TBB_USE_ITT_NOTIFY MallocInitializeITT(); #endif /* Preventing TBB allocator library from unloading to prevent resource leak, as memory is not released on the library unload. */ #if USE_WINTHREAD && !__TBB_SOURCE_DIRECTLY_INCLUDED && !__TBB_WIN8UI_SUPPORT // Prevent Windows from displaying message boxes if it fails to load library UINT prev_mode = SetErrorMode (SEM_FAILCRITICALERRORS); HMODULE lib; BOOL ret = GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |GET_MODULE_HANDLE_EX_FLAG_PIN, (LPCTSTR)&scalable_malloc, &lib); MALLOC_ASSERT(lib && ret, "Allocator can't find itself."); tbb::detail::suppress_unused_warning(ret); SetErrorMode (prev_mode); #endif /* USE_PTHREAD && !__TBB_SOURCE_DIRECTLY_INCLUDED */ } #if !__TBB_SOURCE_DIRECTLY_INCLUDED #if USE_WINTHREAD extern "C" BOOL WINAPI DllMain( HINSTANCE /*hInst*/, DWORD callReason, LPVOID lpvReserved) { if (callReason==DLL_THREAD_DETACH) { __TBB_mallocThreadShutdownNotification(); } else if (callReason==DLL_PROCESS_DETACH) { __TBB_mallocProcessShutdownNotification(lpvReserved != nullptr); } return TRUE; } #else /* !USE_WINTHREAD */ struct RegisterProcessShutdownNotification { // Work around non-reentrancy in dlopen() on Android RegisterProcessShutdownNotification() { // prevents unloading, POSIX case // We need better support for the library pinning // when dlopen can't find TBBmalloc library. 
// for example: void *ret = dlopen(MALLOCLIB_NAME, RTLD_NOW); // MALLOC_ASSERT(ret, "Allocator can't load itself."); dlopen(MALLOCLIB_NAME, RTLD_NOW); } RegisterProcessShutdownNotification(RegisterProcessShutdownNotification&) = delete; RegisterProcessShutdownNotification& operator=(const RegisterProcessShutdownNotification&) = delete; ~RegisterProcessShutdownNotification() { __TBB_mallocProcessShutdownNotification(false); } }; static RegisterProcessShutdownNotification reg; #endif /* !USE_WINTHREAD */ #endif /* !__TBB_SOURCE_DIRECTLY_INCLUDED */ } } // namespaces ================================================ FILE: third-party/tbb/src/tbbmalloc/tbbmalloc.rc ================================================ // Copyright (c) 2005-2025 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
///////////////////////////////////////////////////////////////////////////// // // Includes // #include #include "../../include/oneapi/tbb/version.h" ///////////////////////////////////////////////////////////////////////////// // Neutral resources #ifdef _WIN32 LANGUAGE LANG_NEUTRAL, SUBLANG_NEUTRAL #pragma code_page(1252) #endif //_WIN32 ///////////////////////////////////////////////////////////////////////////// // // Version // #define TBB_VERNUMBERS TBB_VERSION_MAJOR,TBB_VERSION_MINOR,TBB_VERSION_PATCH #define TBB_VERSION TBB_VERSION_STRING VS_VERSION_INFO VERSIONINFO FILEVERSION TBB_VERNUMBERS PRODUCTVERSION TBB_VERNUMBERS FILEFLAGSMASK 0x17L #ifdef _DEBUG FILEFLAGS 0x1L #else FILEFLAGS 0x0L #endif FILEOS 0x40004L FILETYPE 0x2L FILESUBTYPE 0x0L BEGIN BLOCK "StringFileInfo" BEGIN BLOCK "000004b0" BEGIN VALUE "CompanyName", "Intel Corporation\0" VALUE "FileDescription", "oneAPI Threading Building Blocks (oneTBB) library\0" VALUE "FileVersion", TBB_VERSION "\0" VALUE "LegalCopyright", "Copyright 2005-2025 Intel Corporation. All Rights Reserved.\0" VALUE "LegalTrademarks", "\0" #ifndef TBB_USE_DEBUG VALUE "OriginalFilename", "tbbmalloc.dll\0" #else VALUE "OriginalFilename", "tbbmalloc_debug.dll\0" #endif VALUE "ProductName", "oneAPI Threading Building Blocks (oneTBB)\0" VALUE "ProductVersion", TBB_VERSION "\0" VALUE "PrivateBuild", "\0" VALUE "SpecialBuild", "\0" END END BLOCK "VarFileInfo" BEGIN VALUE "Translation", 0x0, 1200 END END ================================================ FILE: third-party/tbb/src/tbbmalloc/tbbmalloc_internal.h ================================================ /* Copyright (c) 2005-2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_tbbmalloc_internal_H #define __TBB_tbbmalloc_internal_H #include "TypeDefinitions.h" /* Also includes customization layer Customize.h */ #if USE_PTHREAD // Some pthreads documentation says that must be first header. #include typedef pthread_key_t tls_key_t; #elif USE_WINTHREAD #include typedef DWORD tls_key_t; #else #error Must define USE_PTHREAD or USE_WINTHREAD #endif #include // TODO: *BSD also has it #define BACKEND_HAS_MREMAP __linux__ #define CHECK_ALLOCATION_RANGE MALLOC_DEBUG || MALLOC_ZONE_OVERLOAD_ENABLED || MALLOC_UNIXLIKE_OVERLOAD_ENABLED #include "oneapi/tbb/detail/_config.h" // for __TBB_LIBSTDCPP_EXCEPTION_HEADERS_BROKEN #include "oneapi/tbb/detail/_template_helpers.h" #if __TBB_LIBSTDCPP_EXCEPTION_HEADERS_BROKEN #define _EXCEPTION_PTR_H /* prevents exception_ptr.h inclusion */ #define _GLIBCXX_NESTED_EXCEPTION_H /* prevents nested_exception.h inclusion */ #endif #include #include #include // for CHAR_BIT #include // for memset #if MALLOC_CHECK_RECURSION #include /* for placement new */ #endif #include "oneapi/tbb/scalable_allocator.h" #include "tbbmalloc_internal_api.h" /********* Various compile-time options **************/ #if !__TBB_DEFINE_MIC && __TBB_MIC_NATIVE #error Intel(R) Many Integrated Core Compiler does not define __MIC__ anymore. 
#endif #define MALLOC_TRACE 0 #if MALLOC_TRACE #define TRACEF(x) printf x #else #define TRACEF(x) ((void)0) #endif /* MALLOC_TRACE */ #define ASSERT_TEXT nullptr #define COLLECT_STATISTICS ( MALLOC_DEBUG && MALLOCENV_COLLECT_STATISTICS ) #ifndef USE_INTERNAL_TID #define USE_INTERNAL_TID COLLECT_STATISTICS || MALLOC_TRACE #endif #include "Statistics.h" // call yield for whitebox testing, skip in real library #ifndef WhiteboxTestingYield #define WhiteboxTestingYield() ((void)0) #endif /********* End compile-time options **************/ namespace rml { namespace internal { #if __TBB_MALLOC_LOCACHE_STAT extern intptr_t mallocCalls, cacheHits; extern intptr_t memAllocKB, memHitKB; #endif //! Utility template function to prevent "unused" warnings by various compilers. template void suppress_unused_warning( const T& ) {} /********** Various global default constants ********/ /* * Default huge page size */ #if defined __loongarch64 static const size_t HUGE_PAGE_SIZE = 32 * 1024 * 1024; #else static const size_t HUGE_PAGE_SIZE = 2 * 1024 * 1024; #endif /********** End of global default constants *********/ /********** Various numeric parameters controlling allocations ********/ /* * slabSize - the size of a block for allocation of small objects, * it must be larger than maxSegregatedObjectSize. */ const uintptr_t slabSize = 16*1024; /* * Large blocks cache cleanup frequency. * It should be power of 2 for the fast checking. */ const unsigned cacheCleanupFreq = 256; /* * Alignment of large (>= minLargeObjectSize) objects. */ const size_t largeObjectAlignment = estimatedCacheLineSize; /* * This number of bins in the TLS that leads to blocks that we can allocate in. 
*/ const uint32_t numBlockBinLimit = 31; /********** End of numeric parameters controlling allocations *********/ class BlockI; class Block; struct LargeMemoryBlock; struct ExtMemoryPool; struct MemRegion; class FreeBlock; class TLSData; class Backend; class MemoryPool; struct CacheBinOperation; extern const uint32_t minLargeObjectSize; enum DecreaseOrIncrease { decrease, increase }; class TLSKey { tls_key_t TLS_pointer_key; public: bool init(); bool destroy(); TLSData* getThreadMallocTLS() const; void setThreadMallocTLS( TLSData * newvalue ); TLSData* createTLS(MemoryPool *memPool, Backend *backend); }; template inline void AtomicUpdate(std::atomic& location, Arg newVal, const Compare &cmp) { static_assert(sizeof(Arg) == sizeof(intptr_t), "Type of argument must match AtomicCompareExchange type."); Arg old = location.load(std::memory_order_acquire); for (; cmp(old, newVal); ) { if (location.compare_exchange_strong(old, newVal)) break; // TODO: do we need backoff after unsuccessful CAS? //old = val; } } // TODO: make BitMaskBasic more general // TODO: check that BitMaskBasic is not used for synchronization // (currently, it fits BitMaskMin well, but not as suitable for BitMaskMax) template class BitMaskBasic { static const unsigned SZ = (NUM-1)/(CHAR_BIT*sizeof(uintptr_t))+1; static const unsigned WORD_LEN = CHAR_BIT*sizeof(uintptr_t); std::atomic mask[SZ]; protected: void set(size_t idx, bool val) { MALLOC_ASSERT(idx class BitMaskMin : public BitMaskBasic { public: void set(size_t idx, bool val) { BitMaskBasic::set(idx, val); } int getMinTrue(unsigned startIdx) const { return BitMaskBasic::getMinTrue(startIdx); } }; template class BitMaskMax : public BitMaskBasic { public: void set(size_t idx, bool val) { MALLOC_ASSERT(NUM >= idx + 1, ASSERT_TEXT); BitMaskBasic::set(NUM - 1 - idx, val); } int getMaxTrue(unsigned startIdx) const { MALLOC_ASSERT(NUM >= startIdx + 1, ASSERT_TEXT); int p = BitMaskBasic::getMinTrue(NUM-startIdx-1); return -1==p? 
-1 : (int)NUM - 1 - p; } }; // The part of thread-specific data that can be modified by other threads. // Such modifications must be protected by AllLocalCaches::listLock. struct TLSRemote { TLSRemote *next, *prev; }; // The list of all thread-local data; supporting cleanup of thread caches class AllLocalCaches { TLSRemote *head; MallocMutex listLock; // protects operations in the list public: void registerThread(TLSRemote *tls); void unregisterThread(TLSRemote *tls); bool cleanup(bool cleanOnlyUnused); void markUnused(); void reset() { head = nullptr; } }; class LifoList { public: inline LifoList(); inline void push(Block *block); inline Block *pop(); inline Block *grab(); private: std::atomic top; MallocMutex lock; }; /* * When a block that is not completely free is returned for reuse by other threads * this is where the block goes. * * LifoList assumes zero initialization; so below its constructors are omitted, * to avoid linking with C++ libraries on Linux. */ class OrphanedBlocks { LifoList bins[numBlockBinLimit]; public: Block *get(TLSData *tls, unsigned int size); void put(intptr_t binTag, Block *block); void reset(); bool cleanup(Backend* backend); }; /* Large objects entities */ #include "large_objects.h" // select index size for BackRefMain based on word size: default is uint32_t, // uint16_t for 32-bit platforms template struct MainIndexSelect { typedef uint32_t main_type; }; template<> struct MainIndexSelect { typedef uint16_t main_type; }; class BackRefIdx { // composite index to backreference array public: typedef MainIndexSelect<4 < sizeof(uintptr_t)>::main_type main_t; private: static const main_t invalid = ~main_t(0); main_t main; // index in BackRefMain uint16_t largeObj:1; // is this object "large"? 
uint16_t offset :15; // offset from beginning of BackRefBlock public: BackRefIdx() : main(invalid), largeObj(0), offset(0) {} bool isInvalid() const { return main == invalid; } bool isLargeObject() const { return largeObj; } main_t getMain() const { return main; } uint16_t getOffset() const { return offset; } #if __TBB_USE_THREAD_SANITIZER friend __attribute__((no_sanitize("thread"))) BackRefIdx dereference(const BackRefIdx* ptr) { BackRefIdx idx; idx.main = ptr->main; idx.largeObj = ptr->largeObj; idx.offset = ptr->offset; return idx; } #else friend BackRefIdx dereference(const BackRefIdx* ptr) { return *ptr; } #endif // only newBackRef can modify BackRefIdx static BackRefIdx newBackRef(bool largeObj); }; // Block header is used during block coalescing // and must be preserved in used blocks. class BlockI { #if __clang__ && !__INTEL_COMPILER #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wunused-private-field" #endif intptr_t blockState[2]; #if __clang__ && !__INTEL_COMPILER #pragma clang diagnostic pop // "-Wunused-private-field" #endif }; struct LargeMemoryBlock : public BlockI { MemoryPool *pool; // owner pool LargeMemoryBlock *next, // ptrs in list of cached blocks *prev, // 2-linked list of pool's large objects // Used to destroy backrefs on pool destroy (backrefs are global) // and for object releasing during pool reset. *gPrev, *gNext; uintptr_t age; // age of block while in cache size_t objectSize; // the size requested by a client size_t unalignedSize; // the size requested from backend BackRefIdx backRefIdx; // cached here, used copy is in LargeObjectHdr }; // Classes and methods for backend.cpp #include "backend.h" // An TBB allocator mode that can be controlled by user // via API/environment variable. Must be placed in zero-initialized memory. // External synchronization assumed. 
// TODO: TBB_VERSION support class AllocControlledMode { intptr_t val; bool setDone; public: intptr_t get() const { MALLOC_ASSERT(setDone, ASSERT_TEXT); return val; } // Note: set() can be called before init() void set(intptr_t newVal) { val = newVal; setDone = true; } bool ready() const { return setDone; } // envName - environment variable to get controlled mode void initReadEnv(const char *envName, intptr_t defaultVal) { if (!setDone) { // unreferenced formal parameter warning tbb::detail::suppress_unused_warning(envName); #if !__TBB_WIN8UI_SUPPORT // TODO: use strtol to get the actual value of the envirable const char *envVal = getenv(envName); if (envVal && !strcmp(envVal, "1")) val = 1; else #endif val = defaultVal; setDone = true; } } }; // Page type to be used inside MapMemory. // Regular (4KB aligned), Huge and Transparent Huge Pages (2MB aligned). enum PageType { REGULAR = 0, PREALLOCATED_HUGE_PAGE, TRANSPARENT_HUGE_PAGE }; // init() and printStatus() is called only under global initialization lock. // Race is possible between registerAllocation() and registerReleasing(), // harm is that up to single huge page releasing is missed (because failure // to get huge page is registered only 1st time), that is negligible. // setMode is also can be called concurrently. // Object must reside in zero-initialized memory // TODO: can we check for huge page presence during every 10th mmap() call // in case huge page is released by another process? class HugePagesStatus { private: AllocControlledMode requestedMode; // changed only by user // to keep enabled and requestedMode consistent MallocMutex setModeLock; size_t pageSize; std::atomic needActualStatusPrint; static void doPrintStatus(bool state, const char *stateName) { // Under macOS* fprintf/snprintf acquires an internal lock, so when // 1st allocation is done under the lock, we got a deadlock. // Do not use fprintf etc during initialization. 
fputs("TBBmalloc: huge pages\t", stderr); if (!state) fputs("not ", stderr); fputs(stateName, stderr); fputs("\n", stderr); } void parseSystemMemInfo() { bool hpAvailable = false; bool thpAvailable = false; long long hugePageSize = -1; #if __unix__ // Check huge pages existence long long meminfoHugePagesTotal = 0; parseFileItem meminfoItems[] = { // Parse system huge page size { "Hugepagesize: %lld kB", hugePageSize }, // Check if there are preallocated huge pages on the system // https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt { "HugePages_Total: %lld", meminfoHugePagesTotal } }; parseFile("/proc/meminfo", meminfoItems); // Double check another system information regarding preallocated // huge pages if there are no information in /proc/meminfo long long vmHugePagesTotal = 0; parseFileItem vmItem[] = { { "%lld", vmHugePagesTotal } }; // We parse a counter number, it can't be huge parseFile("/proc/sys/vm/nr_hugepages", vmItem); if (hugePageSize > -1 && (meminfoHugePagesTotal > 0 || vmHugePagesTotal > 0)) { MALLOC_ASSERT(hugePageSize != 0, "Huge Page size can't be zero if we found preallocated."); // Any non zero value clearly states that there are preallocated // huge pages on the system hpAvailable = true; } // Check if there is transparent huge pages support on the system long long thpPresent = 'n'; parseFileItem thpItem[] = { { "[alwa%cs] madvise never\n", thpPresent } }; parseFile("/sys/kernel/mm/transparent_hugepage/enabled", thpItem); if (hugePageSize > -1 && thpPresent == 'y') { MALLOC_ASSERT(hugePageSize != 0, "Huge Page size can't be zero if we found thp existence."); thpAvailable = true; } #endif MALLOC_ASSERT(!pageSize, "Huge page size can't be set twice. 
Double initialization."); // Initialize object variables if (hugePageSize > -1) { pageSize = hugePageSize * 1024; // was read in KB from meminfo } else { pageSize = 0; } isHPAvailable = hpAvailable; isTHPAvailable = thpAvailable; } public: // System information bool isHPAvailable; bool isTHPAvailable; // User defined value bool isEnabled; void init() { parseSystemMemInfo(); MallocMutex::scoped_lock lock(setModeLock); requestedMode.initReadEnv("TBB_MALLOC_USE_HUGE_PAGES", 0); isEnabled = (isHPAvailable || isTHPAvailable) && requestedMode.get(); } // Could be set from user code at any place. // If we didn't call init() at this place, isEnabled will be false void setMode(intptr_t newVal) { MallocMutex::scoped_lock lock(setModeLock); requestedMode.set(newVal); isEnabled = (isHPAvailable || isTHPAvailable) && newVal; } void reset() { needActualStatusPrint.store(0, std::memory_order_relaxed); pageSize = 0; isEnabled = isHPAvailable = isTHPAvailable = false; } // If memory mapping size is a multiple of huge page size, some OS kernels // can use huge pages transparently. Use this when huge pages are requested. size_t getGranularity() const { if (requestedMode.ready()) return requestedMode.get() ? pageSize : 0; else return HUGE_PAGE_SIZE; // the mode is not yet known; assume typical 2MB huge pages } void printStatus() { doPrintStatus(requestedMode.get(), "requested"); if (requestedMode.get()) { // report actual status iff requested if (pageSize) needActualStatusPrint.store(1, std::memory_order_release); else doPrintStatus(/*state=*/false, "available"); } } }; class AllLargeBlocksList { MallocMutex largeObjLock; LargeMemoryBlock *loHead; public: void add(LargeMemoryBlock *lmb); void remove(LargeMemoryBlock *lmb); template void releaseAll(Backend *backend); }; struct ExtMemoryPool { Backend backend; LargeObjectCache loc; AllLocalCaches allLocalCaches; OrphanedBlocks orphanedBlocks; intptr_t poolId; // To find all large objects. 
Used during user pool destruction, // to release all backreferences in large blocks (slab blocks do not have them). AllLargeBlocksList lmbList; // Callbacks to be used instead of MapMemory/UnmapMemory. rawAllocType rawAlloc; rawFreeType rawFree; size_t granularity; bool keepAllMemory, delayRegsReleasing, // TODO: implements fixedPool with calling rawFree on destruction fixedPool; TLSKey tlsPointerKey; // per-pool TLS key std::atomic softCachesCleanupInProgress; std::atomic hardCachesCleanupInProgress; bool init(intptr_t poolId, rawAllocType rawAlloc, rawFreeType rawFree, size_t granularity, bool keepAllMemory, bool fixedPool); bool initTLS(); // i.e., not system default pool for scalable_malloc/scalable_free bool userPool() const { return rawAlloc; } // true if something has been released bool softCachesCleanup(); bool releaseAllLocalCaches(); bool hardCachesCleanup(bool wait); void *remap(void *ptr, size_t oldSize, size_t newSize, size_t alignment); bool reset() { loc.reset(); allLocalCaches.reset(); orphanedBlocks.reset(); bool ret = tlsPointerKey.destroy(); backend.reset(); return ret; } bool destroy() { MALLOC_ASSERT(isPoolValid(), "Possible double pool_destroy or heap corruption"); if (!userPool()) { loc.reset(); allLocalCaches.reset(); } // pthread_key_dtors must be disabled before memory unmapping // TODO: race-free solution bool ret = tlsPointerKey.destroy(); if (rawFree || !userPool()) ret &= backend.destroy(); // pool is not valid after this point granularity = 0; return ret; } void delayRegionsReleasing(bool mode) { delayRegsReleasing = mode; } inline bool regionsAreReleaseable() const; LargeMemoryBlock *mallocLargeObject(MemoryPool *pool, size_t allocationSize); void freeLargeObject(LargeMemoryBlock *lmb); void freeLargeObjectList(LargeMemoryBlock *head); #if MALLOC_DEBUG // use granulatity as marker for pool validity bool isPoolValid() const { return granularity; } #endif }; inline bool Backend::inUserPool() const { return extMemPool->userPool(); } 
/******* A helper class to support overriding malloc with scalable_malloc *******/
#if MALLOC_CHECK_RECURSION

// Scoped guard: while an instance is alive it holds rmc_mutex and publishes the
// address of its own storage (autoObjPtr) and, when allowed, the owning thread
// id, so that sameThreadActive() can tell whether the calling thread is already
// inside a protected region.
// NOTE(review): the std::atomic members below carry no template argument in
// this copy of the file; it looks like the argument was lost -- confirm against
// the upstream header.
class RecursiveMallocCallProtector {
    // pointer to an automatic data of holding thread
    static std::atomic autoObjPtr;
    static MallocMutex rmc_mutex;
    static std::atomic owner_thread;
/* Under FreeBSD 8.0 1st call to any pthread function including pthread_self
   leads to pthread initialization, that causes malloc calls. As 1st usage of
   RecursiveMallocCallProtector can be before pthread initialized, pthread calls
   can't be used in 1st instance of RecursiveMallocCallProtector.
   RecursiveMallocCallProtector is used 1st time in checkInitialization(),
   so there is a guarantee that on 2nd usage pthread is initialized.
   No such situation observed with other supported OSes.
 */
#if __FreeBSD__
    static bool canUsePthread;
#else
    static const bool canUsePthread = true;
#endif
/*
  The variable modified in checkInitialization,
  so can be read without memory barriers.
 */
    static bool mallocRecursionDetected;

    // Points into scoped_lock_space once the lock has been constructed there.
    MallocMutex::scoped_lock* lock_acquired;
    // Raw storage for the placement-new'ed scoped_lock.
    // NOTE(review): the buffer has no explicit alignas; presumably it is
    // suitably aligned because it follows a pointer member -- confirm.
    char scoped_lock_space[sizeof(MallocMutex::scoped_lock)+1];

public:

    // Acquires rmc_mutex by constructing a scoped_lock in scoped_lock_space,
    // then records the owner thread (if pthread may be used) and publishes the
    // storage address through autoObjPtr.
    RecursiveMallocCallProtector() : lock_acquired(nullptr) {
        lock_acquired = new (scoped_lock_space) MallocMutex::scoped_lock( rmc_mutex );
        if (canUsePthread)
            owner_thread.store(pthread_self(), std::memory_order_relaxed);
        autoObjPtr.store(&scoped_lock_space, std::memory_order_relaxed);
    }

    RecursiveMallocCallProtector(RecursiveMallocCallProtector&) = delete;
    RecursiveMallocCallProtector& operator=(RecursiveMallocCallProtector) = delete;

    // Clears autoObjPtr before releasing the mutex by running the lock's
    // destructor explicitly (it was created with placement new).
    ~RecursiveMallocCallProtector() {
        if (lock_acquired) {
            autoObjPtr.store(nullptr, std::memory_order_relaxed);
            lock_acquired->~scoped_lock();
        }
    }

    // Returns true when the calling thread is the one currently holding a
    // protector. As a side effect, the exact (pthread-based) positive answer
    // sets mallocRecursionDetected.
    static bool sameThreadActive() {
        if (!autoObjPtr.load(std::memory_order_relaxed)) // fast path
            return false;
        // Some thread has an active recursive call protector; check if the current one.
        // Exact pthread_self based test
        if (canUsePthread) {
            if (pthread_equal( owner_thread.load(std::memory_order_relaxed), pthread_self() )) {
                mallocRecursionDetected = true;
                return true;
            } else
                return false;
        }
        // inexact stack size based test: treat the caller as the owner when the
        // address of a local variable lies within 2MB of the published
        // protector storage
        const uintptr_t threadStackSz = 2*1024*1024;
        int dummy;

        uintptr_t xi = (uintptr_t)autoObjPtr.load(std::memory_order_relaxed), yi = (uintptr_t)&dummy;
        uintptr_t diffPtr = xi > yi ? xi - yi : yi - xi;

        return diffPtr < threadStackSz;
    }

/* The function is called on 1st scalable_malloc call to check if malloc calls
   scalable_malloc (nested call must set mallocRecursionDetected). */
    static void detectNaiveOverload() {
        if (!malloc_proxy) {
#if __FreeBSD__
/* If !canUsePthread, we can't call pthread_self() before, but now pthread
   is already on, so can do it. */
            if (!canUsePthread) {
                canUsePthread = true;
                owner_thread.store(pthread_self(), std::memory_order_relaxed);
            }
#endif
            // Probe call: a malloc that is routed back into scalable_malloc is
            // expected to be noticed by the nested sameThreadActive() check.
            free(malloc(1));
        }
    }
};

#else

// Recursion checking is disabled: the protector is an empty placeholder.
class RecursiveMallocCallProtector {
public:
    RecursiveMallocCallProtector() {}
    ~RecursiveMallocCallProtector() {}
};

#endif  /* MALLOC_CHECK_RECURSION */
*/ TBBMALLOC_INTERNAL_SOURCE_INCLUDED = 65536 } AllocationModeInternalParam; void MallocInitializeITT(); void __TBB_mallocProcessShutdownNotification(bool); #if _WIN32||_WIN64 void __TBB_mallocThreadShutdownNotification(); #endif #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ #endif /* __TBB_tbbmalloc_internal_api_H */ ================================================ FILE: third-party/tbb/src/tbbmalloc_proxy/CMakeLists.txt ================================================ # Copyright (c) 2020-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. if (NOT BUILD_SHARED_LIBS) return() endif() add_library(tbbmalloc_proxy function_replacement.cpp proxy.cpp) if (WIN32) target_sources(tbbmalloc_proxy PRIVATE tbbmalloc_proxy.rc) endif() add_library(TBB::tbbmalloc_proxy ALIAS tbbmalloc_proxy) target_compile_definitions(tbbmalloc_proxy PUBLIC $<$:TBB_USE_DEBUG> PRIVATE __TBBMALLOCPROXY_BUILD) target_include_directories(tbbmalloc_proxy PUBLIC $ $) if (NOT APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU") # gcc 5.0 and later have -Wno-sized-deallocation options set(TBB_WARNING_SUPPRESS ${TBB_WARNING_SUPPRESS} $<$>:-Wno-sized-deallocation>) endif() target_compile_options(tbbmalloc_proxy PRIVATE ${TBB_CXX_STD_FLAG} # TODO: consider making it PUBLIC. 
${TBB_MMD_FLAG} ${TBB_DSE_FLAG} ${TBB_WARNING_LEVEL} ${TBB_WARNING_SUPPRESS} ${TBB_LIB_COMPILE_FLAGS} ${TBB_COMMON_COMPILE_FLAGS} ) set_target_properties(tbbmalloc_proxy PROPERTIES VERSION ${TBBMALLOC_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION} SOVERSION ${TBBMALLOC_BINARY_VERSION}) if (UNIX AND NOT APPLE) # Avoid use of target_link_libraries here as it changes /DEF option to \DEF on Windows. set_target_properties(tbbmalloc_proxy PROPERTIES LINK_FLAGS "${TBB_LINK_DEF_FILE_FLAG}\"${CMAKE_CURRENT_SOURCE_DIR}/def/${TBB_DEF_FILE_PREFIX}-proxy.def\"" LINK_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/def/${TBB_DEF_FILE_PREFIX}-proxy.def" DEFINE_SYMBOL "") endif() # Prefer using target_link_options instead of target_link_libraries to specify link options because # target_link_libraries may incorrectly handle some options (on Windows, for example). if (COMMAND target_link_options) target_link_options(tbbmalloc_proxy PRIVATE ${TBB_LIB_LINK_FLAGS} ${TBB_COMMON_LINK_FLAGS} ) else() target_link_libraries(tbbmalloc_proxy PRIVATE ${TBB_LIB_LINK_FLAGS} ${TBB_COMMON_LINK_FLAGS} ) endif() target_link_libraries(tbbmalloc_proxy PRIVATE TBB::tbbmalloc Threads::Threads ${TBB_LIB_LINK_LIBS} ${TBB_COMMON_LINK_LIBS} ) if(TBB_BUILD_APPLE_FRAMEWORKS) set_target_properties(tbbmalloc_proxy PROPERTIES FRAMEWORK TRUE FRAMEWORK_VERSION ${TBBMALLOC_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION} XCODE_ATTRIBUTE_PRODUCT_BUNDLE_IDENTIFIER com.intel.tbbmalloc-proxy MACOSX_FRAMEWORK_IDENTIFIER com.intel.tbbmalloc-proxy MACOSX_FRAMEWORK_BUNDLE_VERSION ${TBBMALLOC_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION} MACOSX_FRAMEWORK_SHORT_VERSION_STRING ${TBBMALLOC_BINARY_VERSION}) endif() tbb_install_target(tbbmalloc_proxy) ================================================ FILE: third-party/tbb/src/tbbmalloc_proxy/def/lin32-proxy.def ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 
in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ { global: calloc; free; malloc; realloc; posix_memalign; memalign; aligned_alloc; valloc; pvalloc; mallinfo; mallopt; malloc_usable_size; __libc_malloc; __libc_realloc; __libc_calloc; __libc_free; __libc_memalign; __libc_pvalloc; __libc_valloc; __TBB_malloc_proxy; _ZdaPv; /* next ones are new/delete */ _ZdaPvRKSt9nothrow_t; _ZdlPv; _ZdlPvRKSt9nothrow_t; _Znaj; _ZnajRKSt9nothrow_t; _Znwj; _ZnwjRKSt9nothrow_t; local: /* TBB symbols */ *3rml8internal*; *3tbb*; *__TBB*; }; ================================================ FILE: third-party/tbb/src/tbbmalloc_proxy/def/lin64-proxy.def ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ { global: calloc; free; malloc; realloc; posix_memalign; memalign; aligned_alloc; valloc; pvalloc; mallinfo; mallopt; malloc_usable_size; __libc_malloc; __libc_realloc; __libc_calloc; __libc_free; __libc_memalign; __libc_pvalloc; __libc_valloc; __TBB_malloc_proxy; _ZdaPv; /* next ones are new/delete */ _ZdaPvRKSt9nothrow_t; _ZdlPv; _ZdlPvRKSt9nothrow_t; _Znam; _ZnamRKSt9nothrow_t; _Znwm; _ZnwmRKSt9nothrow_t; local: /* TBB symbols */ *3rml8internal*; *3tbb*; *__TBB*; }; ================================================ FILE: third-party/tbb/src/tbbmalloc_proxy/function_replacement.cpp ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "oneapi/tbb/detail/_config.h" #include "oneapi/tbb/detail/_assert.h" #include "../tbb/assert_impl.h" #if !__TBB_WIN8UI_SUPPORT && defined(_WIN32) #ifndef _CRT_SECURE_NO_DEPRECATE #define _CRT_SECURE_NO_DEPRECATE 1 #endif // no standard-conforming implementation of snprintf prior to VS 2015 #if !defined(_MSC_VER) || _MSC_VER>=1900 #define LOG_PRINT(s, n, format, ...) snprintf(s, n, format, __VA_ARGS__) #else #define LOG_PRINT(s, n, format, ...) 
_snprintf_s(s, n, _TRUNCATE, format, __VA_ARGS__) #endif #include #include #include #include #include "function_replacement.h" // The information about a standard memory allocation function for the replacement log struct FunctionInfo { const char* funcName; const char* dllName; }; // Namespace that processes and manages the output of records to the Log journal // that will be provided to user by TBB_malloc_replacement_log() namespace Log { // Value of RECORDS_COUNT is set due to the fact that we maximally // scan 8 modules, and in every module we can swap 6 opcodes. (rounded to 8) static const unsigned RECORDS_COUNT = 8 * 8; static const unsigned RECORD_LENGTH = MAX_PATH; // Need to add 1 to count of records, because last record must be always nullptr static char *records[RECORDS_COUNT + 1]; static bool replacement_status = true; // Internal counter that contains number of next string for record static unsigned record_number = 0; // Function that writes info about (not)found opcodes to the Log journal // functionInfo - information about a standard memory allocation function for the replacement log // opcodeString - string, that contain byte code of this function // status - information about function replacement status static void record(FunctionInfo functionInfo, const char * opcodeString, bool status) { __TBB_ASSERT(functionInfo.dllName, "Empty DLL name value"); __TBB_ASSERT(functionInfo.funcName, "Empty function name value"); __TBB_ASSERT(opcodeString, "Empty opcode"); __TBB_ASSERT(record_number <= RECORDS_COUNT, "Incorrect record number"); //If some replacement failed -> set status to false replacement_status &= status; // If we reach the end of the log, write this message to the last line if (record_number == RECORDS_COUNT) { // %s - workaround to fix empty variable argument parsing behavior in GCC LOG_PRINT(records[RECORDS_COUNT - 1], RECORD_LENGTH, "%s", "Log was truncated."); return; } char* entry = (char*)HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, 
RECORD_LENGTH); __TBB_ASSERT(entry, "Invalid memory was returned"); LOG_PRINT(entry, RECORD_LENGTH, "%s: %s (%s), byte pattern: <%s>", status ? "Success" : "Fail", functionInfo.funcName, functionInfo.dllName, opcodeString); records[record_number++] = entry; } }; inline UINT_PTR Ptr2Addrint(LPVOID ptr) { Int2Ptr i2p; i2p.lpv = ptr; return i2p.uip; } inline LPVOID Addrint2Ptr(UINT_PTR ptr) { Int2Ptr i2p; i2p.uip = ptr; return i2p.lpv; } // Is the distance between addr1 and addr2 smaller than dist inline bool IsInDistance(UINT_PTR addr1, UINT_PTR addr2, __int64 dist) { __int64 diff = addr1>addr2 ? addr1-addr2 : addr2-addr1; return diff= m_allocSize) { // Found a free region, try to allocate a page in this region void *newPage = VirtualAlloc(newAddr, m_allocSize, MEM_COMMIT|MEM_RESERVE, PAGE_READWRITE); if (!newPage) break; // Add the new page to the pages database MemoryBuffer *pBuff = new (m_lastBuffer) MemoryBuffer(newPage, m_allocSize); ++m_lastBuffer; return pBuff; } } // Failed to find a buffer in the distance return 0; } public: MemoryProvider() { SYSTEM_INFO sysInfo; GetSystemInfo(&sysInfo); m_allocSize = sysInfo.dwAllocationGranularity; m_lastBuffer = &m_pages[0]; } // We can't free the pages in the destructor because the trampolines // are using these memory locations and a replaced function might be called // after the destructor was called. 
// Match a dictionary pattern (str1) against the opcode text taken from code (str2).
// Positions where the pattern holds '*' or '#' are skipped without looking at
// str2; every other position must be equal. Only as many characters as the
// pattern contains are examined.
// RETURN: 0 if opcodes did not match, 1 on success
size_t compareStrings( const char *str1, const char *str2 )
{
    for (const char *pat = str1; *pat != 0; ++pat) {
        if (*pat == '*' || *pat == '#')
            continue;                       // masked position, accept anything
        if (*pat != str2[pat - str1])
            return 0;
    }
    return 1;
}
static UINT CheckOpcodes( const char ** opcodes, void *inpAddr, bool abortOnError, const FunctionInfo* functionInfo = nullptr) { static size_t opcodesStringsCount = 0; static size_t maxOpcodesLength = 0; static size_t opcodes_pointer = (size_t)opcodes; char opcodeString[2*MAX_PATTERN_SIZE+1]; size_t i; size_t result = 0; // Get the values for static variables // max length and number of patterns if( !opcodesStringsCount || opcodes_pointer != (size_t)opcodes ){ while( *(opcodes + opcodesStringsCount)!= nullptr ){ if( (i=strlen(*(opcodes + opcodesStringsCount))) > maxOpcodesLength ) maxOpcodesLength = i; opcodesStringsCount++; } opcodes_pointer = (size_t)opcodes; __TBB_ASSERT( maxOpcodesLength/2 <= MAX_PATTERN_SIZE, "Pattern exceeded the limit of 28 opcodes/56 symbols" ); } // Translate prologue opcodes to string format to compare for( i=0; i= SIZE_OF_RELJUMP, "Incorrect bytecode pattern?" ); UINT_PTR trampAddr = memProvider.GetLocation(srcAddr); if (!trampAddr) return 0; *storedAddr = Addrint2Ptr(trampAddr); // Set 'executable' flag for original instructions in the new place DWORD pageFlags = PAGE_EXECUTE_READWRITE; if (!VirtualProtect(*storedAddr, MAX_PROBE_SIZE, pageFlags, &pageFlags)) return 0; // Copy original instructions to the new place memcpy(*storedAddr, codePtr, bytesToMove); offset = srcAddr - trampAddr; offset32 = (UINT)(offset & 0xFFFFFFFF); CorrectOffset( trampAddr, pattern, offset32 ); // Set jump to the code after replacement offset32 -= SIZE_OF_RELJUMP; *(UCHAR*)(trampAddr+bytesToMove) = 0xE9; memcpy((UCHAR*)(trampAddr+bytesToMove+1), &offset32, sizeof(offset32)); } // The following will work correctly even if srcAddr>tgtAddr, as long as // address difference is less than 2^31, which is guaranteed by IsInDistance. 
offset = tgtAddr - srcAddr - SIZE_OF_RELJUMP; offset32 = (UINT)(offset & 0xFFFFFFFF); // Insert the jump to the new code *codePtr = 0xE9; memcpy(codePtr+1, &offset32, sizeof(offset32)); // Fill the rest with NOPs to correctly see disassembler of old code in debugger. for( unsigned i=SIZE_OF_RELJUMP; i= SIZE_OF_INDJUMP, "Incorrect bytecode pattern?" ); UINT_PTR trampAddr = memProvider.GetLocation(srcAddr); if (!trampAddr) return 0; *storedAddr = Addrint2Ptr(trampAddr); // Set 'executable' flag for original instructions in the new place DWORD pageFlags = PAGE_EXECUTE_READWRITE; if (!VirtualProtect(*storedAddr, MAX_PROBE_SIZE, pageFlags, &pageFlags)) return 0; // Copy original instructions to the new place memcpy(*storedAddr, codePtr, bytesToMove); offset = srcAddr - trampAddr; offset32 = (UINT)(offset & 0xFFFFFFFF); CorrectOffset( trampAddr, pattern, offset32 ); // Set jump to the code after replacement. It is within the distance of relative jump! offset32 -= SIZE_OF_RELJUMP; *(UCHAR*)(trampAddr+bytesToMove) = 0xE9; memcpy((UCHAR*)(trampAddr+bytesToMove+1), &offset32, sizeof(offset32)); } // Fill the buffer offset = location - srcAddr - SIZE_OF_INDJUMP; offset32 = (UINT)(offset & 0xFFFFFFFF); *(codePtr) = 0xFF; *(codePtr+1) = 0x25; memcpy(codePtr+2, &offset32, sizeof(offset32)); // Fill the rest with NOPs to correctly see disassembler of old code in debugger. for( unsigned i=SIZE_OF_INDJUMP; i 0, "abortOnError ignored in CheckOpcodes?" 
); pattern = opcodes[opcodeIdx-1]; // -1 compensates for +1 in CheckOpcodes } } probeSize = InsertTrampoline32(inpAddr, targetAddr, pattern, origFunc); if (!probeSize) probeSize = InsertTrampoline64(inpAddr, targetAddr, pattern, origFunc); // Restore original protection VirtualProtect(inpAddr, MAX_PROBE_SIZE, origProt, &origProt); if (!probeSize) return FALSE; FlushInstructionCache(GetCurrentProcess(), inpAddr, probeSize); FlushInstructionCache(GetCurrentProcess(), origFunc, probeSize); return TRUE; } // Routine to replace the functions // TODO: replace opcodesNumber with opcodes and opcodes number to check if we replace right code. FRR_TYPE ReplaceFunctionA(const char *dllName, const char *funcName, FUNCPTR newFunc, const char ** opcodes, FUNCPTR* origFunc) { // Cache the results of the last search for the module // Assume that there was no DLL unload between static char cachedName[MAX_PATH+1]; static HMODULE cachedHM = 0; if (!dllName || !*dllName) return FRR_NODLL; if (!cachedHM || strncmp(dllName, cachedName, MAX_PATH) != 0) { // Find the module handle for the input dll HMODULE hModule = GetModuleHandleA(dllName); if (hModule == 0) { // Couldn't find the module with the input name cachedHM = 0; return FRR_NODLL; } cachedHM = hModule; strncpy(cachedName, dllName, MAX_PATH); } FARPROC inpFunc = GetProcAddress(cachedHM, funcName); if (inpFunc == 0) { // Function was not found return FRR_NOFUNC; } if (!InsertTrampoline((void*)inpFunc, (void*)newFunc, opcodes, (void**)origFunc)){ // Failed to insert the trampoline to the target address return FRR_FAILED; } return FRR_OK; } FRR_TYPE ReplaceFunctionW(const wchar_t *dllName, const char *funcName, FUNCPTR newFunc, const char ** opcodes, FUNCPTR* origFunc) { // Cache the results of the last search for the module // Assume that there was no DLL unload between static wchar_t cachedName[MAX_PATH+1]; static HMODULE cachedHM = 0; if (!dllName || !*dllName) return FRR_NODLL; if (!cachedHM || wcsncmp(dllName, cachedName, 
MAX_PATH) != 0) { // Find the module handle for the input dll HMODULE hModule = GetModuleHandleW(dllName); if (hModule == 0) { // Couldn't find the module with the input name cachedHM = 0; return FRR_NODLL; } cachedHM = hModule; wcsncpy(cachedName, dllName, MAX_PATH); } FARPROC inpFunc = GetProcAddress(cachedHM, funcName); if (inpFunc == 0) { // Function was not found return FRR_NOFUNC; } if (!InsertTrampoline((void*)inpFunc, (void*)newFunc, opcodes, (void**)origFunc)){ // Failed to insert the trampoline to the target address return FRR_FAILED; } return FRR_OK; } bool IsPrologueKnown(const char* dllName, const char *funcName, const char **opcodes, HMODULE module) { FARPROC inpFunc = GetProcAddress(module, funcName); FunctionInfo functionInfo = { funcName, dllName }; if (!inpFunc) { Log::record(functionInfo, "unknown", /*status*/ false); return false; } return CheckOpcodes( opcodes, (void*)inpFunc, /*abortOnError=*/false, &functionInfo) != 0; } // Public Windows API extern "C" __declspec(dllexport) int TBB_malloc_replacement_log(char *** function_replacement_log_ptr) { if (function_replacement_log_ptr != nullptr) { *function_replacement_log_ptr = Log::records; } // If we have no logs -> return false status return Log::replacement_status && Log::records[0] != nullptr ? 0 : -1; } #endif /* !__TBB_WIN8UI_SUPPORT && defined(_WIN32) */ ================================================ FILE: third-party/tbb/src/tbbmalloc_proxy/function_replacement.h ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// Result of a single function replacement request.
typedef enum {
    FRR_OK,     /* Succeeded in replacing the function */
    FRR_NODLL,  /* The requested DLL was not found */
    FRR_NOFUNC, /* The requested function was not found */
    FRR_FAILED, /* The function replacement request failed */
} FRR_TYPE;

// How a failed replacement should be treated.
typedef enum {
    FRR_FAIL,     /* Required function */
    FRR_IGNORE,   /* optional function */
} FRR_ON_ERROR;

// Generic function pointer type used for both the replacement and the saved original.
typedef void (*FUNCPTR)();

// ReplaceFunction maps to the narrow- or wide-character variant depending on UNICODE.
#ifndef UNICODE
#define ReplaceFunction ReplaceFunctionA
#else
#define ReplaceFunction ReplaceFunctionW
#endif //UNICODE

// Redirect funcName exported by dllName to newFunc. opcodes is the list of known
// prologue patterns; origFunc, when given, is passed on to receive a pointer
// through which the original code can still be called.
FRR_TYPE ReplaceFunctionA(const char *dllName, const char *funcName, FUNCPTR newFunc, const char ** opcodes, FUNCPTR* origFunc=nullptr);
FRR_TYPE ReplaceFunctionW(const wchar_t *dllName, const char *funcName, FUNCPTR newFunc, const char ** opcodes, FUNCPTR* origFunc=nullptr);
// Report whether the prologue of funcName in module matches one of the patterns in opcodes.
bool IsPrologueKnown(const char* dllName, const char *funcName, const char **opcodes, HMODULE module);

// Utilities to convert between ADDRESS and LPVOID
union Int2Ptr {
    UINT_PTR uip;
    LPVOID lpv;
};

inline UINT_PTR Ptr2Addrint(LPVOID ptr);
inline LPVOID Addrint2Ptr(UINT_PTR ptr);

// The size of a trampoline region
const unsigned MAX_PROBE_SIZE = 32;

// The size of a jump relative instruction "e9 00 00 00 00"
const unsigned SIZE_OF_RELJUMP = 5;

// The size of jump RIP relative indirect "ff 25 00 00 00 00"
const unsigned SIZE_OF_INDJUMP = 6;

// The size of address we put in the location (in Intel64)
const unsigned SIZE_OF_ADDRESS = 8;

// The size limit (in bytes) for an opcode pattern to fit into a trampoline
// There should be enough space left for a relative jump; +1 is for the extra pattern byte.
// With the values above this evaluates to 32 - 5 + 1 = 28 bytes.
const unsigned MAX_PATTERN_SIZE = MAX_PROBE_SIZE - SIZE_OF_RELJUMP + 1;

// The max distance covered in 32 bits: 2^31 - 1 - C
// where C should not be smaller than the size of a probe.
// The latter is important to correctly handle "backward" jumps.
const __int64 MAX_DISTANCE = (((__int64)1 << 31) - 1) - MAX_PROBE_SIZE;

// The maximum number of distinct buffers in memory
const ptrdiff_t MAX_NUM_BUFFERS = 256;
However, running on systems with new libc // version, it still needs it to be redefined, thus tricking system headers #if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2, 16) && _GLIBCXX_HAVE_ALIGNED_ALLOC // tell that there is no aligned_alloc #undef _GLIBCXX_HAVE_ALIGNED_ALLOC // trick to define another symbol instead #define aligned_alloc __hidden_redefined_aligned_alloc // Fix the state and undefine the trick #include #undef aligned_alloc #endif // !__GLIBC_PREREQ(2, 16) && _GLIBCXX_HAVE_ALIGNED_ALLOC #endif // defined(__GLIBC_PREREQ) #include #endif // __unix__ && !__ANDROID__ #include "proxy.h" #include "oneapi/tbb/detail/_config.h" #include "oneapi/tbb/scalable_allocator.h" #include "../tbb/environment.h" #if !defined(__EXCEPTIONS) && !defined(_CPPUNWIND) && !defined(__SUNPRO_CC) #if TBB_USE_EXCEPTIONS #error Compilation settings do not support exception handling. Please do not set TBB_USE_EXCEPTIONS macro or set it to 0. #elif !defined(TBB_USE_EXCEPTIONS) #define TBB_USE_EXCEPTIONS 0 #endif #elif !defined(TBB_USE_EXCEPTIONS) #define TBB_USE_EXCEPTIONS 1 #endif #if MALLOC_UNIXLIKE_OVERLOAD_ENABLED || _WIN32 && !__TBB_WIN8UI_SUPPORT /*** internal global operator new implementation (Linux, Windows) ***/ #include // Synchronization primitives to protect original library pointers and new_handler #include "../tbbmalloc/Synchronize.h" // Use MallocMutex implementation typedef MallocMutex ProxyMutex; // Adds aliasing and copy attributes to function if available #if defined(__has_attribute) #if __has_attribute(__copy__) #define __TBB_ALIAS_ATTR_COPY(name) __attribute__((alias (#name), __copy__(name))) #endif #endif #ifndef __TBB_ALIAS_ATTR_COPY #define __TBB_ALIAS_ATTR_COPY(name) __attribute__((alias (#name))) #endif // In case there is no std::get_new_handler function // which provides synchronized access to std::new_handler #if !__TBB_CPP11_GET_NEW_HANDLER_PRESENT static ProxyMutex new_lock; #endif static inline void* InternalOperatorNew(size_t sz) { void* res = 
scalable_malloc(sz); #if TBB_USE_EXCEPTIONS while (!res) { std::new_handler handler; #if __TBB_CPP11_GET_NEW_HANDLER_PRESENT handler = std::get_new_handler(); #else { ProxyMutex::scoped_lock lock(new_lock); handler = std::set_new_handler(0); std::set_new_handler(handler); } #endif if (handler) { (*handler)(); } else { throw std::bad_alloc(); } res = scalable_malloc(sz); } #endif /* TBB_USE_EXCEPTIONS */ return res; } /*** end of internal global operator new implementation ***/ #endif // MALLOC_UNIXLIKE_OVERLOAD_ENABLED || _WIN32 && !__TBB_WIN8UI_SUPPORT #if MALLOC_UNIXLIKE_OVERLOAD_ENABLED || MALLOC_ZONE_OVERLOAD_ENABLED #ifndef __THROW #define __THROW #endif /*** service functions and variables ***/ #include // for memset #include // for sysconf static long memoryPageSize; static inline void initPageSize() { memoryPageSize = sysconf(_SC_PAGESIZE); } #if MALLOC_UNIXLIKE_OVERLOAD_ENABLED #include #include // mallinfo /* __TBB_malloc_proxy used as a weak symbol by libtbbmalloc for: 1) detection that the proxy library is loaded 2) check that dlsym("malloc") found something different from our replacement malloc */ extern "C" TBBMALLOCPROXY_EXPORT void *__TBB_malloc_proxy(size_t) __TBB_ALIAS_ATTR_COPY(malloc); static void *orig_msize; #elif MALLOC_ZONE_OVERLOAD_ENABLED #include "proxy_overload_osx.h" #endif // MALLOC_ZONE_OVERLOAD_ENABLED // Original (i.e., replaced) functions, // they are never changed for MALLOC_ZONE_OVERLOAD_ENABLED. static void *orig_free, *orig_realloc; #if MALLOC_UNIXLIKE_OVERLOAD_ENABLED #define ZONE_ARG #define PREFIX(name) name static void *orig_libc_free, *orig_libc_realloc; // We already tried to find ptr to original functions. 
static std::atomic origFuncSearched{false}; inline void InitOrigPointers() { // race is OK here, as different threads found same functions if (!origFuncSearched.load(std::memory_order_acquire)) { orig_free = dlsym(RTLD_NEXT, "free"); orig_realloc = dlsym(RTLD_NEXT, "realloc"); orig_msize = dlsym(RTLD_NEXT, "malloc_usable_size"); orig_libc_free = dlsym(RTLD_NEXT, "__libc_free"); orig_libc_realloc = dlsym(RTLD_NEXT, "__libc_realloc"); origFuncSearched.store(true, std::memory_order_release); } } /*** replacements for malloc and the family ***/ extern "C" { #elif MALLOC_ZONE_OVERLOAD_ENABLED // each impl_* function has such 1st argument, it's unused #define ZONE_ARG struct _malloc_zone_t *, #define PREFIX(name) impl_##name // not interested in original functions for zone overload inline void InitOrigPointers() {} #endif // MALLOC_UNIXLIKE_OVERLOAD_ENABLED and MALLOC_ZONE_OVERLOAD_ENABLED TBBMALLOCPROXY_EXPORT void *PREFIX(malloc)(ZONE_ARG size_t size) __THROW { return scalable_malloc(size); } TBBMALLOCPROXY_EXPORT void *PREFIX(calloc)(ZONE_ARG size_t num, size_t size) __THROW { return scalable_calloc(num, size); } TBBMALLOCPROXY_EXPORT void PREFIX(free)(ZONE_ARG void *object) __THROW { InitOrigPointers(); __TBB_malloc_safer_free(object, (void (*)(void*))orig_free); } TBBMALLOCPROXY_EXPORT void *PREFIX(realloc)(ZONE_ARG void* ptr, size_t sz) __THROW { InitOrigPointers(); return __TBB_malloc_safer_realloc(ptr, sz, orig_realloc); } /* The older *NIX interface for aligned allocations; it's formally substituted by posix_memalign and deprecated, so we do not expect it to cause cyclic dependency with C RTL. */ TBBMALLOCPROXY_EXPORT void *PREFIX(memalign)(ZONE_ARG size_t alignment, size_t size) __THROW { return scalable_aligned_malloc(size, alignment); } /* valloc allocates memory aligned on a page boundary */ TBBMALLOCPROXY_EXPORT void *PREFIX(valloc)(ZONE_ARG size_t size) __THROW { if (! 
memoryPageSize) initPageSize(); return scalable_aligned_malloc(size, memoryPageSize); } #undef ZONE_ARG #undef PREFIX #if MALLOC_UNIXLIKE_OVERLOAD_ENABLED // match prototype from system headers #if __ANDROID__ TBBMALLOCPROXY_EXPORT size_t malloc_usable_size(const void *ptr) __THROW #else TBBMALLOCPROXY_EXPORT size_t malloc_usable_size(void *ptr) __THROW #endif { InitOrigPointers(); return __TBB_malloc_safer_msize(const_cast(ptr), (size_t (*)(void*))orig_msize); } TBBMALLOCPROXY_EXPORT int posix_memalign(void **memptr, size_t alignment, size_t size) __THROW { return scalable_posix_memalign(memptr, alignment, size); } /* pvalloc allocates smallest set of complete pages which can hold the requested number of bytes. Result is aligned on page boundary. */ TBBMALLOCPROXY_EXPORT void *pvalloc(size_t size) __THROW { if (! memoryPageSize) initPageSize(); // align size up to the page size, // pvalloc(0) returns 1 page, see man libmpatrol size = size? ((size-1) | (memoryPageSize-1)) + 1 : memoryPageSize; return scalable_aligned_malloc(size, memoryPageSize); } TBBMALLOCPROXY_EXPORT int mallopt(int /*param*/, int /*value*/) __THROW { return 1; } #if defined(__GLIBC__) || defined(__ANDROID__) TBBMALLOCPROXY_EXPORT struct mallinfo mallinfo() __THROW { struct mallinfo m; memset(&m, 0, sizeof(struct mallinfo)); return m; } #endif #if __ANDROID__ // Android doesn't have malloc_usable_size, provide it to be compatible // with Linux, in addition overload dlmalloc_usable_size() that presented // under Android. 
TBBMALLOCPROXY_EXPORT size_t dlmalloc_usable_size(const void *ptr) __TBB_ALIAS_ATTR_COPY(malloc_usable_size); #else // __ANDROID__ // TODO: consider using __typeof__ to guarantee the correct declaration types // C11 function, supported starting GLIBC 2.16 TBBMALLOCPROXY_EXPORT void *aligned_alloc(size_t alignment, size_t size) __TBB_ALIAS_ATTR_COPY(memalign); // Those non-standard functions are exported by GLIBC, and might be used // in conjunction with standard malloc/free, so we must overload them. // Bionic doesn't have them. Not removing from the linker scripts, // as absent entry points are ignored by the linker. TBBMALLOCPROXY_EXPORT void *__libc_malloc(size_t size) __TBB_ALIAS_ATTR_COPY(malloc); TBBMALLOCPROXY_EXPORT void *__libc_calloc(size_t num, size_t size) __TBB_ALIAS_ATTR_COPY(calloc); TBBMALLOCPROXY_EXPORT void *__libc_memalign(size_t alignment, size_t size) __TBB_ALIAS_ATTR_COPY(memalign); TBBMALLOCPROXY_EXPORT void *__libc_pvalloc(size_t size) __TBB_ALIAS_ATTR_COPY(pvalloc); TBBMALLOCPROXY_EXPORT void *__libc_valloc(size_t size) __TBB_ALIAS_ATTR_COPY(valloc); // call original __libc_* to support naive replacement of free via __libc_free etc TBBMALLOCPROXY_EXPORT void __libc_free(void *ptr) { InitOrigPointers(); __TBB_malloc_safer_free(ptr, (void (*)(void*))orig_libc_free); } TBBMALLOCPROXY_EXPORT void *__libc_realloc(void *ptr, size_t size) { InitOrigPointers(); return __TBB_malloc_safer_realloc(ptr, size, orig_libc_realloc); } #endif // !__ANDROID__ } /* extern "C" */ /*** replacements for global operators new and delete ***/ TBBMALLOCPROXY_EXPORT void* operator new(size_t sz) { return InternalOperatorNew(sz); } TBBMALLOCPROXY_EXPORT void* operator new[](size_t sz) { return InternalOperatorNew(sz); } TBBMALLOCPROXY_EXPORT void operator delete(void* ptr) noexcept { InitOrigPointers(); __TBB_malloc_safer_free(ptr, (void (*)(void*))orig_free); } TBBMALLOCPROXY_EXPORT void operator delete[](void* ptr) noexcept { InitOrigPointers(); 
__TBB_malloc_safer_free(ptr, (void (*)(void*))orig_free); } TBBMALLOCPROXY_EXPORT void* operator new(size_t sz, const std::nothrow_t&) noexcept { return scalable_malloc(sz); } TBBMALLOCPROXY_EXPORT void* operator new[](std::size_t sz, const std::nothrow_t&) noexcept { return scalable_malloc(sz); } TBBMALLOCPROXY_EXPORT void operator delete(void* ptr, const std::nothrow_t&) noexcept { InitOrigPointers(); __TBB_malloc_safer_free(ptr, (void (*)(void*))orig_free); } TBBMALLOCPROXY_EXPORT void operator delete[](void* ptr, const std::nothrow_t&) noexcept { InitOrigPointers(); __TBB_malloc_safer_free(ptr, (void (*)(void*))orig_free); } #endif /* MALLOC_UNIXLIKE_OVERLOAD_ENABLED */ #endif /* MALLOC_UNIXLIKE_OVERLOAD_ENABLED || MALLOC_ZONE_OVERLOAD_ENABLED */ #ifdef _WIN32 #include #if !__TBB_WIN8UI_SUPPORT #include #include "function_replacement.h" template // generic function to find length of array inline size_t arrayLength(const T(&)[N]) { return N; } void __TBB_malloc_safer_delete( void *ptr) { __TBB_malloc_safer_free( ptr, nullptr ); } void* safer_aligned_malloc( size_t size, size_t alignment ) { // workaround for "is power of 2 pow N" bug that accepts zeros return scalable_aligned_malloc( size, alignment>sizeof(size_t*)?alignment:sizeof(size_t*) ); } // we do not support _expand(); void* safer_expand( void *, size_t ) { return nullptr; } #define __TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(CRTLIB) \ void (*orig_free_##CRTLIB)(void*); \ void __TBB_malloc_safer_free_##CRTLIB(void *ptr) \ { \ __TBB_malloc_safer_free( ptr, orig_free_##CRTLIB ); \ } \ \ void (*orig__aligned_free_##CRTLIB)(void*); \ void __TBB_malloc_safer__aligned_free_##CRTLIB(void *ptr) \ { \ __TBB_malloc_safer_free( ptr, orig__aligned_free_##CRTLIB ); \ } \ \ size_t (*orig__msize_##CRTLIB)(void*); \ size_t __TBB_malloc_safer__msize_##CRTLIB(void *ptr) \ { \ return __TBB_malloc_safer_msize( ptr, orig__msize_##CRTLIB ); \ } \ \ size_t (*orig__aligned_msize_##CRTLIB)(void*, size_t, size_t); \ size_t 
__TBB_malloc_safer__aligned_msize_##CRTLIB( void *ptr, size_t alignment, size_t offset) \ { \ return __TBB_malloc_safer_aligned_msize( ptr, alignment, offset, orig__aligned_msize_##CRTLIB ); \ } \ \ void* __TBB_malloc_safer_realloc_##CRTLIB( void *ptr, size_t size ) \ { \ orig_ptrs func_ptrs = {orig_free_##CRTLIB, orig__msize_##CRTLIB}; \ return __TBB_malloc_safer_realloc( ptr, size, &func_ptrs ); \ } \ \ void* __TBB_malloc_safer__aligned_realloc_##CRTLIB( void *ptr, size_t size, size_t alignment ) \ { \ orig_aligned_ptrs func_ptrs = {orig__aligned_free_##CRTLIB, orig__aligned_msize_##CRTLIB}; \ return __TBB_malloc_safer_aligned_realloc( ptr, size, alignment, &func_ptrs ); \ } // Only for ucrtbase: substitution for _o_free void (*orig__o_free)(void*); void __TBB_malloc__o_free(void *ptr) { __TBB_malloc_safer_free( ptr, orig__o_free ); } // Only for ucrtbase: substitution for _free_base void(*orig__free_base)(void*); void __TBB_malloc__free_base(void *ptr) { __TBB_malloc_safer_free(ptr, orig__free_base); } // Size limit is MAX_PATTERN_SIZE (28) byte codes / 56 symbols per line. // * can be used to match any digit in byte codes. // # followed by several * indicate a relative address that needs to be corrected. // Purpose of the pattern is to mark an instruction bound; it should consist of several // full instructions plus one extra byte code. It's not required for the patterns // to be unique (i.e., it's OK to have same pattern for unrelated functions). 
// TODO: use hot patch prologues if exist const char* known_bytecodes[] = { #if _WIN64 // "========================================================" - 56 symbols "E9********CCCC", // multiple - jmp(0xE9) with address followed by empty space (0xCC - INT 3) "4883EC284885C974", // release free() "4883EC284885C975", // release _msize() "4885C974375348", // release free() 8.0.50727.42, 10.0 "C7442410000000008B", // release free() ucrtbase.dll 10.0.14393.33 "48895C24085748", // release _aligned_msize() ucrtbase.dll 10.0.14393.33 "48894C24084883EC28BA", // debug prologue "4C894424184889542410", // debug _aligned_msize() 10.0 "48894C24084883EC2848", // debug _aligned_free 10.0 "488BD1488D0D#*******E9", // _o_free(), ucrtbase.dll #if __TBB_OVERLOAD_OLD_MSVCR "48895C2408574883EC3049", // release _aligned_msize 9.0 "4883EC384885C975", // release _msize() 9.0 "4C8BC1488B0DA6E4040033", // an old win64 SDK #endif #else // _WIN32 // "========================================================" - 56 symbols "8BFF558BEC8B", // multiple "8BFF558BEC83", // release free() & _msize() 10.0.40219.325, _msize() ucrtbase.dll "8BFF558BECFF", // release _aligned_msize ucrtbase.dll "8BFF558BEC51", // release free() & _msize() ucrtbase.dll 10.0.14393.33 "558BEC8B450885C074", // release _aligned_free 11.0 "558BEC837D08000F", // release _msize() 11.0.51106.1 "558BEC837D08007419FF", // release free() 11.0.50727.1 "558BEC8B450885C075", // release _aligned_msize() 11.0.50727.1 "558BEC6A018B", // debug free() & _msize() 11.0 "558BEC8B451050", // debug _aligned_msize() 11.0 "558BEC8B450850", // debug _aligned_free 11.0 "8BFF558BEC6A", // debug free() & _msize() 10.0.40219.325 #if __TBB_OVERLOAD_OLD_MSVCR "6A1868********E8", // release free() 8.0.50727.4053, 9.0 "6A1C68********E8", // release _msize() 8.0.50727.4053, 9.0 #endif #endif // _WIN64/_WIN32 nullptr }; #define __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_ENTRY(CRT_VER,function_name,dbgsuffix) \ ReplaceFunctionWithStore( #CRT_VER #dbgsuffix ".dll", 
#function_name, \ (FUNCPTR)__TBB_malloc_safer_##function_name##_##CRT_VER##dbgsuffix, \ known_bytecodes, (FUNCPTR*)&orig_##function_name##_##CRT_VER##dbgsuffix ); #define __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_ENTRY_NO_FALLBACK(CRT_VER,function_name,dbgsuffix) \ ReplaceFunctionWithStore( #CRT_VER #dbgsuffix ".dll", #function_name, \ (FUNCPTR)__TBB_malloc_safer_##function_name##_##CRT_VER##dbgsuffix, 0, nullptr ); #define __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_ENTRY_REDIRECT(CRT_VER,function_name,dest_func,dbgsuffix) \ ReplaceFunctionWithStore( #CRT_VER #dbgsuffix ".dll", #function_name, \ (FUNCPTR)__TBB_malloc_safer_##dest_func##_##CRT_VER##dbgsuffix, 0, nullptr ); #define __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_IMPL(CRT_VER,dbgsuffix) \ if (BytecodesAreKnown(#CRT_VER #dbgsuffix ".dll")) { \ __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_ENTRY(CRT_VER,free,dbgsuffix) \ __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_ENTRY(CRT_VER,_msize,dbgsuffix) \ __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_ENTRY_NO_FALLBACK(CRT_VER,realloc,dbgsuffix) \ __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_ENTRY(CRT_VER,_aligned_free,dbgsuffix) \ __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_ENTRY(CRT_VER,_aligned_msize,dbgsuffix) \ __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_ENTRY_NO_FALLBACK(CRT_VER,_aligned_realloc,dbgsuffix) \ } else \ SkipReplacement(#CRT_VER #dbgsuffix ".dll"); #define __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_RELEASE(CRT_VER) __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_IMPL(CRT_VER,) #define __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_DEBUG(CRT_VER) __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_IMPL(CRT_VER,d) #define __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL(CRT_VER) \ __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_RELEASE(CRT_VER) \ __TBB_ORIG_ALLOCATOR_REPLACEMENT_CALL_DEBUG(CRT_VER) #if __TBB_OVERLOAD_OLD_MSVCR __TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(msvcr70d); __TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(msvcr70); __TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(msvcr71d); __TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(msvcr71); 
__TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(msvcr80d); __TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(msvcr80); __TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(msvcr90d); __TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(msvcr90); #endif __TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(msvcr100d); __TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(msvcr100); __TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(msvcr110d); __TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(msvcr110); __TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(msvcr120d); __TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(msvcr120); __TBB_ORIG_ALLOCATOR_REPLACEMENT_WRAPPER(ucrtbase); /*** replacements for global operators new and delete ***/ #if _MSC_VER && !defined(__INTEL_COMPILER) #pragma warning( push ) #pragma warning( disable : 4290 ) #endif /*** operator new overloads internals (Linux, Windows) ***/ void* operator_new(size_t sz) { return InternalOperatorNew(sz); } void* operator_new_arr(size_t sz) { return InternalOperatorNew(sz); } void operator_delete(void* ptr) noexcept { __TBB_malloc_safer_delete(ptr); } #if _MSC_VER && !defined(__INTEL_COMPILER) #pragma warning( pop ) #endif void operator_delete_arr(void* ptr) noexcept { __TBB_malloc_safer_delete(ptr); } void* operator_new_t(size_t sz, const std::nothrow_t&) noexcept { return scalable_malloc(sz); } void* operator_new_arr_t(std::size_t sz, const std::nothrow_t&) noexcept { return scalable_malloc(sz); } void operator_delete_t(void* ptr, const std::nothrow_t&) noexcept { __TBB_malloc_safer_delete(ptr); } void operator_delete_arr_t(void* ptr, const std::nothrow_t&) noexcept { __TBB_malloc_safer_delete(ptr); } struct Module { const char *name; bool doFuncReplacement; // do replacement in the DLL }; Module modules_to_replace[] = { {"msvcr100d.dll", true}, {"msvcr100.dll", true}, {"msvcr110d.dll", true}, {"msvcr110.dll", true}, {"msvcr120d.dll", true}, {"msvcr120.dll", true}, {"ucrtbase.dll", true}, // "ucrtbased.dll" is not supported because of problems with _dbg functions #if __TBB_OVERLOAD_OLD_MSVCR {"msvcr90d.dll", 
true}, {"msvcr90.dll", true}, {"msvcr80d.dll", true}, {"msvcr80.dll", true}, {"msvcr70d.dll", true}, {"msvcr70.dll", true}, {"msvcr71d.dll", true}, {"msvcr71.dll", true}, #endif #if __TBB_TODO // TODO: Try enabling replacement for non-versioned system binaries below {"msvcrtd.dll", true}, {"msvcrt.dll", true}, #endif }; /* We need to replace following functions: malloc calloc _aligned_malloc _expand (by dummy implementation) ??2@YAPAXI@Z operator new (ia32) ??_U@YAPAXI@Z void * operator new[] (size_t size) (ia32) ??3@YAXPAX@Z operator delete (ia32) ??_V@YAXPAX@Z operator delete[] (ia32) ??2@YAPEAX_K@Z void * operator new(unsigned __int64) (intel64) ??_U@YAPEAX_K@Z void * operator new[](unsigned __int64) (intel64) ??3@YAXPEAX@Z operator delete (intel64) ??_V@YAXPEAX@Z operator delete[] (intel64) ??2@YAPAXIABUnothrow_t@std@@@Z void * operator new (size_t sz, const std::nothrow_t&) noexcept (optional) ??_U@YAPAXIABUnothrow_t@std@@@Z void * operator new[] (size_t sz, const std::nothrow_t&) noexcept (optional) and these functions have runtime-specific replacement: realloc free _msize _aligned_realloc _aligned_free _aligned_msize */ typedef struct FRData_t { //char *_module; const char *_func; FUNCPTR _fptr; FRR_ON_ERROR _on_error; } FRDATA; FRDATA c_routines_to_replace[] = { { "malloc", (FUNCPTR)scalable_malloc, FRR_FAIL }, { "calloc", (FUNCPTR)scalable_calloc, FRR_FAIL }, { "_aligned_malloc", (FUNCPTR)safer_aligned_malloc, FRR_FAIL }, { "_expand", (FUNCPTR)safer_expand, FRR_IGNORE }, }; FRDATA cxx_routines_to_replace[] = { #if _WIN64 { "??2@YAPEAX_K@Z", (FUNCPTR)operator_new, FRR_FAIL }, { "??_U@YAPEAX_K@Z", (FUNCPTR)operator_new_arr, FRR_FAIL }, { "??3@YAXPEAX@Z", (FUNCPTR)operator_delete, FRR_FAIL }, { "??_V@YAXPEAX@Z", (FUNCPTR)operator_delete_arr, FRR_FAIL }, #else { "??2@YAPAXI@Z", (FUNCPTR)operator_new, FRR_FAIL }, { "??_U@YAPAXI@Z", (FUNCPTR)operator_new_arr, FRR_FAIL }, { "??3@YAXPAX@Z", (FUNCPTR)operator_delete, FRR_FAIL }, { "??_V@YAXPAX@Z", 
(FUNCPTR)operator_delete_arr, FRR_FAIL }, #endif { "??2@YAPAXIABUnothrow_t@std@@@Z", (FUNCPTR)operator_new_t, FRR_IGNORE }, { "??_U@YAPAXIABUnothrow_t@std@@@Z", (FUNCPTR)operator_new_arr_t, FRR_IGNORE } }; #ifndef UNICODE typedef char unicode_char_t; #define WCHAR_SPEC "%s" #else typedef wchar_t unicode_char_t; #define WCHAR_SPEC "%ls" #endif // Check that we recognize bytecodes that should be replaced by trampolines. // If some functions have unknown prologue patterns, replacement should not be done. bool BytecodesAreKnown(const unicode_char_t *dllName) { const char *funcName[] = {"free", "_msize", "_aligned_free", "_aligned_msize", 0}; HMODULE module = GetModuleHandle(dllName); if (!module) return false; for (int i=0; funcName[i]; i++) if (! IsPrologueKnown(dllName, funcName[i], known_bytecodes, module)) { fprintf(stderr, "TBBmalloc: skip allocation functions replacement in " WCHAR_SPEC ": unknown prologue for function " WCHAR_SPEC "\n", dllName, funcName[i]); return false; } return true; } void SkipReplacement(const unicode_char_t *dllName) { #ifndef UNICODE const char *dllStr = dllName; #else const size_t sz = 128; // all DLL name must fit char buffer[sz]; size_t real_sz; char *dllStr = buffer; errno_t ret = wcstombs_s(&real_sz, dllStr, sz, dllName, sz-1); __TBB_ASSERT(!ret, "Dll name conversion failed"); #endif for (size_t i=0; i extern "C" { TBBMALLOC_EXPORT void __TBB_malloc_safer_free( void *ptr, void (*original_free)(void*)); TBBMALLOC_EXPORT void * __TBB_malloc_safer_realloc( void *ptr, size_t, void* ); TBBMALLOC_EXPORT void * __TBB_malloc_safer_aligned_realloc( void *ptr, size_t, size_t, void* ); TBBMALLOC_EXPORT size_t __TBB_malloc_safer_msize( void *ptr, size_t (*orig_msize_crt80d)(void*)); TBBMALLOC_EXPORT size_t __TBB_malloc_safer_aligned_msize( void *ptr, size_t, size_t, size_t (*orig_msize_crt80d)(void*,size_t,size_t)); #if MALLOC_ZONE_OVERLOAD_ENABLED TBBMALLOC_EXPORT void __TBB_malloc_free_definite_size(void *object, size_t size); #endif } // 
extern "C" // Struct with original free() and _msize() pointers struct orig_ptrs { void (*free) (void*); size_t (*msize)(void*); }; struct orig_aligned_ptrs { void (*aligned_free) (void*); size_t (*aligned_msize)(void*,size_t,size_t); }; #endif /* _TBB_malloc_proxy_H_ */ ================================================ FILE: third-party/tbb/src/tbbmalloc_proxy/proxy_overload_osx.h ================================================ /* Copyright (c) 2005-2022 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ // The original source for this code is // Copyright (c) 2011, Google Inc. // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following disclaimer // in the documentation and/or other materials provided with the // distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. 
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include #include static kern_return_t enumerator(task_t, void *, unsigned, vm_address_t, memory_reader_t, vm_range_recorder_t) { return KERN_FAILURE; } static size_t good_size(malloc_zone_t *, size_t size) { return size; } static boolean_t zone_check(malloc_zone_t *) /* Consistency checker */ { return true; } static void zone_print(malloc_zone_t *, boolean_t) { } static void zone_log(malloc_zone_t *, void *) {} static void zone_force_lock(malloc_zone_t *) {} static void zone_force_unlock(malloc_zone_t *) {} static void zone_statistics(malloc_zone_t *, malloc_statistics_t *s) { s->blocks_in_use = 0; s->size_in_use = s->max_size_in_use = s->size_allocated = 0; } static boolean_t zone_locked(malloc_zone_t *) { return false; } static boolean_t impl_zone_enable_discharge_checking(malloc_zone_t *) { return false; } static void impl_zone_disable_discharge_checking(malloc_zone_t *) {} static void impl_zone_discharge(malloc_zone_t *, void *) {} static void impl_zone_destroy(struct _malloc_zone_t *) {} /* note: impl_malloc_usable_size() is called for each free() call, so it must be fast */ static size_t impl_malloc_usable_size(struct _malloc_zone_t *, const void *ptr) { // 
malloc_usable_size() is used by macOS* to recognize which memory manager // allocated the address, so our wrapper must not redirect to the original function. return __TBB_malloc_safer_msize(const_cast(ptr), nullptr); } static void *impl_malloc(struct _malloc_zone_t *, size_t size); static void *impl_calloc(struct _malloc_zone_t *, size_t num_items, size_t size); static void *impl_valloc(struct _malloc_zone_t *, size_t size); static void impl_free(struct _malloc_zone_t *, void *ptr); static void *impl_realloc(struct _malloc_zone_t *, void *ptr, size_t size); static void *impl_memalign(struct _malloc_zone_t *, size_t alignment, size_t size); /* ptr is in zone and have reported size */ static void impl_free_definite_size(struct _malloc_zone_t*, void *ptr, size_t size) { __TBB_malloc_free_definite_size(ptr, size); } /* Empty out caches in the face of memory pressure. */ static size_t impl_pressure_relief(struct _malloc_zone_t *, size_t /* goal */) { return 0; } static malloc_zone_t *system_zone = nullptr; struct DoMallocReplacement { DoMallocReplacement() { static malloc_introspection_t introspect; memset(&introspect, 0, sizeof(malloc_introspection_t)); static malloc_zone_t zone; memset(&zone, 0, sizeof(malloc_zone_t)); introspect.enumerator = &enumerator; introspect.good_size = &good_size; introspect.check = &zone_check; introspect.print = &zone_print; introspect.log = zone_log; introspect.force_lock = &zone_force_lock; introspect.force_unlock = &zone_force_unlock; introspect.statistics = zone_statistics; introspect.zone_locked = &zone_locked; introspect.enable_discharge_checking = &impl_zone_enable_discharge_checking; introspect.disable_discharge_checking = &impl_zone_disable_discharge_checking; introspect.discharge = &impl_zone_discharge; zone.size = &impl_malloc_usable_size; zone.malloc = &impl_malloc; zone.calloc = &impl_calloc; zone.valloc = &impl_valloc; zone.free = &impl_free; zone.realloc = &impl_realloc; zone.destroy = &impl_zone_destroy; zone.zone_name = 
"tbbmalloc"; zone.introspect = &introspect; zone.version = 8; zone.memalign = impl_memalign; zone.free_definite_size = &impl_free_definite_size; zone.pressure_relief = &impl_pressure_relief; // make sure that default purgeable zone is initialized malloc_default_purgeable_zone(); void* ptr = malloc(1); // get all registered memory zones unsigned zcount = 0; malloc_zone_t** zone_array = nullptr; kern_return_t errorcode = malloc_get_all_zones(mach_task_self(),nullptr,(vm_address_t**)&zone_array,&zcount); if (!errorcode && zone_array && zcount>0) { // find the zone that allocated ptr for (unsigned i=0; isize(z,ptr)>0) { // the right one is found system_zone = z; break; } } } free(ptr); malloc_zone_register(&zone); if (system_zone) { // after unregistration of the system zone, the last registered (i.e. our) zone becomes the default malloc_zone_unregister(system_zone); // register the system zone back malloc_zone_register(system_zone); } } }; static DoMallocReplacement doMallocReplacement; ================================================ FILE: third-party/tbb/src/tbbmalloc_proxy/tbbmalloc_proxy.rc ================================================ // Copyright (c) 2005-2025 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
///////////////////////////////////////////////////////////////////////////// // // Includes // #include #include "../../include/oneapi/tbb/version.h" ///////////////////////////////////////////////////////////////////////////// // Neutral resources #ifdef _WIN32 LANGUAGE LANG_NEUTRAL, SUBLANG_NEUTRAL #pragma code_page(1252) #endif //_WIN32 ///////////////////////////////////////////////////////////////////////////// // // Version // #define TBB_VERNUMBERS TBB_VERSION_MAJOR,TBB_VERSION_MINOR,TBB_VERSION_PATCH #define TBB_VERSION TBB_VERSION_STRING VS_VERSION_INFO VERSIONINFO FILEVERSION TBB_VERNUMBERS PRODUCTVERSION TBB_VERNUMBERS FILEFLAGSMASK 0x17L #ifdef _DEBUG FILEFLAGS 0x1L #else FILEFLAGS 0x0L #endif FILEOS 0x40004L FILETYPE 0x2L FILESUBTYPE 0x0L BEGIN BLOCK "StringFileInfo" BEGIN BLOCK "000004b0" BEGIN VALUE "CompanyName", "Intel Corporation\0" VALUE "FileDescription", "oneAPI Threading Building Blocks (oneTBB) library\0" VALUE "FileVersion", TBB_VERSION "\0" VALUE "LegalCopyright", "Copyright 2005-2025 Intel Corporation. All Rights Reserved.\0" VALUE "LegalTrademarks", "\0" #ifndef TBB_USE_DEBUG VALUE "OriginalFilename", "tbbmalloc_proxy.dll\0" #else VALUE "OriginalFilename", "tbbmalloc_proxy_debug.dll\0" #endif VALUE "ProductName", "oneAPI Threading Building Blocks (oneTBB)\0" VALUE "ProductVersion", TBB_VERSION "\0" VALUE "PrivateBuild", "\0" VALUE "SpecialBuild", "\0" END END BLOCK "VarFileInfo" BEGIN VALUE "Translation", 0x0, 1200 END END ================================================ FILE: third-party/tbb/test/CMakeLists.txt ================================================ # Copyright (c) 2020-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. include(ProcessorCount) # General function for test target generation function(tbb_add_test) set(oneValueArgs SUBDIR NAME SUFFIX) set(multiValueArgs DEPENDENCIES) cmake_parse_arguments(_tbb_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(_tbb_test_TARGET_NAME ${_tbb_test_NAME}) if (_tbb_test_SUFFIX) set(_tbb_test_TARGET_NAME ${_tbb_test_NAME}_${_tbb_test_SUFFIX}) endif() # Define the target for test add_executable(${_tbb_test_TARGET_NAME} ${_tbb_test_SUBDIR}/${_tbb_test_NAME}.cpp) target_include_directories(${_tbb_test_TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}) # cmake>=3.4 no longer adds flags to export symbols from executables (CMP0065) set_property(TARGET ${_tbb_test_TARGET_NAME} PROPERTY ENABLE_EXPORTS TRUE) target_compile_options(${_tbb_test_TARGET_NAME} PRIVATE ${TBB_CXX_STD_FLAG} ${TBB_WARNING_LEVEL} # Warning suppression C4324: structure was padded due to alignment specifier $<$:/wd4324> ${TBB_TEST_WARNING_FLAGS} ${TBB_TEST_COMPILE_FLAGS} ${TBB_COMMON_COMPILE_FLAGS} ) if (TBB_BUILD_APPLE_FRAMEWORKS) add_compile_definitions(TBB_USE_APPLE_FRAMEWORKS) endif() if (ANDROID_PLATFORM) # Expand the linker rpath by the CMAKE_LIBRARY_OUTPUT_DIRECTORY path since clang compiler from Android SDK # doesn't respect the -L flag. 
target_link_libraries(${_tbb_test_TARGET_NAME} PRIVATE "-Wl,--rpath-link,${CMAKE_LIBRARY_OUTPUT_DIRECTORY}") add_test(NAME ${_tbb_test_TARGET_NAME} COMMAND ${CMAKE_COMMAND} -DBINARIES_PATH=${CMAKE_LIBRARY_OUTPUT_DIRECTORY} -DTEST_NAME=${_tbb_test_TARGET_NAME} -P ${PROJECT_SOURCE_DIR}/cmake/android/test_launcher.cmake) else() add_test(NAME ${_tbb_test_TARGET_NAME} COMMAND ${_tbb_test_TARGET_NAME} --force-colors=1 WORKING_DIRECTORY ${TBB_TEST_WORKING_DIRECTORY}) # Additional testing scenarios if Intel(R) Software Development Emulator is found if (UNIX AND ";test_mutex;conformance_mutex;" MATCHES ";${_tbb_test_TARGET_NAME};" AND SDE_EXE) add_test(NAME ${_tbb_test_TARGET_NAME}_SDE COMMAND ${SDE_EXE} -nhm -rtm_mode disabled -- ./${_tbb_test_TARGET_NAME} --force-colors=1 WORKING_DIRECTORY ${TBB_TEST_WORKING_DIRECTORY}) set_property(TEST ${_tbb_test_TARGET_NAME}_SDE PROPERTY ENVIRONMENT ${TBB_TESTS_ENVIRONMENT} APPEND) endif() endif() set_property(TEST ${_tbb_test_TARGET_NAME} PROPERTY ENVIRONMENT ${TBB_TESTS_ENVIRONMENT} APPEND) set_property(TEST ${_tbb_test_TARGET_NAME} PROPERTY RUN_SERIAL TRUE) # Prefer using target_link_options instead of target_link_libraries to specify link options because # target_link_libraries may incorrectly handle some options (on Windows, for example). 
if (COMMAND target_link_options) target_link_options(${_tbb_test_TARGET_NAME} PRIVATE ${TBB_COMMON_LINK_FLAGS} ${TBB_TEST_LINK_FLAGS}) else() target_link_libraries(${_tbb_test_TARGET_NAME} PRIVATE ${TBB_COMMON_LINK_FLAGS} ${TBB_TEST_LINK_FLAGS}) endif() target_compile_definitions(${_tbb_test_TARGET_NAME} PRIVATE $<$:TBB_USE_DEBUG> $<$:__TBB_CPF_BUILD=1> $<$>:__TBB_DYNAMIC_LOAD_ENABLED=0> $<$>:__TBB_SOURCE_DIRECTLY_INCLUDED=1> $<$:__TBB_TCM_TESTING_ENABLED=1>) target_link_libraries(${_tbb_test_TARGET_NAME} PRIVATE ${_tbb_test_DEPENDENCIES} Threads::Threads ${TBB_COMMON_LINK_LIBS}) if (COMMAND _tbb_run_memcheck) _tbb_run_memcheck(${_tbb_test_NAME} ${_tbb_test_SUBDIR}) endif() endfunction()

# Function for C test target generation
#
# Keyword arguments:
#   SUBDIR <dir>             directory (relative to this one) holding <name>.c
#   NAME <name>              name of both the executable target and the CTest test
#   DEPENDENCIES <target>... targets the executable is linked against
function(tbb_add_c_test)
    set(oneValueArgs SUBDIR NAME)
    set(multiValueArgs DEPENDENCIES)
    cmake_parse_arguments(_tbb_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

    # Define the target for test
    add_executable(${_tbb_test_NAME} ${_tbb_test_SUBDIR}/${_tbb_test_NAME}.c)
    target_include_directories(${_tbb_test_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR})

    if (ANDROID_PLATFORM)
        # On Android the test is started through the launcher script instead of directly.
        add_test(NAME ${_tbb_test_NAME}
                 COMMAND ${CMAKE_COMMAND}
                         -DBINARIES_PATH=${CMAKE_LIBRARY_OUTPUT_DIRECTORY}
                         -DTEST_NAME=${_tbb_test_NAME}
                         -P ${PROJECT_SOURCE_DIR}/cmake/android/test_launcher.cmake)
    else()
        add_test(NAME ${_tbb_test_NAME}
                 COMMAND ${_tbb_test_NAME} --force-colors=1
                 WORKING_DIRECTORY ${TBB_TEST_WORKING_DIRECTORY})
    endif()

    set_property(TEST ${_tbb_test_NAME} PROPERTY ENVIRONMENT ${TBB_TESTS_ENVIRONMENT} APPEND)
    set_property(TEST ${_tbb_test_NAME} PROPERTY RUN_SERIAL TRUE)

    # NOTE(review): the conditions of these generator expressions were stripped from this
    # copy of the file ("$<$:...>"); they are restored here to the upstream oneTBB form as
    # best known - confirm against the reference sources.
    target_compile_definitions(${_tbb_test_NAME} PRIVATE
        $<$<CONFIG:DEBUG>:TBB_USE_DEBUG>
        $<$<BOOL:${TBB_CPF}>:__TBB_CPF_BUILD=1>)

    target_link_libraries(${_tbb_test_NAME} PRIVATE ${_tbb_test_DEPENDENCIES} Threads::Threads)
endfunction()

# Function for lib test target generation
#
# Builds the helper library "_<name>" from <subdir>/<name>.cpp. Takes the same
# SUBDIR / NAME / DEPENDENCIES keyword arguments as the test functions above.
function(tbb_add_lib_test)
    set(oneValueArgs SUBDIR NAME)
    set(multiValueArgs DEPENDENCIES)
    cmake_parse_arguments(_tbb_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

    add_library(_${_tbb_test_NAME} ${_tbb_test_SUBDIR}/${_tbb_test_NAME}.cpp)

    # NOTE(review): both PUBLIC entries were reduced to a bare "$" in this copy of the file;
    # restored to the usual BUILD_INTERFACE / INSTALL_INTERFACE pair - confirm against upstream.
    target_include_directories(_${_tbb_test_NAME}
        PUBLIC
        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
        $<INSTALL_INTERFACE:include>
        PRIVATE
        ${CMAKE_CURRENT_SOURCE_DIR}/..
        ${CMAKE_CURRENT_SOURCE_DIR})

    # TODO: fix warnings
    if (MSVC)
        # signed unsigned mismatch, declaration hides class member
        set(TBB_WARNING_SUPPRESS ${TBB_WARNING_SUPPRESS} /wd4267 /wd4244 /wd4245 /wd4018 /wd4458)
    endif()

    set(TEST_LIB_COMPILE_FLAGS -D_USRDLL)

    # TODO: add ${TBB_WARNING_LEVEL} and fix problems
    target_compile_options(_${_tbb_test_NAME}
        PRIVATE
        ${TBB_CXX_STD_FLAG} # TODO: consider making it PUBLIC.
        ${TBB_MMD_FLAG}
        ${TBB_DSE_FLAG}
        ${TBB_LIB_COMPILE_FLAGS}
        ${TBBMALLOC_LIB_COMPILE_FLAGS}
        ${TBB_COMMON_COMPILE_FLAGS}
        ${TEST_LIB_COMPILE_FLAGS}
    )

    # NOTE(review): generator-expression conditions restored after extraction damage
    # ("$<$:...>" / "$<$>:...>") - confirm against upstream oneTBB.
    target_compile_definitions(_${_tbb_test_NAME} PRIVATE
        $<$<CONFIG:DEBUG>:TBB_USE_DEBUG>
        $<$<BOOL:${TBB_CPF}>:__TBB_CPF_BUILD=1>
        $<$<NOT:$<BOOL:${BUILD_SHARED_LIBS}>>:__TBB_DYNAMIC_LOAD_ENABLED=0>
        $<$<NOT:$<BOOL:${BUILD_SHARED_LIBS}>>:__TBB_SOURCE_DIRECTLY_INCLUDED=1>)

    # Prefer using target_link_options instead of target_link_libraries to specify link options because
    # target_link_libraries may incorrectly handle some options (on Windows, for example).
    if (COMMAND target_link_options)
        target_link_options(_${_tbb_test_NAME}
            PRIVATE
            ${TBB_LIB_LINK_FLAGS}
            ${TBB_COMMON_LINK_FLAGS}
        )
    else()
        target_link_libraries(_${_tbb_test_NAME}
            PRIVATE
            ${TBB_LIB_LINK_FLAGS}
            ${TBB_COMMON_LINK_FLAGS}
        )
    endif()

    target_link_libraries(_${_tbb_test_NAME}
        PRIVATE
        Threads::Threads
        ${_tbb_test_DEPENDENCIES}
        ${TBB_LIB_LINK_LIBS}
        ${TBB_COMMON_LINK_LIBS}
    )
endfunction()

# Builds a "PATH=..." (Windows) or "LD_LIBRARY_PATH=..." (elsewhere) assignment that prepends
# the directories of the given HWLOC::<version> targets to the current value of that
# environment variable.
#
# Keyword arguments:
#   ENV_EXTENSION_VARIABLE <var>   variable in the caller's scope that receives the assignment
#   HWLOC_VERSION_LIST <name>...   suffixes of the HWLOC::<name> targets to look up
function(_tbb_get_hwloc_runtime_vars)
    set(oneValueArgs ENV_EXTENSION_VARIABLE)
    set(multiValueArgs HWLOC_VERSION_LIST)
    cmake_parse_arguments(_runtime_vars "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

    # Start from an empty list: a function sees its caller's variables, so appending to an
    # uninitialised LIBRARIES_PATH could pick up unrelated entries.
    set(LIBRARIES_PATH "")
    foreach(HWLOC_VERSION ${_runtime_vars_HWLOC_VERSION_LIST})
        get_target_property(HWLOC_LOCATION HWLOC::${HWLOC_VERSION} IMPORTED_LOCATION)
        get_filename_component(HWLOC_LOCATION_PATH ${HWLOC_LOCATION} DIRECTORY)
        list(APPEND LIBRARIES_PATH ${HWLOC_LOCATION_PATH})
    endforeach()

    if (WIN32)
        # Escape the list separators and switch to backslashes.
        string(REPLACE ";" "\;" LIBRARIES_PATH "${LIBRARIES_PATH}\;$ENV{PATH}")
        string(REPLACE "/" "\\" LIBRARIES_PATH "${LIBRARIES_PATH}")
        set(${_runtime_vars_ENV_EXTENSION_VARIABLE} "PATH=${LIBRARIES_PATH}" PARENT_SCOPE)
    else()
        string(REPLACE ";" ":" LIBRARIES_PATH "${LIBRARIES_PATH}:$ENV{LD_LIBRARY_PATH}")
        set(${_runtime_vars_ENV_EXTENSION_VARIABLE} "LD_LIBRARY_PATH=${LIBRARIES_PATH}" PARENT_SCOPE)
    endif()
endfunction()

# Registers the test "<NAME>_<SUFFIX>", but only when every HWLOC::<version> target named in
# HWLOC_REQUIRED_VERSION_LIST exists; otherwise returns without doing anything.
#
# Keyword arguments:
#   SUBDIR, NAME, SUFFIX              forwarded to tbb_add_test
#   TBBBIND_VERSION <name>            text expected after "oneTBB: TBBBIND" in the test output
#   HWLOC_REQUIRED_VERSION_LIST ...   required HWLOC::<name> targets; the first one is linked
#
# Reads SYSTEM_CONCURRENCY, which is not set here; tbb_add_tbbbind_test sets it before calling.
function(tbb_configure_hwloc_dependent_test)
    set(oneValueArgs SUBDIR NAME SUFFIX TBBBIND_VERSION)
    set(multiValueArgs HWLOC_REQUIRED_VERSION_LIST)
    cmake_parse_arguments(_hwloc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

    set(HWLOC_REQUIREMENTS_SATISFIED TRUE)
    foreach(HWLOC_TARGET ${_hwloc_test_HWLOC_REQUIRED_VERSION_LIST})
        if (NOT TARGET HWLOC::${HWLOC_TARGET})
            set(HWLOC_REQUIREMENTS_SATISFIED FALSE)
        endif()
    endforeach()
    if (NOT HWLOC_REQUIREMENTS_SATISFIED)
        return()
    endif()

    list(GET _hwloc_test_HWLOC_REQUIRED_VERSION_LIST 0 TEST_HWLOC_VERSION)
    tbb_add_test(
        SUBDIR ${_hwloc_test_SUBDIR}
        NAME ${_hwloc_test_NAME}
        SUFFIX ${_hwloc_test_SUFFIX}
        DEPENDENCIES TBB::tbb HWLOC::${TEST_HWLOC_VERSION}
    )

    _tbb_get_hwloc_runtime_vars(
        ENV_EXTENSION_VARIABLE HWLOC_RUNTIME_VARS
        HWLOC_VERSION_LIST ${_hwloc_test_HWLOC_REQUIRED_VERSION_LIST}
    )

    set_property(TEST ${_hwloc_test_NAME}_${_hwloc_test_SUFFIX} PROPERTY ENVIRONMENT "${HWLOC_RUNTIME_VARS}" TBB_VERSION=1 APPEND)
    set_tests_properties(${_hwloc_test_NAME}_${_hwloc_test_SUFFIX} PROPERTIES
        PASS_REGULAR_EXPRESSION "oneTBB: TBBBIND.*${_hwloc_test_TBBBIND_VERSION}"
        FAIL_REGULAR_EXPRESSION "Status:.*FAILURE"
    )

    # The tbbbind isn't loading on 32-bit Windows systems with more than 32 available hardware threads
    if (WIN32 AND CMAKE_SIZEOF_VOID_P EQUAL 4 AND SYSTEM_CONCURRENCY GREATER 32)
        set_tests_properties(${_hwloc_test_NAME}_${_hwloc_test_SUFFIX} PROPERTIES
            PASS_REGULAR_EXPRESSION "oneTBB: TBBBIND.*UNAVAILABLE"
            FAIL_REGULAR_EXPRESSION "Status:.*FAILURE"
        )
    else()
        target_compile_definitions(${_hwloc_test_NAME}_${_hwloc_test_SUFFIX} PRIVATE __TBB_HWLOC_VALID_ENVIRONMENT)
    endif()

    add_dependencies(test_suite_arena_constraints ${_hwloc_test_NAME}_${_hwloc_test_SUFFIX})
endfunction()

# Registers the tbbbind test <NAME> from <SUBDIR> and, where the environment allows it, one
# extra "<NAME>_<case>" test per HWLOC combination listed in HWLOC_TEST_CASES below.
#
# Keyword arguments:
#   SUBDIR <dir>   forwarded to tbb_add_test
#   NAME <name>    base test name
function(tbb_add_tbbbind_test)
    set(oneValueArgs SUBDIR NAME)
    # This function takes no multi-value keywords; set the list explicitly so that a value
    # left over in the calling scope cannot leak into the parse below.
    set(multiValueArgs "")
    cmake_parse_arguments(_tbbbind_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

    tbb_add_test(SUBDIR ${_tbbbind_test_SUBDIR} NAME ${_tbbbind_test_NAME} DEPENDENCIES TBB::tbb)
    add_dependencies(test_suite_arena_constraints ${_tbbbind_test_NAME})
    set_property(TEST ${_tbbbind_test_NAME} PROPERTY ENVIRONMENT TBB_VERSION=1 APPEND)

    # Handle the case when HWLOC was found using pkg-config
    if (NOT DEFINED HWLOC_TARGET_EXPLICITLY_DEFINED AND TARGET PkgConfig::HWLOC)
        set_tests_properties(${_tbbbind_test_NAME} PROPERTIES
            PASS_REGULAR_EXPRESSION "oneTBB: TBBBIND.*${TBBBIND_LIBRARY_NAME}"
            FAIL_REGULAR_EXPRESSION "Status: FAILURE!"
        )
        target_link_libraries(${_tbbbind_test_NAME} PRIVATE PkgConfig::HWLOC)
        target_compile_definitions(${_tbbbind_test_NAME} PRIVATE __TBB_HWLOC_VALID_ENVIRONMENT)
        return()
    endif()

    # Disable all HWLOC dependent tests in case of unsupported environment.
    if (TBB_WINDOWS_DRIVER OR ANDROID_PLATFORM OR APPLE OR NOT BUILD_SHARED_LIBS)
        return()
    endif()

    # SYSTEM_CONCURRENCY is read by tbb_configure_hwloc_dependent_test, called further down.
    ProcessorCount(SYSTEM_CONCURRENCY)

    # Make sure fake HWLOC is found before system one
    if (UNIX AND NOT TARGET HWLOC::hwloc_stub AND NOT TBB_TCM_TESTING)
        # The idea is to produce fake HWLOC library and link it with the test
        # to ensure that TBBBind won't be loaded in order to simulate the case
        # when real HWLOC is not present in the system.
        file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/stubhwloc.cpp "void hwloc_stub(){}")
        add_library(hwloc_stub SHARED ${CMAKE_CURRENT_BINARY_DIR}/stubhwloc.cpp)
        add_library(HWLOC::hwloc_stub ALIAS hwloc_stub)
        set_target_properties(hwloc_stub PROPERTIES
            OUTPUT_NAME "hwloc"
            SOVERSION 15
        )
    endif()

    set_tests_properties(${_tbbbind_test_NAME} PROPERTIES
        PASS_REGULAR_EXPRESSION "oneTBB: TBBBIND.*UNAVAILABLE"
        FAIL_REGULAR_EXPRESSION "Status:.*FAILURE"
    )

    if (TARGET HWLOC::hwloc_stub)
        _tbb_get_hwloc_runtime_vars(
            ENV_EXTENSION_VARIABLE HWLOC_RUNTIME_VARS
            HWLOC_VERSION_LIST "hwloc_stub"
        )
        target_link_libraries(${_tbbbind_test_NAME} PRIVATE HWLOC::hwloc_stub)
        set_property(TEST ${_tbbbind_test_NAME} PROPERTY ENVIRONMENT "${HWLOC_RUNTIME_VARS}" APPEND)
    endif()

    # Each status line is printed once: the *_SHOWN flag is raised in the caller's scope.
    if (TARGET HWLOC::hwloc_2_5 AND NOT HWLOC_2_5_TESTS_STATUS_SHOWN)
        message(STATUS "HWLOC 2.5 dependent tests were enabled.")
        set(HWLOC_2_5_TESTS_STATUS_SHOWN TRUE PARENT_SCOPE)
    endif()
    if (TARGET HWLOC::hwloc_2 AND NOT HWLOC_2_TESTS_STATUS_SHOWN)
        message(STATUS "HWLOC 2 dependent tests were enabled.")
        set(HWLOC_2_TESTS_STATUS_SHOWN TRUE PARENT_SCOPE)
    endif()
    if (TARGET HWLOC::hwloc_1_11 AND NOT HWLOC_1_11_TESTS_STATUS_SHOWN)
        message(STATUS "HWLOC 1.11 dependent tests were enabled.")
        set(HWLOC_1_11_TESTS_STATUS_SHOWN TRUE PARENT_SCOPE)
    endif()

    # The tables below are assigned with set() rather than list(APPEND): the position of a
    # case in HWLOC_TEST_CASES selects its HWLOC_TEST_CASE_<index>_VARS entry, so the lists
    # must not start with anything inherited from the calling scope.
    set(HWLOC_TEST_CASES
        hwloc_2_5
        hwloc_2
        hwloc_1_11
        hwloc_2_5_hwloc_2
        hwloc_2_5_hwloc_1_11
        hwloc_2_hwloc_1_11
        hwloc_2_5_hwloc_2_hwloc_1_11
        incompatible_hwlocs_1_11_vs_2_5
        incompatible_hwlocs_1_11_vs_2
    )

    # Per case: <expected tbbbind library> "<comma-separated HWLOC targets that must exist>"
    set(HWLOC_TEST_CASE_0_VARS tbbbind_2_5 "hwloc_2_5")
    set(HWLOC_TEST_CASE_1_VARS tbbbind_2   "hwloc_2")
    set(HWLOC_TEST_CASE_2_VARS tbbbind     "hwloc_1_11")
    set(HWLOC_TEST_CASE_3_VARS tbbbind_2_5 "hwloc_2_5,hwloc_2")
    set(HWLOC_TEST_CASE_4_VARS tbbbind_2_5 "hwloc_2_5,hwloc_1_11")
    set(HWLOC_TEST_CASE_5_VARS tbbbind_2   "hwloc_2,hwloc_1_11")
    set(HWLOC_TEST_CASE_6_VARS tbbbind_2_5 "hwloc_2_5,hwloc_2,hwloc_1_11")
    set(HWLOC_TEST_CASE_7_VARS tbbbind_2_5 "hwloc_1_11,hwloc_2_5")
    set(HWLOC_TEST_CASE_8_VARS tbbbind_2   "hwloc_1_11,hwloc_2")

    foreach(TEST_CASE ${HWLOC_TEST_CASES})
        list(FIND HWLOC_TEST_CASES ${TEST_CASE} TEST_CASE_INDEX)
        list(GET HWLOC_TEST_CASE_${TEST_CASE_INDEX}_VARS 0 TEST_CASE_TBBBIND_EXPECTED_VERSION)
        list(GET HWLOC_TEST_CASE_${TEST_CASE_INDEX}_VARS 1 TEST_CASE_TBBBIND_HWLOC_REQUIRED_VERSIONS)
        # Turn the comma-separated requirement string into a CMake list.
        string(REPLACE "," ";" TEST_CASE_TBBBIND_HWLOC_REQUIRED_VERSIONS "${TEST_CASE_TBBBIND_HWLOC_REQUIRED_VERSIONS}")

        tbb_configure_hwloc_dependent_test(
            SUBDIR ${_tbbbind_test_SUBDIR}
            NAME ${_tbbbind_test_NAME}
            SUFFIX ${TEST_CASE}
            TBBBIND_VERSION ${TEST_CASE_TBBBIND_EXPECTED_VERSION}
            HWLOC_REQUIRED_VERSION_LIST ${TEST_CASE_TBBBIND_HWLOC_REQUIRED_VERSIONS}
        )
    endforeach()
endfunction()

# Copy libraries to test folder to make it visible during tests execution if external TBB is tested.
# TODO: check and update for multi-config generators.
if (TBB_FOUND)
    # set() rather than list(APPEND): the list is a temporary and must start empty.
    set(_tbb_test_components tbb tbbmalloc tbbmalloc_proxy tbbbind tbbbind_2_0 tbbbind_2_5)
    foreach(_component ${_tbb_test_components})
        if (TARGET TBB::${_component})
            get_property(${_component}_lib_file_location TARGET TBB::${_component} PROPERTY LOCATION)
            file(COPY ${${_component}_lib_file_location} DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
            # get_property() stores its result in a normal variable, which "unset(... CACHE)"
            # alone does not remove; clear the normal variable too.
            unset(${_component}_lib_file_location)
            unset(${_component}_lib_file_location CACHE)
        endif()
    endforeach()
    unset(_component)
    unset(_tbb_test_components)
endif()

# Find Intel(R) Software Development Emulator to run test_mutex and conformance_mutex for coverage
set(_sde_find_name sde)
if (UNIX AND TBB_ARCH EQUAL 64)
    set(_sde_find_name sde64)
endif()
find_program(SDE_EXE
             NAMES ${_sde_find_name}
             PATHS ENV PATH
             PATH_SUFFIXES bin)
unset(_sde_find_name)

# Common target for the tbbbind related tests
add_custom_target(test_suite_arena_constraints)

# Check support for --no-as-needed linker option
if (MINGW OR NOT WIN32)
    include(CheckCXXSourceCompiles)
    # The flag is passed through CMAKE_REQUIRED_LIBRARIES so that it reaches the link step of the probe.
    set(CMAKE_REQUIRED_LIBRARIES "-Wl,--no-as-needed")
    check_cxx_source_compiles("int main(int, char*[]) { return 0; }" LINKER_HAS_NO_AS_NEEDED)
    unset(CMAKE_REQUIRED_LIBRARIES)
endif()

if (TARGET TBB::tbb)
    # Define the tests
    tbb_add_test(SUBDIR tbb NAME test_tick_count DEPENDENCIES TBB::tbb)
    tbb_add_test(SUBDIR tbb NAME test_allocators DEPENDENCIES TBB::tbb)
    if (NOT TBB_TCM_TESTING)
        tbb_add_test(SUBDIR tbb NAME test_arena_priorities DEPENDENCIES TBB::tbb)
    endif()
    tbb_add_test(SUBDIR tbb NAME test_dynamic_link DEPENDENCIES TBB::tbb)
    if (LINKER_HAS_NO_AS_NEEDED)
        # The linker may not detect a dependency on pthread in static variable constructors.
target_link_libraries(test_dynamic_link PRIVATE "-Wl,--no-as-needed")
endif()

if (APPLE OR ANDROID_PLATFORM)
    target_link_libraries(test_dynamic_link PRIVATE -rdynamic)
endif()

if (WIN32 AND NOT TBB_TCM_TESTING)
    tbb_add_test(SUBDIR tbb NAME test_numa_dist DEPENDENCIES TBB::tbb)
endif()

# The tests below all live in the "tbb" subdirectory and link only against TBB::tbb,
# so they are registered from a single list. Registration order is the list order.
foreach(_tbb_case IN ITEMS
        test_collaborative_call_once
        test_concurrent_lru_cache
        test_concurrent_unordered_map
        test_concurrent_unordered_set
        test_concurrent_map
        test_concurrent_set
        test_concurrent_priority_queue
        test_partitioner
        test_parallel_for
        test_parallel_for_each
        test_parallel_reduce
        test_parallel_sort
        test_parallel_invoke
        test_parallel_scan
        test_parallel_pipeline
        test_eh_algorithms
        test_blocked_range
        test_concurrent_vector
        test_task_group
        test_concurrent_hash_map
        test_task_arena
        test_parallel_phase
        test_enumerable_thread_specific)
    tbb_add_test(SUBDIR tbb NAME ${_tbb_case} DEPENDENCIES TBB::tbb)
endforeach()
unset(_tbb_case)
# Tests from the "tbb" subdirectory that link only against TBB::tbb.
# Registration order is the list order.
foreach(_tbb_case IN ITEMS
        test_concurrent_queue
        test_resumable_tasks
        test_mutex
        test_function_node
        test_multifunction_node
        test_broadcast_node
        test_buffer_node
        test_composite_node
        test_continue_node
        test_eh_flow_graph
        test_flow_graph
        test_flow_graph_priorities
        test_flow_graph_whitebox
        test_indexer_node
        test_join_node
        test_join_node_key_matching
        test_join_node_key_matching_n_args
        test_join_node_msg_key_matching
        test_join_node_msg_key_matching_n_args
        test_join_node_preview
        test_limiter_node
        test_priority_queue_node
        test_queue_node
        test_sequencer_node
        test_split_node
        test_tagged_msg
        test_overwrite_node)
    tbb_add_test(SUBDIR tbb NAME ${_tbb_case} DEPENDENCIES TBB::tbb)
endforeach()
unset(_tbb_case)
foreach(_tbb_case IN ITEMS
        test_write_once_node
        test_async_node
        test_input_node
        test_profiling
        test_concurrent_queue_whitebox
        test_intrusive_list
        test_semaphore
        test_environment_whitebox
        test_hw_concurrency
        test_eh_thread)
    tbb_add_test(SUBDIR tbb NAME ${_tbb_case} DEPENDENCIES TBB::tbb)
endforeach()
if (NOT TBB_TCM_TESTING)
    tbb_add_test(SUBDIR tbb NAME test_global_control DEPENDENCIES TBB::tbb)
endif()
tbb_add_test(SUBDIR tbb NAME test_task DEPENDENCIES TBB::tbb)

# Two extra runs of the test_task executable that only inspect its output for the TCM banner.
if (TBB_TCM_TESTING AND NOT WINDOWS_STORE AND NOT TBB_WINDOWS_DRIVER)
    add_test(NAME test_tcm_enabled COMMAND test_task --force-colors=1 WORKING_DIRECTORY ${TBB_TEST_WORKING_DIRECTORY})
    set_tests_properties(test_tcm_enabled PROPERTIES
        ENVIRONMENT "TBB_VERSION=1;TCM_ENABLE=1"
        PASS_REGULAR_EXPRESSION "TCM: VERSION.*"
        FAIL_REGULAR_EXPRESSION "TCM: TCM *disabled"
    )
    add_test(NAME test_tcm_disabled COMMAND test_task --force-colors=1 WORKING_DIRECTORY ${TBB_TEST_WORKING_DIRECTORY})
    set_tests_properties(test_tcm_disabled PROPERTIES
        ENVIRONMENT "TBB_VERSION=1;TCM_ENABLE=0"
        PASS_REGULAR_EXPRESSION "TCM: TCM *disabled"
    )
endif()

if (TBB_FUZZ_TESTING AND NOT WIN32)
    if (NOT ((CMAKE_CXX_COMPILER_ID STREQUAL Clang) OR (CMAKE_CXX_COMPILER_ID STREQUAL IntelLLVM)))
        message(FATAL_ERROR "Fuzzing requires Clang or IntelLLVM compiler.")
    endif()
    tbb_add_test(SUBDIR tbb NAME test_fuzzing)
    add_dependencies(test_fuzzing test_task)
    # NOTE(review): the generator expression in CMD was reduced to a bare "$" in this copy of
    # the file; restored as the path of the test_task executable - confirm against upstream.
    target_compile_definitions(test_fuzzing PRIVATE CMD="$<TARGET_FILE:test_task> >/dev/null 2>&1")
    target_link_options(test_fuzzing PRIVATE -fsanitize=fuzzer)
endif()

tbb_add_test(SUBDIR tbb NAME test_concurrent_monitor DEPENDENCIES TBB::tbb)
tbb_add_test(SUBDIR tbb NAME test_scheduler_mix DEPENDENCIES TBB::tbb)

# test_handle_perror
tbb_add_test(SUBDIR tbb NAME test_handle_perror)
# NOTE(review): the include-directory expression was reduced to a bare "$" in this copy of
# the file; restored as TBB::tbb's interface include directories - confirm against upstream.
target_include_directories(test_handle_perror PRIVATE
    $<TARGET_PROPERTY:TBB::tbb,INTERFACE_INCLUDE_DIRECTORIES>
)

# HWLOC related test
if (NOT TBB_EMSCRIPTEN)
    tbb_add_tbbbind_test(SUBDIR tbb NAME test_arena_constraints)
endif()

if ((NOT "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "mips") AND (NOT TBB_EMSCRIPTEN))
    # TODO: Fix for MIPS
    tbb_add_test(SUBDIR tbb NAME test_tbb_fork DEPENDENCIES TBB::tbb)
endif()

tbb_add_test(SUBDIR tbb NAME test_tbb_header DEPENDENCIES TBB::tbb)
target_sources(test_tbb_header PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/tbb/test_tbb_header_secondary.cpp)

if (TBB_OPENMP_FLAG AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "(mips)")
    tbb_add_test(SUBDIR tbb NAME test_openmp DEPENDENCIES TBB::tbb)
    set_target_properties(test_openmp PROPERTIES COMPILE_FLAGS ${TBB_OPENMP_FLAG})
    if (NOT TBB_OPENMP_NO_LINK_FLAG)
        set_target_properties(test_openmp PROPERTIES LINK_FLAGS ${TBB_OPENMP_FLAG})
    endif()
endif()

# Define the conformance tests
foreach(_tbb_case IN ITEMS
        conformance_tick_count
        conformance_allocators
        conformance_mutex
        conformance_task_group
        conformance_task_group_context
        conformance_task_arena
        conformance_collaborative_call_once
        conformance_concurrent_lru_cache
        conformance_concurrent_unordered_map
        conformance_concurrent_unordered_set
        conformance_concurrent_map
        conformance_concurrent_set
        conformance_concurrent_priority_queue
        conformance_parallel_for
        conformance_parallel_for_each
        conformance_parallel_reduce
        conformance_parallel_scan
        conformance_parallel_sort
        conformance_parallel_pipeline
        conformance_parallel_invoke
        conformance_blocked_range
        conformance_blocked_range2d
        conformance_blocked_range3d
        conformance_blocked_nd_range
        conformance_concurrent_vector)
    tbb_add_test(SUBDIR conformance NAME ${_tbb_case} DEPENDENCIES TBB::tbb)
endforeach()
if (NOT TBB_TCM_TESTING)
    tbb_add_test(SUBDIR conformance NAME conformance_global_control DEPENDENCIES TBB::tbb)
endif()
foreach(_tbb_case IN ITEMS
        conformance_concurrent_hash_map
        conformance_enumerable_thread_specific
        conformance_combinable
        conformance_concurrent_queue
        conformance_resumable_tasks
        conformance_version)
    tbb_add_test(SUBDIR conformance NAME ${_tbb_case} DEPENDENCIES TBB::tbb)
endforeach()

# functional nodes conformance
foreach(_tbb_case IN ITEMS
        conformance_function_node
        conformance_multifunction_node
        conformance_input_node
        conformance_continue_node
        conformance_async_node)
    tbb_add_test(SUBDIR conformance NAME ${_tbb_case} DEPENDENCIES TBB::tbb)
endforeach()

# buffering nodes conformance
foreach(_tbb_case IN ITEMS
        conformance_overwrite_node
        conformance_write_once_node
        conformance_buffer_node
        conformance_queue_node
        conformance_priority_queue_node
        conformance_sequencer_node)
    tbb_add_test(SUBDIR conformance NAME ${_tbb_case} DEPENDENCIES TBB::tbb)
endforeach()

# service nodes conformance
foreach(_tbb_case IN ITEMS
        conformance_limiter_node
        conformance_broadcast_node
        conformance_composite_node
        conformance_indexer_node
        conformance_split_node
        conformance_join_node)
    tbb_add_test(SUBDIR conformance NAME ${_tbb_case} DEPENDENCIES TBB::tbb)
endforeach()
unset(_tbb_case)

# flow graph auxiliary conformance
# TODO: add conformance tests for graph_node, continue_msg, tagged_msg, copy_body, input_port, output_port, make_edge, remove_edge
tbb_add_test(SUBDIR conformance NAME conformance_graph DEPENDENCIES TBB::tbb)

# HWLOC related conformance
if (NOT TBB_EMSCRIPTEN)
    tbb_add_tbbbind_test(SUBDIR conformance NAME conformance_arena_constraints)
endif()

# LINK_OPTIONS property first appeared in 3.13 version of the CMake.
# The check is written as "NOT ... VERSION_LESS 3.13" so that 3.13.0 itself qualifies;
# "VERSION_GREATER 3.13" is false for exactly 3.13.0.
if (MSVC AND BUILD_SHARED_LIBS AND NOT CMAKE_VERSION VERSION_LESS 3.13)
    tbb_add_test(SUBDIR tbb NAME test_implicit_linkage_on_windows)
    # TODO: consider setting environment instead of passing additional
    # compiler and linker options
    # NOTE(review): both generator expressions below were reduced to a bare "$" in this copy
    # of the file; restored as TBB::tbb's include directories and the directory of its
    # linker file - confirm against upstream.
    target_include_directories(test_implicit_linkage_on_windows PRIVATE
        $<TARGET_PROPERTY:TBB::tbb,INTERFACE_INCLUDE_DIRECTORIES>)
    set_target_properties(test_implicit_linkage_on_windows PROPERTIES
        LINK_OPTIONS LINKER:/LIBPATH:$<TARGET_LINKER_FILE_DIR:TBB::tbb>)
    add_dependencies(test_implicit_linkage_on_windows TBB::tbb)
endif()
endif() # TARGET TBB::tbb

if (TARGET TBB::tbbmalloc)
    # TBB allocator tests
    if (NOT "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "mips")
        if (NOT TBB_EMSCRIPTEN)
            # Define TBB malloc tests
            tbb_add_test(SUBDIR tbbmalloc NAME test_scalable_allocator DEPENDENCIES TBB::tbbmalloc)
            tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_pools DEPENDENCIES TBB::tbbmalloc)
            tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_init_shutdown DEPENDENCIES TBB::tbbmalloc)
            tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_regression DEPENDENCIES TBB::tbbmalloc)
            if (TARGET TBB::tbb)
                tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_shutdown_hang DEPENDENCIES TBB::tbb TBB::tbbmalloc)
            endif()

            if (NOT (WINDOWS_STORE OR TBB_WINDOWS_DRIVER))
                # TODO: Consider adding following tests on WINDOWS_STORE and TBB_WINDOWS_DRIVER platforms
                tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_compliance DEPENDENCIES TBB::tbbmalloc)
                # Each pair below builds the helper library "_<name>" and then the test that links it.
                tbb_add_lib_test(SUBDIR tbbmalloc NAME test_malloc_used_by_lib DEPENDENCIES TBB::tbbmalloc)
                tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_used_by_lib DEPENDENCIES _test_malloc_used_by_lib)
                tbb_add_lib_test(SUBDIR tbbmalloc NAME test_malloc_lib_unload)
                tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_lib_unload DEPENDENCIES _test_malloc_lib_unload)
            endif()

            enable_language(C)
            tbb_add_c_test(SUBDIR tbbmalloc NAME test_malloc_pure_c DEPENDENCIES TBB::tbbmalloc)
        endif()

        # ----------------------------------------------------------------------------------------
        # Whitebox testing
        if (NOT TBB_EMSCRIPTEN)
            add_executable(test_malloc_whitebox tbbmalloc/test_malloc_whitebox.cpp)

            target_include_directories(test_malloc_whitebox
                PRIVATE
                ${CMAKE_CURRENT_SOURCE_DIR}/../include
                ${CMAKE_CURRENT_SOURCE_DIR}/..
                ${CMAKE_CURRENT_SOURCE_DIR})
            target_compile_definitions(test_malloc_whitebox PRIVATE __TBBMALLOC_BUILD)
            target_compile_options(test_malloc_whitebox
                PRIVATE
                ${TBB_CXX_STD_FLAG}
                ${TBB_WARNING_SUPPRESS}
                ${TBB_TEST_COMPILE_FLAGS}
                ${TBB_COMMON_COMPILE_FLAGS}
                ${TBBMALLOC_LIB_COMPILE_FLAGS}
            )
            if (ANDROID_PLATFORM)
                add_test(NAME test_malloc_whitebox
                         COMMAND ${CMAKE_COMMAND}
                                 -DBINARIES_PATH=${CMAKE_LIBRARY_OUTPUT_DIRECTORY}
                                 -DTEST_NAME=test_malloc_whitebox
                                 -P ${PROJECT_SOURCE_DIR}/cmake/android/test_launcher.cmake)
            else()
                add_test(NAME test_malloc_whitebox COMMAND test_malloc_whitebox --force-colors=1)
            endif()
            if (COMMAND target_link_options)
                target_link_options(test_malloc_whitebox PRIVATE ${TBB_COMMON_LINK_FLAGS})
            else()
                target_link_libraries(test_malloc_whitebox PRIVATE ${TBB_COMMON_LINK_FLAGS})
            endif()
            target_link_libraries(test_malloc_whitebox PRIVATE Threads::Threads ${TBB_COMMON_LINK_LIBS})
        endif()

        # ------------------------------------------------------------------------------------------

        # Define TBB malloc conformance tests
        # tbbmalloc_add_test(conformance conformance_scalable_allocator)

        # Switch a static MSVC runtime selection to the matching DLL runtime for what follows.
        if ("${CMAKE_MSVC_RUNTIME_LIBRARY}" STREQUAL MultiThreaded OR "${CMAKE_MSVC_RUNTIME_LIBRARY}" STREQUAL MultiThreadedDebug)
            if ("${CMAKE_MSVC_RUNTIME_LIBRARY}" STREQUAL MultiThreaded)
                set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDLL)
            else()
                set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDebugDLL)
            endif()
        endif()

        # Thread Sanitizer overloads memory management routines that conflicts with tbbmalloc_proxy.
if (BUILD_SHARED_LIBS AND NOT TBB_SANITIZE MATCHES "thread" AND TBBMALLOC_PROXY_BUILD AND NOT MSVC_CXX_ARCHITECTURE_ID MATCHES "ARM64")
    # Define TBB malloc proxy tests
    # The helper library _test_malloc_atexit is always built; the test that links it is
    # skipped when TBB_EMSCRIPTEN is set.
    tbb_add_lib_test(SUBDIR tbbmalloc NAME test_malloc_atexit DEPENDENCIES TBB::tbbmalloc_proxy TBB::tbbmalloc)
    if (NOT TBB_EMSCRIPTEN)
        tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_atexit DEPENDENCIES TBB::tbbmalloc_proxy TBB::tbbmalloc _test_malloc_atexit)
    endif()
    tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_overload DEPENDENCIES TBB::tbbmalloc_proxy)
    tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_overload_disable DEPENDENCIES TBB::tbbmalloc_proxy TBB::tbbmalloc) # safer_msize call need to be available
    tbb_add_test(SUBDIR tbbmalloc NAME test_malloc_new_handler DEPENDENCIES TBB::tbbmalloc_proxy)
endif()
# The next two endif() calls close conditionals that were opened before this point in the file.
endif()
endif()

# Drop the flags that tbb_add_tbbbind_test raises so each HWLOC status message is printed once.
unset(HWLOC_2_5_TESTS_STATUS_SHOWN)
unset(HWLOC_2_TESTS_STATUS_SHOWN)
unset(HWLOC_1_11_TESTS_STATUS_SHOWN)
================================================ FILE: third-party/tbb/test/common/allocator_overload.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/

// Defines the preprocessor conditions under which the malloc-overload tests are skipped.

#ifndef tbb_test_harness_allocator_overload_H
#define tbb_test_harness_allocator_overload_H

#include "config.h"
#include "../src/tbbmalloc_proxy/proxy.h" // for MALLOC_UNIXLIKE_OVERLOAD_ENABLED, MALLOC_ZONE_OVERLOAD_ENABLED
#include "oneapi/tbb/detail/_config.h" // for __TBB_WIN8UI_SUPPORT

// Skip configurations with unsupported system malloc overload:
// skip unsupported MSVCs, WIN8UI and MINGW (it doesn't define _MSC_VER),
// no support for MSVC 2015 and greater in debug for now,
// don't use defined(_MSC_VER), because result of using defined() in macro expansion is undefined
#define MALLOC_WINDOWS_OVERLOAD_ENABLED ((_WIN32||_WIN64) && !__TBB_WIN8UI_SUPPORT && _MSC_VER >= 1500 && !(_MSC_VER >= 1900 && _DEBUG))

// Skip configurations with unsupported system malloc overload:
// * overload via linking with -lmalloc_proxy is broken in offload,
// as the library is loaded too late in that mode,
// * LD_PRELOAD mechanism is broken in offload
//
// Non-zero only when none of the three *_OVERLOAD_ENABLED conditions holds.
#define HARNESS_SKIP_TEST (!MALLOC_WINDOWS_OVERLOAD_ENABLED && !MALLOC_UNIXLIKE_OVERLOAD_ENABLED && !MALLOC_ZONE_OVERLOAD_ENABLED)

#endif // tbb_test_harness_allocator_overload_H
================================================ FILE: third-party/tbb/test/common/allocator_stl_test_common.h ================================================ /* Copyright (c) 2005-2021 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ // Tests for compatibility with the host's STL.
#ifndef __TBB_test_common_allocator_stl_test_H_
#define __TBB_test_common_allocator_stl_test_H_

#include "common/test.h"

// NOTE(review): in this dump the angle-bracket contents (template parameter lists,
// template arguments and <header> names) appear to have been stripped. The code
// tokens below are kept exactly as found -- confirm against the upstream file.

//! Sequence-container check: appends i*i for i in [0, iter_count) with push_back,
//! reads the values back through a const_iterator, then resizes the container.
template void TestSequence(const typename Container::allocator_type &a) {
    constexpr auto iter_count = 1000;
    Container c(a);
    for(int i = 0; i < iter_count; ++i){
        c.push_back(i * i);
    }
    typename Container::const_iterator p = c.begin();
    for(int i = 0; i < iter_count; ++i) {
        REQUIRE(*p == i*i); ++p;
    }
    // regression test against compilation error for GCC 4.6.2
    c.resize(1000);
}

//! Set check: inserts 3*i for i in [0, 100), then requires that erase(i) for
//! i in [0, 300) removes exactly one element when i is a multiple of 3 and none otherwise.
template void TestSet(const typename Set::allocator_type &a) {
    Set s(typename Set::key_compare(), a);
    using value_type = typename Set::value_type;
    for(int i = 0; i < 100; ++i)
        s.insert(value_type(3 * i));
    for( int i = 0; i < 300; ++i ) {
        REQUIRE(s.erase(i) == size_t(i % 3 == 0));
    }
}

//! Map check: inserts (i, i*i) for i in [0, 100) and requires find(i) to yield i*i.
template void TestMap(const typename Map::allocator_type &a) {
    Map m(typename Map::key_compare(), a);
    using value_type = typename Map::value_type;
    for(int i = 0; i < 100; ++i)
        m.insert(value_type(i,i*i));
    for(int i=0; i < 100; ++i)
        REQUIRE(m.find(i)->second == i * i);
}

#include
#include
#include
#include
#include

//! Value wrapper whose copy constructor and copy assignment fail the test via
//! REQUIRE_MESSAGE(false, ...); only the move operations transfer my_value.
struct MoveOperationTracker {
    int my_value;

    MoveOperationTracker(int value = 0) : my_value(value) {}
    // Copying is treated as a test failure.
    MoveOperationTracker(const MoveOperationTracker&) {
        REQUIRE_MESSAGE(false, "Copy constructor is called");
    }
    MoveOperationTracker(MoveOperationTracker&& m) noexcept : my_value( m.my_value ) {
    }
    // Copy assignment is treated as a test failure.
    MoveOperationTracker& operator=(MoveOperationTracker const&) {
        REQUIRE_MESSAGE(false, "Copy assignment operator is called");
        return *this;
    }
    MoveOperationTracker& operator=(MoveOperationTracker&& m) noexcept {
        my_value = m.my_value;
        return *this;
    }

    // Comparisons used by the REQUIRE(*p == i*i) check in TestSequence.
    bool operator==(int value) const {
        return my_value == value;
    }

    bool operator==(const MoveOperationTracker& m) const {
        return my_value == m.my_value;
    }
};

//! Runs the sequence, set and map checks above with allocators rebound from the given one.
//! \param a allocator instance handed to every container under test (default-constructed if omitted)
template void TestAllocatorWithSTL(const Allocator &a = Allocator()) {

    // Allocator type conversion section
    using Ai = typename std::allocator_traits::template rebind_alloc;
    using Acii = typename std::allocator_traits::template rebind_alloc >;
#if _MSC_VER && _CPPLIB_VER < 650
    using Aci = typename std::allocator_traits::template rebind_alloc;
    using Aii = typename std::allocator_traits::template rebind_alloc >;
#endif // _MSC_VER

    // Sequenced containers
    TestSequence >(a);
    TestSequence >(a);
    TestSequence >(a);

    // Same sequence checks with an allocator rebound through the Amot alias
    using Amot = typename std::allocator_traits::template rebind_alloc;
    TestSequence >(a);
    TestSequence >(a);
    TestSequence >(a);

    // Associative containers
    TestSet, Ai> >(a);
    TestSet, Ai> >(a);
    TestMap, Acii> >(a);
    TestMap, Acii> >(a);

#if _MSC_VER && _CPPLIB_VER < 650
    // Test compatibility with Microsoft's implementation of std::allocator for some cases that
    // are undefined according to the ISO standard but permitted by Microsoft.
    TestSequence >(a);
#if _CPPLIB_VER>=500
    TestSequence >(a);
#endif
    TestSequence >(a);
    TestSet, Aci> >(a);
    TestMap, Aii> >(a);
    TestMap, Acii> >(a);
    TestMap, Aii> >(a);
    TestMap, Acii> >(a);
#endif /* _MSC_VER */
}

#endif // __TBB_test_common_allocator_stl_test_H_

================================================
FILE: third-party/tbb/test/common/allocator_test_common.h
================================================
/*
    Copyright (c) 2005-2022 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

// Basic testing of an allocator
// Tests against requirements in 20.1.5 of ISO C++ Standard (1998).
// Does not check for thread safety or false sharing issues.
//
// Tests for compatibility with the host's STL are in
// test_Allocator_STL.h.
// Those tests are in a separate file
// because they bring in lots of STL headers, and the tests here
// are supposed to work in the absence of STL.

#ifndef __TBB_test_common_allocator_test_common_H_
#define __TBB_test_common_allocator_test_common_H_

#include "common/test.h"
#include "common/utils.h"
#include //for std::pair
#include

// NOTE(review): angle-bracket contents (template parameter lists, <header> names)
// appear to have been stripped from this dump; code tokens are kept as found.

//! Compile-time error if x and y have different types
template void AssertSameType( const T& /*x*/, const T& /*y*/ ) {}

//! The function to zero-initialize arrays; useful to avoid warnings
//! \param array start of the storage to clear
//! \param n     number of elements; sizeof(T)*n bytes are set to zero
template void zero_fill(void* array, size_t n) {
    memset(array, 0, sizeof(T)*n);
}

//! Trait read by Body::check_allocate; the primary template reports false.
template struct is_zero_filling {
    static const bool value = false;
};

//! Count of live Foo objects: incremented by Foo's constructors and assignment,
//! decremented by its destructor.
int NumberOfFoo;

//! Test payload holding N elements of T that tracks its lifetime through NumberOfFoo.
template struct Foo {
    T foo_array[N];
    Foo() {
        zero_fill(foo_array, N);
        ++NumberOfFoo;
    }
    Foo( const Foo& x ) {
        *this = x; //Internal call of assignment
    }
    // Copies the array and increments NumberOfFoo. The copy constructor relies on
    // that increment; a plain assignment between live objects increments it as well.
    Foo& operator=( const Foo& x ) {
        for (size_t i = 0; i < N; i++)
            foo_array[i] = x.foo_array[i];
        ++NumberOfFoo;
        return *this;
    }

    ~Foo() {
        --NumberOfFoo;
    }
};

//! Deterministic byte pattern derived from (j, k); used to fill allocated memory and verify it later.
inline char PseudoRandomValue( size_t j, size_t k ) {
    return char(j*3 ^ j>>4 ^ k);
}

#if __APPLE__
#include
#include

// A RAII class to disable stderr in a certain scope. It's not thread-safe.
class DisableStderr {
    int stderrCopy; // duplicate of the original stderr descriptor, restored in the destructor
    // Makes fd the new STDERR_FILENO and then closes fd itself.
    static void dupToStderrAndClose(int fd) {
        int ret = dup2(fd, STDERR_FILENO); // close current stderr
        REQUIRE(ret != -1);
        ret = close(fd);
        REQUIRE(ret != -1);
    }
public:
    // Saves a copy of stderr, then points STDERR_FILENO at /dev/null.
    DisableStderr() {
        int devNull = open("/dev/null", O_WRONLY);
        REQUIRE(devNull != -1);
        stderrCopy = dup(STDERR_FILENO);
        REQUIRE(stderrCopy != -1);
        dupToStderrAndClose(devNull);
    }
    // Restores the saved stderr descriptor.
    ~DisableStderr() {
        dupToStderrAndClose(stderrCopy);
    }
};
#endif

//!
//! T is type and A is allocator for that type
//! Exercises the member typedefs and member functions of allocator A directly
//! (pointer/reference typedefs, address, max_size, construct/destroy, rebind).
//! NOTE(review): the template parameter list and the rebind argument appear to have
//! been stripped from this dump; code tokens are kept as found.
template void TestBrokenAllocator(A& a) {
    T x;
    const T cx = T();

    // See Table 32 in ISO ++ Standard
    typename A::pointer px = &x;
    typename A::const_pointer pcx = &cx;

    typename A::reference rx = x;
    REQUIRE(&rx == &x);

    typename A::const_reference rcx = cx;
    REQUIRE(&rcx==&cx);

    typename A::value_type v = x;

    // Decrementing zero must wrap around to a positive value for an unsigned type
    typename A::size_type size;
    size = 0;
    --size;
    REQUIRE_MESSAGE(size > 0, "not an unsigned integral type?");

    // Decrementing zero must go negative for a signed type
    typename A::difference_type difference;
    difference = 0;
    --difference;
    REQUIRE_MESSAGE(difference<0, "not an signed integral type?");

    // "rebind" tested by our caller

    REQUIRE(a.address(rx) == px);

    REQUIRE(a.address(rcx) == pcx);

    // Test "a.max_size()"
    AssertSameType(a.max_size(), typename A::size_type(0));
    // Following assertion catches case where max_size() is so large that computation of
    // number of bytes for such an allocation would overflow size_type.
    REQUIRE_MESSAGE((a.max_size() * typename A::size_type(sizeof(T)) >= a.max_size()), "max_size larger than reasonable");

    // Test "a.construct(p,t)"
    int n = NumberOfFoo;
    typename A::pointer p = a.allocate(1);
    a.construct(p, cx);
    REQUIRE_MESSAGE(NumberOfFoo == n + 1, "constructor for Foo not called?");

    // Test "a.destroy(p)"
    a.destroy(p);
    REQUIRE_MESSAGE(NumberOfFoo == n, "destructor for Foo not called?");
    a.deallocate(p, 1);

    {
        // Construct a pair from (cx, cx) through a rebound allocator and check that
        // the Foo counter moves by two on construction and back on destruction.
        typedef typename A::template rebind >::other pair_allocator_type;
        pair_allocator_type pair_allocator(a);
        int NumberOfFooBeforeConstruct = NumberOfFoo;
        typename pair_allocator_type::pointer pair_pointer = pair_allocator.allocate(1);
        pair_allocator.construct(pair_pointer, cx, cx);
        REQUIRE_MESSAGE(NumberOfFoo == NumberOfFooBeforeConstruct+2, "constructor for Foo not called appropriate number of times?");

        pair_allocator.destroy(pair_pointer);
        REQUIRE_MESSAGE(NumberOfFoo == NumberOfFooBeforeConstruct, "destructor for Foo not called appropriate number of times?");
        pair_allocator.deallocate(pair_pointer, 1);
    }
}

//!
//! T is type and A is allocator for that type
//! Allocates blocks of 0..99 elements, fills each with a pattern, then verifies
//! the pattern and deallocates.
//! NOTE(review): template parameter lists and template arguments appear to have been
//! stripped from this dump; code tokens are kept as found.
template void TestAllocatorConcept(A& a) {
    // Test "a.allocate(p,n)
    typename std::allocator_traits::pointer array[100];
    std::size_t sizeof_T = sizeof(T);
    for(std::size_t k = 0; k < 100; ++k) {
        array[k] = a.allocate(k);
        char* s = reinterpret_cast(reinterpret_cast(array[k]));
        // Fill every byte of the k-element block with a value that depends on (j, k)
        for(std::size_t j=0; j < k * sizeof_T; ++j)
            s[j] = PseudoRandomValue(j, k);
    }

    // Test "a.deallocate(p,n)
    for(std::size_t k = 0; k < 100; ++k) {
        char* s = reinterpret_cast(reinterpret_cast(array[k]));
        // The pattern written above must still be intact
        for(std::size_t j = 0; j < k * sizeof_T; ++j)
            REQUIRE(s[j] == PseudoRandomValue(j, k));
        a.deallocate(array[k], k);
    }
}

//! T is type and A is allocator for that type
//! With TBB_USE_EXCEPTIONS, requests a near-maximal allocation and requires std::bad_alloc.
template void TestAllocatorExceptions(A& a) {
#if TBB_USE_EXCEPTIONS
    // Element count that puts the request size close to the size_t maximum
    volatile size_t too_big = (~std::size_t(0) - 1024 * 1024) / sizeof(T);
    bool exception_caught = false;
    typename std::allocator_traits::pointer p1 = nullptr;
    try {
#if __APPLE__
        // On macOS*, failure to map memory results in messages to stderr;
        // suppress them.
        DisableStderr disableStderr;
#endif
        p1 = a.allocate(too_big);
    } catch (std::bad_alloc&) {
        exception_caught = true;
    }
    REQUIRE_MESSAGE(exception_caught, "allocate expected to throw bad_alloc");
    // p1 is still nullptr here when allocate threw as required
    a.deallocate(p1, too_big);
#endif // TBB_USE_EXCEPTIONS
    utils::suppress_unused_warning(a);
}

#if _MSC_VER && !defined(__INTEL_COMPILER)
    // Workaround for erroneous "conditional expression is constant" warning in method check_allocate.
    #pragma warning (disable: 4127)
#endif

// A is an allocator for some type
//! Functor run by TestThreadSafety: each invocation keeps a private table of 256 slots
//! and toggles pseudo-randomly chosen slots between allocated and deallocated.
template struct Body: utils::NoAssign {
    using pointer_type = typename std::allocator_traits::pointer;
    using value_type = typename std::allocator_traits::value_type;

    // For the int types and above this test runs too long
    static const std::size_t max_k = sizeof(value_type) < sizeof(int) ? 100000 : 5000;

    A &a;
    Body(A &a_) : a(a_) {}

    //! Allocates slot i (which must be empty) with i * (i & 3) elements and fills it
    //! with the pattern for (i, t).
    void check_allocate(pointer_type array[], std::size_t i, std::size_t t) const {
        REQUIRE(array[i] == nullptr);
        std::size_t size = i * (i & 3);
        array[i] = a.allocate(size);
        REQUIRE_MESSAGE(array[i] != nullptr, "allocator returned null");
        char* s = reinterpret_cast(reinterpret_cast(array[i]));
        for(std::size_t j = 0; j < size * sizeof(value_type); ++j) {
            // When the is_zero_filling trait is set, the fresh memory must be all zeros
            if(is_zero_filling::template rebind_alloc>::value)
                REQUIRE(!s[j]);
            s[j] = PseudoRandomValue(i, t);
        }
    }

    //! Verifies that slot i still holds the pattern for (i, t), then deallocates and clears it.
    void check_deallocate(pointer_type array[], std::size_t i, std::size_t t) const {
        REQUIRE(array[i] != nullptr);
        size_t size = i * (i & 3);
        char* s = reinterpret_cast(reinterpret_cast(array[i]));
        for(std::size_t j=0; j < size * sizeof(value_type); ++j)
            REQUIRE_MESSAGE(s[j] == PseudoRandomValue(i, t), "Thread safety test failed");
        a.deallocate(array[i], size);
        array[i] = nullptr;
    }

    //! Runs max_k allocate/deallocate toggles, then releases any slots still held.
    void operator()(std::size_t thread_id) const {
        pointer_type array[256];

        for(std::size_t k = 0; k < 256; ++k)
            array[k] = nullptr;
        for(std::size_t k = 0; k < max_k; ++k) {
            // Slot index comes from the pseudo-random byte for (k, thread_id)
            std::size_t i = static_cast(PseudoRandomValue(k, thread_id));
            if(!array[i]) check_allocate(array, i, thread_id);
            else check_deallocate(array, i, thread_id);
        }
        for(std::size_t k = 0; k < 256; ++k)
            if(array[k])
                check_deallocate(array, k, thread_id);
    }
};

//! Runs Body over the allocator through utils::NativeParallelFor with a count of 4.
template void TestThreadSafety(A &a) {
    utils::NativeParallelFor(4, Body(a));
}

//! Selects which group of checks TestAllocator runs.
enum TestName {
    Concept,
    Broken,
    Exceptions,
    ThreadSafety,
    Comparison
};

//! Entry point: rebinds the allocator to several Foo instantiations, runs the check
//! group selected by name, and finally requires that no Foo objects are left alive.
//! \param name which group of checks to run
//! \param a    allocator to copy and rebind (default-constructed if omitted)
template void TestAllocator(TestName name, const Allocator &a = Allocator()) {
    using FooChar = Foo;
    using FooDouble = Foo;
    using FooInt = Foo;
    using FooFloat = Foo;

#if TBB_ALLOCATOR_TRAITS_BROKEN
    using AllocatorFooChar = typename Allocator::template rebind::other;
    using AllocatorFooDouble = typename Allocator::template rebind::other;
    using AllocatorFooInt = typename AllocatorFooChar::template rebind::other;
    using AllocatorFooFloat = typename AllocatorFooDouble::template rebind::other;
#else
    using AllocatorFooChar = typename std::allocator_traits::template rebind_alloc;
    using AllocatorFooDouble = typename std::allocator_traits::template rebind_alloc;
    using AllocatorFooInt = typename std::allocator_traits::template rebind_alloc;
    using AllocatorFooFloat = typename std::allocator_traits::template rebind_alloc;
#endif
    NumberOfFoo = 0;

    // a1/a2 are converted from the caller's allocator; b1/b2 are converted from a1/a2
    Allocator a_cpy(a);
    AllocatorFooChar a1(a);
    AllocatorFooDouble a2(a);
    AllocatorFooInt b1(a1);
    AllocatorFooFloat b2(a2);

    switch(name) {
    case Comparison:
        // Copies and converted allocators must compare equal
        REQUIRE(a_cpy == a);
        REQUIRE(a1 == b1);
        REQUIRE(!(a2 != b2));
        break;
    case Concept:
        TestAllocatorConcept(b1);
        TestAllocatorConcept(a1);
        TestAllocatorConcept(b2);
        TestAllocatorConcept(a2);
        break;
    case Broken:
        // Compiled only when TBB_ALLOCATOR_TRAITS_BROKEN is set; otherwise this case does nothing
#if TBB_ALLOCATOR_TRAITS_BROKEN
        TestBrokenAllocator(b1);
        TestBrokenAllocator(a1);
        TestBrokenAllocator(b2);
        TestBrokenAllocator(a2);
#endif
        break;
    case Exceptions:
        TestAllocatorExceptions(b1);
        TestAllocatorExceptions(a1);
        TestAllocatorExceptions(b2);
        TestAllocatorExceptions(a2);
        break;
    case ThreadSafety:
        TestThreadSafety(a1);
        TestThreadSafety(a2);
        break;
    }

    REQUIRE_MESSAGE(NumberOfFoo == 0, "Allocate/deallocate count mismatched");
}

#endif // __TBB_test_common_allocator_test_common_H_

================================================
FILE: third-party/tbb/test/common/checktype.h
================================================
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef __TBB_test_common_checktype_H
#define __TBB_test_common_checktype_H

#include "config.h"

#include

// NOTE(review): angle-bracket contents (template parameter lists, template arguments,
// <header> names) appear to have been stripped from this dump; code tokens are kept as found.

// Type that checks that there are no operations with the destroyed object
class DestroyedTracker {
protected:
    // Marker values stored in my_state: LIVE from construction, DEAD after the destructor ran
    enum StateType {
        LIVE = 0x56781234,
        DEAD = 0xDEADBEEF
    };
    StateType my_state;

public:
    DestroyedTracker() : my_state(LIVE) {}

    // The new object starts LIVE; copying from a non-live source is reported
    DestroyedTracker( const DestroyedTracker& src ) : my_state(LIVE) {
        CHECK_FAST_MESSAGE(src.is_alive(), "Constructing from the dead source");
    }

    // Reports destruction of an object that is not LIVE, then marks it DEAD
    ~DestroyedTracker() {
        CHECK_FAST_MESSAGE(is_alive(), "Destructing the dead object");
        my_state = DEAD;
    }

    // Both sides must be LIVE; the state itself is not copied
    DestroyedTracker& operator=( const DestroyedTracker& src ) {
        CHECK_FAST_MESSAGE(is_alive(), "Assignment to the dead object");
        CHECK_FAST_MESSAGE(src.is_alive(), "Assignment from the dead source");
        return *this;
    }

    bool is_alive() const {
        return my_state == LIVE;
    }
}; // class DestroyedTracker

// Type that checks construction and destruction
// Every constructor increments check_type_counter and the destructor decrements it.
template class CheckType : DestroyedTracker {
public:
    // Number of currently constructed CheckType objects
    static std::atomic check_type_counter;

    CheckType( Counter n = 0 ) : my_id(n), am_ready(false) {
        ++check_type_counter;
    }

    CheckType( const CheckType& other ) : DestroyedTracker(other) {
        CHECK_FAST(is_alive());
        CHECK_FAST(other.is_alive());
        my_id = other.my_id;
        am_ready = other.am_ready;
        ++check_type_counter;
    }

    operator int() const { return int(my_id); }

    CheckType& operator++() { ++my_id; return *this; }

    // Copies id and readiness; the counter is unchanged because no object is created
    CheckType& operator=( const CheckType& other ) {
        CHECK_FAST(is_alive());
        CHECK_FAST(other.is_alive());
        my_id = other.my_id;
        am_ready = other.am_ready;
        return *this;
    }

    ~CheckType() {
        CHECK_FAST(is_alive());
        --check_type_counter;
        CHECK_FAST_MESSAGE(check_type_counter >= 0, "Too many destructions");
    }

    Counter id() const {
        CHECK_FAST(is_alive());
        return my_id;
    }

    bool is_ready() {
        CHECK_FAST(is_alive());
        return am_ready;
    }

    // Has an effect only while my_id is zero: sets my_id to 1 and marks the object ready
    void get_ready() {
        CHECK_FAST(is_alive());
        if (my_id == Counter(0)) {
            my_id = Counter(1);
            am_ready = true;
        }
    }

private:
    Counter my_id;
    bool am_ready;
}; // class CheckType

namespace std {
// Hash for CheckType: the object's id converted to std::size_t
template struct hash> {
    std::size_t operator()( const CheckType& obj ) const {
        return std::size_t(obj.id());
    }
};
}

// Out-of-class definition of the static counter
template std::atomic CheckType::check_type_counter;

// A dummy class
template struct Checker {
    Checker() {} // do nothing
    ~Checker() {} // do nothing
};

// A specialization for CheckType that initialize a counter on creation
// and checks that the constructions and destructions of CheckType match
template struct Checker> {
    Checker() {
        CheckType::check_type_counter = 0;
    }
    ~Checker() {
        CHECK_MESSAGE(CheckType::check_type_counter == 0,
                             "CheckType constructions and destructions don't match");
    }
}; // struct Checker

#endif // __TBB_test_common_checktype_H

================================================
FILE: third-party/tbb/test/common/common_arena_constraints.h
================================================
/*
    Copyright (c) 2019-2024 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef __TBB_test_common_arena_constraints_H_
#define __TBB_test_common_arena_constraints_H_

#if _WIN32 || _WIN64
#define _CRT_SECURE_NO_WARNINGS
#endif

#include "common/test.h"
#include "common/spin_barrier.h"
#include "common/utils.h"
#include "common/memory_usage.h"
#include "common/utils_concurrency_limit.h"

#include "oneapi/tbb/task_arena.h"
#include "oneapi/tbb/spin_mutex.h"

// NOTE(review): the <header> names of the bare #include directives in this block
// appear to have been stripped from this dump; directives are kept as found.
#include
#include

#if (_WIN32 || _WIN64) && __TBB_HWLOC_VALID_ENVIRONMENT
#include
// Returns the active processor group count when every processor reported by
// GetNativeSystemInfo is present in the process affinity mask; otherwise returns 1.
int get_processors_group_count() {
    SYSTEM_INFO si;
    GetNativeSystemInfo(&si);
    DWORD_PTR pam, sam, m = 1;
    GetProcessAffinityMask( GetCurrentProcess(), &pam, &sam );
    // Count the bits set in the process affinity mask
    int nproc = 0;
    for ( std::size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1 ) {
        if ( pam & m )
            ++nproc;
    }
    // Setting up processor groups in case the process does not restrict affinity mask and more than one processor group is present
    if ( nproc == (int)si.dwNumberOfProcessors  ) {
        // The process does not have restricting affinity mask and multiple processor groups are possible
        return (int)GetActiveProcessorGroupCount();
    } else {
        return 1;
    }
}
#else
// Fallback for all other configurations: a single processor group.
int get_processors_group_count() { return 1; }
#endif

//TODO: Write a test that checks for memory leaks during dynamic link/unlink of TBBbind.
#if __TBB_HWLOC_VALID_ENVIRONMENT
#include "oneapi/tbb/concurrent_unordered_set.h"

#include
#include

// Silence unused-parameter diagnostics around the include that follows
#if _MSC_VER
#if __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#else
#pragma warning( push )
#pragma warning( disable : 4100 )
#endif
#endif
#include
#if _MSC_VER
#if __clang__
#pragma GCC diagnostic pop
#else
#pragma warning( pop )
#endif
#endif

// Feature gates keyed on the hwloc API version
#define __HWLOC_HYBRID_CPUS_INTERFACES_PRESENT (HWLOC_API_VERSION >= 0x20400)
#define __HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT (HWLOC_API_VERSION >= 0x20500)
// At this moment the hybrid CPUs HWLOC interfaces return unexpected results on some Windows machines
// in the 32-bit arch mode.
#define __HWLOC_HYBRID_CPUS_INTERFACES_VALID (!_WIN32 || _WIN64)

#define __HYBRID_CPUS_TESTING __HWLOC_HYBRID_CPUS_INTERFACES_PRESENT && __HWLOC_HYBRID_CPUS_INTERFACES_VALID
#define __HWLOC_CPUBIND_PRESENT (!__APPLE__)

// NOTE(review): angle-bracket contents (template arguments such as the element types of
// std::vector and the targets of static_cast) appear to have been stripped from this dump;
// code tokens are kept as found -- confirm against the upstream file.

// Macro to check hwloc interfaces return codes
#define hwloc_require_ex(command, ...)                                          \
        REQUIRE_MESSAGE(command(__VA_ARGS__) >= 0, "Error occurred inside hwloc call.");

// An index (NUMA node or CPU kind) with its concurrency and an owned hwloc bitmap.
// Copies duplicate the bitmap; the destructor frees it.
struct index_info {
    int index{-1};
    int concurrency{-1};
    hwloc_bitmap_t cpuset{nullptr};

    index_info() = default;
    index_info(const index_info& src)
        : index{src.index}
        , concurrency{src.concurrency}
        , cpuset{hwloc_bitmap_dup(src.cpuset)}
    {}

    // Takes the source by value and swaps bitmaps, so the old bitmap is freed with src
    index_info& operator=(index_info src) {
        index = src.index;
        concurrency = src.concurrency;
        std::swap(cpuset, src.cpuset);
        return *this;
    }

    ~index_info() {
        hwloc_bitmap_free(cpuset);
    }
};

// Owned hwloc bitmap describing one core; always stores a duplicate of the bitmap it is given.
struct core_info {
    hwloc_bitmap_t cpuset{nullptr};

    core_info() = default;
    core_info(hwloc_bitmap_t _cpuset)
        : cpuset{hwloc_bitmap_dup(_cpuset)}
    {}
    core_info(const core_info& src)
        : cpuset{hwloc_bitmap_dup(src.cpuset)}
    {}

    // Takes the source by value and swaps bitmaps, so the old bitmap is freed with src
    core_info& operator=(core_info src) {
        std::swap(cpuset, src.cpuset);
        return *this;
    }

    ~core_info() {
        hwloc_bitmap_free(cpuset);
    }
};

// Reference view of the machine topology built directly with hwloc; the tests compare
// the library's results against it. Accessed through static members after initialize().
class system_info {
    hwloc_topology_t topology;

    hwloc_cpuset_t process_cpuset{nullptr};

    std::vector numa_node_infos{};
    std::vector cpu_kind_infos{};
    std::vector core_infos{};

    // hwloc_cpuset_t and hwloc_nodeset_t (inherited from hwloc_bitmap_t) are pointers,
    // so we must manage memory allocation and deallocation
    using memory_handler_t = tbb::concurrent_unordered_set;
    memory_handler_t memory_handler{};

    static system_info* system_info_ptr;

public:
    // Creates the function-local static instance on first call and publishes its address
    static void initialize() {
        static system_info topology_instance;
        system_info_ptr = &topology_instance;
    }

private:
    // Access to the published instance; fails the test if initialize() has not set the pointer
    static system_info& instance() {
        REQUIRE_MESSAGE(system_info_ptr, "Get access to the uninitialize system info.(reference)");
        return *system_info_ptr;
    }

    // Loads the topology, then collects the process cpuset, NUMA nodes, CPU kinds and cores
    system_info() {
        hwloc_require_ex(hwloc_topology_init, &topology);
#if __HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT
        if ( get_processors_group_count() == 1 ) {
            REQUIRE(
                hwloc_topology_set_flags(topology,
                    HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
                    HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING
                ) == 0
            );
        }
#endif
        hwloc_require_ex(hwloc_topology_load, topology);

        // Process cpuset: the complete cpuset with several processor groups or without
        // cpubind support, otherwise the current CPU binding
#if __HWLOC_CPUBIND_PRESENT
        if ( get_processors_group_count() > 1 ) {
            process_cpuset = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset(topology));
        } else {
            process_cpuset = hwloc_bitmap_alloc();
            hwloc_require_ex(hwloc_get_cpubind, topology, process_cpuset, 0);
        }
#else
        process_cpuset = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset(topology));
#endif

        // NUMA nodes: keep those whose cpuset intersects the process cpuset
        // NOTE(review): current_node_info.cpuset is overwritten on every iteration (and again
        // in the fallback below) without freeing the previous duplicate, while push_back
        // stores its own hwloc_bitmap_dup copy -- this looks like a bitmap leak; confirm.
        hwloc_obj_t current_numa_node = nullptr;
        index_info current_node_info{};
        while ((current_numa_node = hwloc_get_next_obj_by_type(topology,
                                                               HWLOC_OBJ_NUMANODE,
                                                               current_numa_node)) != nullptr) {
            current_node_info.index = static_cast(current_numa_node->logical_index);
            current_node_info.cpuset = hwloc_bitmap_dup(current_numa_node->cpuset);
            hwloc_bitmap_and(current_node_info.cpuset, current_node_info.cpuset, process_cpuset);
            current_node_info.concurrency = hwloc_bitmap_weight(current_node_info.cpuset);
            if(current_node_info.concurrency) {
                numa_node_infos.push_back(current_node_info);
            }
        }

        // No usable NUMA node found: describe the whole process cpuset as node 0
        if (numa_node_infos.empty()) {
            current_node_info.index = 0;
            current_node_info.cpuset = hwloc_bitmap_dup(process_cpuset);
            current_node_info.concurrency = hwloc_bitmap_weight(process_cpuset);
            numa_node_infos.push_back(current_node_info);
        }

        std::sort(numa_node_infos.begin(), numa_node_infos.end(),
            [](const index_info& a, const index_info& b) {
                return a.index < b.index;
            }
        );

        // Stays true unless the hybrid-CPU section below parses at least one kind successfully
        bool core_types_parsing_broken = true;
#if __HYBRID_CPUS_TESTING
        // Getting CPU kinds info
        auto num_cpu_kinds = hwloc_cpukinds_get_nr(topology, 0);
        REQUIRE_MESSAGE(num_cpu_kinds >= 0,
                        "HWLOC cannot detect the number of cpukinds.(reference)");

        core_types_parsing_broken = num_cpu_kinds == 0;
        int current_efficiency = -1;
        cpu_kind_infos.resize(num_cpu_kinds);
        for (auto kind_index = 0; kind_index < num_cpu_kinds; ++kind_index) {
            auto& cki = cpu_kind_infos[kind_index];

            cki.cpuset = hwloc_bitmap_alloc();
            CHECK_MESSAGE(
                cki.cpuset,
                "HWLOC was unable to allocate bitmap. Following checks might fail.(reference)"
            );

            // NOTE(review): the token "¤t_efficiency" below looks like a mis-encoded
            // "&current_efficiency" -- confirm against the upstream file.
            hwloc_require_ex(
                hwloc_cpukinds_get_info, topology, kind_index, cki.cpuset, ¤t_efficiency,
                /*nr_infos*/nullptr, /*infos*/nullptr, /*flags*/0
            );
            // A negative efficiency marks the parse as broken and stops the loop
            if (current_efficiency < 0) {
                core_types_parsing_broken = true;
                break;
            }
            hwloc_bitmap_and(cki.cpuset, cki.cpuset, process_cpuset);

            cki.index = hwloc_cpukinds_get_by_cpuset(topology, cki.cpuset, /*flags*/0);
            REQUIRE_MESSAGE(cki.index >= 0,
                            "hwloc failed obtaining kind index via cpuset.(reference)");

            cki.concurrency = hwloc_bitmap_weight(cki.cpuset);
        }
#endif /*__HYBRID_CPUS_TESTING*/

        // Fallback: a single CPU kind with index -1 covering the whole process cpuset
        if (core_types_parsing_broken) {
            cpu_kind_infos.resize(1);
            cpu_kind_infos[0].index = -1;
            cpu_kind_infos[0].cpuset = hwloc_bitmap_dup(process_cpuset);
            cpu_kind_infos[0].concurrency = hwloc_bitmap_weight(process_cpuset);
        }

        // Cores: record the part of each core's cpuset that lies inside the process cpuset
        hwloc_bitmap_t core_affinity = hwloc_bitmap_alloc();
        hwloc_obj_t current_core = nullptr;
        while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
            hwloc_bitmap_and(core_affinity, process_cpuset, current_core->cpuset);
            if (hwloc_bitmap_weight(core_affinity) > 0) {
                core_infos.emplace_back(core_affinity);
            }
        }
        hwloc_bitmap_free(core_affinity);

        testing_reference_topology_parsing_validation();
    }

    // Frees every mask handed out by the allocate_* helpers, then the topology and process cpuset
    ~system_info() {
        for (auto& allocated_mask: memory_handler) {
            hwloc_bitmap_free(allocated_mask);
        }
        hwloc_topology_destroy(topology);
        hwloc_bitmap_free(process_cpuset);
    }

    // Sanity checks on the parsed data: non-empty lists, optional manual counts, and
    // that the NUMA and core-type cpusets are each disjoint and cover the process cpuset.
    void testing_reference_topology_parsing_validation() {
        hwloc_cpuset_t buffer_cpu_set = hwloc_bitmap_alloc();

        REQUIRE_MESSAGE(numa_node_infos.size() > 0, "Negative NUMA nodes count.(reference)");
        REQUIRE_MESSAGE(cpu_kind_infos.size() > 0, "Negative core types count.(reference)");
        REQUIRE_MESSAGE(core_infos.size() > 0, "Negative available cores count.(reference)");
#if NUMA_NODES_NUMBER
        REQUIRE_MESSAGE(numa_node_infos.size() == NUMA_NODES_NUMBER,
            "Manual NUMA nodes count validation fails.(reference)");
#endif /*NUMA_NODES_NUMBER*/
#if CORE_TYPES_NUMBER
        REQUIRE_MESSAGE(cpu_kind_infos.size() == CORE_TYPES_NUMBER,
            "Manual core types count validation fails.(reference)");
#endif /*CORE_TYPES_NUMBER*/

        // NUMA topology verification
        hwloc_bitmap_zero(buffer_cpu_set);
        for (const auto& numa_node_info: numa_node_infos) {
            REQUIRE_MESSAGE(!hwloc_bitmap_intersects(buffer_cpu_set, numa_node_info.cpuset),
                "NUMA nodes related CPUset have the same bits. "
                "It seems like error during HWLOC topology parsing.(reference)");
            hwloc_bitmap_or(buffer_cpu_set, buffer_cpu_set, numa_node_info.cpuset);
        }
        REQUIRE_MESSAGE( hwloc_bitmap_isequal(buffer_cpu_set, process_cpuset),
            "Intersected NUMA nodes masks should be equal to process affinity.(reference)");

        // Core types topology verification
        hwloc_bitmap_zero(buffer_cpu_set);
        for (const auto& cpu_kind_info: cpu_kind_infos) {
            REQUIRE_FALSE_MESSAGE(hwloc_bitmap_intersects(buffer_cpu_set, cpu_kind_info.cpuset),
                "core types related CPUset have the same bits. "
                "It seems like error during HWLOC topology parsing.(reference)");
            hwloc_bitmap_or(buffer_cpu_set, buffer_cpu_set, cpu_kind_info.cpuset);
        }
        REQUIRE_MESSAGE(hwloc_bitmap_isequal(buffer_cpu_set, process_cpuset),
            "Intersected core type masks should be equal to process affinity.(reference)");

        hwloc_bitmap_free(buffer_cpu_set);
    }

public:
    typedef hwloc_bitmap_t affinity_mask;
    typedef hwloc_const_bitmap_t const_affinity_mask;

    static hwloc_const_bitmap_t get_process_affinity_mask() {
        return instance().process_cpuset;
    }

    // Largest number of bits set in any recorded core cpuset
    static std::size_t get_maximal_threads_per_core() {
        auto max_threads_it =
            std::max_element(
                std::begin(instance().core_infos), std::end(instance().core_infos),
                [](const core_info& first, const core_info& second) {
                    return hwloc_bitmap_weight(first.cpuset) < hwloc_bitmap_weight(second.cpuset);
                }
            );
        __TBB_ASSERT(hwloc_bitmap_weight(max_threads_it->cpuset) > 0,
            "Not positive maximal threads per core value.(reference)");
        return hwloc_bitmap_weight(max_threads_it->cpuset);
    }

    // Returns a new bitmap registered in memory_handler, which frees it in ~system_info()
    static affinity_mask allocate_empty_affinity_mask() {
        affinity_mask result = hwloc_bitmap_alloc();
        instance().memory_handler.insert(result);
        return result;
    }

    // Returns a registered bitmap holding the calling thread's CPU binding, or the
    // complete cpuset when cpubind is not available; the result must be non-empty
    static affinity_mask allocate_current_affinity_mask() {
        affinity_mask result = hwloc_bitmap_alloc();
        instance().memory_handler.insert(result);
#if __HWLOC_CPUBIND_PRESENT
        hwloc_require_ex(hwloc_get_cpubind, instance().topology, result, HWLOC_CPUBIND_THREAD);
#else
        hwloc_bitmap_copy(result, hwloc_topology_get_complete_cpuset(instance().topology));
#endif
        REQUIRE_MESSAGE(!hwloc_bitmap_iszero(result), "Empty current affinity mask.");
        return result;
    }

    // The three getters below return copies of the stored vectors
    static std::vector get_cpu_kinds_info() {
        return instance().cpu_kind_infos;
    }

    static std::vector get_numa_nodes_info() {
        return instance().numa_node_infos;
    }

    static std::vector get_cores_info() {
        return instance().core_infos;
    }

    // All integers from -1 up to get_maximal_threads_per_core(), excluding 0
    static std::vector get_available_max_threads_values() {
        std::vector result{};
        for (int value = -1; value <= (int)get_maximal_threads_per_core(); ++value) {
            if (value != 0) {
                result.push_back(value);
            }
        }
        return result;
    }
}; // class system_info

system_info* system_info::system_info_ptr{nullptr};

// Builds the expected affinity for constraints c: the process mask, intersected with the
// cpuset of the requested NUMA node and/or core type when those are not automatic.
system_info::affinity_mask prepare_reference_affinity_mask(const tbb::task_arena::constraints& c) {
    auto reference_affinity = system_info::allocate_empty_affinity_mask();
    hwloc_bitmap_copy(reference_affinity, system_info::get_process_affinity_mask());

    if (c.numa_id != tbb::task_arena::automatic) {
        const auto& numa_nodes_info = system_info::get_numa_nodes_info();
        auto required_info = std::find_if(numa_nodes_info.begin(), numa_nodes_info.end(),
            [&](const index_info& info) { return info.index == c.numa_id; }
        );

        REQUIRE_MESSAGE(required_info != numa_nodes_info.end(), "Constraints instance has wrong NUMA index.");
        hwloc_bitmap_and(reference_affinity, reference_affinity, required_info->cpuset);
    }

    if (c.core_type != tbb::task_arena::automatic) {
        const auto& core_types_info = system_info::get_cpu_kinds_info();
        auto required_info = std::find_if(core_types_info.begin(), core_types_info.end(),
            [&](index_info info) { return info.index == c.core_type; }
        );
        REQUIRE_MESSAGE(required_info != core_types_info.end(), "Constraints instance has wrong core type index.");
        hwloc_bitmap_and(reference_affinity, reference_affinity, required_info->cpuset);
    }

    return reference_affinity;
}

// Checks an arena's observed affinity and tbb::info::default_concurrency(constraints)
// against the reference mask computed from the same constraints.
void test_constraints_affinity_and_concurrency(tbb::task_arena::constraints constraints,
                                               system_info::affinity_mask arena_affinity) {
    int default_concurrency = tbb::info::default_concurrency(constraints);
    system_info::affinity_mask reference_affinity = prepare_reference_affinity_mask(constraints);
    int max_threads_per_core = static_cast(system_info::get_maximal_threads_per_core());

    if (constraints.max_threads_per_core == tbb::task_arena::automatic || constraints.max_threads_per_core == max_threads_per_core) {
        // No per-core restriction in effect: the masks and concurrency must match exactly
        REQUIRE_MESSAGE(hwloc_bitmap_isequal(reference_affinity, arena_affinity),
            "Wrong affinity mask was applied for the constraints instance.");
        REQUIRE_MESSAGE(hwloc_bitmap_weight(reference_affinity) == default_concurrency,
            "Wrong default_concurrency was returned for the constraints instance.");
    } else {
        REQUIRE_MESSAGE(constraints.max_threads_per_core < max_threads_per_core,
            "Constraints instance has wrong max_threads_per_core value.");
        REQUIRE_MESSAGE(hwloc_bitmap_isincluded(arena_affinity, reference_affinity),
            "If custom threads per core value is applied then the applied affinity"
            "should be a sub-set of the affinity applied to constraints without such restriction.");

        // Per core: the arena mask must contain min(threads on the core, max_threads_per_core) bits
        system_info::affinity_mask core_affinity = system_info::allocate_empty_affinity_mask();
        int threads_per_current_core = 0, valid_concurrency = 0;
        for (const auto& current_core : system_info::get_cores_info()) {
            hwloc_bitmap_and(core_affinity, reference_affinity, current_core.cpuset);
            threads_per_current_core = hwloc_bitmap_weight(core_affinity);
            if (threads_per_current_core > 0) { // current core should exist in the valid affinity mask
                hwloc_bitmap_and(core_affinity, arena_affinity, current_core.cpuset);
                threads_per_current_core = std::min(threads_per_current_core, constraints.max_threads_per_core);
                valid_concurrency += threads_per_current_core;
                REQUIRE_MESSAGE(hwloc_bitmap_weight(core_affinity) == threads_per_current_core ,
                    "Wrong number of threads may be scheduled to some core.");
            }
        }
        REQUIRE_MESSAGE(valid_concurrency == default_concurrency,
            "Wrong default_concurrency was returned for the constraints instance.");
        REQUIRE_MESSAGE(valid_concurrency == hwloc_bitmap_weight(arena_affinity),
            "Wrong number of bits inside the affinity mask.");
    }
}

// Collects the affinity seen inside arena ta: the mask observed in execute(), OR-ed with
// the masks observed by max_concurrency() - 1 enqueued tasks that meet at a barrier.
system_info::affinity_mask get_arena_affinity(tbb::task_arena& ta) {
    system_info::affinity_mask arena_affinity;
    ta.execute([&]{
        arena_affinity = system_info::allocate_current_affinity_mask();
    });

    utils::SpinBarrier exit_barrier(ta.max_concurrency());
    tbb::spin_mutex affinity_mutex{};
    for (int i = 0; i < ta.max_concurrency() - 1; ++i) {
        ta.enqueue([&] {
            {
                // arena_affinity is shared between the tasks, so it is updated under the mutex
                tbb::spin_mutex::scoped_lock lock(affinity_mutex);
                system_info::affinity_mask thread_affinity = system_info::allocate_current_affinity_mask();
                if (get_processors_group_count() == 1) {
                    REQUIRE_MESSAGE(hwloc_bitmap_isequal(thread_affinity, arena_affinity),
                        "Threads have different masks on machine without several processors groups.");
                }
                hwloc_bitmap_or(arena_affinity, arena_affinity, thread_affinity);
            }
            exit_barrier.wait();
        });
    }
    exit_barrier.wait();
    return arena_affinity;
}

#else /*__TBB_HWLOC_VALID_ENVIRONMENT*/

namespace system_info {
    // Values that seem meaningful for most systems that exist at this moment.
    // Should be used when we cannot parse the system topology
    std::vector get_available_max_threads_values() { return {tbb::task_arena::automatic, 1, 2}; }
}

#endif /*!__TBB_HWLOC_VALID_ENVIRONMENT*/

// Hash over the three constraint fields that constraints_equal compares (XOR of the field hashes).
struct constraints_hash {
    std::size_t operator()(const tbb::task_arena::constraints& c) const {
        return (std::hash{}(c.numa_id) ^ std::hash{}(c.core_type) ^ std::hash{}(c.max_threads_per_core));
    }
};

// Equality over numa_id, core_type and max_threads_per_core.
struct constraints_equal {
    bool operator()(const tbb::task_arena::constraints& c1, const tbb::task_arena::constraints& c2) const {
        return (c1.numa_id == c2.numa_id &&
                c1.core_type == c2.core_type &&
                c1.max_threads_per_core == c2.max_threads_per_core);
    }
};

using constraints_container = std::unordered_set;

// Using unordered_set to get rid of duplicated elements
// Builds the set of constraint combinations once (function-local static) and returns a copy of it.
constraints_container generate_constraints_variety() {
    static constraints_container constraints_variety = [](){
        using constraints = tbb::task_arena::constraints;
        constraints_container results{};

        // Candidate values for each field, each list extended with "automatic"
        std::vector numa_nodes = tbb::info::numa_nodes();
        numa_nodes.push_back((tbb::numa_node_id)tbb::task_arena::automatic);

#if __HYBRID_CPUS_TESTING
        std::vector core_types = tbb::info::core_types();
        core_types.push_back((tbb::core_type_id)tbb::task_arena::automatic);
#endif

        results.insert(constraints{});
        for (const auto& numa_node : numa_nodes) {
            results.insert(constraints{}.set_numa_id(numa_node));
#if __HYBRID_CPUS_TESTING
            for (const auto& core_type : core_types) {
                results.insert(constraints{}.set_core_type(core_type));

                results.insert(
                    constraints{}
                        .set_numa_id(numa_node)
                        .set_core_type(core_type)
                );

                for (const auto& max_threads_per_core: system_info::get_available_max_threads_values()) {
                    results.insert(
                        constraints{}
                            .set_max_threads_per_core(max_threads_per_core)
                    );

                    results.insert(
                        constraints{}
                            .set_numa_id(numa_node)
                            .set_max_threads_per_core(max_threads_per_core)
                    );

                    results.insert(
                        constraints{}
                            .set_core_type(core_type)
                            .set_max_threads_per_core(max_threads_per_core)
                    );

                    results.insert(
                        constraints{}
                            .set_numa_id(numa_node)
                            .set_core_type(core_type)
                            .set_max_threads_per_core(max_threads_per_core)
                    );
                }
            }
#endif /*__HYBRID_CPUS_TESTING*/
        }

        int max_threads_per_core = system_info::get_available_max_threads_values().back();
        // Some constraints may cause unexpected behavior, which would be fixed later.
        // With several processor groups, drop entries whose max_threads_per_core differs from
        // the largest available value and that are not narrowed by a NUMA node (or core type).
        if (get_processors_group_count() > 1) {
            for(auto it = results.begin(); it != results.end();) {
                if (it->max_threads_per_core != max_threads_per_core
                   && (it->numa_id == tbb::task_arena::automatic || tbb::info::numa_nodes().size() == 1)
#if __HYBRID_CPUS_TESTING
                   && (it->core_type == tbb::task_arena::automatic || tbb::info::core_types().size() == 1)
#endif /*__HYBRID_CPUS_TESTING*/
                ) {
                    it = results.erase(it);
                } else {
                    ++it;
                }
            }
        }
        return results;
    }();

    return constraints_variety;
}

#endif // __TBB_test_common_arena_constraints_H_

================================================
FILE: third-party/tbb/test/common/concepts_common.h
================================================
/*
    Copyright (c) 2021-2025 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_test_common_concepts_common_H #define __TBB_test_common_concepts_common_H #include "tbb/parallel_pipeline.h" #include "tbb/parallel_for_each.h" #include "tbb/flow_graph.h" #include "tbb/parallel_scan.h" #include "iterator.h" #include #if __TBB_CPP20_CONCEPTS_PRESENT namespace test_concepts { struct Dummy {}; enum class State { correct, incorrect_first_input, incorrect_second_input, incorrect_third_input, incorrect_return_type, incorrect_constness, not_defined, incorrect, non_constant_expression }; struct Copyable { Copyable( const Copyable& ) = default; }; struct NonCopyable { NonCopyable( const NonCopyable& ) = delete; }; struct CopyAssignable { CopyAssignable& operator=( const CopyAssignable& ) = default; }; struct NonCopyAssignable { NonCopyAssignable& operator=( const NonCopyAssignable& ) = delete; }; struct DefaultInitializable { DefaultInitializable() = default; }; struct NonDefaultInitializable { NonDefaultInitializable() = delete; }; namespace blocked_range_value { template struct BlockedRangeValue { BlockedRangeValue( const BlockedRangeValue& ) requires EnableCopyCtor = default; BlockedRangeValue& operator=( const BlockedRangeValue& ) requires EnableCopyAssignment = default; // Prospective destructors ~BlockedRangeValue() requires EnableDtor = default; ~BlockedRangeValue() = delete; bool operator<( const BlockedRangeValue& ) const requires (EnableOperatorLess == State::correct) { return true; } bool operator<( Dummy ) const requires (EnableOperatorLess == State::incorrect_first_input) { return true; } Dummy operator<( const BlockedRangeValue& ) const 
requires (EnableOperatorLess == State::incorrect_return_type) { return Dummy{}; } bool operator<( const BlockedRangeValue& ) requires (EnableOperatorLess == State::incorrect_constness) { return true; } std::size_t operator-( const BlockedRangeValue& ) const requires (EnableOperatorMinus == State::correct) { return 0; } std::size_t operator-( Dummy ) const requires (EnableOperatorMinus == State::incorrect_first_input) { return 0; } Dummy operator-( const BlockedRangeValue& ) const requires (EnableOperatorMinus == State::incorrect_return_type) { return Dummy{}; } std::size_t operator-( const BlockedRangeValue& ) requires (EnableOperatorMinus == State::incorrect_constness) { return 0; } BlockedRangeValue operator+( std::size_t ) const requires (EnableOperatorPlusSizeT == State::correct) { return *this; } BlockedRangeValue operator+( Dummy ) const requires (EnableOperatorPlusSizeT == State::incorrect_first_input) { return *this; } Dummy operator+( std::size_t ) const requires (EnableOperatorPlusSizeT == State::incorrect_return_type) { return Dummy{}; } BlockedRangeValue operator+( std::size_t ) requires (EnableOperatorPlusSizeT == State::incorrect_constness) { return *this; } }; using Correct = BlockedRangeValue; using NonCopyable = BlockedRangeValue; using NonCopyAssignable = BlockedRangeValue; using NonDestructible = BlockedRangeValue; using NoOperatorLess = BlockedRangeValue; using OperatorLessNonConst = BlockedRangeValue; using WrongInputOperatorLess = BlockedRangeValue; using WrongReturnOperatorLess = BlockedRangeValue; using NoOperatorMinus = BlockedRangeValue; using OperatorMinusNonConst = BlockedRangeValue; using WrongInputOperatorMinus = BlockedRangeValue; using WrongReturnOperatorMinus = BlockedRangeValue; using NoOperatorPlus = BlockedRangeValue; using OperatorPlusNonConst = BlockedRangeValue; using WrongInputOperatorPlus = BlockedRangeValue; using WrongReturnOperatorPlus = BlockedRangeValue; } // namespace blocked_range_value namespace range { template 
struct Range { Range( Range&, tbb::split ) requires EnableSplitCtor {} Range( const Range& ) requires EnableCopyCtor = default; // Prospective destructors ~Range() requires EnableDtor = default; ~Range() = delete; bool empty() const requires (EnableEmpty == State::correct) { return true; } bool empty() requires (EnableEmpty == State::incorrect_constness) { return true; } Dummy empty() const requires (EnableEmpty == State::incorrect_return_type) { return Dummy{}; } bool is_divisible() const requires (EnableIsDivisible == State::correct) { return true; } bool is_divisible() requires (EnableIsDivisible == State::incorrect_constness) { return true; } Dummy is_divisible() const requires (EnableIsDivisible == State::incorrect_return_type) { return Dummy{}; } }; using Correct = Range; using NonCopyable = Range; using NonSplittable = Range; using NonDestructible = Range; using NoEmpty = Range; using EmptyNonConst = Range; using WrongReturnEmpty = Range; using NoIsDivisible = Range; using IsDivisibleNonConst = Range; using WrongReturnIsDivisible = Range; } // namespace range namespace parallel_for_body { template struct ParallelForBody { ParallelForBody( const ParallelForBody& ) requires EnableCopyCtor = default; // Prospective destructors ~ParallelForBody() requires EnableDtor = default; ~ParallelForBody() = delete; void operator()( Range& ) const requires (EnableFunctionCallOperator == State::correct) {} void operator()( Range& ) requires (EnableFunctionCallOperator == State::incorrect_constness) {} void operator()( Dummy ) const requires (EnableFunctionCallOperator == State::incorrect_first_input) {} }; template using Correct = ParallelForBody; template using NonCopyable = ParallelForBody; template using NonDestructible = ParallelForBody; template using NoOperatorRoundBrackets = ParallelForBody; template using OperatorRoundBracketsNonConst = ParallelForBody; template using WrongInputOperatorRoundBrackets = ParallelForBody; } // namespace parallel_for_body namespace 
parallel_for_function { template struct ParallelForFunc { void operator()( Index ) const requires (EnableFunctionCallOperator == State::correct) {} void operator()( Index ) requires (EnableFunctionCallOperator == State::incorrect_constness) {} void operator()( Dummy ) const requires (EnableFunctionCallOperator == State::incorrect_first_input) {} }; template using Correct = ParallelForFunc; template using NoOperatorRoundBrackets = ParallelForFunc; template using OperatorRoundBracketsNonConst = ParallelForFunc; template using WrongInputOperatorRoundBrackets = ParallelForFunc; } // namespace parallel_for_function namespace parallel_for_index { template struct ParallelForIndex { ParallelForIndex(int) requires EnableIntCtor {} ParallelForIndex( const ParallelForIndex& ) requires EnableCopyCtor = default; ParallelForIndex& operator=( const ParallelForIndex& ) requires EnableCopyAssign = default; // Prospective destructors ~ParallelForIndex() requires EnableDtor = default; ~ParallelForIndex() = delete; bool operator<( const ParallelForIndex& ) const requires (EnableLess == State::correct) { return true; } bool operator<( const ParallelForIndex& ) requires (EnableLess == State::incorrect_constness) { return true; } bool operator<( Dummy ) const requires (EnableLess == State::incorrect_first_input) { return true; } Dummy operator<( const ParallelForIndex& ) const requires (EnableLess == State::incorrect_return_type) { return Dummy{}; } std::size_t operator-( const ParallelForIndex& ) const requires (EnableMinus == State::correct) { return 0; } std::size_t operator-( const ParallelForIndex& ) requires (EnableMinus == State::incorrect_constness) { return 0; } std::size_t operator-( Dummy ) const requires (EnableMinus == State::incorrect_first_input) { return 0; } Dummy operator-( const ParallelForIndex& ) const requires (EnableMinus == State::incorrect_return_type) { return Dummy{}; } ParallelForIndex operator+( std::size_t ) const requires (EnablePlus == State::correct) { 
return *this; } ParallelForIndex operator+( std::size_t ) requires (EnablePlus == State::incorrect_constness) { return *this; } ParallelForIndex operator+( Dummy ) const requires (EnablePlus == State::incorrect_first_input) { return *this; } Dummy operator+( std::size_t ) const requires (EnablePlus == State::incorrect_return_type) { return Dummy{}; } }; using Correct = ParallelForIndex; using NoIntCtor = ParallelForIndex; using NonCopyable = ParallelForIndex; using NonCopyAssignable = ParallelForIndex; using NonDestructible = ParallelForIndex; using NoOperatorLess = ParallelForIndex; using OperatorLessNonConst = ParallelForIndex; using WrongInputOperatorLess = ParallelForIndex; using WrongReturnOperatorLess = ParallelForIndex; using NoOperatorMinus = ParallelForIndex; using OperatorMinusNonConst = ParallelForIndex; using WrongInputOperatorMinus = ParallelForIndex; using WrongReturnOperatorMinus = ParallelForIndex; using NoOperatorPlus = ParallelForIndex; using OperatorPlusNonConst = ParallelForIndex; using WrongInputOperatorPlus = ParallelForIndex; using WrongReturnOperatorPlus = ParallelForIndex; } // namespace parallel_for_index namespace parallel_for_each_body { template struct ParallelForEachBody { void operator()( const T& ) const requires (EnableFunctionCallOperator == State::correct) {} void operator()( const T& ) requires (EnableFunctionCallOperator == State::incorrect_constness) {} void operator()( Dummy ) const requires (EnableFunctionCallOperator == State::incorrect_first_input) {} }; template struct ParallelForEachFeederBody { void operator()( const T&, tbb::feeder& ) const requires (EnableFunctionCallOperator == State::correct) {} void operator()( const T&, tbb::feeder& ) requires (EnableFunctionCallOperator == State::incorrect_constness) {} void operator()( Dummy, tbb::feeder& ) const requires (EnableFunctionCallOperator == State::incorrect_first_input) {} void operator()( const T&, Dummy ) const requires (EnableFunctionCallOperator == 
State::incorrect_second_input) {} }; template using Correct = ParallelForEachBody; template using NoOperatorRoundBrackets = ParallelForEachBody; template using OperatorRoundBracketsNonConst = ParallelForEachBody; template using WrongInputOperatorRoundBrackets = ParallelForEachBody; template using WithFeeder = ParallelForEachFeederBody; template using WithFeederNoOperatorRoundBrackets = ParallelForEachFeederBody; template using WithFeederOperatorRoundBracketsNonConst = ParallelForEachFeederBody; template using WithFeederWrongFirstInputOperatorRoundBrackets = ParallelForEachFeederBody; template using WithFeederWrongSecondInputOperatorRoundBrackets = ParallelForEachFeederBody; } // namespace parallel_for_each_body namespace parallel_sort_value { template struct ParallelSortValue { ParallelSortValue(ParallelSortValue&&) requires MovableV = default; ParallelSortValue& operator=(ParallelSortValue&&) requires MoveAssignableV = default; friend bool operator<(const ParallelSortValue&, const ParallelSortValue&) requires ComparableV { return true; } }; using CorrectValue = ParallelSortValue; using NonMovableValue = ParallelSortValue; using NonMoveAssignableValue = ParallelSortValue; using NonComparableValue = ParallelSortValue; } // namespace parallel_sort_value template class ConstantIT { T data{}; const T& operator* () const { return data; } }; namespace container_based_sequence { template struct ContainerBasedSequence { using iterator = T*; T* begin() requires EnableBegin { return nullptr; } T* end() requires EnableEnd { return nullptr; } }; using Correct = ContainerBasedSequence; using NoBegin = ContainerBasedSequence; using NoEnd = ContainerBasedSequence; template using CustomValueCBS = ContainerBasedSequence; struct ConstantCBS { ConstantIT begin() const { return ConstantIT{}; } ConstantIT end() const { return ConstantIT{}; } }; struct ForwardIteratorCBS { utils::ForwardIterator begin() { return utils::ForwardIterator{}; } utils::ForwardIterator end() { return begin(); 
} }; } // namespace container_based_sequence namespace parallel_reduce_body { template struct ParallelReduceBody { ParallelReduceBody( ParallelReduceBody&, tbb::split ) requires EnableSplitCtor {} // Prospective destructors ~ParallelReduceBody() requires EnableDtor = default; ~ParallelReduceBody() = delete; void operator()( const Range& ) requires (EnableFunctionCallOperator == State::correct) {} void operator()( Dummy ) requires (EnableFunctionCallOperator == State::incorrect_first_input) {} void join( ParallelReduceBody& ) requires (EnableJoin == State::correct) {} void join( Dummy ) requires (EnableJoin == State::incorrect_first_input) {} }; template using Correct = ParallelReduceBody; template using NonSplittable = ParallelReduceBody; template using NonDestructible = ParallelReduceBody; template using NoOperatorRoundBrackets = ParallelReduceBody; template using WrongInputOperatorRoundBrackets = ParallelReduceBody; template using NoJoin = ParallelReduceBody; template using WrongInputJoin = ParallelReduceBody; } // namespace parallel_reduce_body namespace parallel_reduce_function { template struct ParallelReduceFunction { int operator()( const Range&, const int& ) const requires (EnableFunctionCallOperator == State::correct) { return 0; } int operator()( const Range&, const int& ) requires (EnableFunctionCallOperator == State::incorrect_constness) { return 0; } int operator()( Dummy, const int& ) const requires (EnableFunctionCallOperator == State::incorrect_first_input) { return 0; } int operator()( const Range&, Dummy ) const requires (EnableFunctionCallOperator == State::incorrect_second_input) { return 0; } Dummy operator()( const Range&, const int& ) const requires (EnableFunctionCallOperator == State::incorrect_return_type) { return Dummy{}; } }; template using Correct = ParallelReduceFunction; template using NoOperatorRoundBrackets = ParallelReduceFunction; template using OperatorRoundBracketsNonConst = ParallelReduceFunction; template using 
WrongFirstInputOperatorRoundBrackets = ParallelReduceFunction; template using WrongSecondInputOperatorRoundBrackets = ParallelReduceFunction; template using WrongReturnOperatorRoundBrackets = ParallelReduceFunction; } // namespace parallel_reduce_function namespace parallel_reduce_combine { template struct ParallelReduceCombine { T operator()( const T& a, const T& ) const requires (EnableFunctionCallOperator == State::correct) { return a; } T operator()( const T& a, const T& ) requires (EnableFunctionCallOperator == State::incorrect_constness) { return a; } T operator()( Dummy, const T& a ) const requires (EnableFunctionCallOperator == State::incorrect_first_input) { return a; } T operator()( const T& a, Dummy ) const requires (EnableFunctionCallOperator == State::incorrect_second_input) { return a; } Dummy operator()( const T&, const T& ) const requires (EnableFunctionCallOperator == State::incorrect_return_type) { return Dummy{}; } }; template using Correct = ParallelReduceCombine; template using NoOperatorRoundBrackets = ParallelReduceCombine; template using OperatorRoundBracketsNonConst = ParallelReduceCombine; template using WrongFirstInputOperatorRoundBrackets = ParallelReduceCombine; template using WrongSecondInputOperatorRoundBrackets = ParallelReduceCombine; template using WrongReturnOperatorRoundBrackets = ParallelReduceCombine; } // namespace parallel_reduce_combine namespace parallel_scan_body { template struct ParallelScanBody { ParallelScanBody( ParallelScanBody&, tbb::split ) requires EnableSplitCtor {} void reverse_join( ParallelScanBody& ) requires (EnableReverseJoin == State::correct) {} void reverse_join( Dummy ) requires (EnableReverseJoin == State::incorrect_first_input) {} void assign( ParallelScanBody& ) requires (EnableAssign == State::correct) {} void assign( Dummy ) requires (EnableAssign == State::incorrect_first_input) {} void operator()( const Range&, tbb::pre_scan_tag ) requires (EnablePreScanRoundBrackets == State::correct) {} void
operator()( Dummy, tbb::pre_scan_tag ) requires (EnablePreScanRoundBrackets == State::incorrect_first_input) {} void operator()( const Range&, Dummy ) requires (EnablePreScanRoundBrackets == State::incorrect_second_input) {} void operator()( const Range&, tbb::final_scan_tag ) requires (EnableFinalScanRoundBrackets == State::correct) {} void operator()( Dummy, tbb::final_scan_tag ) requires (EnableFinalScanRoundBrackets == State::incorrect_first_input) {} void operator()( const Range&, Dummy ) requires (EnableFinalScanRoundBrackets == State::incorrect_second_input) {} }; template using Correct = ParallelScanBody; template using NonSplittable = ParallelScanBody; template using NoReverseJoin = ParallelScanBody; template using WrongInputReverseJoin = ParallelScanBody; template using NoAssign = ParallelScanBody; template using WrongInputAssign = ParallelScanBody; template using NoPreScanOperatorRoundBrackets = ParallelScanBody; template using WrongFirstInputPreScanOperatorRoundBrackets = ParallelScanBody; template using WrongSecondInputPreScanOperatorRoundBrackets = ParallelScanBody; template using NoFinalScanOperatorRoundBrackets = ParallelScanBody; template using WrongFirstInputFinalScanOperatorRoundBrackets = ParallelScanBody; template using WrongSecondInputFinalScanOperatorRoundBrackets = ParallelScanBody; } // namespace parallel_scan_body namespace parallel_scan_function { template struct ParallelScanFunction { T operator()( const Range&, const T& a, bool ) const requires (EnableFunctionCallOperator == State::correct) { return a; } T operator()( const Range&, const T& a, bool ) requires (EnableFunctionCallOperator == State::incorrect_constness) { return a; } T operator()( Dummy, const T& a, bool ) const requires (EnableFunctionCallOperator == State::incorrect_first_input) { return a; } T operator()( const Range&, Dummy, bool ) const requires (EnableFunctionCallOperator == State::incorrect_second_input) { return T{}; } T operator()( const Range&, const T& a, Dummy 
) const requires (EnableFunctionCallOperator == State::incorrect_third_input) { return a; } Dummy operator()( const Range&, const T&, bool ) const requires (EnableFunctionCallOperator == State::incorrect_return_type) { return Dummy{}; } }; template using Correct = ParallelScanFunction; template using NoOperatorRoundBrackets = ParallelScanFunction; template using OperatorRoundBracketsNonConst = ParallelScanFunction; template using WrongFirstInputOperatorRoundBrackets = ParallelScanFunction; template using WrongSecondInputOperatorRoundBrackets = ParallelScanFunction; template using WrongThirdInputOperatorRoundBrackets = ParallelScanFunction; template using WrongReturnOperatorRoundBrackets = ParallelScanFunction; } // namespace parallel_scan_function namespace parallel_scan_combine { using namespace parallel_reduce_combine; } // namespace parallel_scan_combine namespace compare { template struct Compare { bool operator()( const T&, const T& ) const requires (EnableFunctionCallOperator == State::correct) { return true; } bool operator()( Dummy, const T& ) const requires (EnableFunctionCallOperator == State::incorrect_first_input) { return true; } bool operator()( const T&, Dummy ) const requires (EnableFunctionCallOperator == State::incorrect_second_input) { return true; } Dummy operator()( const T&, const T& ) const requires (EnableFunctionCallOperator == State::incorrect_return_type) { return Dummy{}; } }; template using Correct = Compare; template using NoOperatorRoundBrackets = Compare; template using WrongFirstInputOperatorRoundBrackets = Compare; template using WrongSecondInputOperatorRoundBrackets = Compare; template using WrongReturnOperatorRoundBrackets = Compare; } // namespace compare namespace hash_compare { template struct HashCompare { HashCompare( const HashCompare& ) requires EnableCopyCtor = default; // Prospective destructors ~HashCompare() requires EnableDtor = default; ~HashCompare() = delete; std::size_t hash( const Key& ) const requires 
(EnableHash == State::correct) { return 0; } std::size_t hash( const Key& ) requires (EnableHash == State::incorrect_constness) { return 0; } std::size_t hash( Dummy ) const requires (EnableHash == State::incorrect_first_input) { return 0; } Dummy hash( const Key& ) const requires (EnableHash == State::incorrect_return_type) { return Dummy{}; } bool equal( const Key&, const Key& ) const requires (EnableEqual == State::correct) { return true; } bool equal( const Key&, const Key& ) requires (EnableEqual == State::incorrect_constness) { return true; } bool equal( Dummy, const Key& ) const requires (EnableEqual == State::incorrect_first_input) { return true; } bool equal( const Key&, Dummy ) const requires (EnableEqual == State::incorrect_second_input) { return true; } Dummy equal( const Key&, const Key& ) const requires (EnableEqual == State::incorrect_return_type) { return Dummy{}; } }; template using Correct = HashCompare; template using NonCopyable = HashCompare; template using NonDestructible = HashCompare; template using NoHash = HashCompare; template using HashNonConst = HashCompare; template using WrongInputHash = HashCompare; template using WrongReturnHash = HashCompare; template using NoEqual = HashCompare; template using EqualNonConst = HashCompare; template using WrongFirstInputEqual = HashCompare; template using WrongSecondInputEqual = HashCompare; template using WrongReturnEqual = HashCompare; } // namespace hash_compare namespace rw_mutex { template struct DefineRWScopedLock { struct scoped_lock { scoped_lock() requires EnableSLDefaultCtor = default; scoped_lock( RwMutex&, bool = true ) requires EnableSLMutexCtor {} // Prospective destructors ~scoped_lock() requires EnableSLDtor = default; ~scoped_lock() = delete; void acquire( RwMutex&, bool = true ) requires (EnableSLAcquire == State::correct) {} void acquire( Dummy, bool = true ) requires (EnableSLAcquire == State::incorrect_first_input) {} void acquire( RwMutex&, Dummy = Dummy{} ) requires 
(EnableSLAcquire == State::incorrect_second_input) {} bool try_acquire( RwMutex&, bool = true ) requires (EnableSLTryAcquire == State::correct) { return true; } bool try_acquire( Dummy, bool = true ) requires (EnableSLTryAcquire == State::incorrect_first_input) { return true; } bool try_acquire( RwMutex&, Dummy = Dummy{} ) requires (EnableSLTryAcquire == State::incorrect_second_input) { return true; } Dummy try_acquire( RwMutex&, bool = true ) requires (EnableSLTryAcquire == State::incorrect_return_type) { return Dummy{}; } void release() requires (EnableSLRelease) {} bool upgrade_to_writer() requires (EnableSLUpgrade == State::correct) { return true; } Dummy upgrade_to_writer() requires (EnableSLUpgrade == State::incorrect_return_type) { return Dummy{}; } bool downgrade_to_reader() requires (EnableSLDowngrade == State::correct) { return true; } Dummy downgrade_to_reader() requires (EnableSLDowngrade == State::incorrect_return_type) { return Dummy{}; } bool is_writer() const requires (EnableIsWriter == State::correct) { return true; } Dummy is_writer() const requires (EnableIsWriter == State::incorrect_return_type) { return Dummy{}; } bool is_writer() requires (EnableIsWriter == State::incorrect_constness) { return true; } }; }; template inline const bool mutex_trait_impl = true; template <> inline const int mutex_trait_impl = 0; template <> inline bool mutex_trait_impl = true; template struct RWMutex : std::conditional_t, EnableSLDefaultCtor, EnableSLMutexCtor, EnableSLDtor, EnableSLAcquire, EnableSLTryAcquire, EnableSLRelease, EnableSLUpgrade, EnableSLDowngrade, EnableSLIsWriter>, Dummy> {}; using Correct = RWMutex; using NoScopedLock = RWMutex; using ScopedLockNoDefaultCtor = RWMutex; using ScopedLockNoMutexCtor = RWMutex; using ScopedLockNoDtor = RWMutex; using ScopedLockNoAcquire = RWMutex; using ScopedLockWrongFirstInputAcquire = RWMutex; using ScopedLockWrongSecondInputAcquire = RWMutex; using ScopedLockNoTryAcquire = RWMutex; using 
ScopedLockWrongFirstInputTryAcquire = RWMutex; using ScopedLockWrongSecondInputTryAcquire = RWMutex; using ScopedLockWrongReturnTryAcquire = RWMutex; using ScopedLockNoRelease = RWMutex; using ScopedLockNoUpgrade = RWMutex; using ScopedLockWrongReturnUpgrade = RWMutex; using ScopedLockNoDowngrade = RWMutex; using ScopedLockWrongReturnDowngrade = RWMutex; using ScopedLockNoIsWriter = RWMutex; using ScopedLockIsWriterNonConst = RWMutex; using ScopedLockWrongReturnIsWriter = RWMutex; } // namespace rw_mutex // Flow Graph testing infrastructure below namespace input_node_body { template struct InputNodeBody { InputNodeBody( const InputNodeBody& ) requires EnableCopyCtor = default; // Prospective destructors ~InputNodeBody() requires EnableDtor = default; ~InputNodeBody() = delete; Output operator()( tbb::flow_control& ) requires (EnableFunctionCallOperator == State::correct) { return Output{}; } Output operator()( Dummy ) requires (EnableFunctionCallOperator == State::incorrect_first_input) { return Output{}; } Dummy operator()( tbb::flow_control& ) requires (EnableFunctionCallOperator == State::incorrect_return_type) { return Dummy{}; } }; template using Correct = InputNodeBody; template using NonCopyable = InputNodeBody; template using NonDestructible = InputNodeBody; template using NoOperatorRoundBrackets = InputNodeBody; template using WrongInputOperatorRoundBrackets = InputNodeBody; template using WrongReturnOperatorRoundBrackets = InputNodeBody; } // namespace input_node_body namespace function_node_body { /* Each operator() overload below is enabled by exactly one State value and returns a value of its own declared return type: the incorrect_first_input overload yields Output, the incorrect_return_type overload yields Dummy (same pattern as InputNodeBody). */ template struct FunctionNodeBody { FunctionNodeBody( const FunctionNodeBody& ) requires EnableCopyCtor = default; // Prospective destructors ~FunctionNodeBody() requires EnableDtor = default; ~FunctionNodeBody() = delete; Output operator()( const Input& ) requires (EnableFunctionCallOperator == State::correct) { return Output{}; } Output operator()( Dummy ) requires (EnableFunctionCallOperator == State::incorrect_first_input) { return Output{}; } Dummy
operator()( const Input& ) requires (EnableFunctionCallOperator == State::incorrect_return_type) { return Dummy{}; } }; template using Correct = FunctionNodeBody; template using NonCopyable = FunctionNodeBody; template using NonDestructible = FunctionNodeBody; template using NoOperatorRoundBrackets = FunctionNodeBody; template using WrongInputRoundBrackets = FunctionNodeBody; template using WrongReturnRoundBrackets = FunctionNodeBody; } // namespace function_node_body namespace mf_async_node_body { template struct PortsNodeBody { PortsNodeBody( const PortsNodeBody& ) requires EnableCopyCtor = default; // Prospective destructors ~PortsNodeBody() requires EnableDtor = default; ~PortsNodeBody() = delete; void operator()( const Input&, PortsType& ) requires (EnableFunctionCallOperator == State::correct) {} void operator()( Dummy, PortsType& ) requires (EnableFunctionCallOperator == State::incorrect_first_input) {} void operator()( const Input&, Dummy ) requires (EnableFunctionCallOperator == State::incorrect_second_input) {} }; } // namespace mf_async_node_body namespace multifunction_node_body { template using output_ports_type = typename tbb::flow::multifunction_node::output_ports_type; using mf_async_node_body::PortsNodeBody; template using Correct = PortsNodeBody, /*CopyCtor = */true, /*Dtor = */true, /*() = */State::correct>; template using NonCopyable = PortsNodeBody, /*CopyCtor = */false, /*Dtor = */true, /*() = */State::correct>; template using NonDestructible = PortsNodeBody, /*CopyCtor = */true, /*Dtor = */false, /*() = */State::correct>; template using NoOperatorRoundBrackets = PortsNodeBody, /*CopyCtor = */true, /*Dtor = */true, /*() = */State::not_defined>; template using WrongFirstInputOperatorRoundBrackets = PortsNodeBody, /*CopyCtor = */true, /*Dtor = */true, /*() = */State::incorrect_first_input>; template using WrongSecondInputOperatorRoundBrackets = PortsNodeBody, /*CopyCtor = */true, /*Dtor = */true, /*() = */State::incorrect_second_input>; } //
namespace multifunction_node_body namespace async_node_body { template using gateway_type = typename tbb::flow::async_node::gateway_type; using mf_async_node_body::PortsNodeBody; template using Correct = PortsNodeBody, /*CopyCtor = */true, /*Dtor = */true, /*() = */State::correct>; template using NonCopyable = PortsNodeBody, /*CopyCtor = */false, /*Dtor = */true, /*() = */State::correct>; template using NonDestructible = PortsNodeBody, /*CopyCtor = */true, /*Dtor = */false, /*() = */State::correct>; template using NoOperatorRoundBrackets = PortsNodeBody, /*CopyCtor = */true, /*Dtor = */true, /*() = */State::not_defined>; template using WrongFirstInputOperatorRoundBrackets = PortsNodeBody, /*CopyCtor = */true, /*Dtor = */true, /*() = */State::incorrect_first_input>; template using WrongSecondInputOperatorRoundBrackets = PortsNodeBody, /*CopyCtor = */true, /*Dtor = */true, /*() = */State::incorrect_second_input>; } // namespace async_node_body namespace continue_node_body { template struct ContinueNodeBody { ContinueNodeBody( const ContinueNodeBody& ) requires EnableCopyCtor = default; // Prospective destructors ~ContinueNodeBody() requires EnableDtor = default; ~ContinueNodeBody() = delete; Output operator()( tbb::flow::continue_msg ) requires (EnableFunctionCallOperator == State::correct) { return Output{}; } Output operator()( Dummy ) requires (EnableFunctionCallOperator == State::incorrect_first_input) { return Output{}; } Dummy operator()( tbb::flow::continue_msg ) requires (EnableFunctionCallOperator == State::incorrect_return_type) { return Dummy{}; } }; template using Correct = ContinueNodeBody; template using NonCopyable = ContinueNodeBody; template using NonDestructible = ContinueNodeBody; template using NoOperatorRoundBrackets = ContinueNodeBody; template using WrongInputOperatorRoundBrackets = ContinueNodeBody; template using WrongReturnOperatorRoundBrackets = ContinueNodeBody; } // namespace continue_node_body namespace sequencer { template struct 
Sequencer { Sequencer( const Sequencer& ) requires EnableCopyCtor = default; // Prospective destructors ~Sequencer() requires EnableDtor = default; ~Sequencer() = delete; std::size_t operator()( const T& ) requires (EnableFunctionCallOperator == State::correct) { return 0; } std::size_t operator()( Dummy ) requires (EnableFunctionCallOperator == State::incorrect_first_input) { return 0; } Dummy operator()( const T& ) requires (EnableFunctionCallOperator == State::incorrect_return_type) { return Dummy{}; } }; template using Correct = Sequencer; template using NonCopyable = Sequencer; template using NonDestructible = Sequencer; template using NoOperatorRoundBrackets = Sequencer; template using WrongInputOperatorRoundBrackets = Sequencer; template using WrongReturnOperatorRoundBrackets = Sequencer; } // namespace sequencer namespace join_node_function_object { template struct JoinNodeFunctionObject { JoinNodeFunctionObject( const JoinNodeFunctionObject& ) requires EnableCopyCtor = default; // Prospective destructors ~JoinNodeFunctionObject() requires EnableDtor = default; ~JoinNodeFunctionObject() = delete; Key operator()( const Input& ) requires (EnableFunctionCallOperator == State::correct) { return Key{}; } Key operator()( Dummy ) requires (EnableFunctionCallOperator == State::incorrect_first_input) { return Key{}; } Dummy operator()( const Input& ) requires (EnableFunctionCallOperator == State::incorrect_return_type) { return Dummy{}; } }; template using Correct = JoinNodeFunctionObject; template using NonCopyable = JoinNodeFunctionObject; template using NonDestructible = JoinNodeFunctionObject; template using NoOperatorRoundBrackets = JoinNodeFunctionObject; template using WrongInputOperatorRoundBrackets = JoinNodeFunctionObject; template using WrongReturnOperatorRoundBrackets = JoinNodeFunctionObject; } // namespace join_node_function_object template concept container_range = tbb::detail::tbb_range && std::input_iterator && requires(T& range) { typename 
T::value_type; typename T::reference; typename T::size_type; typename T::difference_type; { range.begin() } -> std::same_as; { range.end() } -> std::same_as; { std::as_const(range).grainsize() } -> std::same_as; }; } // namespace test_concepts #endif // __TBB_CPP20_CONCEPTS_PRESENT #endif // __TBB_test_common_concepts_common_H ================================================ FILE: third-party/tbb/test/common/concurrency_tracker.h ================================================ /* Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_test_common_concurrency_tracker_H #define __TBB_test_common_concurrency_tracker_H #include "common/test.h" #include "utils.h" #include "spin_barrier.h" #include "oneapi/tbb/parallel_for.h" #include namespace utils { static std::atomic ctInstantParallelism; static std::atomic ctPeakParallelism; static thread_local std::uintptr_t ctNested; class ConcurrencyTracker { bool m_Outer; static void Started () { unsigned p = ++ctInstantParallelism; unsigned q = ctPeakParallelism; while( q

0, "Mismatched call to ConcurrencyTracker::Stopped()" ); --ctInstantParallelism; } public: ConcurrencyTracker() : m_Outer(false) { std::uintptr_t nested = ctNested; CHECK_FAST(nested <= 1); if ( !ctNested ) { Started(); m_Outer = true; ctNested = 1; } } ~ConcurrencyTracker() { if ( m_Outer ) { Stopped(); #if __GNUC__ // Some GCC versions tries to optimize out this write operation. So we need to perform this cast. static_cast(ctNested) = 0; #else ctNested = 0; #endif } } static unsigned PeakParallelism() { return ctPeakParallelism; } static unsigned InstantParallelism() { return ctInstantParallelism; } static void Reset() { CHECK_MESSAGE(ctInstantParallelism == 0, "Reset cannot be called when concurrency tracking is underway"); ctInstantParallelism = ctPeakParallelism = 0; } }; // ConcurrencyTracker struct ExactConcurrencyLevel : NoCopy { private: SpinBarrier *myBarrier; // count unique worker threads mutable std::atomic myUniqueThreadsCnt; static thread_local int myUniqueThreads; static std::atomic myEpoch; mutable std::atomic myActiveBodyCnt; // output parameter for parallel_for body to report that max is reached mutable std::atomic myReachedMax; // zero timeout means no barrier is used during concurrency level detection const double myTimeout; const size_t myConcLevel; static std::mutex global_mutex; ExactConcurrencyLevel(double timeout, size_t concLevel) : myBarrier(nullptr), myUniqueThreadsCnt(0), myReachedMax(false), myTimeout(timeout), myConcLevel(concLevel) { myActiveBodyCnt = 0; } bool run() { const int LOOP_ITERS = 100; SpinBarrier barrier((unsigned)myConcLevel, /*throwaway=*/true); if (myTimeout != 0.) 
myBarrier = &barrier; tbb::parallel_for((size_t)0, myConcLevel*LOOP_ITERS, *this, tbb::simple_partitioner()); return myReachedMax; } public: void operator()(size_t) const { size_t v = ++myActiveBodyCnt; REQUIRE_MESSAGE(v <= myConcLevel, "Number of active bodies is too high."); if (v == myConcLevel) // record that the max expected concurrency was observed myReachedMax = true; // try to get barrier when 1st time in the thread if (myBarrier) { myBarrier->wait(); } if (myUniqueThreads != myEpoch) { ++myUniqueThreadsCnt; myUniqueThreads = myEpoch; } for (int i=0; i<100; i++) tbb::detail::machine_pause(1); --myActiveBodyCnt; } enum Mode { None, // When multiple blocking checks are performed, there might be not enough // concurrency for all of them. Serialize check() calls. Serialize }; // check that we have never got more than concLevel threads, // and that in some moment we saw exactly concLevel threads static void check(size_t concLevel, Mode m = None) { ExactConcurrencyLevel o(30., concLevel); bool ok = false; if (m == Serialize) { std::lock_guard lock(global_mutex); ok = o.run(); } else { ok = o.run(); } CHECK(ok); } static bool isEqual(size_t concLevel) { ExactConcurrencyLevel o(3., concLevel); return o.run(); } static void checkLessOrEqual(size_t concLevel) { ++ExactConcurrencyLevel::myEpoch; ExactConcurrencyLevel o(0., concLevel); o.run(); // ignore result, as without a barrier it is not reliable CHECK_MESSAGE(o.myUniqueThreadsCnt<=concLevel, "Too many workers observed."); } }; std::mutex ExactConcurrencyLevel::global_mutex; thread_local int ExactConcurrencyLevel::myUniqueThreads; std::atomic ExactConcurrencyLevel::myEpoch; } // namespace Harness #endif /* __TBB_test_common_concurrency_tracker_H */ ================================================ FILE: third-party/tbb/test/common/concurrent_associative_common.h ================================================ /* Copyright (c) 2005-2023 Intel Corporation Licensed under the Apache License, Version 2.0 (the 
"License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifndef __TBB_test_common_concurrent_associative_common_H #define __TBB_test_common_concurrent_associative_common_H #include "config.h" #include "custom_allocators.h" #include "utils.h" #include "utils_concurrency_limit.h" #include "container_move_support.h" #include "checktype.h" #include "range_based_for_support.h" #include "initializer_list_support.h" #include "node_handling_support.h" #include "containers_common.h" #include "test_comparisons.h" #include "concepts_common.h" #include #include #include "oneapi/tbb/parallel_for.h" // This structure should be specialized for multimap and multiset classes template struct AllowMultimapping : std::false_type {}; template inline void CheckAllocator(typename Table::allocator_type& a, size_t expected_allocs, size_t expected_frees, bool exact = true) { if(exact) { REQUIRE( a.allocations == expected_allocs); REQUIRE( a.frees == expected_frees); } else { REQUIRE( a.allocations >= expected_allocs); REQUIRE( a.frees >= expected_frees); REQUIRE( a.allocations - a.frees == expected_allocs - expected_frees ); } } // value generator for maps template > struct ValueFactoryBase { using strip_key = typename std::remove_const::type; static Value make( const Key& key ) { return Value(key, key); } static Value make( const Key& key, const Key& mapped ) { return Value(key, mapped); } static strip_key key( const Value& value ) { return value.first; } static strip_key get( const Value& value ) { return strip_key(value.second); } template static U convert( const Value& value ) { 
return U(value.second); } }; template struct SpecialTests { static void Test() {} }; // value generator for sets template struct ValueFactoryBase { static Key make( const Key& key ) { return key; } static Key make( const Key& key, const Key& ) { return key; } static Key key( const Key& value ) { return value; } static Key get( const Key& value ) { return value; } template static U convert( const Key& value ) { return U(value); } }; template struct Value : ValueFactoryBase { template static bool compare( const typename Container::iterator& it, U val ) { return (Value::template convert(*it) == val); } }; template void SpecialMapTests(){ Map cont; const Map &ccont( cont ); // mapped_type& operator[](const key_type& k); cont[1] = 2; // bool empty() const; REQUIRE_MESSAGE( !ccont.empty( ), "Concurrent container empty after adding an element" ); // size_type size() const; REQUIRE_MESSAGE( ccont.size( ) == 1, "Concurrent container size incorrect" ); REQUIRE_MESSAGE( cont[1] == 2, "Concurrent container value incorrect" ); // mapped_type& at( const key_type& k ); // const mapped_type& at(const key_type& k) const; REQUIRE_MESSAGE( cont.at( 1 ) == 2, "Concurrent container value incorrect" ); REQUIRE_MESSAGE( ccont.at( 1 ) == 2, "Concurrent container value incorrect" ); // iterator find(const key_type& k); typename Map::iterator it = cont.find( 1 ); REQUIRE_MESSAGE((it != cont.end( ) && Value::get( *(it) ) == 2), "Element with key 1 not properly found" ); cont.unsafe_erase( it ); it = cont.find( 1 ); REQUIRE_MESSAGE( it == cont.end( ), "Element with key 1 not properly erased" ); } template void CheckMultiMap(MultiMap &m, int *targets, int tcount, int key) { std::vector vfound(tcount,false); std::pair range = m.equal_range( key ); for(typename MultiMap::iterator it = range.first; it != range.second; ++it) { bool found = false; for( int i = 0; i < tcount; ++i) { if((*it).second == targets[i]) { if(!vfound[i]) { // we can insert duplicate values vfound[i] = found = true; break; } 
} } // just in case an extra value in equal_range... REQUIRE_MESSAGE(found, "extra value from equal range"); } for(int i = 0; i < tcount; ++i) REQUIRE_MESSAGE(vfound[i], "missing value"); } template void MultiMapEraseTests(){ MultiMap cont1, cont2; // Assignment to begin to avoid maybe-uninitialized warning typename MultiMap::iterator erased_it = cont1.begin(); for (int i = 0; i < 10; ++i) { if ( i != 1 ) { cont1.insert(std::make_pair(1, i)); cont2.insert(std::make_pair(1, i)); } else { erased_it = cont1.insert(std::make_pair(1, i)).first; } } cont1.unsafe_erase(erased_it); REQUIRE_MESSAGE(cont1.size() == cont2.size(), "Incorrect count of elements was erased"); typename MultiMap::iterator it1 = cont1.begin(); typename MultiMap::iterator it2 = cont2.begin(); for (typename MultiMap::size_type i = 0; i < cont2.size(); ++i) { REQUIRE_MESSAGE((*(it1++) == *(it2++)), "Multimap repetitive key was not erased properly"); } } template void SpecialMultiMapTests(){ int one_values[] = { 7, 2, 13, 23, 13 }; int zero_values[] = { 4, 9, 13, 29, 42, 111}; int n_zero_values = sizeof(zero_values) / sizeof(int); int n_one_values = sizeof(one_values) / sizeof(int); MultiMap cont; const MultiMap &ccont( cont ); // mapped_type& operator[](const key_type& k); cont.insert( std::make_pair( 1, one_values[0] ) ); // bool empty() const; REQUIRE_MESSAGE( !ccont.empty( ), "Concurrent container empty after adding an element" ); // size_type size() const; REQUIRE_MESSAGE( ccont.size( ) == 1, "Concurrent container size incorrect" ); REQUIRE_MESSAGE( (*(cont.begin( ))).second == one_values[0], "Concurrent container value incorrect" ); REQUIRE_MESSAGE( (*(cont.equal_range( 1 )).first).second == one_values[0], "Improper value from equal_range" ); REQUIRE_MESSAGE( ((cont.equal_range( 1 )).second == cont.end( )), "Improper iterator from equal_range" ); cont.insert( std::make_pair( 1, one_values[1] ) ); // bool empty() const; REQUIRE_MESSAGE( !ccont.empty( ), "Concurrent container empty after adding an 
element" ); // size_type size() const; REQUIRE_MESSAGE( ccont.size( ) == 2, "Concurrent container size incorrect" ); CheckMultiMap(cont, one_values, 2, 1); // insert the other {1,x} values for( int i = 2; i < n_one_values; ++i ) { cont.insert( std::make_pair( 1, one_values[i] ) ); } CheckMultiMap(cont, one_values, n_one_values, 1); REQUIRE_MESSAGE( (cont.equal_range( 1 )).second == cont.end( ), "Improper iterator from equal_range" ); cont.insert( std::make_pair( 0, zero_values[0] ) ); // bool empty() const; REQUIRE_MESSAGE( !ccont.empty( ), "Concurrent container empty after adding an element" ); // size_type size() const; REQUIRE_MESSAGE( ccont.size( ) == (size_t)(n_one_values+1), "Concurrent container size incorrect" ); CheckMultiMap(cont, one_values, n_one_values, 1); CheckMultiMap(cont, zero_values, 1, 0); REQUIRE_MESSAGE( (*cont.find(0)).second == zero_values[0], "Concurrent container value incorrect"); // insert the rest of the zero values for( int i = 1; i < n_zero_values; ++i) { cont.insert( std::make_pair( 0, zero_values[i] ) ); } CheckMultiMap(cont, one_values, n_one_values, 1); CheckMultiMap(cont, zero_values, n_zero_values, 0); // clear, reinsert interleaved cont.clear(); int bigger_num = ( n_one_values > n_zero_values ) ? 
n_one_values : n_zero_values; for( int i = 0; i < bigger_num; ++i ) { if(i < n_one_values) cont.insert( std::make_pair( 1, one_values[i] ) ); if(i < n_zero_values) cont.insert( std::make_pair( 0, zero_values[i] ) ); } CheckMultiMap(cont, one_values, n_one_values, 1); CheckMultiMap(cont, zero_values, n_zero_values, 0); MultiMapEraseTests(); } template void check_value_state( const T& t, std::true_type ) { REQUIRE_MESSAGE(is_state_predicate{}(t), "Unexpected value state"); } template void check_value_state(const T&, std::false_type) {} template void test_rvalue_insert( Key k1, Key k2 ) { Container cont; std::pair ins = cont.insert(Value::make(k1)); REQUIRE_MESSAGE(ins.second, "Element 1 has not been inserted"); REQUIRE_MESSAGE(Value::get(*ins.first) == k1, "Element 1 has not been inserted"); check_value_state(*ins.first, CheckElementState{}); typename Container::iterator it2 = cont.insert(ins.first, Value::make(k2)); REQUIRE_MESSAGE(Value::get(*it2) == k2, "Element 2 has not been inserted"); check_value_state(*it2, CheckElementState{}); } namespace emplace_helpers { template std::pair call_emplace_impl( Container& c, Arg&& k, Value* ) { // Call emplace implementation for sets return c.emplace(std::forward(k)); } template std::pair call_emplace_impl( Container& c, Arg&& k, std::pair* ) { // Call emplace implementation for maps return c.emplace(k, std::forward(k)); } template typename Container::iterator call_emplace_hint_impl( Container& c, typename Container::const_iterator hint, Arg&& k, Value* ) { // Call emplace_hint implementation for sets return c.emplace_hint(hint, std::forward(k)); } template typename Container::iterator call_emplace_hint_impl( Container& c, typename Container::const_iterator hint, Arg&& k, std::pair* ) { // Call emplace_hint implementation for maps return c.emplace_hint(hint, k, std::forward(k)); } template std::pair call_emplace( Container& c, Arg&& k ) { typename Container::value_type* selector = nullptr; return call_emplace_impl(c, 
std::forward(k), selector); } template typename Container::iterator call_emplace_hint( Container& c, typename Container::const_iterator hint, Arg&& k ) { typename Container::value_type* selector = nullptr; return call_emplace_hint_impl(c, hint, std::forward(k), selector); } } // namespace emplace_helpers template void test_emplace_insert( Key key1, Key key2 ) { Container cont; std::pair ins = emplace_helpers::call_emplace(cont, key1); REQUIRE_MESSAGE(ins.second, "Element 1 has not been inserted"); REQUIRE_MESSAGE(Value::compare(ins.first, key1), "Element 1 has not been inserted"); check_value_state(*ins.first, CheckElementState{}); //if (!AllowMultimapping::value) { // std::pair rep_ins = emplace_helpers::call_emplace(cont, key1); // REQUIRE_MESSAGE(!rep_ins.second, "Element 1 has been emplaced twice into the container with unique keys"); // REQUIRE(Value::compare(rep_ins.first, key1)); //} typename Container::iterator it2 = emplace_helpers::call_emplace_hint(cont, ins.first, key2); REQUIRE_MESSAGE(Value::compare(it2, key2), "Element 2 has not been inserted"); check_value_state(*it2, CheckElementState{}); } template std::pair CheckRecursiveRange( Range range ) { std::pair sum(0, 0); // count, sum for (Iterator i = range.begin(); i != range.end(); ++i) { ++sum.first; sum.second += Value::get(*i); } if (range.is_divisible()) { Range range2(range, tbb::split{}); auto sum1 = CheckRecursiveRange(range); auto sum2 = CheckRecursiveRange(range2); sum1.first += sum2.first; sum1.second += sum2.second; REQUIRE_MESSAGE(sum == sum1, "Mismatched ranges afted division"); } return sum; } using atomic_byte_type = std::atomic; void CheckRange( atomic_byte_type array[], int n, bool allow_multimapping, int odd_count ) { if (allow_multimapping) { for (int k = 0; k < n; ++k) { if (k % 2) { REQUIRE(array[k] == odd_count); } else { REQUIRE(array[k] == 2); } } } else { for (int k = 0; k < n; ++k) { REQUIRE(array[k] == 1); } } } template void check_equal( const T& cont1, const T& cont2 ) { 
REQUIRE_MESSAGE(cont1 == cont2, "Containers should be equal"); REQUIRE_MESSAGE(cont2 == cont1, "Containers should be equal"); REQUIRE_MESSAGE(!(cont1 != cont2), "Containers should not be unequal"); REQUIRE_MESSAGE(!(cont2 != cont1), "Containers should not be unequal"); } template void check_unequal( const T& cont1, const T& cont2 ) { REQUIRE_MESSAGE(cont1 != cont2, "Containers should be unequal"); REQUIRE_MESSAGE(cont2 != cont1, "Containers should be unequal"); REQUIRE_MESSAGE(!(cont1 == cont2), "Containers should not be equal"); REQUIRE_MESSAGE(!(cont2 == cont1), "Containers should not be equal"); } // Break value for maps template void break_value( std::pair& value ) { ++value.second; } template void break_value( std::pair& value ) { ++value.second.bar(); } // Break value for sets template void break_value( T& value ) { ++value; } void break_value( move_support_tests::FooWithAssign& value ) { ++value.bar(); } template void test_comparison_operators() { T cont; check_equal(cont, cont); cont.insert(Value::make(1)); cont.insert(Value::make(2)); T cont2 = cont; check_equal(cont, cont2); T cont3; check_unequal(cont, cont3); T cont4; cont4.insert(Value::make(1)); cont4.insert(Value::make(2)); check_equal(cont, cont4); T cont5; cont5.insert(Value::make(1)); cont5.insert(Value::make(3)); check_unequal(cont, cont5); T cont6; cont6.insert(Value::make(1)); auto value2 = Value::make(2); break_value(value2); cont6.insert(value2); check_unequal(cont, cont6); } template void test_empty_container_range(Container&& cont) { REQUIRE(cont.empty()); Range r = cont.range(); REQUIRE_MESSAGE(r.empty(), "Empty container range should be empty"); REQUIRE_MESSAGE(!r.is_divisible(), "Empty container range should not be divisible"); REQUIRE_MESSAGE(r.begin() == r.end(), "Incorrect iterators on empty range"); REQUIRE_MESSAGE(r.begin() == cont.begin(), "Incorrect iterators on empty range"); } template void test_basic_common() { T cont; const T &ccont(cont); CheckNoAllocations(cont); // bool 
empty() const; REQUIRE_MESSAGE(ccont.empty(), "Concurrent container is not empty after construction"); // size_type size() const; REQUIRE_MESSAGE(ccont.size() == 0, "Concurrent container is not empty after construction"); // size_type max_size() const; REQUIRE_MESSAGE(ccont.max_size() > 0, "Concurrent container max size is invalid"); //iterator begin(); //iterator end(); REQUIRE_MESSAGE(cont.begin() == cont.end(), "Concurrent container iterators are invalid after construction"); REQUIRE_MESSAGE(ccont.begin() == ccont.end(), "Concurrent container iterators are invalid after construction"); REQUIRE_MESSAGE(cont.cbegin() == cont.cend(), "Concurrent container iterators are invalid after construction"); // Test range for empty container using range_type = typename T::range_type; using const_range_type = typename T::const_range_type; test_empty_container_range(cont); test_empty_container_range(cont); test_empty_container_range(ccont); T empty_cont; const T& empty_ccont = empty_cont; for (int i = 0; i < 1000; ++i) { empty_cont.insert(Value::make(i)); } empty_cont.clear(); test_empty_container_range(empty_cont); test_empty_container_range(empty_cont); test_empty_container_range(empty_ccont); //std::pair insert(const value_type& obj); std::pair ins = cont.insert(Value::make(1)); REQUIRE_MESSAGE((ins.second == true && Value::get(*(ins.first)) == 1), "Element 1 has not been inserted properly"); test_rvalue_insert(1,2); test_emplace_insert(1,2); // bool empty() const; REQUIRE_MESSAGE(!ccont.empty(), "Concurrent container is empty after adding an element"); // size_type size() const; REQUIRE_MESSAGE(ccont.size() == 1, "Concurrent container size is incorrect"); std::pair ins2 = cont.insert(Value::make(1)); if (AllowMultimapping::value) { // std::pair insert(const value_type& obj); REQUIRE_MESSAGE((ins2.second == true && Value::get(*(ins2.first)) == 1), "Element 1 has not been inserted properly"); // size_type size() const; REQUIRE_MESSAGE(ccont.size() == 2, "Concurrent container 
size is incorrect"); // size_type count(const key_type& k) const; REQUIRE_MESSAGE(ccont.count(1) == 2, "Concurrent container count(1) is incorrect"); // std::pair equal_range(const key_type& k); std::pair range = cont.equal_range(1); typename T::iterator it; it = range.first; REQUIRE_MESSAGE((it != cont.end() && Value::get(*it) == 1), "Element 1 has not been found properly"); unsigned int count = 0; for (; it != range.second; it++) { count++; REQUIRE_MESSAGE((Value::get(*it) == 1), "Element 1 has not been found properly"); } REQUIRE_MESSAGE(count == 2, "Range doesn't have the right number of elements"); } else { // std::pair insert(const value_type& obj); REQUIRE_MESSAGE((ins2.second == false && ins2.first == ins.first), "Element 1 should not be re-inserted"); // size_type size() const; REQUIRE_MESSAGE(ccont.size() == 1, "Concurrent container size is incorrect"); // size_type count(const key_type& k) const; REQUIRE_MESSAGE(ccont.count(1) == 1, "Concurrent container count(1) is incorrect"); // std::pair equal_range(const key_type& k) const; // std::pair equal_range(const key_type& k); std::pair range = cont.equal_range(1); typename T::iterator it = range.first; REQUIRE_MESSAGE((it != cont.end() && Value::get(*it) == 1), "Element 1 has not been found properly"); REQUIRE_MESSAGE((++it == range.second), "Range doesn't have the right number of elements"); } // const_iterator find(const key_type& k) const; // iterator find(const key_type& k); typename T::iterator it = cont.find(1); REQUIRE_MESSAGE((it != cont.end() && Value::get(*(it)) == 1), "Element 1 has not been found properly"); REQUIRE_MESSAGE(ccont.find(1) == it, "Element 1 has not been found properly"); //bool contains(const key_type&k) const REQUIRE_MESSAGE(cont.contains(1), "contains() cannot detect existing element"); REQUIRE_MESSAGE(!cont.contains(0), "contains() detect not existing element"); // iterator insert(const_iterator hint, const value_type& obj); typename T::iterator it2 = cont.insert(ins.first, 
Value::make(2)); REQUIRE_MESSAGE((Value::get(*it2) == 2), "Element 2 has not been inserted properly"); // T(const T& _Umap) T newcont = ccont; REQUIRE_MESSAGE((AllowMultimapping{} ? (newcont.size() == 3) : (newcont.size() == 2)), "Copy construction has not copied the elements properly"); // size_type unsafe_erase(const key_type& k); typename T::size_type size; #if _MSC_VER && __INTEL_COMPILER == 1900 // The compiler optimizes the next line too aggressively. #pragma noinline #endif size = cont.unsafe_erase(1); REQUIRE_MESSAGE((AllowMultimapping{} ? (size == 2) : (size == 1)), "Erase has not removed the right number of elements"); // iterator unsafe_erase(iterator position); typename T::iterator it4 = cont.unsafe_erase(cont.find(2)); REQUIRE_MESSAGE((it4 == cont.end() && cont.size() == 0), "Erase has not removed the last element properly"); // iterator unsafe_erase(const_iterator position); cont.insert(Value::make(3)); typename T::iterator it5 = cont.unsafe_erase(cont.cbegin()); REQUIRE_MESSAGE((it5 == cont.end() && cont.size() == 0), "Erase has not removed the last element properly"); // template void insert(InputIterator first, InputIterator last); cont.insert(newcont.begin(), newcont.end()); REQUIRE_MESSAGE((AllowMultimapping{} ? 
(cont.size() == 3) : (cont.size() == 2)), "Range insert has not copied the elements properly"); // iterator unsafe_erase(const_iterator first, const_iterator last); std::pair range2 = newcont.equal_range(1); newcont.unsafe_erase(range2.first, range2.second); REQUIRE_MESSAGE(newcont.size() == 1, "Range erase has not erased the elements properly"); // void clear(); newcont.clear(); REQUIRE_MESSAGE((newcont.begin() == newcont.end() && newcont.size() == 0), "Clear has not cleared the container"); // void insert(std::initializer_list &il); newcont.insert( { Value::make( 1 ), Value::make( 2 ), Value::make( 1 ) } ); if (AllowMultimapping::value) { REQUIRE_MESSAGE(newcont.size() == 3, "Concurrent container size is incorrect"); REQUIRE_MESSAGE(newcont.count(1) == 2, "Concurrent container count(1) is incorrect"); REQUIRE_MESSAGE(newcont.count(2) == 1, "Concurrent container count(2) is incorrect"); std::pair range = cont.equal_range(1); it = range.first; // REQUIRE_MESSAGE((it != newcont.end() && Value::get(*it) == 1), "Element 1 has not been found properly"); REQUIRE_MESSAGE((it != newcont.end()), "iterator" ); REQUIRE_MESSAGE((Value::get(*it) == 1), "value"); unsigned int count = 0; for (; it != range.second; it++) { count++; REQUIRE_MESSAGE(Value::get(*it) == 1, "Element 1 has not been found properly"); } REQUIRE_MESSAGE(count == 2, "Range doesn't have the right number of elements"); range = newcont.equal_range(2); it = range.first; REQUIRE_MESSAGE((it != newcont.end() && Value::get(*it) == 2), "Element 2 has not been found properly"); count = 0; for (; it != range.second; it++) { count++; REQUIRE_MESSAGE(Value::get(*it) == 2, "Element 2 has not been found properly"); } REQUIRE_MESSAGE(count == 1, "Range doesn't have the right number of elements"); } else { REQUIRE_MESSAGE(newcont.size() == 2, "Concurrent container size is incorrect"); REQUIRE_MESSAGE(newcont.count(1) == 1, "Concurrent container count(1) is incorrect"); REQUIRE_MESSAGE(newcont.count(2) == 1, "Concurrent 
container count(2) is incorrect"); std::pair range = newcont.equal_range(1); it = range.first; REQUIRE_MESSAGE((it != newcont.end() && Value::get(*it) == 1), "Element 1 has not been found properly"); REQUIRE_MESSAGE((++it == range.second), "Range doesn't have the right number of elements"); range = newcont.equal_range(2); it = range.first; REQUIRE_MESSAGE((it != newcont.end() && Value::get(*it) == 2), "Element 2 has not been found properly"); REQUIRE_MESSAGE((++it == range.second), "Range doesn't have the right number of elements"); } // T& operator=(const T& _Umap) newcont = ccont; REQUIRE_MESSAGE((AllowMultimapping{} ? (newcont.size() == 3) : (newcont.size() == 2)), "Assignment operator has not copied the elements properly"); cont.clear(); CheckNoAllocations(cont); for (int i = 0; i < 256; i++) { std::pair ins3 = cont.insert(Value::make(i)); REQUIRE_MESSAGE((ins3.second == true && Value::get(*(ins3.first)) == i), "Element 1 has not been inserted properly"); } REQUIRE_MESSAGE(cont.size() == 256, "Wrong number of elements have been inserted"); REQUIRE(!cont.range().empty()); REQUIRE(!ccont.range().empty()); REQUIRE((256 == CheckRecursiveRange(cont.range()).first)); REQUIRE((256 == CheckRecursiveRange(ccont.range()).first)); REQUIRE(cont.range().grainsize() > 0); REQUIRE(ccont.range().grainsize() > 0); // void swap(T&); cont.swap(newcont); REQUIRE_MESSAGE(newcont.size() == 256, "Wrong number of elements after swap"); REQUIRE_MESSAGE(newcont.count(200) == 1, "Element with key 200 is not present after swap"); REQUIRE_MESSAGE(newcont.count(16) == 1, "Element with key 16 is not present after swap"); REQUIRE_MESSAGE(newcont.count(99) == 1, "Element with key 99 is not present after swap"); REQUIRE_MESSAGE((AllowMultimapping{} ? 
(cont.size() == 3) : (cont.size() == 2)), "Assignment operator has not copied the elements properly"); T newcont_bkp = newcont; newcont.swap(newcont); REQUIRE_MESSAGE(newcont == newcont_bkp, "Unexpected swap-with-itself behavior"); test_comparison_operators(); SpecialTests::Test(); } template class FillTable { Container& my_table; const int my_items; bool my_asymptotic; using pair_ib = std::pair; public: FillTable(Container& table, int items, bool asymptotic) : my_table(table), my_items(items), my_asymptotic(asymptotic) { REQUIRE((!(items&1) && items > 100)); } void operator()( int thread_index ) const { if (thread_index == 0) { // Fill even keys forward (single thread) bool last_inserted = true; for (int i = 0; i < my_items; i += 2) { int val = my_asymptotic ? 1 : i; pair_ib pib = my_table.insert(Value::make(val)); REQUIRE_MESSAGE((Value::get(*(pib.first)) == val), "Element not properly inserted"); REQUIRE_MESSAGE((last_inserted || !pib.second), "Previous key was not inserted but current key is inserted"); last_inserted = pib.second; } } else if (thread_index == 1) { // Fill even keys backward (single thread) bool last_inserted = true; for (int i = my_items - 2; i >= 0; i -= 2) { int val = my_asymptotic ? 
1 : i; pair_ib pib = my_table.insert(Value::make(val)); REQUIRE_MESSAGE((Value::get(*(pib.first)) == val), "Element not properly inserted"); REQUIRE_MESSAGE((last_inserted || !pib.second), "Previous key was not inserted but current key is inserted"); last_inserted = pib.second; } } else if (!(thread_index & 1)) { // Fill odd keys forward (multiple threads) for (int i = 1; i < my_items; i += 2) { if (i % 32 == 1 && i + 6 < my_items) { if (my_asymptotic) { auto init = { Value::make(1), Value::make(1), Value::make(1) }; my_table.insert(init); REQUIRE_MESSAGE(Value::get(*my_table.find(1)) == 1, "Element not properly inserted"); } else { auto init = { Value::make(i), Value::make(i + 2), Value::make(i + 4) }; my_table.insert(init); REQUIRE_MESSAGE(Value::get(*my_table.find(i)) == i, "Element i not properly inserted"); REQUIRE_MESSAGE(Value::get(*my_table.find(i + 2)) == i + 2, "Element i + 2 not properly inserted"); REQUIRE_MESSAGE(Value::get(*my_table.find(i + 4)) == i + 4, "Element i + 4 not properly inserted"); } i += 4; } else { pair_ib pib = my_table.insert(Value::make(my_asymptotic ? 1 : i)); REQUIRE_MESSAGE(Value::get(*(pib.first)) == (my_asymptotic ? 
1 : i), "Element not properly inserted"); } } } else { // Check odd keys backward (multiple threads) if (!my_asymptotic) { bool last_found = false; for (int i = my_items - 1; i >= 0; i -= 2) { typename Container::iterator it = my_table.find(i); if (it != my_table.end()) { // found REQUIRE_MESSAGE(Value::get(*it) == i, "Element not properly inserted"); last_found = true; } else { REQUIRE_MESSAGE(!last_found, "Previous key was found, but current was not found"); } } } } } }; // class FillTable template struct ParallelTraverseBody { const int n; atomic_byte_type* const array; ParallelTraverseBody( atomic_byte_type arr[], int num ) : n(num), array(arr) {} void operator()( const Range& range ) const { for (auto i = range.begin(); i != range.end(); ++i) { int k = static_cast(Value::key(*i)); REQUIRE(k == Value::get(*i)); REQUIRE(0 <= k); REQUIRE(k < n); ++array[k]; } } }; // struct ParallelTraverseBody template class CheckTable : utils::NoAssign { T& table; public: CheckTable(T& t) : NoAssign(), table(t) {} void operator()(int i) const { int c = (int)table.count(i); CHECK_MESSAGE(c, "must exist"); } }; template void test_concurrent_common( bool asymptotic = false ) { #if __TBB_USE_ASSERT int items = 2000; #else int items = 20000; #endif int items_inserted = 0; int num_threads = 16; Container table; if (AllowMultimapping::value) { // TODO: comment items = 4 * items / (num_threads + 2); items_inserted = items + (num_threads - 2) * items / 4; } else { items_inserted = items; } utils::NativeParallelFor(num_threads, FillTable(table, items, asymptotic)); REQUIRE(int(table.size()) == items_inserted); if (!asymptotic) { atomic_byte_type* array = new atomic_byte_type[items]; std::memset(static_cast(array), 0, items * sizeof(atomic_byte_type)); typename Container::range_type r = table.range(); auto p = CheckRecursiveRange(r); REQUIRE(items_inserted == p.first); tbb::parallel_for(r, ParallelTraverseBody(array, items)); CheckRange(array, items, AllowMultimapping::value, (num_threads 
- 1)/2); const Container& const_table = table; std::memset(static_cast(array), 0, items * sizeof(atomic_byte_type)); typename Container::const_range_type cr = const_table.range(); p = CheckRecursiveRange(cr); REQUIRE(items_inserted == p.first); tbb::parallel_for(cr, ParallelTraverseBody(array, items)); CheckRange(array, items, AllowMultimapping::value, (num_threads - 1) / 2); delete[] array; tbb::parallel_for(0, items, CheckTable(table)); } table.clear(); // TODO: add check for container allocator counters } template void test_rvalue_ref_support() { using namespace move_support_tests; test_move_constructor(); test_move_assignment(); #if TBB_USE_EXCEPTIONS test_ex_move_constructor(); #endif } template void test_range_based_for_support() { using namespace range_based_for_support_tests; Container cont; const int sequence_length = 100; for (int i = 1; i <= sequence_length; ++i) { cont.insert(Value::make(i)); } auto range_based_for_result = range_based_for_accumulate(cont, UnifiedSummer{}, 0); auto reference_result = gauss_summ_of_int_sequence(sequence_length); REQUIRE_MESSAGE(range_based_for_result == reference_result, "Incorrect accumulated value generated via range based for"); } template void test_initializer_list_support( std::initializer_list init ) { using namespace initializer_list_support_tests; test_initializer_list_support_without_assign(init); test_initializer_list_support_without_assign({}); } template void test_set_specific_types() { // TODO: add tests for atomics Checker check_types; const int num = 10; // Test int std::list arr_int; for (int i = 0; i != num; ++i) { arr_int.emplace_back(i); } check_types.template check(arr_int); // Test reference_wrapper std::list> arr_ref; for (auto it = arr_int.begin(); it != arr_int.end(); ++it) { arr_ref.emplace_back(*it); } check_types.template check(arr_ref); // Test share_ptr std::list> arr_shr; for (int i = 0; i != num; ++i) { arr_shr.emplace_back(std::make_shared(i)); } check_types.template check(arr_shr); // 
Test weak_ptr std::list> arr_weak; std::copy(arr_shr.begin(), arr_shr.end(), std::back_inserter(arr_weak)); check_types.template check(arr_weak); // Test std::pair std::list> arr_pairs; for (int i = 0; i != num; ++i) { arr_pairs.emplace_back(i, i); } check_types.template check(arr_pairs); // Test std::basic_string std::list, tbb::tbb_allocator>> arr_strings; for (int i = 0; i != num; ++i) { arr_strings.emplace_back(i, char(i)); } check_types.template check(arr_strings); } template void test_map_specific_types() { // TODO: add tests for int-atomic pairs Checker check_types; const int num = 10; // Test int-int pairs std::list> arr_int_int_pairs; for (int i = 0; i != num; ++i) { arr_int_int_pairs.emplace_back(i, num - i); } check_types.template check(arr_int_int_pairs); // Test reference_wrapper-int pairs std::list, int>> arr_ref_int_pairs; for (auto& item : arr_int_int_pairs) { arr_ref_int_pairs.emplace_back(item.first, item.second); } check_types.template check(arr_ref_int_pairs); // Test int-reference_wrapper pairs std::list>> arr_int_ref_pairs; for (auto& item : arr_int_int_pairs) { arr_int_ref_pairs.emplace_back(item.first, item.second); } check_types.template check(arr_int_ref_pairs); // Test shared_ptr std::list, std::shared_ptr>> arr_shared_pairs; for (int i = 0; i != num; ++i) { const int num_minus_i = num - i; arr_shared_pairs.emplace_back(std::make_shared(i), std::make_shared(num_minus_i)); } check_types.template check(arr_shared_pairs); // Test weak_ptr std::list, std::weak_ptr>> arr_wick_pairs; std::copy(arr_shared_pairs.begin(), arr_shared_pairs.end(), std::back_inserter(arr_wick_pairs)); check_types.template check(arr_wick_pairs); // Test std::pair using pair_key_type = std::pair; std::list> arr_pair_int_pairs; for (int i = 0; i != num; ++i) { arr_pair_int_pairs.emplace_back(pair_key_type{i, i}, i); } check_types.template check(arr_pair_int_pairs); // Test std::basic_string using tbb_string_key_type = std::basic_string, tbb::tbb_allocator>; std::list> 
arr_tbb_string_pairs; for (int i = 0; i != num; ++i) { tbb_string_key_type key(i, char(i)); arr_tbb_string_pairs.emplace_back(key, i); } check_types.template check(arr_tbb_string_pairs); } namespace test { // For the sake of simplified testing, make std::unique_ptr implicitly convertible to/from the pointer template class unique_ptr : public std::unique_ptr { public: using pointer = typename std::unique_ptr::pointer; unique_ptr( pointer p ) : std::unique_ptr(p) {} operator pointer() const { return this->get(); } }; // class unique_ptr } // namespace test namespace std { template struct hash> { std::size_t operator()(const test::unique_ptr& ptr) const { return std::hash>{}(ptr); } }; } template struct CallIf { template void operator()( Func func ) const { func(); } }; template <> struct CallIf { template void operator()( Func ) const {} }; template class TestOperatorSquareBrackets { using value_type = typename Table::value_type; Table& my_c; const value_type& my_value; public: TestOperatorSquareBrackets( Table& c, const value_type& value ) : my_c(c), my_value(value) {} void operator()() const { utils::IsEqual equal; REQUIRE(equal(my_c[my_value.first], my_value.second)); typename Table::key_type temp_key = my_value.first; REQUIRE(equal(my_c[std::move(temp_key)], my_value.second)); } }; // class TestOperatorSquareBrackets template void TestSquareBracketsAndAt( Table&, const Value&, /*multimap = */std::true_type ) { // operator [] and at are not presented in the multimap } template void TestSquareBracketsAndAt( Table& c, const Value& value, /*multimap = */std::false_type ) { CallIf()(TestOperatorSquareBrackets(c, value)); utils::IsEqual equal; REQUIRE(equal(c.at(value.first), value.second)); // TODO: add test for at exceptions const Table& constC = c; REQUIRE(equal(constC.at(value.first), value.second)); } template void TestMapSpecificMethods( Table&, const Value& ) {} template void TestMapSpecificMethods( Table& c, const std::pair& value ) { TestSquareBracketsAndAt(c, 
value, std::integral_constant::value>{}); } template class CheckValue { Table& my_c; public: CheckValue( Table& c ) : my_c(c) {} void operator()( const typename Table::value_type& value ) { using iterator = typename Table::iterator; using const_iterator = typename Table::const_iterator; const Table& constC = my_c; // count REQUIRE(my_c.count(Value
::key(value)) == 1); // find utils::IsEqual equal; REQUIRE(equal(*my_c.find(Value
::key(value)), value)); REQUIRE(equal(*constC.find(Value
::key(value)), value)); // erase REQUIRE(my_c.unsafe_erase(Value
::key(value)) != 0); REQUIRE(my_c.unsafe_erase(Value
::key(value)) == 0); // insert std::pair res = my_c.insert(value); REQUIRE(equal(*res.first, value)); REQUIRE(res.second); // erase iterator it = res.first; ++it; REQUIRE(my_c.unsafe_erase(res.first) == it); // insert REQUIRE(equal(*my_c.insert(my_c.begin(), value), value)); // equal_range std::pair r1 = my_c.equal_range(Value
::key(value)); REQUIRE((equal(*r1.first, value) && ++r1.first == r1.second)); std::pair r2 = constC.equal_range(Value
::key(value)); REQUIRE((equal(*r2.first, value) && ++r2.first == r2.second)); TestMapSpecificMethods(my_c, value); } }; // class CheckValue namespace detail { #if (__INTEL_COMPILER || __clang__ ) && __TBB_GLIBCXX_VERSION && __TBB_GLIBCXX_VERSION < 40900 template struct assignable_atomic : std::atomic { using std::atomic::operator=; assignable_atomic& operator=(const assignable_atomic& a) { store(a.load(std::memory_order_relaxed), std::memory_order_relaxed); } }; template using atomic_type = assignable_atomic; #else template using atomic_type = std::atomic; #endif } template class TestRange { const std::list& my_lst; std::vector>& my_marks; public: TestRange( const std::list& lst, std::vector>& marks ) : my_lst(lst), my_marks(marks) { std::fill(my_marks.begin(), my_marks.end(), false); } template void operator()( const Range& r ) const { do_test_range(r.begin(), r.end()); } template void do_test_range( Iterator i, Iterator j ) const { for (Iterator it = i; it != j;) { Iterator prev_it = it++; auto it2 = std::search(my_lst.begin(), my_lst.end(), prev_it, it, utils::IsEqual()); REQUIRE(it2 != my_lst.end()); auto dist = std::distance(my_lst.begin(), it2); REQUIRE(!my_marks[dist]); my_marks[dist] = true; } } }; // class TestRange template void CommonExamine( Table c, const std::list& lst ) { using value_type = typename Table::value_type; if (!(!c.empty() && c.size() == lst.size() && c.max_size() >= c.size())) { std::cout << std::boolalpha; std::cout << "Empty? " << c.empty() << std::endl; std::cout << "sizes equal? " << (c.size() == lst.size()) << std::endl; std::cout << "\t" << c.size() << std::endl; std::cout << "\t" << lst.size() << std::endl; std::cout << "Max size greater? 
" << (c.max_size() >= c.size()) << std::endl; } REQUIRE((!c.empty() && c.size() == lst.size() && c.max_size() >= c.size())); std::for_each(lst.begin(), lst.end(), CheckValue(c)); std::vector> marks(lst.size()); TestRange(lst, marks).do_test_range(c.begin(), c.end()); REQUIRE(std::find(marks.begin(), marks.end(), false) == marks.end()); const Table constC = c; REQUIRE(c.size() == constC.size()); TestRange(lst, marks).do_test_range(c.begin(), c.end()); REQUIRE(std::find(marks.begin(), marks.end(), false) == marks.end()); tbb::parallel_for(c.range(), TestRange(lst, marks)); REQUIRE(std::find(marks.begin(), marks.end(), false) == marks.end()); tbb::parallel_for(constC.range(), TestRange(lst, marks)); REQUIRE(std::find(marks.begin(), marks.end(), false) == marks.end()); Table c2; auto begin5 = lst.begin(); std::advance(begin5, 5); c2.insert(lst.begin(), begin5); std::for_each(lst.begin(), begin5, CheckValue(c2)); c2.swap(c); REQUIRE(c2.size() == lst.size()); REQUIRE(c.size() == 5); std::for_each(lst.begin(), lst.end(), CheckValue(c2)); c2.clear(); REQUIRE(c2.size() == 0); auto alloc = c.get_allocator(); value_type* ptr = alloc.allocate(1); REQUIRE(ptr != nullptr); alloc.deallocate(ptr, 1); } template void test_scoped_allocator() { using allocator_data_type = AllocatorAwareData>>; using basic_allocator_type = std::scoped_allocator_adaptor>; using container_value_type = typename ContainerTraits::template container_value_type; using allocator_type = typename std::allocator_traits::template rebind_alloc; using container_type = typename ContainerTraits::template container_type; allocator_type allocator; allocator_data_type key1(1, allocator); allocator_data_type key2(2, allocator); container_value_type value1 = Value::make(key1); container_value_type value2 = Value::make(key2); auto init_list = { value1, value2 }; container_type c1(allocator); container_type c2(allocator); allocator_data_type::activate(); emplace_helpers::call_emplace(c1, key1); 
emplace_helpers::call_emplace(c2, std::move(key2)); c1.clear(); c2.clear(); c1.insert(value1); c2.insert(std::move(value2)); c1.clear(); c2.clear(); c1.insert(init_list); c2.insert(value1); c1 = c2; c2 = std::move(c1); allocator_data_type::deactivate(); } struct int_key { int my_item; int_key(int i) : my_item(i) {} }; bool operator==(const int_key& ik, int i) { return ik.my_item == i; } bool operator==(int i, const int_key& ik) { return i == ik.my_item; } bool operator==(const int_key& ik1, const int_key& ik2) { return ik1.my_item == ik2.my_item; } bool operator<( const int_key& ik, int i ) { return ik.my_item < i; } bool operator<( int i, const int_key& ik ) { return i < ik.my_item; } bool operator<( const int_key& ik1, const int_key& ik2 ) { return ik1.my_item < ik2.my_item; } struct char_key { const char* my_item; char_key(const char* c) : my_item(c) {} const char& operator[] (std::size_t pos) const { return my_item[pos]; } std::size_t size() const { return std::strlen(my_item); } }; bool operator==(const char_key& ck, std::string c) { std::size_t i = 0; while (ck[i] != '\0' && i < c.size() && ck[i] == c[i]) { ++i;} return c.size() == i && ck[i] == '\0'; } bool operator==(std::string c, const char_key& ck) { return ck == c; } bool operator==(const char_key& ck1, const char_key& ck2) { std::size_t i = 0; while (ck1[i] != '\0' && ck2[i] != '\0' && ck1[i] == ck2[i]) { ++i;} return ck1[i] == ck2[i]; } bool operator<( const char_key& ck, std::string c ) { return std::lexicographical_compare(ck.my_item, ck.my_item + ck.size(), c.begin(), c.end()); } bool operator<(std::string c, const char_key& ck) { return std::lexicographical_compare(c.begin(), c.end(), ck.my_item, ck.my_item + ck.size()); } bool operator<( const char_key& ck1, const char_key& ck2 ) { return std::lexicographical_compare(ck1.my_item, ck1.my_item + ck1.size(), ck2.my_item, ck2.my_item + ck2.size()); } struct equal_to { using is_transparent = int; template bool operator()(const T &lhs, const W &rhs) 
const { return lhs == rhs; } }; struct hash_with_transparent_key_equal { template size_t operator()(const T& key) const { return hash(key); } using transparent_key_equal = equal_to; int prime = 433494437; int first_factor = 41241245; int second_factor = 2523422; size_t hash(const int& key) const { return (first_factor * key + second_factor) % prime; } size_t hash(const int_key& key) const { return (first_factor * key.my_item + second_factor) % prime; } size_t hash(const std::string& key) const { int sum = 0; for (auto it = key.begin(); it != key.end(); ++it) { sum += first_factor * (*it) + second_factor; } return sum % prime; } size_t hash(const char_key& key) const { int sum = 0; int i = 0; while (key[i] != '\0') { sum += first_factor * key[i++] + second_factor; } return sum % prime; } }; template void check_heterogeneous_functions_key_int_impl() { static_assert(std::is_same::value, "incorrect key_type for heterogeneous lookup test"); // Initialization Container c; int size = 10; for (int i = 0; i < size; i++) { c.insert(Value::make(i)); } // Insert first duplicated element for multicontainers if (AllowMultimapping::value) { c.insert(Value::make(0)); } // Look up testing for (int i = 0; i < size; i++) { int_key k(i); int key = i; REQUIRE_MESSAGE(c.find(k) == c.find(key), "Incorrect heterogeneous find return value"); REQUIRE_MESSAGE(c.count(k) == c.count(key), "Incorrect heterogeneous count return value"); } // unsafe_extract testing for (int i = 0; i < size; i++) { Container extract_c = c; int_key int_k(i); auto int_key_extract = extract_c.unsafe_extract(int_k); if (!AllowMultimapping::value) { REQUIRE_MESSAGE(extract_c.find(int_k) == extract_c.end(), "Key exists after extract"); } REQUIRE_MESSAGE(!int_key_extract.empty(), "Empty node with exists key"); REQUIRE_MESSAGE(node_handling_tests::compare_handle_getters(int_key_extract, Value::make(i)), "Incorrect node"); } // unsafe_extract not exists key auto not_exists = c.unsafe_extract(int_key(100)); 
REQUIRE_MESSAGE(not_exists.empty(), "Not empty node with not exists key"); // multimap unsafe_extract testing if (AllowMultimapping::value) { Container extract_m; for (int i = 0; i < size; i++) { extract_m.insert(Value::make(i)); extract_m.insert(Value::make(i, i + 1)); } for (int i = 0; i < size; i++) { int_key int_k(i); auto int_key_extract = extract_m.unsafe_extract(int_k); REQUIRE_MESSAGE(!int_key_extract.empty(), "Empty node with exists key"); REQUIRE_MESSAGE((node_handling_tests::compare_handle_getters(int_key_extract, Value::make(i, i)) || node_handling_tests::compare_handle_getters(int_key_extract, Value::make(i, i + 1))), "Incorrect node"); REQUIRE_MESSAGE(extract_m.find(int_k) != extract_m.end(), "All nodes for key deleted"); } } // Erase testing for (int i = 0; i < size; i++) { auto count_before_erase = c.count(i); auto result = c.unsafe_erase(int_key(i)); REQUIRE_MESSAGE(count_before_erase == result,"Incorrect erased elements count"); REQUIRE_MESSAGE(c.count(i) == 0, "Some elements was not erased"); } } template void check_heterogeneous_functions_key_string_impl() { static_assert(std::is_same::value, "incorrect key_type for heterogeneous lookup test"); // Initialization std::vector keys{"key1", "key2", "key3", "key4", "key5", "key6", "key7", "key8", "key9", "key10"}; std::vector values{"value1", "value2", "value3", "value4", "value5", "value6", "value7", "value8", "value9", "value10", "value11"}; Container c; for (auto it = keys.begin(); it != keys.end(); ++it) { c.insert(Value::make(*it)); } // Insert first duplicated element for multicontainers if (AllowMultimapping::value) { c.insert(Value::make(*keys.begin())); } // Look up testing for (auto it = keys.begin(); it != keys.end(); ++it) { std::string key = *it; char_key k{*it}; REQUIRE_MESSAGE(c.find(k) == c.find(key), "Incorrect heterogeneous find return value"); REQUIRE_MESSAGE(c.count(k) == c.count(key), "Incorrect heterogeneous count return value"); } // unsafe_extract testing for (auto it = 
keys.begin(); it != keys.end(); ++it) { Container extract_c = c; char_key k{*it}; auto char_key_extract = extract_c.unsafe_extract(k); REQUIRE_MESSAGE(!char_key_extract.empty(), "Empty node with exists key"); REQUIRE_MESSAGE(node_handling_tests::compare_handle_getters(char_key_extract, Value::make(*it)), "Incorrect node"); } // unsafe_extract not exists key auto not_exists = c.unsafe_extract(char_key("not exists")); REQUIRE_MESSAGE(not_exists.empty(), "Not empty node with not exists key"); // multimap unsafe_extract testing if (AllowMultimapping::value){ Container extract_m; for (std::size_t i = 0; i < keys.size(); i++) { extract_m.insert(Value::make(keys[i], values[i])); extract_m.insert(Value::make(keys[i], values[i + 1])); } for (std::size_t i = 0; i < keys.size(); i++) { char_key char_k(keys[i]); auto char_key_extract = extract_m.unsafe_extract(char_k); REQUIRE_MESSAGE(!char_key_extract.empty(), "Empty node with exists key"); REQUIRE_MESSAGE((node_handling_tests::compare_handle_getters(char_key_extract, Value::make(keys[i], values[i])) || node_handling_tests::compare_handle_getters(char_key_extract, Value::make(keys[i], values[i + 1]))), "Incorrect node"); REQUIRE_MESSAGE(extract_m.find(char_k) != extract_m.end(), "All nodes for key deleted"); } } // Erase testing for (auto it = keys.begin(); it != keys.end(); ++it) { std::string key = *it; char_key k{*it}; auto count_before_erase = c.count(key); auto result = c.unsafe_erase(k); REQUIRE_MESSAGE(count_before_erase==result,"Incorrect erased elements count"); REQUIRE_MESSAGE(c.count(key)==0, "Some elements was not erased"); } } struct CountingKey { static std::size_t counter; CountingKey() { ++counter; } CountingKey( const CountingKey& ) { ++counter; } ~CountingKey() {} CountingKey& operator=( const CountingKey& ) { return *this; } static void reset() { counter = 0; } }; std::size_t CountingKey::counter; namespace std { template <> struct hash { std::size_t operator()( const CountingKey& ) const { return 0; } }; } 
bool operator==( const CountingKey&, const CountingKey& ) { return true; } bool operator<( const CountingKey&, const CountingKey& ) { return true; } struct int_constructible_object { int_constructible_object(int k) : key(k) {} int key; }; // struct int_constructible_object bool operator==( const int_constructible_object& lhs, const int_constructible_object rhs ) { return lhs.key == rhs.key; } // A test for // template insert(P&&) in maps template